]> git.donarmstrong.com Git - mothur.git/commitdiff
added pca command
authorwestcott <westcott>
Mon, 4 Jan 2010 18:12:00 +0000 (18:12 +0000)
committerwestcott <westcott>
Mon, 4 Jan 2010 18:12:00 +0000 (18:12 +0000)
13 files changed:
Mothur.xcodeproj/project.pbxproj
aligncommand.cpp
bayesian.cpp
bayesian.h
classifyseqscommand.cpp
classifyseqscommand.h
commandfactory.cpp
getoturepcommand.cpp
getoturepcommand.h
pcacommand.cpp [new file with mode: 0644]
pcacommand.h [new file with mode: 0644]
readdistcommand.cpp
unifracunweightedcommand.cpp

index 1fbb5b7fec6768db74d64df3c32e05500f0bfb1b..16d9d8eb82ef029c9debdd6b4f920e372c0c41b2 100644 (file)
                A787844510A1EBDD0086103D /* knn.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A787844410A1EBDD0086103D /* knn.cpp */; };
                A7B04493106CC3E60046FC83 /* chimeraslayer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7B04492106CC3E60046FC83 /* chimeraslayer.cpp */; };
                A7B0450E106CEEC90046FC83 /* slayer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7B0450D106CEEC90046FC83 /* slayer.cpp */; };
+               A7D176CF10F2558500159497 /* pcacommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7D176CE10F2558500159497 /* pcacommand.cpp */; };
                A7E0AAB610D2886D00CF407B /* mgclustercommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E0AAB510D2886D00CF407B /* mgclustercommand.cpp */; };
                A7E4A783106913F900688F62 /* getsharedotucommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E4A782106913F900688F62 /* getsharedotucommand.cpp */; };
                A7E4A824106A9AD700688F62 /* maligner.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E4A823106A9AD700688F62 /* maligner.cpp */; };
                A7B04492106CC3E60046FC83 /* chimeraslayer.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimeraslayer.cpp; sourceTree = SOURCE_ROOT; };
                A7B0450C106CEEC90046FC83 /* slayer.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = slayer.h; sourceTree = SOURCE_ROOT; };
                A7B0450D106CEEC90046FC83 /* slayer.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = slayer.cpp; sourceTree = SOURCE_ROOT; };
+               A7D176CD10F2558500159497 /* pcacommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = pcacommand.h; sourceTree = SOURCE_ROOT; };
+               A7D176CE10F2558500159497 /* pcacommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = pcacommand.cpp; sourceTree = SOURCE_ROOT; };
                A7E0AAB410D2886D00CF407B /* mgclustercommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mgclustercommand.h; sourceTree = "<group>"; };
                A7E0AAB510D2886D00CF407B /* mgclustercommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mgclustercommand.cpp; sourceTree = "<group>"; };
                A7E4A781106913F900688F62 /* getsharedotucommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getsharedotucommand.h; sourceTree = SOURCE_ROOT; };
                                375873F60F7D649C0040F377 /* nocommands.cpp */,
                                3792946E0F2E191800B9034A /* parsimonycommand.h */,
                                3792946F0F2E191800B9034A /* parsimonycommand.cpp */,
+                               A7D176CD10F2558500159497 /* pcacommand.h */,
+                               A7D176CE10F2558500159497 /* pcacommand.cpp */,
                                A773808E10B6EFD1002A6A61 /* phylotypecommand.h */,
                                A773808F10B6EFD1002A6A61 /* phylotypecommand.cpp */,
                                A7027F2610DFF86D00BF45FE /* preclustercommand.h */,
                                A727E84A10D14568001A8432 /* readblast.cpp in Sources */,
                                A7E0AAB610D2886D00CF407B /* mgclustercommand.cpp in Sources */,
                                A7027F2810DFF86D00BF45FE /* preclustercommand.cpp in Sources */,
+                               A7D176CF10F2558500159497 /* pcacommand.cpp in Sources */,
                        );
                        runOnlyForDeploymentPostprocessing = 0;
                };
index 96e1a7899c4fec8d769c46aede121bb8de240c07..f0f7243115f267b6654ab8cb25ceb05b2ef96029 100644 (file)
@@ -93,7 +93,7 @@ AlignCommand::AlignCommand(string option){
                        temp = validParameter.validFile(parameters, "flip", false);                     if (temp == "not found"){       temp = "f";                             }
                        flip = isTrue(temp); 
                        
-                       temp = validParameter.validFile(parameters, "threshold", false);        if (temp == "not found"){       temp = "0.80";                  }
+                       temp = validParameter.validFile(parameters, "threshold", false);        if (temp == "not found"){       temp = "0.50";                  }
                        convert(temp, threshold); 
                        
                        search = validParameter.validFile(parameters, "search", false);         if (search == "not found"){     search = "kmer";                }
@@ -131,12 +131,12 @@ void AlignCommand::help(){
                mothurOut("The ksize parameter allows you to specify the kmer size for finding most similar template to candidate.  The default is 8.\n");
                mothurOut("The match parameter allows you to specify the bonus for having the same base. The default is 1.0.\n");
                mothurOut("The mistmatch parameter allows you to specify the penalty for having different bases.  The default is -1.0.\n");
-               mothurOut("The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -1.0.\n");
-               mothurOut("The gapextend parameter allows you to specify the penalty for extending a gap in an alignment.  The default is -2.0.\n");
+               mothurOut("The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -2.0.\n");
+               mothurOut("The gapextend parameter allows you to specify the penalty for extending a gap in an alignment.  The default is -1.0.\n");
                mothurOut("The flip parameter is used to specify whether or not you want mothur to try the reverse compement if a sequence falls below the threshold.  The default is false.\n");
                mothurOut("The threshold is used to specify a cutoff at which an alignment is deemed 'bad' and the reverse complement may be tried. \n");
                mothurOut("If the flip parameter is set to true the reverse complement of the sequence is aligned and the better alignment is reported.\n");
-               mothurOut("The default for the threshold parameter is 0.80, meaning at least 80% of the bases must remain or the sequence is reported as potentially reversed.\n");
+               mothurOut("The default for the threshold parameter is 0.50, meaning at least 50% of the bases must remain or the sequence is reported as potentially reversed.\n");
                mothurOut("The align.seqs command should be in the following format: \n");
                mothurOut("align.seqs(template=yourTemplateFile, candidate=yourCandidateFile, align=yourAlignmentMethod, search=yourSearchmethod, ksize=yourKmerSize, match=yourMatchBonus, mismatch=yourMismatchpenalty, gapopen=yourGapopenPenalty, gapextend=yourGapExtendPenalty) \n");
                mothurOut("Example align.seqs(candidate=candidate.fasta, template=core.filtered, align=kmer, search=gotoh, ksize=8, match=2.0, mismatch=3.0, gapopen=-2.0, gapextend=-1.0)\n");
@@ -231,6 +231,15 @@ int AlignCommand::execute(){
                        rename((alignFileName + toString(processIDS[0]) + ".temp").c_str(), alignFileName.c_str());
                        rename((reportFileName + toString(processIDS[0]) + ".temp").c_str(), reportFileName.c_str());
                        
+                       //append alignment and report files
+                       for(int i=1;i<processors;i++){
+                               appendAlignFiles((alignFileName + toString(processIDS[i]) + ".temp"), alignFileName);
+                               remove((alignFileName + toString(processIDS[i]) + ".temp").c_str());
+                               
+                               appendReportFiles((reportFileName + toString(processIDS[i]) + ".temp"), reportFileName);
+                               remove((reportFileName + toString(processIDS[i]) + ".temp").c_str());
+                       }
+                       
                        vector<string> nonBlankAccnosFiles;
                        //delete blank accnos files generated with multiple processes
                        for(int i=0;i<processors;i++){  
@@ -253,16 +262,6 @@ int AlignCommand::execute(){
                                }else{  mothurOut(" If the reverse compliment proved to be better it was reported.");  }
                                mothurOutEndLine();
                        }
-                       
-                       //append other files
-                       for(int i=1;i<processors;i++){
-                               appendAlignFiles((alignFileName + toString(processIDS[i]) + ".temp"), alignFileName);
-                               remove((alignFileName + toString(processIDS[i]) + ".temp").c_str());
-                               
-                               appendReportFiles((reportFileName + toString(processIDS[i]) + ".temp"), reportFileName);
-                               remove((reportFileName + toString(processIDS[i]) + ".temp").c_str());
-                       }
-                       
                }
 #else
                ifstream inFASTA;
@@ -316,7 +315,7 @@ int AlignCommand::driver(linePair* line, string alignFName, string reportFName,
                openInputFile(candidateFileName, inFASTA);
 
                inFASTA.seekg(line->start);
-
+               
                for(int i=0;i<line->numSeqs;i++){
                        
                        Sequence* candidateSeq = new Sequence(inFASTA);  gobble(inFASTA);
@@ -392,7 +391,13 @@ int AlignCommand::driver(linePair* line, string alignFName, string reportFName,
                                if (needToDeleteCopy) {   delete copy;   }
                        }
                        delete candidateSeq;
+                       
+                       //report progress
+                       if((i+1) % 100 == 0){   mothurOut(toString(i+1)); mothurOutEndLine();           }
+                       
                }
+               //report progress
+               if((line->numSeqs) % 100 != 0){ mothurOut(toString(line->numSeqs)); mothurOutEndLine();         }
                
                alignmentFile.close();
                inFASTA.close();
index 5272b2d1761285f934e9e8e1a474fd27f1edc328..e6adfc24b863442a58edcd7f348c83f843b10635 100644 (file)
@@ -11,8 +11,8 @@
 #include "kmer.hpp"
 
 /**************************************************************************************************/
-Bayesian::Bayesian(string tfile, string tempFile, string method, int ksize, int cutoff, bool p) : 
-Classify(tfile, tempFile, method, ksize, 0.0, 0.0, 0.0, 0.0), kmerSize(ksize), confidenceThreshold(cutoff), probs(p)  {
+Bayesian::Bayesian(string tfile, string tempFile, string method, int ksize, int cutoff, int i) : 
+Classify(tfile, tempFile, method, ksize, 0.0, 0.0, 0.0, 0.0), kmerSize(ksize), confidenceThreshold(cutoff), iters(i)  {
        try {
                                        
                numKmers = database->getMaxKmer() + 1;
@@ -116,18 +116,9 @@ string Bayesian::getTaxonomy(Sequence* seq) {
                int index = getMostProbableTaxonomy(queryKmers);
                                        
                //bootstrap - to set confidenceScore
-               if (probs) {
-                       int numToSelect = queryKmers.size() / 8;
-                       tax = bootstrapResults(queryKmers, index, numToSelect);
-               }else{
-                       TaxNode seqTax = phyloTree->get(index);
-                       while (seqTax.level != 0) { //while you are not at the root
-                               tax = seqTax.name + ";" + tax;
-                               seqTax = phyloTree->get(seqTax.parent);
-                       }
-                       simpleTax = tax;
-               }
-                               
+               int numToSelect = queryKmers.size() / 8;
+               tax = bootstrapResults(queryKmers, index, numToSelect);
+                                               
                return tax;     
        }
        catch(exception& e) {
@@ -151,7 +142,7 @@ string Bayesian::bootstrapResults(vector<int> kmers, int tax, int numToSelect) {
                map<string, int>::iterator itBoot2;
                map<int, int>::iterator itConvert;
                
-               for (int i = 0; i < 100; i++) {
+               for (int i = 0; i < iters; i++) {
                        vector<int> temp;
                                                
                        for (int j = 0; j < numToSelect; j++) {
index 0020962f4276c9deb2129a27b2d69974783aab82..7100bf33b00c3fa85bd0c928c00daf82647f851a 100644 (file)
@@ -18,7 +18,7 @@
 class Bayesian : public Classify {
        
 public:
-       Bayesian(string, string, string, int, int, bool);
+       Bayesian(string, string, string, int, int, int);
        ~Bayesian() {};
        
        string getTaxonomy(Sequence*);
@@ -35,8 +35,7 @@ private:
        vector<int> genusTotals;
        vector<int> genusNodes;  //indexes in phyloTree where genus' are located
        
-       int kmerSize, numKmers, confidenceThreshold;
-       bool probs;
+       int kmerSize, numKmers, confidenceThreshold, iters;
        
        string bootstrapResults(vector<int>, int, int);
        int getMostProbableTaxonomy(vector<int>);
index eeef1cc95d87f3eb3a3c15c6b3dd9aea606795eb..7221ac0cc490feb350d351bf303c26789cb417b6 100644 (file)
@@ -25,7 +25,7 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option){
                else {
                        
                        //valid paramters for this command
-                       string AlignArray[] =  {"template","fasta","search","ksize","method","processors","taxonomy","match","mismatch","gapopen","gapextend","numwanted","cutoff","probs"};
+                       string AlignArray[] =  {"template","fasta","search","ksize","method","processors","taxonomy","match","mismatch","gapopen","gapextend","numwanted","cutoff","probs","iters"};
                        vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
                        
                        OptionParser parser(option);
@@ -97,6 +97,10 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option){
                        
                        temp = validParameter.validFile(parameters, "probs", false);            if (temp == "not found"){       temp = "true";                  }
                        probs = isTrue(temp);
+                       
+                       temp = validParameter.validFile(parameters, "iters", false);            if (temp == "not found") { temp = "100";                        }
+                       convert(temp, iters); 
+
 
                        
                        if ((method == "bayesian") && (search != "kmer"))  { 
@@ -134,11 +138,12 @@ void ClassifySeqsCommand::help(){
                mothurOut("The processors parameter allows you to specify the number of processors to use. The default is 1.\n");
                mothurOut("The match parameter allows you to specify the bonus for having the same base. The default is 1.0.\n");
                mothurOut("The mistmatch parameter allows you to specify the penalty for having different bases.  The default is -1.0.\n");
-               mothurOut("The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -1.0.\n");
-               mothurOut("The gapextend parameter allows you to specify the penalty for extending a gap in an alignment.  The default is -2.0.\n");
+               mothurOut("The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -2.0.\n");
+               mothurOut("The gapextend parameter allows you to specify the penalty for extending a gap in an alignment.  The default is -1.0.\n");
                mothurOut("The numwanted parameter allows you to specify the number of sequence matches you want with the knn method.  The default is 10.\n");
                mothurOut("The cutoff parameter allows you to specify a bootstrap confidence threshold for your taxonomy.  The default is 0.\n");
                mothurOut("The probs parameter shut off the bootstrapping results for the bayesian method. The default is true, meaning you want the bootstrapping to be run.\n");
+               mothurOut("The iters parameter allows you to specify how many iterations to do when calculating the bootstrap confidence score for your taxonomy with the bayesian method.  The default is 100.\n");
                mothurOut("The classify.seqs command should be in the following format: \n");
                mothurOut("classify.seqs(template=yourTemplateFile, fasta=yourFastaFile, method=yourClassificationMethod, search=yourSearchmethod, ksize=yourKmerSize, taxonomy=yourTaxonomyFile, processors=yourProcessors) \n");
                mothurOut("Example classify.seqs(fasta=amazon.fasta, template=core.filtered, method=knn, search=gotoh, ksize=8, processors=2)\n");
@@ -159,19 +164,19 @@ int ClassifySeqsCommand::execute(){
        try {
                if (abort == true) {    return 0;       }
                
-               if(method == "bayesian")                        {       classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, probs);           }
+               if(method == "bayesian")                        {       classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters);           }
                else if(method == "knn")                        {       classify = new Knn(taxonomyFileName, templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch, numWanted);                               }
                else {
                        mothurOut(search + " is not a valid method option. I will run the command using bayesian.");
                        mothurOutEndLine();
-                       classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, probs);   
+                       classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters);   
                }
 
                int numFastaSeqs = 0;
                
-               string newTaxonomyFile = getRootName(fastaFileName) + "taxonomy";
+               string newTaxonomyFile = getRootName(fastaFileName) + getRootName(taxonomyFileName) + "taxonomy";
                string tempTaxonomyFile = getRootName(fastaFileName) + "taxonomy.temp";
-               string taxSummary = getRootName(fastaFileName) + "tax.summary";
+               string taxSummary = getRootName(fastaFileName) + getRootName(taxonomyFileName) + "tax.summary";
                
                int start = time(NULL);
 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
@@ -356,7 +361,13 @@ int ClassifySeqsCommand::driver(linePair* line, string taxFName, string tempTFNa
                                taxonomy = classify->getTaxonomy(candidateSeq);
                                
                                if (taxonomy != "bad seq") {
-                                       outTax << candidateSeq->getName() << '\t' << taxonomy << endl;
+                                       //output confidence scores or not
+                                       if (probs) {
+                                               outTax << candidateSeq->getName() << '\t' << taxonomy << endl;
+                                       }else{
+                                               outTax << candidateSeq->getName() << '\t' << classify->getSimpleTax() << endl;
+                                       }
+                                       
                                        outTaxSimple << candidateSeq->getName() << '\t' << classify->getSimpleTax() << endl;
                                }
                        }                               
index 993968b3bdcf88164521214e2bfc5e884ed10f85..2c6390c1083e0ca72836f7c21c3f8c438a5c362e 100644 (file)
@@ -45,7 +45,7 @@ private:
        Classify* classify;
        
        string fastaFileName, templateFileName, distanceFileName, search, method, taxonomyFileName;
-       int processors, kmerSize, numWanted, cutoff;
+       int processors, kmerSize, numWanted, cutoff, iters;
        float match, misMatch, gapOpen, gapExtend;
        bool abort, probs;
        
index d2b22d59b06c469a12c7fce92b132a7971453f50..2417d8dda900fe4aac1cdf0d4ebea36209c2796c 100644 (file)
@@ -62,6 +62,7 @@
 #include "phylotypecommand.h"
 #include "mgclustercommand.h"
 #include "preclustercommand.h"
+#include "pcacommand.h"
 
 /*******************************************************/
 
@@ -133,6 +134,7 @@ CommandFactory::CommandFactory(){
        commands["phylotype"]                   = "phylotype";
        commands["mgcluster"]                   = "mgcluster";
        commands["pre.cluster"]                 = "pre.cluster";
+       commands["pca"]                                 = "pca";
 
 }
 /***********************************************************/
@@ -202,6 +204,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString){
                else if(commandName == "phylotype")                             {       command = new PhylotypeCommand(optionString);                   }
                else if(commandName == "mgcluster")                             {       command = new MGClusterCommand(optionString);                   }
                else if(commandName == "pre.cluster")                   {       command = new PreClusterCommand(optionString);                  }
+               else if(commandName == "pca")                                   {       command = new PCACommand(optionString);                                 }
                else                                                                                    {       command = new NoCommand(optionString);                                  }
 
                return command;
index ed26cbecb3f0b4aa67da540ba26ddfaf500602ce..41cc6b5eefca511b4137ecf635a841ca24159f1a 100644 (file)
@@ -42,7 +42,7 @@ GetOTURepCommand::GetOTURepCommand(string option){
                        help(); abort = true;
                } else {
                        //valid paramters for this command
-                       string Array[] =  {"fasta","list","label","name", "group", "sorted"};
+                       string Array[] =  {"fasta","list","label","name", "group", "sorted", "phylip","column"};
                        vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
                        
                        OptionParser parser(option);
@@ -69,6 +69,19 @@ GetOTURepCommand::GetOTURepCommand(string option){
                        listfile = validParameter.validFile(parameters, "list", true);
                        if (listfile == "not found") { mothurOut("list is a required parameter for the get.oturep command."); mothurOutEndLine(); abort = true; }
                        else if (listfile == "not open") { abort = true; }      
+                       
+                       phylipfile = validParameter.validFile(parameters, "phylip", true);
+                       if (phylipfile == "not found") { phylipfile = "";  }
+                       else if (phylipfile == "not open") { abort = true; }    
+                       
+                       columnfile = validParameter.validFile(parameters, "column", true);
+                       if (columnfile == "not found") { columnfile = ""; }
+                       else if (columnfile == "not open") { abort = true; }    
+                       
+                       if ((phylipfile == "") && (columnfile == "")) { mothurOut("When executing a get.oturep command you must enter a phylip or a column."); mothurOutEndLine(); abort = true; }
+                       else if ((phylipfile != "") && (columnfile != "")) { mothurOut("When executing a get.oturep command you must enter ONLY ONE of the following: phylip or column."); mothurOutEndLine(); abort = true; }
+               
+                       if (columnfile != "") {  if (namefile == "") {  cout << "You need to provide a namefile if you are going to use the column format." << endl; abort = true; }  }
 
                        //check for optional parameter and set defaults
                        // ...at some point should added some additional type checking...
@@ -110,10 +123,6 @@ GetOTURepCommand::GetOTURepCommand(string option){
                        }
                        
                        if (abort == false) {
-                       
-                               if(globaldata->gSparseMatrix != NULL)   {
-                                       matrix = globaldata->gSparseMatrix;
-                               }
 
                                //globaldata->gListVector bin 0 = first name read in distance matrix, globaldata->gListVector bin 1 = second name read in distance matrix
                                if (globaldata->gListVector != NULL) {
@@ -132,15 +141,6 @@ GetOTURepCommand::GetOTURepCommand(string option){
                                        }
                                } else { mothurOut("error, no listvector."); mothurOutEndLine(); }
 
-                               // Create a data structure to quickly access the distance information.
-                               // It consists of a vector of distance maps, where each map contains
-                               // all distances of a certain sequence. Vector and maps are accessed
-                               // via the index of a sequence in the distance matrix
-                               seqVec = vector<SeqMap>(globaldata->gListVector->size()); 
-                               for (MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++) {
-                                       seqVec[currentCell->row][currentCell->column] = currentCell->dist;
-                               }
-
                                fasta = new FastaMap();
                        }
                }
@@ -191,22 +191,21 @@ int GetOTURepCommand::execute(){
        try {
        
                if (abort == true) { return 0; }
-
                int error;
                
                //read fastafile
                fasta->readFastaFile(fastafile);
                
-               in.close();
-               
-               //set format to list so input can get listvector
-               globaldata->setFormat("list");
+               //in.close();
+               //read distance file and generate indexed distance file that can be used by findrep function
+//....new reading class....//
                
                //if user gave a namesfile then use it
-               if (namesfile != "") {
-                       readNamesFile();
-               }
+               if (namesfile != "") {  readNamesFile();        }
                
+               //set format to list so input can get listvector
+               globaldata->setFormat("list");
+
                //read list file
                read = new ReadOTUFile(listfile);
                read->read(&*globaldata); 
@@ -274,8 +273,6 @@ int GetOTURepCommand::execute(){
                        if (error == 1) { return 0; } //there is an error in hte input files, abort command
                }
                
-               delete matrix;
-               globaldata->gSparseMatrix = NULL;
                delete list;
                globaldata->gListVector = NULL;
 
@@ -371,7 +368,7 @@ string GetOTURepCommand::findRep(int bin, string& group, ListVector* thisList, i
                        SeqMap::iterator it;
                        SeqMap currMap;
                        for (size_t i=0; i < seqIndex.size(); i++) {
-                               currMap = seqVec[seqIndex[i]];
+       //currMap = seqVec[seqIndex[i]];
                                for (size_t j=0; j < seqIndex.size(); j++) {
                                        it = currMap.find(seqIndex[j]);         
                                        if (it != currMap.end()) {
index d92df1a4045adbabd768210a628353d9b61383b3..58fec8d4f3e3aaaf148a9ff8455c4fba93c6b35f 100644 (file)
@@ -13,7 +13,6 @@
 
 #include "command.hpp"
 #include "globaldata.hpp"
-#include "sparsematrix.hpp"
 #include "listvector.hpp"
 #include "inputdata.h"
 #include "readotu.h"
@@ -44,25 +43,18 @@ public:
 
 private:
        GlobalData* globaldata;
-       SparseMatrix* matrix;
        ListVector* list;
        ReadOTUFile* read;
        InputData* input;
        FastaMap* fasta;
        GroupMap* groupMap;
-       string filename, fastafile, listfile, namesfile, groupfile, label, sorted;
+       string filename, fastafile, listfile, namesfile, groupfile, label, sorted, phylipfile, columnfile, namefile;
        ofstream out;
        ifstream in, inNames;
-       bool groupError;
-
-       bool abort, allLines;
+       bool abort, allLines, groupError;
        set<string> labels; //holds labels to be used
        map<string, int> nameToIndex;  //maps sequence name to index in sparsematrix
 
-       vector<SeqMap> seqVec;                  // contains maps with sequence index and distance
-                                                                       // for all distances related to a certain sequence
-
-
        void readNamesFile();
        int process(ListVector*);
        string findRep(int, string&, ListVector*, int&);        // returns the name of the "representative" sequence of given bin, 
diff --git a/pcacommand.cpp b/pcacommand.cpp
new file mode 100644 (file)
index 0000000..ef89ae5
--- /dev/null
@@ -0,0 +1,597 @@
+
+/*
+ *  pcacommand.cpp
+ *  Mothur
+ *
+ *  Created by westcott on 1/4/10.
+ *  Copyright 2010 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "pcacommand.h"
+
+//**********************************************************************************************************************
+
+PCACommand::PCACommand(string option){
+       try {
+               abort = false;
+               
+               //allow user to run help
+               if(option == "help") { help(); abort = true; }
+               
+               else {
+                       //valid paramters for this command
+                       string Array[] =  {"phylip","lt"};
+                       vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
+                       
+                       OptionParser parser(option);
+                       map<string, string> parameters = parser. getParameters();
+                       
+                       ValidParameters validParameter;
+               
+                       //check to make sure all parameters are valid for command
+                       for (map<string, string>::iterator it = parameters.begin(); it != parameters.end(); it++) { 
+                               if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
+                       }
+                       
+                       //required parameters
+                       phylipfile = validParameter.validFile(parameters, "phylip", true);
+                       if (phylipfile == "not open") { abort = true; }
+                       else if (phylipfile == "not found") { phylipfile = ""; abort = true; }  
+                       else {  filename = phylipfile;  }
+                       
+                       //columnfile = validParameter.validFile(parameters, "column", true);
+                       //if (columnfile == "not open") { abort = true; }       
+                       //else if (columnfile == "not found") { columnfile = ""; }
+                       //else {  format = "column";    }
+                       
+                       //namefile = validParameter.validFile(parameters, "name", true);
+                       //if (namefile == "not open") { abort = true; } 
+                       //else if (namefile == "not found") { namefile = ""; }
+                       
+                       
+                       //error checking on files       
+                       if (phylipfile == "")   { mothurOut("You must provide a distance file before running the pca command."); mothurOutEndLine(); abort = true; }            
+                       //if ((phylipfile == "") && (columnfile == "")) { mothurOut("You must provide a distance file before running the pca command."); mothurOutEndLine(); abort = true; }
+                       //else if ((phylipfile != "") && (columnfile != "")) { mothurOut("You may not use both the column and the phylip parameters."); mothurOutEndLine(); abort = true; }
+                       
+                       //if (columnfile != "") {
+                       //      if (namefile == "") {  mothurOut("You need to provide a namefile if you are going to use the column format."); mothurOutEndLine(); abort = true; }
+                       //}
+                       
+                       string temp = validParameter.validFile(parameters, "lt", false);                                if (temp == "not found") { temp = "false"; }
+                       bool lt = isTrue(temp);
+                       
+                       if (lt)         {  matrix = 2;  }
+                       else            {  matrix = 1;  }
+
+
+               }
+
+       }
+       catch(exception& e) {
+               errorOut(e, "PCACommand", "PCACommand");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+void PCACommand::help(){
+       try {
+       
+               mothurOut("The pca command..."); mothurOutEndLine();
+       }
+       catch(exception& e) {
+               errorOut(e, "PCACommand", "help");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+PCACommand::~PCACommand(){}
+//**********************************************************************************************************************
+int PCACommand::execute(){
+       try {
+       
+               if (abort == true) { return 0; }
+               
+               cout.setf(ios::fixed, ios::floatfield);
+               cout.setf(ios::showpoint);
+               cerr.setf(ios::fixed, ios::floatfield);
+               cerr.setf(ios::showpoint);
+               
+               vector<string> names;
+               vector<vector<double> > D;
+       
+               fbase = filename;
+               if(fbase.find_last_of(".")!=string::npos){
+                       fbase.erase(fbase.find_last_of(".")+1); 
+               }
+               else{
+                       fbase += ".";
+               }
+               read(filename, matrix, names, D);
+       
+               double offset = 0.0000;
+               vector<double> d;
+               vector<double> e;
+               vector<vector<double> > G = D;
+               vector<vector<double> > copy_G;
+               int rank = D.size();
+               
+               cout << "\nProcessing...\n";
+               
+               for(int count=0;count<2;count++){
+                       recenter(offset, D, G);   
+                       tred2(G, d, e);
+                       qtli(d, e, G);
+                       offset = d[d.size()-1];
+                       if(offset > 0.0) break;
+               } 
+               
+               
+               output(fbase, names, G, d);
+               
+               return 0;
+       }
+       catch(exception& e) {
+               errorOut(e, "PCACommand", "execute");
+               exit(1);
+       }
+}
+/*********************************************************************************************************************************/
+
+inline double SIGN(const double a, const double b)
+{
+    return b>=0 ? (a>=0 ? a:-a) : (a>=0 ? -a:a);
+}
+
+/*********************************************************************************************************************************/
+
+void PCACommand::get_comment(istream& f, char begin, char end){
+       try {
+               char d=f.get();
+               while(d != end){        d = f.get();    }
+               d = f.peek();
+       }
+       catch(exception& e) {
+               errorOut(e, "PCACommand", "get_comment");
+               exit(1);
+       }
+}      
+
+/*********************************************************************************************************************************/
+
+void PCACommand::read_mega(istream& f, int square_m, vector<string>& name_list, vector<vector<double> >& d){
+       try {
+               get_comment(f, '#', '\n');
+               
+               char test = f.peek();
+               
+               while(test == '!'){                                                             //get header comments
+                       get_comment(f, '!', ';');
+                       while(isspace(test=f.get()))            {;}
+                       f.putback(test);
+                       test = f.peek();
+               }
+               while(test != '\n'){                                                    //get sequence names
+                       get_comment(f, '[', ']');
+                       char d = f.get();
+                       d = f.get();
+                       if(d == '#'){
+                               string name;
+                               f >> name;
+                               name_list.push_back(name);
+                               while(isspace(test=f.get()))            {;}
+                               f.putback(test);
+                       }
+                       else{
+                               break;
+                       }
+               }
+               int rank = name_list.size();
+               d.resize(rank);
+               for(int i=0;i<rank;i++){                d[i].resize(rank);              }
+               
+               d[0][0] = 0.0000;
+               get_comment(f, '[', ']');
+               for(int i=1;i<rank;i++){
+                       get_comment(f, '[', ']');
+                       d[i][i]=0.0000;
+                       for(int j=0;j<i;j++){
+                               f >> d[i][j];
+                               if (d[i][j] == -0.0000)
+                                       d[i][j] = 0.0000;
+                               d[j][i]=d[i][j];
+                       }
+               }
+       }
+       catch(exception& e) {
+               errorOut(e, "PCACommand", "read_mega");
+               exit(1);
+       }
+}
+
+/*********************************************************************************************************************************/
+
+void PCACommand::read_phylip(istream& f, int square_m, vector<string>& name_list, vector<vector<double> >& d){
+       try {
+               //     int count1=0;
+               //     int count2=0;
+               
+               int rank;
+               f >> rank;
+               
+               name_list.resize(rank);
+               d.resize(rank);
+               if(square_m == 1){
+                       for(int i=0;i<rank;i++)
+                               d[i].resize(rank);
+                       for(int i=0;i<rank;i++) {
+                               f >> name_list[i];
+                               //                      cout << i << "\t" << name_list[i] << endl;
+                               for(int j=0;j<rank;j++) {
+                                       f >> d[i][j];
+                                       if (d[i][j] == -0.0000)
+                                               d[i][j] = 0.0000;
+                               }
+                       }
+               }
+               else if(square_m == 2){
+                       for(int i=0;i<rank;i++){
+                               d[i].resize(rank);
+                       }
+                       d[0][0] = 0.0000;
+                       f >> name_list[0];
+                       for(int i=1;i<rank;i++){
+                               f >> name_list[i];
+                               d[i][i]=0.0000;
+                               for(int j=0;j<i;j++){
+                                       f >> d[i][j];
+                                       if (d[i][j] == -0.0000)
+                                               d[i][j] = 0.0000;
+                                       d[j][i]=d[i][j];
+                               }
+                       }
+               }
+       }
+       catch(exception& e) {
+               errorOut(e, "PCACommand", "read_phylip");
+               exit(1);
+       }
+
+}
+
+/*********************************************************************************************************************************/
+
+void PCACommand::read(string fname, int m, vector<string>& names, vector<vector<double> >& D){
+       try {
+               ifstream f(fname.c_str());
+               if(!f) {
+                       cerr << "Error: Could not open " << fname << endl;
+                       exit(1);
+               }
+               char test = f.peek();
+               
+               if(test == '#'){
+                       read_mega(f, m, names, D);
+               }
+               else{
+                       read_phylip(f, m, names, D);
+               }
+               
+               int rank = D.size();
+       }
+       catch(exception& e) {
+               errorOut(e, "PCACommand", "read");
+               exit(1);
+       }
+}
+
+/*********************************************************************************************************************************/
+double PCACommand::pythag(double a, double b){
+    return(pow(a*a+b*b,0.5));
+}
+/*********************************************************************************************************************************/
+
+void PCACommand::matrix_mult(vector<vector<double> > first, vector<vector<double> > second, vector<vector<double> >& product){
+       try {
+               int first_rows = first.size();
+               int first_cols = first[0].size();
+               int second_cols = second[0].size();
+               
+               product.resize(first_rows);
+               for(int i=0;i<first_rows;i++){
+                       product[i].resize(second_cols);
+               }
+               
+               for(int i=0;i<first_rows;i++){
+                       for(int j=0;j<second_cols;j++){
+                               product[i][j] = 0.0;
+                               for(int k=0;k<first_cols;k++){
+                                       product[i][j] += first[i][k] * second[k][j];
+                               }
+                       }
+               }
+       }
+       catch(exception& e) {
+               errorOut(e, "PCACommand", "matrix_mult");
+               exit(1);
+       }
+
+}
+
+/*********************************************************************************************************************************/
+
+void PCACommand::recenter(double offset, vector<vector<double> > D, vector<vector<double> >& G){
+       try {
+               int rank = D.size();
+               
+               vector<vector<double> > A(rank);
+               vector<vector<double> > C(rank);
+               for(int i=0;i<rank;i++){
+                       A[i].resize(rank);
+                       C[i].resize(rank);
+               }
+               
+               double scale = -1.0000 / (double) rank;
+               
+               for(int i=0;i<rank;i++){
+                       A[i][i] = 0.0000;
+                       C[i][i] = 1.0000 + scale;
+                       for(int j=i+1;j<rank;j++){
+                               A[i][j] = A[j][i] = -0.5 * D[i][j] * D[i][j] + offset;
+                               C[i][j] = C[j][i] = scale;
+                       }
+               }
+               
+               matrix_mult(C,A,A);
+               matrix_mult(A,C,G);
+       }
+       catch(exception& e) {
+               errorOut(e, "PCACommand", "recenter");
+               exit(1);
+       }
+
+}
+
+/*********************************************************************************************************************************/
+
+//  This function is taken from Numerical Recipes in C++ by Press et al., 2nd edition, pg. 479
+
+void PCACommand::tred2(vector<vector<double> >& a, vector<double>& d, vector<double>& e){
+       try {
+               double scale, hh, h, g, f;
+               
+               int n = a.size();
+               
+               d.resize(n);
+               e.resize(n);
+               
+               for(int i=n-1;i>0;i--){
+                       int l=i-1;
+                       h = scale = 0.0000;
+                       if(l>0){
+                               for(int k=0;k<l+1;k++){
+                                       scale += fabs(a[i][k]);
+                               }
+                               if(scale == 0.0){
+                                       e[i] = a[i][l];
+                               }
+                               else{
+                                       for(int k=0;k<l+1;k++){
+                                               a[i][k] /= scale;
+                                               h += a[i][k] * a[i][k];
+                                       }
+                                       f = a[i][l];
+                                       g = (f >= 0.0 ? -sqrt(h) : sqrt(h));
+                                       e[i] = scale * g;
+                                       h -= f * g;
+                                       a[i][l] = f - g;
+                                       f = 0.0;
+                                       for(int j=0;j<l+1;j++){
+                                               a[j][i] = a[i][j] / h;
+                                               g = 0.0;
+                                               for(int k=0;k<j+1;k++){
+                                                       g += a[j][k] * a[i][k];
+                                               }
+                                               for(int k=j+1;k<l+1;k++){
+                                                       g += a[k][j] * a[i][k];
+                                               }
+                                               e[j] = g / h;
+                                               f += e[j] * a[i][j];
+                                       }
+                                       hh = f / (h + h);
+                                       for(int j=0;j<l+1;j++){
+                                               f = a[i][j];
+                                               e[j] = g = e[j] - hh * f;
+                                               for(int k=0;k<j+1;k++){
+                                                       a[j][k] -= (f * e[k] + g * a[i][k]);
+                                               }
+                                       }
+                               }
+                       }
+                       else{
+                               e[i] = a[i][l];
+                       }
+                       
+                       d[i] = h;
+               }
+               
+               d[0] = 0.0000;
+               e[0] = 0.0000;
+               
+               for(int i=0;i<n;i++){
+                       int l = i;
+                       if(d[i] != 0.0){
+                               for(int j=0;j<l;j++){
+                                       g = 0.0000;
+                                       for(int k=0;k<l;k++){
+                                               g += a[i][k] * a[k][j];
+                                       }
+                                       for(int k=0;k<l;k++){
+                                               a[k][j] -= g * a[k][i];
+                                       }
+                               }
+                       }
+                       d[i] = a[i][i];
+                       a[i][i] = 1.0000;
+                       for(int j=0;j<l;j++){
+                               a[j][i] = a[i][j] = 0.0;
+                       }
+               }
+       }
+       catch(exception& e) {
+               errorOut(e, "PCACommand", "tred2");
+               exit(1);
+       }
+
+}
+
+/*********************************************************************************************************************************/
+
+//  This function is taken from Numerical Recipes in C++ by Press et al., 2nd edition, pg. 479
+
+void PCACommand::qtli(vector<double>& d, vector<double>& e, vector<vector<double> >& z) {
+       try {
+               int m, i, iter;
+               double s, r, p, g, f, dd, c, b;
+               
+               int n = d.size();
+               for(int i=1;i<=n;i++){
+                       e[i-1] = e[i];
+               }
+               e[n-1] = 0.0000;
+               
+               for(int l=0;l<n;l++){
+                       iter = 0;
+                       do {
+                               for(m=l;m<n-1;m++){
+                                       dd = fabs(d[m]) + fabs(d[m+1]);
+                                       if(fabs(e[m])+dd == dd) break;
+                               }
+                               if(m != l){
+                                       if(iter++ == 30) cerr << "Too many iterations in tqli\n";
+                                       g = (d[l+1]-d[l]) / (2.0 * e[l]);
+                                       r = pythag(g, 1.0);
+                                       g = d[m] - d[l] + e[l] / (g + SIGN(r,g));
+                                       s = c = 1.0;
+                                       p = 0.0000;
+                                       for(i=m-1;i>=l;i--){
+                                               f = s * e[i];
+                                               b = c * e[i];
+                                               e[i+1] = (r=pythag(f,g));
+                                               if(r==0.0){
+                                                       d[i+1] -= p;
+                                                       e[m] = 0.0000;
+                                                       break;
+                                               }
+                                               s = f / r;
+                                               c = g / r;
+                                               g = d[i+1] - p;
+                                               r = (d[i] - g) * s + 2.0 * c * b;
+                                               d[i+1] = g + ( p = s * r);
+                                               g = c * r - b;
+                                               for(int k=0;k<n;k++){
+                                                       f = z[k][i+1];
+                                                       z[k][i+1] = s * z[k][i] + c * f;
+                                                       z[k][i] = c * z[k][i] - s * f;
+                                               }
+                                       }
+                                       if(r == 0.00 && i >= l) continue;
+                                       d[l] -= p;
+                                       e[l] = g;
+                                       e[m] = 0.0;
+                               }
+                       } while (m != l);
+               }
+               
+               int k;
+               for(int i=0;i<n;i++){
+                       p=d[k=i];
+                       for(int j=i;j<n;j++){
+                               if(d[j] >= p){
+                                       p=d[k=j];
+                               }
+                       }
+                       if(k!=i){
+                               d[k]=d[i];
+                               d[i]=p;
+                               for(int j=0;j<n;j++){
+                                       p=z[j][i];
+                                       z[j][i] = z[j][k];
+                                       z[j][k] = p;
+                               }
+                       }
+               }
+       }
+       catch(exception& e) {
+               errorOut(e, "PCACommand", "qtli");
+               exit(1);
+       }
+}
+
+/*********************************************************************************************************************************/
+
+void PCACommand::output(string fnameRoot, vector<string> name_list, vector<vector<double> > G, vector<double> d) {
+       try {
+               int rank = name_list.size();
+               double dsum = 0.0000;
+               for(int i=0;i<rank;i++){
+                       dsum += d[i];
+                       for(int j=0;j<rank;j++){
+                               if(d[j] >= 0)   {       G[i][j] *= pow(d[j],0.5);       }
+                               else                    {       G[i][j] = 0.00000;                      }
+                       }
+               }
+               
+               ofstream pcaData((fnameRoot+"pca").c_str(), ios::trunc);
+               pcaData.setf(ios::fixed, ios::floatfield);
+               pcaData.setf(ios::showpoint);   
+               
+               ofstream pcaLoadings((fnameRoot+"pca.loadings").c_str(), ios::trunc);
+               pcaLoadings.setf(ios::fixed, ios::floatfield);
+               pcaLoadings.setf(ios::showpoint);       
+               
+               pcaLoadings << "axis\tloading\n";
+               for(int i=0;i<rank;i++){
+                       pcaLoadings << i+1 << '\t' << d[i] * 100.0 / dsum << endl;
+               }
+               
+               pcaData << "SeqName";
+               for(int i=0;i<rank;i++){
+                       pcaData << '\t' << "axis" << i+1;
+               }
+               pcaData << endl;
+               
+               for(int i=0;i<rank;i++){
+                       pcaData << name_list[i] << '\t';
+                       for(int j=0;j<rank;j++){
+                               pcaData << G[i][j] << '\t';
+                       }
+                       pcaData << endl;
+               }
+       }
+       catch(exception& e) {
+               errorOut(e, "PCACommand", "output");
+               exit(1);
+       }
+}
+
+/*********************************************************************************************************************************/
+
+void PCACommand::print_matrix(vector<vector<double> > A) {
+       try {
+               int rank = A.size();
+               for(int i=0;i<rank;i++){
+                       for(int j=0;j<rank;j++){
+                               cout << A[i][j] << "  ";
+                       }
+                       cout << endl;
+               }
+       }
+       catch(exception& e) {
+               errorOut(e, "PCACommand", "print_matrix");
+               exit(1);
+       }
+}
+/*********************************************************************************************************************************/
+
diff --git a/pcacommand.h b/pcacommand.h
new file mode 100644 (file)
index 0000000..822be3b
--- /dev/null
@@ -0,0 +1,49 @@
+#ifndef PCACOMMAND_H
+#define PCACOMMAND_H
+
+/*
+ *  pcacommand.h
+ *  Mothur
+ *
+ *  Created by westcott on 1/4/10.
+ *  Copyright 2010 Schloss Lab. All rights reserved.
+ *
+ */
+#include "command.hpp"
+
+
+/*****************************************************************/
+class PCACommand : public Command {
+       
+public:
+       PCACommand(string);     
+       ~PCACommand();
+       int execute();  
+       void help();
+       
+private:
+
+       bool abort;
+       string phylipfile, columnfile, namefile, format, filename, fbase;
+       float cutoff, precision;
+       int matrix;
+       
+       void get_comment(istream&, char, char);
+       void read_mega(istream&, int, vector<string>&, vector<vector<double> >&);
+       void read_phylip(istream&, int, vector<string>&, vector<vector<double> >&);
+       void read(string, int, vector<string>&, vector<vector<double> >&);
+       double pythag(double, double);
+       void matrix_mult(vector<vector<double> >, vector<vector<double> >, vector<vector<double> >&);
+       void recenter(double, vector<vector<double> >, vector<vector<double> >&);
+       void tred2(vector<vector<double> >&, vector<double>&, vector<double>&);
+       void qtli(vector<double>&, vector<double>&, vector<vector<double> >&);
+       void output(string, vector<string>, vector<vector<double> >, vector<double>);
+       void print_matrix(vector<vector<double> >);
+       
+};
+       
+/*****************************************************************/
+       
+#endif
+
index f6cf5db3948ea8da4557fad5b98da360019922e7..7e81d7f1a3efe2535f1d9d11715bb2c517a508ec 100644 (file)
@@ -206,6 +206,7 @@ int ReadDistCommand::execute(){
                        if (globaldata->gSparseMatrix != NULL) { delete globaldata->gSparseMatrix;  }
                        globaldata->gSparseMatrix = read->getMatrix();
                        numDists = globaldata->gSparseMatrix->getNNodes();
+       cout << "matrix contains " << numDists << " distances." << endl;
                        
       int lines = cutoff / (1.0/precision);
       vector<float> dist_cutoff(lines+1,0);
index 54a442697107f92c4fd9e887cacd425b32e0a825..326e695b20965c451ac17fe5a70bc11b923d6c3e 100644 (file)
@@ -132,7 +132,7 @@ int UnifracUnweightedCommand::execute() {
                        
                        //get unweighted scores for random trees
                        for (int j = 0; j < iters; j++) {
-                               //we need a different getValues because when we swap the labels we only want to swap those in each parwise comparison
+                               //we need a different getValues because when we swap the labels we only want to swap those in each pairwise comparison
                                randomData = unweighted->getValues(T[i], "", "");
                        
                                for(int k = 0; k < numComp; k++) {      
@@ -147,7 +147,6 @@ int UnifracUnweightedCommand::execute() {
                                        //add randoms score to validscores
                                        validScores[randomData[k]] = randomData[k];
                                }
-                               
                        }
                
                        for(int a = 0; a < numComp; a++) {