From: westcott Date: Mon, 4 Jan 2010 18:12:00 +0000 (+0000) Subject: added pca command X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=f27b66ce6415eb14c434f9850019c7cf140e023e added pca command --- diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index 1fbb5b7..16d9d8e 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -180,6 +180,7 @@ A787844510A1EBDD0086103D /* knn.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A787844410A1EBDD0086103D /* knn.cpp */; }; A7B04493106CC3E60046FC83 /* chimeraslayer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7B04492106CC3E60046FC83 /* chimeraslayer.cpp */; }; A7B0450E106CEEC90046FC83 /* slayer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7B0450D106CEEC90046FC83 /* slayer.cpp */; }; + A7D176CF10F2558500159497 /* pcacommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7D176CE10F2558500159497 /* pcacommand.cpp */; }; A7E0AAB610D2886D00CF407B /* mgclustercommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E0AAB510D2886D00CF407B /* mgclustercommand.cpp */; }; A7E4A783106913F900688F62 /* getsharedotucommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E4A782106913F900688F62 /* getsharedotucommand.cpp */; }; A7E4A824106A9AD700688F62 /* maligner.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E4A823106A9AD700688F62 /* maligner.cpp */; }; @@ -571,6 +572,8 @@ A7B04492106CC3E60046FC83 /* chimeraslayer.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimeraslayer.cpp; sourceTree = SOURCE_ROOT; }; A7B0450C106CEEC90046FC83 /* slayer.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = slayer.h; sourceTree = SOURCE_ROOT; }; A7B0450D106CEEC90046FC83 /* slayer.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = slayer.cpp; sourceTree = SOURCE_ROOT; }; + A7D176CD10F2558500159497 /* pcacommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = pcacommand.h; sourceTree = SOURCE_ROOT; }; + A7D176CE10F2558500159497 /* pcacommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = pcacommand.cpp; sourceTree = SOURCE_ROOT; }; A7E0AAB410D2886D00CF407B /* mgclustercommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mgclustercommand.h; sourceTree = ""; }; A7E0AAB510D2886D00CF407B /* mgclustercommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mgclustercommand.cpp; sourceTree = ""; }; A7E4A781106913F900688F62 /* getsharedotucommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getsharedotucommand.h; sourceTree = SOURCE_ROOT; }; @@ -932,6 +935,8 @@ 375873F60F7D649C0040F377 /* nocommands.cpp */, 3792946E0F2E191800B9034A /* parsimonycommand.h */, 3792946F0F2E191800B9034A /* parsimonycommand.cpp */, + A7D176CD10F2558500159497 /* pcacommand.h */, + A7D176CE10F2558500159497 /* pcacommand.cpp */, A773808E10B6EFD1002A6A61 /* phylotypecommand.h */, A773808F10B6EFD1002A6A61 /* phylotypecommand.cpp */, A7027F2610DFF86D00BF45FE /* preclustercommand.h */, @@ -1305,6 +1310,7 @@ A727E84A10D14568001A8432 /* readblast.cpp in Sources */, A7E0AAB610D2886D00CF407B /* mgclustercommand.cpp in Sources */, A7027F2810DFF86D00BF45FE /* preclustercommand.cpp in Sources */, + A7D176CF10F2558500159497 /* pcacommand.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; diff --git a/aligncommand.cpp b/aligncommand.cpp index 96e1a78..f0f7243 100644 --- a/aligncommand.cpp +++ b/aligncommand.cpp @@ -93,7 +93,7 @@ AlignCommand::AlignCommand(string option){ temp = validParameter.validFile(parameters, "flip", false); if (temp == "not found"){ temp = "f"; } flip = isTrue(temp); - temp = validParameter.validFile(parameters, "threshold", false); if (temp == "not found"){ temp = "0.80"; } + temp = validParameter.validFile(parameters, "threshold", false); if (temp == "not found"){ temp = "0.50"; } convert(temp, threshold); search = validParameter.validFile(parameters, "search", false); if (search == "not found"){ search = "kmer"; } @@ -131,12 +131,12 @@ void AlignCommand::help(){ mothurOut("The ksize parameter allows you to specify the kmer size for finding most similar template to candidate. The default is 8.\n"); mothurOut("The match parameter allows you to specify the bonus for having the same base. The default is 1.0.\n"); mothurOut("The mistmatch parameter allows you to specify the penalty for having different bases. The default is -1.0.\n"); - mothurOut("The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -1.0.\n"); - mothurOut("The gapextend parameter allows you to specify the penalty for extending a gap in an alignment. The default is -2.0.\n"); + mothurOut("The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -2.0.\n"); + mothurOut("The gapextend parameter allows you to specify the penalty for extending a gap in an alignment. The default is -1.0.\n"); mothurOut("The flip parameter is used to specify whether or not you want mothur to try the reverse compement if a sequence falls below the threshold. The default is false.\n"); mothurOut("The threshold is used to specify a cutoff at which an alignment is deemed 'bad' and the reverse complement may be tried. \n"); mothurOut("If the flip parameter is set to true the reverse complement of the sequence is aligned and the better alignment is reported.\n"); - mothurOut("The default for the threshold parameter is 0.80, meaning at least 80% of the bases must remain or the sequence is reported as potentially reversed.\n"); + mothurOut("The default for the threshold parameter is 0.50, meaning at least 50% of the bases must remain or the sequence is reported as potentially reversed.\n"); mothurOut("The align.seqs command should be in the following format: \n"); mothurOut("align.seqs(template=yourTemplateFile, candidate=yourCandidateFile, align=yourAlignmentMethod, search=yourSearchmethod, ksize=yourKmerSize, match=yourMatchBonus, mismatch=yourMismatchpenalty, gapopen=yourGapopenPenalty, gapextend=yourGapExtendPenalty) \n"); mothurOut("Example align.seqs(candidate=candidate.fasta, template=core.filtered, align=kmer, search=gotoh, ksize=8, match=2.0, mismatch=3.0, gapopen=-2.0, gapextend=-1.0)\n"); @@ -231,6 +231,15 @@ int AlignCommand::execute(){ rename((alignFileName + toString(processIDS[0]) + ".temp").c_str(), alignFileName.c_str()); rename((reportFileName + toString(processIDS[0]) + ".temp").c_str(), reportFileName.c_str()); + //append alignment and report files + for(int i=1;i nonBlankAccnosFiles; //delete blank accnos files generated with multiple processes for(int i=0;istart); - + for(int i=0;inumSeqs;i++){ Sequence* candidateSeq = new Sequence(inFASTA); gobble(inFASTA); @@ -392,7 +391,13 @@ int AlignCommand::driver(linePair* line, string alignFName, string reportFName, if (needToDeleteCopy) { delete copy; } } delete candidateSeq; + + //report progress + if((i+1) % 100 == 0){ mothurOut(toString(i+1)); mothurOutEndLine(); } + } + //report progress + if((line->numSeqs) % 100 != 0){ mothurOut(toString(line->numSeqs)); mothurOutEndLine(); } alignmentFile.close(); inFASTA.close(); diff --git a/bayesian.cpp b/bayesian.cpp index 5272b2d..e6adfc2 100644 --- a/bayesian.cpp +++ b/bayesian.cpp @@ -11,8 +11,8 @@ #include "kmer.hpp" /**************************************************************************************************/ -Bayesian::Bayesian(string tfile, string tempFile, string method, int ksize, int cutoff, bool p) : -Classify(tfile, tempFile, method, ksize, 0.0, 0.0, 0.0, 0.0), kmerSize(ksize), confidenceThreshold(cutoff), probs(p) { +Bayesian::Bayesian(string tfile, string tempFile, string method, int ksize, int cutoff, int i) : +Classify(tfile, tempFile, method, ksize, 0.0, 0.0, 0.0, 0.0), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) { try { numKmers = database->getMaxKmer() + 1; @@ -116,18 +116,9 @@ string Bayesian::getTaxonomy(Sequence* seq) { int index = getMostProbableTaxonomy(queryKmers); //bootstrap - to set confidenceScore - if (probs) { - int numToSelect = queryKmers.size() / 8; - tax = bootstrapResults(queryKmers, index, numToSelect); - }else{ - TaxNode seqTax = phyloTree->get(index); - while (seqTax.level != 0) { //while you are not at the root - tax = seqTax.name + ";" + tax; - seqTax = phyloTree->get(seqTax.parent); - } - simpleTax = tax; - } - + int numToSelect = queryKmers.size() / 8; + tax = bootstrapResults(queryKmers, index, numToSelect); + return tax; } catch(exception& e) { @@ -151,7 +142,7 @@ string Bayesian::bootstrapResults(vector kmers, int tax, int numToSelect) { map::iterator itBoot2; map::iterator itConvert; - for (int i = 0; i < 100; i++) { + for (int i = 0; i < iters; i++) { vector temp; for (int j = 0; j < numToSelect; j++) { diff --git a/bayesian.h b/bayesian.h index 0020962..7100bf3 100644 --- a/bayesian.h +++ b/bayesian.h @@ -18,7 +18,7 @@ class Bayesian : public Classify { public: - Bayesian(string, string, string, int, int, bool); + Bayesian(string, string, string, int, int, int); ~Bayesian() {}; string getTaxonomy(Sequence*); @@ -35,8 +35,7 @@ private: vector genusTotals; vector genusNodes; //indexes in phyloTree where genus' are located - int kmerSize, numKmers, confidenceThreshold; - bool probs; + int kmerSize, numKmers, confidenceThreshold, iters; string bootstrapResults(vector, int, int); int getMostProbableTaxonomy(vector); diff --git a/classifyseqscommand.cpp b/classifyseqscommand.cpp index eeef1cc..7221ac0 100644 --- a/classifyseqscommand.cpp +++ b/classifyseqscommand.cpp @@ -25,7 +25,7 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option){ else { //valid paramters for this command - string AlignArray[] = {"template","fasta","search","ksize","method","processors","taxonomy","match","mismatch","gapopen","gapextend","numwanted","cutoff","probs"}; + string AlignArray[] = {"template","fasta","search","ksize","method","processors","taxonomy","match","mismatch","gapopen","gapextend","numwanted","cutoff","probs","iters"}; vector myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string))); OptionParser parser(option); @@ -97,6 +97,10 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option){ temp = validParameter.validFile(parameters, "probs", false); if (temp == "not found"){ temp = "true"; } probs = isTrue(temp); + + temp = validParameter.validFile(parameters, "iters", false); if (temp == "not found") { temp = "100"; } + convert(temp, iters); + if ((method == "bayesian") && (search != "kmer")) { @@ -134,11 +138,12 @@ void ClassifySeqsCommand::help(){ mothurOut("The processors parameter allows you to specify the number of processors to use. The default is 1.\n"); mothurOut("The match parameter allows you to specify the bonus for having the same base. The default is 1.0.\n"); mothurOut("The mistmatch parameter allows you to specify the penalty for having different bases. The default is -1.0.\n"); - mothurOut("The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -1.0.\n"); - mothurOut("The gapextend parameter allows you to specify the penalty for extending a gap in an alignment. The default is -2.0.\n"); + mothurOut("The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -2.0.\n"); + mothurOut("The gapextend parameter allows you to specify the penalty for extending a gap in an alignment. The default is -1.0.\n"); mothurOut("The numwanted parameter allows you to specify the number of sequence matches you want with the knn method. The default is 10.\n"); mothurOut("The cutoff parameter allows you to specify a bootstrap confidence threshold for your taxonomy. The default is 0.\n"); mothurOut("The probs parameter shut off the bootstrapping results for the bayesian method. The default is true, meaning you want the bootstrapping to be run.\n"); + mothurOut("The iters parameter allows you to specify how many iterations to do when calculating the bootstrap confidence score for your taxonomy with the bayesian method. The default is 100.\n"); mothurOut("The classify.seqs command should be in the following format: \n"); mothurOut("classify.seqs(template=yourTemplateFile, fasta=yourFastaFile, method=yourClassificationMethod, search=yourSearchmethod, ksize=yourKmerSize, taxonomy=yourTaxonomyFile, processors=yourProcessors) \n"); mothurOut("Example classify.seqs(fasta=amazon.fasta, template=core.filtered, method=knn, search=gotoh, ksize=8, processors=2)\n"); @@ -159,19 +164,19 @@ int ClassifySeqsCommand::execute(){ try { if (abort == true) { return 0; } - if(method == "bayesian") { classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, probs); } + if(method == "bayesian") { classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters); } else if(method == "knn") { classify = new Knn(taxonomyFileName, templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch, numWanted); } else { mothurOut(search + " is not a valid method option. I will run the command using bayesian."); mothurOutEndLine(); - classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, probs); + classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters); } int numFastaSeqs = 0; - string newTaxonomyFile = getRootName(fastaFileName) + "taxonomy"; + string newTaxonomyFile = getRootName(fastaFileName) + getRootName(taxonomyFileName) + "taxonomy"; string tempTaxonomyFile = getRootName(fastaFileName) + "taxonomy.temp"; - string taxSummary = getRootName(fastaFileName) + "tax.summary"; + string taxSummary = getRootName(fastaFileName) + getRootName(taxonomyFileName) + "tax.summary"; int start = time(NULL); #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) @@ -356,7 +361,13 @@ int ClassifySeqsCommand::driver(linePair* line, string taxFName, string tempTFNa taxonomy = classify->getTaxonomy(candidateSeq); if (taxonomy != "bad seq") { - outTax << candidateSeq->getName() << '\t' << taxonomy << endl; + //output confidence scores or not + if (probs) { + outTax << candidateSeq->getName() << '\t' << taxonomy << endl; + }else{ + outTax << candidateSeq->getName() << '\t' << classify->getSimpleTax() << endl; + } + outTaxSimple << candidateSeq->getName() << '\t' << classify->getSimpleTax() << endl; } } diff --git a/classifyseqscommand.h b/classifyseqscommand.h index 993968b..2c6390c 100644 --- a/classifyseqscommand.h +++ b/classifyseqscommand.h @@ -45,7 +45,7 @@ private: Classify* classify; string fastaFileName, templateFileName, distanceFileName, search, method, taxonomyFileName; - int processors, kmerSize, numWanted, cutoff; + int processors, kmerSize, numWanted, cutoff, iters; float match, misMatch, gapOpen, gapExtend; bool abort, probs; diff --git a/commandfactory.cpp b/commandfactory.cpp index d2b22d5..2417d8d 100644 --- a/commandfactory.cpp +++ b/commandfactory.cpp @@ -62,6 +62,7 @@ #include "phylotypecommand.h" #include "mgclustercommand.h" #include "preclustercommand.h" +#include "pcacommand.h" /*******************************************************/ @@ -133,6 +134,7 @@ CommandFactory::CommandFactory(){ commands["phylotype"] = "phylotype"; commands["mgcluster"] = "mgcluster"; commands["pre.cluster"] = "pre.cluster"; + commands["pca"] = "pca"; } /***********************************************************/ @@ -202,6 +204,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString){ else if(commandName == "phylotype") { command = new PhylotypeCommand(optionString); } else if(commandName == "mgcluster") { command = new MGClusterCommand(optionString); } else if(commandName == "pre.cluster") { command = new PreClusterCommand(optionString); } + else if(commandName == "pca") { command = new PCACommand(optionString); } else { command = new NoCommand(optionString); } return command; diff --git a/getoturepcommand.cpp b/getoturepcommand.cpp index ed26cbe..41cc6b5 100644 --- a/getoturepcommand.cpp +++ b/getoturepcommand.cpp @@ -42,7 +42,7 @@ GetOTURepCommand::GetOTURepCommand(string option){ help(); abort = true; } else { //valid paramters for this command - string Array[] = {"fasta","list","label","name", "group", "sorted"}; + string Array[] = {"fasta","list","label","name", "group", "sorted", "phylip","column"}; vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); OptionParser parser(option); @@ -69,6 +69,19 @@ GetOTURepCommand::GetOTURepCommand(string option){ listfile = validParameter.validFile(parameters, "list", true); if (listfile == "not found") { mothurOut("list is a required parameter for the get.oturep command."); mothurOutEndLine(); abort = true; } else if (listfile == "not open") { abort = true; } + + phylipfile = validParameter.validFile(parameters, "phylip", true); + if (phylipfile == "not found") { phylipfile = ""; } + else if (phylipfile == "not open") { abort = true; } + + columnfile = validParameter.validFile(parameters, "column", true); + if (columnfile == "not found") { columnfile = ""; } + else if (columnfile == "not open") { abort = true; } + + if ((phylipfile == "") && (columnfile == "")) { mothurOut("When executing a get.oturep command you must enter a phylip or a column."); mothurOutEndLine(); abort = true; } + else if ((phylipfile != "") && (columnfile != "")) { mothurOut("When executing a get.oturep command you must enter ONLY ONE of the following: phylip or column."); mothurOutEndLine(); abort = true; } + + if (columnfile != "") { if (namefile == "") { cout << "You need to provide a namefile if you are going to use the column format." << endl; abort = true; } } //check for optional parameter and set defaults // ...at some point should added some additional type checking... @@ -110,10 +123,6 @@ GetOTURepCommand::GetOTURepCommand(string option){ } if (abort == false) { - - if(globaldata->gSparseMatrix != NULL) { - matrix = globaldata->gSparseMatrix; - } //globaldata->gListVector bin 0 = first name read in distance matrix, globaldata->gListVector bin 1 = second name read in distance matrix if (globaldata->gListVector != NULL) { @@ -132,15 +141,6 @@ GetOTURepCommand::GetOTURepCommand(string option){ } } else { mothurOut("error, no listvector."); mothurOutEndLine(); } - // Create a data structure to quickly access the distance information. - // It consists of a vector of distance maps, where each map contains - // all distances of a certain sequence. Vector and maps are accessed - // via the index of a sequence in the distance matrix - seqVec = vector(globaldata->gListVector->size()); - for (MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++) { - seqVec[currentCell->row][currentCell->column] = currentCell->dist; - } - fasta = new FastaMap(); } } @@ -191,22 +191,21 @@ int GetOTURepCommand::execute(){ try { if (abort == true) { return 0; } - int error; //read fastafile fasta->readFastaFile(fastafile); - in.close(); - - //set format to list so input can get listvector - globaldata->setFormat("list"); + //in.close(); + //read distance file and generate indexed distance file that can be used by findrep function +//....new reading class....// //if user gave a namesfile then use it - if (namesfile != "") { - readNamesFile(); - } + if (namesfile != "") { readNamesFile(); } + //set format to list so input can get listvector + globaldata->setFormat("list"); + //read list file read = new ReadOTUFile(listfile); read->read(&*globaldata); @@ -274,8 +273,6 @@ int GetOTURepCommand::execute(){ if (error == 1) { return 0; } //there is an error in hte input files, abort command } - delete matrix; - globaldata->gSparseMatrix = NULL; delete list; globaldata->gListVector = NULL; @@ -371,7 +368,7 @@ string GetOTURepCommand::findRep(int bin, string& group, ListVector* thisList, i SeqMap::iterator it; SeqMap currMap; for (size_t i=0; i < seqIndex.size(); i++) { - currMap = seqVec[seqIndex[i]]; + //currMap = seqVec[seqIndex[i]]; for (size_t j=0; j < seqIndex.size(); j++) { it = currMap.find(seqIndex[j]); if (it != currMap.end()) { diff --git a/getoturepcommand.h b/getoturepcommand.h index d92df1a..58fec8d 100644 --- a/getoturepcommand.h +++ b/getoturepcommand.h @@ -13,7 +13,6 @@ #include "command.hpp" #include "globaldata.hpp" -#include "sparsematrix.hpp" #include "listvector.hpp" #include "inputdata.h" #include "readotu.h" @@ -44,25 +43,18 @@ public: private: GlobalData* globaldata; - SparseMatrix* matrix; ListVector* list; ReadOTUFile* read; InputData* input; FastaMap* fasta; GroupMap* groupMap; - string filename, fastafile, listfile, namesfile, groupfile, label, sorted; + string filename, fastafile, listfile, namesfile, groupfile, label, sorted, phylipfile, columnfile, namefile; ofstream out; ifstream in, inNames; - bool groupError; - - bool abort, allLines; + bool abort, allLines, groupError; set labels; //holds labels to be used map nameToIndex; //maps sequence name to index in sparsematrix - vector seqVec; // contains maps with sequence index and distance - // for all distances related to a certain sequence - - void readNamesFile(); int process(ListVector*); string findRep(int, string&, ListVector*, int&); // returns the name of the "representative" sequence of given bin, diff --git a/pcacommand.cpp b/pcacommand.cpp new file mode 100644 index 0000000..ef89ae5 --- /dev/null +++ b/pcacommand.cpp @@ -0,0 +1,597 @@ + +/* + * pcacommand.cpp + * Mothur + * + * Created by westcott on 1/4/10. + * Copyright 2010 Schloss Lab. All rights reserved. + * + */ + +#include "pcacommand.h" + +//********************************************************************************************************************** + +PCACommand::PCACommand(string option){ + try { + abort = false; + + //allow user to run help + if(option == "help") { help(); abort = true; } + + else { + //valid paramters for this command + string Array[] = {"phylip","lt"}; + vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); + + OptionParser parser(option); + map parameters = parser. getParameters(); + + ValidParameters validParameter; + + //check to make sure all parameters are valid for command + for (map::iterator it = parameters.begin(); it != parameters.end(); it++) { + if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; } + } + + //required parameters + phylipfile = validParameter.validFile(parameters, "phylip", true); + if (phylipfile == "not open") { abort = true; } + else if (phylipfile == "not found") { phylipfile = ""; abort = true; } + else { filename = phylipfile; } + + //columnfile = validParameter.validFile(parameters, "column", true); + //if (columnfile == "not open") { abort = true; } + //else if (columnfile == "not found") { columnfile = ""; } + //else { format = "column"; } + + //namefile = validParameter.validFile(parameters, "name", true); + //if (namefile == "not open") { abort = true; } + //else if (namefile == "not found") { namefile = ""; } + + + //error checking on files + if (phylipfile == "") { mothurOut("You must provide a distance file before running the pca command."); mothurOutEndLine(); abort = true; } + //if ((phylipfile == "") && (columnfile == "")) { mothurOut("You must provide a distance file before running the pca command."); mothurOutEndLine(); abort = true; } + //else if ((phylipfile != "") && (columnfile != "")) { mothurOut("You may not use both the column and the phylip parameters."); mothurOutEndLine(); abort = true; } + + //if (columnfile != "") { + // if (namefile == "") { mothurOut("You need to provide a namefile if you are going to use the column format."); mothurOutEndLine(); abort = true; } + //} + + string temp = validParameter.validFile(parameters, "lt", false); if (temp == "not found") { temp = "false"; } + bool lt = isTrue(temp); + + if (lt) { matrix = 2; } + else { matrix = 1; } + + + } + + } + catch(exception& e) { + errorOut(e, "PCACommand", "PCACommand"); + exit(1); + } +} +//********************************************************************************************************************** +void PCACommand::help(){ + try { + + mothurOut("The pca command..."); mothurOutEndLine(); + } + catch(exception& e) { + errorOut(e, "PCACommand", "help"); + exit(1); + } +} +//********************************************************************************************************************** +PCACommand::~PCACommand(){} +//********************************************************************************************************************** +int PCACommand::execute(){ + try { + + if (abort == true) { return 0; } + + cout.setf(ios::fixed, ios::floatfield); + cout.setf(ios::showpoint); + cerr.setf(ios::fixed, ios::floatfield); + cerr.setf(ios::showpoint); + + vector names; + vector > D; + + fbase = filename; + if(fbase.find_last_of(".")!=string::npos){ + fbase.erase(fbase.find_last_of(".")+1); + } + else{ + fbase += "."; + } + read(filename, matrix, names, D); + + double offset = 0.0000; + vector d; + vector e; + vector > G = D; + vector > copy_G; + int rank = D.size(); + + cout << "\nProcessing...\n"; + + for(int count=0;count<2;count++){ + recenter(offset, D, G); + tred2(G, d, e); + qtli(d, e, G); + offset = d[d.size()-1]; + if(offset > 0.0) break; + } + + + output(fbase, names, G, d); + + return 0; + } + catch(exception& e) { + errorOut(e, "PCACommand", "execute"); + exit(1); + } +} +/*********************************************************************************************************************************/ + +inline double SIGN(const double a, const double b) +{ + return b>=0 ? (a>=0 ? a:-a) : (a>=0 ? -a:a); +} + +/*********************************************************************************************************************************/ + +void PCACommand::get_comment(istream& f, char begin, char end){ + try { + char d=f.get(); + while(d != end){ d = f.get(); } + d = f.peek(); + } + catch(exception& e) { + errorOut(e, "PCACommand", "get_comment"); + exit(1); + } +} + +/*********************************************************************************************************************************/ + +void PCACommand::read_mega(istream& f, int square_m, vector& name_list, vector >& d){ + try { + get_comment(f, '#', '\n'); + + char test = f.peek(); + + while(test == '!'){ //get header comments + get_comment(f, '!', ';'); + while(isspace(test=f.get())) {;} + f.putback(test); + test = f.peek(); + } + while(test != '\n'){ //get sequence names + get_comment(f, '[', ']'); + char d = f.get(); + d = f.get(); + if(d == '#'){ + string name; + f >> name; + name_list.push_back(name); + while(isspace(test=f.get())) {;} + f.putback(test); + } + else{ + break; + } + } + int rank = name_list.size(); + d.resize(rank); + for(int i=0;i> d[i][j]; + if (d[i][j] == -0.0000) + d[i][j] = 0.0000; + d[j][i]=d[i][j]; + } + } + } + catch(exception& e) { + errorOut(e, "PCACommand", "read_mega"); + exit(1); + } +} + +/*********************************************************************************************************************************/ + +void PCACommand::read_phylip(istream& f, int square_m, vector& name_list, vector >& d){ + try { + // int count1=0; + // int count2=0; + + int rank; + f >> rank; + + name_list.resize(rank); + d.resize(rank); + if(square_m == 1){ + for(int i=0;i> name_list[i]; + // cout << i << "\t" << name_list[i] << endl; + for(int j=0;j> d[i][j]; + if (d[i][j] == -0.0000) + d[i][j] = 0.0000; + } + } + } + else if(square_m == 2){ + for(int i=0;i> name_list[0]; + for(int i=1;i> name_list[i]; + d[i][i]=0.0000; + for(int j=0;j> d[i][j]; + if (d[i][j] == -0.0000) + d[i][j] = 0.0000; + d[j][i]=d[i][j]; + } + } + } + } + catch(exception& e) { + errorOut(e, "PCACommand", "read_phylip"); + exit(1); + } + +} + +/*********************************************************************************************************************************/ + +void PCACommand::read(string fname, int m, vector& names, vector >& D){ + try { + ifstream f(fname.c_str()); + if(!f) { + cerr << "Error: Could not open " << fname << endl; + exit(1); + } + char test = f.peek(); + + if(test == '#'){ + read_mega(f, m, names, D); + } + else{ + read_phylip(f, m, names, D); + } + + int rank = D.size(); + } + catch(exception& e) { + errorOut(e, "PCACommand", "read"); + exit(1); + } +} + +/*********************************************************************************************************************************/ +double PCACommand::pythag(double a, double b){ + return(pow(a*a+b*b,0.5)); +} +/*********************************************************************************************************************************/ + +void PCACommand::matrix_mult(vector > first, vector > second, vector >& product){ + try { + int first_rows = first.size(); + int first_cols = first[0].size(); + int second_cols = second[0].size(); + + product.resize(first_rows); + for(int i=0;i > D, vector >& G){ + try { + int rank = D.size(); + + vector > A(rank); + vector > C(rank); + for(int i=0;i >& a, vector& d, vector& e){ + try { + double scale, hh, h, g, f; + + int n = a.size(); + + d.resize(n); + e.resize(n); + + for(int i=n-1;i>0;i--){ + int l=i-1; + h = scale = 0.0000; + if(l>0){ + for(int k=0;k= 0.0 ? -sqrt(h) : sqrt(h)); + e[i] = scale * g; + h -= f * g; + a[i][l] = f - g; + f = 0.0; + for(int j=0;j& d, vector& e, vector >& z) { + try { + int m, i, iter; + double s, r, p, g, f, dd, c, b; + + int n = d.size(); + for(int i=1;i<=n;i++){ + e[i-1] = e[i]; + } + e[n-1] = 0.0000; + + for(int l=0;l=l;i--){ + f = s * e[i]; + b = c * e[i]; + e[i+1] = (r=pythag(f,g)); + if(r==0.0){ + d[i+1] -= p; + e[m] = 0.0000; + break; + } + s = f / r; + c = g / r; + g = d[i+1] - p; + r = (d[i] - g) * s + 2.0 * c * b; + d[i+1] = g + ( p = s * r); + g = c * r - b; + for(int k=0;k= l) continue; + d[l] -= p; + e[l] = g; + e[m] = 0.0; + } + } while (m != l); + } + + int k; + for(int i=0;i= p){ + p=d[k=j]; + } + } + if(k!=i){ + d[k]=d[i]; + d[i]=p; + for(int j=0;j name_list, vector > G, vector d) { + try { + int rank = name_list.size(); + double dsum = 0.0000; + for(int i=0;i= 0) { G[i][j] *= pow(d[j],0.5); } + else { G[i][j] = 0.00000; } + } + } + + ofstream pcaData((fnameRoot+"pca").c_str(), ios::trunc); + pcaData.setf(ios::fixed, ios::floatfield); + pcaData.setf(ios::showpoint); + + ofstream pcaLoadings((fnameRoot+"pca.loadings").c_str(), ios::trunc); + pcaLoadings.setf(ios::fixed, ios::floatfield); + pcaLoadings.setf(ios::showpoint); + + pcaLoadings << "axis\tloading\n"; + for(int i=0;i > A) { + try { + int rank = A.size(); + for(int i=0;i&, vector >&); + void read_phylip(istream&, int, vector&, vector >&); + void read(string, int, vector&, vector >&); + double pythag(double, double); + void matrix_mult(vector >, vector >, vector >&); + void recenter(double, vector >, vector >&); + void tred2(vector >&, vector&, vector&); + void qtli(vector&, vector&, vector >&); + void output(string, vector, vector >, vector); + void print_matrix(vector >); + +}; + +/*****************************************************************/ + +#endif + diff --git a/readdistcommand.cpp b/readdistcommand.cpp index f6cf5db..7e81d7f 100644 --- a/readdistcommand.cpp +++ b/readdistcommand.cpp @@ -206,6 +206,7 @@ int ReadDistCommand::execute(){ if (globaldata->gSparseMatrix != NULL) { delete globaldata->gSparseMatrix; } globaldata->gSparseMatrix = read->getMatrix(); numDists = globaldata->gSparseMatrix->getNNodes(); + cout << "matrix contains " << numDists << " distances." << endl; int lines = cutoff / (1.0/precision); vector dist_cutoff(lines+1,0); diff --git a/unifracunweightedcommand.cpp b/unifracunweightedcommand.cpp index 54a4426..326e695 100644 --- a/unifracunweightedcommand.cpp +++ b/unifracunweightedcommand.cpp @@ -132,7 +132,7 @@ int UnifracUnweightedCommand::execute() { //get unweighted scores for random trees for (int j = 0; j < iters; j++) { - //we need a different getValues because when we swap the labels we only want to swap those in each parwise comparison + //we need a different getValues because when we swap the labels we only want to swap those in each pairwise comparison randomData = unweighted->getValues(T[i], "", ""); for(int k = 0; k < numComp; k++) { @@ -147,7 +147,6 @@ int UnifracUnweightedCommand::execute() { //add randoms score to validscores validScores[randomData[k]] = randomData[k]; } - } for(int a = 0; a < numComp; a++) {