From: westcott <westcott>
Date: Mon, 4 Jan 2010 18:12:00 +0000 (+0000)
Subject: added pca command
X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=f27b66ce6415eb14c434f9850019c7cf140e023e

added pca command
---

diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj
index 1fbb5b7..16d9d8e 100644
--- a/Mothur.xcodeproj/project.pbxproj
+++ b/Mothur.xcodeproj/project.pbxproj
@@ -180,6 +180,7 @@
 		A787844510A1EBDD0086103D /* knn.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A787844410A1EBDD0086103D /* knn.cpp */; };
 		A7B04493106CC3E60046FC83 /* chimeraslayer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7B04492106CC3E60046FC83 /* chimeraslayer.cpp */; };
 		A7B0450E106CEEC90046FC83 /* slayer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7B0450D106CEEC90046FC83 /* slayer.cpp */; };
+		A7D176CF10F2558500159497 /* pcacommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7D176CE10F2558500159497 /* pcacommand.cpp */; };
 		A7E0AAB610D2886D00CF407B /* mgclustercommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E0AAB510D2886D00CF407B /* mgclustercommand.cpp */; };
 		A7E4A783106913F900688F62 /* getsharedotucommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E4A782106913F900688F62 /* getsharedotucommand.cpp */; };
 		A7E4A824106A9AD700688F62 /* maligner.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E4A823106A9AD700688F62 /* maligner.cpp */; };
@@ -571,6 +572,8 @@
 		A7B04492106CC3E60046FC83 /* chimeraslayer.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimeraslayer.cpp; sourceTree = SOURCE_ROOT; };
 		A7B0450C106CEEC90046FC83 /* slayer.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = slayer.h; sourceTree = SOURCE_ROOT; };
 		A7B0450D106CEEC90046FC83 /* slayer.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = slayer.cpp; sourceTree = SOURCE_ROOT; };
+		A7D176CD10F2558500159497 /* pcacommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = pcacommand.h; sourceTree = SOURCE_ROOT; };
+		A7D176CE10F2558500159497 /* pcacommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = pcacommand.cpp; sourceTree = SOURCE_ROOT; };
 		A7E0AAB410D2886D00CF407B /* mgclustercommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mgclustercommand.h; sourceTree = "<group>"; };
 		A7E0AAB510D2886D00CF407B /* mgclustercommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mgclustercommand.cpp; sourceTree = "<group>"; };
 		A7E4A781106913F900688F62 /* getsharedotucommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getsharedotucommand.h; sourceTree = SOURCE_ROOT; };
@@ -932,6 +935,8 @@
 				375873F60F7D649C0040F377 /* nocommands.cpp */,
 				3792946E0F2E191800B9034A /* parsimonycommand.h */,
 				3792946F0F2E191800B9034A /* parsimonycommand.cpp */,
+				A7D176CD10F2558500159497 /* pcacommand.h */,
+				A7D176CE10F2558500159497 /* pcacommand.cpp */,
 				A773808E10B6EFD1002A6A61 /* phylotypecommand.h */,
 				A773808F10B6EFD1002A6A61 /* phylotypecommand.cpp */,
 				A7027F2610DFF86D00BF45FE /* preclustercommand.h */,
@@ -1305,6 +1310,7 @@
 				A727E84A10D14568001A8432 /* readblast.cpp in Sources */,
 				A7E0AAB610D2886D00CF407B /* mgclustercommand.cpp in Sources */,
 				A7027F2810DFF86D00BF45FE /* preclustercommand.cpp in Sources */,
+				A7D176CF10F2558500159497 /* pcacommand.cpp in Sources */,
 			);
 			runOnlyForDeploymentPostprocessing = 0;
 		};
diff --git a/aligncommand.cpp b/aligncommand.cpp
index 96e1a78..f0f7243 100644
--- a/aligncommand.cpp
+++ b/aligncommand.cpp
@@ -93,7 +93,7 @@ AlignCommand::AlignCommand(string option){
 			temp = validParameter.validFile(parameters, "flip", false);			if (temp == "not found"){	temp = "f";				}
 			flip = isTrue(temp); 
 			
-			temp = validParameter.validFile(parameters, "threshold", false);	if (temp == "not found"){	temp = "0.80";			}
+			temp = validParameter.validFile(parameters, "threshold", false);	if (temp == "not found"){	temp = "0.50";			}
 			convert(temp, threshold); 
 			
 			search = validParameter.validFile(parameters, "search", false);		if (search == "not found"){	search = "kmer";		}
@@ -131,12 +131,12 @@ void AlignCommand::help(){
 		mothurOut("The ksize parameter allows you to specify the kmer size for finding most similar template to candidate.  The default is 8.\n");
 		mothurOut("The match parameter allows you to specify the bonus for having the same base. The default is 1.0.\n");
 		mothurOut("The mistmatch parameter allows you to specify the penalty for having different bases.  The default is -1.0.\n");
-		mothurOut("The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -1.0.\n");
-		mothurOut("The gapextend parameter allows you to specify the penalty for extending a gap in an alignment.  The default is -2.0.\n");
+		mothurOut("The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -2.0.\n");
+		mothurOut("The gapextend parameter allows you to specify the penalty for extending a gap in an alignment.  The default is -1.0.\n");
 		mothurOut("The flip parameter is used to specify whether or not you want mothur to try the reverse compement if a sequence falls below the threshold.  The default is false.\n");
 		mothurOut("The threshold is used to specify a cutoff at which an alignment is deemed 'bad' and the reverse complement may be tried. \n");
 		mothurOut("If the flip parameter is set to true the reverse complement of the sequence is aligned and the better alignment is reported.\n");
-		mothurOut("The default for the threshold parameter is 0.80, meaning at least 80% of the bases must remain or the sequence is reported as potentially reversed.\n");
+		mothurOut("The default for the threshold parameter is 0.50, meaning at least 50% of the bases must remain or the sequence is reported as potentially reversed.\n");
 		mothurOut("The align.seqs command should be in the following format: \n");
 		mothurOut("align.seqs(template=yourTemplateFile, candidate=yourCandidateFile, align=yourAlignmentMethod, search=yourSearchmethod, ksize=yourKmerSize, match=yourMatchBonus, mismatch=yourMismatchpenalty, gapopen=yourGapopenPenalty, gapextend=yourGapExtendPenalty) \n");
 		mothurOut("Example align.seqs(candidate=candidate.fasta, template=core.filtered, align=kmer, search=gotoh, ksize=8, match=2.0, mismatch=3.0, gapopen=-2.0, gapextend=-1.0)\n");
@@ -231,6 +231,15 @@ int AlignCommand::execute(){
 			rename((alignFileName + toString(processIDS[0]) + ".temp").c_str(), alignFileName.c_str());
 			rename((reportFileName + toString(processIDS[0]) + ".temp").c_str(), reportFileName.c_str());
 			
+			//append alignment and report files
+			for(int i=1;i<processors;i++){
+				appendAlignFiles((alignFileName + toString(processIDS[i]) + ".temp"), alignFileName);
+				remove((alignFileName + toString(processIDS[i]) + ".temp").c_str());
+				
+				appendReportFiles((reportFileName + toString(processIDS[i]) + ".temp"), reportFileName);
+				remove((reportFileName + toString(processIDS[i]) + ".temp").c_str());
+			}
+			
 			vector<string> nonBlankAccnosFiles;
 			//delete blank accnos files generated with multiple processes
 			for(int i=0;i<processors;i++){  
@@ -253,16 +262,6 @@ int AlignCommand::execute(){
 				}else{  mothurOut(" If the reverse compliment proved to be better it was reported.");  }
 				mothurOutEndLine();
 			}
-			
-			//append other files
-			for(int i=1;i<processors;i++){
-				appendAlignFiles((alignFileName + toString(processIDS[i]) + ".temp"), alignFileName);
-				remove((alignFileName + toString(processIDS[i]) + ".temp").c_str());
-				
-				appendReportFiles((reportFileName + toString(processIDS[i]) + ".temp"), reportFileName);
-				remove((reportFileName + toString(processIDS[i]) + ".temp").c_str());
-			}
-			
 		}
 #else
 		ifstream inFASTA;
@@ -316,7 +315,7 @@ int AlignCommand::driver(linePair* line, string alignFName, string reportFName,
 		openInputFile(candidateFileName, inFASTA);
 
 		inFASTA.seekg(line->start);
-
+		
 		for(int i=0;i<line->numSeqs;i++){
 			
 			Sequence* candidateSeq = new Sequence(inFASTA);  gobble(inFASTA);
@@ -392,7 +391,13 @@ int AlignCommand::driver(linePair* line, string alignFName, string reportFName,
 				if (needToDeleteCopy) {   delete copy;   }
 			}
 			delete candidateSeq;
+			
+			//report progress
+			if((i+1) % 100 == 0){	mothurOut(toString(i+1)); mothurOutEndLine();		}
+			
 		}
+		//report progress
+		if((line->numSeqs) % 100 != 0){	mothurOut(toString(line->numSeqs)); mothurOutEndLine();		}
 		
 		alignmentFile.close();
 		inFASTA.close();
diff --git a/bayesian.cpp b/bayesian.cpp
index 5272b2d..e6adfc2 100644
--- a/bayesian.cpp
+++ b/bayesian.cpp
@@ -11,8 +11,8 @@
 #include "kmer.hpp"
 
 /**************************************************************************************************/
-Bayesian::Bayesian(string tfile, string tempFile, string method, int ksize, int cutoff, bool p) : 
-Classify(tfile, tempFile, method, ksize, 0.0, 0.0, 0.0, 0.0), kmerSize(ksize), confidenceThreshold(cutoff), probs(p)  {
+Bayesian::Bayesian(string tfile, string tempFile, string method, int ksize, int cutoff, int i) : 
+Classify(tfile, tempFile, method, ksize, 0.0, 0.0, 0.0, 0.0), kmerSize(ksize), confidenceThreshold(cutoff), iters(i)  {
 	try {
 					
 		numKmers = database->getMaxKmer() + 1;
@@ -116,18 +116,9 @@ string Bayesian::getTaxonomy(Sequence* seq) {
 		int index = getMostProbableTaxonomy(queryKmers);
 					
 		//bootstrap - to set confidenceScore
-		if (probs) {
-			int numToSelect = queryKmers.size() / 8;
-			tax = bootstrapResults(queryKmers, index, numToSelect);
-		}else{
-			TaxNode seqTax = phyloTree->get(index);
-			while (seqTax.level != 0) { //while you are not at the root
-				tax = seqTax.name + ";" + tax;
-				seqTax = phyloTree->get(seqTax.parent);
-			}
-			simpleTax = tax;
-		}
-				
+		int numToSelect = queryKmers.size() / 8;
+		tax = bootstrapResults(queryKmers, index, numToSelect);
+						
 		return tax;	
 	}
 	catch(exception& e) {
@@ -151,7 +142,7 @@ string Bayesian::bootstrapResults(vector<int> kmers, int tax, int numToSelect) {
 		map<string, int>::iterator itBoot2;
 		map<int, int>::iterator itConvert;
 		
-		for (int i = 0; i < 100; i++) {
+		for (int i = 0; i < iters; i++) {
 			vector<int> temp;
 						
 			for (int j = 0; j < numToSelect; j++) {
diff --git a/bayesian.h b/bayesian.h
index 0020962..7100bf3 100644
--- a/bayesian.h
+++ b/bayesian.h
@@ -18,7 +18,7 @@
 class Bayesian : public Classify {
 	
 public:
-	Bayesian(string, string, string, int, int, bool);
+	Bayesian(string, string, string, int, int, int);
 	~Bayesian() {};
 	
 	string getTaxonomy(Sequence*);
@@ -35,8 +35,7 @@ private:
 	vector<int> genusTotals;
 	vector<int> genusNodes;  //indexes in phyloTree where genus' are located
 	
-	int kmerSize, numKmers, confidenceThreshold;
-	bool probs;
+	int kmerSize, numKmers, confidenceThreshold, iters;
 	
 	string bootstrapResults(vector<int>, int, int);
 	int getMostProbableTaxonomy(vector<int>);
diff --git a/classifyseqscommand.cpp b/classifyseqscommand.cpp
index eeef1cc..7221ac0 100644
--- a/classifyseqscommand.cpp
+++ b/classifyseqscommand.cpp
@@ -25,7 +25,7 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option){
 		else {
 			
 			//valid paramters for this command
-			string AlignArray[] =  {"template","fasta","search","ksize","method","processors","taxonomy","match","mismatch","gapopen","gapextend","numwanted","cutoff","probs"};
+			string AlignArray[] =  {"template","fasta","search","ksize","method","processors","taxonomy","match","mismatch","gapopen","gapextend","numwanted","cutoff","probs","iters"};
 			vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
 			
 			OptionParser parser(option);
@@ -97,6 +97,10 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option){
 			
 			temp = validParameter.validFile(parameters, "probs", false);		if (temp == "not found"){	temp = "true";			}
 			probs = isTrue(temp);
+			
+			temp = validParameter.validFile(parameters, "iters", false);		if (temp == "not found") { temp = "100";			}
+			convert(temp, iters); 
+
 
 			
 			if ((method == "bayesian") && (search != "kmer"))  { 
@@ -134,11 +138,12 @@ void ClassifySeqsCommand::help(){
 		mothurOut("The processors parameter allows you to specify the number of processors to use. The default is 1.\n");
 		mothurOut("The match parameter allows you to specify the bonus for having the same base. The default is 1.0.\n");
 		mothurOut("The mistmatch parameter allows you to specify the penalty for having different bases.  The default is -1.0.\n");
-		mothurOut("The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -1.0.\n");
-		mothurOut("The gapextend parameter allows you to specify the penalty for extending a gap in an alignment.  The default is -2.0.\n");
+		mothurOut("The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -2.0.\n");
+		mothurOut("The gapextend parameter allows you to specify the penalty for extending a gap in an alignment.  The default is -1.0.\n");
 		mothurOut("The numwanted parameter allows you to specify the number of sequence matches you want with the knn method.  The default is 10.\n");
 		mothurOut("The cutoff parameter allows you to specify a bootstrap confidence threshold for your taxonomy.  The default is 0.\n");
 		mothurOut("The probs parameter shut off the bootstrapping results for the bayesian method. The default is true, meaning you want the bootstrapping to be run.\n");
+		mothurOut("The iters parameter allows you to specify how many iterations to do when calculating the bootstrap confidence score for your taxonomy with the bayesian method.  The default is 100.\n");
 		mothurOut("The classify.seqs command should be in the following format: \n");
 		mothurOut("classify.seqs(template=yourTemplateFile, fasta=yourFastaFile, method=yourClassificationMethod, search=yourSearchmethod, ksize=yourKmerSize, taxonomy=yourTaxonomyFile, processors=yourProcessors) \n");
 		mothurOut("Example classify.seqs(fasta=amazon.fasta, template=core.filtered, method=knn, search=gotoh, ksize=8, processors=2)\n");
@@ -159,19 +164,19 @@ int ClassifySeqsCommand::execute(){
 	try {
 		if (abort == true) {	return 0;	}
 		
-		if(method == "bayesian")			{	classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, probs);		}
+		if(method == "bayesian")			{	classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters);		}
 		else if(method == "knn")			{	classify = new Knn(taxonomyFileName, templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch, numWanted);				}
 		else {
 			mothurOut(search + " is not a valid method option. I will run the command using bayesian.");
 			mothurOutEndLine();
-			classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, probs);	
+			classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters);	
 		}
 
 		int numFastaSeqs = 0;
 		
-		string newTaxonomyFile = getRootName(fastaFileName) + "taxonomy";
+		string newTaxonomyFile = getRootName(fastaFileName) + getRootName(taxonomyFileName) + "taxonomy";
 		string tempTaxonomyFile = getRootName(fastaFileName) + "taxonomy.temp";
-		string taxSummary = getRootName(fastaFileName) + "tax.summary";
+		string taxSummary = getRootName(fastaFileName) + getRootName(taxonomyFileName) + "tax.summary";
 		
 		int start = time(NULL);
 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
@@ -356,7 +361,13 @@ int ClassifySeqsCommand::driver(linePair* line, string taxFName, string tempTFNa
 				taxonomy = classify->getTaxonomy(candidateSeq);
 				
 				if (taxonomy != "bad seq") {
-					outTax << candidateSeq->getName() << '\t' << taxonomy << endl;
+					//output confidence scores or not
+					if (probs) {
+						outTax << candidateSeq->getName() << '\t' << taxonomy << endl;
+					}else{
+						outTax << candidateSeq->getName() << '\t' << classify->getSimpleTax() << endl;
+					}
+					
 					outTaxSimple << candidateSeq->getName() << '\t' << classify->getSimpleTax() << endl;
 				}
 			}				
diff --git a/classifyseqscommand.h b/classifyseqscommand.h
index 993968b..2c6390c 100644
--- a/classifyseqscommand.h
+++ b/classifyseqscommand.h
@@ -45,7 +45,7 @@ private:
 	Classify* classify;
 	
 	string fastaFileName, templateFileName, distanceFileName, search, method, taxonomyFileName;
-	int processors, kmerSize, numWanted, cutoff;
+	int processors, kmerSize, numWanted, cutoff, iters;
 	float match, misMatch, gapOpen, gapExtend;
 	bool abort, probs;
 	
diff --git a/commandfactory.cpp b/commandfactory.cpp
index d2b22d5..2417d8d 100644
--- a/commandfactory.cpp
+++ b/commandfactory.cpp
@@ -62,6 +62,7 @@
 #include "phylotypecommand.h"
 #include "mgclustercommand.h"
 #include "preclustercommand.h"
+#include "pcacommand.h"
 
 /*******************************************************/
 
@@ -133,6 +134,7 @@ CommandFactory::CommandFactory(){
 	commands["phylotype"]			= "phylotype";
 	commands["mgcluster"]			= "mgcluster";
 	commands["pre.cluster"]			= "pre.cluster";
+	commands["pca"]					= "pca";
 
 }
 /***********************************************************/
@@ -202,6 +204,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString){
 		else if(commandName == "phylotype")				{	command = new PhylotypeCommand(optionString);			}
 		else if(commandName == "mgcluster")				{	command = new MGClusterCommand(optionString);			}
 		else if(commandName == "pre.cluster")			{	command = new PreClusterCommand(optionString);			}
+		else if(commandName == "pca")					{	command = new PCACommand(optionString);					}
 		else											{	command = new NoCommand(optionString);					}
 
 		return command;
diff --git a/getoturepcommand.cpp b/getoturepcommand.cpp
index ed26cbe..41cc6b5 100644
--- a/getoturepcommand.cpp
+++ b/getoturepcommand.cpp
@@ -42,7 +42,7 @@ GetOTURepCommand::GetOTURepCommand(string option){
 			help(); abort = true;
 		} else {
 			//valid paramters for this command
-			string Array[] =  {"fasta","list","label","name", "group", "sorted"};
+			string Array[] =  {"fasta","list","label","name", "group", "sorted", "phylip","column"};
 			vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
 			
 			OptionParser parser(option);
@@ -69,6 +69,19 @@ GetOTURepCommand::GetOTURepCommand(string option){
 			listfile = validParameter.validFile(parameters, "list", true);
 			if (listfile == "not found") { mothurOut("list is a required parameter for the get.oturep command."); mothurOutEndLine(); abort = true; }
 			else if (listfile == "not open") { abort = true; }	
+			
+			phylipfile = validParameter.validFile(parameters, "phylip", true);
+			if (phylipfile == "not found") { phylipfile = "";  }
+			else if (phylipfile == "not open") { abort = true; }	
+			
+			columnfile = validParameter.validFile(parameters, "column", true);
+			if (columnfile == "not found") { columnfile = ""; }
+			else if (columnfile == "not open") { abort = true; }	
+			
+			if ((phylipfile == "") && (columnfile == "")) { mothurOut("When executing a get.oturep command you must enter a phylip or a column."); mothurOutEndLine(); abort = true; }
+			else if ((phylipfile != "") && (columnfile != "")) { mothurOut("When executing a get.oturep command you must enter ONLY ONE of the following: phylip or column."); mothurOutEndLine(); abort = true; }
+		
+			if (columnfile != "") {  if (namefile == "") {  cout << "You need to provide a namefile if you are going to use the column format." << endl; abort = true; }  }
 
 			//check for optional parameter and set defaults
 			// ...at some point should added some additional type checking...
@@ -110,10 +123,6 @@ GetOTURepCommand::GetOTURepCommand(string option){
 			}
 			
 			if (abort == false) {
-			
-				if(globaldata->gSparseMatrix != NULL)	{
-					matrix = globaldata->gSparseMatrix;
-				}
 
 				//globaldata->gListVector bin 0 = first name read in distance matrix, globaldata->gListVector bin 1 = second name read in distance matrix
 				if (globaldata->gListVector != NULL) {
@@ -132,15 +141,6 @@ GetOTURepCommand::GetOTURepCommand(string option){
 					}
 				} else { mothurOut("error, no listvector."); mothurOutEndLine(); }
 
-				// Create a data structure to quickly access the distance information.
-				// It consists of a vector of distance maps, where each map contains
-				// all distances of a certain sequence. Vector and maps are accessed
-				// via the index of a sequence in the distance matrix
-				seqVec = vector<SeqMap>(globaldata->gListVector->size()); 
-				for (MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++) {
-					seqVec[currentCell->row][currentCell->column] = currentCell->dist;
-				}
-
 				fasta = new FastaMap();
 			}
 		}
@@ -191,22 +191,21 @@ int GetOTURepCommand::execute(){
 	try {
 	
 		if (abort == true) { return 0; }
-
 		int error;
 		
 		//read fastafile
 		fasta->readFastaFile(fastafile);
 		
-		in.close();
-		
-		//set format to list so input can get listvector
-		globaldata->setFormat("list");
+		//in.close();
+		//read distance file and generate indexed distance file that can be used by findrep function
+//....new reading class....//
 		
 		//if user gave a namesfile then use it
-		if (namesfile != "") {
-			readNamesFile();
-		}
+		if (namesfile != "") {	readNamesFile();	}
 		
+		//set format to list so input can get listvector
+		globaldata->setFormat("list");
+
 		//read list file
 		read = new ReadOTUFile(listfile);
 		read->read(&*globaldata); 
@@ -274,8 +273,6 @@ int GetOTURepCommand::execute(){
 			if (error == 1) { return 0; } //there is an error in hte input files, abort command
 		}
 		
-		delete matrix;
-		globaldata->gSparseMatrix = NULL;
 		delete list;
 		globaldata->gListVector = NULL;
 
@@ -371,7 +368,7 @@ string GetOTURepCommand::findRep(int bin, string& group, ListVector* thisList, i
 			SeqMap::iterator it;
 			SeqMap currMap;
 			for (size_t i=0; i < seqIndex.size(); i++) {
-				currMap = seqVec[seqIndex[i]];
+	//currMap = seqVec[seqIndex[i]];
 				for (size_t j=0; j < seqIndex.size(); j++) {
 					it = currMap.find(seqIndex[j]);		
 					if (it != currMap.end()) {
diff --git a/getoturepcommand.h b/getoturepcommand.h
index d92df1a..58fec8d 100644
--- a/getoturepcommand.h
+++ b/getoturepcommand.h
@@ -13,7 +13,6 @@
 
 #include "command.hpp"
 #include "globaldata.hpp"
-#include "sparsematrix.hpp"
 #include "listvector.hpp"
 #include "inputdata.h"
 #include "readotu.h"
@@ -44,25 +43,18 @@ public:
 
 private:
 	GlobalData* globaldata;
-	SparseMatrix* matrix;
 	ListVector* list;
 	ReadOTUFile* read;
 	InputData* input;
 	FastaMap* fasta;
 	GroupMap* groupMap;
-	string filename, fastafile, listfile, namesfile, groupfile, label, sorted;
+	string filename, fastafile, listfile, namesfile, groupfile, label, sorted, phylipfile, columnfile, namefile;
 	ofstream out;
 	ifstream in, inNames;
-	bool groupError;
-
-	bool abort, allLines;
+	bool abort, allLines, groupError;
 	set<string> labels; //holds labels to be used
 	map<string, int> nameToIndex;  //maps sequence name to index in sparsematrix
 
-	vector<SeqMap> seqVec;			// contains maps with sequence index and distance
-									// for all distances related to a certain sequence
-
-
 	void readNamesFile();
 	int process(ListVector*);
 	string findRep(int, string&, ListVector*, int&); 	// returns the name of the "representative" sequence of given bin, 
diff --git a/pcacommand.cpp b/pcacommand.cpp
new file mode 100644
index 0000000..ef89ae5
--- /dev/null
+++ b/pcacommand.cpp
@@ -0,0 +1,597 @@
+
+/*
+ *  pcacommand.cpp
+ *  Mothur
+ *
+ *  Created by westcott on 1/4/10.
+ *  Copyright 2010 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "pcacommand.h"
+
+//**********************************************************************************************************************
+
+PCACommand::PCACommand(string option){
+	try {
+		abort = false;
+		
+		//allow user to run help
+		if(option == "help") { help(); abort = true; }
+		
+		else {
+			//valid paramters for this command
+			string Array[] =  {"phylip","lt"};
+			vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
+			
+			OptionParser parser(option);
+			map<string, string> parameters = parser. getParameters();
+			
+			ValidParameters validParameter;
+		
+			//check to make sure all parameters are valid for command
+			for (map<string, string>::iterator it = parameters.begin(); it != parameters.end(); it++) { 
+				if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
+			}
+			
+			//required parameters
+			phylipfile = validParameter.validFile(parameters, "phylip", true);
+			if (phylipfile == "not open") { abort = true; }
+			else if (phylipfile == "not found") { phylipfile = ""; abort = true; }	
+			else {	filename = phylipfile;  }
+			
+			//columnfile = validParameter.validFile(parameters, "column", true);
+			//if (columnfile == "not open") { abort = true; }	
+			//else if (columnfile == "not found") { columnfile = ""; }
+			//else {  format = "column";	}
+			
+			//namefile = validParameter.validFile(parameters, "name", true);
+			//if (namefile == "not open") { abort = true; }	
+			//else if (namefile == "not found") { namefile = ""; }
+			
+			
+			//error checking on files	
+			if (phylipfile == "")	{ mothurOut("You must provide a distance file before running the pca command."); mothurOutEndLine(); abort = true; }		
+			//if ((phylipfile == "") && (columnfile == ""))	{ mothurOut("You must provide a distance file before running the pca command."); mothurOutEndLine(); abort = true; }
+			//else if ((phylipfile != "") && (columnfile != "")) { mothurOut("You may not use both the column and the phylip parameters."); mothurOutEndLine(); abort = true; }
+			
+			//if (columnfile != "") {
+			//	if (namefile == "") {  mothurOut("You need to provide a namefile if you are going to use the column format."); mothurOutEndLine(); abort = true; }
+			//}
+			
+			string temp = validParameter.validFile(parameters, "lt", false);				if (temp == "not found") { temp = "false"; }
+			bool lt = isTrue(temp);
+			
+			if (lt)		{  matrix = 2;	}
+			else		{  matrix = 1;  }
+
+
+		}
+
+	}
+	catch(exception& e) {
+		errorOut(e, "PCACommand", "PCACommand");
+		exit(1);
+	}
+}
+//**********************************************************************************************************************
+void PCACommand::help(){
+	try {
+	
+		mothurOut("The pca command..."); mothurOutEndLine();
+	}
+	catch(exception& e) {
+		errorOut(e, "PCACommand", "help");
+		exit(1);
+	}
+}
+//**********************************************************************************************************************
+PCACommand::~PCACommand(){}
+//**********************************************************************************************************************
+int PCACommand::execute(){
+	try {
+	
+		if (abort == true) { return 0; }
+		
+		cout.setf(ios::fixed, ios::floatfield);
+		cout.setf(ios::showpoint);
+		cerr.setf(ios::fixed, ios::floatfield);
+		cerr.setf(ios::showpoint);
+		
+		vector<string> names;
+		vector<vector<double> > D;
+	
+		fbase = filename;
+		if(fbase.find_last_of(".")!=string::npos){
+			fbase.erase(fbase.find_last_of(".")+1); 
+		}
+		else{
+			fbase += ".";
+		}
+		read(filename, matrix, names, D);
+   	
+		double offset = 0.0000;
+		vector<double> d;
+		vector<double> e;
+		vector<vector<double> > G = D;
+		vector<vector<double> > copy_G;
+		int rank = D.size();
+		
+		cout << "\nProcessing...\n";
+		
+		for(int count=0;count<2;count++){
+			recenter(offset, D, G);   
+			tred2(G, d, e);
+			qtli(d, e, G);
+			offset = d[d.size()-1];
+			if(offset > 0.0) break;
+		} 
+		
+		
+		output(fbase, names, G, d);
+		
+		return 0;
+	}
+	catch(exception& e) {
+		errorOut(e, "PCACommand", "execute");
+		exit(1);
+	}
+}
+/*********************************************************************************************************************************/
+
+inline double SIGN(const double a, const double b)
+{
+    return b>=0 ? (a>=0 ? a:-a) : (a>=0 ? -a:a);
+}
+
+/*********************************************************************************************************************************/
+
+void PCACommand::get_comment(istream& f, char begin, char end){
+	try {
+		char d=f.get();
+		while(d != end){	d = f.get();	}
+		d = f.peek();
+	}
+	catch(exception& e) {
+		errorOut(e, "PCACommand", "get_comment");
+		exit(1);
+	}
+}	
+
+/*********************************************************************************************************************************/
+
+void PCACommand::read_mega(istream& f, int square_m, vector<string>& name_list, vector<vector<double> >& d){
+	try {
+		get_comment(f, '#', '\n');
+		
+		char test = f.peek();
+		
+		while(test == '!'){								//get header comments
+			get_comment(f, '!', ';');
+			while(isspace(test=f.get()))		{;}
+			f.putback(test);
+			test = f.peek();
+		}
+		while(test != '\n'){							//get sequence names
+			get_comment(f, '[', ']');
+			char d = f.get();
+			d = f.get();
+			if(d == '#'){
+				string name;
+				f >> name;
+				name_list.push_back(name);
+				while(isspace(test=f.get()))		{;}
+				f.putback(test);
+			}
+			else{
+				break;
+			}
+		}
+		int rank = name_list.size();
+		d.resize(rank);
+		for(int i=0;i<rank;i++){		d[i].resize(rank);		}
+		
+		d[0][0] = 0.0000;
+		get_comment(f, '[', ']');
+		for(int i=1;i<rank;i++){
+			get_comment(f, '[', ']');
+			d[i][i]=0.0000;
+			for(int j=0;j<i;j++){
+				f >> d[i][j];
+				if (d[i][j] == -0.0000)
+					d[i][j] = 0.0000;
+				d[j][i]=d[i][j];
+			}
+		}
+	}
+	catch(exception& e) {
+		errorOut(e, "PCACommand", "read_mega");
+		exit(1);
+	}
+}
+
+/*********************************************************************************************************************************/
+
+void PCACommand::read_phylip(istream& f, int square_m, vector<string>& name_list, vector<vector<double> >& d){
+	try {
+		//     int count1=0;
+		//     int count2=0;
+		
+		int rank;
+		f >> rank;
+		
+		name_list.resize(rank);
+		d.resize(rank);
+		if(square_m == 1){
+			for(int i=0;i<rank;i++)
+				d[i].resize(rank);
+			for(int i=0;i<rank;i++) {
+				f >> name_list[i];
+				//			cout << i << "\t" << name_list[i] << endl;
+				for(int j=0;j<rank;j++) {
+					f >> d[i][j];
+					if (d[i][j] == -0.0000)
+						d[i][j] = 0.0000;
+				}
+			}
+		}
+		else if(square_m == 2){
+			for(int i=0;i<rank;i++){
+				d[i].resize(rank);
+			}
+			d[0][0] = 0.0000;
+			f >> name_list[0];
+			for(int i=1;i<rank;i++){
+				f >> name_list[i];
+				d[i][i]=0.0000;
+				for(int j=0;j<i;j++){
+					f >> d[i][j];
+					if (d[i][j] == -0.0000)
+						d[i][j] = 0.0000;
+					d[j][i]=d[i][j];
+				}
+			}
+		}
+	}
+	catch(exception& e) {
+		errorOut(e, "PCACommand", "read_phylip");
+		exit(1);
+	}
+
+}
+
+/*********************************************************************************************************************************/
+
+void PCACommand::read(string fname, int m, vector<string>& names, vector<vector<double> >& D){
+	try {
+		ifstream f(fname.c_str());
+		if(!f) {
+			cerr << "Error: Could not open " << fname << endl;
+			exit(1);
+		}
+		char test = f.peek();
+		
+		if(test == '#'){
+			read_mega(f, m, names, D);
+		}
+		else{
+			read_phylip(f, m, names, D);
+		}
+		
+		int rank = D.size();
+	}
+	catch(exception& e) {
+		errorOut(e, "PCACommand", "read");
+		exit(1);
+	}
+}
+
+/*********************************************************************************************************************************/
+double PCACommand::pythag(double a, double b){
+    return(pow(a*a+b*b,0.5));
+}
+/*********************************************************************************************************************************/
+
+void PCACommand::matrix_mult(vector<vector<double> > first, vector<vector<double> > second, vector<vector<double> >& product){
+	try {
+		int first_rows = first.size();
+		int first_cols = first[0].size();
+		int second_cols = second[0].size();
+		
+		product.resize(first_rows);
+		for(int i=0;i<first_rows;i++){
+			product[i].resize(second_cols);
+		}
+		
+		for(int i=0;i<first_rows;i++){
+			for(int j=0;j<second_cols;j++){
+				product[i][j] = 0.0;
+				for(int k=0;k<first_cols;k++){
+					product[i][j] += first[i][k] * second[k][j];
+				}
+			}
+		}
+	}
+	catch(exception& e) {
+		errorOut(e, "PCACommand", "matrix_mult");
+		exit(1);
+	}
+
+}
+
+/*********************************************************************************************************************************/
+
+void PCACommand::recenter(double offset, vector<vector<double> > D, vector<vector<double> >& G){
+	try {
+		int rank = D.size();
+		
+		vector<vector<double> > A(rank);
+		vector<vector<double> > C(rank);
+		for(int i=0;i<rank;i++){
+			A[i].resize(rank);
+			C[i].resize(rank);
+		}
+		
+		double scale = -1.0000 / (double) rank;
+		
+		for(int i=0;i<rank;i++){
+			A[i][i] = 0.0000;
+			C[i][i] = 1.0000 + scale;
+			for(int j=i+1;j<rank;j++){
+				A[i][j] = A[j][i] = -0.5 * D[i][j] * D[i][j] + offset;
+				C[i][j] = C[j][i] = scale;
+			}
+		}
+		
+		matrix_mult(C,A,A);
+		matrix_mult(A,C,G);
+	}
+	catch(exception& e) {
+		errorOut(e, "PCACommand", "recenter");
+		exit(1);
+	}
+
+}
+
+/*********************************************************************************************************************************/
+
+//  This function is taken from Numerical Recipes in C++ by Press et al., 2nd edition, pg. 479
+
+void PCACommand::tred2(vector<vector<double> >& a, vector<double>& d, vector<double>& e){
+	try {
+		double scale, hh, h, g, f;
+		
+		int n = a.size();
+		
+		d.resize(n);
+		e.resize(n);
+		
+		for(int i=n-1;i>0;i--){
+			int l=i-1;
+			h = scale = 0.0000;
+			if(l>0){
+				for(int k=0;k<l+1;k++){
+					scale += fabs(a[i][k]);
+				}
+				if(scale == 0.0){
+					e[i] = a[i][l];
+				}
+				else{
+					for(int k=0;k<l+1;k++){
+						a[i][k] /= scale;
+						h += a[i][k] * a[i][k];
+					}
+					f = a[i][l];
+					g = (f >= 0.0 ? -sqrt(h) : sqrt(h));
+					e[i] = scale * g;
+					h -= f * g;
+					a[i][l] = f - g;
+					f = 0.0;
+					for(int j=0;j<l+1;j++){
+						a[j][i] = a[i][j] / h;
+						g = 0.0;
+						for(int k=0;k<j+1;k++){
+							g += a[j][k] * a[i][k];
+						}
+						for(int k=j+1;k<l+1;k++){
+							g += a[k][j] * a[i][k];
+						}
+						e[j] = g / h;
+						f += e[j] * a[i][j];
+					}
+					hh = f / (h + h);
+					for(int j=0;j<l+1;j++){
+						f = a[i][j];
+						e[j] = g = e[j] - hh * f;
+						for(int k=0;k<j+1;k++){
+							a[j][k] -= (f * e[k] + g * a[i][k]);
+						}
+					}
+				}
+			}
+			else{
+				e[i] = a[i][l];
+			}
+			
+			d[i] = h;
+		}
+		
+		d[0] = 0.0000;
+		e[0] = 0.0000;
+		
+		for(int i=0;i<n;i++){
+			int l = i;
+			if(d[i] != 0.0){
+				for(int j=0;j<l;j++){
+					g = 0.0000;
+					for(int k=0;k<l;k++){
+						g += a[i][k] * a[k][j];
+					}
+					for(int k=0;k<l;k++){
+						a[k][j] -= g * a[k][i];
+					}
+				}
+			}
+			d[i] = a[i][i];
+			a[i][i] = 1.0000;
+			for(int j=0;j<l;j++){
+				a[j][i] = a[i][j] = 0.0;
+			}
+		}
+	}
+	catch(exception& e) {
+		errorOut(e, "PCACommand", "tred2");
+		exit(1);
+	}
+
+}
+
+/*********************************************************************************************************************************/
+
+//  This function is taken from Numerical Recipes in C++ by Press et al., 2nd edition, pg. 479
+
+void PCACommand::qtli(vector<double>& d, vector<double>& e, vector<vector<double> >& z) {
+	try {
+		int m, i, iter;
+		double s, r, p, g, f, dd, c, b;
+		
+		int n = d.size();
+		for(int i=1;i<=n;i++){
+			e[i-1] = e[i];
+		}
+		e[n-1] = 0.0000;
+		
+		for(int l=0;l<n;l++){
+			iter = 0;
+			do {
+				for(m=l;m<n-1;m++){
+					dd = fabs(d[m]) + fabs(d[m+1]);
+					if(fabs(e[m])+dd == dd) break;
+				}
+				if(m != l){
+					if(iter++ == 30) cerr << "Too many iterations in tqli\n";
+					g = (d[l+1]-d[l]) / (2.0 * e[l]);
+					r = pythag(g, 1.0);
+					g = d[m] - d[l] + e[l] / (g + SIGN(r,g));
+					s = c = 1.0;
+					p = 0.0000;
+					for(i=m-1;i>=l;i--){
+						f = s * e[i];
+						b = c * e[i];
+						e[i+1] = (r=pythag(f,g));
+						if(r==0.0){
+							d[i+1] -= p;
+							e[m] = 0.0000;
+							break;
+						}
+						s = f / r;
+						c = g / r;
+						g = d[i+1] - p;
+						r = (d[i] - g) * s + 2.0 * c * b;
+						d[i+1] = g + ( p = s * r);
+						g = c * r - b;
+						for(int k=0;k<n;k++){
+							f = z[k][i+1];
+							z[k][i+1] = s * z[k][i] + c * f;
+							z[k][i] = c * z[k][i] - s * f;
+						}
+					}
+					if(r == 0.00 && i >= l) continue;
+					d[l] -= p;
+					e[l] = g;
+					e[m] = 0.0;
+				}
+			} while (m != l);
+		}
+		
+		int k;
+		for(int i=0;i<n;i++){
+			p=d[k=i];
+			for(int j=i;j<n;j++){
+				if(d[j] >= p){
+					p=d[k=j];
+				}
+			}
+			if(k!=i){
+				d[k]=d[i];
+				d[i]=p;
+				for(int j=0;j<n;j++){
+					p=z[j][i];
+					z[j][i] = z[j][k];
+					z[j][k] = p;
+				}
+			}
+		}
+	}
+	catch(exception& e) {
+		errorOut(e, "PCACommand", "qtli");
+		exit(1);
+	}
+}
+
+/*********************************************************************************************************************************/
+
+void PCACommand::output(string fnameRoot, vector<string> name_list, vector<vector<double> > G, vector<double> d) {
+	try {
+		int rank = name_list.size();
+		double dsum = 0.0000;
+		for(int i=0;i<rank;i++){
+			dsum += d[i];
+			for(int j=0;j<rank;j++){
+				if(d[j] >= 0)	{	G[i][j] *= pow(d[j],0.5);	}
+				else			{	G[i][j] = 0.00000;			}
+			}
+		}
+		
+		ofstream pcaData((fnameRoot+"pca").c_str(), ios::trunc);
+		pcaData.setf(ios::fixed, ios::floatfield);
+		pcaData.setf(ios::showpoint);	
+		
+		ofstream pcaLoadings((fnameRoot+"pca.loadings").c_str(), ios::trunc);
+		pcaLoadings.setf(ios::fixed, ios::floatfield);
+		pcaLoadings.setf(ios::showpoint);	
+		
+		pcaLoadings << "axis\tloading\n";
+		for(int i=0;i<rank;i++){
+			pcaLoadings << i+1 << '\t' << d[i] * 100.0 / dsum << endl;
+		}
+		
+		pcaData << "SeqName";
+		for(int i=0;i<rank;i++){
+			pcaData << '\t' << "axis" << i+1;
+		}
+		pcaData << endl;
+		
+		for(int i=0;i<rank;i++){
+			pcaData << name_list[i] << '\t';
+			for(int j=0;j<rank;j++){
+				pcaData << G[i][j] << '\t';
+			}
+			pcaData << endl;
+		}
+	}
+	catch(exception& e) {
+		errorOut(e, "PCACommand", "output");
+		exit(1);
+	}
+}
+
+/*********************************************************************************************************************************/
+
+void PCACommand::print_matrix(vector<vector<double> > A) {
+	try {
+		int rank = A.size();
+		for(int i=0;i<rank;i++){
+			for(int j=0;j<rank;j++){
+				cout << A[i][j] << "  ";
+			}
+			cout << endl;
+		}
+	}
+	catch(exception& e) {
+		errorOut(e, "PCACommand", "print_matrix");
+		exit(1);
+	}
+}
+/*********************************************************************************************************************************/
+
diff --git a/pcacommand.h b/pcacommand.h
new file mode 100644
index 0000000..822be3b
--- /dev/null
+++ b/pcacommand.h
@@ -0,0 +1,49 @@
+#ifndef PCACOMMAND_H
+#define PCACOMMAND_H
+
+/*
+ *  pcacommand.h
+ *  Mothur
+ *
+ *  Created by westcott on 1/4/10.
+ *  Copyright 2010 Schloss Lab. All rights reserved.
+ *
+ */
+ 
+#include "command.hpp"
+
+
+/*****************************************************************/
+class PCACommand : public Command {
+	
+public:
+	PCACommand(string);	
+	~PCACommand();
+	int execute();	
+	void help();
+	
+private:
+
+	bool abort;
+	string phylipfile, columnfile, namefile, format, filename, fbase;
+	float cutoff, precision;
+	int matrix;
+	
+	void get_comment(istream&, char, char);
+	void read_mega(istream&, int, vector<string>&, vector<vector<double> >&);
+	void read_phylip(istream&, int, vector<string>&, vector<vector<double> >&);
+	void read(string, int, vector<string>&, vector<vector<double> >&);
+	double pythag(double, double);
+	void matrix_mult(vector<vector<double> >, vector<vector<double> >, vector<vector<double> >&);
+	void recenter(double, vector<vector<double> >, vector<vector<double> >&);
+	void tred2(vector<vector<double> >&, vector<double>&, vector<double>&);
+	void qtli(vector<double>&, vector<double>&, vector<vector<double> >&);
+	void output(string, vector<string>, vector<vector<double> >, vector<double>);
+	void print_matrix(vector<vector<double> >);
+	
+};
+	
+/*****************************************************************/
+	
+#endif
+
diff --git a/readdistcommand.cpp b/readdistcommand.cpp
index f6cf5db..7e81d7f 100644
--- a/readdistcommand.cpp
+++ b/readdistcommand.cpp
@@ -206,6 +206,7 @@ int ReadDistCommand::execute(){
 			if (globaldata->gSparseMatrix != NULL) { delete globaldata->gSparseMatrix;  }
 			globaldata->gSparseMatrix = read->getMatrix();
 			numDists = globaldata->gSparseMatrix->getNNodes();
+	cout << "matrix contains " << numDists << " distances." << endl;
 			
       int lines = cutoff / (1.0/precision);
       vector<float> dist_cutoff(lines+1,0);
diff --git a/unifracunweightedcommand.cpp b/unifracunweightedcommand.cpp
index 54a4426..326e695 100644
--- a/unifracunweightedcommand.cpp
+++ b/unifracunweightedcommand.cpp
@@ -132,7 +132,7 @@ int UnifracUnweightedCommand::execute() {
 			
 			//get unweighted scores for random trees
 			for (int j = 0; j < iters; j++) {
-				//we need a different getValues because when we swap the labels we only want to swap those in each parwise comparison
+				//we need a different getValues because when we swap the labels we only want to swap those in each pairwise comparison
 				randomData = unweighted->getValues(T[i], "", "");
 			
 				for(int k = 0; k < numComp; k++) {	
@@ -147,7 +147,6 @@ int UnifracUnweightedCommand::execute() {
 					//add randoms score to validscores
 					validScores[randomData[k]] = randomData[k];
 				}
-				
 			}
 		
 			for(int a = 0; a < numComp; a++) {