A787844510A1EBDD0086103D /* knn.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A787844410A1EBDD0086103D /* knn.cpp */; };
A7B04493106CC3E60046FC83 /* chimeraslayer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7B04492106CC3E60046FC83 /* chimeraslayer.cpp */; };
A7B0450E106CEEC90046FC83 /* slayer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7B0450D106CEEC90046FC83 /* slayer.cpp */; };
+ A7D176CF10F2558500159497 /* pcacommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7D176CE10F2558500159497 /* pcacommand.cpp */; };
A7E0AAB610D2886D00CF407B /* mgclustercommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E0AAB510D2886D00CF407B /* mgclustercommand.cpp */; };
A7E4A783106913F900688F62 /* getsharedotucommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E4A782106913F900688F62 /* getsharedotucommand.cpp */; };
A7E4A824106A9AD700688F62 /* maligner.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E4A823106A9AD700688F62 /* maligner.cpp */; };
A7B04492106CC3E60046FC83 /* chimeraslayer.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimeraslayer.cpp; sourceTree = SOURCE_ROOT; };
A7B0450C106CEEC90046FC83 /* slayer.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = slayer.h; sourceTree = SOURCE_ROOT; };
A7B0450D106CEEC90046FC83 /* slayer.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = slayer.cpp; sourceTree = SOURCE_ROOT; };
+ A7D176CD10F2558500159497 /* pcacommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = pcacommand.h; sourceTree = SOURCE_ROOT; };
+ A7D176CE10F2558500159497 /* pcacommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = pcacommand.cpp; sourceTree = SOURCE_ROOT; };
A7E0AAB410D2886D00CF407B /* mgclustercommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mgclustercommand.h; sourceTree = "<group>"; };
A7E0AAB510D2886D00CF407B /* mgclustercommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mgclustercommand.cpp; sourceTree = "<group>"; };
A7E4A781106913F900688F62 /* getsharedotucommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getsharedotucommand.h; sourceTree = SOURCE_ROOT; };
375873F60F7D649C0040F377 /* nocommands.cpp */,
3792946E0F2E191800B9034A /* parsimonycommand.h */,
3792946F0F2E191800B9034A /* parsimonycommand.cpp */,
+ A7D176CD10F2558500159497 /* pcacommand.h */,
+ A7D176CE10F2558500159497 /* pcacommand.cpp */,
A773808E10B6EFD1002A6A61 /* phylotypecommand.h */,
A773808F10B6EFD1002A6A61 /* phylotypecommand.cpp */,
A7027F2610DFF86D00BF45FE /* preclustercommand.h */,
A727E84A10D14568001A8432 /* readblast.cpp in Sources */,
A7E0AAB610D2886D00CF407B /* mgclustercommand.cpp in Sources */,
A7027F2810DFF86D00BF45FE /* preclustercommand.cpp in Sources */,
+ A7D176CF10F2558500159497 /* pcacommand.cpp in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
temp = validParameter.validFile(parameters, "flip", false); if (temp == "not found"){ temp = "f"; }
flip = isTrue(temp);
- temp = validParameter.validFile(parameters, "threshold", false); if (temp == "not found"){ temp = "0.80"; }
+ temp = validParameter.validFile(parameters, "threshold", false); if (temp == "not found"){ temp = "0.50"; }
convert(temp, threshold);
search = validParameter.validFile(parameters, "search", false); if (search == "not found"){ search = "kmer"; }
mothurOut("The ksize parameter allows you to specify the kmer size for finding most similar template to candidate. The default is 8.\n");
mothurOut("The match parameter allows you to specify the bonus for having the same base. The default is 1.0.\n");
mothurOut("The mistmatch parameter allows you to specify the penalty for having different bases. The default is -1.0.\n");
- mothurOut("The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -1.0.\n");
- mothurOut("The gapextend parameter allows you to specify the penalty for extending a gap in an alignment. The default is -2.0.\n");
+ mothurOut("The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -2.0.\n");
+ mothurOut("The gapextend parameter allows you to specify the penalty for extending a gap in an alignment. The default is -1.0.\n");
mothurOut("The flip parameter is used to specify whether or not you want mothur to try the reverse compement if a sequence falls below the threshold. The default is false.\n");
mothurOut("The threshold is used to specify a cutoff at which an alignment is deemed 'bad' and the reverse complement may be tried. \n");
mothurOut("If the flip parameter is set to true the reverse complement of the sequence is aligned and the better alignment is reported.\n");
- mothurOut("The default for the threshold parameter is 0.80, meaning at least 80% of the bases must remain or the sequence is reported as potentially reversed.\n");
+ mothurOut("The default for the threshold parameter is 0.50, meaning at least 50% of the bases must remain or the sequence is reported as potentially reversed.\n");
mothurOut("The align.seqs command should be in the following format: \n");
mothurOut("align.seqs(template=yourTemplateFile, candidate=yourCandidateFile, align=yourAlignmentMethod, search=yourSearchmethod, ksize=yourKmerSize, match=yourMatchBonus, mismatch=yourMismatchpenalty, gapopen=yourGapopenPenalty, gapextend=yourGapExtendPenalty) \n");
mothurOut("Example align.seqs(candidate=candidate.fasta, template=core.filtered, align=kmer, search=gotoh, ksize=8, match=2.0, mismatch=3.0, gapopen=-2.0, gapextend=-1.0)\n");
rename((alignFileName + toString(processIDS[0]) + ".temp").c_str(), alignFileName.c_str());
rename((reportFileName + toString(processIDS[0]) + ".temp").c_str(), reportFileName.c_str());
+ //append alignment and report files
+ for(int i=1;i<processors;i++){
+ appendAlignFiles((alignFileName + toString(processIDS[i]) + ".temp"), alignFileName);
+ remove((alignFileName + toString(processIDS[i]) + ".temp").c_str());
+
+ appendReportFiles((reportFileName + toString(processIDS[i]) + ".temp"), reportFileName);
+ remove((reportFileName + toString(processIDS[i]) + ".temp").c_str());
+ }
+
vector<string> nonBlankAccnosFiles;
//delete blank accnos files generated with multiple processes
for(int i=0;i<processors;i++){
}else{ mothurOut(" If the reverse compliment proved to be better it was reported."); }
mothurOutEndLine();
}
-
- //append other files
- for(int i=1;i<processors;i++){
- appendAlignFiles((alignFileName + toString(processIDS[i]) + ".temp"), alignFileName);
- remove((alignFileName + toString(processIDS[i]) + ".temp").c_str());
-
- appendReportFiles((reportFileName + toString(processIDS[i]) + ".temp"), reportFileName);
- remove((reportFileName + toString(processIDS[i]) + ".temp").c_str());
- }
-
}
#else
ifstream inFASTA;
openInputFile(candidateFileName, inFASTA);
inFASTA.seekg(line->start);
-
+
for(int i=0;i<line->numSeqs;i++){
Sequence* candidateSeq = new Sequence(inFASTA); gobble(inFASTA);
if (needToDeleteCopy) { delete copy; }
}
delete candidateSeq;
+
+ //report progress
+ if((i+1) % 100 == 0){ mothurOut(toString(i+1)); mothurOutEndLine(); }
+
}
+ //report progress
+ if((line->numSeqs) % 100 != 0){ mothurOut(toString(line->numSeqs)); mothurOutEndLine(); }
alignmentFile.close();
inFASTA.close();
#include "kmer.hpp"
/**************************************************************************************************/
-Bayesian::Bayesian(string tfile, string tempFile, string method, int ksize, int cutoff, bool p) :
-Classify(tfile, tempFile, method, ksize, 0.0, 0.0, 0.0, 0.0), kmerSize(ksize), confidenceThreshold(cutoff), probs(p) {
+Bayesian::Bayesian(string tfile, string tempFile, string method, int ksize, int cutoff, int i) :
+Classify(tfile, tempFile, method, ksize, 0.0, 0.0, 0.0, 0.0), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) {
try {
numKmers = database->getMaxKmer() + 1;
int index = getMostProbableTaxonomy(queryKmers);
//bootstrap - to set confidenceScore
- if (probs) {
- int numToSelect = queryKmers.size() / 8;
- tax = bootstrapResults(queryKmers, index, numToSelect);
- }else{
- TaxNode seqTax = phyloTree->get(index);
- while (seqTax.level != 0) { //while you are not at the root
- tax = seqTax.name + ";" + tax;
- seqTax = phyloTree->get(seqTax.parent);
- }
- simpleTax = tax;
- }
-
+ int numToSelect = queryKmers.size() / 8;
+ tax = bootstrapResults(queryKmers, index, numToSelect);
+
return tax;
}
catch(exception& e) {
map<string, int>::iterator itBoot2;
map<int, int>::iterator itConvert;
- for (int i = 0; i < 100; i++) {
+ for (int i = 0; i < iters; i++) {
vector<int> temp;
for (int j = 0; j < numToSelect; j++) {
class Bayesian : public Classify {
public:
- Bayesian(string, string, string, int, int, bool);
+ Bayesian(string, string, string, int, int, int);
~Bayesian() {};
string getTaxonomy(Sequence*);
vector<int> genusTotals;
vector<int> genusNodes; //indexes in phyloTree where genus' are located
- int kmerSize, numKmers, confidenceThreshold;
- bool probs;
+ int kmerSize, numKmers, confidenceThreshold, iters;
string bootstrapResults(vector<int>, int, int);
int getMostProbableTaxonomy(vector<int>);
else {
//valid paramters for this command
- string AlignArray[] = {"template","fasta","search","ksize","method","processors","taxonomy","match","mismatch","gapopen","gapextend","numwanted","cutoff","probs"};
+ string AlignArray[] = {"template","fasta","search","ksize","method","processors","taxonomy","match","mismatch","gapopen","gapextend","numwanted","cutoff","probs","iters"};
vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
OptionParser parser(option);
temp = validParameter.validFile(parameters, "probs", false); if (temp == "not found"){ temp = "true"; }
probs = isTrue(temp);
+
+ temp = validParameter.validFile(parameters, "iters", false); if (temp == "not found") { temp = "100"; }
+ convert(temp, iters);
+
if ((method == "bayesian") && (search != "kmer")) {
mothurOut("The processors parameter allows you to specify the number of processors to use. The default is 1.\n");
mothurOut("The match parameter allows you to specify the bonus for having the same base. The default is 1.0.\n");
mothurOut("The mistmatch parameter allows you to specify the penalty for having different bases. The default is -1.0.\n");
- mothurOut("The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -1.0.\n");
- mothurOut("The gapextend parameter allows you to specify the penalty for extending a gap in an alignment. The default is -2.0.\n");
+ mothurOut("The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -2.0.\n");
+ mothurOut("The gapextend parameter allows you to specify the penalty for extending a gap in an alignment. The default is -1.0.\n");
mothurOut("The numwanted parameter allows you to specify the number of sequence matches you want with the knn method. The default is 10.\n");
mothurOut("The cutoff parameter allows you to specify a bootstrap confidence threshold for your taxonomy. The default is 0.\n");
mothurOut("The probs parameter shut off the bootstrapping results for the bayesian method. The default is true, meaning you want the bootstrapping to be run.\n");
+ mothurOut("The iters parameter allows you to specify how many iterations to do when calculating the bootstrap confidence score for your taxonomy with the bayesian method. The default is 100.\n");
mothurOut("The classify.seqs command should be in the following format: \n");
mothurOut("classify.seqs(template=yourTemplateFile, fasta=yourFastaFile, method=yourClassificationMethod, search=yourSearchmethod, ksize=yourKmerSize, taxonomy=yourTaxonomyFile, processors=yourProcessors) \n");
mothurOut("Example classify.seqs(fasta=amazon.fasta, template=core.filtered, method=knn, search=gotoh, ksize=8, processors=2)\n");
try {
if (abort == true) { return 0; }
- if(method == "bayesian") { classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, probs); }
+ if(method == "bayesian") { classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters); }
else if(method == "knn") { classify = new Knn(taxonomyFileName, templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch, numWanted); }
else {
mothurOut(search + " is not a valid method option. I will run the command using bayesian.");
mothurOutEndLine();
- classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, probs);
+ classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters);
}
int numFastaSeqs = 0;
- string newTaxonomyFile = getRootName(fastaFileName) + "taxonomy";
+ string newTaxonomyFile = getRootName(fastaFileName) + getRootName(taxonomyFileName) + "taxonomy";
string tempTaxonomyFile = getRootName(fastaFileName) + "taxonomy.temp";
- string taxSummary = getRootName(fastaFileName) + "tax.summary";
+ string taxSummary = getRootName(fastaFileName) + getRootName(taxonomyFileName) + "tax.summary";
int start = time(NULL);
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
taxonomy = classify->getTaxonomy(candidateSeq);
if (taxonomy != "bad seq") {
- outTax << candidateSeq->getName() << '\t' << taxonomy << endl;
+ //output confidence scores or not
+ if (probs) {
+ outTax << candidateSeq->getName() << '\t' << taxonomy << endl;
+ }else{
+ outTax << candidateSeq->getName() << '\t' << classify->getSimpleTax() << endl;
+ }
+
outTaxSimple << candidateSeq->getName() << '\t' << classify->getSimpleTax() << endl;
}
}
Classify* classify;
string fastaFileName, templateFileName, distanceFileName, search, method, taxonomyFileName;
- int processors, kmerSize, numWanted, cutoff;
+ int processors, kmerSize, numWanted, cutoff, iters;
float match, misMatch, gapOpen, gapExtend;
bool abort, probs;
#include "phylotypecommand.h"
#include "mgclustercommand.h"
#include "preclustercommand.h"
+#include "pcacommand.h"
/*******************************************************/
commands["phylotype"] = "phylotype";
commands["mgcluster"] = "mgcluster";
commands["pre.cluster"] = "pre.cluster";
+ commands["pca"] = "pca";
}
/***********************************************************/
else if(commandName == "phylotype") { command = new PhylotypeCommand(optionString); }
else if(commandName == "mgcluster") { command = new MGClusterCommand(optionString); }
else if(commandName == "pre.cluster") { command = new PreClusterCommand(optionString); }
+ else if(commandName == "pca") { command = new PCACommand(optionString); }
else { command = new NoCommand(optionString); }
return command;
help(); abort = true;
} else {
//valid paramters for this command
- string Array[] = {"fasta","list","label","name", "group", "sorted"};
+ string Array[] = {"fasta","list","label","name", "group", "sorted", "phylip","column"};
vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
OptionParser parser(option);
listfile = validParameter.validFile(parameters, "list", true);
if (listfile == "not found") { mothurOut("list is a required parameter for the get.oturep command."); mothurOutEndLine(); abort = true; }
else if (listfile == "not open") { abort = true; }
+
+ phylipfile = validParameter.validFile(parameters, "phylip", true);
+ if (phylipfile == "not found") { phylipfile = ""; }
+ else if (phylipfile == "not open") { abort = true; }
+
+ columnfile = validParameter.validFile(parameters, "column", true);
+ if (columnfile == "not found") { columnfile = ""; }
+ else if (columnfile == "not open") { abort = true; }
+
+ if ((phylipfile == "") && (columnfile == "")) { mothurOut("When executing a get.oturep command you must enter a phylip or a column."); mothurOutEndLine(); abort = true; }
+ else if ((phylipfile != "") && (columnfile != "")) { mothurOut("When executing a get.oturep command you must enter ONLY ONE of the following: phylip or column."); mothurOutEndLine(); abort = true; }
+
+ if (columnfile != "") { if (namefile == "") { cout << "You need to provide a namefile if you are going to use the column format." << endl; abort = true; } }
//check for optional parameter and set defaults
// ...at some point should added some additional type checking...
}
if (abort == false) {
-
- if(globaldata->gSparseMatrix != NULL) {
- matrix = globaldata->gSparseMatrix;
- }
//globaldata->gListVector bin 0 = first name read in distance matrix, globaldata->gListVector bin 1 = second name read in distance matrix
if (globaldata->gListVector != NULL) {
}
} else { mothurOut("error, no listvector."); mothurOutEndLine(); }
- // Create a data structure to quickly access the distance information.
- // It consists of a vector of distance maps, where each map contains
- // all distances of a certain sequence. Vector and maps are accessed
- // via the index of a sequence in the distance matrix
- seqVec = vector<SeqMap>(globaldata->gListVector->size());
- for (MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++) {
- seqVec[currentCell->row][currentCell->column] = currentCell->dist;
- }
-
fasta = new FastaMap();
}
}
try {
if (abort == true) { return 0; }
-
int error;
//read fastafile
fasta->readFastaFile(fastafile);
- in.close();
-
- //set format to list so input can get listvector
- globaldata->setFormat("list");
+ //in.close();
+ //read distance file and generate indexed distance file that can be used by findrep function
+//....new reading class....//
//if user gave a namesfile then use it
- if (namesfile != "") {
- readNamesFile();
- }
+ if (namesfile != "") { readNamesFile(); }
+ //set format to list so input can get listvector
+ globaldata->setFormat("list");
+
//read list file
read = new ReadOTUFile(listfile);
read->read(&*globaldata);
if (error == 1) { return 0; } //there is an error in hte input files, abort command
}
- delete matrix;
- globaldata->gSparseMatrix = NULL;
delete list;
globaldata->gListVector = NULL;
SeqMap::iterator it;
SeqMap currMap;
for (size_t i=0; i < seqIndex.size(); i++) {
- currMap = seqVec[seqIndex[i]];
+ //currMap = seqVec[seqIndex[i]];
for (size_t j=0; j < seqIndex.size(); j++) {
it = currMap.find(seqIndex[j]);
if (it != currMap.end()) {
#include "command.hpp"
#include "globaldata.hpp"
-#include "sparsematrix.hpp"
#include "listvector.hpp"
#include "inputdata.h"
#include "readotu.h"
private:
GlobalData* globaldata;
- SparseMatrix* matrix;
ListVector* list;
ReadOTUFile* read;
InputData* input;
FastaMap* fasta;
GroupMap* groupMap;
- string filename, fastafile, listfile, namesfile, groupfile, label, sorted;
+ string filename, fastafile, listfile, namesfile, groupfile, label, sorted, phylipfile, columnfile, namefile;
ofstream out;
ifstream in, inNames;
- bool groupError;
-
- bool abort, allLines;
+ bool abort, allLines, groupError;
set<string> labels; //holds labels to be used
map<string, int> nameToIndex; //maps sequence name to index in sparsematrix
- vector<SeqMap> seqVec; // contains maps with sequence index and distance
- // for all distances related to a certain sequence
-
-
void readNamesFile();
int process(ListVector*);
string findRep(int, string&, ListVector*, int&); // returns the name of the "representative" sequence of given bin,
--- /dev/null
+
+/*
+ * pcacommand.cpp
+ * Mothur
+ *
+ * Created by westcott on 1/4/10.
+ * Copyright 2010 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "pcacommand.h"
+
+//**********************************************************************************************************************
+
+PCACommand::PCACommand(string option){
+ try {
+ abort = false;
+
+ //allow user to run help
+ if(option == "help") { help(); abort = true; }
+
+ else {
+ //valid paramters for this command
+ string Array[] = {"phylip","lt"};
+ vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
+
+ OptionParser parser(option);
+ map<string, string> parameters = parser. getParameters();
+
+ ValidParameters validParameter;
+
+ //check to make sure all parameters are valid for command
+ for (map<string, string>::iterator it = parameters.begin(); it != parameters.end(); it++) {
+ if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
+ }
+
+ //required parameters
+ phylipfile = validParameter.validFile(parameters, "phylip", true);
+ if (phylipfile == "not open") { abort = true; }
+ else if (phylipfile == "not found") { phylipfile = ""; abort = true; }
+ else { filename = phylipfile; }
+
+ //columnfile = validParameter.validFile(parameters, "column", true);
+ //if (columnfile == "not open") { abort = true; }
+ //else if (columnfile == "not found") { columnfile = ""; }
+ //else { format = "column"; }
+
+ //namefile = validParameter.validFile(parameters, "name", true);
+ //if (namefile == "not open") { abort = true; }
+ //else if (namefile == "not found") { namefile = ""; }
+
+
+ //error checking on files
+ if (phylipfile == "") { mothurOut("You must provide a distance file before running the pca command."); mothurOutEndLine(); abort = true; }
+ //if ((phylipfile == "") && (columnfile == "")) { mothurOut("You must provide a distance file before running the pca command."); mothurOutEndLine(); abort = true; }
+ //else if ((phylipfile != "") && (columnfile != "")) { mothurOut("You may not use both the column and the phylip parameters."); mothurOutEndLine(); abort = true; }
+
+ //if (columnfile != "") {
+ // if (namefile == "") { mothurOut("You need to provide a namefile if you are going to use the column format."); mothurOutEndLine(); abort = true; }
+ //}
+
+ string temp = validParameter.validFile(parameters, "lt", false); if (temp == "not found") { temp = "false"; }
+ bool lt = isTrue(temp);
+
+ if (lt) { matrix = 2; }
+ else { matrix = 1; }
+
+
+ }
+
+ }
+ catch(exception& e) {
+ errorOut(e, "PCACommand", "PCACommand");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
+void PCACommand::help(){
+ try {
+
+ mothurOut("The pca command..."); mothurOutEndLine();
+ }
+ catch(exception& e) {
+ errorOut(e, "PCACommand", "help");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
+PCACommand::~PCACommand(){}
+//**********************************************************************************************************************
+int PCACommand::execute(){
+ try {
+
+ if (abort == true) { return 0; }
+
+ cout.setf(ios::fixed, ios::floatfield);
+ cout.setf(ios::showpoint);
+ cerr.setf(ios::fixed, ios::floatfield);
+ cerr.setf(ios::showpoint);
+
+ vector<string> names;
+ vector<vector<double> > D;
+
+ fbase = filename;
+ if(fbase.find_last_of(".")!=string::npos){
+ fbase.erase(fbase.find_last_of(".")+1);
+ }
+ else{
+ fbase += ".";
+ }
+ read(filename, matrix, names, D);
+
+ double offset = 0.0000;
+ vector<double> d;
+ vector<double> e;
+ vector<vector<double> > G = D;
+ vector<vector<double> > copy_G;
+ int rank = D.size();
+
+ cout << "\nProcessing...\n";
+
+ for(int count=0;count<2;count++){
+ recenter(offset, D, G);
+ tred2(G, d, e);
+ qtli(d, e, G);
+ offset = d[d.size()-1];
+ if(offset > 0.0) break;
+ }
+
+
+ output(fbase, names, G, d);
+
+ return 0;
+ }
+ catch(exception& e) {
+ errorOut(e, "PCACommand", "execute");
+ exit(1);
+ }
+}
+/*********************************************************************************************************************************/
+
+inline double SIGN(const double a, const double b)
+{
+ return b>=0 ? (a>=0 ? a:-a) : (a>=0 ? -a:a);
+}
+
+/*********************************************************************************************************************************/
+
+void PCACommand::get_comment(istream& f, char begin, char end){
+ try {
+ char d=f.get();
+ while(d != end){ d = f.get(); }
+ d = f.peek();
+ }
+ catch(exception& e) {
+ errorOut(e, "PCACommand", "get_comment");
+ exit(1);
+ }
+}
+
+/*********************************************************************************************************************************/
+
+void PCACommand::read_mega(istream& f, int square_m, vector<string>& name_list, vector<vector<double> >& d){
+ try {
+ get_comment(f, '#', '\n');
+
+ char test = f.peek();
+
+ while(test == '!'){ //get header comments
+ get_comment(f, '!', ';');
+ while(isspace(test=f.get())) {;}
+ f.putback(test);
+ test = f.peek();
+ }
+ while(test != '\n'){ //get sequence names
+ get_comment(f, '[', ']');
+ char d = f.get();
+ d = f.get();
+ if(d == '#'){
+ string name;
+ f >> name;
+ name_list.push_back(name);
+ while(isspace(test=f.get())) {;}
+ f.putback(test);
+ }
+ else{
+ break;
+ }
+ }
+ int rank = name_list.size();
+ d.resize(rank);
+ for(int i=0;i<rank;i++){ d[i].resize(rank); }
+
+ d[0][0] = 0.0000;
+ get_comment(f, '[', ']');
+ for(int i=1;i<rank;i++){
+ get_comment(f, '[', ']');
+ d[i][i]=0.0000;
+ for(int j=0;j<i;j++){
+ f >> d[i][j];
+ if (d[i][j] == -0.0000)
+ d[i][j] = 0.0000;
+ d[j][i]=d[i][j];
+ }
+ }
+ }
+ catch(exception& e) {
+ errorOut(e, "PCACommand", "read_mega");
+ exit(1);
+ }
+}
+
+/*********************************************************************************************************************************/
+
+void PCACommand::read_phylip(istream& f, int square_m, vector<string>& name_list, vector<vector<double> >& d){
+ try {
+ // int count1=0;
+ // int count2=0;
+
+ int rank;
+ f >> rank;
+
+ name_list.resize(rank);
+ d.resize(rank);
+ if(square_m == 1){
+ for(int i=0;i<rank;i++)
+ d[i].resize(rank);
+ for(int i=0;i<rank;i++) {
+ f >> name_list[i];
+ // cout << i << "\t" << name_list[i] << endl;
+ for(int j=0;j<rank;j++) {
+ f >> d[i][j];
+ if (d[i][j] == -0.0000)
+ d[i][j] = 0.0000;
+ }
+ }
+ }
+ else if(square_m == 2){
+ for(int i=0;i<rank;i++){
+ d[i].resize(rank);
+ }
+ d[0][0] = 0.0000;
+ f >> name_list[0];
+ for(int i=1;i<rank;i++){
+ f >> name_list[i];
+ d[i][i]=0.0000;
+ for(int j=0;j<i;j++){
+ f >> d[i][j];
+ if (d[i][j] == -0.0000)
+ d[i][j] = 0.0000;
+ d[j][i]=d[i][j];
+ }
+ }
+ }
+ }
+ catch(exception& e) {
+ errorOut(e, "PCACommand", "read_phylip");
+ exit(1);
+ }
+
+}
+
+/*********************************************************************************************************************************/
+
+void PCACommand::read(string fname, int m, vector<string>& names, vector<vector<double> >& D){
+ try {
+ ifstream f(fname.c_str());
+ if(!f) {
+ cerr << "Error: Could not open " << fname << endl;
+ exit(1);
+ }
+ char test = f.peek();
+
+ if(test == '#'){
+ read_mega(f, m, names, D);
+ }
+ else{
+ read_phylip(f, m, names, D);
+ }
+
+ int rank = D.size();
+ }
+ catch(exception& e) {
+ errorOut(e, "PCACommand", "read");
+ exit(1);
+ }
+}
+
+/*********************************************************************************************************************************/
+double PCACommand::pythag(double a, double b){
+ return(pow(a*a+b*b,0.5));
+}
+/*********************************************************************************************************************************/
+
+void PCACommand::matrix_mult(vector<vector<double> > first, vector<vector<double> > second, vector<vector<double> >& product){
+ try {
+ int first_rows = first.size();
+ int first_cols = first[0].size();
+ int second_cols = second[0].size();
+
+ product.resize(first_rows);
+ for(int i=0;i<first_rows;i++){
+ product[i].resize(second_cols);
+ }
+
+ for(int i=0;i<first_rows;i++){
+ for(int j=0;j<second_cols;j++){
+ product[i][j] = 0.0;
+ for(int k=0;k<first_cols;k++){
+ product[i][j] += first[i][k] * second[k][j];
+ }
+ }
+ }
+ }
+ catch(exception& e) {
+ errorOut(e, "PCACommand", "matrix_mult");
+ exit(1);
+ }
+
+}
+
+/*********************************************************************************************************************************/
+
+void PCACommand::recenter(double offset, vector<vector<double> > D, vector<vector<double> >& G){
+ try {
+ int rank = D.size();
+
+ vector<vector<double> > A(rank);
+ vector<vector<double> > C(rank);
+ for(int i=0;i<rank;i++){
+ A[i].resize(rank);
+ C[i].resize(rank);
+ }
+
+ double scale = -1.0000 / (double) rank;
+
+ for(int i=0;i<rank;i++){
+ A[i][i] = 0.0000;
+ C[i][i] = 1.0000 + scale;
+ for(int j=i+1;j<rank;j++){
+ A[i][j] = A[j][i] = -0.5 * D[i][j] * D[i][j] + offset;
+ C[i][j] = C[j][i] = scale;
+ }
+ }
+
+ matrix_mult(C,A,A);
+ matrix_mult(A,C,G);
+ }
+ catch(exception& e) {
+ errorOut(e, "PCACommand", "recenter");
+ exit(1);
+ }
+
+}
+
+/*********************************************************************************************************************************/
+
+// This function is taken from Numerical Recipes in C++ by Press et al., 2nd edition, pg. 479
+
+void PCACommand::tred2(vector<vector<double> >& a, vector<double>& d, vector<double>& e){
+ try {
+ double scale, hh, h, g, f;
+
+ int n = a.size();
+
+ d.resize(n);
+ e.resize(n);
+
+ for(int i=n-1;i>0;i--){
+ int l=i-1;
+ h = scale = 0.0000;
+ if(l>0){
+ for(int k=0;k<l+1;k++){
+ scale += fabs(a[i][k]);
+ }
+ if(scale == 0.0){
+ e[i] = a[i][l];
+ }
+ else{
+ for(int k=0;k<l+1;k++){
+ a[i][k] /= scale;
+ h += a[i][k] * a[i][k];
+ }
+ f = a[i][l];
+ g = (f >= 0.0 ? -sqrt(h) : sqrt(h));
+ e[i] = scale * g;
+ h -= f * g;
+ a[i][l] = f - g;
+ f = 0.0;
+ for(int j=0;j<l+1;j++){
+ a[j][i] = a[i][j] / h;
+ g = 0.0;
+ for(int k=0;k<j+1;k++){
+ g += a[j][k] * a[i][k];
+ }
+ for(int k=j+1;k<l+1;k++){
+ g += a[k][j] * a[i][k];
+ }
+ e[j] = g / h;
+ f += e[j] * a[i][j];
+ }
+ hh = f / (h + h);
+ for(int j=0;j<l+1;j++){
+ f = a[i][j];
+ e[j] = g = e[j] - hh * f;
+ for(int k=0;k<j+1;k++){
+ a[j][k] -= (f * e[k] + g * a[i][k]);
+ }
+ }
+ }
+ }
+ else{
+ e[i] = a[i][l];
+ }
+
+ d[i] = h;
+ }
+
+ d[0] = 0.0000;
+ e[0] = 0.0000;
+
+ for(int i=0;i<n;i++){
+ int l = i;
+ if(d[i] != 0.0){
+ for(int j=0;j<l;j++){
+ g = 0.0000;
+ for(int k=0;k<l;k++){
+ g += a[i][k] * a[k][j];
+ }
+ for(int k=0;k<l;k++){
+ a[k][j] -= g * a[k][i];
+ }
+ }
+ }
+ d[i] = a[i][i];
+ a[i][i] = 1.0000;
+ for(int j=0;j<l;j++){
+ a[j][i] = a[i][j] = 0.0;
+ }
+ }
+ }
+ catch(exception& e) {
+ errorOut(e, "PCACommand", "tred2");
+ exit(1);
+ }
+
+}
+
+/*********************************************************************************************************************************/
+
+// This function is taken from Numerical Recipes in C++ by Press et al., 2nd edition, pg. 479
+
+void PCACommand::qtli(vector<double>& d, vector<double>& e, vector<vector<double> >& z) {
+ try {
+ int m, i, iter;
+ double s, r, p, g, f, dd, c, b;
+
+ int n = d.size();
+ for(int i=1;i<=n;i++){
+ e[i-1] = e[i];
+ }
+ e[n-1] = 0.0000;
+
+ for(int l=0;l<n;l++){
+ iter = 0;
+ do {
+ for(m=l;m<n-1;m++){
+ dd = fabs(d[m]) + fabs(d[m+1]);
+ if(fabs(e[m])+dd == dd) break;
+ }
+ if(m != l){
+ if(iter++ == 30) cerr << "Too many iterations in tqli\n";
+ g = (d[l+1]-d[l]) / (2.0 * e[l]);
+ r = pythag(g, 1.0);
+ g = d[m] - d[l] + e[l] / (g + SIGN(r,g));
+ s = c = 1.0;
+ p = 0.0000;
+ for(i=m-1;i>=l;i--){
+ f = s * e[i];
+ b = c * e[i];
+ e[i+1] = (r=pythag(f,g));
+ if(r==0.0){
+ d[i+1] -= p;
+ e[m] = 0.0000;
+ break;
+ }
+ s = f / r;
+ c = g / r;
+ g = d[i+1] - p;
+ r = (d[i] - g) * s + 2.0 * c * b;
+ d[i+1] = g + ( p = s * r);
+ g = c * r - b;
+ for(int k=0;k<n;k++){
+ f = z[k][i+1];
+ z[k][i+1] = s * z[k][i] + c * f;
+ z[k][i] = c * z[k][i] - s * f;
+ }
+ }
+ if(r == 0.00 && i >= l) continue;
+ d[l] -= p;
+ e[l] = g;
+ e[m] = 0.0;
+ }
+ } while (m != l);
+ }
+
+ int k;
+ for(int i=0;i<n;i++){
+ p=d[k=i];
+ for(int j=i;j<n;j++){
+ if(d[j] >= p){
+ p=d[k=j];
+ }
+ }
+ if(k!=i){
+ d[k]=d[i];
+ d[i]=p;
+ for(int j=0;j<n;j++){
+ p=z[j][i];
+ z[j][i] = z[j][k];
+ z[j][k] = p;
+ }
+ }
+ }
+ }
+ catch(exception& e) {
+ errorOut(e, "PCACommand", "qtli");
+ exit(1);
+ }
+}
+
+/*********************************************************************************************************************************/
+
+void PCACommand::output(string fnameRoot, vector<string> name_list, vector<vector<double> > G, vector<double> d) {
+ try {
+ int rank = name_list.size();
+ double dsum = 0.0000;
+ for(int i=0;i<rank;i++){
+ dsum += d[i];
+ for(int j=0;j<rank;j++){
+ if(d[j] >= 0) { G[i][j] *= pow(d[j],0.5); }
+ else { G[i][j] = 0.00000; }
+ }
+ }
+
+ ofstream pcaData((fnameRoot+"pca").c_str(), ios::trunc);
+ pcaData.setf(ios::fixed, ios::floatfield);
+ pcaData.setf(ios::showpoint);
+
+ ofstream pcaLoadings((fnameRoot+"pca.loadings").c_str(), ios::trunc);
+ pcaLoadings.setf(ios::fixed, ios::floatfield);
+ pcaLoadings.setf(ios::showpoint);
+
+ pcaLoadings << "axis\tloading\n";
+ for(int i=0;i<rank;i++){
+ pcaLoadings << i+1 << '\t' << d[i] * 100.0 / dsum << endl;
+ }
+
+ pcaData << "SeqName";
+ for(int i=0;i<rank;i++){
+ pcaData << '\t' << "axis" << i+1;
+ }
+ pcaData << endl;
+
+ for(int i=0;i<rank;i++){
+ pcaData << name_list[i] << '\t';
+ for(int j=0;j<rank;j++){
+ pcaData << G[i][j] << '\t';
+ }
+ pcaData << endl;
+ }
+ }
+ catch(exception& e) {
+ errorOut(e, "PCACommand", "output");
+ exit(1);
+ }
+}
+
+/*********************************************************************************************************************************/
+
+void PCACommand::print_matrix(vector<vector<double> > A) {
+ try {
+ int rank = A.size();
+ for(int i=0;i<rank;i++){
+ for(int j=0;j<rank;j++){
+ cout << A[i][j] << " ";
+ }
+ cout << endl;
+ }
+ }
+ catch(exception& e) {
+ errorOut(e, "PCACommand", "print_matrix");
+ exit(1);
+ }
+}
+/*********************************************************************************************************************************/
+
--- /dev/null
+#ifndef PCACOMMAND_H
+#define PCACOMMAND_H
+
+/*
+ * pcacommand.h
+ * Mothur
+ *
+ * Created by westcott on 1/4/10.
+ * Copyright 2010 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "command.hpp"
+
+
+/*****************************************************************/
+class PCACommand : public Command {
+
+public:
+ PCACommand(string);
+ ~PCACommand();
+ int execute();
+ void help();
+
+private:
+
+ bool abort;
+ string phylipfile, columnfile, namefile, format, filename, fbase;
+ float cutoff, precision;
+ int matrix;
+
+ void get_comment(istream&, char, char);
+ void read_mega(istream&, int, vector<string>&, vector<vector<double> >&);
+ void read_phylip(istream&, int, vector<string>&, vector<vector<double> >&);
+ void read(string, int, vector<string>&, vector<vector<double> >&);
+ double pythag(double, double);
+ void matrix_mult(vector<vector<double> >, vector<vector<double> >, vector<vector<double> >&);
+ void recenter(double, vector<vector<double> >, vector<vector<double> >&);
+ void tred2(vector<vector<double> >&, vector<double>&, vector<double>&);
+ void qtli(vector<double>&, vector<double>&, vector<vector<double> >&);
+ void output(string, vector<string>, vector<vector<double> >, vector<double>);
+ void print_matrix(vector<vector<double> >);
+
+};
+
+/*****************************************************************/
+
+#endif
+
if (globaldata->gSparseMatrix != NULL) { delete globaldata->gSparseMatrix; }
globaldata->gSparseMatrix = read->getMatrix();
numDists = globaldata->gSparseMatrix->getNNodes();
+ cout << "matrix contains " << numDists << " distances." << endl;
int lines = cutoff / (1.0/precision);
vector<float> dist_cutoff(lines+1,0);
//get unweighted scores for random trees
for (int j = 0; j < iters; j++) {
- //we need a different getValues because when we swap the labels we only want to swap those in each parwise comparison
+ //we need a different getValues because when we swap the labels we only want to swap those in each pairwise comparison
randomData = unweighted->getValues(T[i], "", "");
for(int k = 0; k < numComp; k++) {
//add randoms score to validscores
validScores[randomData[k]] = randomData[k];
}
-
}
for(int a = 0; a < numComp; a++) {