7EC3D4550FA0FFF900338DA5 /* coverage.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 7EC3D4500FA0FFF900338DA5 /* coverage.cpp */; };
7EC3D4560FA0FFF900338DA5 /* whittaker.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 7EC3D4530FA0FFF900338DA5 /* whittaker.cpp */; };
8DD76F6A0486A84900D96B5E /* Mothur.1 in CopyFiles */ = {isa = PBXBuildFile; fileRef = C6859E8B029090EE04C91782 /* Mothur.1 */; };
+ A7027F2810DFF86D00BF45FE /* preclustercommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7027F2710DFF86D00BF45FE /* preclustercommand.cpp */; };
A70B53AA0F4CD7AD0064797E /* getgroupcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70B53A40F4CD7AD0064797E /* getgroupcommand.cpp */; };
A70B53AB0F4CD7AD0064797E /* getlabelcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70B53A60F4CD7AD0064797E /* getlabelcommand.cpp */; };
A70B53AC0F4CD7AD0064797E /* getlinecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70B53A80F4CD7AD0064797E /* getlinecommand.cpp */; };
7EC3D4530FA0FFF900338DA5 /* whittaker.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = whittaker.cpp; sourceTree = SOURCE_ROOT; };
7EC3D4540FA0FFF900338DA5 /* whittaker.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = whittaker.h; sourceTree = SOURCE_ROOT; };
8DD76F6C0486A84900D96B5E /* mothur */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = mothur; sourceTree = BUILT_PRODUCTS_DIR; };
+ A7027F2610DFF86D00BF45FE /* preclustercommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = preclustercommand.h; sourceTree = SOURCE_ROOT; };
+ A7027F2710DFF86D00BF45FE /* preclustercommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = preclustercommand.cpp; sourceTree = SOURCE_ROOT; };
A70B53A40F4CD7AD0064797E /* getgroupcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getgroupcommand.cpp; sourceTree = SOURCE_ROOT; };
A70B53A50F4CD7AD0064797E /* getgroupcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getgroupcommand.h; sourceTree = SOURCE_ROOT; };
A70B53A60F4CD7AD0064797E /* getlabelcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getlabelcommand.cpp; sourceTree = SOURCE_ROOT; };
3792946F0F2E191800B9034A /* parsimonycommand.cpp */,
A773808E10B6EFD1002A6A61 /* phylotypecommand.h */,
A773808F10B6EFD1002A6A61 /* phylotypecommand.cpp */,
+ A7027F2610DFF86D00BF45FE /* preclustercommand.h */,
+ A7027F2710DFF86D00BF45FE /* preclustercommand.cpp */,
37D927FE0F21331F001D4494 /* quitcommand.h */,
37D927FD0F21331F001D4494 /* quitcommand.cpp */,
37D928080F21331F001D4494 /* rarefactcommand.h */,
A773809010B6EFD1002A6A61 /* phylotypecommand.cpp in Sources */,
A727E84A10D14568001A8432 /* readblast.cpp in Sources */,
A7E0AAB610D2886D00CF407B /* mgclustercommand.cpp in Sources */,
+ A7027F2810DFF86D00BF45FE /* preclustercommand.cpp in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
#include "classifyseqscommand.h"
#include "phylotypecommand.h"
#include "mgclustercommand.h"
+#include "preclustercommand.h"
/*******************************************************/
commands["classify.seqs"] = "classify.seqs";
commands["phylotype"] = "phylotype";
commands["mgcluster"] = "mgcluster";
+ commands["pre.cluster"] = "pre.cluster";
}
/***********************************************************/
else if(commandName == "classify.seqs") { command = new ClassifySeqsCommand(optionString); }
else if(commandName == "phylotype") { command = new PhylotypeCommand(optionString); }
else if(commandName == "mgcluster") { command = new MGClusterCommand(optionString); }
+ else if(commandName == "pre.cluster") { command = new PreClusterCommand(optionString); }
else { command = new NoCommand(optionString); }
return command;
}
}
-/**************************************************************************************************/
+/**************************************************************************************************
void DistanceCommand::appendFiles(string temp, string filename) {
try{
ofstream output;
bool abort;
vector<string> Estimators; //holds estimators to be used
- void appendFiles(string, string);
+ //void appendFiles(string, string);
void createProcesses(string);
int driver(/*Dist*, SequenceDB, */int, int, string, float);
#include "listvector.hpp"
#include "sparsematrix.hpp"
+
/***********************************************************************/
-HCluster::HCluster(RAbundVector* rav, ListVector* lv) : rabund(rav), list(lv){
+HCluster::HCluster(RAbundVector* rav, ListVector* lv, string m) : rabund(rav), list(lv), method(m){
try {
mapWanted = false;
+ exitedBreak = false;
numSeqs = list->getNumSeqs();
//initialize cluster array
smallRow = row;
smallCol = col;
smallDist = distance;
- bool clustered = false;
//find upmost parent of row and col
smallRow = getUpmostParent(smallRow);
smallCol = getUpmostParent(smallCol);
//cout << "row = " << row << " smallRow = " << smallRow << " col = " << col << " smallCol = " << smallCol << " dist = " << distance << endl;
-
- //are they active in the link table
- int linkValue = makeActive(); //after this point this nodes info is active in linkTable
- //printInfo();
-//cout << "linkValue = " << linkValue << " times = " << (clusterArray[smallRow].numSeq * clusterArray[smallCol].numSeq) << endl;
- //can we cluster???
- if (linkValue == (clusterArray[smallRow].numSeq * clusterArray[smallCol].numSeq)) {
- //printInfo();
- updateArrayandLinkTable();
- clusterBins();
- clusterNames();
- clustered = true;
- //printInfo();
+ //you don't want to cluster with yourself
+ if (smallRow != smallCol) {
+ //are they active in the link table
+ int linkValue = makeActive(); //after this point this nodes info is active in linkTable
+ //printInfo();
+ //cout << "linkValue = " << linkValue << " times = " << (clusterArray[smallRow].numSeq * clusterArray[smallCol].numSeq) << endl;
+ //can we cluster???
+ bool cluster = false;
+
+ if (method == "nearest") { cluster = true; }
+ else if (method == "average") { cout << "still working on this... " << endl; //got to figure this out
+ }else{ //assume furthest
+ if (linkValue == (clusterArray[smallRow].numSeq * clusterArray[smallCol].numSeq)) { cluster = true; }
+ }
+
+ if (cluster) {
+ updateArrayandLinkTable();
+ clusterBins();
+ clusterNames();
+ }
}
-
//printInfo();
}
catch(exception& e) {
exit(1);
}
}
+//**********************************************************************************************************************
+vector<seqDist> HCluster::getSeqs(ifstream& filehandle, NameAssignment* nameMap, float cutoff){
+ try {
+ string firstName, secondName;
+ float distance, prevDistance;
+ vector<seqDist> sameSeqs;
+ prevDistance = -1;
+
+ //if you are not at the beginning of the file
+ if (exitedBreak) {
+ sameSeqs.push_back(next);
+ prevDistance = next.dist;
+ exitedBreak = false;
+ }
+
+ //get entry
+ while (!filehandle.eof()) {
+
+ filehandle >> firstName >> secondName >> distance; gobble(filehandle);
+
+ //save first one
+ if (prevDistance == -1) { prevDistance = distance; }
+
+ map<string,int>::iterator itA = nameMap->find(firstName);
+ map<string,int>::iterator itB = nameMap->find(secondName);
+ if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); }
+ if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); }
+
+ //using cutoff
+ if (distance > cutoff) { break; }
+
+ if (distance != -1) { //-1 means skip me
+
+ //are the distances the same
+ if (distance == prevDistance) { //save in vector
+ seqDist temp(itA->second, itB->second, distance);
+ sameSeqs.push_back(temp);
+ exitedBreak = false;
+ }else{
+ next.seq1 = itA->second;
+ next.seq2 = itB->second;
+ next.dist = distance;
+ exitedBreak = true;
+ break;
+ }
+ }
+ }
+
+ //rndomize matching dists
+ random_shuffle(sameSeqs.begin(), sameSeqs.end());
+
+ return sameSeqs;
+ }
+ catch(exception& e) {
+ errorOut(e, "HCluster", "getSeqs");
+ exit(1);
+ }
+}
/***********************************************************************/
#include "mothur.h"
+#include "nameassignment.hpp"
class RAbundVector;
class ListVector;
class HCluster {
public:
- HCluster(RAbundVector*, ListVector*);
+ HCluster(RAbundVector*, ListVector*, string);
~HCluster(){};
void update(int, int, float);
void setMapWanted(bool m);
map<string, int> getSeqtoBin() { return seq2Bin; }
+ vector<seqDist> getSeqs(ifstream&, NameAssignment*, float);
protected:
void clusterBins();
int smallCol;
float smallDist;
map<string, int> seq2Bin;
- bool mapWanted;
+ bool mapWanted, exitedBreak;
+ seqDist next;
+ string method;
};
cutoff += (5 / (precision * 10.0));
method = validParameter.validFile(parameters, "method", false);
- if (method == "not found") { method = "nearest"; }
+ if (method == "not found") { method = "furthest"; }
if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
else { mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); mothurOutEndLine(); abort = true; }
fileroot = getRootName(distfile);
- tag = "fn"; //until we figure out average and nearest methods
+ if (method == "furthest") { tag = "fn"; }
+ else if (method == "nearest") { tag = "nn"; }
+ else { tag = "an"; }
openOutputFile(fileroot+ tag + ".sabund", sabundFile);
openOutputFile(fileroot+ tag + ".rabund", rabundFile);
mothurOut("The name parameter allows you to enter your name file and is required if your distance file is in column format. \n");
mothurOut("The hcluster command should be in the following format: \n");
mothurOut("hcluster(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision) \n");
- mothurOut("The acceptable hcluster methods is furthest, but we hope to add nearest and average in the future.\n\n");
+ mothurOut("The acceptable hcluster methods are furthest and nearest, but we hope to add average in the future.\n\n");
}
catch(exception& e) {
errorOut(e, "HClusterCommand", "help");
string firstName, secondName;
float distance;
- cluster = new HCluster(rabund, list);
+ cluster = new HCluster(rabund, list, method);
vector<seqDist> seqs; seqs.resize(1); // to start loop
- exitedBreak = false; //lets you know if there is a distance stored in next
-
- while (seqs.size() != 0){
- seqs = getSeqs(in);
- random_shuffle(seqs.begin(), seqs.end());
-
- if (seqs.size() == 0) { break; } //there are no more distances
+ while (seqs.size() != 0){
+ seqs = cluster->getSeqs(in, globaldata->nameMap, cutoff);
+
for (int i = 0; i < seqs.size(); i++) { //-1 means skip me
if (print_start && isTrue(timing)) {
print_start = false;
}
- //cout << "before cluster update" << endl;
+
if (seqs[i].seq1 != seqs[i].seq2) {
cluster->update(seqs[i].seq1, seqs[i].seq2, seqs[i].dist);
float rndDist = roundDist(seqs[i].dist, precision);
- //cout << "after cluster update clusterSomething = " << clusteredSomething << " rndDist = " << rndDist << " rndPreviousDist = " << rndPreviousDist << endl;
-
-
+
if((previousDist <= 0.0000) && (seqs[i].dist != previousDist)){
printData("unique");
}
sabundFile.close();
rabundFile.close();
listFile.close();
-
delete cluster;
- //if (isTrue(timing)) {
- mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster. "); mothurOutEndLine();
- //}
+
+ mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster. "); mothurOutEndLine();
+
return 0;
}
catch(exception& e) {
}
//**********************************************************************************************************************
-vector<seqDist> HClusterCommand::getSeqs(ifstream& filehandle){
- try {
- string firstName, secondName;
- float distance, prevDistance;
- vector<seqDist> sameSeqs;
- prevDistance = -1;
-
- //if you are not at the beginning of the file
- if (exitedBreak) {
- sameSeqs.push_back(next);
- prevDistance = next.dist;
- exitedBreak = false;
- }
-
- //get entry
- while (filehandle) {
-
- filehandle >> firstName >> secondName >> distance;
-//cout << firstName << '\t' << secondName << '\t' << distance << endl;
- gobble(filehandle);
-
- //save first one
- if (prevDistance == -1) { prevDistance = distance; }
- //cout << prevDistance << endl;
-//if (globaldata->nameMap == NULL) { cout << "null" << endl; }
- map<string,int>::iterator itA = globaldata->nameMap->find(firstName);
- map<string,int>::iterator itB = globaldata->nameMap->find(secondName);
-
- if(itA == globaldata->nameMap->end()){
- cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1);
- }
- if(itB == globaldata->nameMap->end()){
- cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);
- }
- //cout << "here" << endl;
- //using cutoff
- if (distance > cutoff) { break; }
-
- if (distance != -1) { //-1 means skip me
-
- //are the distances the same
- if (distance == prevDistance) { //save in vector
- seqDist temp;
- temp.seq1 = itA->second;
- temp.seq2 = itB->second;
- temp.dist = distance;
- sameSeqs.push_back(temp);
- exitedBreak = false;
- //what about precision??
-
- }else{
- next.seq1 = itA->second;
- next.seq2 = itB->second;
- next.dist = distance;
- exitedBreak = true;
- break;
- }
-
- }
- }
-
- return sameSeqs;
- }
- catch(exception& e) {
- errorOut(e, "HClusterCommand", "getSeqs");
- exit(1);
- }
-
-
-}
-
-//**********************************************************************************************************************
//Applications Center, 1030 S. Highway A1A, Patrick AFB, FL 32925-3002, USA
//Received January 28, 2009; Revised April 14, 2009; Accepted April 15, 2009
/************************************************************/
-struct seqDist {
- int seq1;
- int seq2;
- float dist;
-};
-/************************************************************/
class HClusterCommand : public Command {
public:
ListVector oldList;
ReadCluster* read;
- bool abort;
-
- string method, fileroot, tag, distfile, format, phylipfile, columnfile, namefile, sort;
+ bool abort, sorted, print_start;
+ string method, fileroot, tag, distfile, format, phylipfile, columnfile, namefile, sort, showabund, timing;
double cutoff;
- string showabund, timing;
int precision, length;
ofstream sabundFile, rabundFile, listFile;
- //ifstream in;
- seqDist next;
- bool exitedBreak, sorted;
-
- bool print_start;
time_t start;
unsigned long loops;
void printData(string label);
- vector<seqDist> getSeqs(ifstream&);
};
/************************************************************/
*/
#include "mgclustercommand.h"
-#include "readcluster.h"
//**********************************************************************************************************************
MGClusterCommand::MGClusterCommand(string option){
start = time(NULL);
oldList = *list;
+ if (method == "furthest") { tag = "fn"; }
+ else if (method == "nearest") { tag = "nn"; }
+ else { tag = "an"; }
+
+ //open output files
+ openOutputFile(fileroot+ tag + ".list", listFile);
+ openOutputFile(fileroot+ tag + ".rabund", rabundFile);
+ openOutputFile(fileroot+ tag + ".sabund", sabundFile);
+
if (!hclusterWanted) {
//get distmatrix and overlap
SparseMatrix* distMatrix = read->getDistMatrix();
if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, distMatrix); }
else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, distMatrix); }
else if(method == "average"){ cluster = new AverageLinkage(rabund, list, distMatrix); }
- tag = cluster->getTag();
cluster->setMapWanted(true);
- //open output files
- openOutputFile(fileroot+ tag + ".list", listFile);
- openOutputFile(fileroot+ tag + ".rabund", rabundFile);
- openOutputFile(fileroot+ tag + ".sabund", sabundFile);
-
//cluster using cluster classes
while (distMatrix->getSmallDist() < cutoff && distMatrix->getNNodes() > 0){
sortHclusterFiles(distFile, overlapFile);
//create cluster
- hcluster = new HCluster(rabund, list);
+ hcluster = new HCluster(rabund, list, method);
hcluster->setMapWanted(true);
- //open output files
- openOutputFile(fileroot+ tag + ".list", listFile);
- openOutputFile(fileroot+ tag + ".rabund", rabundFile);
- openOutputFile(fileroot+ tag + ".sabund", sabundFile);
-
- vector<DistNode> seqs; seqs.resize(1); // to start loop
- exitedBreak = false; //lets you know if there is a distance stored in next
-
+ vector<seqDist> seqs; seqs.resize(1); // to start loop
ifstream inHcluster;
openInputFile(distFile, inHcluster);
while (seqs.size() != 0){
- seqs = getSeqs(inHcluster);
- random_shuffle(seqs.begin(), seqs.end());
+ seqs = hcluster->getSeqs(inHcluster, nameMap, cutoff);
for (int i = 0; i < seqs.size(); i++) { //-1 means skip me
while (!done) {
//get next overlap
- DistNode overlapNode;
+ seqDist overlapNode;
if (!hclusterWanted) {
if (count < overlapMatrix.size()) { //do we have another node in the matrix
overlapNode = overlapMatrix[count];
void MGClusterCommand::sortHclusterFiles(string unsortedDist, string unsortedOverlap) {
try {
//sort distFile
- ReadCluster* readCluster = new ReadCluster(unsortedDist, cutoff);
- readCluster->setFormat("column");
- readCluster->read(nameMap);
- string sortedDistFile = readCluster->getOutputFile();
- ListVector* extralist = readCluster->getListVector(); //we get this so we can delete it and not have a memory leak
-
+ string sortedDistFile = sortFile(unsortedDist);
remove(unsortedDist.c_str()); //delete unsorted file
distFile = sortedDistFile;
- delete extralist;
- delete readCluster;
//sort overlap file
- readCluster = new ReadCluster(unsortedOverlap, cutoff);
- readCluster->setFormat("column");
- readCluster->read(nameMap);
- string sortedOverlapFile = readCluster->getOutputFile();
- extralist = readCluster->getListVector(); //we get this so we can delete it and not have a memory leak
-
+ string sortedOverlapFile = sortFile(unsortedOverlap);
remove(unsortedOverlap.c_str()); //delete unsorted file
overlapFile = sortedOverlapFile;
- delete extralist;
- delete readCluster;
}
catch(exception& e) {
errorOut(e, "MGClusterCommand", "sortHclusterFiles");
exit(1);
}
}
-//**********************************************************************************************************************
-vector<DistNode> MGClusterCommand::getSeqs(ifstream& filehandle){
- try {
- string firstName, secondName;
- float distance, prevDistance;
- vector<DistNode> sameSeqs;
- prevDistance = -1;
-
- //if you are not at the beginning of the file
- if (exitedBreak) {
- sameSeqs.push_back(next);
- prevDistance = next.dist;
- exitedBreak = false;
- }
-
- //get entry
- while (!filehandle.eof()) {
-
- filehandle >> firstName >> secondName >> distance; gobble(filehandle);
-
- //save first one
- if (prevDistance == -1) { prevDistance = distance; }
-
- map<string,int>::iterator itA = nameMap->find(firstName);
- map<string,int>::iterator itB = nameMap->find(secondName);
- if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); }
- if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); }
-
- //using cutoff
- if (distance > cutoff) { break; }
-
- if (distance != -1) { //-1 means skip me
-
- //are the distances the same
- if (distance == prevDistance) { //save in vector
- DistNode temp(itA->second, itB->second, distance);
- sameSeqs.push_back(temp);
- exitedBreak = false;
- }else{
- next.seq1 = itA->second;
- next.seq2 = itB->second;
- next.dist = distance;
- exitedBreak = true;
- break;
- }
- }
- }
-
- return sameSeqs;
- }
- catch(exception& e) {
- errorOut(e, "MGClusterCommand", "getSeqs");
- exit(1);
- }
-}
+
//**********************************************************************************************************************
HCluster* hcluster;
ListVector* list;
ListVector oldList;
- vector<DistNode> overlapMatrix;
- DistNode next;
-
+ vector<seqDist> overlapMatrix;
string blastfile, method, namefile, overlapFile, distFile;
ofstream sabundFile, rabundFile, listFile;
float cutoff, penalty;
int precision, length, precisionLength;
- bool abort, minWanted, hclusterWanted, exitedBreak, merge;
+ bool abort, minWanted, hclusterWanted, merge;
void printData(ListVector*);
ListVector* mergeOPFs(map<string, int>, float);
void sortHclusterFiles(string, string);
- vector<DistNode> getSeqs(ifstream&);
+ vector<seqDist> getSeqs(ifstream&);
};
int smallChild; //used to make linkTable work with list and rabund. represents bin number of this cluster node
clusterNode(int num, int par, int kid) : numSeq(num), parent(par), smallChild(kid) {};
};
-
+/************************************************************/
+struct seqDist {
+ int seq1;
+ int seq2;
+ float dist;
+ seqDist() {}
+ seqDist(int s1, int s2, float d) : seq1(s1), seq2(s2), dist(d) {}
+ ~seqDist() {}
+};
/***********************************************************************/
// snagged from http://www.parashift.com/c++-faq-lite/misc-technical-issues.html#faq-39.2
}
}
+/**************************************************************************************************/
+inline void appendFiles(string temp, string filename) {
+ try{
+ ofstream output;
+ ifstream input;
+
+ //open output file in append mode
+ openOutputFileAppend(filename, output);
+ openInputFile(temp, input);
+
+ while(char c = input.get()){
+ if(input.eof()) { break; }
+ else { output << c; }
+ }
+
+ input.close();
+ output.close();
+ }
+ catch(exception& e) {
+ errorOut(e, "mothur", "appendFiles");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+inline string sortFile(string distFile){
+ try {
+ string outfile = getRootName(distFile) + "sorted.dist";
+
+ //if you can, use the unix sort since its been optimized for years
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ string command = "sort -n -k +3 " + distFile + " -o " + outfile;
+ system(command.c_str());
+ #else //you are stuck with my best attempt...
+ //windows sort does not have a way to specify a column, only a character in the line
+ //since we cannot assume that the distance will always be at the the same character location on each line
+ //due to variable sequence name lengths, I chose to force the distance into first position, then sort and then put it back.
+
+ //read in file line by file and put distance first
+ string tempDistFile = distFile + ".temp";
+ ifstream input;
+ ofstream output;
+ openInputFile(distFile, input);
+ openOutputFile(tempDistFile, output);
+
+ string firstName, secondName;
+ float dist;
+ while (input) {
+ input >> firstName >> secondName >> dist;
+ output << dist << '\t' << firstName << '\t' << secondName << endl;
+ gobble(input);
+ }
+ input.close();
+ output.close();
+
+
+ //sort using windows sort
+ string tempOutfile = outfile + ".temp";
+ string command = "sort " + tempDistFile + " /O " + tempOutfile;
+ system(command.c_str());
+
+ //read in sorted file and put distance at end again
+ ifstream input2;
+ openInputFile(tempOutfile, input2);
+ openOutputFile(outfile, output);
+
+ while (input2) {
+ input2 >> dist >> firstName >> secondName;
+ output << firstName << '\t' << secondName << '\t' << dist << endl;
+ gobble(input2);
+ }
+ input2.close();
+ output.close();
+
+ //remove temp files
+ remove(tempDistFile.c_str());
+ remove(tempOutfile.c_str());
+ #endif
+
+ return outfile;
+ }
+ catch(exception& e) {
+ errorOut(e, "mothur", "sortFile");
+ exit(1);
+ }
+}
/**************************************************************************************************/
#endif
--- /dev/null
+/*
+ * preclustercommand.cpp
+ * Mothur
+ *
+ * Created by westcott on 12/21/09.
+ * Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "preclustercommand.h"
+
+//**********************************************************************************************************************
+inline bool comparePriority(seqPNode first, seqPNode second) { return (first.numIdentical > second.numIdentical); }
+//**********************************************************************************************************************
+
+PreClusterCommand::PreClusterCommand(string option){
+ try {
+ abort = false;
+
+ //allow user to run help
+ if(option == "help") { help(); abort = true; }
+
+ else {
+ //valid paramters for this command
+ string Array[] = {"fasta", "name", "diffs"};
+ vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
+
+ OptionParser parser(option);
+ map<string, string> parameters = parser.getParameters();
+
+ ValidParameters validParameter;
+
+ //check to make sure all parameters are valid for command
+ for (map<string, string>::iterator it2 = parameters.begin(); it2 != parameters.end(); it2++) {
+ if (validParameter.isValidParameter(it2->first, myArray, it2->second) != true) { abort = true; }
+ }
+
+ //check for required parameters
+ fastafile = validParameter.validFile(parameters, "fasta", true);
+ if (fastafile == "not found") { mothurOut("fasta is a required parameter for the pre.cluster command."); mothurOutEndLine(); abort = true; }
+ else if (fastafile == "not open") { abort = true; }
+
+ //check for optional parameter and set defaults
+ // ...at some point should added some additional type checking...
+ namefile = validParameter.validFile(parameters, "name", true);
+ if (namefile == "not found") { namefile = ""; }
+ else if (namefile == "not open") { abort = true; }
+ else { readNameFile(); }
+
+ string temp = validParameter.validFile(parameters, "diffs", false); if(temp == "not found"){ temp = "1"; }
+ convert(temp, diffs);
+ }
+
+ }
+ catch(exception& e) {
+ errorOut(e, "PreClusterCommand", "PreClusterCommand");
+ exit(1);
+ }
+}
+
+//**********************************************************************************************************************
+PreClusterCommand::~PreClusterCommand(){}
+//**********************************************************************************************************************
+
+void PreClusterCommand::help(){
+ try {
+ mothurOut("The pre.cluster command groups sequences that are within a given number of base mismatches.\n");
+ mothurOut("The pre.cluster command outputs a new fasta and name file.\n");
+ mothurOut("The pre.cluster command parameters are fasta, names and diffs. The fasta parameter is required. \n");
+ mothurOut("The names parameter allows you to give a list of seqs that are identical. This file is 2 columns, first column is name or representative sequence, second column is a list of its identical sequences separated by commas.\n");
+ mothurOut("The diffs parameter allows you to specify maximum number of mismatched bases allowed between sequences in a grouping. The default is 1.\n");
+ mothurOut("The pre.cluster command should be in the following format: \n");
+ mothurOut("pre.cluster(fasta=yourFastaFile, names=yourNamesFile, diffs=yourMaxDiffs) \n");
+ mothurOut("Example pre.cluster(fasta=amazon.fasta, diffs=2).\n");
+ mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n\n");
+ }
+ catch(exception& e) {
+ errorOut(e, "PreClusterCommand", "help");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
+
+int PreClusterCommand::execute(){
+ try {
+
+ if (abort == true) { return 0; }
+
+ //reads fasta file and return number of seqs
+ int numSeqs = readSeqs(); //fills alignSeqs and makes all seqs active
+
+ if (numSeqs == 0) { mothurOut("Error reading fasta file...please correct."); mothurOutEndLine(); return 0; }
+ if (diffs > length) { mothurOut("Error: diffs is greater than your sequence length."); mothurOutEndLine(); return 0; }
+
+ //clear sizes since you only needed this info to build the alignSeqs seqPNode structs
+ sizes.clear();
+
+ //sort seqs by number of identical seqs
+ sort(alignSeqs.begin(), alignSeqs.end(), comparePriority);
+
+ //go through active list and cluster everthing you can, until all nodes are inactive
+ //taking advantage of the fact that maps are already sorted
+ map<string, bool>::iterator itActive;
+ map<string, bool>::iterator it2Active;
+ int count = 0;
+
+ for (int i = 0; i < alignSeqs.size(); i++) {
+
+ //are you active
+ itActive = active.find(alignSeqs[i].seq.getName());
+
+ if (itActive != active.end()) { //this sequence has not been merged yet
+
+ //try to merge it with all smaller seqs
+ for (int j = i; j < alignSeqs.size(); j++) {
+
+ if (i != j) {
+ //are you active
+ it2Active = active.find(alignSeqs[j].seq.getName());
+ if (it2Active != active.end()) { //this sequence has not been merged yet
+ //are you within "diff" bases
+ int mismatch = calcMisMatches(alignSeqs[i].seq.getAligned(), alignSeqs[j].seq.getAligned());
+
+ if (mismatch <= diffs) {
+ //merge
+ names[alignSeqs[i].seq.getName()] += "," + names[alignSeqs[j].seq.getName()];
+
+ //remove from active list
+ active.erase(it2Active);
+
+ //set numIdentical to 0, so you only print the representative seqs in the fasta file
+ alignSeqs[j].numIdentical = 0;
+ count++;
+ }
+ }//end if j active
+ }//end if i != j
+ }//end for loop
+
+ //remove from active list
+ active.erase(itActive);
+ }//end if active i
+ }
+
+ string newFastaFile = getRootName(fastafile) + "precluster" + getExtension(fastafile);
+ string newNamesFile = getRootName(fastafile) + "precluster.names";
+
+ printData(newFastaFile, newNamesFile);
+
+ mothurOut("Total number of sequences before precluster was " + toString(alignSeqs.size()) + "."); mothurOutEndLine();
+ mothurOut("pre.cluster removed " + toString(count) + " sequences."); mothurOutEndLine();
+
+ return 0;
+
+ }
+ catch(exception& e) {
+ errorOut(e, "PreClusterCommand", "execute");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+int PreClusterCommand::readSeqs(){
+ try {
+ ifstream inFasta;
+ openInputFile(fastafile, inFasta);
+ length = 0;
+ map<string, string>::iterator it;
+
+ while (!inFasta.eof()) {
+ Sequence temp(inFasta); //read seq
+
+ if (temp.getName() != "") {
+ if (namefile != "") {
+ //make sure fasta and name files match
+ it = names.find(temp.getName());
+ if (it == names.end()) { mothurOut(temp.getName() + " is not in your names file, please correct."); mothurOutEndLine(); exit(1); }
+ }else { sizes[temp.getName()] = 1; }
+
+ seqPNode tempNode(sizes[temp.getName()], temp);
+ alignSeqs.push_back(tempNode);
+ active[temp.getName()] = true;
+ }
+ gobble(inFasta);
+ }
+ inFasta.close();
+
+ if (alignSeqs.size() != 0) { length = alignSeqs[0].seq.getAligned().length(); }
+
+ return alignSeqs.size();
+ }
+ catch(exception& e) {
+ errorOut(e, "PreClusterCommand", "readSeqs");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+void PreClusterCommand::readNameFile(){
+ try {
+ ifstream in;
+ openInputFile(namefile, in);
+ string firstCol, secondCol;
+
+ while (!in.eof()) {
+ in >> firstCol >> secondCol; gobble(in);
+ names[firstCol] = secondCol;
+ int size = 1;
+ while (secondCol.find_first_of(',') != -1) {
+ size++;
+ secondCol = secondCol.substr(secondCol.find_first_of(',')+1, secondCol.length());
+ }
+ sizes[firstCol] = size;
+ }
+ in.close();
+ }
+ catch(exception& e) {
+ errorOut(e, "PreClusterCommand", "readNameFile");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+int PreClusterCommand::calcMisMatches(string seq1, string seq2){
+ try {
+ int numBad = 0;
+
+ for (int i = 0; i < seq1.length(); i++) {
+ //do they match
+ if (seq1[i] != seq2[i]) { numBad++; }
+ if (numBad > diffs) { return length; } //to far to cluster
+ }
+
+ return numBad;
+ }
+ catch(exception& e) {
+ errorOut(e, "PreClusterCommand", "calcMisMatches");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+void PreClusterCommand::printData(string newfasta, string newname){
+ try {
+ ofstream outFasta;
+ ofstream outNames;
+ openOutputFile(newfasta, outFasta);
+ openOutputFile(newname, outNames);
+
+ map<string, string>::iterator itNames;
+
+ for (int i = 0; i < alignSeqs.size(); i++) {
+ if (alignSeqs[i].numIdentical != 0) {
+ alignSeqs[i].seq.printSequence(outFasta);
+
+ itNames = names.find(alignSeqs[i].seq.getName());
+
+ outNames << itNames->first << '\t' << itNames->second << endl;
+ }
+ }
+
+ outFasta.close();
+ outNames.close();
+
+ }
+ catch(exception& e) {
+ errorOut(e, "PreClusterCommand", "printData");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+
--- /dev/null
+#ifndef PRECLUSTERCOMMAND_H
+#define PRECLUSTERCOMMAND_H
+
+
+/*
+ * preclustercommand.h
+ * Mothur
+ *
+ * Created by westcott on 12/21/09.
+ * Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+
+#include "command.hpp"
+#include "sequence.hpp"
+
+/************************************************************/
+struct seqPNode {
+ int numIdentical;
+ Sequence seq;
+ seqPNode() {}
+ seqPNode(int s, Sequence q) : numIdentical(s), seq(q) {}
+ ~seqPNode() {}
+};
+/************************************************************/
+
+class PreClusterCommand : public Command {
+
+public:
+ PreClusterCommand(string);
+ ~PreClusterCommand();
+ int execute();
+ void help();
+
+private:
+ int diffs, length;
+ bool abort;
+ string fastafile, namefile;
+ vector<seqPNode> alignSeqs; //maps the number of identical seqs to a sequence
+ map<string, string> names; //represents the names file first column maps to second column
+ map<string, int> sizes; //this map a seq name to the number of identical seqs in the names file
+ map<string, bool> active; //maps sequence name to whether it has already been merged or not.
+
+ int readSeqs();
+ int calcMisMatches(string, string);
+ void readNameFile();
+ void printData(string, string); //fasta filename, names file name
+};
+
+/************************************************************/
+
+
+
+
+
+#endif
+
+
//********************************************************************************************************************
//sorts lowest to highest
-inline bool compareOverlap(DistNode left, DistNode right){
+inline bool compareOverlap(seqDist left, seqDist right){
return (left.dist < right.dist);
}
/*********************************************************************************************/
//if there is a valid overlap, add it
if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) {
if (!hclusterWanted) {
- DistNode overlapValue(itA->second, itB->second, thisoverlap);
+ seqDist overlapValue(itA->second, itB->second, thisoverlap);
overlap.push_back(overlapValue);
}else {
outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl;
//if there is a valid overlap, add it
if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) {
if (!hclusterWanted) {
- DistNode overlapValue(itA->second, itB->second, thisoverlap);
+ seqDist overlapValue(itA->second, itB->second, thisoverlap);
//cout << "overlap = " << itA->second << '\t' << itB->second << '\t' << thisoverlap << endl;
overlap.push_back(overlapValue);
}else {
//if there is a valid overlap, add it
if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) {
if (!hclusterWanted) {
- DistNode overlapValue(itA->second, itB->second, thisoverlap);
+ seqDist overlapValue(itA->second, itB->second, thisoverlap);
overlap.push_back(overlapValue);
}else {
outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl;
#include "sparsematrix.hpp"
#include "nameassignment.hpp"
-/****************************************************************************************/
-struct DistNode {
- int seq1;
- int seq2;
- float dist;
- DistNode(int s1, int s2, float d) : seq1(s1), seq2(s2), dist(d) {}
- DistNode() {}
- ~DistNode() {}
-};
/****************************************************************************************/
//Note: this class creates a sparsematrix and list if the read is executed, but does not delete them on deconstruction.
void read(NameAssignment*);
SparseMatrix* getDistMatrix() { return matrix; }
- vector<DistNode> getOverlapMatrix() { return overlap; }
+ vector<seqDist> getOverlapMatrix() { return overlap; }
string getOverlapFile() { return overlapFile; }
string getDistFile() { return distFile; }
bool hclusterWanted;
SparseMatrix* matrix;
- vector<DistNode> overlap;
+ vector<seqDist> overlap;
void readNames(NameAssignment*);
};
if (format == "phylip") { convertPhylip2Column(nameMap); }
else { list = new ListVector(nameMap->getListVector()); }
- createHClusterFile();
+ OutPutFile = sortFile(distFile);
}
catch(exception& e) {
}
/***********************************************************************/
-void ReadCluster::createHClusterFile(){
- try {
- string outfile = getRootName(distFile) + "sorted.dist";
-
- //if you can, use the unix sort since its been optimized for years
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
- string command = "sort -n -k +3 " + distFile + " -o " + outfile;
- system(command.c_str());
- #else //you are stuck with my best attempt...
- //windows sort does not have a way to specify a column, only a character in the line
- //since we cannot assume that the distance will always be at the the same character location on each line
- //due to variable sequence name lengths, I chose to force the distance into first position, then sort and then put it back.
-
- //read in file line by file and put distance first
- string tempDistFile = distFile + ".temp";
- ifstream input;
- ofstream output;
- openInputFile(distFile, input);
- openOutputFile(tempDistFile, output);
-
- string firstName, secondName;
- float dist;
- while (input) {
- input >> firstName >> secondName >> dist;
- output << dist << '\t' << firstName << '\t' << secondName << endl;
- gobble(input);
- }
- input.close();
- output.close();
-
-
- //sort using windows sort
- string tempOutfile = outfile + ".temp";
- string command = "sort " + tempDistFile + " /O " + tempOutfile;
- system(command.c_str());
-
- //read in sorted file and put distance at end again
- ifstream input2;
- openInputFile(tempOutfile, input2);
- openOutputFile(outfile, output);
-
- while (input2) {
- input2 >> dist >> firstName >> secondName;
- output << firstName << '\t' << secondName << '\t' << dist << endl;
- gobble(input2);
- }
- input2.close();
- output.close();
-
- //remove temp files
- remove(tempDistFile.c_str());
- remove(tempOutfile.c_str());
- #endif
-
- OutPutFile = outfile;
- }
- catch(exception& e) {
- errorOut(e, "ReadCluster", "createHClusterFile");
- exit(1);
- }
-}
-
-
-/***********************************************************************/
-
void ReadCluster::convertPhylip2Column(NameAssignment* nameMap){
try {
//convert phylip file to column file
string getOutputFile() { return OutPutFile; }
void setFormat(string f) { format = f; }
ListVector* getListVector() { return list; }
- //NameAssignment* getNameMap() { return nameMap; }
private:
GlobalData* globaldata;
string OutPutFile, format;
ListVector* list;
float cutoff;
- //NameAssignment* nameMap;
- void createHClusterFile();
void convertPhylip2Column(NameAssignment*);
};