From b5a791c81d432082bf38755a08b33863f255341d Mon Sep 17 00:00:00 2001 From: westcott Date: Wed, 23 Dec 2009 18:47:52 +0000 Subject: [PATCH] precluster command finished --- Mothur.xcodeproj/project.pbxproj | 6 + commandfactory.cpp | 3 + distancecommand.cpp | 2 +- distancecommand.h | 2 +- hcluster.cpp | 97 +++++++++-- hcluster.h | 8 +- hclustercommand.cpp | 105 ++---------- hclustercommand.h | 18 +-- mgclustercommand.cpp | 106 ++---------- mgclustercommand.h | 8 +- mothur.h | 96 ++++++++++- preclustercommand.cpp | 269 +++++++++++++++++++++++++++++++ preclustercommand.h | 59 +++++++ readblast.cpp | 8 +- readblast.h | 13 +- readcluster.cpp | 67 +------- readcluster.h | 3 - 17 files changed, 563 insertions(+), 307 deletions(-) create mode 100644 preclustercommand.cpp create mode 100644 preclustercommand.h diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index 5ae6f33..1fbb5b7 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -158,6 +158,7 @@ 7EC3D4550FA0FFF900338DA5 /* coverage.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 7EC3D4500FA0FFF900338DA5 /* coverage.cpp */; }; 7EC3D4560FA0FFF900338DA5 /* whittaker.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 7EC3D4530FA0FFF900338DA5 /* whittaker.cpp */; }; 8DD76F6A0486A84900D96B5E /* Mothur.1 in CopyFiles */ = {isa = PBXBuildFile; fileRef = C6859E8B029090EE04C91782 /* Mothur.1 */; }; + A7027F2810DFF86D00BF45FE /* preclustercommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7027F2710DFF86D00BF45FE /* preclustercommand.cpp */; }; A70B53AA0F4CD7AD0064797E /* getgroupcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70B53A40F4CD7AD0064797E /* getgroupcommand.cpp */; }; A70B53AB0F4CD7AD0064797E /* getlabelcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70B53A60F4CD7AD0064797E /* getlabelcommand.cpp */; }; A70B53AC0F4CD7AD0064797E /* getlinecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70B53A80F4CD7AD0064797E /* getlinecommand.cpp */; }; @@ -526,6 +527,8 @@ 7EC3D4530FA0FFF900338DA5 /* whittaker.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = whittaker.cpp; sourceTree = SOURCE_ROOT; }; 7EC3D4540FA0FFF900338DA5 /* whittaker.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = whittaker.h; sourceTree = SOURCE_ROOT; }; 8DD76F6C0486A84900D96B5E /* mothur */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = mothur; sourceTree = BUILT_PRODUCTS_DIR; }; + A7027F2610DFF86D00BF45FE /* preclustercommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = preclustercommand.h; sourceTree = SOURCE_ROOT; }; + A7027F2710DFF86D00BF45FE /* preclustercommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = preclustercommand.cpp; sourceTree = SOURCE_ROOT; }; A70B53A40F4CD7AD0064797E /* getgroupcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getgroupcommand.cpp; sourceTree = SOURCE_ROOT; }; A70B53A50F4CD7AD0064797E /* getgroupcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getgroupcommand.h; sourceTree = SOURCE_ROOT; }; A70B53A60F4CD7AD0064797E /* getlabelcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getlabelcommand.cpp; sourceTree = SOURCE_ROOT; }; @@ -931,6 +934,8 @@ 3792946F0F2E191800B9034A /* parsimonycommand.cpp */, A773808E10B6EFD1002A6A61 /* phylotypecommand.h */, A773808F10B6EFD1002A6A61 /* phylotypecommand.cpp */, + A7027F2610DFF86D00BF45FE /* preclustercommand.h */, + A7027F2710DFF86D00BF45FE /* preclustercommand.cpp */, 37D927FE0F21331F001D4494 /* quitcommand.h */, 37D927FD0F21331F001D4494 /* quitcommand.cpp */, 37D928080F21331F001D4494 /* rarefactcommand.h */, @@ -1299,6 +1304,7 @@ A773809010B6EFD1002A6A61 /* phylotypecommand.cpp in Sources */, A727E84A10D14568001A8432 /* readblast.cpp in Sources */, A7E0AAB610D2886D00CF407B /* mgclustercommand.cpp in Sources */, + A7027F2810DFF86D00BF45FE /* preclustercommand.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; diff --git a/commandfactory.cpp b/commandfactory.cpp index b361b66..d2b22d5 100644 --- a/commandfactory.cpp +++ b/commandfactory.cpp @@ -61,6 +61,7 @@ #include "classifyseqscommand.h" #include "phylotypecommand.h" #include "mgclustercommand.h" +#include "preclustercommand.h" /*******************************************************/ @@ -131,6 +132,7 @@ CommandFactory::CommandFactory(){ commands["classify.seqs"] = "classify.seqs"; commands["phylotype"] = "phylotype"; commands["mgcluster"] = "mgcluster"; + commands["pre.cluster"] = "pre.cluster"; } /***********************************************************/ @@ -199,6 +201,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString){ else if(commandName == "classify.seqs") { command = new ClassifySeqsCommand(optionString); } else if(commandName == "phylotype") { command = new PhylotypeCommand(optionString); } else if(commandName == "mgcluster") { command = new MGClusterCommand(optionString); } + else if(commandName == "pre.cluster") { command = new PreClusterCommand(optionString); } else { command = new NoCommand(optionString); } return command; diff --git a/distancecommand.cpp b/distancecommand.cpp index f8a2a75..3ac3969 100644 --- a/distancecommand.cpp +++ b/distancecommand.cpp @@ -278,7 +278,7 @@ int DistanceCommand::driver(int startLine, int endLine, string dFileName, float } } -/**************************************************************************************************/ +/************************************************************************************************** void DistanceCommand::appendFiles(string temp, string filename) { try{ ofstream output; diff --git a/distancecommand.h b/distancecommand.h index 6a7ac93..542f428 100644 --- a/distancecommand.h +++ b/distancecommand.h @@ -43,7 +43,7 @@ private: bool abort; vector Estimators; //holds estimators to be used - void appendFiles(string, string); + //void appendFiles(string, string); void createProcesses(string); int driver(/*Dist*, SequenceDB, */int, int, string, float); diff --git a/hcluster.cpp b/hcluster.cpp index 2d88533..4e8c994 100644 --- a/hcluster.cpp +++ b/hcluster.cpp @@ -12,11 +12,13 @@ #include "listvector.hpp" #include "sparsematrix.hpp" + /***********************************************************************/ -HCluster::HCluster(RAbundVector* rav, ListVector* lv) : rabund(rav), list(lv){ +HCluster::HCluster(RAbundVector* rav, ListVector* lv, string m) : rabund(rav), list(lv), method(m){ try { mapWanted = false; + exitedBreak = false; numSeqs = list->getNumSeqs(); //initialize cluster array @@ -263,27 +265,32 @@ void HCluster::update(int row, int col, float distance){ smallRow = row; smallCol = col; smallDist = distance; - bool clustered = false; //find upmost parent of row and col smallRow = getUpmostParent(smallRow); smallCol = getUpmostParent(smallCol); //cout << "row = " << row << " smallRow = " << smallRow << " col = " << col << " smallCol = " << smallCol << " dist = " << distance << endl; - - //are they active in the link table - int linkValue = makeActive(); //after this point this nodes info is active in linkTable - //printInfo(); -//cout << "linkValue = " << linkValue << " times = " << (clusterArray[smallRow].numSeq * clusterArray[smallCol].numSeq) << endl; - //can we cluster??? - if (linkValue == (clusterArray[smallRow].numSeq * clusterArray[smallCol].numSeq)) { - //printInfo(); - updateArrayandLinkTable(); - clusterBins(); - clusterNames(); - clustered = true; - //printInfo(); + //you don't want to cluster with yourself + if (smallRow != smallCol) { + //are they active in the link table + int linkValue = makeActive(); //after this point this nodes info is active in linkTable + //printInfo(); + //cout << "linkValue = " << linkValue << " times = " << (clusterArray[smallRow].numSeq * clusterArray[smallCol].numSeq) << endl; + //can we cluster??? + bool cluster = false; + + if (method == "nearest") { cluster = true; } + else if (method == "average") { cout << "still working on this... " << endl; //got to figure this out + }else{ //assume furthest + if (linkValue == (clusterArray[smallRow].numSeq * clusterArray[smallCol].numSeq)) { cluster = true; } + } + + if (cluster) { + updateArrayandLinkTable(); + clusterBins(); + clusterNames(); + } } - //printInfo(); } catch(exception& e) { @@ -340,6 +347,64 @@ try { exit(1); } } +//********************************************************************************************************************** +vector HCluster::getSeqs(ifstream& filehandle, NameAssignment* nameMap, float cutoff){ + try { + string firstName, secondName; + float distance, prevDistance; + vector sameSeqs; + prevDistance = -1; + + //if you are not at the beginning of the file + if (exitedBreak) { + sameSeqs.push_back(next); + prevDistance = next.dist; + exitedBreak = false; + } + + //get entry + while (!filehandle.eof()) { + + filehandle >> firstName >> secondName >> distance; gobble(filehandle); + + //save first one + if (prevDistance == -1) { prevDistance = distance; } + + map::iterator itA = nameMap->find(firstName); + map::iterator itB = nameMap->find(secondName); + if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); } + if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); } + + //using cutoff + if (distance > cutoff) { break; } + + if (distance != -1) { //-1 means skip me + + //are the distances the same + if (distance == prevDistance) { //save in vector + seqDist temp(itA->second, itB->second, distance); + sameSeqs.push_back(temp); + exitedBreak = false; + }else{ + next.seq1 = itA->second; + next.seq2 = itB->second; + next.dist = distance; + exitedBreak = true; + break; + } + } + } + + //rndomize matching dists + random_shuffle(sameSeqs.begin(), sameSeqs.end()); + + return sameSeqs; + } + catch(exception& e) { + errorOut(e, "HCluster", "getSeqs"); + exit(1); + } +} /***********************************************************************/ diff --git a/hcluster.h b/hcluster.h index dd4c679..065d559 100644 --- a/hcluster.h +++ b/hcluster.h @@ -12,6 +12,7 @@ #include "mothur.h" +#include "nameassignment.hpp" class RAbundVector; class ListVector; @@ -20,11 +21,12 @@ class ListVector; class HCluster { public: - HCluster(RAbundVector*, ListVector*); + HCluster(RAbundVector*, ListVector*, string); ~HCluster(){}; void update(int, int, float); void setMapWanted(bool m); map getSeqtoBin() { return seq2Bin; } + vector getSeqs(ifstream&, NameAssignment*, float); protected: void clusterBins(); @@ -49,7 +51,9 @@ protected: int smallCol; float smallDist; map seq2Bin; - bool mapWanted; + bool mapWanted, exitedBreak; + seqDist next; + string method; }; diff --git a/hclustercommand.cpp b/hclustercommand.cpp index 3c79f64..506ba54 100644 --- a/hclustercommand.cpp +++ b/hclustercommand.cpp @@ -76,7 +76,7 @@ HClusterCommand::HClusterCommand(string option){ cutoff += (5 / (precision * 10.0)); method = validParameter.validFile(parameters, "method", false); - if (method == "not found") { method = "nearest"; } + if (method == "not found") { method = "furthest"; } if ((method == "furthest") || (method == "nearest") || (method == "average")) { } else { mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); mothurOutEndLine(); abort = true; } @@ -96,7 +96,9 @@ HClusterCommand::HClusterCommand(string option){ fileroot = getRootName(distfile); - tag = "fn"; //until we figure out average and nearest methods + if (method == "furthest") { tag = "fn"; } + else if (method == "nearest") { tag = "nn"; } + else { tag = "an"; } openOutputFile(fileroot+ tag + ".sabund", sabundFile); openOutputFile(fileroot+ tag + ".rabund", rabundFile); @@ -119,7 +121,7 @@ void HClusterCommand::help(){ mothurOut("The name parameter allows you to enter your name file and is required if your distance file is in column format. \n"); mothurOut("The hcluster command should be in the following format: \n"); mothurOut("hcluster(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision) \n"); - mothurOut("The acceptable hcluster methods is furthest, but we hope to add nearest and average in the future.\n\n"); + mothurOut("The acceptable hcluster methods are furthest and nearest, but we hope to add average in the future.\n\n"); } catch(exception& e) { errorOut(e, "HClusterCommand", "help"); @@ -182,17 +184,13 @@ int HClusterCommand::execute(){ string firstName, secondName; float distance; - cluster = new HCluster(rabund, list); + cluster = new HCluster(rabund, list, method); vector seqs; seqs.resize(1); // to start loop - exitedBreak = false; //lets you know if there is a distance stored in next - - while (seqs.size() != 0){ - seqs = getSeqs(in); - random_shuffle(seqs.begin(), seqs.end()); - - if (seqs.size() == 0) { break; } //there are no more distances + while (seqs.size() != 0){ + seqs = cluster->getSeqs(in, globaldata->nameMap, cutoff); + for (int i = 0; i < seqs.size(); i++) { //-1 means skip me if (print_start && isTrue(timing)) { @@ -203,14 +201,12 @@ int HClusterCommand::execute(){ print_start = false; } - //cout << "before cluster update" << endl; + if (seqs[i].seq1 != seqs[i].seq2) { cluster->update(seqs[i].seq1, seqs[i].seq2, seqs[i].dist); float rndDist = roundDist(seqs[i].dist, precision); - //cout << "after cluster update clusterSomething = " << clusteredSomething << " rndDist = " << rndDist << " rndPreviousDist = " << rndPreviousDist << endl; - - + if((previousDist <= 0.0000) && (seqs[i].dist != previousDist)){ printData("unique"); } @@ -256,11 +252,10 @@ int HClusterCommand::execute(){ sabundFile.close(); rabundFile.close(); listFile.close(); - delete cluster; - //if (isTrue(timing)) { - mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster. "); mothurOutEndLine(); - //} + + mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster. "); mothurOutEndLine(); + return 0; } catch(exception& e) { @@ -299,76 +294,4 @@ void HClusterCommand::printData(string label){ } //********************************************************************************************************************** -vector HClusterCommand::getSeqs(ifstream& filehandle){ - try { - string firstName, secondName; - float distance, prevDistance; - vector sameSeqs; - prevDistance = -1; - - //if you are not at the beginning of the file - if (exitedBreak) { - sameSeqs.push_back(next); - prevDistance = next.dist; - exitedBreak = false; - } - - //get entry - while (filehandle) { - - filehandle >> firstName >> secondName >> distance; -//cout << firstName << '\t' << secondName << '\t' << distance << endl; - gobble(filehandle); - - //save first one - if (prevDistance == -1) { prevDistance = distance; } - //cout << prevDistance << endl; -//if (globaldata->nameMap == NULL) { cout << "null" << endl; } - map::iterator itA = globaldata->nameMap->find(firstName); - map::iterator itB = globaldata->nameMap->find(secondName); - - if(itA == globaldata->nameMap->end()){ - cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); - } - if(itB == globaldata->nameMap->end()){ - cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); - } - //cout << "here" << endl; - //using cutoff - if (distance > cutoff) { break; } - - if (distance != -1) { //-1 means skip me - - //are the distances the same - if (distance == prevDistance) { //save in vector - seqDist temp; - temp.seq1 = itA->second; - temp.seq2 = itB->second; - temp.dist = distance; - sameSeqs.push_back(temp); - exitedBreak = false; - //what about precision?? - - }else{ - next.seq1 = itA->second; - next.seq2 = itB->second; - next.dist = distance; - exitedBreak = true; - break; - } - - } - } - - return sameSeqs; - } - catch(exception& e) { - errorOut(e, "HClusterCommand", "getSeqs"); - exit(1); - } - - -} - -//********************************************************************************************************************** diff --git a/hclustercommand.h b/hclustercommand.h index 1dd46b9..4c8fa9c 100644 --- a/hclustercommand.h +++ b/hclustercommand.h @@ -29,12 +29,6 @@ //Applications Center, 1030 S. Highway A1A, Patrick AFB, FL 32925-3002, USA //Received January 28, 2009; Revised April 14, 2009; Accepted April 15, 2009 /************************************************************/ -struct seqDist { - int seq1; - int seq2; - float dist; -}; -/************************************************************/ class HClusterCommand : public Command { public: @@ -52,23 +46,15 @@ private: ListVector oldList; ReadCluster* read; - bool abort; - - string method, fileroot, tag, distfile, format, phylipfile, columnfile, namefile, sort; + bool abort, sorted, print_start; + string method, fileroot, tag, distfile, format, phylipfile, columnfile, namefile, sort, showabund, timing; double cutoff; - string showabund, timing; int precision, length; ofstream sabundFile, rabundFile, listFile; - //ifstream in; - seqDist next; - bool exitedBreak, sorted; - - bool print_start; time_t start; unsigned long loops; void printData(string label); - vector getSeqs(ifstream&); }; /************************************************************/ diff --git a/mgclustercommand.cpp b/mgclustercommand.cpp index d5b84a8..07e3776 100644 --- a/mgclustercommand.cpp +++ b/mgclustercommand.cpp @@ -8,7 +8,6 @@ */ #include "mgclustercommand.h" -#include "readcluster.h" //********************************************************************************************************************** MGClusterCommand::MGClusterCommand(string option){ @@ -138,6 +137,15 @@ int MGClusterCommand::execute(){ start = time(NULL); oldList = *list; + if (method == "furthest") { tag = "fn"; } + else if (method == "nearest") { tag = "nn"; } + else { tag = "an"; } + + //open output files + openOutputFile(fileroot+ tag + ".list", listFile); + openOutputFile(fileroot+ tag + ".rabund", rabundFile); + openOutputFile(fileroot+ tag + ".sabund", sabundFile); + if (!hclusterWanted) { //get distmatrix and overlap SparseMatrix* distMatrix = read->getDistMatrix(); @@ -148,14 +156,8 @@ int MGClusterCommand::execute(){ if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, distMatrix); } else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, distMatrix); } else if(method == "average"){ cluster = new AverageLinkage(rabund, list, distMatrix); } - tag = cluster->getTag(); cluster->setMapWanted(true); - //open output files - openOutputFile(fileroot+ tag + ".list", listFile); - openOutputFile(fileroot+ tag + ".rabund", rabundFile); - openOutputFile(fileroot+ tag + ".sabund", sabundFile); - //cluster using cluster classes while (distMatrix->getSmallDist() < cutoff && distMatrix->getNNodes() > 0){ @@ -218,24 +220,16 @@ int MGClusterCommand::execute(){ sortHclusterFiles(distFile, overlapFile); //create cluster - hcluster = new HCluster(rabund, list); + hcluster = new HCluster(rabund, list, method); hcluster->setMapWanted(true); - //open output files - openOutputFile(fileroot+ tag + ".list", listFile); - openOutputFile(fileroot+ tag + ".rabund", rabundFile); - openOutputFile(fileroot+ tag + ".sabund", sabundFile); - - vector seqs; seqs.resize(1); // to start loop - exitedBreak = false; //lets you know if there is a distance stored in next - + vector seqs; seqs.resize(1); // to start loop ifstream inHcluster; openInputFile(distFile, inHcluster); while (seqs.size() != 0){ - seqs = getSeqs(inHcluster); - random_shuffle(seqs.begin(), seqs.end()); + seqs = hcluster->getSeqs(inHcluster, nameMap, cutoff); for (int i = 0; i < seqs.size(); i++) { //-1 means skip me @@ -345,7 +339,7 @@ ListVector* MGClusterCommand::mergeOPFs(map binInfo, float dist){ while (!done) { //get next overlap - DistNode overlapNode; + seqDist overlapNode; if (!hclusterWanted) { if (count < overlapMatrix.size()) { //do we have another node in the matrix overlapNode = overlapMatrix[count]; @@ -415,89 +409,21 @@ ListVector* MGClusterCommand::mergeOPFs(map binInfo, float dist){ void MGClusterCommand::sortHclusterFiles(string unsortedDist, string unsortedOverlap) { try { //sort distFile - ReadCluster* readCluster = new ReadCluster(unsortedDist, cutoff); - readCluster->setFormat("column"); - readCluster->read(nameMap); - string sortedDistFile = readCluster->getOutputFile(); - ListVector* extralist = readCluster->getListVector(); //we get this so we can delete it and not have a memory leak - + string sortedDistFile = sortFile(unsortedDist); remove(unsortedDist.c_str()); //delete unsorted file distFile = sortedDistFile; - delete extralist; - delete readCluster; //sort overlap file - readCluster = new ReadCluster(unsortedOverlap, cutoff); - readCluster->setFormat("column"); - readCluster->read(nameMap); - string sortedOverlapFile = readCluster->getOutputFile(); - extralist = readCluster->getListVector(); //we get this so we can delete it and not have a memory leak - + string sortedOverlapFile = sortFile(unsortedOverlap); remove(unsortedOverlap.c_str()); //delete unsorted file overlapFile = sortedOverlapFile; - delete extralist; - delete readCluster; } catch(exception& e) { errorOut(e, "MGClusterCommand", "sortHclusterFiles"); exit(1); } } -//********************************************************************************************************************** -vector MGClusterCommand::getSeqs(ifstream& filehandle){ - try { - string firstName, secondName; - float distance, prevDistance; - vector sameSeqs; - prevDistance = -1; - - //if you are not at the beginning of the file - if (exitedBreak) { - sameSeqs.push_back(next); - prevDistance = next.dist; - exitedBreak = false; - } - - //get entry - while (!filehandle.eof()) { - - filehandle >> firstName >> secondName >> distance; gobble(filehandle); - - //save first one - if (prevDistance == -1) { prevDistance = distance; } - - map::iterator itA = nameMap->find(firstName); - map::iterator itB = nameMap->find(secondName); - if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); } - if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); } - - //using cutoff - if (distance > cutoff) { break; } - - if (distance != -1) { //-1 means skip me - - //are the distances the same - if (distance == prevDistance) { //save in vector - DistNode temp(itA->second, itB->second, distance); - sameSeqs.push_back(temp); - exitedBreak = false; - }else{ - next.seq1 = itA->second; - next.seq2 = itB->second; - next.dist = distance; - exitedBreak = true; - break; - } - } - } - - return sameSeqs; - } - catch(exception& e) { - errorOut(e, "MGClusterCommand", "getSeqs"); - exit(1); - } -} + //********************************************************************************************************************** diff --git a/mgclustercommand.h b/mgclustercommand.h index 9aa86d4..2ac80c9 100644 --- a/mgclustercommand.h +++ b/mgclustercommand.h @@ -36,20 +36,18 @@ private: HCluster* hcluster; ListVector* list; ListVector oldList; - vector overlapMatrix; - DistNode next; - + vector overlapMatrix; string blastfile, method, namefile, overlapFile, distFile; ofstream sabundFile, rabundFile, listFile; float cutoff, penalty; int precision, length, precisionLength; - bool abort, minWanted, hclusterWanted, exitedBreak, merge; + bool abort, minWanted, hclusterWanted, merge; void printData(ListVector*); ListVector* mergeOPFs(map, float); void sortHclusterFiles(string, string); - vector getSeqs(ifstream&); + vector getSeqs(ifstream&); }; diff --git a/mothur.h b/mothur.h index f839328..cf6ae69 100644 --- a/mothur.h +++ b/mothur.h @@ -100,7 +100,15 @@ struct clusterNode { int smallChild; //used to make linkTable work with list and rabund. represents bin number of this cluster node clusterNode(int num, int par, int kid) : numSeq(num), parent(par), smallChild(kid) {}; }; - +/************************************************************/ +struct seqDist { + int seq1; + int seq2; + float dist; + seqDist() {} + seqDist(int s1, int s2, float d) : seq1(s1), seq2(s2), dist(d) {} + ~seqDist() {} +}; /***********************************************************************/ // snagged from http://www.parashift.com/c++-faq-lite/misc-technical-issues.html#faq-39.2 @@ -775,6 +783,92 @@ inline bool anyLabelsToProcess(string label, set& userLabels, string err } } +/**************************************************************************************************/ +inline void appendFiles(string temp, string filename) { + try{ + ofstream output; + ifstream input; + + //open output file in append mode + openOutputFileAppend(filename, output); + openInputFile(temp, input); + + while(char c = input.get()){ + if(input.eof()) { break; } + else { output << c; } + } + + input.close(); + output.close(); + } + catch(exception& e) { + errorOut(e, "mothur", "appendFiles"); + exit(1); + } +} + +/**************************************************************************************************/ +inline string sortFile(string distFile){ + try { + string outfile = getRootName(distFile) + "sorted.dist"; + + //if you can, use the unix sort since its been optimized for years + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + string command = "sort -n -k +3 " + distFile + " -o " + outfile; + system(command.c_str()); + #else //you are stuck with my best attempt... + //windows sort does not have a way to specify a column, only a character in the line + //since we cannot assume that the distance will always be at the the same character location on each line + //due to variable sequence name lengths, I chose to force the distance into first position, then sort and then put it back. + + //read in file line by file and put distance first + string tempDistFile = distFile + ".temp"; + ifstream input; + ofstream output; + openInputFile(distFile, input); + openOutputFile(tempDistFile, output); + + string firstName, secondName; + float dist; + while (input) { + input >> firstName >> secondName >> dist; + output << dist << '\t' << firstName << '\t' << secondName << endl; + gobble(input); + } + input.close(); + output.close(); + + + //sort using windows sort + string tempOutfile = outfile + ".temp"; + string command = "sort " + tempDistFile + " /O " + tempOutfile; + system(command.c_str()); + + //read in sorted file and put distance at end again + ifstream input2; + openInputFile(tempOutfile, input2); + openOutputFile(outfile, output); + + while (input2) { + input2 >> dist >> firstName >> secondName; + output << firstName << '\t' << secondName << '\t' << dist << endl; + gobble(input2); + } + input2.close(); + output.close(); + + //remove temp files + remove(tempDistFile.c_str()); + remove(tempOutfile.c_str()); + #endif + + return outfile; + } + catch(exception& e) { + errorOut(e, "mothur", "sortFile"); + exit(1); + } +} /**************************************************************************************************/ #endif diff --git a/preclustercommand.cpp b/preclustercommand.cpp new file mode 100644 index 0000000..479a862 --- /dev/null +++ b/preclustercommand.cpp @@ -0,0 +1,269 @@ +/* + * preclustercommand.cpp + * Mothur + * + * Created by westcott on 12/21/09. + * Copyright 2009 Schloss Lab. All rights reserved. + * + */ + +#include "preclustercommand.h" + +//********************************************************************************************************************** +inline bool comparePriority(seqPNode first, seqPNode second) { return (first.numIdentical > second.numIdentical); } +//********************************************************************************************************************** + +PreClusterCommand::PreClusterCommand(string option){ + try { + abort = false; + + //allow user to run help + if(option == "help") { help(); abort = true; } + + else { + //valid paramters for this command + string Array[] = {"fasta", "name", "diffs"}; + vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); + + OptionParser parser(option); + map parameters = parser.getParameters(); + + ValidParameters validParameter; + + //check to make sure all parameters are valid for command + for (map::iterator it2 = parameters.begin(); it2 != parameters.end(); it2++) { + if (validParameter.isValidParameter(it2->first, myArray, it2->second) != true) { abort = true; } + } + + //check for required parameters + fastafile = validParameter.validFile(parameters, "fasta", true); + if (fastafile == "not found") { mothurOut("fasta is a required parameter for the pre.cluster command."); mothurOutEndLine(); abort = true; } + else if (fastafile == "not open") { abort = true; } + + //check for optional parameter and set defaults + // ...at some point should added some additional type checking... + namefile = validParameter.validFile(parameters, "name", true); + if (namefile == "not found") { namefile = ""; } + else if (namefile == "not open") { abort = true; } + else { readNameFile(); } + + string temp = validParameter.validFile(parameters, "diffs", false); if(temp == "not found"){ temp = "1"; } + convert(temp, diffs); + } + + } + catch(exception& e) { + errorOut(e, "PreClusterCommand", "PreClusterCommand"); + exit(1); + } +} + +//********************************************************************************************************************** +PreClusterCommand::~PreClusterCommand(){} +//********************************************************************************************************************** + +void PreClusterCommand::help(){ + try { + mothurOut("The pre.cluster command groups sequences that are within a given number of base mismatches.\n"); + mothurOut("The pre.cluster command outputs a new fasta and name file.\n"); + mothurOut("The pre.cluster command parameters are fasta, names and diffs. The fasta parameter is required. \n"); + mothurOut("The names parameter allows you to give a list of seqs that are identical. This file is 2 columns, first column is name or representative sequence, second column is a list of its identical sequences separated by commas.\n"); + mothurOut("The diffs parameter allows you to specify maximum number of mismatched bases allowed between sequences in a grouping. The default is 1.\n"); + mothurOut("The pre.cluster command should be in the following format: \n"); + mothurOut("pre.cluster(fasta=yourFastaFile, names=yourNamesFile, diffs=yourMaxDiffs) \n"); + mothurOut("Example pre.cluster(fasta=amazon.fasta, diffs=2).\n"); + mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n\n"); + } + catch(exception& e) { + errorOut(e, "PreClusterCommand", "help"); + exit(1); + } +} +//********************************************************************************************************************** + +int PreClusterCommand::execute(){ + try { + + if (abort == true) { return 0; } + + //reads fasta file and return number of seqs + int numSeqs = readSeqs(); //fills alignSeqs and makes all seqs active + + if (numSeqs == 0) { mothurOut("Error reading fasta file...please correct."); mothurOutEndLine(); return 0; } + if (diffs > length) { mothurOut("Error: diffs is greater than your sequence length."); mothurOutEndLine(); return 0; } + + //clear sizes since you only needed this info to build the alignSeqs seqPNode structs + sizes.clear(); + + //sort seqs by number of identical seqs + sort(alignSeqs.begin(), alignSeqs.end(), comparePriority); + + //go through active list and cluster everthing you can, until all nodes are inactive + //taking advantage of the fact that maps are already sorted + map::iterator itActive; + map::iterator it2Active; + int count = 0; + + for (int i = 0; i < alignSeqs.size(); i++) { + + //are you active + itActive = active.find(alignSeqs[i].seq.getName()); + + if (itActive != active.end()) { //this sequence has not been merged yet + + //try to merge it with all smaller seqs + for (int j = i; j < alignSeqs.size(); j++) { + + if (i != j) { + //are you active + it2Active = active.find(alignSeqs[j].seq.getName()); + if (it2Active != active.end()) { //this sequence has not been merged yet + //are you within "diff" bases + int mismatch = calcMisMatches(alignSeqs[i].seq.getAligned(), alignSeqs[j].seq.getAligned()); + + if (mismatch <= diffs) { + //merge + names[alignSeqs[i].seq.getName()] += "," + names[alignSeqs[j].seq.getName()]; + + //remove from active list + active.erase(it2Active); + + //set numIdentical to 0, so you only print the representative seqs in the fasta file + alignSeqs[j].numIdentical = 0; + count++; + } + }//end if j active + }//end if i != j + }//end for loop + + //remove from active list + active.erase(itActive); + }//end if active i + } + + string newFastaFile = getRootName(fastafile) + "precluster" + getExtension(fastafile); + string newNamesFile = getRootName(fastafile) + "precluster.names"; + + printData(newFastaFile, newNamesFile); + + mothurOut("Total number of sequences before precluster was " + toString(alignSeqs.size()) + "."); mothurOutEndLine(); + mothurOut("pre.cluster removed " + toString(count) + " sequences."); mothurOutEndLine(); + + return 0; + + } + catch(exception& e) { + errorOut(e, "PreClusterCommand", "execute"); + exit(1); + } +} +/**************************************************************************************************/ +int PreClusterCommand::readSeqs(){ + try { + ifstream inFasta; + openInputFile(fastafile, inFasta); + length = 0; + map::iterator it; + + while (!inFasta.eof()) { + Sequence temp(inFasta); //read seq + + if (temp.getName() != "") { + if (namefile != "") { + //make sure fasta and name files match + it = names.find(temp.getName()); + if (it == names.end()) { mothurOut(temp.getName() + " is not in your names file, please correct."); mothurOutEndLine(); exit(1); } + }else { sizes[temp.getName()] = 1; } + + seqPNode tempNode(sizes[temp.getName()], temp); + alignSeqs.push_back(tempNode); + active[temp.getName()] = true; + } + gobble(inFasta); + } + inFasta.close(); + + if (alignSeqs.size() != 0) { length = alignSeqs[0].seq.getAligned().length(); } + + return alignSeqs.size(); + } + catch(exception& e) { + errorOut(e, "PreClusterCommand", "readSeqs"); + exit(1); + } +} +/**************************************************************************************************/ +void PreClusterCommand::readNameFile(){ + try { + ifstream in; + openInputFile(namefile, in); + string firstCol, secondCol; + + while (!in.eof()) { + in >> firstCol >> secondCol; gobble(in); + names[firstCol] = secondCol; + int size = 1; + while (secondCol.find_first_of(',') != -1) { + size++; + secondCol = secondCol.substr(secondCol.find_first_of(',')+1, secondCol.length()); + } + sizes[firstCol] = size; + } + in.close(); + } + catch(exception& e) { + errorOut(e, "PreClusterCommand", "readNameFile"); + exit(1); + } +} +/**************************************************************************************************/ +int PreClusterCommand::calcMisMatches(string seq1, string seq2){ + try { + int numBad = 0; + + for (int i = 0; i < seq1.length(); i++) { + //do they match + if (seq1[i] != seq2[i]) { numBad++; } + if (numBad > diffs) { return length; } //to far to cluster + } + + return numBad; + } + catch(exception& e) { + errorOut(e, "PreClusterCommand", "calcMisMatches"); + exit(1); + } +} +/**************************************************************************************************/ +void PreClusterCommand::printData(string newfasta, string newname){ + try { + ofstream outFasta; + ofstream outNames; + openOutputFile(newfasta, outFasta); + openOutputFile(newname, outNames); + + map::iterator itNames; + + for (int i = 0; i < alignSeqs.size(); i++) { + if (alignSeqs[i].numIdentical != 0) { + alignSeqs[i].seq.printSequence(outFasta); + + itNames = names.find(alignSeqs[i].seq.getName()); + + outNames << itNames->first << '\t' << itNames->second << endl; + } + } + + outFasta.close(); + outNames.close(); + + } + catch(exception& e) { + errorOut(e, "PreClusterCommand", "printData"); + exit(1); + } +} + +/**************************************************************************************************/ + + diff --git a/preclustercommand.h b/preclustercommand.h new file mode 100644 index 0000000..bb666cd --- /dev/null +++ b/preclustercommand.h @@ -0,0 +1,59 @@ +#ifndef PRECLUSTERCOMMAND_H +#define PRECLUSTERCOMMAND_H + + +/* + * preclustercommand.h + * Mothur + * + * Created by westcott on 12/21/09. + * Copyright 2009 Schloss Lab. All rights reserved. + * + */ + + +#include "command.hpp" +#include "sequence.hpp" + +/************************************************************/ +struct seqPNode { + int numIdentical; + Sequence seq; + seqPNode() {} + seqPNode(int s, Sequence q) : numIdentical(s), seq(q) {} + ~seqPNode() {} +}; +/************************************************************/ + +class PreClusterCommand : public Command { + +public: + PreClusterCommand(string); + ~PreClusterCommand(); + int execute(); + void help(); + +private: + int diffs, length; + bool abort; + string fastafile, namefile; + vector alignSeqs; //maps the number of identical seqs to a sequence + map names; //represents the names file first column maps to second column + map sizes; //this map a seq name to the number of identical seqs in the names file + map active; //maps sequence name to whether it has already been merged or not. + + int readSeqs(); + int calcMisMatches(string, string); + void readNameFile(); + void printData(string, string); //fasta filename, names file name +}; + +/************************************************************/ + + + + + +#endif + + diff --git a/readblast.cpp b/readblast.cpp index 7a092f9..a6d23e9 100644 --- a/readblast.cpp +++ b/readblast.cpp @@ -12,7 +12,7 @@ //******************************************************************************************************************** //sorts lowest to highest -inline bool compareOverlap(DistNode left, DistNode right){ +inline bool compareOverlap(seqDist left, seqDist right){ return (left.dist < right.dist); } /*********************************************************************************************/ @@ -91,7 +91,7 @@ void ReadBlast::read(NameAssignment* nameMap) { //if there is a valid overlap, add it if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) { if (!hclusterWanted) { - DistNode overlapValue(itA->second, itB->second, thisoverlap); + seqDist overlapValue(itA->second, itB->second, thisoverlap); overlap.push_back(overlapValue); }else { outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl; @@ -137,7 +137,7 @@ void ReadBlast::read(NameAssignment* nameMap) { //if there is a valid overlap, add it if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) { if (!hclusterWanted) { - DistNode overlapValue(itA->second, itB->second, thisoverlap); + seqDist overlapValue(itA->second, itB->second, thisoverlap); //cout << "overlap = " << itA->second << '\t' << itB->second << '\t' << thisoverlap << endl; overlap.push_back(overlapValue); }else { @@ -201,7 +201,7 @@ void ReadBlast::read(NameAssignment* nameMap) { //if there is a valid overlap, add it if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) { if (!hclusterWanted) { - DistNode overlapValue(itA->second, itB->second, thisoverlap); + seqDist overlapValue(itA->second, itB->second, thisoverlap); overlap.push_back(overlapValue); }else { outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl; diff --git a/readblast.h b/readblast.h index 0b47b59..128c144 100644 --- a/readblast.h +++ b/readblast.h @@ -13,15 +13,6 @@ #include "sparsematrix.hpp" #include "nameassignment.hpp" -/****************************************************************************************/ -struct DistNode { - int seq1; - int seq2; - float dist; - DistNode(int s1, int s2, float d) : seq1(s1), seq2(s2), dist(d) {} - DistNode() {} - ~DistNode() {} -}; /****************************************************************************************/ //Note: this class creates a sparsematrix and list if the read is executed, but does not delete them on deconstruction. @@ -36,7 +27,7 @@ public: void read(NameAssignment*); SparseMatrix* getDistMatrix() { return matrix; } - vector getOverlapMatrix() { return overlap; } + vector getOverlapMatrix() { return overlap; } string getOverlapFile() { return overlapFile; } string getDistFile() { return distFile; } @@ -48,7 +39,7 @@ private: bool hclusterWanted; SparseMatrix* matrix; - vector overlap; + vector overlap; void readNames(NameAssignment*); }; diff --git a/readcluster.cpp b/readcluster.cpp index 233b8e1..11b8d01 100644 --- a/readcluster.cpp +++ b/readcluster.cpp @@ -25,7 +25,7 @@ void ReadCluster::read(NameAssignment* nameMap){ if (format == "phylip") { convertPhylip2Column(nameMap); } else { list = new ListVector(nameMap->getListVector()); } - createHClusterFile(); + OutPutFile = sortFile(distFile); } catch(exception& e) { @@ -35,71 +35,6 @@ void ReadCluster::read(NameAssignment* nameMap){ } /***********************************************************************/ -void ReadCluster::createHClusterFile(){ - try { - string outfile = getRootName(distFile) + "sorted.dist"; - - //if you can, use the unix sort since its been optimized for years - #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - string command = "sort -n -k +3 " + distFile + " -o " + outfile; - system(command.c_str()); - #else //you are stuck with my best attempt... - //windows sort does not have a way to specify a column, only a character in the line - //since we cannot assume that the distance will always be at the the same character location on each line - //due to variable sequence name lengths, I chose to force the distance into first position, then sort and then put it back. - - //read in file line by file and put distance first - string tempDistFile = distFile + ".temp"; - ifstream input; - ofstream output; - openInputFile(distFile, input); - openOutputFile(tempDistFile, output); - - string firstName, secondName; - float dist; - while (input) { - input >> firstName >> secondName >> dist; - output << dist << '\t' << firstName << '\t' << secondName << endl; - gobble(input); - } - input.close(); - output.close(); - - - //sort using windows sort - string tempOutfile = outfile + ".temp"; - string command = "sort " + tempDistFile + " /O " + tempOutfile; - system(command.c_str()); - - //read in sorted file and put distance at end again - ifstream input2; - openInputFile(tempOutfile, input2); - openOutputFile(outfile, output); - - while (input2) { - input2 >> dist >> firstName >> secondName; - output << firstName << '\t' << secondName << '\t' << dist << endl; - gobble(input2); - } - input2.close(); - output.close(); - - //remove temp files - remove(tempDistFile.c_str()); - remove(tempOutfile.c_str()); - #endif - - OutPutFile = outfile; - } - catch(exception& e) { - errorOut(e, "ReadCluster", "createHClusterFile"); - exit(1); - } -} - - -/***********************************************************************/ - void ReadCluster::convertPhylip2Column(NameAssignment* nameMap){ try { //convert phylip file to column file diff --git a/readcluster.h b/readcluster.h index 9e62e92..bcd910c 100644 --- a/readcluster.h +++ b/readcluster.h @@ -27,7 +27,6 @@ public: string getOutputFile() { return OutPutFile; } void setFormat(string f) { format = f; } ListVector* getListVector() { return list; } - //NameAssignment* getNameMap() { return nameMap; } private: GlobalData* globaldata; @@ -35,9 +34,7 @@ private: string OutPutFile, format; ListVector* list; float cutoff; - //NameAssignment* nameMap; - void createHClusterFile(); void convertPhylip2Column(NameAssignment*); }; -- 2.39.2