From: westcott Date: Mon, 21 Dec 2009 15:03:14 +0000 (+0000) Subject: fixed some bugs and added mgcluster command X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=c82900be3ceed3d9bc491bdc98b1819ef85c1af7 fixed some bugs and added mgcluster command --- diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index efe0beb..5ae6f33 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -162,6 +162,7 @@ A70B53AB0F4CD7AD0064797E /* getlabelcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70B53A60F4CD7AD0064797E /* getlabelcommand.cpp */; }; A70B53AC0F4CD7AD0064797E /* getlinecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70B53A80F4CD7AD0064797E /* getlinecommand.cpp */; }; A70DECD91063D8B40057C03C /* secondarystructurecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70DECD81063D8B40057C03C /* secondarystructurecommand.cpp */; }; + A727E84A10D14568001A8432 /* readblast.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A727E84910D14568001A8432 /* readblast.cpp */; }; A7283FF81056CAE100D0CC69 /* chimeracheckrdp.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7283FF71056CAE100D0CC69 /* chimeracheckrdp.cpp */; }; A729ACD010848E6100139801 /* hclustercommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A729ACCF10848E6100139801 /* hclustercommand.cpp */; }; A729ACE410849BBE00139801 /* hcluster.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A729ACE310849BBE00139801 /* hcluster.cpp */; }; @@ -178,6 +179,7 @@ A787844510A1EBDD0086103D /* knn.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A787844410A1EBDD0086103D /* knn.cpp */; }; A7B04493106CC3E60046FC83 /* chimeraslayer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7B04492106CC3E60046FC83 /* chimeraslayer.cpp */; }; A7B0450E106CEEC90046FC83 /* slayer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7B0450D106CEEC90046FC83 /* slayer.cpp */; }; + A7E0AAB610D2886D00CF407B /* mgclustercommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E0AAB510D2886D00CF407B /* mgclustercommand.cpp */; }; A7E4A783106913F900688F62 /* getsharedotucommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E4A782106913F900688F62 /* getsharedotucommand.cpp */; }; A7E4A824106A9AD700688F62 /* maligner.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E4A823106A9AD700688F62 /* maligner.cpp */; }; A7F5759710CEBC0600E20E47 /* libreadline.a in Frameworks */ = {isa = PBXBuildFile; fileRef = A7F5759610CEBC0600E20E47 /* libreadline.a */; }; @@ -532,6 +534,8 @@ A70B53A90F4CD7AD0064797E /* getlinecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getlinecommand.h; sourceTree = SOURCE_ROOT; }; A70DECD71063D8B40057C03C /* secondarystructurecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = secondarystructurecommand.h; sourceTree = SOURCE_ROOT; }; A70DECD81063D8B40057C03C /* secondarystructurecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = secondarystructurecommand.cpp; sourceTree = SOURCE_ROOT; }; + A727E84810D14568001A8432 /* readblast.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = readblast.h; sourceTree = SOURCE_ROOT; }; + A727E84910D14568001A8432 /* readblast.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = readblast.cpp; sourceTree = SOURCE_ROOT; }; A7283FF61056CAE100D0CC69 /* chimeracheckrdp.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimeracheckrdp.h; sourceTree = SOURCE_ROOT; }; A7283FF71056CAE100D0CC69 /* chimeracheckrdp.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimeracheckrdp.cpp; sourceTree = SOURCE_ROOT; }; A729ACCE10848E6100139801 /* hclustercommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = hclustercommand.h; sourceTree = SOURCE_ROOT; }; @@ -564,6 +568,8 @@ A7B04492106CC3E60046FC83 /* chimeraslayer.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimeraslayer.cpp; sourceTree = SOURCE_ROOT; }; A7B0450C106CEEC90046FC83 /* slayer.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = slayer.h; sourceTree = SOURCE_ROOT; }; A7B0450D106CEEC90046FC83 /* slayer.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = slayer.cpp; sourceTree = SOURCE_ROOT; }; + A7E0AAB410D2886D00CF407B /* mgclustercommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mgclustercommand.h; sourceTree = ""; }; + A7E0AAB510D2886D00CF407B /* mgclustercommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mgclustercommand.cpp; sourceTree = ""; }; A7E4A781106913F900688F62 /* getsharedotucommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getsharedotucommand.h; sourceTree = SOURCE_ROOT; }; A7E4A782106913F900688F62 /* getsharedotucommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getsharedotucommand.cpp; sourceTree = SOURCE_ROOT; }; A7E4A822106A9AD700688F62 /* maligner.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = maligner.h; sourceTree = SOURCE_ROOT; }; @@ -725,6 +731,8 @@ 3796441D0FB9B9650081FDB6 /* read */ = { isa = PBXGroup; children = ( + A727E84810D14568001A8432 /* readblast.h */, + A727E84910D14568001A8432 /* readblast.cpp */, A751ACE81098B283003D0911 /* readcluster.h */, A751ACE91098B283003D0911 /* readcluster.cpp */, 375AA1340F9E433D008EF9B8 /* readcolumn.h */, @@ -915,6 +923,8 @@ 21E859D70FC4632E005E1A48 /* matrixoutputcommand.cpp */, 7E5A17AD0FE57292003C6A03 /* mergefilecommand.h */, 7E5A17AE0FE57292003C6A03 /* mergefilecommand.cpp */, + A7E0AAB410D2886D00CF407B /* mgclustercommand.h */, + A7E0AAB510D2886D00CF407B /* mgclustercommand.cpp */, 375873F70F7D649C0040F377 /* nocommands.h */, 375873F60F7D649C0040F377 /* nocommands.cpp */, 3792946E0F2E191800B9034A /* parsimonycommand.h */, @@ -1287,6 +1297,8 @@ A787844510A1EBDD0086103D /* knn.cpp in Sources */, A773805F10B6D6FC002A6A61 /* taxonomyequalizer.cpp in Sources */, A773809010B6EFD1002A6A61 /* phylotypecommand.cpp in Sources */, + A727E84A10D14568001A8432 /* readblast.cpp in Sources */, + A7E0AAB610D2886D00CF407B /* mgclustercommand.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; diff --git a/aligncommand.cpp b/aligncommand.cpp index 55f0b82..96e1a78 100644 --- a/aligncommand.cpp +++ b/aligncommand.cpp @@ -209,22 +209,23 @@ int AlignCommand::execute(){ while(!inFASTA.eof()){ input = getline(inFASTA); if (input.length() != 0) { - if(input[0] == '>'){ int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); } + if(input[0] == '>'){ long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); } } } inFASTA.close(); numFastaSeqs = positions.size(); - + int numSeqsPerProcessor = numFastaSeqs / processors; for (int i = 0; i < processors; i++) { - int startPos = positions[ i * numSeqsPerProcessor ]; + long int startPos = positions[ i * numSeqsPerProcessor ]; if(i == processors - 1){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; } lines.push_back(new linePair(startPos, numSeqsPerProcessor)); } + createProcesses(alignFileName, reportFileName, accnosFileName); rename((alignFileName + toString(processIDS[0]) + ".temp").c_str(), alignFileName.c_str()); @@ -318,11 +319,11 @@ int AlignCommand::driver(linePair* line, string alignFName, string reportFName, for(int i=0;inumSeqs;i++){ - Sequence* candidateSeq = new Sequence(inFASTA); + Sequence* candidateSeq = new Sequence(inFASTA); gobble(inFASTA); int origNumBases = candidateSeq->getNumBases(); string originalUnaligned = candidateSeq->getUnaligned(); int numBasesNeeded = origNumBases * threshold; - + if (candidateSeq->getName() != "") { //incase there is a commented sequence at the end of a file if (candidateSeq->getUnaligned().length() > alignment->getnRows()) { alignment->resize(candidateSeq->getUnaligned().length()+1); diff --git a/aligncommand.h b/aligncommand.h index f7d59b0..ec0c80f 100644 --- a/aligncommand.h +++ b/aligncommand.h @@ -28,7 +28,7 @@ private: struct linePair { int start; int numSeqs; - linePair(int i, int j) : start(i), numSeqs(j) {} + linePair(long int i, int j) : start(i), numSeqs(j) {} }; vector processIDS; //processid vector lines; diff --git a/alignmentdb.cpp b/alignmentdb.cpp index e450952..52eedf8 100644 --- a/alignmentdb.cpp +++ b/alignmentdb.cpp @@ -30,7 +30,7 @@ AlignmentDB::AlignmentDB(string fastaFileName, string method, int kmerSize, floa if (temp.getName() != "") { templateSequences.push_back(temp); //save longest base - if (temp.getUnaligned().length() > longest) { longest = temp.getUnaligned().length(); } + if (temp.getUnaligned().length() > longest) { longest = temp.getUnaligned().length()+1; } } } diff --git a/cluster.cpp b/cluster.cpp index 6fd4631..429e19a 100644 --- a/cluster.cpp +++ b/cluster.cpp @@ -55,6 +55,7 @@ rabund(rav), list(lv), dMatrix(dm) seqVec[currentCell->row].push_back(currentCell); seqVec[currentCell->column].push_back(currentCell); } + mapWanted = false; //set to true by mgcluster to speed up overlap merge } /***********************************************************************/ @@ -78,7 +79,7 @@ void Cluster::getRowColCells() { } } - +/***********************************************************************/ // Remove the specified cell from the seqVec and from the sparse // matrix void Cluster::removeCell(const MatData& cell, int vrow, int vcol, bool rmMatrix) @@ -150,7 +151,8 @@ void Cluster::clusterBins(){ void Cluster::clusterNames(){ try { // cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << list->get(smallRow) << '\t' << list->get(smallCol); - + if (mapWanted) { updateMap(); } + list->set(smallCol, list->get(smallRow)+','+list->get(smallCol)); list->set(smallRow, ""); list->setLabel(toString(smallDist)); @@ -222,7 +224,57 @@ void Cluster::update(){ exit(1); } } +/***********************************************************************/ +void Cluster::setMapWanted(bool m) { + try { + mapWanted = m; + + //initialize map + for (int i = 0; i < list->getNumBins(); i++) { + + //parse bin + string names = list->get(i); + while (names.find_first_of(',') != -1) { + //get name from bin + string name = names.substr(0,names.find_first_of(',')); + //save name and bin number + seq2Bin[name] = i; + names = names.substr(names.find_first_of(',')+1, names.length()); + } + + //get last name + seq2Bin[names] = i; + } + + } + catch(exception& e) { + errorOut(e, "Cluster", "setMapWanted"); + exit(1); + } +} +/***********************************************************************/ +void Cluster::updateMap() { +try { + //update location of seqs in smallRow since they move to smallCol now + string names = list->get(smallRow); + while (names.find_first_of(',') != -1) { + //get name from bin + string name = names.substr(0,names.find_first_of(',')); + //save name and bin number + seq2Bin[name] = smallCol; + names = names.substr(names.find_first_of(',')+1, names.length()); + } + + //get last name + seq2Bin[names] = smallCol; + + } + catch(exception& e) { + errorOut(e, "Cluster", "updateMap"); + exit(1); + } +} +/***********************************************************************/ -/***********************************************************************/ diff --git a/cluster.hpp b/cluster.hpp index 99d8b7d..a6024e7 100644 --- a/cluster.hpp +++ b/cluster.hpp @@ -14,8 +14,10 @@ class Cluster { public: Cluster(RAbundVector*, ListVector*, SparseMatrix*); - virtual void update(); + virtual void update(); virtual string getTag() = 0; + virtual void setMapWanted(bool m); + virtual map getSeqtoBin() { return seq2Bin; } protected: void getRowColCells(); @@ -25,6 +27,7 @@ protected: virtual void clusterBins(); virtual void clusterNames(); + virtual void updateMap(); RAbundVector* rabund; ListVector* list; @@ -33,8 +36,10 @@ protected: int smallRow; int smallCol; float smallDist; + bool mapWanted; + map seq2Bin; - vector seqVec; // contains vectors of cells related to a certain sequence + vector seqVec; // contains vectors of cells related to a certain sequence MatVec rowCells; MatVec colCells; ull nRowCells; diff --git a/commandfactory.cpp b/commandfactory.cpp index c33de86..b361b66 100644 --- a/commandfactory.cpp +++ b/commandfactory.cpp @@ -60,6 +60,7 @@ #include "hclustercommand.h" #include "classifyseqscommand.h" #include "phylotypecommand.h" +#include "mgclustercommand.h" /*******************************************************/ @@ -129,6 +130,7 @@ CommandFactory::CommandFactory(){ commands["hcluster"] = "hcluster"; commands["classify.seqs"] = "classify.seqs"; commands["phylotype"] = "phylotype"; + commands["mgcluster"] = "mgcluster"; } /***********************************************************/ @@ -196,6 +198,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString){ else if(commandName == "hcluster") { command = new HClusterCommand(optionString); } else if(commandName == "classify.seqs") { command = new ClassifySeqsCommand(optionString); } else if(commandName == "phylotype") { command = new PhylotypeCommand(optionString); } + else if(commandName == "mgcluster") { command = new MGClusterCommand(optionString); } else { command = new NoCommand(optionString); } return command; diff --git a/engine.cpp b/engine.cpp index c6ade38..593c2c3 100644 --- a/engine.cpp +++ b/engine.cpp @@ -14,16 +14,6 @@ #include "engine.hpp" -/***********************************************************************/ -inline void terminateCommand(int dummy) { - - //mothurOut("Stopping command...."); - //CommandFactory* cFactory = CommandFactory::getInstance(); - //cFactory->getCommand(); //deletes old command and makes new no command. - //this may cause memory leak if old commands execute function allocated memory - //that is freed in the execute function and not the deconstructor - //mothurOut("DONE."); mothurOutEndLine(); -} /***********************************************************************/ Engine::Engine(){ try { diff --git a/engine.hpp b/engine.hpp index df4e08d..d3f156e 100644 --- a/engine.hpp +++ b/engine.hpp @@ -27,7 +27,6 @@ public: virtual bool getInput() = 0; virtual string getCommand(); vector getOptions() { return options; } - //virtual void terminateCommand(int); protected: vector options; CommandFactory* cFactory; diff --git a/fullmatrix.h b/fullmatrix.h index 890d07a..7cb8b15 100644 --- a/fullmatrix.h +++ b/fullmatrix.h @@ -35,6 +35,7 @@ public: int getNumGroups(); void printMatrix(ostream&); float get(int, int); + Names getRowInfo(int row) { return index[row]; } private: vector< vector > matrix; //a 2D distance matrix of all the sequences and their distances to eachother. diff --git a/getsharedotucommand.cpp b/getsharedotucommand.cpp index 742377f..8081f10 100644 --- a/getsharedotucommand.cpp +++ b/getsharedotucommand.cpp @@ -16,6 +16,7 @@ GetSharedOTUCommand::GetSharedOTUCommand(string option){ globaldata = GlobalData::getInstance(); abort = false; + unique = true; allLines = 1; labels.clear(); @@ -24,7 +25,7 @@ GetSharedOTUCommand::GetSharedOTUCommand(string option){ else { //valid paramters for this command - string Array[] = {"label","groups","fasta","list","group","output"}; + string Array[] = {"label","unique","shared","fasta","list","group","output"}; vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); OptionParser parser(option); @@ -61,11 +62,20 @@ GetSharedOTUCommand::GetSharedOTUCommand(string option){ output = validParameter.validFile(parameters, "output", false); if (output == "not found") { output = ""; } - groups = validParameter.validFile(parameters, "groups", false); + groups = validParameter.validFile(parameters, "unique", false); if (groups == "not found") { groups = ""; } else { splitAtDash(groups, Groups); globaldata->Groups = Groups; + + } + + groups = validParameter.validFile(parameters, "shared", false); + if (groups == "not found") { groups = ""; } + else { + splitAtDash(groups, Groups); + globaldata->Groups = Groups; + unique = false; } fastafile = validParameter.validFile(parameters, "fasta", true); @@ -84,9 +94,12 @@ GetSharedOTUCommand::GetSharedOTUCommand(string option){ void GetSharedOTUCommand::help(){ try { - mothurOut("The get.sharedseqs command parameters are list, group, label, groups, output and fasta. The list and group parameters are required.\n"); + mothurOut("The get.sharedseqs command parameters are list, group, label, unique, shared, output and fasta. The list and group parameters are required.\n"); mothurOut("The label parameter allows you to select what distance levels you would like output files for, and are separated by dashes.\n"); - mothurOut("The groups parameter allows you to select groups you would like to know the shared info for, and are separated by dashes.\n"); + mothurOut("The unique and shared parameters allow you to select groups you would like to know the shared info for, and are separated by dashes.\n"); + mothurOut("If you enter your groups under the unique parameter mothur will return the otus that contain ONLY sequences from those groups.\n"); + mothurOut("If you enter your groups under the shared parameter mothur will return the otus that contain sequences from those groups and may also contain sequences from other groups.\n"); + mothurOut("If you do not enter any groups then the get.sharedseqs command will return sequences that are unique to all groups in your group file.\n"); mothurOut("The fasta parameter allows you to input a fasta file and outputs a fasta file for each distance level containing only the sequences that are in OTUs shared by the groups specified.\n"); mothurOut("The output parameter allows you to output the list of names without the group and bin number added. \n"); mothurOut("With this option you can use the names file as an input in get.seqs and remove.seqs commands. To do this enter output=accnos. \n"); @@ -121,7 +134,7 @@ int GetSharedOTUCommand::execute(){ if (Groups.size() == 0) { Groups = groupMap->namesOfGroups; } - + //put groups in map to find easier for(int i = 0; i < Groups.size(); i++) { groupFinder[Groups[i]] = Groups[i]; @@ -238,7 +251,7 @@ void GetSharedOTUCommand::process(ListVector* shared) { //go through each bin, find out if shared for (int i = 0; i < shared->getNumBins(); i++) { - bool sharedByAll = true; + bool uniqueOTU = true; map atLeastOne; for (int m = 0; m < Groups.size(); m++) { @@ -248,48 +261,48 @@ void GetSharedOTUCommand::process(ListVector* shared) { vector namesOfSeqsInThisBin; string names = shared->get(i); - while ((names.find_first_of(',') != -1) && sharedByAll) { + while ((names.find_first_of(',') != -1)) { string name = names.substr(0,names.find_first_of(',')); names = names.substr(names.find_first_of(',')+1, names.length()); - + //find group string seqGroup = groupMap->getGroup(name); if (output != "accnos") { namesOfSeqsInThisBin.push_back((name + "\t" + seqGroup + "\t" + toString(i+1))); }else { namesOfSeqsInThisBin.push_back(name); } - + if (seqGroup == "not found") { mothurOut(name + " is not in your groupfile. Please correct."); mothurOutEndLine(); exit(1); } //is this seq in one of hte groups we care about it = groupFinder.find(seqGroup); - if (it == groupFinder.end()) { sharedByAll = false; } //you have a sequence from a group you don't want + if (it == groupFinder.end()) { uniqueOTU = false; } //you have a sequence from a group you don't want else { atLeastOne[seqGroup]++; } } //get last name - //find group - if (sharedByAll) { - string seqGroup = groupMap->getGroup(names); - if (output != "accnos") { - namesOfSeqsInThisBin.push_back((names + "\t" + seqGroup + "\t" + toString(i+1))); - }else { namesOfSeqsInThisBin.push_back(names); } - - if (seqGroup == "not found") { mothurOut(names + " is not in your groupfile. Please correct."); mothurOutEndLine(); exit(1); } + string seqGroup = groupMap->getGroup(names); + if (output != "accnos") { + namesOfSeqsInThisBin.push_back((names + "\t" + seqGroup + "\t" + toString(i+1))); + }else { namesOfSeqsInThisBin.push_back(names); } + + if (seqGroup == "not found") { mothurOut(names + " is not in your groupfile. Please correct."); mothurOutEndLine(); exit(1); } + + //is this seq in one of hte groups we care about + it = groupFinder.find(seqGroup); + if (it == groupFinder.end()) { uniqueOTU = false; } //you have a sequence from a group you don't want + else { atLeastOne[seqGroup]++; } - //is this seq in one of hte groups we care about - it = groupFinder.find(seqGroup); - if (it == groupFinder.end()) { sharedByAll = false; } //you have a sequence from a group you don't want - else { atLeastOne[seqGroup]++; } - } //make sure you have at least one seq from each group you want + bool sharedByAll = true; map::iterator it2; for (it2 = atLeastOne.begin(); it2 != atLeastOne.end(); it2++) { if (it2->second == 0) { sharedByAll = false; } } - //if shared, save names of seqs in that bin - if (sharedByAll) { + //if the user wants unique bins and this is unique then print + //or this the user wants shared bins and this bin is shared then print + if ((unique && uniqueOTU && sharedByAll) || (!unique && sharedByAll)) { wroteSomething = true; num++; @@ -309,9 +322,6 @@ void GetSharedOTUCommand::process(ListVector* shared) { } } } - - - } outNames.close(); diff --git a/getsharedotucommand.h b/getsharedotucommand.h index 6470961..3b14b39 100644 --- a/getsharedotucommand.h +++ b/getsharedotucommand.h @@ -34,7 +34,7 @@ class GetSharedOTUCommand : public Command { set labels; string fastafile, label, groups, listfile, groupfile, output; - bool abort, allLines; + bool abort, allLines, unique; vector Groups; map groupFinder; map::iterator it; diff --git a/hcluster.cpp b/hcluster.cpp index 7613ecb..2d88533 100644 --- a/hcluster.cpp +++ b/hcluster.cpp @@ -16,7 +16,7 @@ HCluster::HCluster(RAbundVector* rav, ListVector* lv) : rabund(rav), list(lv){ try { - + mapWanted = false; numSeqs = list->getNumSeqs(); //initialize cluster array @@ -56,7 +56,8 @@ void HCluster::clusterBins(){ void HCluster::clusterNames(){ try { ///cout << smallCol << '\t' << smallRow << '\t' << smallDist << '\t' << list->get(clusterArray[smallRow].smallChild) << '\t' << list->get(clusterArray[smallCol].smallChild); - + if (mapWanted) { updateMap(); } + list->set(clusterArray[smallCol].smallChild, list->get(clusterArray[smallRow].smallChild)+','+list->get(clusterArray[smallCol].smallChild)); list->set(clusterArray[smallRow].smallChild, ""); list->setLabel(toString(smallDist)); @@ -256,7 +257,7 @@ void HCluster::updateArrayandLinkTable() { } } /***********************************************************************/ -bool HCluster::update(int row, int col, float distance){ +void HCluster::update(int row, int col, float distance){ try { smallRow = row; @@ -284,15 +285,62 @@ bool HCluster::update(int row, int col, float distance){ } //printInfo(); - return clustered; } catch(exception& e) { errorOut(e, "HCluster", "update"); exit(1); } } - - /***********************************************************************/ +void HCluster::setMapWanted(bool m) { + try { + mapWanted = m; + + //initialize map + for (int i = 0; i < list->getNumBins(); i++) { + + //parse bin + string names = list->get(i); + while (names.find_first_of(',') != -1) { + //get name from bin + string name = names.substr(0,names.find_first_of(',')); + //save name and bin number + seq2Bin[name] = i; + names = names.substr(names.find_first_of(',')+1, names.length()); + } + + //get last name + seq2Bin[names] = i; + } + + } + catch(exception& e) { + errorOut(e, "HCluster", "setMapWanted"); + exit(1); + } +} +/***********************************************************************/ +void HCluster::updateMap() { +try { + //update location of seqs in smallRow since they move to smallCol now + string names = list->get(clusterArray[smallRow].smallChild); + while (names.find_first_of(',') != -1) { + //get name from bin + string name = names.substr(0,names.find_first_of(',')); + //save name and bin number + seq2Bin[name] = clusterArray[smallCol].smallChild; + names = names.substr(names.find_first_of(',')+1, names.length()); + } + + //get last name + seq2Bin[names] = clusterArray[smallCol].smallChild; + } + catch(exception& e) { + errorOut(e, "HCluster", "updateMap"); + exit(1); + } +} +/***********************************************************************/ + diff --git a/hcluster.h b/hcluster.h index 6b2862b..dd4c679 100644 --- a/hcluster.h +++ b/hcluster.h @@ -22,8 +22,9 @@ class HCluster { public: HCluster(RAbundVector*, ListVector*); ~HCluster(){}; - bool update(int, int, float); - //string getTag(); + void update(int, int, float); + void setMapWanted(bool m); + map getSeqtoBin() { return seq2Bin; } protected: void clusterBins(); @@ -32,6 +33,7 @@ protected: int makeActive(); void printInfo(); void updateArrayandLinkTable(); + void updateMap(); RAbundVector* rabund; ListVector* list; @@ -43,10 +45,11 @@ protected: map::iterator it2; int numSeqs; - int smallRow; int smallCol; float smallDist; + map seq2Bin; + bool mapWanted; }; diff --git a/hclustercommand.cpp b/hclustercommand.cpp index dbc0e1b..3c79f64 100644 --- a/hclustercommand.cpp +++ b/hclustercommand.cpp @@ -176,7 +176,6 @@ int HClusterCommand::execute(){ print_start = true; start = time(NULL); - ifstream in; openInputFile(distfile, in); @@ -184,7 +183,6 @@ int HClusterCommand::execute(){ float distance; cluster = new HCluster(rabund, list); - bool clusteredSomething; vector seqs; seqs.resize(1); // to start loop exitedBreak = false; //lets you know if there is a distance stored in next @@ -207,7 +205,7 @@ int HClusterCommand::execute(){ //cout << "before cluster update" << endl; if (seqs[i].seq1 != seqs[i].seq2) { - clusteredSomething = cluster->update(seqs[i].seq1, seqs[i].seq2, seqs[i].dist); + cluster->update(seqs[i].seq1, seqs[i].seq2, seqs[i].dist); float rndDist = roundDist(seqs[i].dist, precision); //cout << "after cluster update clusterSomething = " << clusteredSomething << " rndDist = " << rndDist << " rndPreviousDist = " << rndPreviousDist << endl; @@ -301,7 +299,6 @@ void HClusterCommand::printData(string label){ } //********************************************************************************************************************** - vector HClusterCommand::getSeqs(ifstream& filehandle){ try { string firstName, secondName; diff --git a/hclustercommand.h b/hclustercommand.h index 7351cef..1dd46b9 100644 --- a/hclustercommand.h +++ b/hclustercommand.h @@ -28,10 +28,6 @@ //University of Florida, Gainesville, FL 32610-3622 and 3Materials Technology Directorate, Air Force Technical //Applications Center, 1030 S. Highway A1A, Patrick AFB, FL 32925-3002, USA //Received January 28, 2009; Revised April 14, 2009; Accepted April 15, 2009 - -/******************************************************************/ - - /************************************************************/ struct seqDist { int seq1; diff --git a/libshuff.cpp b/libshuff.cpp index e6c83f7..cd4570e 100644 --- a/libshuff.cpp +++ b/libshuff.cpp @@ -64,7 +64,6 @@ vector > > Libshuff::getSavedMins(){ vector Libshuff::getMinX(int x){ try{ - vector minX(groupSizes[x], 0); for(int i=0;i 1 ? (i==0 ? matrix->get(groups[x][0], groups[x][1]) : matrix->get(groups[x][i], groups[x][0])) : 0.0); //get the first value in row i of this block diff --git a/libshuffcommand.cpp b/libshuffcommand.cpp index 2d84767..74acd84 100644 --- a/libshuffcommand.cpp +++ b/libshuffcommand.cpp @@ -22,7 +22,6 @@ LibShuffCommand::LibShuffCommand(string option){ try { - globaldata = GlobalData::getInstance(); abort = false; Groups.clear(); @@ -74,19 +73,19 @@ LibShuffCommand::LibShuffCommand(string option){ userform = validParameter.validFile(parameters, "form", false); if (userform == "not found") { userform = "integral"; } if (abort == false) { - + matrix = globaldata->gMatrix; //get the distance matrix setGroups(); //set the groups to be analyzed and sorts them - + /********************************************************************************************/ //this is needed because when we read the matrix we sort it into groups in alphabetical order //the rest of the command and the classes used in this command assume specific order /********************************************************************************************/ matrix->setGroups(globaldata->gGroupmap->namesOfGroups); vector sizes; - for (int i = 0; i < globaldata->gGroupmap->namesOfGroups.size(); i++) { sizes.push_back(globaldata->gGroupmap->getNumSeqs(globaldata->gGroupmap->namesOfGroups[i])); } + for (int i = 0; i < globaldata->gGroupmap->namesOfGroups.size(); i++) { sizes.push_back(globaldata->gGroupmap->getNumSeqs(globaldata->gGroupmap->namesOfGroups[i])); } matrix->setSizes(sizes); - + if(userform == "discrete"){ form = new DLibshuff(matrix, iters, step, cutOff); @@ -132,10 +131,10 @@ int LibShuffCommand::execute(){ try { if (abort == true) { return 0; } - + savedDXYValues = form->evaluateAll(); savedMinValues = form->getSavedMins(); - + pValueCounts.resize(numGroups); for(int i=0;i myArray (Array, Array+(sizeof(Array)/sizeof(string))); + + OptionParser parser(option); + map parameters = parser.getParameters(); + + ValidParameters validParameter; + + //check to make sure all parameters are valid for command + for (map::iterator it = parameters.begin(); it != parameters.end(); it++) { + if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; } + } + + //check for required parameters + blastfile = validParameter.validFile(parameters, "blast", true); + if (blastfile == "not open") { abort = true; } + else if (blastfile == "not found") { blastfile = ""; } + + namefile = validParameter.validFile(parameters, "name", true); + if (namefile == "not open") { abort = true; } + else if (namefile == "not found") { namefile = ""; } + + if ((blastfile == "")) { mothurOut("When executing a mgcluster command you must provide a blastfile."); mothurOutEndLine(); abort = true; } + + //check for optional parameter and set defaults + string temp; + temp = validParameter.validFile(parameters, "precision", false); if (temp == "not found") { temp = "100"; } + precisionLength = temp.length(); + convert(temp, precision); + + temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "0.70"; } + convert(temp, cutoff); + cutoff += (5 / (precision * 10.0)); + + method = validParameter.validFile(parameters, "method", false); + if (method == "not found") { method = "furthest"; } + + if ((method == "furthest") || (method == "nearest") || (method == "average")) { } + else { mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); mothurOutEndLine(); abort = true; } + + temp = validParameter.validFile(parameters, "length", false); if (temp == "not found") { temp = "5"; } + convert(temp, length); + + temp = validParameter.validFile(parameters, "penalty", false); if (temp == "not found") { temp = "0.10"; } + convert(temp, penalty); + + temp = validParameter.validFile(parameters, "min", false); if (temp == "not found") { temp = "true"; } + minWanted = isTrue(temp); + + temp = validParameter.validFile(parameters, "merge", false); if (temp == "not found") { temp = "true"; } + merge = isTrue(temp); + + temp = validParameter.validFile(parameters, "hcluster", false); if (temp == "not found") { temp = "false"; } + hclusterWanted = isTrue(temp); + } + + } + catch(exception& e) { + errorOut(e, "MGClusterCommand", "MGClusterCommand"); + exit(1); + } +} +//********************************************************************************************************************** + +void MGClusterCommand::help(){ + try { + mothurOut("The mgcluster command parameter options are blast, name, cutoff, precision, method, merge, min, length, penalty and hcluster. The blast parameter is required.\n"); + mothurOut("The mgcluster command reads a blast and name file and clusters the sequences into OPF units similiar to the OTUs.\n"); + mothurOut("This command outputs a .list, .rabund and .sabund file that can be used with mothur other commands to estimate richness.\n"); + mothurOut("The cutoff parameter is used to specify the maximum distance you would like to cluster to. The default is 0.70.\n"); + mothurOut("The precision parameter's default value is 100. \n"); + mothurOut("The acceptable mgcluster methods are furthest, nearest and average. If no method is provided then furthest is assumed.\n\n"); + mothurOut("The min parameter allows you to specify is you want the minimum or maximum blast score ratio used in calculating the distance. The default is true, meaning you want the minimum.\n"); + mothurOut("The length parameter is used to specify the minimum overlap required. The default is 5.\n"); + mothurOut("The penalty parameter is used to adjust the error rate. The default is 0.10.\n"); + mothurOut("The merge parameter allows you to shut off merging based on overlaps and just cluster. By default merge is true, meaning you want to merge.\n"); + mothurOut("The hcluster parameter allows you to use the hcluster algorithm when clustering. This may be neccessary if your file is too large to fit into RAM. The default is false.\n"); + mothurOut("The mgcluster command should be in the following format: \n"); + mothurOut("mgcluster(blast=yourBlastfile, name=yourNameFile, cutoff=yourCutOff).\n"); + mothurOut("Note: No spaces between parameter labels (i.e. balst), '=' and parameters (i.e.yourBlastfile).\n\n"); + } + catch(exception& e) { + errorOut(e, "MGClusterCommand", "help"); + exit(1); + } +} +//********************************************************************************************************************** +MGClusterCommand::~MGClusterCommand(){} +//********************************************************************************************************************** +int MGClusterCommand::execute(){ + try { + + if (abort == true) { return 0; } + + //read names file + if (namefile != "") { + nameMap = new NameAssignment(namefile); + nameMap->readMap(); + }else{ nameMap= new NameAssignment(); } + + string fileroot = getRootName(blastfile); + string tag = ""; + time_t start; + float previousDist = 0.00000; + float rndPreviousDist = 0.00000; + + //read blastfile - creates sparsematrices for the distances and overlaps as well as a listvector + //must remember to delete those objects here since readBlast does not + read = new ReadBlast(blastfile, cutoff, penalty, length, minWanted, hclusterWanted); + read->read(nameMap); + + list = new ListVector(nameMap->getListVector()); + RAbundVector* rabund = new RAbundVector(list->getRAbundVector()); + + start = time(NULL); + oldList = *list; + + if (!hclusterWanted) { + //get distmatrix and overlap + SparseMatrix* distMatrix = read->getDistMatrix(); + overlapMatrix = read->getOverlapMatrix(); //already sorted by read + delete read; + + //create cluster + if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, distMatrix); } + else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, distMatrix); } + else if(method == "average"){ cluster = new AverageLinkage(rabund, list, distMatrix); } + tag = cluster->getTag(); + cluster->setMapWanted(true); + + //open output files + openOutputFile(fileroot+ tag + ".list", listFile); + openOutputFile(fileroot+ tag + ".rabund", rabundFile); + openOutputFile(fileroot+ tag + ".sabund", sabundFile); + + //cluster using cluster classes + while (distMatrix->getSmallDist() < cutoff && distMatrix->getNNodes() > 0){ + + cluster->update(); + float dist = distMatrix->getSmallDist(); + float rndDist = roundDist(dist, precision); + + if(previousDist <= 0.0000 && dist != previousDist){ + oldList.setLabel("unique"); + printData(&oldList); + } + else if(rndDist != rndPreviousDist){ + if (merge) { + map seq2Bin = cluster->getSeqtoBin(); + ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist); + temp->setLabel(toString(rndPreviousDist, precisionLength-1)); + printData(temp); + delete temp; + }else{ + oldList.setLabel(toString(rndPreviousDist, precisionLength-1)); + printData(&oldList); + } + } + + previousDist = dist; + rndPreviousDist = rndDist; + oldList = *list; + } + + if(previousDist <= 0.0000){ + oldList.setLabel("unique"); + printData(&oldList); + } + else if(rndPreviousDist seq2Bin = cluster->getSeqtoBin(); + ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist); + temp->setLabel(toString(rndPreviousDist, precisionLength-1)); + printData(temp); + delete temp; + }else{ + oldList.setLabel(toString(rndPreviousDist, precisionLength-1)); + printData(&oldList); + } + } + + //free memory + overlapMatrix.clear(); + delete distMatrix; + delete cluster; + + }else { //use hcluster to cluster + tag = "fn"; + //get distmatrix and overlap + overlapFile = read->getOverlapFile(); + distFile = read->getDistFile(); + delete read; + + //sort the distance and overlap files + sortHclusterFiles(distFile, overlapFile); + + //create cluster + hcluster = new HCluster(rabund, list); + hcluster->setMapWanted(true); + + //open output files + openOutputFile(fileroot+ tag + ".list", listFile); + openOutputFile(fileroot+ tag + ".rabund", rabundFile); + openOutputFile(fileroot+ tag + ".sabund", sabundFile); + + vector seqs; seqs.resize(1); // to start loop + exitedBreak = false; //lets you know if there is a distance stored in next + + ifstream inHcluster; + openInputFile(distFile, inHcluster); + + while (seqs.size() != 0){ + + seqs = getSeqs(inHcluster); + random_shuffle(seqs.begin(), seqs.end()); + + for (int i = 0; i < seqs.size(); i++) { //-1 means skip me + + if (seqs[i].seq1 != seqs[i].seq2) { + + hcluster->update(seqs[i].seq1, seqs[i].seq2, seqs[i].dist); + + float rndDist = roundDist(seqs[i].dist, precision); + + if((previousDist <= 0.0000) && (seqs[i].dist != previousDist)){ + oldList.setLabel("unique"); + printData(&oldList); + } + else if((rndDist != rndPreviousDist)){ + if (merge) { + map seq2Bin = hcluster->getSeqtoBin(); + ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist); + temp->setLabel(toString(rndPreviousDist, precisionLength-1)); + printData(temp); + delete temp; + }else{ + oldList.setLabel(toString(rndPreviousDist, precisionLength-1)); + printData(&oldList); + } + } + + previousDist = seqs[i].dist; + rndPreviousDist = rndDist; + oldList = *list; + } + } + } + inHcluster.close(); + + if(previousDist <= 0.0000){ + oldList.setLabel("unique"); + printData(&oldList); + } + else if(rndPreviousDist seq2Bin = hcluster->getSeqtoBin(); + ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist); + temp->setLabel(toString(rndPreviousDist, precisionLength-1)); + printData(temp); + delete temp; + }else{ + oldList.setLabel(toString(rndPreviousDist, precisionLength-1)); + printData(&oldList); + } + } + + delete hcluster; + remove(distFile.c_str()); + remove(overlapFile.c_str()); + } + + delete list; + delete rabund; + listFile.close(); + sabundFile.close(); + rabundFile.close(); + + globaldata->setListFile(fileroot+ tag + ".list"); + globaldata->setFormat("list"); + + mothurOut("It took " + toString(time(NULL) - start) + " seconds to cluster."); mothurOutEndLine(); + + return 0; + } + catch(exception& e) { + errorOut(e, "MGClusterCommand", "execute"); + exit(1); + } +} +//********************************************************************************************************************** +void MGClusterCommand::printData(ListVector* mergedList){ + try { + mergedList->print(listFile); + mergedList->getRAbundVector().print(rabundFile); + + SAbundVector sabund = mergedList->getSAbundVector(); + + sabund.print(cout); + sabund.print(sabundFile); + } + catch(exception& e) { + errorOut(e, "MGClusterCommand", "printData"); + exit(1); + } +} +//********************************************************************************************************************** +//this merging is just at the reporting level, after this info is printed to the file it is gone and does not effect the datastructures +//that are used to cluster by distance. this is done so that the overlapping data does not have more influenece than the distance data. +ListVector* MGClusterCommand::mergeOPFs(map binInfo, float dist){ + try { + //create new listvector so you don't overwrite the clustering + ListVector* newList = new ListVector(oldList); + bool done = false; + ifstream inOverlap; + int count = 0; + + if (hclusterWanted) { + openInputFile(overlapFile, inOverlap); + if (inOverlap.eof()) { done = true; } + }else { if (overlapMatrix.size() == 0) { done = true; } } + + while (!done) { + + //get next overlap + DistNode overlapNode; + if (!hclusterWanted) { + if (count < overlapMatrix.size()) { //do we have another node in the matrix + overlapNode = overlapMatrix[count]; + count++; + }else { break; } + }else { + if (!inOverlap.eof()) { + string firstName, secondName; + float overlapDistance; + inOverlap >> firstName >> secondName >> overlapDistance; gobble(inOverlap); + + map::iterator itA = nameMap->find(firstName); + map::iterator itB = nameMap->find(secondName); + if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); } + if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); } + + overlapNode.seq1 = itA->second; + overlapNode.seq2 = itB->second; + overlapNode.dist = overlapDistance; + }else { inOverlap.close(); break; } + } + + if (overlapNode.dist < dist) { + //get names of seqs that overlap + string name1 = nameMap->get(overlapNode.seq1); + string name2 = nameMap->get(overlapNode.seq2); + + //use binInfo to find out if they are already in the same bin + int binKeep = binInfo[name1]; + int binRemove = binInfo[name2]; + + //if not merge bins and update binInfo + if(binKeep != binRemove) { + //save names in old bin + string names = list->get(binRemove); + + //merge bins into name1s bin + newList->set(binKeep, newList->get(binRemove)+','+newList->get(binKeep)); + newList->set(binRemove, ""); + + //update binInfo + while (names.find_first_of(',') != -1) { + //get name from bin + string name = names.substr(0,names.find_first_of(',')); + //save name and bin number + binInfo[name] = binKeep; + names = names.substr(names.find_first_of(',')+1, names.length()); + } + + //get last name + binInfo[names] = binKeep; + } + + }else { done = true; } + } + + //return listvector + return newList; + + } + catch(exception& e) { + errorOut(e, "MGClusterCommand", "mergeOPFs"); + exit(1); + } +} +//********************************************************************************************************************** +void MGClusterCommand::sortHclusterFiles(string unsortedDist, string unsortedOverlap) { + try { + //sort distFile + ReadCluster* readCluster = new ReadCluster(unsortedDist, cutoff); + readCluster->setFormat("column"); + readCluster->read(nameMap); + string sortedDistFile = readCluster->getOutputFile(); + ListVector* extralist = readCluster->getListVector(); //we get this so we can delete it and not have a memory leak + + remove(unsortedDist.c_str()); //delete unsorted file + distFile = sortedDistFile; + delete extralist; + delete readCluster; + + //sort overlap file + readCluster = new ReadCluster(unsortedOverlap, cutoff); + readCluster->setFormat("column"); + readCluster->read(nameMap); + string sortedOverlapFile = readCluster->getOutputFile(); + extralist = readCluster->getListVector(); //we get this so we can delete it and not have a memory leak + + remove(unsortedOverlap.c_str()); //delete unsorted file + overlapFile = sortedOverlapFile; + delete extralist; + delete readCluster; + } + catch(exception& e) { + errorOut(e, "MGClusterCommand", "sortHclusterFiles"); + exit(1); + } +} +//********************************************************************************************************************** +vector MGClusterCommand::getSeqs(ifstream& filehandle){ + try { + string firstName, secondName; + float distance, prevDistance; + vector sameSeqs; + prevDistance = -1; + + //if you are not at the beginning of the file + if (exitedBreak) { + sameSeqs.push_back(next); + prevDistance = next.dist; + exitedBreak = false; + } + + //get entry + while (!filehandle.eof()) { + + filehandle >> firstName >> secondName >> distance; gobble(filehandle); + + //save first one + if (prevDistance == -1) { prevDistance = distance; } + + map::iterator itA = nameMap->find(firstName); + map::iterator itB = nameMap->find(secondName); + if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); } + if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); } + + //using cutoff + if (distance > cutoff) { break; } + + if (distance != -1) { //-1 means skip me + + //are the distances the same + if (distance == prevDistance) { //save in vector + DistNode temp(itA->second, itB->second, distance); + sameSeqs.push_back(temp); + exitedBreak = false; + }else{ + next.seq1 = itA->second; + next.seq2 = itB->second; + next.dist = distance; + exitedBreak = true; + break; + } + } + } + + return sameSeqs; + } + catch(exception& e) { + errorOut(e, "MGClusterCommand", "getSeqs"); + exit(1); + } +} +//********************************************************************************************************************** + + + + + diff --git a/mgclustercommand.h b/mgclustercommand.h new file mode 100644 index 0000000..9aa86d4 --- /dev/null +++ b/mgclustercommand.h @@ -0,0 +1,61 @@ +#ifndef MGCLUSTERCOMMAND_H +#define MGCLUSTERCOMMAND_H + +/* + * mgclustercommand.h + * Mothur + * + * Created by westcott on 12/11/09. + * Copyright 2009 Schloss Lab. All rights reserved. + * + */ + +#include "command.hpp" +#include "readblast.h" +#include "sparsematrix.hpp" +#include "nameassignment.hpp" +#include "globaldata.hpp" +#include "cluster.hpp" +#include "hcluster.h" + +/**********************************************************************/ + +class MGClusterCommand : public Command { + +public: + MGClusterCommand(string); + ~MGClusterCommand(); + int execute(); + void help(); + +private: + GlobalData* globaldata; + ReadBlast* read; + NameAssignment* nameMap; + Cluster* cluster; + HCluster* hcluster; + ListVector* list; + ListVector oldList; + vector overlapMatrix; + DistNode next; + + + string blastfile, method, namefile, overlapFile, distFile; + ofstream sabundFile, rabundFile, listFile; + float cutoff, penalty; + int precision, length, precisionLength; + bool abort, minWanted, hclusterWanted, exitedBreak, merge; + + void printData(ListVector*); + ListVector* mergeOPFs(map, float); + void sortHclusterFiles(string, string); + vector getSeqs(ifstream&); + +}; + +/**********************************************************************/ + +#endif + + + diff --git a/nameassignment.cpp b/nameassignment.cpp index 9256086..d098a48 100644 --- a/nameassignment.cpp +++ b/nameassignment.cpp @@ -28,6 +28,7 @@ void NameAssignment::readMap(){ // data[firstCol] = secondCol; //store data in map list.push_back(secondCol); //adds data's value to list + reverse[rowIndex] = firstCol; (*this)[firstCol] = rowIndex++; gobble(fileHandle); } @@ -45,6 +46,7 @@ void NameAssignment::push_back(string name) { int num = (*this).size(); (*this)[name] = num; + reverse[num] = name; list.push_back(name); } @@ -64,11 +66,12 @@ ListVector NameAssignment::getListVector(void){ //********************************************************************************************************************** -void NameAssignment::print(void){ +void NameAssignment::print(ostream& out){ try { map::iterator it; +cout << (*this).size() << endl; for(it = (*this).begin(); it!=(*this).end(); it++){ - mothurOut(it->first + "\t" + toString(it->second)); mothurOutEndLine(); //prints out keys and values of the map this. + out << it->first << '\t' << it->second << endl; //prints out keys and values of the map this. } } catch(exception& e) { @@ -84,6 +87,12 @@ int NameAssignment::get(string key){ return (*this)[key]; } +//********************************************************************************************************************** + +string NameAssignment::get(int key){ + + return reverse[key]; +} //********************************************************************************************************************** diff --git a/nameassignment.hpp b/nameassignment.hpp index 2470a84..3ccb425 100644 --- a/nameassignment.hpp +++ b/nameassignment.hpp @@ -11,11 +11,13 @@ public: void readMap(); ListVector getListVector(); int get(string); - void print(); + string get(int); + void print(ostream&); void push_back(string); private: ifstream fileHandle; ListVector list; + map reverse; }; diff --git a/readblast.cpp b/readblast.cpp new file mode 100644 index 0000000..7a092f9 --- /dev/null +++ b/readblast.cpp @@ -0,0 +1,321 @@ +/* + * readblast.cpp + * Mothur + * + * Created by westcott on 12/10/09. + * Copyright 2009 Schloss Lab. All rights reserved. + * + */ + +#include "readblast.h" +#include "progress.hpp" + +//******************************************************************************************************************** +//sorts lowest to highest +inline bool compareOverlap(DistNode left, DistNode right){ + return (left.dist < right.dist); +} +/*********************************************************************************************/ +ReadBlast::ReadBlast(string file, float c, float p, int l, bool m, bool h) : blastfile(file), cutoff(c), penalty(p), length(l), minWanted(m), hclusterWanted(h) { + try { + matrix = NULL; + } + catch(exception& e) { + errorOut(e, "ReadBlast", "ReadBlast"); + exit(1); + } +} +/*********************************************************************************************/ +//assumptions about the blast file: +//1. if duplicate lines occur the first line is always best and is chosen +//2. blast scores are grouped together, ie. a a .... score, a b .... score, a c ....score... +void ReadBlast::read(NameAssignment* nameMap) { + try { + + //if the user has not given a names file read names from blastfile + if (nameMap->size() == 0) { readNames(nameMap); } + int nseqs = nameMap->size(); + + ifstream fileHandle; + openInputFile(blastfile, fileHandle); + + string firstName, secondName, eScore, currentRow; + string repeatName = ""; + int count = 1; + float distance, thisoverlap, refScore; + float percentId; + float numBases, mismatch, gap, startQuery, endQuery, startRef, endRef, score, lengthThisSeq; + + ofstream outDist; + ofstream outOverlap; + + //create objects needed for read + if (!hclusterWanted) { + matrix = new SparseMatrix(); + }else{ + overlapFile = getRootName(blastfile) + "overlap.dist"; + distFile = getRootName(blastfile) + "hclusterDists.dist"; + + openOutputFile(overlapFile, outOverlap); + openOutputFile(distFile, outDist); + } + + Progress* reading = new Progress("Reading blast: ", nseqs * nseqs); + + //this is used to quickly find if we already have a distance for this combo + vector< map > dists; dists.resize(nseqs); //dists[0][1] = distance from seq0 to seq1 + map thisRowsBlastScores; + + if (!fileHandle.eof()) { + //read in line from file + fileHandle >> firstName >> secondName >> percentId >> numBases >> mismatch >> gap >> startQuery >> endQuery >> startRef >> endRef >> eScore >> score; + gobble(fileHandle); + + currentRow = firstName; + lengthThisSeq = numBases; + repeatName = firstName + secondName; + + if (firstName == secondName) { refScore = score; } + else{ + //convert name to number + map::iterator itA = nameMap->find(firstName); + map::iterator itB = nameMap->find(secondName); + if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); } + if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); } + + thisRowsBlastScores[itB->second] = score; + + //calc overlap score + thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty); + + //if there is a valid overlap, add it + if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) { + if (!hclusterWanted) { + DistNode overlapValue(itA->second, itB->second, thisoverlap); + overlap.push_back(overlapValue); + }else { + outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl; + } + } + } + }else { mothurOut("Error in your blast file, cannot read."); mothurOutEndLine(); exit(1); } + + + //read file + while(!fileHandle.eof()){ + + //read in line from file + fileHandle >> firstName >> secondName >> percentId >> numBases >> mismatch >> gap >> startQuery >> endQuery >> startRef >> endRef >> eScore >> score; + //cout << firstName << '\t' << secondName << '\t' << percentId << '\t' << numBases << '\t' << mismatch << '\t' << gap << '\t' << startQuery << '\t' << endQuery << '\t' << startRef << '\t' << endRef << '\t' << eScore << '\t' << score << endl; + gobble(fileHandle); + + string temp = firstName + secondName; //to check if this file has repeat lines, ie. is this a blast instead of a blscreen file + + //if this is a new pairing + if (temp != repeatName) { + repeatName = temp; + + if (currentRow == firstName) { + //cout << "first = " << firstName << " second = " << secondName << endl; + if (firstName == secondName) { + refScore = score; + reading->update((count + nseqs)); + count++; + }else{ + //convert name to number + map::iterator itA = nameMap->find(firstName); + map::iterator itB = nameMap->find(secondName); + if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); } + if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); } + + //save score + thisRowsBlastScores[itB->second] = score; + + //calc overlap score + thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty); + + //if there is a valid overlap, add it + if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) { + if (!hclusterWanted) { + DistNode overlapValue(itA->second, itB->second, thisoverlap); + //cout << "overlap = " << itA->second << '\t' << itB->second << '\t' << thisoverlap << endl; + overlap.push_back(overlapValue); + }else { + outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl; + } + } + } //end else + }else { //end row + //convert blast scores to distance and add cell to sparse matrix if we can + map::iterator it; + map::iterator itDist; + for(it=thisRowsBlastScores.begin(); it!=thisRowsBlastScores.end(); it++) { + distance = 1.0 - (it->second / refScore); + + //do we already have the distance calculated for b->a + map::iterator itA = nameMap->find(currentRow); + itDist = dists[it->first].find(itA->second); + + //if we have it then compare + if (itDist != dists[it->first].end()) { + //if you want the minimum blast score ratio, then pick max distance + if(minWanted) { distance = max(itDist->second, distance); } + else{ distance = min(itDist->second, distance); } + + //is this distance below cutoff + if (distance < cutoff) { + if (!hclusterWanted) { + PCell value(itA->second, it->first, distance); + matrix->addCell(value); + }else{ + outDist << itA->first << '\t' << nameMap->get(it->first) << '\t' << distance << endl; + } + } + //not going to need this again + dists[it->first].erase(itDist); + }else { //save this value until we get the other ratio + dists[itA->second][it->first] = distance; + } + } + //clear out last rows info + thisRowsBlastScores.clear(); + + currentRow = firstName; + lengthThisSeq = numBases; + + //add this row to thisRowsBlastScores + if (firstName == secondName) { refScore = score; } + else{ //add this row to thisRowsBlastScores + + //convert name to number + map::iterator itA = nameMap->find(firstName); + map::iterator itB = nameMap->find(secondName); + if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); } + if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); } + + thisRowsBlastScores[itB->second] = score; + + //calc overlap score + thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty); + + //if there is a valid overlap, add it + if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) { + if (!hclusterWanted) { + DistNode overlapValue(itA->second, itB->second, thisoverlap); + overlap.push_back(overlapValue); + }else { + outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl; + } + } + } + }//end if current row + }//end if repeat + }//end while + + //get last rows info stored + //convert blast scores to distance and add cell to sparse matrix if we can + map::iterator it; + map::iterator itDist; + for(it=thisRowsBlastScores.begin(); it!=thisRowsBlastScores.end(); it++) { + distance = 1.0 - (it->second / refScore); + + //do we already have the distance calculated for b->a + map::iterator itA = nameMap->find(currentRow); + itDist = dists[it->first].find(itA->second); + + //if we have it then compare + if (itDist != dists[it->first].end()) { + //if you want the minimum blast score ratio, then pick max distance + if(minWanted) { distance = max(itDist->second, distance); } + else{ distance = min(itDist->second, distance); } + + //is this distance below cutoff + if (distance < cutoff) { + if (!hclusterWanted) { + PCell value(itA->second, it->first, distance); + matrix->addCell(value); + }else{ + outDist << itA->first << '\t' << nameMap->get(it->first) << '\t' << distance << endl; + } + } + //not going to need this again + dists[it->first].erase(itDist); + }else { //save this value until we get the other ratio + dists[itA->second][it->first] = distance; + } + } + //clear out info + thisRowsBlastScores.clear(); + dists.clear(); + + if (!hclusterWanted) { + sort(overlap.begin(), overlap.end(), compareOverlap); + }else { + outDist.close(); + outOverlap.close(); + } + + reading->finish(); + delete reading; + fileHandle.close(); + } + catch(exception& e) { + errorOut(e, "ReadBlast", "read"); + exit(1); + } +} +/*********************************************************************************************/ +void ReadBlast::readNames(NameAssignment* nameMap) { + try { + mothurOut("Reading names... "); cout.flush(); + + string name, hold, prevName; + int num = 1; + + ifstream in; + openInputFile(blastfile, in); + + //read first line + in >> prevName; + for (int i = 0; i < 11; i++) { in >> hold; } + gobble(in); + + //save name in nameMap + nameMap->push_back(prevName); + + while (!in.eof()) { + + //read line + in >> name; + for (int i = 0; i < 11; i++) { in >> hold; } + gobble(in); + + //is this a new name? + if (name != prevName) { + prevName = name; + nameMap->push_back(name); + num++; + } + } + + in.close(); + + //write out names file + //string outNames = getRootName(blastfile) + "names"; + //ofstream out; + //openOutputFile(outNames, out); + //nameMap->print(out); + //out.close(); + + mothurOut(toString(num) + " names read."); mothurOutEndLine(); + + } + catch(exception& e) { + errorOut(e, "ReadBlast", "readNames"); + exit(1); + } +} +/*********************************************************************************************/ + + + diff --git a/readblast.h b/readblast.h new file mode 100644 index 0000000..0b47b59 --- /dev/null +++ b/readblast.h @@ -0,0 +1,59 @@ +#ifndef READBLAST_H +#define READBLAST_H +/* + * readblast.h + * Mothur + * + * Created by westcott on 12/10/09. + * Copyright 2009 Schloss Lab. All rights reserved. + * + */ + +#include "mothur.h" +#include "sparsematrix.hpp" +#include "nameassignment.hpp" + +/****************************************************************************************/ +struct DistNode { + int seq1; + int seq2; + float dist; + DistNode(int s1, int s2, float d) : seq1(s1), seq2(s2), dist(d) {} + DistNode() {} + ~DistNode() {} +}; +/****************************************************************************************/ + +//Note: this class creates a sparsematrix and list if the read is executed, but does not delete them on deconstruction. +//the user of this object is responsible for deleting the matrix and list if they call the read or there will be a memory leak +//it is done this way so the read can be deleted and the information still used. + +class ReadBlast { + +public: + ReadBlast(string, float, float, int, bool, bool); //blastfile, cutoff, penalty, length of overlap, min or max bsr, hclusterWanted + ~ReadBlast() {} + + void read(NameAssignment*); + SparseMatrix* getDistMatrix() { return matrix; } + vector getOverlapMatrix() { return overlap; } + string getOverlapFile() { return overlapFile; } + string getDistFile() { return distFile; } + +private: + string blastfile, overlapFile, distFile; + int length; //number of amino acids overlapped + float penalty, cutoff; //penalty is used to adjust error rate + bool minWanted; //if true choose min bsr, if false choose max bsr + bool hclusterWanted; + + SparseMatrix* matrix; + vector overlap; + + void readNames(NameAssignment*); +}; + +/*******************************************************************************************/ + +#endif + diff --git a/readcolumn.cpp b/readcolumn.cpp index fccc2cd..6b8892b 100644 --- a/readcolumn.cpp +++ b/readcolumn.cpp @@ -45,10 +45,10 @@ void ReadColumnMatrix::read(NameAssignment* nameMap){ map::iterator itB = nameMap->find(secondName); if(itA == nameMap->end()){ - cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; + cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); } if(itB == nameMap->end()){ - cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; + cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); } if (distance == -1) { distance = 1000000; } diff --git a/readdistcommand.cpp b/readdistcommand.cpp index f6b4195..f6cf5db 100644 --- a/readdistcommand.cpp +++ b/readdistcommand.cpp @@ -164,6 +164,34 @@ int ReadDistCommand::execute(){ openInputFile(distFileName, in); matrix = new FullMatrix(in); //reads the matrix file in.close(); + + //if files don't match... + if (matrix->getNumSeqs() < groupMap->getNumSeqs()) { + mothurOut("Your distance file contains " + toString(matrix->getNumSeqs()) + " sequences, and your group file contains " + toString(groupMap->getNumSeqs()) + " sequences."); mothurOutEndLine(); + //create new group file + string newGroupFile = getRootName(groupfile) + "editted.groups"; + ofstream outGroups; + openOutputFile(newGroupFile, outGroups); + + for (int i = 0; i < matrix->getNumSeqs(); i++) { + Names temp = matrix->getRowInfo(i); + outGroups << temp.seqName << '\t' << temp.groupName << endl; + } + outGroups.close(); + + mothurOut(newGroupFile + " is a new group file containing only the sequence that are in your distance file. I will read this file instead."); mothurOutEndLine(); + + //read new groupfile + delete groupMap; groupMap = NULL; + groupfile = newGroupFile; + globaldata->setGroupFile(groupfile); + + groupMap = new GroupMap(groupfile); + groupMap->readMap(); + + globaldata->gGroupmap = groupMap; + } + //memory leak prevention if (globaldata->gMatrix != NULL) { delete globaldata->gMatrix; } globaldata->gMatrix = matrix; //save matrix for coverage commands diff --git a/slibshuff.cpp b/slibshuff.cpp index 219c709..1f2d3df 100644 --- a/slibshuff.cpp +++ b/slibshuff.cpp @@ -24,7 +24,6 @@ float SLibshuff::evaluatePair(int i, int j){ vector > SLibshuff::evaluateAll(){ try{ savedMins.resize(numGroups); - vector > dCXYValues(numGroups); for(int i=0;i 0 || maxLength > 0){ success = cullLength(currSeq); - if ((currSeq.getUnaligned().length() > 300) && (success)) { cout << "too long " << currSeq.getUnaligned().length() << endl; } if(!success){ trashCode += 'l'; } } if(maxHomoP > 0){