From c3f0a9c8f932b923f3a6fbbf143e8f4b85fd6f5f Mon Sep 17 00:00:00 2001 From: westcott Date: Wed, 27 Jan 2010 16:04:47 +0000 Subject: [PATCH] added distance search method to classify.seqs --- Mothur.xcodeproj/project.pbxproj | 6 ++ classify.cpp | 2 + classifyseqscommand.cpp | 5 +- classifyseqscommand.h | 2 +- distancedb.cpp | 126 +++++++++++++++++++++---------- distancedb.hpp | 30 ++++---- mothur.cpp | 12 +-- mothur.h | 4 +- 8 files changed, 123 insertions(+), 64 deletions(-) diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index b0db018..e697db8 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -179,6 +179,7 @@ A787821110A0AAE70086103D /* bayesian.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A787821010A0AAE70086103D /* bayesian.cpp */; }; A78782AB10A1B1CB0086103D /* alignmentdb.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A78782AA10A1B1CB0086103D /* alignmentdb.cpp */; }; A787844510A1EBDD0086103D /* knn.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A787844410A1EBDD0086103D /* knn.cpp */; }; + A794201111107897003AECCD /* distancedb.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A794200F11107897003AECCD /* distancedb.cpp */; }; A7B04493106CC3E60046FC83 /* chimeraslayer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7B04492106CC3E60046FC83 /* chimeraslayer.cpp */; }; A7B0450E106CEEC90046FC83 /* slayer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7B0450D106CEEC90046FC83 /* slayer.cpp */; }; A7D176CF10F2558500159497 /* pcacommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7D176CE10F2558500159497 /* pcacommand.cpp */; }; @@ -573,6 +574,8 @@ A78782AA10A1B1CB0086103D /* alignmentdb.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = alignmentdb.cpp; sourceTree = SOURCE_ROOT; }; A787844310A1EBDD0086103D /* knn.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = knn.h; sourceTree = SOURCE_ROOT; }; A787844410A1EBDD0086103D /* knn.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = knn.cpp; sourceTree = SOURCE_ROOT; }; + A794200F11107897003AECCD /* distancedb.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = distancedb.cpp; sourceTree = ""; }; + A794201011107897003AECCD /* distancedb.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = distancedb.hpp; sourceTree = ""; }; A7B04491106CC3E60046FC83 /* chimeraslayer.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimeraslayer.h; sourceTree = SOURCE_ROOT; }; A7B04492106CC3E60046FC83 /* chimeraslayer.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimeraslayer.cpp; sourceTree = SOURCE_ROOT; }; A7B0450C106CEEC90046FC83 /* slayer.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = slayer.h; sourceTree = SOURCE_ROOT; }; @@ -1016,6 +1019,8 @@ 37D927D40F21331F001D4494 /* database.hpp */, 37D927D30F21331F001D4494 /* database.cpp */, 37D927D50F21331F001D4494 /* datavector.hpp */, + A794201011107897003AECCD /* distancedb.hpp */, + A794200F11107897003AECCD /* distancedb.cpp */, 37D927DC0F21331F001D4494 /* fastamap.h */, 37D927DB0F21331F001D4494 /* fastamap.cpp */, 375873EA0F7D64520040F377 /* fullmatrix.h */, @@ -1331,6 +1336,7 @@ A7D86C8C10FE09FE00865661 /* formatphylip.cpp in Sources */, A704E8A31106045D00870157 /* otuhierarchycommand.cpp in Sources */, A70B00C8110885EB002F453A /* setdircommand.cpp in Sources */, + A794201111107897003AECCD /* distancedb.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; diff --git a/classify.cpp b/classify.cpp index c0f8a9c..346c764 100644 --- a/classify.cpp +++ b/classify.cpp @@ -12,6 +12,7 @@ #include "kmerdb.hpp" #include "suffixdb.hpp" #include "blastdb.hpp" +#include "distancedb.hpp" /**************************************************************************************************/ Classify::Classify(string tfile, string tempFile, string method, int kmerSize, float gapOpen, float gapExtend, float match, float misMatch) : taxFile(tfile), templateFile(tempFile) { @@ -42,6 +43,7 @@ Classify::Classify(string tfile, string tempFile, string method, int kmerSize, f } else if(method == "suffix") { database = new SuffixDB(numSeqs); } else if(method == "blast") { database = new BlastDB(gapOpen, gapExtend, match, misMatch); } + else if(method == "distance") { database = new DistanceDB(); } else { mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8."); mothurOutEndLine(); diff --git a/classifyseqscommand.cpp b/classifyseqscommand.cpp index 0d9e797..e6eb560 100644 --- a/classifyseqscommand.cpp +++ b/classifyseqscommand.cpp @@ -204,9 +204,10 @@ ClassifySeqsCommand::~ClassifySeqsCommand(){ void ClassifySeqsCommand::help(){ try { mothurOut("The classify.seqs command reads a fasta file containing sequences and creates a .taxonomy file and a .tax.summary file.\n"); - mothurOut("The classify.seqs command parameters are template, fasta, search, ksize, method, taxonomy, processors, match, mismatch, gapopen, gapextend, numwanted and probs.\n"); + mothurOut("The classify.seqs command parameters are template, fasta, name, search, ksize, method, taxonomy, processors, match, mismatch, gapopen, gapextend, numwanted and probs.\n"); mothurOut("The template, fasta and taxonomy parameters are required. You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amzon.fasta \n"); - mothurOut("The search parameter allows you to specify the method to find most similar template. Your options are: suffix, kmer and blast. The default is kmer.\n"); + mothurOut("The search parameter allows you to specify the method to find most similar template. Your options are: suffix, kmer, blast and distance. The default is kmer.\n"); + mothurOut("The name parameter allows you add a names file with your fasta file, if you enter multiple fasta files, you must enter matching names files for them.\n"); mothurOut("The method parameter allows you to specify classification method to use. Your options are: bayesian and knn. The default is bayesian.\n"); mothurOut("The ksize parameter allows you to specify the kmer size for finding most similar template to candidate. The default is 8.\n"); mothurOut("The processors parameter allows you to specify the number of processors to use. The default is 1.\n"); diff --git a/classifyseqscommand.h b/classifyseqscommand.h index 2223593..19fc8a0 100644 --- a/classifyseqscommand.h +++ b/classifyseqscommand.h @@ -12,7 +12,7 @@ #include "mothur.h" #include "command.hpp" -#include "alignment.hpp" +//#include "alignment.hpp" #include "classify.h" //KNN and Bayesian methods modeled from algorithms in diff --git a/distancedb.cpp b/distancedb.cpp index d6ec258..25641eb 100644 --- a/distancedb.cpp +++ b/distancedb.cpp @@ -11,52 +11,96 @@ #include "database.hpp" #include "sequence.hpp" #include "distancedb.hpp" - +#include "eachgapdist.h" /**************************************************************************************************/ - -DistanceDB::DistanceDB(string fastaFileName, string distanceFileName) : Database(fastaFileName) { - - ifstream inputData; - openInputFile(distanceFileName, inputData); - int numCandSeqs=count(istreambuf_iterator(inputData),istreambuf_iterator(), '\n'); // count the number of - inputData.seekg(0); // sequences - - hit closestMatch; - string candidateSeqName; - string junk; - - mostSimSequenceVector.resize(numCandSeqs); - - for(int i=0;i> candidateSeqName >> closestMatch.seqName >> closestMatch.indexNumber >> closestMatch.simScore; - mostSimSequenceVector[i] = closestMatch; +DistanceDB::DistanceDB() { + try { + templateAligned = true; + templateSeqsLength = 0; + distCalculator = new eachGapDist(); } - mothurOut(toString(numCandSeqs)); mothurOutEndLine(); - searchIndex = 0; - inputData.close(); + catch(exception& e) { + errorOut(e, "DistanceDB", "DistanceDB"); + exit(1); + } } - /**************************************************************************************************/ - -Sequence DistanceDB::findClosestSequence(Sequence* candidateSeq){ - - hit simAccession = mostSimSequenceVector[searchIndex]; -// string candidateSeqName, closestMatchSeqName, junk; -// int closestMatchIndexNumber; -// float closestMatchSimScore; -// -// inputData >> candidateSeqName >> closestMatchSeqName >> closestMatchIndexNumber >> closestMatchSimScore; -// getline(inputData, junk); - -// searchScore = 100. * closestMatchSimScore; - - searchScore = 100. * simAccession.simScore; - searchIndex++; -// return templateSequences[closestMatchIndexNumber]; - if (templateValid) { return templateSequences[simAccession.indexNumber]; } - else { return emptySequence; } - +void DistanceDB::addSequence(Sequence seq) { + try { + //are the template sequences aligned + if (!isAligned(seq.getAligned())) { templateAligned = false; mothurOut(seq.getName() + " is not aligned. Sequences must be aligned to use the distance method."); mothurOutEndLine(); } + + if (templateSeqsLength == 0) { templateSeqsLength = seq.getAligned().length(); } + + data.push_back(seq); + } + catch(exception& e) { + errorOut(e, "DistanceDB", "addSequence"); + exit(1); + } +} +/**************************************************************************************************/ +//returns indexes to top matches +vector DistanceDB::findClosestSequences(Sequence* query, int numWanted){ + try { + vector topMatches; + bool templateSameLength = true; + string sequence = query->getAligned(); + vector dists; + + if (numWanted > data.size()) { mothurOut("numwanted is larger than the number of template sequences, using 10."); mothurOutEndLine(); numWanted = 10; } + + if (sequence.length() != templateSeqsLength) { templateSameLength = false; } + + if (templateSameLength && templateAligned) { + //calc distance from this sequence to every sequence in the template + for (int i = 0; i < data.size(); i++) { + distCalculator->calcDist(*query, data[i]); + float dist = distCalculator->getDist(); + + //save distance to each template sequence + seqDist temp(-1, i, dist); + dists.push_back(temp); + } + + sort(dists.begin(), dists.end(), compareSequenceDistance); //sorts by distance lowest to highest + + //fill topmatches with numwanted closest sequences indexes + for (int i = 0; i < numWanted; i++) { + topMatches.push_back(dists[i].seq2); + } + + }else{ + mothurOut("cannot find closest matches using distance method for " + query->getName() + " without aligned template sequences of the same length."); mothurOutEndLine(); + exit(1); + } + + return topMatches; + } + catch(exception& e) { + errorOut(e, "DistanceDB", "findClosestSequence"); + exit(1); + } +} +/**************************************************************************************************/ +bool DistanceDB::isAligned(string seq){ + try { + bool aligned; + + int pos = seq.find_first_of(".-"); + + if (pos != seq.npos) { + aligned = true; + }else { aligned = false; } + + + return aligned; + } + catch(exception& e) { + errorOut(e, "DistanceDB", "isAligned"); + exit(1); + } } /**************************************************************************************************/ diff --git a/distancedb.hpp b/distancedb.hpp index b0d46c6..01fea59 100644 --- a/distancedb.hpp +++ b/distancedb.hpp @@ -5,31 +5,35 @@ * distancedb.hpp * * - * Created by Pat Schloss on 12/29/08. - * Copyright 2008 Patrick D. Schloss. All rights reserved. + * Created by westcott on 1/27/10. + * Copyright 2010 Schloss Lab. All rights reserved. * */ #include "mothur.h" +#include "dist.h" class DistanceDB : public Database { public: - - DistanceDB(string, string); - vector findClosestSequences(Sequence*, int); + + DistanceDB(); + ~DistanceDB() {} + + void generateDB() {} //doesn't generate a search db + void addSequence(Sequence); + vector findClosestSequences(Sequence*, int); // returns indexes of n closest sequences to query private: - - struct hit{ - string seqName; - int indexNumber; - float simScore; - }; + vector data; + Dist* distCalculator; + + int templateSeqsLength; + bool templateAligned; + + bool isAligned(string); - vector mostSimSequenceVector; - int searchIndex; }; #endif diff --git a/mothur.cpp b/mothur.cpp index d4177e3..bd7ab72 100644 --- a/mothur.cpp +++ b/mothur.cpp @@ -19,9 +19,8 @@ CommandFactory* CommandFactory::_uniqueInstance = 0; int main(int argc, char *argv[]){ try { - //remove old logfile -// string log = "mothur.logFile"; -// remove(log.c_str()); + string log = "mothur.logFile"; + remove(log.c_str()); time_t ltime = time(NULL); /* calendar time */ string logFileName = "mothur." + toString(ltime) + ".logfile"; @@ -104,10 +103,11 @@ int main(int argc, char *argv[]){ string outputDir = mothur->getOutputDir(); logFileName = outputDir + logFileName; - delete mothur; - -// rename(log.c_str(), logFileName.c_str()); //logfile with timestamp + //need this because mothur.h makes the logfile, but doesn't know where to put it + rename(log.c_str(), logFileName.c_str()); //logfile with timestamp + delete mothur; + return 0; } catch(exception& e) { diff --git a/mothur.h b/mothur.h index 6381e51..8065dd4 100644 --- a/mothur.h +++ b/mothur.h @@ -489,7 +489,7 @@ inline bool isBlank(string fileName){ inline string getFullPathName(string fileName){ string path = hasPath(fileName); - string newFileName = getSimpleName(fileName); + string newFileName; int pos; if (path == "") { return fileName; } //its a simple name @@ -501,6 +501,7 @@ inline string getFullPathName(string fileName){ //get current working directory #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) if (path.rfind("./") == -1) { return fileName; } //already complete name + else { newFileName = fileName.substr(fileName.rfind("./")+2); } //save the complete part of the name char* cwdpath; size_t size; @@ -543,6 +544,7 @@ inline string getFullPathName(string fileName){ #else if (path.rfind(".\\") == -1) { return fileName; } //already complete name + else { newFileName = fileName.substr(fileName.rfind(".\\")+2); } //save the complete part of the name char *cwdpath = NULL; cwdpath = getcwd(NULL, 0); // or _getcwd -- 2.39.2