]> git.donarmstrong.com Git - mothur.git/commitdiff
added distance search method to classify.seqs
authorwestcott <westcott>
Wed, 27 Jan 2010 16:04:47 +0000 (16:04 +0000)
committerwestcott <westcott>
Wed, 27 Jan 2010 16:04:47 +0000 (16:04 +0000)
Mothur.xcodeproj/project.pbxproj
classify.cpp
classifyseqscommand.cpp
classifyseqscommand.h
distancedb.cpp
distancedb.hpp
mothur.cpp
mothur.h

index b0db01845e76ddedd0486f3b1906e626f6c88ff7..e697db8d1b3e6c4718f1f24320b59bdb6c7d0d67 100644 (file)
                A787821110A0AAE70086103D /* bayesian.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A787821010A0AAE70086103D /* bayesian.cpp */; };
                A78782AB10A1B1CB0086103D /* alignmentdb.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A78782AA10A1B1CB0086103D /* alignmentdb.cpp */; };
                A787844510A1EBDD0086103D /* knn.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A787844410A1EBDD0086103D /* knn.cpp */; };
+               A794201111107897003AECCD /* distancedb.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A794200F11107897003AECCD /* distancedb.cpp */; };
                A7B04493106CC3E60046FC83 /* chimeraslayer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7B04492106CC3E60046FC83 /* chimeraslayer.cpp */; };
                A7B0450E106CEEC90046FC83 /* slayer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7B0450D106CEEC90046FC83 /* slayer.cpp */; };
                A7D176CF10F2558500159497 /* pcacommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7D176CE10F2558500159497 /* pcacommand.cpp */; };
                A78782AA10A1B1CB0086103D /* alignmentdb.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = alignmentdb.cpp; sourceTree = SOURCE_ROOT; };
                A787844310A1EBDD0086103D /* knn.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = knn.h; sourceTree = SOURCE_ROOT; };
                A787844410A1EBDD0086103D /* knn.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = knn.cpp; sourceTree = SOURCE_ROOT; };
+               A794200F11107897003AECCD /* distancedb.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = distancedb.cpp; sourceTree = "<group>"; };
+               A794201011107897003AECCD /* distancedb.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = distancedb.hpp; sourceTree = "<group>"; };
                A7B04491106CC3E60046FC83 /* chimeraslayer.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimeraslayer.h; sourceTree = SOURCE_ROOT; };
                A7B04492106CC3E60046FC83 /* chimeraslayer.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimeraslayer.cpp; sourceTree = SOURCE_ROOT; };
                A7B0450C106CEEC90046FC83 /* slayer.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = slayer.h; sourceTree = SOURCE_ROOT; };
                                37D927D40F21331F001D4494 /* database.hpp */,
                                37D927D30F21331F001D4494 /* database.cpp */,
                                37D927D50F21331F001D4494 /* datavector.hpp */,
+                               A794201011107897003AECCD /* distancedb.hpp */,
+                               A794200F11107897003AECCD /* distancedb.cpp */,
                                37D927DC0F21331F001D4494 /* fastamap.h */,
                                37D927DB0F21331F001D4494 /* fastamap.cpp */,
                                375873EA0F7D64520040F377 /* fullmatrix.h */,
                                A7D86C8C10FE09FE00865661 /* formatphylip.cpp in Sources */,
                                A704E8A31106045D00870157 /* otuhierarchycommand.cpp in Sources */,
                                A70B00C8110885EB002F453A /* setdircommand.cpp in Sources */,
+                               A794201111107897003AECCD /* distancedb.cpp in Sources */,
                        );
                        runOnlyForDeploymentPostprocessing = 0;
                };
index c0f8a9c7d5115c60d46626235e82b99b625c43c1..346c76478eefdf896fe6f04659174321bc721063 100644 (file)
@@ -12,6 +12,7 @@
 #include "kmerdb.hpp"
 #include "suffixdb.hpp"
 #include "blastdb.hpp"
+#include "distancedb.hpp"
 
 /**************************************************************************************************/
 Classify::Classify(string tfile, string tempFile, string method, int kmerSize, float gapOpen, float gapExtend, float match, float misMatch) : taxFile(tfile), templateFile(tempFile) {         
@@ -42,6 +43,7 @@ Classify::Classify(string tfile, string tempFile, string method, int kmerSize, f
                }
                else if(method == "suffix")             {       database = new SuffixDB(numSeqs);                                                               }
                else if(method == "blast")              {       database = new BlastDB(gapOpen, gapExtend, match, misMatch);    }
+               else if(method == "distance")   {       database = new DistanceDB();    }
                else {
                        mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8.");
                        mothurOutEndLine();
index 0d9e797a00c226ba2e2eea219d041fa1143d03fa..e6eb560e6880ff500f47d8c629f1a0b0c04066b7 100644 (file)
@@ -204,9 +204,10 @@ ClassifySeqsCommand::~ClassifySeqsCommand(){
 void ClassifySeqsCommand::help(){
        try {
                mothurOut("The classify.seqs command reads a fasta file containing sequences and creates a .taxonomy file and a .tax.summary file.\n");
-               mothurOut("The classify.seqs command parameters are template, fasta, search, ksize, method, taxonomy, processors, match, mismatch, gapopen, gapextend, numwanted and probs.\n");
+               mothurOut("The classify.seqs command parameters are template, fasta, name, search, ksize, method, taxonomy, processors, match, mismatch, gapopen, gapextend, numwanted and probs.\n");
                mothurOut("The template, fasta and taxonomy parameters are required. You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amzon.fasta \n");
-               mothurOut("The search parameter allows you to specify the method to find most similar template.  Your options are: suffix, kmer and blast. The default is kmer.\n");
+               mothurOut("The search parameter allows you to specify the method to find most similar template.  Your options are: suffix, kmer, blast and distance. The default is kmer.\n");
+               mothurOut("The name parameter allows you add a names file with your fasta file, if you enter multiple fasta files, you must enter matching names files for them.\n");
                mothurOut("The method parameter allows you to specify classification method to use.  Your options are: bayesian and knn. The default is bayesian.\n");
                mothurOut("The ksize parameter allows you to specify the kmer size for finding most similar template to candidate.  The default is 8.\n");
                mothurOut("The processors parameter allows you to specify the number of processors to use. The default is 1.\n");
index 2223593c4e7758ac48844dbf7d5093b20f361b5d..19fc8a011d9b3dbc8dd8f1e482f7b50dc100078f 100644 (file)
@@ -12,7 +12,7 @@
 
 #include "mothur.h"
 #include "command.hpp"
-#include "alignment.hpp"
+//#include "alignment.hpp"
 #include "classify.h"
 
 //KNN and Bayesian methods modeled from algorithms in
index d6ec258702207ca1312e004f0f275095f506f326..25641eb0dacdf63d812cd1bf2a828a7cdfd5f8d2 100644 (file)
 #include "database.hpp"
 #include "sequence.hpp"
 #include "distancedb.hpp"
-
+#include "eachgapdist.h"
 
 /**************************************************************************************************/
-
-DistanceDB::DistanceDB(string fastaFileName, string distanceFileName) : Database(fastaFileName) {
-       
-       ifstream inputData;
-       openInputFile(distanceFileName, inputData);
-       int numCandSeqs=count(istreambuf_iterator<char>(inputData),istreambuf_iterator<char>(), '\n');  //      count the number of
-       inputData.seekg(0);                                                                                                                                             //      sequences
-
-       hit closestMatch;
-       string candidateSeqName;
-       string junk;
-       
-       mostSimSequenceVector.resize(numCandSeqs);
-       
-       for(int i=0;i<numCandSeqs;i++){
-               inputData >> candidateSeqName >> closestMatch.seqName >> closestMatch.indexNumber >> closestMatch.simScore;     
-               mostSimSequenceVector[i] = closestMatch;
+DistanceDB::DistanceDB() { 
+       try {
+               templateAligned = true;  
+               templateSeqsLength = 0; 
+               distCalculator = new eachGapDist();
        }
-       mothurOut(toString(numCandSeqs)); mothurOutEndLine();
-       searchIndex = 0;
-       inputData.close();
+       catch(exception& e) {
+               errorOut(e, "DistanceDB", "DistanceDB");
+               exit(1);
+       }       
 }
-
 /**************************************************************************************************/
-
-Sequence DistanceDB::findClosestSequence(Sequence* candidateSeq){
-       
-       hit simAccession = mostSimSequenceVector[searchIndex];
-//     string candidateSeqName, closestMatchSeqName, junk;
-//     int closestMatchIndexNumber;
-//     float closestMatchSimScore;
-//     
-//     inputData >> candidateSeqName >> closestMatchSeqName >> closestMatchIndexNumber >> closestMatchSimScore;
-//     getline(inputData, junk);       
-
-//     searchScore = 100. * closestMatchSimScore;
-
-       searchScore = 100. * simAccession.simScore;
-       searchIndex++;
-//     return templateSequences[closestMatchIndexNumber];
-       if (templateValid) {  return templateSequences[simAccession.indexNumber];       }
-       else {  return emptySequence;   }
-       
+void DistanceDB::addSequence(Sequence seq) {
+       try {
+               //are the template sequences aligned
+               if (!isAligned(seq.getAligned())) { templateAligned = false; mothurOut(seq.getName() + " is not aligned. Sequences must be aligned to use the distance method."); mothurOutEndLine(); }
+               
+               if (templateSeqsLength == 0) { templateSeqsLength = seq.getAligned().length(); }
+                               
+               data.push_back(seq);
+       }
+       catch(exception& e) {
+               errorOut(e, "DistanceDB", "addSequence");
+               exit(1);
+       }       
+}
+/**************************************************************************************************/
+//returns indexes to top matches
+vector<int> DistanceDB::findClosestSequences(Sequence* query, int numWanted){
+       try {
+               vector<int> topMatches;
+               bool templateSameLength = true;
+               string sequence = query->getAligned();
+               vector<seqDist> dists;
+               
+               if (numWanted > data.size()) { mothurOut("numwanted is larger than the number of template sequences, using 10."); mothurOutEndLine(); numWanted = 10; }
+               
+               if (sequence.length() != templateSeqsLength) { templateSameLength = false; }
+               
+               if (templateSameLength && templateAligned) {
+                       //calc distance from this sequence to every sequence in the template
+                       for (int i = 0; i < data.size(); i++) {
+                               distCalculator->calcDist(*query, data[i]);
+                               float dist = distCalculator->getDist();
+                               
+                               //save distance to each template sequence
+                               seqDist temp(-1, i, dist);
+                               dists.push_back(temp);
+                       }
+                       
+                       sort(dists.begin(), dists.end(), compareSequenceDistance);  //sorts by distance lowest to highest
+                       
+                       //fill topmatches with numwanted closest sequences indexes
+                       for (int i = 0; i < numWanted; i++) {
+                               topMatches.push_back(dists[i].seq2);
+                       }
+               
+               }else{
+                       mothurOut("cannot find closest matches using distance method for " + query->getName() + " without aligned template sequences of the same length."); mothurOutEndLine();
+                       exit(1);
+               }
+               
+               return topMatches;
+       }
+       catch(exception& e) {
+               errorOut(e, "DistanceDB", "findClosestSequence");
+               exit(1);
+       }       
+}
+/**************************************************************************************************/
+bool DistanceDB::isAligned(string seq){
+       try {
+               bool aligned;
+               
+               int pos = seq.find_first_of(".-");
+               
+               if (pos != seq.npos) {
+                       aligned = true;
+               }else { aligned = false; }
+               
+               
+               return aligned;
+       }
+       catch(exception& e) {
+               errorOut(e, "DistanceDB", "isAligned");
+               exit(1);
+       }       
 }
 
 /**************************************************************************************************/
index b0d46c6629ef1b19ce935f6f93608d2d89c8acd8..01fea59aeec3b340041fea68e26120cfd307a873 100644 (file)
@@ -5,31 +5,35 @@
  *  distancedb.hpp
  *  
  *
- *  Created by Pat Schloss on 12/29/08.
- *  Copyright 2008 Patrick D. Schloss. All rights reserved.
+ *  Created by westcott on 1/27/10.
+ *  Copyright 2010 Schloss Lab. All rights reserved.
  *
  */
 
 
 #include "mothur.h"
+#include "dist.h"
 
 class DistanceDB : public Database {
        
 public:
-
-       DistanceDB(string, string);
-       vector<int> findClosestSequences(Sequence*, int);
+       
+       DistanceDB();
+       ~DistanceDB() {}
+       
+       void generateDB() {} //doesn't generate a search db 
+       void addSequence(Sequence);  
+       vector<int> findClosestSequences(Sequence*, int);  // returns indexes of n closest sequences to query
        
 private:
-
-       struct hit{
-               string seqName;
-               int indexNumber;
-               float simScore;
-       };
+       vector<Sequence> data;
+       Dist* distCalculator;
+       
+       int templateSeqsLength;
+       bool templateAligned;
+       
+       bool isAligned(string);
        
-       vector<hit> mostSimSequenceVector;
-       int searchIndex;
 };
 
 #endif
index d4177e3368be8d836374e935b9e884d6ff26c5f4..bd7ab728c76d311ffe0dc7dcc052a026cfa6e741 100644 (file)
@@ -19,9 +19,8 @@ CommandFactory* CommandFactory::_uniqueInstance = 0;
 int main(int argc, char *argv[]){
        try {
                
-               //remove old logfile
-//             string log = "mothur.logFile";
-//             remove(log.c_str());
+               string log = "mothur.logFile";
+               remove(log.c_str());
                
                time_t ltime = time(NULL); /* calendar time */  
                string logFileName = "mothur." + toString(ltime) + ".logfile";
@@ -104,10 +103,11 @@ int main(int argc, char *argv[]){
                string outputDir = mothur->getOutputDir();
                logFileName = outputDir + logFileName;
        
-               delete mothur;
-               
-//             rename(log.c_str(), logFileName.c_str()); //logfile with timestamp
+               //need this because mothur.h makes the logfile, but doesn't know where to put it
+               rename(log.c_str(), logFileName.c_str()); //logfile with timestamp
                
+               delete mothur;
+
                return 0;
        }
        catch(exception& e) {
index 6381e51d626246244b5ce077b9b077337cfc42d8..8065dd4cc74284f11dbca4808eb2c8893a74b7c8 100644 (file)
--- a/mothur.h
+++ b/mothur.h
@@ -489,7 +489,7 @@ inline bool isBlank(string fileName){
 inline string getFullPathName(string fileName){
        
        string path = hasPath(fileName);
-       string newFileName = getSimpleName(fileName);
+       string newFileName;
        int pos;
        
        if (path == "") { return fileName; } //its a simple name
@@ -501,6 +501,7 @@ inline string getFullPathName(string fileName){
                //get current working directory 
                #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)   
                        if (path.rfind("./") == -1) { return fileName; } //already complete name
+                       else { newFileName = fileName.substr(fileName.rfind("./")+2); } //save the complete part of the name
                        
                        char* cwdpath;
                        size_t size;
@@ -543,6 +544,7 @@ inline string getFullPathName(string fileName){
                                
                #else
                        if (path.rfind(".\\") == -1) { return fileName; } //already complete name
+                       else { newFileName = fileName.substr(fileName.rfind(".\\")+2); } //save the complete part of the name
                                                
                        char *cwdpath = NULL;
                        cwdpath = getcwd(NULL, 0); // or _getcwd