A787821110A0AAE70086103D /* bayesian.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A787821010A0AAE70086103D /* bayesian.cpp */; };
A78782AB10A1B1CB0086103D /* alignmentdb.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A78782AA10A1B1CB0086103D /* alignmentdb.cpp */; };
A787844510A1EBDD0086103D /* knn.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A787844410A1EBDD0086103D /* knn.cpp */; };
+ A794201111107897003AECCD /* distancedb.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A794200F11107897003AECCD /* distancedb.cpp */; };
A7B04493106CC3E60046FC83 /* chimeraslayer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7B04492106CC3E60046FC83 /* chimeraslayer.cpp */; };
A7B0450E106CEEC90046FC83 /* slayer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7B0450D106CEEC90046FC83 /* slayer.cpp */; };
A7D176CF10F2558500159497 /* pcacommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7D176CE10F2558500159497 /* pcacommand.cpp */; };
A78782AA10A1B1CB0086103D /* alignmentdb.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = alignmentdb.cpp; sourceTree = SOURCE_ROOT; };
A787844310A1EBDD0086103D /* knn.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = knn.h; sourceTree = SOURCE_ROOT; };
A787844410A1EBDD0086103D /* knn.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = knn.cpp; sourceTree = SOURCE_ROOT; };
+ A794200F11107897003AECCD /* distancedb.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = distancedb.cpp; sourceTree = "<group>"; };
+ A794201011107897003AECCD /* distancedb.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = distancedb.hpp; sourceTree = "<group>"; };
A7B04491106CC3E60046FC83 /* chimeraslayer.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimeraslayer.h; sourceTree = SOURCE_ROOT; };
A7B04492106CC3E60046FC83 /* chimeraslayer.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimeraslayer.cpp; sourceTree = SOURCE_ROOT; };
A7B0450C106CEEC90046FC83 /* slayer.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = slayer.h; sourceTree = SOURCE_ROOT; };
37D927D40F21331F001D4494 /* database.hpp */,
37D927D30F21331F001D4494 /* database.cpp */,
37D927D50F21331F001D4494 /* datavector.hpp */,
+ A794201011107897003AECCD /* distancedb.hpp */,
+ A794200F11107897003AECCD /* distancedb.cpp */,
37D927DC0F21331F001D4494 /* fastamap.h */,
37D927DB0F21331F001D4494 /* fastamap.cpp */,
375873EA0F7D64520040F377 /* fullmatrix.h */,
A7D86C8C10FE09FE00865661 /* formatphylip.cpp in Sources */,
A704E8A31106045D00870157 /* otuhierarchycommand.cpp in Sources */,
A70B00C8110885EB002F453A /* setdircommand.cpp in Sources */,
+ A794201111107897003AECCD /* distancedb.cpp in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
#include "kmerdb.hpp"
#include "suffixdb.hpp"
#include "blastdb.hpp"
+#include "distancedb.hpp"
/**************************************************************************************************/
Classify::Classify(string tfile, string tempFile, string method, int kmerSize, float gapOpen, float gapExtend, float match, float misMatch) : taxFile(tfile), templateFile(tempFile) {
}
else if(method == "suffix") { database = new SuffixDB(numSeqs); }
else if(method == "blast") { database = new BlastDB(gapOpen, gapExtend, match, misMatch); }
+ else if(method == "distance") { database = new DistanceDB(); }
else {
mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8.");
mothurOutEndLine();
void ClassifySeqsCommand::help(){
try {
mothurOut("The classify.seqs command reads a fasta file containing sequences and creates a .taxonomy file and a .tax.summary file.\n");
- mothurOut("The classify.seqs command parameters are template, fasta, search, ksize, method, taxonomy, processors, match, mismatch, gapopen, gapextend, numwanted and probs.\n");
+ mothurOut("The classify.seqs command parameters are template, fasta, name, search, ksize, method, taxonomy, processors, match, mismatch, gapopen, gapextend, numwanted and probs.\n");
mothurOut("The template, fasta and taxonomy parameters are required. You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amzon.fasta \n");
- mothurOut("The search parameter allows you to specify the method to find most similar template. Your options are: suffix, kmer and blast. The default is kmer.\n");
+ mothurOut("The search parameter allows you to specify the method to find most similar template. Your options are: suffix, kmer, blast and distance. The default is kmer.\n");
+ mothurOut("The name parameter allows you add a names file with your fasta file, if you enter multiple fasta files, you must enter matching names files for them.\n");
mothurOut("The method parameter allows you to specify classification method to use. Your options are: bayesian and knn. The default is bayesian.\n");
mothurOut("The ksize parameter allows you to specify the kmer size for finding most similar template to candidate. The default is 8.\n");
mothurOut("The processors parameter allows you to specify the number of processors to use. The default is 1.\n");
#include "mothur.h"
#include "command.hpp"
-#include "alignment.hpp"
+//#include "alignment.hpp"
#include "classify.h"
//KNN and Bayesian methods modeled from algorithms in
#include "database.hpp"
#include "sequence.hpp"
#include "distancedb.hpp"
-
+#include "eachgapdist.h"
/**************************************************************************************************/
-
-DistanceDB::DistanceDB(string fastaFileName, string distanceFileName) : Database(fastaFileName) {
-
- ifstream inputData;
- openInputFile(distanceFileName, inputData);
- int numCandSeqs=count(istreambuf_iterator<char>(inputData),istreambuf_iterator<char>(), '\n'); // count the number of
- inputData.seekg(0); // sequences
-
- hit closestMatch;
- string candidateSeqName;
- string junk;
-
- mostSimSequenceVector.resize(numCandSeqs);
-
- for(int i=0;i<numCandSeqs;i++){
- inputData >> candidateSeqName >> closestMatch.seqName >> closestMatch.indexNumber >> closestMatch.simScore;
- mostSimSequenceVector[i] = closestMatch;
+DistanceDB::DistanceDB() {
+ try {
+ templateAligned = true;
+ templateSeqsLength = 0;
+ distCalculator = new eachGapDist();
}
- mothurOut(toString(numCandSeqs)); mothurOutEndLine();
- searchIndex = 0;
- inputData.close();
+ catch(exception& e) {
+ errorOut(e, "DistanceDB", "DistanceDB");
+ exit(1);
+ }
}
-
/**************************************************************************************************/
-
-Sequence DistanceDB::findClosestSequence(Sequence* candidateSeq){
-
- hit simAccession = mostSimSequenceVector[searchIndex];
-// string candidateSeqName, closestMatchSeqName, junk;
-// int closestMatchIndexNumber;
-// float closestMatchSimScore;
-//
-// inputData >> candidateSeqName >> closestMatchSeqName >> closestMatchIndexNumber >> closestMatchSimScore;
-// getline(inputData, junk);
-
-// searchScore = 100. * closestMatchSimScore;
-
- searchScore = 100. * simAccession.simScore;
- searchIndex++;
-// return templateSequences[closestMatchIndexNumber];
- if (templateValid) { return templateSequences[simAccession.indexNumber]; }
- else { return emptySequence; }
-
+void DistanceDB::addSequence(Sequence seq) {
+ try {
+ //are the template sequences aligned
+ if (!isAligned(seq.getAligned())) { templateAligned = false; mothurOut(seq.getName() + " is not aligned. Sequences must be aligned to use the distance method."); mothurOutEndLine(); }
+
+ if (templateSeqsLength == 0) { templateSeqsLength = seq.getAligned().length(); }
+
+ data.push_back(seq);
+ }
+ catch(exception& e) {
+ errorOut(e, "DistanceDB", "addSequence");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+//returns indexes to top matches
+vector<int> DistanceDB::findClosestSequences(Sequence* query, int numWanted){
+ try {
+ vector<int> topMatches;
+ bool templateSameLength = true;
+ string sequence = query->getAligned();
+ vector<seqDist> dists;
+
+ if (numWanted > data.size()) { mothurOut("numwanted is larger than the number of template sequences, using 10."); mothurOutEndLine(); numWanted = 10; }
+
+ if (sequence.length() != templateSeqsLength) { templateSameLength = false; }
+
+ if (templateSameLength && templateAligned) {
+ //calc distance from this sequence to every sequence in the template
+ for (int i = 0; i < data.size(); i++) {
+ distCalculator->calcDist(*query, data[i]);
+ float dist = distCalculator->getDist();
+
+ //save distance to each template sequence
+ seqDist temp(-1, i, dist);
+ dists.push_back(temp);
+ }
+
+ sort(dists.begin(), dists.end(), compareSequenceDistance); //sorts by distance lowest to highest
+
+ //fill topmatches with numwanted closest sequences indexes
+ for (int i = 0; i < numWanted; i++) {
+ topMatches.push_back(dists[i].seq2);
+ }
+
+ }else{
+ mothurOut("cannot find closest matches using distance method for " + query->getName() + " without aligned template sequences of the same length."); mothurOutEndLine();
+ exit(1);
+ }
+
+ return topMatches;
+ }
+ catch(exception& e) {
+ errorOut(e, "DistanceDB", "findClosestSequence");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+bool DistanceDB::isAligned(string seq){
+ try {
+ bool aligned;
+
+ int pos = seq.find_first_of(".-");
+
+ if (pos != seq.npos) {
+ aligned = true;
+ }else { aligned = false; }
+
+
+ return aligned;
+ }
+ catch(exception& e) {
+ errorOut(e, "DistanceDB", "isAligned");
+ exit(1);
+ }
}
/**************************************************************************************************/
* distancedb.hpp
*
*
- * Created by Pat Schloss on 12/29/08.
- * Copyright 2008 Patrick D. Schloss. All rights reserved.
+ * Created by westcott on 1/27/10.
+ * Copyright 2010 Schloss Lab. All rights reserved.
*
*/
#include "mothur.h"
+#include "dist.h"
class DistanceDB : public Database {
public:
-
- DistanceDB(string, string);
- vector<int> findClosestSequences(Sequence*, int);
+
+ DistanceDB();
+ ~DistanceDB() {}
+
+ void generateDB() {} //doesn't generate a search db
+ void addSequence(Sequence);
+ vector<int> findClosestSequences(Sequence*, int); // returns indexes of n closest sequences to query
private:
-
- struct hit{
- string seqName;
- int indexNumber;
- float simScore;
- };
+ vector<Sequence> data;
+ Dist* distCalculator;
+
+ int templateSeqsLength;
+ bool templateAligned;
+
+ bool isAligned(string);
- vector<hit> mostSimSequenceVector;
- int searchIndex;
};
#endif
int main(int argc, char *argv[]){
try {
- //remove old logfile
-// string log = "mothur.logFile";
-// remove(log.c_str());
+ string log = "mothur.logFile";
+ remove(log.c_str());
time_t ltime = time(NULL); /* calendar time */
string logFileName = "mothur." + toString(ltime) + ".logfile";
string outputDir = mothur->getOutputDir();
logFileName = outputDir + logFileName;
- delete mothur;
-
-// rename(log.c_str(), logFileName.c_str()); //logfile with timestamp
+ //need this because mothur.h makes the logfile, but doesn't know where to put it
+ rename(log.c_str(), logFileName.c_str()); //logfile with timestamp
+ delete mothur;
+
return 0;
}
catch(exception& e) {
inline string getFullPathName(string fileName){
string path = hasPath(fileName);
- string newFileName = getSimpleName(fileName);
+ string newFileName;
int pos;
if (path == "") { return fileName; } //its a simple name
//get current working directory
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
if (path.rfind("./") == -1) { return fileName; } //already complete name
+ else { newFileName = fileName.substr(fileName.rfind("./")+2); } //save the complete part of the name
char* cwdpath;
size_t size;
#else
if (path.rfind(".\\") == -1) { return fileName; } //already complete name
+ else { newFileName = fileName.substr(fileName.rfind(".\\")+2); } //save the complete part of the name
char *cwdpath = NULL;
cwdpath = getcwd(NULL, 0); // or _getcwd