]> git.donarmstrong.com Git - mothur.git/blobdiff - distancedb.cpp
sffinfo bug with flow grams right index when clipQualRight=0
[mothur.git] / distancedb.cpp
index d6ec258702207ca1312e004f0f275095f506f326..27e278574493c89d0197d99cbe71c0c3dd03d0d9 100644 (file)
 #include "database.hpp"
 #include "sequence.hpp"
 #include "distancedb.hpp"
+#include "onegapignore.h"
 
 
 /**************************************************************************************************/
-
-DistanceDB::DistanceDB(string fastaFileName, string distanceFileName) : Database(fastaFileName) {
-       
-       ifstream inputData;
-       openInputFile(distanceFileName, inputData);
-       int numCandSeqs=count(istreambuf_iterator<char>(inputData),istreambuf_iterator<char>(), '\n');  //      count the number of
-       inputData.seekg(0);                                                                                                                                             //      sequences
-
-       hit closestMatch;
-       string candidateSeqName;
-       string junk;
-       
-       mostSimSequenceVector.resize(numCandSeqs);
-       
-       for(int i=0;i<numCandSeqs;i++){
-               inputData >> candidateSeqName >> closestMatch.seqName >> closestMatch.indexNumber >> closestMatch.simScore;     
-               mostSimSequenceVector[i] = closestMatch;
+DistanceDB::DistanceDB() : Database() { 
+       try {
+               templateAligned = true;  
+               templateSeqsLength = 0; 
+               distCalculator = new oneGapIgnoreTermGapDist();
        }
-       mothurOut(toString(numCandSeqs)); mothurOutEndLine();
-       searchIndex = 0;
-       inputData.close();
+       catch(exception& e) {
+               m->errorOut(e, "DistanceDB", "DistanceDB");
+               exit(1);
+       }       
 }
-
 /**************************************************************************************************/
-
-Sequence DistanceDB::findClosestSequence(Sequence* candidateSeq){
-       
-       hit simAccession = mostSimSequenceVector[searchIndex];
-//     string candidateSeqName, closestMatchSeqName, junk;
-//     int closestMatchIndexNumber;
-//     float closestMatchSimScore;
-//     
-//     inputData >> candidateSeqName >> closestMatchSeqName >> closestMatchIndexNumber >> closestMatchSimScore;
-//     getline(inputData, junk);       
-
-//     searchScore = 100. * closestMatchSimScore;
-
-       searchScore = 100. * simAccession.simScore;
-       searchIndex++;
-//     return templateSequences[closestMatchIndexNumber];
-       if (templateValid) {  return templateSequences[simAccession.indexNumber];       }
-       else {  return emptySequence;   }
+void DistanceDB::addSequence(Sequence seq) {
+       try {
+               //are the template sequences aligned
+               if (!isAligned(seq.getAligned())) {
+                       templateAligned = false;
+                       m->mothurOut(seq.getName() + " is not aligned. Sequences must be aligned to use the distance method.");
+                       m->mothurOutEndLine(); 
+               }
+               
+               if (templateSeqsLength == 0) { templateSeqsLength = seq.getAligned().length(); }
+                               
+               data.push_back(seq);
+       }
+       catch(exception& e) {
+               m->errorOut(e, "DistanceDB", "addSequence");
+               exit(1);
+       }       
+}
+/**************************************************************************************************/
+//returns indexes to top matches
+vector<int> DistanceDB::findClosestSequences(Sequence* query, int numWanted){
+       try {
+               vector<int> topMatches;
+               Scores.clear();
+               bool templateSameLength = true;
+               string sequence = query->getAligned();
+               vector<seqDist> dists; 
+               
+               searchScore = -1.0;
        
+               if (numWanted > data.size()){
+                       m->mothurOut("numwanted is larger than the number of template sequences, using "+ toString(data.size()) + ".");
+                       m->mothurOutEndLine();
+                       numWanted = data.size();
+               }
+               
+               if (sequence.length() != templateSeqsLength) { templateSameLength = false; }
+               
+               if (templateSameLength && templateAligned) {
+                       if (numWanted != 1) {
+                               
+                               dists.resize(data.size());
+                               
+                               //calc distance from this sequence to every sequence in the template
+                               for (int i = 0; i < data.size(); i++) {
+                                       distCalculator->calcDist(*query, data[i]);
+                                       float dist = distCalculator->getDist();
+                                       
+                                       //save distance to each template sequence
+                                       dists[i].seq1 = -1;
+                                       dists[i].seq2 = i;
+                                       dists[i].dist = dist;
+                               }
+                               
+                               sort(dists.begin(), dists.end(), compareSequenceDistance);  //sorts by distance lowest to highest
+                               
+                               //save distance of best match
+                               searchScore = dists[0].dist;
+                               
+                               //fill topmatches with numwanted closest sequences indexes
+                               for (int i = 0; i < numWanted; i++) {
+                                       topMatches.push_back(dists[i].seq2);
+                                       Scores.push_back(dists[i].dist);
+                               }
+                       }else {
+                               int bestIndex = 0;
+                               float smallDist = 100000;
+                               for (int i = 0; i < data.size(); i++) {
+                                       distCalculator->calcDist(*query, data[i]);
+                                       float dist = distCalculator->getDist();
+                                       
+                                       //are you smaller?
+                                       if (dist < smallDist) {
+                                               bestIndex = i;
+                                               smallDist = dist;
+                                       }
+                               }
+                               searchScore = smallDist;
+                               topMatches.push_back(bestIndex);
+                               Scores.push_back(smallDist);
+                       }
+               
+               }else{
+                       m->mothurOut("cannot find closest matches using distance method for " + query->getName() + " without aligned template sequences of the same length.");
+                       m->mothurOutEndLine();
+                       exit(1);
+               }
+               
+               return topMatches;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "DistanceDB", "findClosestSequence");
+               exit(1);
+       }       
+}
+/**************************************************************************************************/
+bool DistanceDB::isAligned(string seq){
+       try {
+               bool aligned;
+               
+               int pos = seq.find_first_of(".-");
+               
+               if (pos != seq.npos) {
+                       aligned = true;
+               }else { aligned = false; }
+               
+               
+               return aligned;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "DistanceDB", "isAligned");
+               exit(1);
+       }       
 }
 
 /**************************************************************************************************/