X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=blobdiff_plain;f=distancedb.cpp;h=27e278574493c89d0197d99cbe71c0c3dd03d0d9;hp=d6ec258702207ca1312e004f0f275095f506f326;hb=a8e2df1b96a57f5f29576b08361b86a96a8eff4f;hpb=63e089e0b3aad1741bab60119ed7ccc784dce347 diff --git a/distancedb.cpp b/distancedb.cpp index d6ec258..27e2785 100644 --- a/distancedb.cpp +++ b/distancedb.cpp @@ -11,52 +11,135 @@ #include "database.hpp" #include "sequence.hpp" #include "distancedb.hpp" +#include "onegapignore.h" /**************************************************************************************************/ - -DistanceDB::DistanceDB(string fastaFileName, string distanceFileName) : Database(fastaFileName) { - - ifstream inputData; - openInputFile(distanceFileName, inputData); - int numCandSeqs=count(istreambuf_iterator(inputData),istreambuf_iterator(), '\n'); // count the number of - inputData.seekg(0); // sequences - - hit closestMatch; - string candidateSeqName; - string junk; - - mostSimSequenceVector.resize(numCandSeqs); - - for(int i=0;i> candidateSeqName >> closestMatch.seqName >> closestMatch.indexNumber >> closestMatch.simScore; - mostSimSequenceVector[i] = closestMatch; +DistanceDB::DistanceDB() : Database() { + try { + templateAligned = true; + templateSeqsLength = 0; + distCalculator = new oneGapIgnoreTermGapDist(); } - mothurOut(toString(numCandSeqs)); mothurOutEndLine(); - searchIndex = 0; - inputData.close(); + catch(exception& e) { + m->errorOut(e, "DistanceDB", "DistanceDB"); + exit(1); + } } - /**************************************************************************************************/ - -Sequence DistanceDB::findClosestSequence(Sequence* candidateSeq){ - - hit simAccession = mostSimSequenceVector[searchIndex]; -// string candidateSeqName, closestMatchSeqName, junk; -// int closestMatchIndexNumber; -// float closestMatchSimScore; -// -// inputData >> candidateSeqName >> closestMatchSeqName >> closestMatchIndexNumber >> closestMatchSimScore; -// getline(inputData, junk); - -// searchScore = 100. * closestMatchSimScore; - - searchScore = 100. * simAccession.simScore; - searchIndex++; -// return templateSequences[closestMatchIndexNumber]; - if (templateValid) { return templateSequences[simAccession.indexNumber]; } - else { return emptySequence; } +void DistanceDB::addSequence(Sequence seq) { + try { + //are the template sequences aligned + if (!isAligned(seq.getAligned())) { + templateAligned = false; + m->mothurOut(seq.getName() + " is not aligned. Sequences must be aligned to use the distance method."); + m->mothurOutEndLine(); + } + + if (templateSeqsLength == 0) { templateSeqsLength = seq.getAligned().length(); } + + data.push_back(seq); + } + catch(exception& e) { + m->errorOut(e, "DistanceDB", "addSequence"); + exit(1); + } +} +/**************************************************************************************************/ +//returns indexes to top matches +vector DistanceDB::findClosestSequences(Sequence* query, int numWanted){ + try { + vector topMatches; + Scores.clear(); + bool templateSameLength = true; + string sequence = query->getAligned(); + vector dists; + + searchScore = -1.0; + if (numWanted > data.size()){ + m->mothurOut("numwanted is larger than the number of template sequences, using "+ toString(data.size()) + "."); + m->mothurOutEndLine(); + numWanted = data.size(); + } + + if (sequence.length() != templateSeqsLength) { templateSameLength = false; } + + if (templateSameLength && templateAligned) { + if (numWanted != 1) { + + dists.resize(data.size()); + + //calc distance from this sequence to every sequence in the template + for (int i = 0; i < data.size(); i++) { + distCalculator->calcDist(*query, data[i]); + float dist = distCalculator->getDist(); + + //save distance to each template sequence + dists[i].seq1 = -1; + dists[i].seq2 = i; + dists[i].dist = dist; + } + + sort(dists.begin(), dists.end(), compareSequenceDistance); //sorts by distance lowest to highest + + //save distance of best match + searchScore = dists[0].dist; + + //fill topmatches with numwanted closest sequences indexes + for (int i = 0; i < numWanted; i++) { + topMatches.push_back(dists[i].seq2); + Scores.push_back(dists[i].dist); + } + }else { + int bestIndex = 0; + float smallDist = 100000; + for (int i = 0; i < data.size(); i++) { + distCalculator->calcDist(*query, data[i]); + float dist = distCalculator->getDist(); + + //are you smaller? + if (dist < smallDist) { + bestIndex = i; + smallDist = dist; + } + } + searchScore = smallDist; + topMatches.push_back(bestIndex); + Scores.push_back(smallDist); + } + + }else{ + m->mothurOut("cannot find closest matches using distance method for " + query->getName() + " without aligned template sequences of the same length."); + m->mothurOutEndLine(); + exit(1); + } + + return topMatches; + } + catch(exception& e) { + m->errorOut(e, "DistanceDB", "findClosestSequence"); + exit(1); + } +} +/**************************************************************************************************/ +bool DistanceDB::isAligned(string seq){ + try { + bool aligned; + + int pos = seq.find_first_of(".-"); + + if (pos != seq.npos) { + aligned = true; + }else { aligned = false; } + + + return aligned; + } + catch(exception& e) { + m->errorOut(e, "DistanceDB", "isAligned"); + exit(1); + } } /**************************************************************************************************/