]> git.donarmstrong.com Git - mothur.git/blobdiff - maligner.cpp
tracking down chimera.slayer issue
[mothur.git] / maligner.cpp
index eaaaa8789961d4938a1aba793fb32e7ac331a105..7d7145f4ca6bd9fac7a316745ace4432e4c26fe0 100644 (file)
@@ -88,7 +88,8 @@ string Maligner::chimeraMaligner(int chimeraPenalty, DeCalculator* decalc) {
 
                vector<Sequence*> temp = refSeqs;
                temp.push_back(query);
-               
+                       
+                       
                verticalFilter(temp);
 
                vector< vector<score_struct> > matrix = buildScoreMatrix(query->getAligned().length(), refSeqs.size()); //builds and initializes
@@ -105,7 +106,7 @@ string Maligner::chimeraMaligner(int chimeraPenalty, DeCalculator* decalc) {
 //             }
                
                if (trace.size() > 1) {         chimera = "yes";        }
-               else { chimera = "no";  }
+               else { chimera = "no";  return chimera; }
                
                int traceStart = trace[0].col;
                int traceEnd = trace[trace.size()-1].oldCol;    
@@ -204,7 +205,7 @@ int Maligner::computeChimeraPenalty() {
                
                int numAllowable = ((1.0 - (1.0/minDivR)) * query->getNumBases());
 
-//             if(numAllowable < 2){   numAllowable = 2;       }
+//             if(numAllowable < 1){   numAllowable = 1;       }
                
                int penalty = int(numAllowable + 1) * misMatchPenalty;
 
@@ -314,7 +315,7 @@ void Maligner::fillScoreMatrix(vector<vector<score_struct> >& ms, vector<Sequenc
                        if ((!isalpha(queryAligned[0])) && (!isalpha(subjectAligned[0]))) {
                                ms[i][0].score = 0;
 //                             ms[i][0].mismatches = 0;
-                       }else if (queryAligned[0] == subjectAligned[0]) {
+                       }else if (queryAligned[0] == subjectAligned[0] || subjectAligned[0] == 'N') {
                                ms[i][0].score = matchScore;
 //                             ms[i][0].mismatches = 0;
                        }else{
@@ -335,6 +336,7 @@ void Maligner::fillScoreMatrix(vector<vector<score_struct> >& ms, vector<Sequenc
                                if ((!isalpha(queryAligned[j])) && (!isalpha(subjectAligned[j]))) {
                                        //leave the same
                                }else if ((toupper(queryAligned[j]) == 'N') || (toupper(subjectAligned[j]) == 'N')) {
+                                       matchMisMatchScore = matchScore;
                                        //leave the same
                                }else if (queryAligned[j] == subjectAligned[j]) {
                                        matchMisMatchScore = matchScore;
@@ -421,11 +423,10 @@ vector<trace_struct> Maligner::extractHighestPath(vector<vector<score_struct> >
                        }
                }
                        
-//             cout << highestScore << '\t' << highestStruct.size() << endl;   
+//             cout << endl << highestScore << '\t' << highestStruct.size() << '\t' << highestStruct[0].row << endl;   
                
                vector<trace_struct> maxTrace;
                double maxPercentIdenticalQueryAntiChimera = 0;
-               int maxIndex = -1;
                
                for(int i=0;i<highestStruct.size();i++){
                        
@@ -453,11 +454,7 @@ vector<trace_struct> Maligner::extractHighestPath(vector<vector<score_struct> >
 //                     for(int j=0;j<trace.size();j++){
 //                             cout << trace[j].col << '\t' << trace[j].oldCol << '\t' << refSeqs[trace[j].row]->getName() << endl;
 //                     }
-                       
-//                     Need to do something with this in a bit...
-//                     if (trace.size() > 1) {         chimera = "yes";        }
-//                     else { chimera = "no";  }
-                       
+                                               
                        int traceStart = path[0].col;
                        int traceEnd = path[path.size()-1].col; 
 //                     cout << "traceStart/End\t" << traceStart << '\t' << traceEnd << endl;
@@ -475,10 +472,9 @@ vector<trace_struct> Maligner::extractHighestPath(vector<vector<score_struct> >
                        if(percentIdenticalQueryAntiChimera > maxPercentIdenticalQueryAntiChimera){
                                maxPercentIdenticalQueryAntiChimera = percentIdenticalQueryAntiChimera;
                                maxTrace = trace;
-                               maxIndex = i;
                        }
                }
-//             cout << maxPercentIdenticalQueryAntiChimera << '\t' << maxIndex << endl;
+//             cout << maxPercentIdenticalQueryAntiChimera << endl;
                return maxTrace;
                
        }
@@ -632,25 +628,35 @@ vector<Sequence*> Maligner::getBlastSeqs(Sequence* q, int num) {
                string queryUnAligned = q->getUnaligned();
                string leftQuery = queryUnAligned.substr(0, int(queryUnAligned.length() * 0.33)); //first 1/3 of the sequence
                string rightQuery = queryUnAligned.substr(int(queryUnAligned.length() * 0.66)); //last 1/3 of the sequence
-
+               
                Sequence* queryLeft = new Sequence(q->getName()+"left", leftQuery);
                Sequence* queryRight = new Sequence(q->getName()+"right", rightQuery);
-               
-               vector<int> tempIndexesRight = databaseLeft->findClosestMegaBlast(queryRight, num+1);
+
                vector<int> tempIndexesLeft = databaseLeft->findClosestMegaBlast(queryLeft, num+1);
-                       
+               vector<float> leftScores = databaseLeft->getMegaBlastSearchScores();
+               vector<int> tempIndexesRight = databaseLeft->findClosestMegaBlast(queryRight, num+1);
+               vector<float> rightScores = databaseLeft->getMegaBlastSearchScores();
+
                //if ((tempIndexesRight.size() == 0) && (tempIndexesLeft.size() == 0))  {  m->mothurOut("megablast returned " + toString(tempIndexesRight.size()) + " results for the right end, and " + toString(tempIndexesLeft.size()) + " for the left end. Needed " + toString(num+1) + ". Unable to process sequence " + q->getName()); m->mothurOutEndLine(); return refResults; }
                
                vector<int> smaller;
+               vector<float> smallerScores;
                vector<int> larger;
+               vector<float> largerScores;
+               
+               if (tempIndexesRight.size() < tempIndexesLeft.size()) { smaller = tempIndexesRight; smallerScores = rightScores; larger = tempIndexesLeft; largerScores = leftScores; }
+               else { smaller = tempIndexesLeft; smallerScores = leftScores; larger = tempIndexesRight; largerScores = rightScores; } 
                
-               if (tempIndexesRight.size() < tempIndexesLeft.size()) { smaller = tempIndexesRight;  larger = tempIndexesLeft; }
-               else { smaller = tempIndexesLeft; larger = tempIndexesRight; } 
+               //for (int i = 0; i < smaller.size(); i++) { cout << "smaller = " << smaller[i] << '\t' << smallerScores[i] << endl; }
+               //cout << endl;
+               //for (int i = 0; i < larger.size(); i++) { cout << "larger = " << larger[i] << '\t' << largerScores[i] << endl; }
                
                //merge results         
                map<int, int> seen;
                map<int, int>::iterator it;
-               
+               float lastSmaller = smallerScores[0];
+               float lastLarger = largerScores[0];
+               int lasti = 0;
                vector<int> mergedResults;
                for (int i = 0; i < smaller.size(); i++) {
                        //add left if you havent already
@@ -658,6 +664,7 @@ vector<Sequence*> Maligner::getBlastSeqs(Sequence* q, int num) {
                        if (it == seen.end()) {  
                                mergedResults.push_back(smaller[i]);
                                seen[smaller[i]] = smaller[i];
+                               lastSmaller = smallerScores[i];
                        }
 
                        //add right if you havent already
@@ -665,26 +672,74 @@ vector<Sequence*> Maligner::getBlastSeqs(Sequence* q, int num) {
                        if (it == seen.end()) {  
                                mergedResults.push_back(larger[i]);
                                seen[larger[i]] = larger[i];
+                               lastLarger = largerScores[i];
                        }
+                       
+                       lasti = i;
+                       if (mergedResults.size() > num) { break; }
                }
                
-               for (int i = smaller.size(); i < larger.size(); i++) {
-                       it = seen.find(larger[i]);
-                       if (it == seen.end()) {  
-                               mergedResults.push_back(larger[i]);
-                               seen[larger[i]] = larger[i];
+               //save lasti for smaller ties below
+               lasti++;
+               int iSmaller = lasti;
+               
+               if (!(mergedResults.size() > num)) { //if we still need more results.  
+                       for (int i = smaller.size(); i < larger.size(); i++) {
+                               it = seen.find(larger[i]);
+                               if (it == seen.end()) {  
+                                       mergedResults.push_back(larger[i]);
+                                       seen[larger[i]] = larger[i];
+                                       lastLarger = largerScores[i];
+                               }
+                               
+                               lasti = i;
+                               if (mergedResults.size() > num) {  break; }
+                       }
+               }
+               
+               
+               //add in any ties from smaller
+               while (iSmaller < smaller.size()) {
+                       if (smallerScores[iSmaller] == lastSmaller) {
+                               it = seen.find(smaller[iSmaller]);
+                               
+                               if (it == seen.end()) {  
+                                       mergedResults.push_back(smaller[iSmaller]);
+                                       seen[smaller[iSmaller]] = smaller[iSmaller];
+                               }
                        }
+                       else { break; }
+                       iSmaller++;                     
                }
                
+               lasti++;
+               //add in any ties from larger
+               while (lasti < larger.size()) {
+                       if (largerScores[lasti] == lastLarger) { //is it a tie
+                               it = seen.find(larger[lasti]);
+                               
+                               if (it == seen.end()) {  //we haven't already seen it
+                                       mergedResults.push_back(larger[lasti]);
+                                       seen[larger[lasti]] = larger[lasti];
+                               }
+                       }
+                       else { break; }
+                       lasti++;                        
+               }
+               
+               numWanted = mergedResults.size();
+               
                if (mergedResults.size() < numWanted) { numWanted = mergedResults.size(); }
 //cout << q->getName() << " merged results size = " << mergedResults.size() << '\t' << "numwanted = " << numWanted <<  endl;           
                for (int i = 0; i < numWanted; i++) {
 //cout << db[mergedResults[i]]->getName()  << '\t' << mergedResults[i] << endl;        
+                       
                        if (db[mergedResults[i]]->getName() != q->getName()) { 
                                Sequence* temp = new Sequence(db[mergedResults[i]]->getName(), db[mergedResults[i]]->getAligned());
                                refResults.push_back(temp);
                                indexes.push_back(mergedResults[i]);
                        }
+                       
 //cout << mergedResults[i] << endl;
                }
 //cout << "done " << q->getName()  << endl;