]> git.donarmstrong.com Git - mothur.git/commitdiff
chimeracode
authorwestcott <westcott>
Tue, 16 Feb 2010 15:23:05 +0000 (15:23 +0000)
committerwestcott <westcott>
Tue, 16 Feb 2010 15:23:05 +0000 (15:23 +0000)
blastdb.cpp
blastdb.hpp
chimera.h
chimerarealigner.cpp
chimeraslayer.cpp
database.hpp
maligner.cpp

index 50a0401b2cb8780765ba56a3ba2c025380c4e212..b139f3eb7a47bcc86f0920bdc0fc22bad368816a 100644 (file)
@@ -50,7 +50,7 @@ vector<int> BlastDB::findClosestSequences(Sequence* seq, int n) {
                //      wordsize used in megablast.  I'm sure we're sacrificing accuracy for speed, but anyother way would take way too
                //      long.  With this setting, it seems comparable in speed to the suffix tree approach.
                
-               string blastCommand = path + "blast/bin/blastall -p blastn -d " + dbFileName + " -b 1 -m 8 -W 28 -v " + toString(n);
+               string blastCommand = path + "blast/bin/blastall -p blastn -d " + dbFileName + " -m 8 -W 28 -v " + toString(n) + " -b " + toString(n);;
                blastCommand += (" -i " + queryFileName + " -o " + blastFileName);
                system(blastCommand.c_str());
                
@@ -82,9 +82,9 @@ vector<int> BlastDB::findClosestSequences(Sequence* seq, int n) {
 }
 /**************************************************************************************************/
 //assumes you have added all the template sequences using the addSequence function and run generateDB.
-map<int, float> BlastDB::findClosest(Sequence* seq, int n) {
+vector<int> BlastDB::findClosestMegaBlast(Sequence* seq, int n) {
        try{
-               map<int, float> topMatches;
+               vector<int> topMatches;
                
                ofstream queryFile;
                openOutputFile(queryFileName, queryFile);
@@ -96,7 +96,7 @@ map<int, float> BlastDB::findClosest(Sequence* seq, int n) {
                //      wordsize used in megablast.  I'm sure we're sacrificing accuracy for speed, but anyother way would take way too
                //      long.  With this setting, it seems comparable in speed to the suffix tree approach.
        
-               string blastCommand = path + "blast/bin/blastall -p blastn -d " + dbFileName + " -m 8 -W 28 -b " + toString(n) + " -v " + toString(n);
+               string blastCommand = path + "blast/bin/megablast -e 1e-10 -d " + dbFileName + " -m 8 -b " + toString(n) + " -v " + toString(n); //-W 28 -p blastn
                blastCommand += (" -i " + queryFileName + " -o " + blastFileName);
                system(blastCommand.c_str());
                
@@ -106,22 +106,18 @@ map<int, float> BlastDB::findClosest(Sequence* seq, int n) {
                string dummy;
                int templateAccession;
                gobble(m8FileHandle);
-//string name = seq->getName();
-//ofstream out;
-//openOutputFileAppend(name, out);     
+               
                while(!m8FileHandle.eof()){
                        m8FileHandle >> dummy >> templateAccession >> searchScore;
-//out << dummy << '\t' <<  templateAccession   << '\t' << searchScore << endl;
+                       
                        //get rest of junk in line
-                       while (!m8FileHandle.eof())     {       char c = m8FileHandle.get(); 
-                       //out << c; 
-                       if (c == 10 || c == 13){        break;  }       } 
+                       while (!m8FileHandle.eof())     {       char c = m8FileHandle.get(); if (c == 10 || c == 13){   break;  }       } 
                        
                        gobble(m8FileHandle);
-                       topMatches[templateAccession] = searchScore;
+                       topMatches.push_back(templateAccession);
                }
                m8FileHandle.close();
-//out.close();         
+               
                return topMatches;
        }
        catch(exception& e) {
index fea55c8256e561af5477356b3c5bc545b30d6960..0f8fccd7f19507737642e218e4a149c5f4a60e0b 100644 (file)
@@ -23,7 +23,7 @@ public:
        void generateDB();
        void addSequence(Sequence);
        vector<int> findClosestSequences(Sequence*, int);
-       map<int, float> findClosest(Sequence*, int); //template index -> searchscore
+       vector<int> findClosestMegaBlast(Sequence*, int);
 
 private:
        string dbFileName;
index a968a02fa61cb06d7bef2d92386db5d55cf020f3..4d590123ee319768f694d8b8f3fd213ad055290e 100644 (file)
--- a/chimera.h
+++ b/chimera.h
@@ -50,23 +50,11 @@ struct results {
        float divR;
 };
 /***********************************************************************/
-struct rank {
-               int num;
-               float score;
-               rank(int n, float s) : num(n), score(s) {}
-};
-/***********************************************************************/
-
 struct SeqDist {
        Sequence* seq;
        float dist;
 };
 //********************************************************************************************************************
-//sorts highest to lowest
-inline bool compareMembers(rank left, rank right){
-       return (left.score > right.score);      
-} 
-//********************************************************************************************************************
 //sorts lowest to highest
 inline bool compareRegionStart(results left, results right){
        return (left.nastRegionStart < right.nastRegionStart);  
index 57c3641d14f4f2664dd12c3e1a7f89d13722ab91..f45b9a87f6829b68c81c8b137758ec3f6a096022 100644 (file)
@@ -29,19 +29,23 @@ void ChimeraReAligner::reAlign(Sequence* query, vector<results> parents) {
                        sort(parents.begin(), parents.end(), compareRegionStart);
 
                        //make sure you don't cutoff beginning of query 
-                       if (parents[0].nastRegionStart > 0) {  newQuery += qAligned.substr(0, parents[0].nastRegionStart+1);  }
+                       if (parents[0].nastRegionStart > 0) {  newQuery += qAligned.substr(0, parents[0].nastRegionStart);  }
                        int longest = 0;
                        //take query and break apart into pieces using breakpoints given by results of parents
                        for (int i = 0; i < parents.size(); i++) {
-                               int length = parents[i].nastRegionEnd - parents[i].nastRegionStart+1;
+       cout << parents[i].parent << '\t' << parents[i].nastRegionStart <<  '\t' << parents[i].nastRegionEnd    << endl;        
+                               int length = parents[i].nastRegionEnd - parents[i].nastRegionStart;
                                string q = qAligned.substr(parents[i].nastRegionStart, length);
+       cout << "query = " << q << endl;
                                Sequence* queryFrag = new Sequence(query->getName(), q);
                                
                                queryParts.push_back(queryFrag);
                                
                                Sequence* parent = getSequence(parents[i].parent);
                                string p = parent->getAligned();
+               
                                p = p.substr(parents[i].nastRegionStart, length);
+       cout << "parent = " << p << endl;               
                                parent->setAligned(p);
                                
                                parentParts.push_back(parent);
@@ -62,8 +66,9 @@ void ChimeraReAligner::reAlign(Sequence* query, vector<results> parents) {
                                newQuery += queryParts[i]->getAligned();
                        }
                        
+                       
                        //make sure you don't cutoff end of query 
-                       if (parents[parents.size()-1].nastRegionEnd < qAligned.length()) {  newQuery += qAligned.substr(parents[parents.size()-1].nastRegionEnd-1);  }
+                       if (parents[parents.size()-1].nastRegionEnd < qAligned.length()) {  newQuery += qAligned.substr(parents[parents.size()-1].nastRegionEnd);  }
                        
                        //set query to new aligned string
                        query->setAligned(newQuery);
index b2c0a0561a0f6baf035f83fc4113d3a0f16ef643..2cbd5d8343f8b96614020d2be5f04250089b73ee 100644 (file)
@@ -64,9 +64,9 @@ int ChimeraSlayer::getChimeras(Sequence* query) {
                vector<results> Results = maligner->getOutput();
 
                //realign query to parents to improve slayers detection rate
-               //ChimeraReAligner realigner(templateSeqs, match, misMatch);
-               //realigner.reAlign(query, Results);
-                       
+               ChimeraReAligner realigner(templateSeqs, match, misMatch);
+               realigner.reAlign(query, Results);
+cout << query->getName() << '\n' << query->getAligned() << endl;
                        //if (chimeraFlag == "yes") {
                        
                //get sequence that were given from maligner results
index 79050777acd7d695754d7a382741538d35409b94..7fd29b3ae4d09ede0d5227f639e1ea6308f778a0 100644 (file)
@@ -48,7 +48,7 @@ public:
        virtual void generateDB() = 0; 
        virtual void addSequence(Sequence) = 0;  //add sequence to search engine
        virtual vector<int> findClosestSequences(Sequence*, int) = 0;  // returns indexes of n closest sequences to query
-       virtual map<int, float> findClosest(Sequence*, int){ return results; }  // returns of n closest sequences to query and their search scores
+       virtual vector<int> findClosestMegaBlast(Sequence*, int){return results;}
        virtual float getSearchScore();
        virtual int getLongestBase(); 
        virtual void readKmerDB(ifstream&){};
@@ -60,7 +60,7 @@ public:
 protected:
        int numSeqs, longest;
        float searchScore;
-       map<int, float> results;
+       vector<int> results;
 };
 /**************************************************************************************************/
 #endif
index de6004d7e387d0e241dfbc205695d359dbcf6dda..b393194bbc3a4c89fd2ae8ea3f7f7ba67f84b927 100644 (file)
@@ -32,28 +32,8 @@ string Maligner::getResults(Sequence* q, DeCalculator* decalc) {
                        refSeqs = getBlastSeqs(query, numWanted);
                }
                
-//ofstream out;
-//string name = toString(numi+1); 
-//cout << name << endl;
-//name = name.substr(name.find_first_of("|")+1);
-//cout << name << endl;
-//name = name.substr(name.find_first_of("|")+1);
-//cout << name << endl;
-//name = name.substr(0, name.find_last_of("|"));
-//cout << name << endl;
-//string filename = name + ".seqsformaligner";
-//openOutputFile(filename, out);       
-//for (int u = 0; u < refSeqs.size(); u++) {  refSeqs[u]->printSequence(out);  }
-//out.close();
-//filename = name + ".fasta";
-//openOutputFile(filename, out);       
-//query->printSequence(out);
-//out.close();
-
-//for (int i = 0; i < refSeqs.size(); i++)  { cout << refSeqs[i]->getName() << endl;  }        
-//cout << "before = " << refSeqs.size() << endl;
                refSeqs = minCoverageFilter(refSeqs);
-//cout << "after = " << refSeqs.size() << endl;                
+               
                if (refSeqs.size() < 2)  { 
                        for (int i = 0; i < refSeqs.size(); i++) {  delete refSeqs[i];  }
                        percentIdenticalQueryChimera = 0.0;
@@ -61,7 +41,7 @@ string Maligner::getResults(Sequence* q, DeCalculator* decalc) {
                }
                
                int chimeraPenalty = computeChimeraPenalty();
-//cout << chimeraPenalty << endl;              
+       
                //fills outputResults
                chimera = chimeraMaligner(chimeraPenalty, decalc);
                
@@ -500,8 +480,7 @@ float Maligner::computePercentID(string queryAlign, string chimera) {
 }
 //***************************************************************************************************************
 vector<Sequence*> Maligner::getBlastSeqs(Sequence* q, int num) {
-       try {
-cout << q->getName() << endl;  
+       try {   
                //generate blastdb
                Database* database = new BlastDB(-2.0, -1.0, matchScore, misMatchPenalty);
                for (int i = 0; i < db.size(); i++) {   database->addSequence(*db[i]);  }
@@ -509,55 +488,42 @@ cout << q->getName() << endl;
                database->setNumSeqs(db.size());
                
                //get parts of query
-               string queryAligned = q->getAligned();
-               string leftQuery = queryAligned.substr(0, (queryAligned.length() / 3)); //first 1/3 of the sequence
-               string rightQuery = queryAligned.substr(((queryAligned.length() / 3)*2)); //last 1/3 of the sequence
-               
+               string queryUnAligned = q->getUnaligned();
+               string leftQuery = queryUnAligned.substr(0, int(queryUnAligned.length() * 0.33)); //first 1/3 of the sequence
+               string rightQuery = queryUnAligned.substr(int(queryUnAligned.length() * 0.66)); //last 1/3 of the sequence
+
                Sequence* queryLeft = new Sequence(q->getName(), leftQuery);
                Sequence* queryRight = new Sequence(q->getName(), rightQuery);
                
-               map<int, float> tempIndexesRight = database->findClosest(queryRight, num);
-               map<int, float> tempIndexesLeft = database->findClosest(queryLeft, num);
-               
-               //merge results
-               vector<rank> mergedResults;
+               vector<int> tempIndexesRight = database->findClosestMegaBlast(queryRight, num+1);
+               vector<int> tempIndexesLeft = database->findClosestMegaBlast(queryLeft, num+1);
                
-               map<int, float>::iterator it;
-               map<int, float>::iterator it2;
+               //merge results         
+               map<int, int> seen;
+               map<int, int>::iterator it;
                
-               //add in right guys merging common finds
-               for (it = tempIndexesRight.begin(); it != tempIndexesRight.end(); it++) {
-                       it2 = tempIndexesLeft.find(it->first);
-                       
-                       if (it2 == tempIndexesLeft.end()) {  //result only present in right
-                               rank temp(it->first, it->second);
-                               mergedResults.push_back(temp); 
+               vector<int> mergedResults;
+               for (int i = 0; i < tempIndexesLeft.size(); i++) {
+                       //add left if you havent already
+                       it = seen.find(tempIndexesLeft[i]);
+                       if (it == seen.end()) {  
+                               mergedResults.push_back(tempIndexesLeft[i]);
+                               seen[tempIndexesLeft[i]] = tempIndexesLeft[i];
+                       }
 
-                       }else { //result present in both save best score
-                               float bestscore;
-                               if (it->second > it2->second)   {  bestscore = it->second;  }
-                               else                                                    {  bestscore = it2->second; }
-                               
-                               rank temp(it->first, bestscore);
-                               mergedResults.push_back(temp);
-                               
-                               tempIndexesLeft.erase(it2);
+                       //add right if you havent already
+                       it = seen.find(tempIndexesRight[i]);
+                       if (it == seen.end()) {  
+                               mergedResults.push_back(tempIndexesRight[i]);
+                               seen[tempIndexesRight[i]] = tempIndexesRight[i];
                        }
                }
                
-               //add in unique left guys
-               for (it = tempIndexesLeft.begin(); it != tempIndexesLeft.end(); it++) {
-                       rank temp(it->first, it->second);
-                       mergedResults.push_back(temp); 
-               }
-               
-               sort(mergedResults.begin(), mergedResults.end(), compareMembers);
                
                vector<Sequence*> refResults;
                for (int i = 0; i < numWanted; i++) {
-                       Sequence* temp = new Sequence(db[mergedResults[i].num]->getName(), db[mergedResults[i].num]->getAligned());
+                       Sequence* temp = new Sequence(db[mergedResults[i]]->getName(), db[mergedResults[i]]->getAligned());
                        refResults.push_back(temp);
-cout << db[mergedResults[i].num]->getName() << endl;
                }
                
                delete queryRight;