chimeracode

author westcott <westcott>

Tue, 16 Feb 2010 15:23:05 +0000 (15:23 +0000)

committer westcott <westcott>

Tue, 16 Feb 2010 15:23:05 +0000 (15:23 +0000)
author westcott <westcott>
Tue, 16 Feb 2010 15:23:05 +0000 (15:23 +0000)
committer westcott <westcott>
Tue, 16 Feb 2010 15:23:05 +0000 (15:23 +0000)
diff --git a/blastdb.cpp b/blastdb.cpp

index 50a0401b2cb8780765ba56a3ba2c025380c4e212..b139f3eb7a47bcc86f0920bdc0fc22bad368816a 100644 (file)
--- a/blastdb.cpp
+++ b/blastdb.cpp
@@ -50,7 +50,7 @@ vector<int> BlastDB::findClosestSequences(Sequence* seq, int n) {
                 //      wordsize used in megablast.  I'm sure we're sacrificing accuracy for speed, but anyother way would take way too
                 //      long.  With this setting, it seems comparable in speed to the suffix tree approach.
                 
-               string blastCommand = path + "blast/bin/blastall -p blastn -d " + dbFileName + " -b 1 -m 8 -W 28 -v " + toString(n);
+               string blastCommand = path + "blast/bin/blastall -p blastn -d " + dbFileName + " -m 8 -W 28 -v " + toString(n) + " -b " + toString(n);;
                 blastCommand += (" -i " + queryFileName + " -o " + blastFileName);
                 system(blastCommand.c_str());
                 
@@ -82,9 +82,9 @@ vector<int> BlastDB::findClosestSequences(Sequence* seq, int n) {
  }
  /**************************************************************************************************/
  //assumes you have added all the template sequences using the addSequence function and run generateDB.
-map<int, float> BlastDB::findClosest(Sequence* seq, int n) {
+vector<int> BlastDB::findClosestMegaBlast(Sequence* seq, int n) {
         try{
-               map<int, float> topMatches;
+               vector<int> topMatches;
                 
                 ofstream queryFile;
                 openOutputFile(queryFileName, queryFile);
@@ -96,7 +96,7 @@ map<int, float> BlastDB::findClosest(Sequence* seq, int n) {
                 //      wordsize used in megablast.  I'm sure we're sacrificing accuracy for speed, but anyother way would take way too
                 //      long.  With this setting, it seems comparable in speed to the suffix tree approach.
         
-               string blastCommand = path + "blast/bin/blastall -p blastn -d " + dbFileName + " -m 8 -W 28 -b " + toString(n) + " -v " + toString(n);
+               string blastCommand = path + "blast/bin/megablast -e 1e-10 -d " + dbFileName + " -m 8 -b " + toString(n) + " -v " + toString(n); //-W 28 -p blastn
                 blastCommand += (" -i " + queryFileName + " -o " + blastFileName);
                 system(blastCommand.c_str());
                 
@@ -106,22 +106,18 @@ map<int, float> BlastDB::findClosest(Sequence* seq, int n) {
                 string dummy;
                 int templateAccession;
                 gobble(m8FileHandle);
-//string name = seq->getName();
-//ofstream out;
-//openOutputFileAppend(name, out);     
+               
                 while(!m8FileHandle.eof()){
                         m8FileHandle >> dummy >> templateAccession >> searchScore;
-//out << dummy << '\t' <<  templateAccession   << '\t' << searchScore << endl;
+                       
                         //get rest of junk in line
-                       while (!m8FileHandle.eof())     {       char c = m8FileHandle.get(); 
-                       //out << c; 
-                       if (c == 10 || c == 13){        break;  }       } 
+                       while (!m8FileHandle.eof())     {       char c = m8FileHandle.get(); if (c == 10 || c == 13){   break;  }       } 
                         
                         gobble(m8FileHandle);
-                       topMatches[templateAccession] = searchScore;
+                       topMatches.push_back(templateAccession);
                 }
                 m8FileHandle.close();
-//out.close();         
+               
                 return topMatches;
         }
         catch(exception& e) {
diff --git a/blastdb.hpp b/blastdb.hpp

index fea55c8256e561af5477356b3c5bc545b30d6960..0f8fccd7f19507737642e218e4a149c5f4a60e0b 100644 (file)
--- a/blastdb.hpp
+++ b/blastdb.hpp
@@ -23,7 +23,7 @@ public:
         void generateDB();
         void addSequence(Sequence);
         vector<int> findClosestSequences(Sequence*, int);
-       map<int, float> findClosest(Sequence*, int); //template index -> searchscore
+       vector<int> findClosestMegaBlast(Sequence*, int);
  
  private:
         string dbFileName;
diff --git a/chimera.h b/chimera.h

index a968a02fa61cb06d7bef2d92386db5d55cf020f3..4d590123ee319768f694d8b8f3fd213ad055290e 100644 (file)
--- a/chimera.h
+++ b/chimera.h
@@ -50,23 +50,11 @@ struct results {
         float divR;
  };
  /***********************************************************************/
-struct rank {
-               int num;
-               float score;
-               rank(int n, float s) : num(n), score(s) {}
-};
-/***********************************************************************/
-
  struct SeqDist {
         Sequence* seq;
         float dist;
  };
  //********************************************************************************************************************
-//sorts highest to lowest
-inline bool compareMembers(rank left, rank right){
-       return (left.score > right.score);      
-} 
-//********************************************************************************************************************
  //sorts lowest to highest
  inline bool compareRegionStart(results left, results right){
         return (left.nastRegionStart < right.nastRegionStart);  
diff --git a/chimerarealigner.cpp b/chimerarealigner.cpp

index 57c3641d14f4f2664dd12c3e1a7f89d13722ab91..f45b9a87f6829b68c81c8b137758ec3f6a096022 100644 (file)
--- a/chimerarealigner.cpp
+++ b/chimerarealigner.cpp
@@ -29,19 +29,23 @@ void ChimeraReAligner::reAlign(Sequence* query, vector<results> parents) {
                         sort(parents.begin(), parents.end(), compareRegionStart);
  
                         //make sure you don't cutoff beginning of query 
-                       if (parents[0].nastRegionStart > 0) {  newQuery += qAligned.substr(0, parents[0].nastRegionStart+1);  }
+                       if (parents[0].nastRegionStart > 0) {  newQuery += qAligned.substr(0, parents[0].nastRegionStart);  }
                         int longest = 0;
                         //take query and break apart into pieces using breakpoints given by results of parents
                         for (int i = 0; i < parents.size(); i++) {
-                               int length = parents[i].nastRegionEnd - parents[i].nastRegionStart+1;
+       cout << parents[i].parent << '\t' << parents[i].nastRegionStart <<  '\t' << parents[i].nastRegionEnd    << endl;        
+                               int length = parents[i].nastRegionEnd - parents[i].nastRegionStart;
                                 string q = qAligned.substr(parents[i].nastRegionStart, length);
+       cout << "query = " << q << endl;
                                 Sequence* queryFrag = new Sequence(query->getName(), q);
                                 
                                 queryParts.push_back(queryFrag);
                                 
                                 Sequence* parent = getSequence(parents[i].parent);
                                 string p = parent->getAligned();
+               
                                 p = p.substr(parents[i].nastRegionStart, length);
+       cout << "parent = " << p << endl;               
                                 parent->setAligned(p);
                                 
                                 parentParts.push_back(parent);
@@ -62,8 +66,9 @@ void ChimeraReAligner::reAlign(Sequence* query, vector<results> parents) {
                                 newQuery += queryParts[i]->getAligned();
                         }
                         
+                       
                         //make sure you don't cutoff end of query 
-                       if (parents[parents.size()-1].nastRegionEnd < qAligned.length()) {  newQuery += qAligned.substr(parents[parents.size()-1].nastRegionEnd-1);  }
+                       if (parents[parents.size()-1].nastRegionEnd < qAligned.length()) {  newQuery += qAligned.substr(parents[parents.size()-1].nastRegionEnd);  }
                         
                         //set query to new aligned string
                         query->setAligned(newQuery);
diff --git a/chimeraslayer.cpp b/chimeraslayer.cpp

index b2c0a0561a0f6baf035f83fc4113d3a0f16ef643..2cbd5d8343f8b96614020d2be5f04250089b73ee 100644 (file)
--- a/chimeraslayer.cpp
+++ b/chimeraslayer.cpp
@@ -64,9 +64,9 @@ int ChimeraSlayer::getChimeras(Sequence* query) {
                 vector<results> Results = maligner->getOutput();
  
                 //realign query to parents to improve slayers detection rate
-               //ChimeraReAligner realigner(templateSeqs, match, misMatch);
-               //realigner.reAlign(query, Results);
-                       
+               ChimeraReAligner realigner(templateSeqs, match, misMatch);
+               realigner.reAlign(query, Results);
+cout << query->getName() << '\n' << query->getAligned() << endl;
                         //if (chimeraFlag == "yes") {
                         
                 //get sequence that were given from maligner results
diff --git a/database.hpp b/database.hpp

index 79050777acd7d695754d7a382741538d35409b94..7fd29b3ae4d09ede0d5227f639e1ea6308f778a0 100644 (file)
--- a/database.hpp
+++ b/database.hpp
@@ -48,7 +48,7 @@ public:
         virtual void generateDB() = 0; 
         virtual void addSequence(Sequence) = 0;  //add sequence to search engine
         virtual vector<int> findClosestSequences(Sequence*, int) = 0;  // returns indexes of n closest sequences to query
-       virtual map<int, float> findClosest(Sequence*, int){ return results; }  // returns of n closest sequences to query and their search scores
+       virtual vector<int> findClosestMegaBlast(Sequence*, int){return results;}
         virtual float getSearchScore();
         virtual int getLongestBase(); 
         virtual void readKmerDB(ifstream&){};
@@ -60,7 +60,7 @@ public:
  protected:
         int numSeqs, longest;
         float searchScore;
-       map<int, float> results;
+       vector<int> results;
  };
  /**************************************************************************************************/
  #endif
diff --git a/maligner.cpp b/maligner.cpp

index de6004d7e387d0e241dfbc205695d359dbcf6dda..b393194bbc3a4c89fd2ae8ea3f7f7ba67f84b927 100644 (file)
--- a/maligner.cpp
+++ b/maligner.cpp
@@ -32,28 +32,8 @@ string Maligner::getResults(Sequence* q, DeCalculator* decalc) {
                         refSeqs = getBlastSeqs(query, numWanted);
                 }
                 
-//ofstream out;
-//string name = toString(numi+1); 
-//cout << name << endl;
-//name = name.substr(name.find_first_of("|")+1);
-//cout << name << endl;
-//name = name.substr(name.find_first_of("|")+1);
-//cout << name << endl;
-//name = name.substr(0, name.find_last_of("|"));
-//cout << name << endl;
-//string filename = name + ".seqsformaligner";
-//openOutputFile(filename, out);       
-//for (int u = 0; u < refSeqs.size(); u++) {  refSeqs[u]->printSequence(out);  }
-//out.close();
-//filename = name + ".fasta";
-//openOutputFile(filename, out);       
-//query->printSequence(out);
-//out.close();
-
-//for (int i = 0; i < refSeqs.size(); i++)  { cout << refSeqs[i]->getName() << endl;  }        
-//cout << "before = " << refSeqs.size() << endl;
                 refSeqs = minCoverageFilter(refSeqs);
-//cout << "after = " << refSeqs.size() << endl;                
+               
                 if (refSeqs.size() < 2)  { 
                         for (int i = 0; i < refSeqs.size(); i++) {  delete refSeqs[i];  }
                         percentIdenticalQueryChimera = 0.0;
@@ -61,7 +41,7 @@ string Maligner::getResults(Sequence* q, DeCalculator* decalc) {
                 }
                 
                 int chimeraPenalty = computeChimeraPenalty();
-//cout << chimeraPenalty << endl;              
+       
                 //fills outputResults
                 chimera = chimeraMaligner(chimeraPenalty, decalc);
                 
@@ -500,8 +480,7 @@ float Maligner::computePercentID(string queryAlign, string chimera) {
  }
  //***************************************************************************************************************
  vector<Sequence*> Maligner::getBlastSeqs(Sequence* q, int num) {
-       try {
-cout << q->getName() << endl;  
+       try {   
                 //generate blastdb
                 Database* database = new BlastDB(-2.0, -1.0, matchScore, misMatchPenalty);
                 for (int i = 0; i < db.size(); i++) {   database->addSequence(*db[i]);  }
@@ -509,55 +488,42 @@ cout << q->getName() << endl;
                 database->setNumSeqs(db.size());
                 
                 //get parts of query
-               string queryAligned = q->getAligned();
-               string leftQuery = queryAligned.substr(0, (queryAligned.length() / 3)); //first 1/3 of the sequence
-               string rightQuery = queryAligned.substr(((queryAligned.length() / 3)*2)); //last 1/3 of the sequence
-               
+               string queryUnAligned = q->getUnaligned();
+               string leftQuery = queryUnAligned.substr(0, int(queryUnAligned.length() * 0.33)); //first 1/3 of the sequence
+               string rightQuery = queryUnAligned.substr(int(queryUnAligned.length() * 0.66)); //last 1/3 of the sequence
+
                 Sequence* queryLeft = new Sequence(q->getName(), leftQuery);
                 Sequence* queryRight = new Sequence(q->getName(), rightQuery);
                 
-               map<int, float> tempIndexesRight = database->findClosest(queryRight, num);
-               map<int, float> tempIndexesLeft = database->findClosest(queryLeft, num);
-               
-               //merge results
-               vector<rank> mergedResults;
+               vector<int> tempIndexesRight = database->findClosestMegaBlast(queryRight, num+1);
+               vector<int> tempIndexesLeft = database->findClosestMegaBlast(queryLeft, num+1);
                 
-               map<int, float>::iterator it;
-               map<int, float>::iterator it2;
+               //merge results         
+               map<int, int> seen;
+               map<int, int>::iterator it;
                 
-               //add in right guys merging common finds
-               for (it = tempIndexesRight.begin(); it != tempIndexesRight.end(); it++) {
-                       it2 = tempIndexesLeft.find(it->first);
-                       
-                       if (it2 == tempIndexesLeft.end()) {  //result only present in right
-                               rank temp(it->first, it->second);
-                               mergedResults.push_back(temp); 
+               vector<int> mergedResults;
+               for (int i = 0; i < tempIndexesLeft.size(); i++) {
+                       //add left if you havent already
+                       it = seen.find(tempIndexesLeft[i]);
+                       if (it == seen.end()) {  
+                               mergedResults.push_back(tempIndexesLeft[i]);
+                               seen[tempIndexesLeft[i]] = tempIndexesLeft[i];
+                       }
  
-                       }else { //result present in both save best score
-                               float bestscore;
-                               if (it->second > it2->second)   {  bestscore = it->second;  }
-                               else                                                    {  bestscore = it2->second; }
-                               
-                               rank temp(it->first, bestscore);
-                               mergedResults.push_back(temp);
-                               
-                               tempIndexesLeft.erase(it2);
+                       //add right if you havent already
+                       it = seen.find(tempIndexesRight[i]);
+                       if (it == seen.end()) {  
+                               mergedResults.push_back(tempIndexesRight[i]);
+                               seen[tempIndexesRight[i]] = tempIndexesRight[i];
                         }
                 }
                 
-               //add in unique left guys
-               for (it = tempIndexesLeft.begin(); it != tempIndexesLeft.end(); it++) {
-                       rank temp(it->first, it->second);
-                       mergedResults.push_back(temp); 
-               }
-               
-               sort(mergedResults.begin(), mergedResults.end(), compareMembers);
                 
                 vector<Sequence*> refResults;
                 for (int i = 0; i < numWanted; i++) {
-                       Sequence* temp = new Sequence(db[mergedResults[i].num]->getName(), db[mergedResults[i].num]->getAligned());
+                       Sequence* temp = new Sequence(db[mergedResults[i]]->getName(), db[mergedResults[i]]->getAligned());
                         refResults.push_back(temp);
-cout << db[mergedResults[i].num]->getName() << endl;
                 }
                 
                 delete queryRight;
author	westcott <westcott>
	Tue, 16 Feb 2010 15:23:05 +0000 (15:23 +0000)
committer	westcott <westcott>
	Tue, 16 Feb 2010 15:23:05 +0000 (15:23 +0000)
blastdb.cpp		patch \| blob \| history
blastdb.hpp		patch \| blob \| history
chimera.h		patch \| blob \| history
chimerarealigner.cpp		patch \| blob \| history
chimeraslayer.cpp		patch \| blob \| history
database.hpp		patch \| blob \| history
maligner.cpp		patch \| blob \| history