chimeras, fix to sabundvector and sharedsabundvector that caused getRabundVector...

author westcott <westcott>

Mon, 22 Feb 2010 17:22:22 +0000 (17:22 +0000)

committer westcott <westcott>

Mon, 22 Feb 2010 17:22:22 +0000 (17:22 +0000)
author westcott <westcott>
Mon, 22 Feb 2010 17:22:22 +0000 (17:22 +0000)
committer westcott <westcott>
Mon, 22 Feb 2010 17:22:22 +0000 (17:22 +0000)
diff --git a/averagelinkage.cpp b/averagelinkage.cpp

index db2c51edc5b182b52dcf540456baad7b5782c61e..7522d2a3083f9247744f93a5bc147480f0304a72 100644 (file)
--- a/averagelinkage.cpp
+++ b/averagelinkage.cpp
@@ -38,15 +38,8 @@ bool AverageLinkage::updateDistance(MatData& colCell, MatData& rowCell) {
                         saveCol = smallCol;
                 }
                 
-               float oldColCell = colCell->dist;
-               
                 colCell->dist = (colBin * colCell->dist + rowBin * rowCell->dist) / totalBin;
                 
-               //warn user if merge with value above cutoff produces a value below cutoff
-               if ((colCell->dist < cutoff) && ((oldColCell > cutoff) || (rowCell->dist > cutoff)) ) {
-                       mothurOut("Warning: merging " + toString(oldColCell) + " with " + toString(rowCell->dist) + ", new value = " + toString(colCell->dist) + ". Results will differ from those if cutoff was used in the read.dist command."); mothurOutEndLine();
-               }
-
                 return(true);
         }
         catch(exception& e) {
diff --git a/blastdb.cpp b/blastdb.cpp

index dff1074e92a3d405419712c72c3528333a004814..396a6b4b91c4496b80dfc3009cdda4a0f34b12e6 100644 (file)
--- a/blastdb.cpp
+++ b/blastdb.cpp
@@ -118,7 +118,7 @@ vector<int> BlastDB::findClosestMegaBlast(Sequence* seq, int n) {
                 system(blastCommand.c_str());
                 
                 ifstream m8FileHandle;
-               openInputFile(blastFileName, m8FileHandle);
+               openInputFile(blastFileName, m8FileHandle, "no error");
         
                 string dummy;
                 int templateAccession;
@@ -132,9 +132,10 @@ vector<int> BlastDB::findClosestMegaBlast(Sequence* seq, int n) {
                         
                         gobble(m8FileHandle);
                         topMatches.push_back(templateAccession);
+//cout << templateAccession << endl;
                 }
                 m8FileHandle.close();
-               
+//cout << "\n\n" ;             
                 return topMatches;
         }
         catch(exception& e) {
diff --git a/ccode.cpp b/ccode.cpp

index bfcd44d8239a30f8ad18db73f0c06268dc0e7474..326af7cec61a88eb6ab859f203098c0ded3a9cfb 100644 (file)
--- a/ccode.cpp
+++ b/ccode.cpp
@@ -48,13 +48,10 @@ void Ccode::print(ostream& out) {
                         out2 << it->first << '\t' << it->second << endl;
                 }
                 out2.close();
-               
-               out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
-               
                 out << querySeq->getName() << endl << endl << "Reference sequences used and distance to query:" << endl;
                         
                 for (int j = 0; j < closest.size(); j++) {
-                       out << setprecision(3) << closest[j].seq->getName() << '\t' << closest[j].dist << endl;
+                       out << closest[j].seq->getName() << '\t' << closest[j].dist << endl;
                 }
                 out << endl << endl;
                 
@@ -72,14 +69,12 @@ void Ccode::print(ostream& out) {
                         
                 out << endl << "Window\tStartPos\tEndPos" << endl;
                 it = trim.begin();
-                       
                 for (int k = 0; k < windows.size()-1; k++) {
                         out << k+1 << '\t' << spotMap[windows[k]-it->first] << '\t' << spotMap[windows[k]-it->first+windowSizes] << endl;
                 }
                         
                 out << windows.size() << '\t' << spotMap[windows[windows.size()-1]-it->first] << '\t' << spotMap[it->second-it->first-1] << endl;
                 out << endl;
-                       
                 out << "Window\tAvgQ\t(sdQ)\tAvgR\t(sdR)\tRatio\tAnova" << endl;
                 for (int k = 0; k < windows.size(); k++) {
                         float ds = averageQuery[k] / averageRef[k]; 
@@ -94,7 +89,7 @@ void Ccode::print(ostream& out) {
                 //float fs = varQuery[query] / varRef[query];   /* F-Snedecor, test for differences of variances */
                         
                 bool results = false;   
-                                               
+                                       
                 //confidence limit, t - Student, anova
                 out << "Window\tConfidenceLimit\tt-Student\tAnova" << endl;
                         
@@ -118,6 +113,10 @@ void Ccode::print(ostream& out) {
                 if (results) {
                         mothurOut(querySeq->getName() + " was found have at least one chimeric window."); mothurOutEndLine();
                 }
+
+               //free memory
+               for (int i = 0; i < closest.size(); i++) {  delete closest[i].seq;  }
+
                 
         }
         catch(exception& e) {
@@ -176,7 +175,7 @@ int Ccode::getChimeras(Sequence* query) {
                         for (int i = 0; i < closest.size(); i++) { temp.push_back(closest[i].seq);  }
                         temp.push_back(query);  
                         
-                       createFilter(temp);
+                       createFilter(temp, 0.5);
                 
                         for (int i = 0; i < temp.size(); i++) { runFilter(temp[i]);  }
                         
@@ -221,9 +220,6 @@ int Ccode::getChimeras(Sequence* query) {
                                         
                 determineChimeras();  //fills anova, isChimericConfidence, isChimericTStudent and isChimericANOVA. 
                 
-               //free memory
-               for (int i = 0; i < closest.size(); i++) {  delete closest[i].seq;  }
-               
                 return 0;
         }
         catch(exception& e) {
@@ -525,6 +521,10 @@ vector<SeqDist>  Ccode::findClosest(Sequence* q, int numWanted) {
                 }
                         
                 sort(topMatches.begin(), topMatches.end(), compareSeqDist);
+               
+               for (int i = numWanted; i < topMatches.size(); i++) {  delete topMatches[i].seq;  }
+               
+               topMatches.resize(numWanted);
                         
                 return topMatches;
  
diff --git a/chimera.cpp b/chimera.cpp

index bf16de4e34b795694d1a894ea1005f32353be110..9ad287b68d8457e6dffd6e101bbca007c64745eb 100644 (file)
--- a/chimera.cpp
+++ b/chimera.cpp
@@ -11,10 +11,10 @@
  
  //***************************************************************************************************************
  //this is a vertical soft filter
-void Chimera::createFilter(vector<Sequence*> seqs) {
+string Chimera::createFilter(vector<Sequence*> seqs, float t) {
         try {
                 filterString = "";
-               int threshold = int (0.5 * seqs.size());
+               int threshold = int (t * seqs.size());
  //cout << "threshhold = " << threshold << endl;
                 
                 vector<int> gaps;       gaps.resize(seqs[0]->getAligned().length(), 0);
@@ -53,6 +53,7 @@ void Chimera::createFilter(vector<Sequence*> seqs) {
  //cout << "filter = " << filterString << endl; 
  
                 mothurOut("Filter removed " + toString(numColRemoved) + " columns.");  mothurOutEndLine();
+               return filterString;
         }
         catch(exception& e) {
                 errorOut(e, "Chimera", "createFilter");
@@ -60,18 +61,25 @@ void Chimera::createFilter(vector<Sequence*> seqs) {
         }
  }
  //***************************************************************************************************************
-void Chimera::runFilter(Sequence* seq) {
+map<int, int> Chimera::runFilter(Sequence* seq) {
         try {
-               
+               map<int, int> maskMap;
                 string seqAligned = seq->getAligned();
                 string newAligned = "";
+               int count = 0;
                         
                 for (int j = 0; j < seqAligned.length(); j++) {
                         //if this spot is a gap
-                       if (filterString[j] == '1') { newAligned += seqAligned[j]; }
+                       if (filterString[j] == '1') { 
+                               newAligned += seqAligned[j]; 
+                               maskMap[count] = j;
+                               count++;
+                       }
                 }
                         
                 seq->setAligned(newAligned);
+               
+               return maskMap;
         }
         catch(exception& e) {
                 errorOut(e, "Chimera", "runFilter");
diff --git a/chimera.h b/chimera.h

index 41615733392988ceacec5531b908349157804ce5..da908ce07571a54148f561e28c16b94e8b09f68b 100644 (file)
--- a/chimera.h
+++ b/chimera.h
@@ -45,6 +45,7 @@ struct results {
         int nastRegionStart;
         int nastRegionEnd;
         string parent;
+       string parentAligned;
         float queryToParent;
         float queryToParentLocal;
         float divR;
@@ -53,6 +54,7 @@ struct results {
  struct SeqDist {
         Sequence* seq;
         float dist;
+       int index;
  };
  //********************************************************************************************************************
  //sorts lowest to highest
@@ -88,7 +90,7 @@ class Chimera {
         
                 Chimera(){};
                 Chimera(string);
-               Chimera(string, bool);
+               Chimera(string, bool, string);
                 Chimera(string, string);
                 virtual ~Chimera(){};
                 virtual void setFilter(bool f)                  {       filter = f;                     }
@@ -119,8 +121,8 @@ class Chimera {
                 virtual vector<Sequence*> readSeqs(string);
                 virtual vector< vector<float> > readQuantiles();
                 virtual void setMask(string);
-               virtual void runFilter(Sequence*);
-               virtual void createFilter(vector<Sequence*>);
+               virtual map<int, int> runFilter(Sequence*);
+               virtual string createFilter(vector<Sequence*>, float);
                 
                 virtual void printHeader(ostream&){};
                 virtual int getChimeras(Sequence*){ return 0; }
diff --git a/chimeraseqscommand.cpp b/chimeraseqscommand.cpp

index 13adb52bbbbb2e18477964f7fda344fd1dc1bcf2..2ec6ea9c85e2d8034aafa6527fd53b6bf28a052d 100644 (file)
--- a/chimeraseqscommand.cpp
+++ b/chimeraseqscommand.cpp
@@ -180,7 +180,7 @@ ChimeraSeqsCommand::ChimeraSeqsCommand(string option){
                         temp = validParameter.validFile(parameters, "realign", false);                  if (temp == "not found") { temp = "f"; }
                         realign = isTrue(temp); 
                         
-                       search = validParameter.validFile(parameters, "search", false);                 if (search == "not found") { temp = "distance"; }
+                       search = validParameter.validFile(parameters, "search", false);                 if (search == "not found") { search = "distance"; }
                         
                         temp = validParameter.validFile(parameters, "iters", false);    
                         if ((temp == "not found") && (method == "chimeraslayer")) { temp = "100"; }             
@@ -198,7 +198,7 @@ ChimeraSeqsCommand::ChimeraSeqsCommand(string option){
                         else if (temp == "not found") { temp = "20"; }
                         convert(temp, numwanted);
  
-                       
+                       if ((search != "distance") && (search != "blast") && (search != "kmer")) { mothurOut(search + " is not a valid search."); mothurOutEndLine(); abort = true;  }
                         
                         if (((method != "bellerophon")) && (templatefile == "")) { mothurOut("You must provide a template file with the pintail, ccode, chimeraslayer or chimeracheck methods."); mothurOutEndLine(); abort = true;  }
                         
@@ -239,7 +239,7 @@ void ChimeraSeqsCommand::help(){
                 mothurOut("The mincov parameter allows you to specify minimum coverage by closest matches found in template. Default is 70, meaning 70%. \n");
                 mothurOut("The minbs parameter allows you to specify minimum bootstrap support for calling a sequence chimeric. Default is 90, meaning 90%. \n");
                 mothurOut("The minsnp parameter allows you to specify percent of SNPs to sample on each side of breakpoint for computing bootstrap support (default: 10) \n");
-               mothurOut("The search parameter allows you to specify search method for finding the closest parent. Choices are distance and blast, default distance.  -used only by chimeraslayer. \n");
+               mothurOut("The search parameter allows you to specify search method for finding the closest parent. Choices are distance, blast, and kmer, default distance.  -used only by chimeraslayer. \n");
                 mothurOut("The realign parameter allows you to realign the query to the potential paretns. Choices are true or false, default false.  -used only by chimeraslayer. \n");
                 mothurOut("NOT ALL PARAMETERS ARE USED BY ALL METHODS. Please look below for method specifics.\n\n");
                 mothurOut("Details for each method: \n"); 
@@ -282,7 +282,7 @@ int ChimeraSeqsCommand::execute(){
                 else if (method == "pintail")                   {               chimera = new Pintail(fastafile, outputDir);                            }
                 else if (method == "ccode")                             {               chimera = new Ccode(fastafile, outputDir);                                      }
                 else if (method == "chimeracheck")              {               chimera = new ChimeraCheckRDP(fastafile, outputDir);            }
-               else if (method == "chimeraslayer")             {               chimera = new ChimeraSlayer(search, realign);                           }
+               else if (method == "chimeraslayer")             {               chimera = new ChimeraSlayer(search, realign, fastafile);        }
                 else { mothurOut("Not a valid method."); mothurOutEndLine(); return 0;          }
                 
                 //set user options
@@ -418,7 +418,8 @@ int ChimeraSeqsCommand::execute(){
                 //      else                             { fastafile = outputDir + getRootName(getSimpleName(fastafile)) + "filter.fasta"; }
                 
                 appendOutputFiles(tempHeader, outputFileName);
-               remove(tempHeader.c_str());
+               remove(outputFileName.c_str());
+               rename(tempHeader.c_str(), outputFileName.c_str());
  
                 for (int i = 0; i < templateSeqs.size(); i++)           {   delete templateSeqs[i];     }
                 
diff --git a/chimeraslayer.cpp b/chimeraslayer.cpp

index 8a0cbd067cc6cbfb8ced9fbb3a3cb89ea6fdc62b..769032856bb5ca0801b21b3457ea5fa36bc90a20 100644 (file)
--- a/chimeraslayer.cpp
+++ b/chimeraslayer.cpp
@@ -9,16 +9,113 @@
  
  #include "chimeraslayer.h"
  #include "chimerarealigner.h"
+#include "kmerdb.hpp"
  
  //***************************************************************************************************************
-ChimeraSlayer::ChimeraSlayer(string mode, bool r) : searchMethod(mode), realign(r) {   decalc = new DeCalculator();      }
+ChimeraSlayer::ChimeraSlayer(string mode, bool r, string f) : searchMethod(mode), realign(r), fastafile(f) {   
+       decalc = new DeCalculator();    
+}
  //***************************************************************************************************************
-ChimeraSlayer::~ChimeraSlayer() {      delete decalc;   }
+void ChimeraSlayer::doPrep() {
+       try {
+       
+               string  kmerDBNameLeft;
+               string  kmerDBNameRight;
+               
+               //generate the kmerdb to pass to maligner
+               if (searchMethod == "kmer") { 
+                       //leftside
+                       string leftTemplateFileName = "left." + templateFileName;
+                       databaseLeft = new KmerDB(leftTemplateFileName, kmerSize);                      
+                       kmerDBNameLeft = leftTemplateFileName.substr(0,leftTemplateFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
+                       ifstream kmerFileTestLeft(kmerDBNameLeft.c_str());
+                       
+                       if(!kmerFileTestLeft){  
+                       
+                               for (int i = 0; i < templateSeqs.size(); i++) {
+                                       string leftFrag = templateSeqs[i]->getUnaligned();
+                                       leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33));
+                                       
+                                       Sequence leftTemp(templateSeqs[i]->getName(), leftFrag);
+                                       databaseLeft->addSequence(leftTemp);    
+                               }
+                               databaseLeft->generateDB();
+                               
+                       }else { 
+                               databaseLeft->readKmerDB(kmerFileTestLeft);     
+                       }
+                       kmerFileTestLeft.close();
+                       
+                       databaseLeft->setNumSeqs(templateSeqs.size());
+                       
+                       //rightside
+                       string rightTemplateFileName = "right." + templateFileName;
+                       databaseRight = new KmerDB(rightTemplateFileName, kmerSize);                    
+                       kmerDBNameRight = rightTemplateFileName.substr(0,rightTemplateFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
+                       ifstream kmerFileTestRight(kmerDBNameRight.c_str());
+                       
+                       if(!kmerFileTestRight){ 
+                       
+                               for (int i = 0; i < templateSeqs.size(); i++) {
+                                       string rightFrag = templateSeqs[i]->getUnaligned();
+                                       rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66));
+                                       
+                                       Sequence rightTemp(templateSeqs[i]->getName(), rightFrag);
+                                       databaseRight->addSequence(rightTemp);  
+                               }
+                               databaseRight->generateDB();
+                               
+                       }else { 
+                               databaseRight->readKmerDB(kmerFileTestRight);   
+                       }
+                       kmerFileTestRight.close();
+                       
+                       databaseRight->setNumSeqs(templateSeqs.size());
+
+               }
+               
+               int start = time(NULL); 
+               //filter the sequences
+               //read in all query seqs
+               ifstream in; 
+               openInputFile(fastafile, in);
+               
+               vector<Sequence*> tempQuerySeqs;
+               while(!in.eof()){
+                       Sequence* s = new Sequence(in);
+                       gobble(in);
+                       
+                       if (s->getName() != "") { tempQuerySeqs.push_back(s); }
+               }
+               in.close();
+               
+               vector<Sequence*> temp = templateSeqs;
+               for (int i = 0; i < tempQuerySeqs.size(); i++) {  temp.push_back(tempQuerySeqs[i]);  }
+                               
+               createFilter(temp, 0.0); //just removed columns where all seqs have a gap
+                               
+               for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i];  }
+               
+               //run filter on template
+               for (int i = 0; i < templateSeqs.size(); i++) {  runFilter(templateSeqs[i]);  }
+               
+               mothurOutEndLine(); mothurOut("It took " + toString(time(NULL) - start) + " secs to filter.");  mothurOutEndLine();
+
+       }
+       catch(exception& e) {
+               errorOut(e, "ChimeraSlayer", "doprep");
+               exit(1);
+       }
+}
+//***************************************************************************************************************
+ChimeraSlayer::~ChimeraSlayer() {      delete decalc;  if (searchMethod == "kmer") {  delete databaseRight;  delete databaseLeft;  }    }
  //***************************************************************************************************************
  void ChimeraSlayer::printHeader(ostream& out) {
         mothurOutEndLine();
-       mothurOut("Only reporting sequence supported by 90% of bootstrapped results.");
+       mothurOut("Only reporting sequence supported by " + toString(minBS) + "% of bootstrapped results.");
         mothurOutEndLine();
+       
+       out << "Name\tLeftParent\tRightParent\tDivQLAQRB\tPerIDQLAQRB\tBootStrapA\tDivQLBQRA\tPerIDQLBQRA\tBootStrapB\tFlag\tLeftWindow\tRightWindow\n";
  }
  //***************************************************************************************************************
  void ChimeraSlayer::print(ostream& out) {
@@ -35,7 +132,7 @@ void ChimeraSlayer::print(ostream& out) {
                                         mothurOut(querySeq->getName() + "\tyes"); mothurOutEndLine();
                                 }
                         }
-                       out << querySeq->getName() << "\tyes" << endl;
+                       
                         printBlock(chimeraResults[0], out);
                         out << endl;
                 }else {  out << querySeq->getName() << "\tno" << endl;  }
@@ -50,114 +147,102 @@ void ChimeraSlayer::print(ostream& out) {
  int ChimeraSlayer::getChimeras(Sequence* query) {
         try {
                 chimeraFlags = "no";
-               querySeq = query;
                 
-               for (int i = 0; i < query->getAligned().length(); i++) {
-                       spotMap[i] = i;
-               }
+               //filter query
+               spotMap = runFilter(query);
+               
+               querySeq = query;
                 
                 //referenceSeqs, numWanted, matchScore, misMatchPenalty, divR, minSimilarity
-               maligner = new Maligner(templateSeqs, numWanted, match, misMatch, divR, minSim, minCov, searchMethod);
+               maligner = new Maligner(templateSeqs, numWanted, match, misMatch, divR, minSim, minCov, searchMethod, databaseLeft, databaseRight);
                 slayer = new Slayer(window, increment, minSim, divR, iters, minSNP);
                 
                 string chimeraFlag = maligner->getResults(query, decalc);
                 vector<results> Results = maligner->getOutput();
-
-               //realign query to parents to improve slayers detection rate???
+                               
+               //found in testing realigning only made things worse
                 if (realign) {
                         ChimeraReAligner realigner(templateSeqs, match, misMatch);
                         realigner.reAlign(query, Results);
                 }
  
-                       //if (chimeraFlag == "yes") {
-                       
-               //get sequence that were given from maligner results
-               vector<SeqDist> seqs;
-               map<string, float> removeDups;
-               map<string, float>::iterator itDup;
-               for (int j = 0; j < Results.size(); j++) {
-                       float dist = (Results[j].regionEnd - Results[j].regionStart + 1) * Results[j].queryToParentLocal;
-                       //only add if you are not a duplicate
-                       itDup = removeDups.find(Results[j].parent);
-                       if (itDup == removeDups.end()) { //this is not duplicate
-                               removeDups[Results[j].parent] = dist;
-                       }else if (dist > itDup->second) { //is this a stronger number for this parent
-                               removeDups[Results[j].parent] = dist;
-                       }
-               }
-               
-               for (itDup = removeDups.begin(); itDup != removeDups.end(); itDup++) {
-                       Sequence* seq = getSequence(itDup->first); //makes copy so you can filter and mask and not effect template
-                       
-                       SeqDist member;
-                       member.seq = seq;
-                       member.dist = itDup->second;
+               if (chimeraFlag == "yes") {
                         
-                       seqs.push_back(member);
-               }
-               
-               //limit number of parents to explore - default 3
-               if (Results.size() > parents) {
-                       //sort by distance
-                       sort(seqs.begin(), seqs.end(), compareSeqDist);
-                       //prioritize larger more similiar sequence fragments
-                       reverse(seqs.begin(), seqs.end());
+                       //get sequence that were given from maligner results
+                       vector<SeqDist> seqs;
+                       map<string, float> removeDups;
+                       map<string, float>::iterator itDup;
+                       map<string, string> parentNameSeq;
+                       map<string, string>::iterator itSeq;
+                       for (int j = 0; j < Results.size(); j++) {
+                               float dist = (Results[j].regionEnd - Results[j].regionStart + 1) * Results[j].queryToParentLocal;
+                               //only add if you are not a duplicate
+                               itDup = removeDups.find(Results[j].parent);
+                               if (itDup == removeDups.end()) { //this is not duplicate
+                                       removeDups[Results[j].parent] = dist;
+                                       parentNameSeq[Results[j].parent] = Results[j].parentAligned;
+                               }else if (dist > itDup->second) { //is this a stronger number for this parent
+                                       removeDups[Results[j].parent] = dist;
+                                       parentNameSeq[Results[j].parent] = Results[j].parentAligned;
+                               }
+                       }
                         
-                       for (int k = seqs.size()-1; k > (parents-1); k--)  {  
-                               delete seqs[k].seq;
-                               seqs.pop_back();        
+                       for (itDup = removeDups.begin(); itDup != removeDups.end(); itDup++) {
+                               //Sequence* seq = getSequence(itDup->first); //makes copy so you can filter and mask and not effect template
+                               itSeq = parentNameSeq.find(itDup->first);
+//cout << itDup->first << itSeq->second << endl;
+                               Sequence* seq = new Sequence(itDup->first, itSeq->second);
+                               
+                               SeqDist member;
+                               member.seq = seq;
+                               member.dist = itDup->second;
+                               
+                               seqs.push_back(member);
                         }
-               }
-               
-               //put seqs into vector to send to slayer
-               vector<Sequence*> seqsForSlayer;
-               for (int k = 0; k < seqs.size(); k++) {  seqsForSlayer.push_back(seqs[k].seq);  }
-               //cout << i+1 << "num parents = " << seqsForSlayer.size() << '\t' << chimeraFlag << endl;
-//ofstream out;
-//string name = querySeqs[i]->getName();
-//cout << name << endl;
-//name = name.substr(name.find_first_of("|")+1);
-//cout << name << endl;
-//name = name.substr(name.find_first_of("|")+1);
-//cout << name << endl;
-//name = name.substr(0, name.find_last_of("|"));
-//cout << name << endl;
-//string filename = toString(i+1) + ".seqsforslayer";
-//openOutputFile(filename, out);       
-//cout << querySeqs[i]->getName() << endl;
-//for (int u = 0; u < seqsForSlayer.size(); u++) { cout << seqsForSlayer[u]->getName() << '\t'; seqsForSlayer[u]->printSequence(out);  }
-//cout << endl;
-//out.close();
-//filename = toString(i+1) + ".fasta";
-//openOutputFile(filename, out);       
-//querySeqs[i]->printSequence(out);
-//out.close();
-
-
                         
-               //mask then send to slayer...
-               if (seqMask != "") {
-                       decalc->setMask(seqMask);
+                       //limit number of parents to explore - default 3
+                       if (Results.size() > parents) {
+                               //sort by distance
+                               sort(seqs.begin(), seqs.end(), compareSeqDist);
+                               //prioritize larger more similiar sequence fragments
+                               reverse(seqs.begin(), seqs.end());
+                               
+                               for (int k = seqs.size()-1; k > (parents-1); k--)  {  
+                                       delete seqs[k].seq;
+                                       seqs.pop_back();        
+                               }
+                       }
                         
-                       //mask querys
-                       decalc->runMask(query);
+                       //put seqs into vector to send to slayer
+                       vector<Sequence*> seqsForSlayer;
+                       for (int k = 0; k < seqs.size(); k++) {  seqsForSlayer.push_back(seqs[k].seq);  }
                         
-                       //mask parents
-                       for (int k = 0; k < seqsForSlayer.size(); k++) {
-                               decalc->runMask(seqsForSlayer[k]);
+                       //mask then send to slayer...
+                       if (seqMask != "") {
+                               decalc->setMask(seqMask);
+                               
+                               //mask querys
+                               decalc->runMask(query);
+                               
+                               //mask parents
+                               for (int k = 0; k < seqsForSlayer.size(); k++) {
+                                       decalc->runMask(seqsForSlayer[k]);
+                               }
+                               
+                               spotMap = decalc->getMaskMap();
                         }
                         
-                       spotMap = decalc->getMaskMap();
+                       //send to slayer
+                       chimeraFlags = slayer->getResults(query, seqsForSlayer);
+                       chimeraResults = slayer->getOutput();
+                       
+                       //free memory
+                       for (int k = 0; k < seqs.size(); k++) {  delete seqs[k].seq;   }
                 }
                 
-               //send to slayer
-               chimeraFlags = slayer->getResults(query, seqsForSlayer);
-               chimeraResults = slayer->getOutput();
-       
-               //free memory
-               for (int k = 0; k < seqs.size(); k++) {  delete seqs[k].seq;   }
-               //}
-                       
+               delete maligner;
+               delete slayer;
+               
                 return 0;
         }
         catch(exception& e) {
@@ -168,28 +253,31 @@ int ChimeraSlayer::getChimeras(Sequence* query) {
  //***************************************************************************************************************
  void ChimeraSlayer::printBlock(data_struct data, ostream& out){
         try {
+       //out << "Name\tParentA\tParentB\tDivQLAQRB\tPerIDQLAQRB\tBootStrapA\tDivQLBQRA\tPerIDQLBQRA\tBootStrapB\tFlag\tLeftWindow\tRightWindow\n";
+               
+               out << querySeq->getName() << '\t';
+               out << data.parentA.getName() << "\t" << data.parentB.getName()  << '\t';
+               //out << "Left Window: " << spotMap[data.winLStart] << " " << spotMap[data.winLEnd] << endl;
+               //out << "Right Window: " << spotMap[data.winRStart] << " " << spotMap[data.winREnd] << endl;
+               
+               out << data.divr_qla_qrb << '\t' << data.qla_qrb << '\t' << data.bsa << '\t';
+               out << data.divr_qlb_qra << '\t' << data.qlb_qra << '\t' << data.bsb << '\t';
                 
-               out << "parentA: " << data.parentA.getName() << "  parentB: " << data.parentB.getName()  << endl;
-               out << "Left Window: " << spotMap[data.winLStart] << " " << spotMap[data.winLEnd] << endl;
-               out << "Right Window: " << spotMap[data.winRStart] << " " << spotMap[data.winREnd] << endl;
+               out << "yes\t" << spotMap[data.winLStart] << "-" << spotMap[data.winLEnd] << '\t' << spotMap[data.winRStart] << "-" << spotMap[data.winREnd] << '\t';
                 
-               out << "Divergence of Query to Leftside ParentA and Rightside ParentB: " << data.divr_qla_qrb << '\t' << "Bootstrap: " << data.bsa << endl;
-               out << "Divergence of Query to Rightside ParentA and Leftside ParentB: " << data.divr_qlb_qra << '\t' << "Bootstrap: " << data.bsb << endl;
+               //out << "Similarity of parents: " << data.ab << endl;
+               //out << "Similarity of query to parentA: " << data.qa << endl;
+               //out << "Similarity of query to parentB: " << data.qb << endl;
                 
-               out << "Similarity of parents: " << data.ab << endl;
-               out << "Similarity of query to parentA: " << data.qa << endl;
-               out << "Similarity of query to parentB: " << data.qb << endl;
                 
-               out << "Percent_ID QLA_QRB: " << data.qla_qrb << endl;
-               out << "Percent_ID QLB_QRA: " << data.qlb_qra << endl;
                 //out << "Per_id(QL,A): " << data.qla << endl;
                 //out << "Per_id(QL,B): " << data.qlb << endl;
                 //out << "Per_id(QR,A): " << data.qra << endl;
                 //out << "Per_id(QR,B): " << data.qrb << endl;
  
                 
-               out << "DeltaL: " << (data.qla - data.qlb) << endl;
-               out << "DeltaR: " << (data.qra - data.qrb) << endl;
+               //out << "DeltaL: " << (data.qla - data.qlb) << endl;
+               //out << "DeltaR: " << (data.qra - data.qrb) << endl;
  
         }
         catch(exception& e) {
diff --git a/chimeraslayer.h b/chimeraslayer.h

index f8006c41382919b311e8fd8b590f31e9b8bfaad1..bf9cf7f8166884e1dda08f0de7a1989bec961c65 100644 (file)
--- a/chimeraslayer.h
+++ b/chimeraslayer.h
@@ -23,12 +23,13 @@
  class ChimeraSlayer : public Chimera {
         
         public:
-               ChimeraSlayer(string, bool);    
+               ChimeraSlayer(string, bool, string);    
                 ~ChimeraSlayer();
                 
                 int getChimeras(Sequence*);
                 void print(ostream&);
                 void printHeader(ostream&);
+               void doPrep();
                 
         private:
                 Sequence* querySeq;
@@ -36,13 +37,15 @@ class ChimeraSlayer : public Chimera {
                 Maligner* maligner;
                 Slayer* slayer;
                 map<int, int>  spotMap;
+               Database* databaseRight;
+               Database* databaseLeft;
                 
                 vector<data_struct>  chimeraResults;
-               string chimeraFlags, searchMethod;
-               string fastafile;
+               string chimeraFlags, searchMethod, fastafile;
                 bool realign;
         
                 void printBlock(data_struct, ostream&);
+               
  };
  
  /************************************************************************/
diff --git a/cluster.cpp b/cluster.cpp

index 00d8083275779399748d690b3a9fe84910d91104..931b44d70288c120cf151487f88580e07207cc68 100644 (file)
--- a/cluster.cpp
+++ b/cluster.cpp
@@ -216,6 +216,10 @@ void Cluster::update(){
                 for (int i=nColCells-1;i>=0;i--) {
                         if (found[i] == 0) {
                                 removeCell(colCells[i], -1, i);
+cout << "smallRow = " << smallRow+1 << " smallCol = " << smallCol+1 << endl;
+                               if (!((colCells[i]->row == smallRow) && (colCells[i]->column == smallCol))) {
+                                       mothurOut("Warning: merging " + toString(colCells[i]->row+1)  + " " + toString(colCells[i]->column+1) + " distance " + toString(colCells[i]->dist) + " value above cutoff. Results will differ from those if cutoff was used in the read.dist command."); mothurOutEndLine();
+                               }
                         }
                 }
         }
diff --git a/decalc.cpp b/decalc.cpp

index 61de7c832dfbc7a16f8984b6bff127ebc68ba63f..c5f0c2607120ba0455d9819e3a6b5cb064900e81 100644 (file)
--- a/decalc.cpp
+++ b/decalc.cpp
@@ -11,6 +11,7 @@
  #include "chimera.h"
  #include "dist.h"
  #include "eachgapdist.h"
+#include "ignoregaps.h"
  
  
  //***************************************************************************************************************
@@ -680,11 +681,13 @@ float DeCalculator::getCoef(vector<float> obs, vector<float> qav) {
  }
  //***************************************************************************************************************
  //gets closest matches to each end, since chimeras will most likely have different parents on each end
-vector<Sequence*> DeCalculator::findClosest(Sequence* querySeq, vector<Sequence*> db, int numWanted) {
+vector<Sequence*> DeCalculator::findClosest(Sequence* querySeq, vector<Sequence*> db, int& numWanted, vector<int>& indexes) {
         try {
+               indexes.clear();
                 
                 vector<Sequence*> seqsMatches;  
-               vector<SeqDist> dists;
+               vector<SeqDist> distsLeft;
+               vector<SeqDist> distsRight;
                 
                 Dist* distcalculator = new eachGapDist();
                 
@@ -696,12 +699,19 @@ vector<Sequence*> DeCalculator::findClosest(Sequence* querySeq, vector<Sequence*
                 string queryAligned = querySeq->getAligned();
                 
                 //left side
+               bool foundFirstBase = false;
                 int baseCount = 0;
                 int leftSpot = 0;
+               int firstBaseSpot = 0;
                 for (int i = 0; i < queryAligned.length(); i++) {
-                       leftQuery += queryAligned[i];
                         //if you are a base
-                       if ((queryAligned[i] != '.') && (queryAligned[i] != '-')) {             baseCount++;    }
+                       if (isalpha(queryAligned[i])) {         
+                               baseCount++; 
+                               if (!foundFirstBase) {   foundFirstBase = true;  firstBaseSpot = i;  }
+                       }
+                       
+                       //eliminate opening .'s
+                       if (foundFirstBase) {   leftQuery += queryAligned[i];  }
                         //if you have 1/3
                         if (baseCount >= numBases) {  leftSpot = i; break; } //first 1/3
                 }
@@ -711,21 +721,31 @@ vector<Sequence*> DeCalculator::findClosest(Sequence* querySeq, vector<Sequence*
                 int rightSpot = 0;
                 for (int i = leftSpot; i < queryAligned.length(); i++) {
                         //if you are a base
-                       if ((queryAligned[i] != '.') && (queryAligned[i] != '-')) {             baseCount++;    }
+                       if (isalpha(queryAligned[i])) {         baseCount++;    }
                         //if you have 1/3
                         if (baseCount >= numBases) { rightSpot = i;  break; } //last 1/3
                 }
-               rightQuery = queryAligned.substr(rightSpot); //sequence from pos spot to end
+               
+               //trim end
+               //find last position in query that is a non gap character
+               int lastBaseSpot = queryAligned.length()-1;
+               for (int j = queryAligned.length()-1; j >= 0; j--) {
+                       if (isalpha(queryAligned[j])) {
+                               lastBaseSpot = j;
+                               break;
+                       }
+               }
+               rightQuery = queryAligned.substr(rightSpot, (lastBaseSpot-rightSpot)); //sequence from pos spot to end
                 
                 Sequence queryLeft(querySeq->getName(), leftQuery);
                 Sequence queryRight(querySeq->getName(), rightQuery);
-//cout << querySeq->getName() << '\t' << leftSpot << '\t' << rightSpot << endl;
+//cout << querySeq->getName() << '\t' << leftSpot << '\t' << rightSpot << '\t' << firstBaseSpot << '\t' << lastBaseSpot << endl;
  //cout << queryUnAligned.length() << '\t' << queryLeft.getUnaligned().length() << '\t' << queryRight.getUnaligned().length() << endl;
                 for(int j = 0; j < db.size(); j++){
                         
                         string dbAligned = db[j]->getAligned();
-                       string leftDB = dbAligned.substr(0, leftSpot+1); //first 1/3 of the sequence
-                       string rightDB = dbAligned.substr(rightSpot); //last 1/3 of the sequence
+                       string leftDB = dbAligned.substr(firstBaseSpot, (leftSpot-firstBaseSpot+1)); //first 1/3 of the sequence
+                       string rightDB = dbAligned.substr(rightSpot, (lastBaseSpot-rightSpot)); //last 1/3 of the sequence
                         
                         Sequence dbLeft(db[j]->getName(), leftDB);
                         Sequence dbRight(db[j]->getName(), rightDB);
@@ -736,22 +756,78 @@ vector<Sequence*> DeCalculator::findClosest(Sequence* querySeq, vector<Sequence*
                         distcalculator->calcDist(queryRight, dbRight);
                         float distRight = distcalculator->getDist();
                         
-                       float dist = min(distLeft, distRight); //keep smallest distance
+                       SeqDist subjectLeft;
+                       subjectLeft.seq = db[j];
+                       subjectLeft.dist = distLeft;
+                       subjectLeft.index = j;
+                       
+                       distsLeft.push_back(subjectLeft);
                         
-                       SeqDist subject;
-                       subject.seq = db[j];
-                       subject.dist = dist;
+                       SeqDist subjectRight;
+                       subjectRight.seq = db[j];
+                       subjectRight.dist = distRight;
+                       subjectRight.index = j;
                         
-                       dists.push_back(subject);
+                       distsRight.push_back(subjectRight);
+
                 }
                 
                 delete distcalculator;
                 
-               sort(dists.begin(), dists.end(), compareSeqDist);
+               //sort by smallest distance
+               sort(distsRight.begin(), distsRight.end(), compareSeqDist);
+               sort(distsLeft.begin(), distsLeft.end(), compareSeqDist);
+               
+               //merge results         
+               map<string, string> seen;
+               map<string, string>::iterator it;
                 
+               vector<SeqDist> dists;
+               float lastRight = distsRight[0].dist;
+               float lastLeft = distsLeft[0].dist;
+               int lasti = 0;
+               for (int i = 0; i < distsLeft.size(); i++) {
+                       //add left if you havent already
+                       it = seen.find(distsLeft[i].seq->getName());
+                       if (it == seen.end()) {  
+                               dists.push_back(distsLeft[i]);
+                               seen[distsLeft[i].seq->getName()] = distsLeft[i].seq->getName();
+                               lastLeft =  distsLeft[i].dist;
+                       }
+
+                       //add right if you havent already
+                       it = seen.find(distsRight[i].seq->getName());
+                       if (it == seen.end()) {  
+                               dists.push_back(distsRight[i]);
+                               seen[distsRight[i].seq->getName()] = distsRight[i].seq->getName();
+                               lastRight =  distsRight[i].dist;
+                       }
+                       
+                       if (dists.size() > numWanted) { lasti = i; break; } //you have enough results
+               }
+               
+               //add in dups
+               lasti++;
+               int i = lasti;
+               while (i < distsLeft.size()) {  
+                       if (distsLeft[i].dist == lastLeft) {  dists.push_back(distsLeft[i]);  numWanted++; }
+                       else { break; }
+                       i++;
+               }
+               
+               i = lasti;
+               while (i < distsRight.size()) {  
+                       if (distsRight[i].dist == lastRight) {  dists.push_back(distsRight[i]);  numWanted++; }
+                       else { break; }
+                       i++;
+               }
+
+//cout << numWanted << endl;
                 for (int i = 0; i < numWanted; i++) {
+//cout << dists[i].seq->getName() << '\t' << dists[i].dist << endl;
                         Sequence* temp = new Sequence(dists[i].seq->getName(), dists[i].seq->getAligned()); //have to make a copy so you can trim and filter without stepping on eachother.
                         seqsMatches.push_back(temp);
+                       indexes.push_back(dists[i].index);
                 }
                 
                 return seqsMatches;
@@ -761,6 +837,38 @@ vector<Sequence*> DeCalculator::findClosest(Sequence* querySeq, vector<Sequence*
                 exit(1);
         }
  }
+//***************************************************************************************************************
+Sequence* DeCalculator::findClosest(Sequence* querySeq, vector<Sequence*> db) {
+       try {
+               
+               Sequence* seqsMatch;  
+               
+               Dist* distcalculator = new eachGapDist();
+               int index = 0;
+               int smallest = 1000000;
+               
+               for(int j = 0; j < db.size(); j++){
+                       
+                       distcalculator->calcDist(*querySeq, *db[j]);
+                       float dist = distcalculator->getDist();
+                       
+                       if (dist < smallest) { 
+                               smallest = dist;
+                               index = j;
+                       }
+               }
+               
+               delete distcalculator;
+               
+               seqsMatch = new Sequence(db[index]->getName(), db[index]->getAligned()); //have to make a copy so you can trim and filter without stepping on eachother.
+                       
+               return seqsMatch;
+       }
+       catch(exception& e) {
+               errorOut(e, "DeCalculator", "findClosest");
+               exit(1);
+       }
+}
  /***************************************************************************************************************/
  map<int, int> DeCalculator::trimSeqs(Sequence* query, vector<Sequence*> topMatches) {
         try {
@@ -838,10 +946,10 @@ map<int, int> DeCalculator::trimSeqs(Sequence* query, vector<Sequence*> topMatch
  
                 //check to make sure that is not whole seq
                 if ((rearPos - frontPos - 1) <= 0) {  mothurOut("Error, when I trim your sequences, the entire sequence is trimmed."); mothurOutEndLine(); exit(1);  }
-//cout << "front = " << frontPos << " rear = " << rearPos << endl;             
+//cout << query->getName() << " front = " << frontPos << " rear = " << rearPos << endl;                
                 //trim query
                 string newAligned = query->getAligned();
-               newAligned = newAligned.substr(frontPos, (rearPos-frontPos));
+               newAligned = newAligned.substr(frontPos, (rearPos-frontPos+1));
                 query->setAligned(newAligned);
                 
                 //trim topMatches
diff --git a/decalc.h b/decalc.h

index 011993652618dc38294ade4638ad0657238b9377..48a72bb9cc10f6387694bee21d5265c6f12e043d 100644 (file)
--- a/decalc.h
+++ b/decalc.h
@@ -39,7 +39,8 @@ class DeCalculator {
                 DeCalculator() {};
                 ~DeCalculator() {};
                 
-               vector<Sequence*> findClosest(Sequence*, vector<Sequence*>, int);  //takes querySeq, a reference db and numWanted - returns indexes to closest seqs in db
+               vector<Sequence*> findClosest(Sequence*, vector<Sequence*>, int&, vector<int>&);  //takes querySeq, a reference db, numWanted and indexes 
+               Sequence* findClosest(Sequence*, vector<Sequence*>);
                 set<int> getPos() {  return h;  }
                 void setMask(string); 
                 void setAlignmentLength(int l) {  alignLength = l;  }
diff --git a/maligner.cpp b/maligner.cpp

index b393194bbc3a4c89fd2ae8ea3f7f7ba67f84b927..91b0fa78da7510fc2f2313847f0cc1c5347c8223 100644 (file)
--- a/maligner.cpp
+++ b/maligner.cpp
@@ -8,12 +8,12 @@
   */
  
  #include "maligner.h"
-#include "database.hpp"
+#include "kmerdb.hpp"
  #include "blastdb.hpp"
  
  /***********************************************************************/
-Maligner::Maligner(vector<Sequence*> temp, int num, int match, int misMatch, float div, int ms, int minCov, string mode) :
-               db(temp), numWanted(num), matchScore(match), misMatchPenalty(misMatch), minDivR(div), minSimilarity(ms), minCoverage(minCov), searchMethod(mode) {}
+Maligner::Maligner(vector<Sequence*> temp, int num, int match, int misMatch, float div, int ms, int minCov, string mode, Database* dataLeft, Database* dataRight) :
+               db(temp), numWanted(num), matchScore(match), misMatchPenalty(misMatch), minDivR(div), minSimilarity(ms), minCoverage(minCov), searchMethod(mode), databaseLeft(dataLeft), databaseRight(dataRight) {}
  /***********************************************************************/
  string Maligner::getResults(Sequence* q, DeCalculator* decalc) {
         try {
@@ -25,15 +25,17 @@ string Maligner::getResults(Sequence* q, DeCalculator* decalc) {
                 
                 string chimera;
                 
-               if (searchMethod != "blast") {
+               if (searchMethod == "distance") {
                         //find closest seqs to query in template - returns copies of seqs so trim does not destroy - remember to deallocate
-                       refSeqs = decalc->findClosest(query, db, numWanted);
-               }else{
-                       refSeqs = getBlastSeqs(query, numWanted);
-               }
+                       refSeqs = decalc->findClosest(query, db, numWanted, indexes);
+               }else if (searchMethod == "blast")  {
+                       refSeqs = getBlastSeqs(query, numWanted); //fills indexes
+               }else if (searchMethod == "kmer") {
+                       refSeqs = getKmerSeqs(query, numWanted); //fills indexes
+               }else { mothurOut("not valid search."); exit(1);  } //should never get here
                 
                 refSeqs = minCoverageFilter(refSeqs);
-               
+
                 if (refSeqs.size() < 2)  { 
                         for (int i = 0; i < refSeqs.size(); i++) {  delete refSeqs[i];  }
                         percentIdenticalQueryChimera = 0.0;
@@ -41,13 +43,14 @@ string Maligner::getResults(Sequence* q, DeCalculator* decalc) {
                 }
                 
                 int chimeraPenalty = computeChimeraPenalty();
-       
+
                 //fills outputResults
                 chimera = chimeraMaligner(chimeraPenalty, decalc);
-               
+       
                                 
                 //free memory
                 delete query;
+
                 for (int i = 0; i < refSeqs.size(); i++) {  delete refSeqs[i];  }
                 
                 return chimera;
@@ -65,31 +68,31 @@ string Maligner::chimeraMaligner(int chimeraPenalty, DeCalculator* decalc) {
                 
                 //trims seqs to first non gap char in all seqs and last non gap char in all seqs
                 spotMap = decalc->trimSeqs(query, refSeqs);
-               
+
                 vector<Sequence*> temp = refSeqs;
                 temp.push_back(query);
                 
                 verticalFilter(temp);
+//for (int i = 0; i < temp.size(); i++) { cout << temp[i]->getName() << '\n' << temp[i]->getAligned() << endl; } return "no";
  
                 vector< vector<score_struct> > matrix = buildScoreMatrix(query->getAligned().length(), refSeqs.size()); //builds and initializes
                 
                 fillScoreMatrix(matrix, refSeqs, chimeraPenalty);
-               
+       
                 vector<score_struct> path = extractHighestPath(matrix);
                 
                 vector<trace_struct> trace = mapTraceRegionsToAlignment(path, refSeqs);
-               
+       
                 if (trace.size() > 1) {         chimera = "yes";        }
                 else { chimera = "no";  }
                 
                 int traceStart = path[0].col;
-               int traceEnd = path[path.size()-1].col;
-               
+               int traceEnd = path[path.size()-1].col; 
                 string queryInRange = query->getAligned();
                 queryInRange = queryInRange.substr(traceStart, (traceEnd-traceStart+1));
         
                 string chimeraSeq = constructChimericSeq(trace, refSeqs);
-               
+       
                 percentIdenticalQueryChimera = computePercentID(queryInRange, chimeraSeq);
                 
                 //save output results
@@ -101,6 +104,7 @@ string Maligner::chimeraMaligner(int chimeraPenalty, DeCalculator* decalc) {
                         results temp;
                         
                         temp.parent = refSeqs[seqIndex]->getName();
+                       temp.parentAligned = db[indexes[seqIndex]]->getAligned();
                         temp.nastRegionStart = spotMap[regionStart];
                         temp.nastRegionEnd = spotMap[regionEnd];
                         temp.regionStart = regionStart;
@@ -481,6 +485,8 @@ float Maligner::computePercentID(string queryAlign, string chimera) {
  //***************************************************************************************************************
  vector<Sequence*> Maligner::getBlastSeqs(Sequence* q, int num) {
         try {   
+               indexes.clear();
+               vector<Sequence*> refResults;
                 //generate blastdb
                 Database* database = new BlastDB(-2.0, -1.0, matchScore, misMatchPenalty);
                 for (int i = 0; i < db.size(); i++) {   database->addSequence(*db[i]);  }
@@ -498,6 +504,82 @@ vector<Sequence*> Maligner::getBlastSeqs(Sequence* q, int num) {
                 vector<int> tempIndexesRight = database->findClosestMegaBlast(queryRight, num+1);
                 vector<int> tempIndexesLeft = database->findClosestMegaBlast(queryLeft, num+1);
                 
+               //if ((tempIndexesRight.size() != (num+1)) || (tempIndexesLeft.size() != (num+1)))  {  mothurOut("megablast returned " + toString(tempIndexesRight.size()) + " results for the right end, and " + toString(tempIndexesLeft.size()) + " for the left end. Needed " + toString(num+1) + ". Unable to porcess sequence " + q->getName()); mothurOutEndLine(); return refResults; }
+               
+               vector<int> smaller;
+               vector<int> larger;
+               
+               if (tempIndexesRight.size() < tempIndexesLeft.size()) { smaller = tempIndexesRight;  larger = tempIndexesLeft; }
+               else { smaller = tempIndexesLeft; larger = tempIndexesRight; } 
+               
+               //merge results         
+               map<int, int> seen;
+               map<int, int>::iterator it;
+               
+               vector<int> mergedResults;
+               for (int i = 0; i < smaller.size(); i++) {
+                       //add left if you havent already
+                       it = seen.find(smaller[i]);
+                       if (it == seen.end()) {  
+                               mergedResults.push_back(smaller[i]);
+                               seen[smaller[i]] = smaller[i];
+                       }
+
+                       //add right if you havent already
+                       it = seen.find(larger[i]);
+                       if (it == seen.end()) {  
+                               mergedResults.push_back(larger[i]);
+                               seen[larger[i]] = larger[i];
+                       }
+               }
+               
+               for (int i = smaller.size(); i < larger.size(); i++) {
+                       it = seen.find(larger[i]);
+                       if (it == seen.end()) {  
+                               mergedResults.push_back(larger[i]);
+                               seen[larger[i]] = larger[i];
+                       }
+               }
+               
+               if (mergedResults.size() < numWanted) { numWanted = mergedResults.size(); }
+//cout << q->getName() <<  endl;               
+               for (int i = 0; i < numWanted; i++) {
+//cout << db[mergedResults[i]]->getName() << endl;     
+                       if (db[mergedResults[i]]->getName() != q->getName()) { 
+                               Sequence* temp = new Sequence(db[mergedResults[i]]->getName(), db[mergedResults[i]]->getAligned());
+                               refResults.push_back(temp);
+                               indexes.push_back(mergedResults[i]);
+                       }
+//cout << mergedResults[i] << endl;
+               }
+//cout << endl;                
+               delete queryRight;
+               delete queryLeft;
+               delete database;
+               
+               return refResults;
+       }
+       catch(exception& e) {
+               errorOut(e, "Maligner", "getBlastSeqs");
+               exit(1);
+       }
+}
+//***************************************************************************************************************
+vector<Sequence*> Maligner::getKmerSeqs(Sequence* q, int num) {
+       try {   
+               indexes.clear();
+               
+               //get parts of query
+               string queryUnAligned = q->getUnaligned();
+               string leftQuery = queryUnAligned.substr(0, int(queryUnAligned.length() * 0.33)); //first 1/3 of the sequence
+               string rightQuery = queryUnAligned.substr(int(queryUnAligned.length() * 0.66)); //last 1/3 of the sequence
+
+               Sequence* queryLeft = new Sequence(q->getName(), leftQuery);
+               Sequence* queryRight = new Sequence(q->getName(), rightQuery);
+               
+               vector<int> tempIndexesLeft = databaseLeft->findClosestSequences(queryLeft, numWanted);
+               vector<int> tempIndexesRight = databaseRight->findClosestSequences(queryRight, numWanted);
+               
                 //merge results         
                 map<int, int> seen;
                 map<int, int>::iterator it;
@@ -519,16 +601,17 @@ vector<Sequence*> Maligner::getBlastSeqs(Sequence* q, int num) {
                         }
                 }
                 
-               
+//cout << q->getName() << endl;                
                 vector<Sequence*> refResults;
                 for (int i = 0; i < numWanted; i++) {
+//cout << db[mergedResults[i]]->getName() << endl;     
                         Sequence* temp = new Sequence(db[mergedResults[i]]->getName(), db[mergedResults[i]]->getAligned());
                         refResults.push_back(temp);
+                       indexes.push_back(mergedResults[i]);
                 }
-               
+//cout << endl;                
                 delete queryRight;
                 delete queryLeft;
-               delete database;
                 
                 return refResults;
         }
@@ -537,6 +620,5 @@ vector<Sequence*> Maligner::getBlastSeqs(Sequence* q, int num) {
                 exit(1);
         }
  }
-
  //***************************************************************************************************************
  
diff --git a/maligner.h b/maligner.h

index abe18a3cb1ad6c6b65d41077b8dd1fb56d802f96..13aa917afb5b39ac238dda69b6032bb509e1345e 100644 (file)
--- a/maligner.h
+++ b/maligner.h
@@ -11,6 +11,7 @@
   
  #include "decalc.h"
  #include "chimera.h"
+#include "database.hpp"
  
  /***********************************************************************/
  //This class was modeled after the chimeraMaligner written by the Broad Institute
@@ -19,7 +20,7 @@ class Maligner {
  
         public:
                 
-               Maligner(vector<Sequence*>, int, int, int, float, int, int, string);
+               Maligner(vector<Sequence*>, int, int, int, float, int, int, string, Database*, Database*);
                 ~Maligner() {};
                 
                 string getResults(Sequence*, DeCalculator*);
@@ -35,7 +36,10 @@ class Maligner {
                 string searchMethod;
                 float minDivR, percentIdenticalQueryChimera;
                 vector<results> outputResults;
+               vector<int> indexes;  //stores index into template seqs of the refSeqs, so we can return the whole sequence rather than the trimmed and filtered one
                 map<int, int> spotMap;
+               Database* databaseLeft;
+               Database* databaseRight;
                 
                 vector<Sequence*> minCoverageFilter(vector<Sequence*>);  //removes top matches that do not have minimum coverage with query.
                 int computeChimeraPenalty();
@@ -49,6 +53,7 @@ class Maligner {
                 float computePercentID(string, string);
                 string chimeraMaligner(int, DeCalculator*);
                 vector<Sequence*> getBlastSeqs(Sequence*, int);
+               vector<Sequence*> getKmerSeqs(Sequence*, int);
                 
  };
  
diff --git a/mothur.cpp b/mothur.cpp

index 8203c5716f02122ed77feef18f31428c23148552..7acf3894e150f24a8eaaa5bc72f0e752bab4ecc6 100644 (file)
--- a/mothur.cpp
+++ b/mothur.cpp
@@ -93,12 +93,21 @@ int main(int argc, char *argv[]){
                         input = argv[1];
  
                         if (input[0] == '#') {
+                               mothurOutJustToLog("Script Mode");
+                               mothurOutEndLine(); mothurOutEndLine();
+
                                 mothur = new ScriptEngine(argv[0], argv[1]);
                         }else{
+                               mothurOutJustToLog("Batch Mode");
+                               mothurOutEndLine(); mothurOutEndLine();
+                               
                                 mothur = new BatchEngine(argv[0], argv[1]);
                         }
                 }
                 else{
+                       mothurOutJustToLog("Interactive Mode");
+                       mothurOutEndLine(); mothurOutEndLine();
+                       
                         mothur = new InteractEngine(argv[0]);   
                 }
                 
diff --git a/pintail.cpp b/pintail.cpp

index 74e1ce47a80e60ec4c27e685f53e6dbfedfb6da9..46da9137a18c072bfc1033025fd2551ea92c728a 100644 (file)
--- a/pintail.cpp
+++ b/pintail.cpp
@@ -39,7 +39,8 @@ Pintail::~Pintail() {
  //***************************************************************************************************************
  void Pintail::doPrep() {
         try {
-       
+               
+               mergedFilterString = "";
                 windowSizesTemplate.resize(templateSeqs.size(), window);
                 quantiles.resize(100);  //one for every percent mismatch
                 quantilesMembers.resize(100);  //one for every percent mismatch
@@ -72,14 +73,15 @@ void Pintail::doPrep() {
                         mothurOut("Done."); mothurOutEndLine();
                 }else                           {   probabilityProfile = readFreq();    mothurOut("Done.");               }
                 mothurOutEndLine();
-       
-               //quantiles are used to determine whether the de values found indicate a chimera
-               //if you have to calculate them, its time intensive because you are finding the de and deviation values for each 
-               //combination of sequences in the template
-               if (quanfile != "") {  quantiles = readQuantiles();  }
-               else {
+               
+               //make P into Q
+               for (int i = 0; i < probabilityProfile.size(); i++)  {  probabilityProfile[i] = 1 - probabilityProfile[i];  }  //cout << i << '\t' << probabilityProfile[i] << endl;
+               
+               bool reRead = false;
+               //create filter if needed for later
+               if (filter) {
+                       reRead = true;
                         
-                       //if user has not provided the quantiles and wants the mask we need to mask, but then delete and reread or we will mess up the best match process in getChimeras
                         if (seqMask != "") {
                                 //mask templates
                                 for (int i = 0; i < templateSeqs.size(); i++) {
@@ -87,11 +89,44 @@ void Pintail::doPrep() {
                                 }
                         }
                         
-                       if (filter) {
-                               createFilter(templateSeqs);
-                               for (int i = 0; i < templateSeqs.size(); i++) {  runFilter(templateSeqs[i]);  }
+                       //read in all query seqs
+                       ifstream in; 
+                       openInputFile(fastafile, in);
+                       
+                       vector<Sequence*> tempQuerySeqs;
+                       while(!in.eof()){
+                               Sequence* s = new Sequence(in);
+                               gobble(in);
+                               
+                               if (s->getName() != "") { tempQuerySeqs.push_back(s); }
+                       }
+                       in.close();
+                       
+                       vector<Sequence*> temp;
+                       //merge query seqs and template seqs
+                       temp = templateSeqs;
+                       for (int i = 0; i < tempQuerySeqs.size(); i++) {  temp.push_back(tempQuerySeqs[i]);  }
+                       
+                       mergedFilterString = createFilter(temp, 0.5);
+                       
+                       //reread template seqs
+                       for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i];  }
+               }
+               
+               
+               //quantiles are used to determine whether the de values found indicate a chimera
+               //if you have to calculate them, its time intensive because you are finding the de and deviation values for each 
+               //combination of sequences in the template
+               if (quanfile != "") {  
+                       quantiles = readQuantiles(); 
+               }else {
+                       if ((!filter) && (seqMask != "")) { //if you didn't filter but you want to mask. if you filtered then you did mask first above.
+                               reRead = true;
+                               //mask templates
+                               for (int i = 0; i < templateSeqs.size(); i++) {
+                                       decalc->runMask(templateSeqs[i]);
+                               }
                         }
-
                         
                         mothurOut("Calculating quantiles for your template.  This can take a while...  I will output the quantiles to a .quan file that you can input them using the quantiles parameter next time you run this command.  Providing the .quan file will dramatically improve speed.    "); cout.flush();
                         if (processors == 1) { 
@@ -155,15 +190,15 @@ void Pintail::doPrep() {
                         }
  
                         mothurOut("Done."); mothurOutEndLine();
-                       
-                       //if mask reread template
-                       if ((seqMask != "") || (filter)) {
-                               for (int i = 0; i < templateSeqs.size(); i++)           {  delete templateSeqs[i];              }
-                               templateSeqs = readSeqs(templateFileName);
-                       }
-                       
                 }
                 
+               if (reRead) {
+                       for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i];  }
+                       templateSeqs.clear();
+                       templateSeqs = readSeqs(templateFileName);
+               }
+
+               
                 //free memory
                 for (int i = 0; i < templateLines.size(); i++) { delete templateLines[i];  }
                 
@@ -217,24 +252,22 @@ int Pintail::getChimeras(Sequence* query) {
                 querySeq = query;
                 trimmed.clear();
                 windowSizes = window;
-                               
-               //find pairs
-               bestfit = findPairs(query);
                                                         
-               //make P into Q
-               for (int i = 0; i < probabilityProfile.size(); i++)  {  probabilityProfile[i] = 1 - probabilityProfile[i];  }  //cout << i << '\t' << probabilityProfile[i] << endl;
-       
-               //mask sequences if the user wants to 
+               //find pairs has to be done before a mask
+               bestfit = findPairs(query);
+               
+               //if they mask  
                 if (seqMask != "") {
                         decalc->runMask(query);
                         decalc->runMask(bestfit);
                 }
-               
-               if (filter) {
+
+               if (filter) { //must be done after a mask
                         runFilter(query);
                         runFilter(bestfit);
                 }
                 
+                               
                 //trim seq
                 decalc->trimSeqs(query, bestfit, trimmed);  
                 
@@ -319,9 +352,7 @@ Sequence* Pintail::findPairs(Sequence* q) {
                 
                 Sequence* seqsMatches;  
                 
-               vector<Sequence*> copy = decalc->findClosest(q, templateSeqs, 1);
-               seqsMatches = copy[0];
-               
+               seqsMatches = decalc->findClosest(q, templateSeqs);
                 return seqsMatches;
         
         }
diff --git a/pintail.h b/pintail.h

index ed52c3aa0d010c70bd99bb045ac3715a5d8cb94e..efb8ba6aa31afcd9c12bb3a85cfa21d2cd8310aa 100644 (file)
--- a/pintail.h
+++ b/pintail.h
@@ -65,6 +65,7 @@ class Pintail : public Chimera {
                 vector< vector<float> > quantiles;  //quantiles[0] is the vector of deviations with ceiling score of 1, quantiles[1] is the vector of deviations with ceiling score of 2...
                 vector< vector<quanMember> > quantilesMembers;  //quantiles[0] is the vector of deviations with ceiling score of 1, quantiles[1] is the vector of deviations with ceiling score of 2...
                 set<int>  h;
+               string mergedFilterString;
                 
                 
                 vector<float> readFreq();
diff --git a/removeseqscommand.cpp b/removeseqscommand.cpp

index a8f03e06b29c505d219a8a4f5770999586e182ae..19a4bc032abbcd1a5a7c9f53cdeac1b1be2c5715 100644 (file)
--- a/removeseqscommand.cpp
+++ b/removeseqscommand.cpp
@@ -22,7 +22,7 @@ RemoveSeqsCommand::RemoveSeqsCommand(string option){
                 
                 else {
                         //valid paramters for this command
-                       string Array[] =  {"fasta","name", "group", "alignreport", "accnos", "list","outputdir","inputdir" };
+                       string Array[] =  {"fasta","name", "group", "alignreport", "accnos", "list","outputdir","inputdir", "dups" };
                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
                         
                         OptionParser parser(option);
@@ -119,10 +119,17 @@ RemoveSeqsCommand::RemoveSeqsCommand(string option){
                         if (listfile == "not open") { abort = true; }
                         else if (listfile == "not found") {  listfile = "";  }
                         
+                       string usedDups = "true";
+                       string temp = validParameter.validFile(parameters, "dups", false);      if (temp == "not found") { temp = "false"; usedDups = ""; }
+                       dups = isTrue(temp);
+                       
                         if ((fastafile == "") && (namefile == "") && (groupfile == "") && (alignfile == "") && (listfile == ""))  { mothurOut("You must provide one of the following: fasta, name, group, alignreport or list."); mothurOutEndLine(); abort = true; }
                         
                         int okay = 2;
                         if (outputDir != "") { okay++; }
+                       if (usedDups != "") { okay++;  }
+                       
+                       if ((usedDups != "") && (namefile == "")) {  mothurOut("You may only use dups with the name option."); mothurOutEndLine();  abort = true; }
                         
                         if (parameters.size() > okay) { mothurOut("You may only enter one of the following: fasta, name, group, alignreport, or list."); mothurOutEndLine(); abort = true;  }
                 }
@@ -139,7 +146,8 @@ void RemoveSeqsCommand::help(){
         try {
                 mothurOut("The remove.seqs command reads an .accnos file and one of the following file types: fasta, name, group, list or alignreport file.\n");
                 mothurOut("It outputs a file containing the sequences NOT in the .accnos file.\n");
-               mothurOut("The remove.seqs command parameters are accnos, fasta, name, group, list and alignreport.  You must provide accnos and one of the other parameters.\n");
+               mothurOut("The remove.seqs command parameters are accnos, fasta, name, group, list, alignreport and dups.  You must provide accnos and one of the file parameters.\n");
+               mothurOut("The dups parameter allows you to remove the entire line from a name file if you remove any name from the line. default=false. If dups=true, then remove.seqs outputs a new .accnos file containing all the sequences removed. \n");
                 mothurOut("The remove.seqs command should be in the following format: remove.seqs(accnos=yourAccnos, fasta=yourFasta).\n");
                 mothurOut("Example remove.seqs(accnos=amazon.accnos, fasta=amazon.fasta).\n");
                 mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n\n");
@@ -288,7 +296,12 @@ void RemoveSeqsCommand::readName(){
         try {
                 if (outputDir == "") {  outputDir += hasPath(namefile);  }
                 string outputFileName = getRootName(namefile) + "pick" + getExtension(namefile);
+               string outputFileName2 = getRootName(namefile) + "dups.accnos";
  
+               ofstream out2;
+               if (dups) {      openOutputFile(outputFileName2, out2); }
+               bool wroteDups = false;
+               
                 ofstream out;
                 openOutputFile(outputFileName, out);
  
@@ -322,38 +335,47 @@ void RemoveSeqsCommand::readName(){
                                 }
                         }
                         
-                       //if the name in the first column is in the set then print it and any other names in second column also in set
-                       if (names.count(firstCol) == 0) {
-                       
-                               wroteSomething = true;
-                               
-                               out << firstCol << '\t';
-                               
-                               //you know you have at least one valid second since first column is valid
-                               for (int i = 0; i < validSecond.size()-1; i++) {  out << validSecond[i] << ',';  }
-                               out << validSecond[validSecond.size()-1] << endl;
-                       
-                       //make first name in set you come to first column and then add the remaining names to second column
+                       if ((dups) && (validSecond.size() != parsedNames.size())) { 
+                               wroteDups = true;
+                               for (int i = 0; i < parsedNames.size(); i++) {  out2 << parsedNames[i] << endl;  }
                         }else {
-                               
-                               //you want part of this row
-                               if (validSecond.size() != 0) {
-                               
+                                       //if the name in the first column is in the set then print it and any other names in second column also in set
+                               if (names.count(firstCol) == 0) {
+                                       
                                         wroteSomething = true;
                                         
-                                       out << validSecond[0] << '\t';
-                               
+                                       out << firstCol << '\t';
+                                       
                                         //you know you have at least one valid second since first column is valid
                                         for (int i = 0; i < validSecond.size()-1; i++) {  out << validSecond[i] << ',';  }
                                         out << validSecond[validSecond.size()-1] << endl;
+                                       
+                                       //make first name in set you come to first column and then add the remaining names to second column
+                               }else {
+                                       
+                                       //you want part of this row
+                                       if (validSecond.size() != 0) {
+                                               
+                                               wroteSomething = true;
+                                               
+                                               out << validSecond[0] << '\t';
+                                               
+                                               //you know you have at least one valid second since first column is valid
+                                               for (int i = 0; i < validSecond.size()-1; i++) {  out << validSecond[i] << ',';  }
+                                               out << validSecond[validSecond.size()-1] << endl;
+                                       }
                                 }
                         }
-                       
                         gobble(in);
                 }
                 in.close();
                 out.close();
                 
+               if (dups) { out2.close();  }
+               if (wroteDups == false) {
+                       remove(outputFileName2.c_str()); 
+               }
+               
                 if (wroteSomething == false) {
                         mothurOut("Your file contains only sequences from the .accnos file."); mothurOutEndLine();
                         remove(outputFileName.c_str()); 
diff --git a/removeseqscommand.h b/removeseqscommand.h

index 1dcb0fceab5ed3800d2b604d2ba32ff632cfec8c..647cb765f0e75781862d0fa74305d55573c85872 100644 (file)
--- a/removeseqscommand.h
+++ b/removeseqscommand.h
@@ -24,7 +24,7 @@ class RemoveSeqsCommand : public Command {
         private:
                 set<string> names;
                 string accnosfile, fastafile, namefile, groupfile, alignfile, listfile, outputDir;
-               bool abort;
+               bool abort, dups;
                 
                 void readFasta();
                 void readName();
diff --git a/sabundvector.cpp b/sabundvector.cpp

index b70c642643cfd746e63c236e328feb8e37d74a44..2bb8d69a02614a9ad386062f05f9eda7673bbfa2 100644 (file)
--- a/sabundvector.cpp
+++ b/sabundvector.cpp
@@ -60,8 +60,10 @@ SAbundVector::SAbundVector(ifstream& f): DataVector(), maxRank(0), numBins(0), n
         
                 for(int i=1;i<=hold;i++){
                         f >> inputData;
+
                         set(i, inputData);
                 }
+
         }
         catch(exception& e) {
                 errorOut(e, "SAbundVector", "SAbundVector");
@@ -192,13 +194,13 @@ RAbundVector SAbundVector::getRAbundVector(){
         try {
                 RAbundVector rav;
         
-               for(int i=1;i<=data.size();i++){                
+               for(int i=1;i < data.size();i++){
                         for(int j=0;j<data[i];j++){
                                 rav.push_back(i);
                         }
                 }
                 sort(rav.rbegin(), rav.rend());
-       
+
                 rav.setLabel(label);
                 return rav;
         }
diff --git a/sharedsabundvector.cpp b/sharedsabundvector.cpp

index 9b89921ee0ff028dee0b75eb0453816015cdf094..a7dfd6683a09c3daa8cd67d2e5b16cd5880e979c 100644 (file)
--- a/sharedsabundvector.cpp
+++ b/sharedsabundvector.cpp
@@ -143,7 +143,7 @@ RAbundVector SharedSAbundVector::getRAbundVector(){
         try {
                 RAbundVector rav;
         
-               for(int i=1;i<=data.size();i++){                
+               for(int i=1;i<data.size();i++){         
                         for(int j=0;j<data[i].abundance;j++){
                                 rav.push_back(i);
                         }
@@ -191,7 +191,7 @@ SharedRAbundVector SharedSAbundVector::getSharedRAbundVector(){
         try {
                 SharedRAbundVector rav;
                 
-               for(int i=1;i<=data.size();i++){                
+               for(int i=1;i<data.size();i++){         
                         for(int j=0;j<data[i].abundance;j++){
                                 rav.push_back(i, data[i].group);
                         }
author	westcott <westcott>
	Mon, 22 Feb 2010 17:22:22 +0000 (17:22 +0000)
committer	westcott <westcott>
	Mon, 22 Feb 2010 17:22:22 +0000 (17:22 +0000)
averagelinkage.cpp		patch \| blob \| history
blastdb.cpp		patch \| blob \| history
ccode.cpp		patch \| blob \| history
chimera.cpp		patch \| blob \| history
chimera.h		patch \| blob \| history
chimeraseqscommand.cpp		patch \| blob \| history
chimeraslayer.cpp		patch \| blob \| history
chimeraslayer.h		patch \| blob \| history
cluster.cpp		patch \| blob \| history
decalc.cpp		patch \| blob \| history
decalc.h		patch \| blob \| history
maligner.cpp		patch \| blob \| history
maligner.h		patch \| blob \| history
mothur.cpp		patch \| blob \| history
pintail.cpp		patch \| blob \| history
pintail.h		patch \| blob \| history
removeseqscommand.cpp		patch \| blob \| history
removeseqscommand.h		patch \| blob \| history
sabundvector.cpp		patch \| blob \| history
sharedsabundvector.cpp		patch \| blob \| history