git.donarmstrong.com Git - mothur.git/blobdiff - chimeraslayer.cpp
created mothurOut class to handle logfiles
[mothur.git] / chimeraslayer.cpp
index 293ee807d5b875728716fddde17cd6c032f318e1..f8651079c8da037e6aeb119d494a350ae410597a 100644 (file)
  */
 
 #include "chimeraslayer.h"
+#include "chimerarealigner.h"
+#include "kmerdb.hpp"
 
 //***************************************************************************************************************
-ChimeraSlayer::ChimeraSlayer(string filename, string temp) {  fastafile = filename;  templateFile = temp;  }
+ChimeraSlayer::ChimeraSlayer(string mode, bool r, string f) : searchMethod(mode), realign(r), fastafile(f) {   
+       decalc = new DeCalculator();    
+}
 //***************************************************************************************************************
-
-ChimeraSlayer::~ChimeraSlayer() {
+void ChimeraSlayer::doPrep() {
        try {
-               for (int i = 0; i < querySeqs.size(); i++)                      {  delete querySeqs[i];                 }
-               for (int i = 0; i < templateSeqs.size(); i++)           {  delete templateSeqs[i];              }
+       
+               string  kmerDBNameLeft;
+               string  kmerDBNameRight;
+               
+               //generate the kmerdb to pass to maligner
+               if (searchMethod == "kmer") { 
+                       //leftside
+                       string leftTemplateFileName = "left." + templateFileName;
+                       databaseLeft = new KmerDB(leftTemplateFileName, kmerSize);                      
+                       kmerDBNameLeft = leftTemplateFileName.substr(0,leftTemplateFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
+                       ifstream kmerFileTestLeft(kmerDBNameLeft.c_str());
+                       
+                       if(!kmerFileTestLeft){  
+                       
+                               for (int i = 0; i < templateSeqs.size(); i++) {
+                                       string leftFrag = templateSeqs[i]->getUnaligned();
+                                       leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33));
+                                       
+                                       Sequence leftTemp(templateSeqs[i]->getName(), leftFrag);
+                                       databaseLeft->addSequence(leftTemp);    
+                               }
+                               databaseLeft->generateDB();
+                               
+                       }else { 
+                               databaseLeft->readKmerDB(kmerFileTestLeft);     
+                       }
+                       kmerFileTestLeft.close();
+                       
+                       databaseLeft->setNumSeqs(templateSeqs.size());
+                       
+                       //rightside
+                       string rightTemplateFileName = "right." + templateFileName;
+                       databaseRight = new KmerDB(rightTemplateFileName, kmerSize);                    
+                       kmerDBNameRight = rightTemplateFileName.substr(0,rightTemplateFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
+                       ifstream kmerFileTestRight(kmerDBNameRight.c_str());
+                       
+                       if(!kmerFileTestRight){ 
+                       
+                               for (int i = 0; i < templateSeqs.size(); i++) {
+                                       string rightFrag = templateSeqs[i]->getUnaligned();
+                                       rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66));
+                                       
+                                       Sequence rightTemp(templateSeqs[i]->getName(), rightFrag);
+                                       databaseRight->addSequence(rightTemp);  
+                               }
+                               databaseRight->generateDB();
+                               
+                       }else { 
+                               databaseRight->readKmerDB(kmerFileTestRight);   
+                       }
+                       kmerFileTestRight.close();
+                       
+                       databaseRight->setNumSeqs(templateSeqs.size());
+
+               }
+               
+               int start = time(NULL); 
+               //filter the sequences
+               //read in all query seqs
+               ifstream in; 
+               openInputFile(fastafile, in);
+               
+               vector<Sequence*> tempQuerySeqs;
+               while(!in.eof()){
+                       Sequence* s = new Sequence(in);
+                       gobble(in);
+                       
+                       if (s->getName() != "") { tempQuerySeqs.push_back(s); }
+               }
+               in.close();
+               
+               vector<Sequence*> temp = templateSeqs;
+               for (int i = 0; i < tempQuerySeqs.size(); i++) {  temp.push_back(tempQuerySeqs[i]);  }
+                               
+               createFilter(temp, 0.0); //just removed columns where all seqs have a gap
+                               
+               for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i];  }
+               
+               //run filter on template
+               for (int i = 0; i < templateSeqs.size(); i++) {  runFilter(templateSeqs[i]);  }
+               
+               m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to filter.");    m->mothurOutEndLine();
+
        }
        catch(exception& e) {
-               errorOut(e, "ChimeraSlayer", "~ChimeraSlayer");
+               m->errorOut(e, "ChimeraSlayer", "doprep");
                exit(1);
        }
-}      
+}
 //***************************************************************************************************************
-void ChimeraSlayer::print(ostream& out) {
+ChimeraSlayer::~ChimeraSlayer() {      delete decalc;  if (searchMethod == "kmer") {  delete databaseRight;  delete databaseLeft;  }    }
+//***************************************************************************************************************
+void ChimeraSlayer::printHeader(ostream& out) {
+       m->mothurOutEndLine();
+       m->mothurOut("Only reporting sequence supported by " + toString(minBS) + "% of bootstrapped results.");
+       m->mothurOutEndLine();
+       
+       out << "Name\tLeftParent\tRightParent\tDivQLAQRB\tPerIDQLAQRB\tBootStrapA\tDivQLBQRA\tPerIDQLBQRA\tBootStrapB\tFlag\tLeftWindow\tRightWindow\n";
+}
+//***************************************************************************************************************
+void ChimeraSlayer::print(ostream& out, ostream& outAcc) {
        try {
+               if (chimeraFlags == "yes") {
+                       string chimeraFlag = "no";
+                       if(  (chimeraResults[0].bsa >= minBS && chimeraResults[0].divr_qla_qrb >= divR)
+                          ||
+                          (chimeraResults[0].bsb >= minBS && chimeraResults[0].divr_qlb_qra >= divR) ) { chimeraFlag = "yes"; }
+                       
+                       
+                       if (chimeraFlag == "yes") {     
+                               if ((chimeraResults[0].bsa >= minBS) || (chimeraResults[0].bsb >= minBS)) {
+                                       m->mothurOut(querySeq->getName() + "\tyes"); m->mothurOutEndLine();
+                                       outAcc << querySeq->getName() << endl;
+                               }
+                       }
+                       
+                       printBlock(chimeraResults[0], out);
+                       out << endl;
+               }else {  out << querySeq->getName() << "\tno" << endl;  }
                
-               mothurOutEndLine();
-               
-                               
        }
        catch(exception& e) {
-               errorOut(e, "ChimeraSlayer", "print");
+               m->errorOut(e, "ChimeraSlayer", "print");
                exit(1);
        }
 }
-
 //***************************************************************************************************************
-void ChimeraSlayer::getChimeras() {
+int ChimeraSlayer::getChimeras(Sequence* query) {
        try {
+               chimeraFlags = "no";
                
-               //read in query sequences and subject sequences
-               mothurOut("Reading sequences and template file... "); cout.flush();
-               querySeqs = readSeqs(fastafile);
-               templateSeqs = readSeqs(templateFile);
-               mothurOut("Done."); mothurOutEndLine();
-               
-               int numSeqs = querySeqs.size();
+               //filter query
+               spotMap = runFilter(query);
                
-               //break up file if needed
-               int linesPerProcess = numSeqs / processors ;
-               
-               #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
-                       //find breakup of sequences for all times we will Parallelize
-                       if (processors == 1) {   lines.push_back(new linePair(0, numSeqs));  }
-                       else {
-                               //fill line pairs
-                               for (int i = 0; i < (processors-1); i++) {                      
-                                       lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess)));
-                               }
-                               //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end
-                               int i = processors - 1;
-                               lines.push_back(new linePair((i*linesPerProcess), numSeqs));
-                       }
-               #else
-                       lines.push_back(new linePair(0, numSeqs));
-               #endif
-               
-               if (seqMask != "") {    decalc = new DeCalculator();    } //to use below
+               querySeq = query;
                
                //referenceSeqs, numWanted, matchScore, misMatchPenalty, divR, minSimilarity
-               maligner = new Maligner(templateSeqs, numWanted, match, misMatch, 1.01, minSim);
-               slayer = new Slayer(window, increment, minSim, divR);
-               
-               for (int i = 0; i < querySeqs.size(); i++) {
+               maligner = new Maligner(templateSeqs, numWanted, match, misMatch, divR, minSim, minCov, searchMethod, databaseLeft, databaseRight);
+               slayer = new Slayer(window, increment, minSim, divR, iters, minSNP);
                
-                       string chimeraFlag = maligner->getResults(querySeqs[i]);
-                       float percentIdentical = maligner->getPercentID();
-                       vector<results> Results = maligner->getOutput();
-                       
-                       cout << querySeqs[4]->getName() << '\t' << chimeraFlag << '\t' << percentIdentical << endl;
+               string chimeraFlag = maligner->getResults(query, decalc);
+               vector<results> Results = maligner->getOutput();
+                               
+               //testing showed that realigning only made the results worse, so it runs only when requested
+               if (realign) {
+                       ChimeraReAligner realigner(templateSeqs, match, misMatch);
+                       realigner.reAlign(query, Results);
+               }
+
+               if (chimeraFlag == "yes") {
                        
+                       //collect the parent sequences reported in the maligner results
+                       vector<SeqDist> seqs;
+                       map<string, float> removeDups;
+                       map<string, float>::iterator itDup;
+                       map<string, string> parentNameSeq;
+                       map<string, string>::iterator itSeq;
                        for (int j = 0; j < Results.size(); j++) {
-                               cout << "regionStart = " << Results[j].regionStart << "\tRegionEnd = " << Results[j].regionEnd << "\tName = " << Results[j].parent << "\tPerQP = " << Results[j].queryToParent << "\tLocalPerQP = " << Results[j].queryToParentLocal << "\tdivR = " << Results[j].divR << endl;
-                               
+                               float dist = (Results[j].regionEnd - Results[j].regionStart + 1) * Results[j].queryToParentLocal;
+                               //only add if you are not a duplicate
+                               itDup = removeDups.find(Results[j].parent);
+                               if (itDup == removeDups.end()) { //this is not duplicate
+                                       removeDups[Results[j].parent] = dist;
+                                       parentNameSeq[Results[j].parent] = Results[j].parentAligned;
+                               }else if (dist > itDup->second) { //is this a stronger number for this parent
+                                       removeDups[Results[j].parent] = dist;
+                                       parentNameSeq[Results[j].parent] = Results[j].parentAligned;
+                               }
                        }
                        
-                       if (chimeraFlag == "yes") {
+                       for (itDup = removeDups.begin(); itDup != removeDups.end(); itDup++) {
+                               //Sequence* seq = getSequence(itDup->first); //makes copy so you can filter and mask and not affect template
+                               itSeq = parentNameSeq.find(itDup->first);
+//cout << itDup->first << itSeq->second << endl;
+                               Sequence* seq = new Sequence(itDup->first, itSeq->second);
+                               
+                               SeqDist member;
+                               member.seq = seq;
+                               member.dist = itDup->second;
+                               
+                               seqs.push_back(member);
+                       }
                        
-                               //get sequence that were given from maligner results
-                               vector<SeqDist> seqs;
-                               for (int j = 0; j < Results.size(); j++) {
-                                       Sequence* seq = getSequence(Results[j].parent); //makes copy so you can filter and mask and not effect template
-                                       
-                                       //seq = NULL if error occurred in getSequence
-                                       if (seq == NULL) {  break;      }
-                                       else {  
-                                               SeqDist member;
-                                               member.seq = seq;
-                                               member.dist = (Results[j].regionEnd - Results[j].regionStart + 1) * Results[j].queryToParentLocal;
-                                               seqs.push_back(member); 
-                                       }
+                       //limit number of parents to explore - default 3
+                       if (Results.size() > parents) {
+                               //sort by distance
+                               sort(seqs.begin(), seqs.end(), compareSeqDist);
+                               //prioritize larger, more similar sequence fragments
+                               reverse(seqs.begin(), seqs.end());
+                               
+                               for (int k = seqs.size()-1; k > (parents-1); k--)  {  
+                                       delete seqs[k].seq;
+                                       seqs.pop_back();        
                                }
+                       }
                        
-                               //limit number of parents to explore - default 5
-                               if (Results.size() > parents) {
-                                       //sort by distance
-                                       sort(seqs.begin(), seqs.end(), compareSeqDist);
-                                       //prioritize larger more similiar sequence fragments
-                                       reverse(seqs.begin(), seqs.end());
-                                       
-                                       for (int k = seqs.size()-1; k > (parents-1); k--)  {  
-                                               delete seqs[k].seq;
-                                               seqs.pop_back();        
-                                       }
-                               }
-                               
-                               //put seqs into vector to send to slayer
-                               vector<Sequence*> seqsForSlayer;
-                               for (int k = 0; k < seqs.size(); k++) {  seqsForSlayer.push_back(seqs[k].seq);  }
+                       //put seqs into vector to send to slayer
+                       vector<Sequence*> seqsForSlayer;
+                       for (int k = 0; k < seqs.size(); k++) {  seqsForSlayer.push_back(seqs[k].seq);  }
                        
-                               //mask then send to slayer...
-                               if (seqMask != "") {
-                                       decalc->setMask(seqMask);
-
-                                       //mask querys
-                                       decalc->runMask(querySeqs[i]);
-                                       
-                                       //mask parents
-                                       for (int k = 0; k < seqsForSlayer.size(); k++) {
-                                               decalc->runMask(seqsForSlayer[k]);
-                                       }
-                                       
-                               }
+                       //mask then send to slayer...
+                       if (seqMask != "") {
+                               decalc->setMask(seqMask);
+                               
+                               //mask the query
+                               decalc->runMask(query);
                                
-                               //send to slayer
-                               slayer->getResults(querySeqs[i], seqsForSlayer);
+                               //mask parents
+                               for (int k = 0; k < seqsForSlayer.size(); k++) {
+                                       decalc->runMask(seqsForSlayer[k]);
+                               }
                                
-                               //free memory
-                               for (int k = 0; k < seqs.size(); k++) {  delete seqs[k].seq;   }
+                               spotMap = decalc->getMaskMap();
                        }
                        
-               }       
-               //free memory
-               for (int i = 0; i < lines.size(); i++)                                  {       delete lines[i];        }
-               
-               if (seqMask != "") {
-                       delete decalc; 
-               }
-
+                       //send to slayer
+                       chimeraFlags = slayer->getResults(query, seqsForSlayer);
+                       chimeraResults = slayer->getOutput();
                        
+                       //free memory
+                       for (int k = 0; k < seqs.size(); k++) {  delete seqs[k].seq;   }
+               }
+               
+               delete maligner;
+               delete slayer;
+               
+               return 0;
        }
        catch(exception& e) {
-               errorOut(e, "ChimeraSlayer", "getChimeras");
+               m->errorOut(e, "ChimeraSlayer", "getChimeras");
                exit(1);
        }
 }
 //***************************************************************************************************************
-Sequence* ChimeraSlayer::getSequence(string name) {
-       try{
-               Sequence* temp;
-               
-               //look through templateSeqs til you find it
-               int spot = -1;
-               for (int i = 0; i < templateSeqs.size(); i++) {
-                       if (name == templateSeqs[i]->getName()) {  
-                               spot = i;
-                               break;
-                       }
-               }
+void ChimeraSlayer::printBlock(data_struct data, ostream& out){
+       try {
+       //out << "Name\tParentA\tParentB\tDivQLAQRB\tPerIDQLAQRB\tBootStrapA\tDivQLBQRA\tPerIDQLBQRA\tBootStrapB\tFlag\tLeftWindow\tRightWindow\n";
+               
+               out << querySeq->getName() << '\t';
+               out << data.parentA.getName() << "\t" << data.parentB.getName()  << '\t';
+               //out << "Left Window: " << spotMap[data.winLStart] << " " << spotMap[data.winLEnd] << endl;
+               //out << "Right Window: " << spotMap[data.winRStart] << " " << spotMap[data.winREnd] << endl;
+               
+               out << data.divr_qla_qrb << '\t' << data.qla_qrb << '\t' << data.bsa << '\t';
+               out << data.divr_qlb_qra << '\t' << data.qlb_qra << '\t' << data.bsb << '\t';
+               
+               out << "yes\t" << spotMap[data.winLStart] << "-" << spotMap[data.winLEnd] << '\t' << spotMap[data.winRStart] << "-" << spotMap[data.winREnd] << '\t';
                
-               if(spot == -1) { mothurOut("Error: Could not find sequence in chimeraSlayer."); mothurOutEndLine(); return NULL; }
+               //out << "Similarity of parents: " << data.ab << endl;
+               //out << "Similarity of query to parentA: " << data.qa << endl;
+               //out << "Similarity of query to parentB: " << data.qb << endl;
                
-               temp = new Sequence(templateSeqs[spot]->getName(), templateSeqs[spot]->getAligned());
                
-               return temp;
+               //out << "Per_id(QL,A): " << data.qla << endl;
+               //out << "Per_id(QL,B): " << data.qlb << endl;
+               //out << "Per_id(QR,A): " << data.qra << endl;
+               //out << "Per_id(QR,B): " << data.qrb << endl;
+
+               
+               //out << "DeltaL: " << (data.qla - data.qlb) << endl;
+               //out << "DeltaR: " << (data.qra - data.qrb) << endl;
+
        }
        catch(exception& e) {
-               errorOut(e, "ChimeraSlayer", "getSequence");
+               m->errorOut(e, "ChimeraSlayer", "printBlock");
                exit(1);
        }
 }
 //***************************************************************************************************************
 
-
-