]> git.donarmstrong.com Git - mothur.git/blobdiff - chimeracheckrdp.cpp
added MPI to dist.seqs command
[mothur.git] / chimeracheckrdp.cpp
index 73bc1a06c0fe49d8997965c16b4cb3c0485b16db..790d3ebe01100c1d615be9d189e73d6fa03f28ef 100644 (file)
 #include "chimeracheckrdp.h"
                
 //***************************************************************************************************************
-ChimeraCheckRDP::ChimeraCheckRDP(string filename, string temp) {  fastafile = filename;  templateFile = temp;  }
+ChimeraCheckRDP::ChimeraCheckRDP(string filename,  string o) {  fastafile = filename;  outputDir = o;  }
 //***************************************************************************************************************
 
 ChimeraCheckRDP::~ChimeraCheckRDP() {
        try {
-               for (int i = 0; i < querySeqs.size(); i++)              {  delete querySeqs[i];         }
                delete templateDB;
                delete kmer;
        }
        catch(exception& e) {
-               errorOut(e, "ChimeraCheckRDP", "~AlignSim");
+               m->errorOut(e, "ChimeraCheckRDP", "~AlignSim");
                exit(1);
        }
 }      
 //***************************************************************************************************************
-void ChimeraCheckRDP::print(ostream& out) {
+int ChimeraCheckRDP::print(ostream& out, ostream& outAcc) {
        try {
                
-               mothurOutEndLine();
-       /*      
-               for (int i = 0; i < querySeqs.size(); i++) {
+               m->mothurOut("Processing: " + querySeq->getName()); m->mothurOutEndLine();
+               
+               out << querySeq->getName() << endl;
+               out << "IS scores: " << '\t';
                        
-                       int j = 0;  float largest = -10;
-                       //find largest sim value
-                       for (int k = 0; k < IS[i].size(); k++) {
-                               //is this score larger
-                               if (IS[i][k].score > largest) {
-                                       j = k;
-                                       largest = IS[i][k].score;
+               for (int k = 0; k < IS.size(); k++) {
+                       out << IS[k].score << '\t'; 
+               }
+               out << endl;
+               
+               if (svg) {
+                       if (name != "") { //if user has specific names
+                               map<string, string>::iterator it = names.find(querySeq->getName());
+                               
+                               if (it != names.end()) { //user wants pic of this
+                                       makeSVGpic(IS);  //zeros out negative results
                                }
+                       }else{//output them all
+                               makeSVGpic(IS);  //zeros out negative results
                        }
-                       
-                       //find parental similarity
-                       distCalc->calcDist(*(IS[i][j].leftParent), *(IS[i][j].rightParent));
-                       float dist = distCalc->getDist();
-                       
-                       //convert to similarity
-                       dist = (1 - dist) * 100;
-
-                       //warn about parental similarity - if its above 82% may not detect a chimera 
-                       if (dist >= 82) { mothurOut("When the chimeras parental similarity is above 82%, detection rates drop signifigantly.");  mothurOutEndLine(); }
-                       
-                       int index = ceil(dist);
-                       
-                       if (index == 0) { index=1;  }
-       
-                       //is your DE value higher than the 95%
-                       string chimera;
-                       if (IS[i][j].score > quantile[index-1][4])              {       chimera = "Yes";        }
-                       else                                                                            {       chimera = "No";         }                       
-                       
-                       out << querySeqs[i]->getName() <<  "\tparental similarity: " << dist << "\tIS: " << IS[i][j].score << "\tbreakpoint: " << IS[i][j].midpoint << "\tchimera flag: " << chimera << endl;
-                       
-                       if (chimera == "Yes") {
-                               mothurOut(querySeqs[i]->getName() + "\tparental similarity: " + toString(dist) + "\tIS: " + toString(IS[i][j].score) + "\tbreakpoint: " + toString(IS[i][j].midpoint) + "\tchimera flag: " + chimera); mothurOutEndLine();
-                       }
-                       out << "Improvement Score\t";
-                       
-                       for (int r = 0; r < IS[i].size(); r++) {  out << IS[i][r].score << '\t';  }
-                       out << endl;
-               }*/
+               }
+               
+               return 0;
        }
        catch(exception& e) {
-               errorOut(e, "ChimeraCheckRDP", "print");
+               m->errorOut(e, "ChimeraCheckRDP", "print");
                exit(1);
        }
 }
-
 //***************************************************************************************************************
-void ChimeraCheckRDP::getChimeras() {
+int ChimeraCheckRDP::doPrep() {
        try {
-               
-               //read in query sequences and subject sequences
-               mothurOut("Reading sequences and template file... "); cout.flush();
-               querySeqs = readSeqs(fastafile);
-               //templateSeqs = readSeqs(templateFile);
-               templateDB = new KmerDB(templateFile, kmerSize);
-               mothurOut("Done."); mothurOutEndLine();
-               
-               int numSeqs = querySeqs.size();
-               
-               IS.resize(numSeqs);
-               closest.resize(numSeqs);
-               
-               //break up file if needed
-               int linesPerProcess = numSeqs / processors ;
-               
-               #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
-                       //find breakup of sequences for all times we will Parallelize
-                       if (processors == 1) {   lines.push_back(new linePair(0, numSeqs));  }
-                       else {
-                               //fill line pairs
-                               for (int i = 0; i < (processors-1); i++) {                      
-                                       lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess)));
-                               }
-                               //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end
-                               int i = processors - 1;
-                               lines.push_back(new linePair((i*linesPerProcess), numSeqs));
-                       }
-                       
-               #else
-                       lines.push_back(new linePair(0, numSeqs));
-               #endif
+               templateDB = new AlignmentDB(templateFileName, "kmer", kmerSize, 0.0,0.0,0.0,0.0);
+               m->mothurOutEndLine();
                
                kmer = new Kmer(kmerSize);
                
-               //find closest seq to each querySeq
-               for (int i = 0; i < querySeqs.size(); i++) {
-                       closest[i] = templateDB->findClosestSequence(querySeqs[i]);  
-               }
-               
-               //fill seqKmerInfo for query seqs
-               for (int i = 0; i < querySeqs.size(); i++) {
-                       seqKmerInfo[querySeqs[i]->getName()] = kmer->getKmerCounts(querySeqs[i]->getUnaligned());
-               }
-               
-               //fill seqKmerInfo for closest
-               for (int i = 0; i < closest.size(); i++) {
-                       seqKmerInfo[closest[i].getName()] = kmer->getKmerCounts(closest[i].getUnaligned());
-               }
-
-               
-               //for each query find IS value - this should be paralellized, 
-               //but paralellizing may cause you to have to recalculate some seqKmerInfo since the separate processes don't share memory after they split
-               for (int i = 0; i < querySeqs.size(); i++) {
-                       IS[i] = findIS(i);  //fills seqKmerInfo
+               if (name != "") { 
+                       readName(name);  //fills name map with names of seqs the user wants to have .svg for.  
                }
                
+               return 0;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "ChimeraCheckRDP", "doPrep");
+               exit(1);
+       }
+}
+//***************************************************************************************************************
+int ChimeraCheckRDP::getChimeras(Sequence* query) {
+       try {
                
-               //free memory
-               for (int i = 0; i < lines.size(); i++)                                  {       delete lines[i];        }
-       
+               IS.clear();
+                               
+               querySeq = query;
                        
+               closest = templateDB->findClosestSequence(query);  
+       
+               IS = findIS(); 
+                                       
+               //determine chimera report cutoff - window score above 95%
+               //getCutoff();  - not very acurate predictor
+               
+               return 0;
        }
        catch(exception& e) {
-               errorOut(e, "ChimeraCheckRDP", "getChimeras");
+               m->errorOut(e, "ChimeraCheckRDP", "getChimeras");
                exit(1);
        }
 }
 //***************************************************************************************************************
-vector<sim> ChimeraCheckRDP::findIS(int query) {
+vector<sim> ChimeraCheckRDP::findIS() {
        try {
                
+               
+               vector< map<int, int> > queryKmerInfo;  //vector of maps - each entry in the vector is a map of the kmers up to that spot in the unaligned seq
+                                                                                               //example:  seqKmerInfo[50] = map containing the kmers found in the first 50 + kmersize characters of ecoli.
+                                                                                               //i chose to store the kmers numbers in a map so you wouldn't have to check for dupilcate entries and could easily find the 
+                                                                                               //kmers 2 seqs had in common.  There may be a better way to do this thats why I am leaving so many comments...
+               vector< map<int, int> > subjectKmerInfo;
+               
                vector<sim>  isValues;
-               string queryName = querySeqs[query]->getName();
-               string seq = querySeqs[query]->getUnaligned();
+               string queryName = querySeq->getName();
+               string seq = querySeq->getUnaligned();
                
-               mothurOut("Finding IS values for sequence " + query); mothurOutEndLine();
+               queryKmerInfo = kmer->getKmerCounts(seq);
+               subjectKmerInfo = kmer->getKmerCounts(closest.getUnaligned());
                
                //find total kmers you have in common with closest[query] by looking at the last entry in the vector of maps for each
-               int nTotal = calcKmers(seqKmerInfo[queryName][(seqKmerInfo[queryName].size()-1)], seqKmerInfo[closest[query].getName()][(seqKmerInfo[closest[query].getName()].size()-1)]);
-               
+               int nTotal = calcKmers(queryKmerInfo[(queryKmerInfo.size()-1)], subjectKmerInfo[(subjectKmerInfo.size()-1)]);
+
                //you don't want the starting point to be virtually at hte end so move it in 10%
                int start = seq.length() / 10;
                        
                //for each window
-               for (int m = start; m < (seq.length() - start); m+=increment) {
+               for (int f = start; f < (seq.length() - start); f+=increment) {
+                       
+                       if ((f - kmerSize) < 0)  { m->mothurOut("Your sequence is too short for your kmerSize."); m->mothurOutEndLine(); exit(1); }
                        
                        sim temp;
                        
-                       string fragLeft = seq.substr(0, m);  //left side of breakpoint
-                       string fragRight = seq.substr(m, seq.length());  //right side of breakpoint
+                       string fragLeft = seq.substr(0, f);  //left side of breakpoint
+                       string fragRight = seq.substr(f);  //right side of breakpoint
                        
                        //make a sequence of the left side and right side
                        Sequence* left = new Sequence(queryName, fragLeft);
@@ -179,77 +137,89 @@ vector<sim> ChimeraCheckRDP::findIS(int query) {
                        
                        //find seqs closest to each fragment
                        Sequence closestLeft = templateDB->findClosestSequence(left); 
+       
                        Sequence closestRight = templateDB->findClosestSequence(right); 
+               
+                       //get kmerinfo for the closest left
+                       vector< map<int, int> > closeLeftKmerInfo = kmer->getKmerCounts(closestLeft.getUnaligned());
                        
-                       map<int, int>::iterator itleft;
-                       map<int, int>::iterator itleftclose;
-                       
-                       //get kmer in the closest seqs
-                       //if it's not found calc kmer info and save, otherwise use already calculated data
-                       //left
-                       it = seqKmerInfo.find(closestLeft.getName());
-                       if (it == seqKmerInfo.end()) {  //you have to calc it
-                               seqKmerInfo[closestLeft.getName()] = kmer->getKmerCounts(closestLeft.getUnaligned());
-                       }
-                       
-                       //right
-                       it = seqKmerInfo.find(closestRight.getName());
-                       if (it == seqKmerInfo.end()) {  //you have to calc it
-                               seqKmerInfo[closestRight.getName()] = kmer->getKmerCounts(closestRight.getUnaligned());
-                       }
+                       //get kmerinfo for the closest right
+                       vector< map<int, int> > closeRightKmerInfo = kmer->getKmerCounts(closestRight.getUnaligned());
                        
                        //right side is tricky - since the counts grow on eachother to find the correct counts of only the right side you must subtract the counts of the left side
                        //iterate through left sides map to subtract the number of times you saw things before you got the the right side
-                       map<int, int> rightside;
-                       for (itleft = seqKmerInfo[queryName][m-kmerSize].begin(); itleft != seqKmerInfo[queryName][m-kmerSize].end(); itleft++) {
-                               int howManyTotal = seqKmerInfo[queryName][seqKmerInfo[queryName].size()-1][itleft->first];   //times that kmer was seen in total
-                               
+                       map<int, int> rightside = queryKmerInfo[queryKmerInfo.size()-1];
+                       for (map<int, int>::iterator itleft = queryKmerInfo[f-kmerSize].begin(); itleft != queryKmerInfo[f-kmerSize].end(); itleft++) {
+                               int howManyTotal = queryKmerInfo[queryKmerInfo.size()-1][itleft->first];   //times that kmer was seen in total
+
                                //itleft->second is times it was seen in left side, so howmanytotal - leftside should give you right side
                                int howmanyright = howManyTotal - itleft->second;
                                
-                               //if any were seen just on the right add that ammount to map
-                               if (howmanyright > 0) {
-                                       rightside[itleft->first] = howmanyright;
+                               //if any were seen just on the left erase
+                               if (howmanyright == 0) {
+                                       rightside.erase(itleft->first);
                                }
                        }
                        
-                       //iterate through left side of the seq closest to the right fragment of query to subtract the number you saw before you reached the right side of the closest right
-                       //this way you can get the map for just the fragment you want to compare and not hte whole sequence
-                       map<int, int> rightsideclose;
-                       for (itleftclose = seqKmerInfo[closestRight.getName()][m-kmerSize].begin(); itleftclose != seqKmerInfo[closestRight.getName()][m-kmerSize].end(); itleftclose++) {
-                               int howManyTotal = seqKmerInfo[closestRight.getName()][seqKmerInfo[closestRight.getName()].size()-1][itleftclose->first];   //times that kmer was seen in total
-                               
+                       map<int, int> closerightside = closeRightKmerInfo[closeRightKmerInfo.size()-1];
+                       for (map<int, int>::iterator itright = closeRightKmerInfo[f-kmerSize].begin(); itright != closeRightKmerInfo[f-kmerSize].end(); itright++) {
+                               int howManyTotal = closeRightKmerInfo[(closeRightKmerInfo.size()-1)][itright->first];   //times that kmer was seen in total
+
                                //itleft->second is times it was seen in left side, so howmanytotal - leftside should give you right side
-                               int howmanyright = howManyTotal - itleftclose->second;
+                               int howmanyright = howManyTotal - itright->second;
                                
-                               //if any were seen just on the right add that ammount to map
-                               if (howmanyright > 0) {
-                                       rightsideclose[itleftclose->first] = howmanyright;
+                               //if any were seen just on the left erase
+                               if (howmanyright == 0) {
+                                       closerightside.erase(itright->first);
                                }
                        }
 
-                       int nLeft = calcKmers(seqKmerInfo[closestLeft.getName()][m-kmerSize], seqKmerInfo[queryName][m-kmerSize]);
-                       int nRight = calcKmers(rightsideclose, rightside);
                        
+                       int nLeft = calcKmers(closeLeftKmerInfo[f-kmerSize], queryKmerInfo[f-kmerSize]);
+
+                       int nRight = calcKmers(closerightside, rightside);
+
                        int is = nLeft + nRight - nTotal;
-                       
+
                        //save IS, leftparent, rightparent, breakpoint
                        temp.leftParent = closestLeft.getName();
                        temp.rightParent = closestRight.getName();
                        temp.score = is;
-                       temp.midpoint = m;
+                       temp.midpoint = f;
                        
                        isValues.push_back(temp);
                        
                        delete left;
                        delete right;
-               }       
+               }
                
                return isValues;
        
        }
        catch(exception& e) {
-               errorOut(e, "ChimeraCheckRDP", "findIS");
+               m->errorOut(e, "ChimeraCheckRDP", "findIS");
+               exit(1);
+       }
+}
+//***************************************************************************************************************
+void ChimeraCheckRDP::readName(string namefile) {
+       try{
+               ifstream in;
+               openInputFile(namefile, in);
+               string name;
+               
+               while (!in.eof()) {
+                       
+                       in >> name;
+                       
+                       names[name] = name;
+                       
+                       gobble(in);
+               }
+       
+       }
+       catch(exception& e) {
+               m->errorOut(e, "ChimeraCheckRDP", "readName");
                exit(1);
        }
 }
@@ -278,7 +248,7 @@ int ChimeraCheckRDP::calcKmers(map<int, int> query, map<int, int> subject) {
                                large = query.find(small->first);
                                
                                //if you found it they have that kmer in common
-                               if (large != query.end()) {             common++;       }
+                               if (large != query.end()) {             common++;        }
                        }
                }
                
@@ -286,10 +256,68 @@ int ChimeraCheckRDP::calcKmers(map<int, int> query, map<int, int> subject) {
                
        }
        catch(exception& e) {
-               errorOut(e, "ChimeraCheckRDP", "calcKmers");
+               m->errorOut(e, "ChimeraCheckRDP", "calcKmers");
                exit(1);
        }
 }
 
 //***************************************************************************************************************
+void ChimeraCheckRDP::makeSVGpic(vector<sim> info) {
+       try{
+               
+               string file = outputDir + querySeq->getName() + ".chimeracheck.svg";
+               ofstream outsvg;
+               openOutputFile(file, outsvg);
+               
+               int width = (info.size()*5) + 150;
+               
+               outsvg << "<svg xmlns:svg=\"http://www.w3.org/2000/svg\" xmlns=\"http://www.w3.org/2000/svg\" width=\"100%\" height=\"100%\" viewBox=\"0 0 700 " + toString(width) + "\">\n";
+               outsvg << "<g>\n";
+               outsvg << "<text fill=\"black\" class=\"seri\" x=\"" + toString((width / 2) - 150) + "\" y=\"25\">Plotted IS values for " + querySeq->getName() + "</text>\n";
+               
+               outsvg <<  "<line x1=\"75\" y1=\"600\" x2=\"" + toString((info.size()*5) + 75) + "\" y2=\"600\" stroke=\"black\" stroke-width=\"2\"/>\n";  
+               outsvg <<  "<line x1=\"75\" y1=\"600\" x2=\"75\" y2=\"125\" stroke=\"black\" stroke-width=\"2\"/>\n";
+               
+               outsvg << "<text fill=\"black\" class=\"seri\" x=\"80\" y=\"620\">" + toString(info[0].midpoint) + "</text>\n";
+               outsvg << "<text fill=\"black\" class=\"seri\" x=\"" + toString((info.size()*5) + 75) + "\" y=\"620\">" + toString(info[info.size()-1].midpoint) + "</text>\n";
+               outsvg << "<text fill=\"black\" class=\"seri\" x=\"" + toString((width / 2) - 150) + "\" y=\"650\">Base Positions</text>\n";
+               
+               outsvg << "<text fill=\"black\" class=\"seri\" x=\"50\" y=\"580\">0</text>\n";
+               
+               outsvg << "<text fill=\"black\" class=\"seri\" x=\"50\" y=\"350\">IS</text>\n";
+               
+               
+               //find max is score
+               float biggest = 0.0;
+               for (int i = 0; i < info.size(); i++) {
+                       if (info[i].score > biggest)  {
+                               biggest = info[i].score;
+                       }
+               }
+               
+               outsvg << "<text fill=\"black\" class=\"seri\" x=\"50\" y=\"135\">" + toString(biggest) + "</text>\n";
+               
+               int scaler2 = 500 / biggest;
+               
+               
+               outsvg << "<polyline fill=\"none\" stroke=\"red\" stroke-width=\"2\" points=\"";
+               //160,200 180,230 200,210 234,220\"/> "; 
+               for (int i = 0; i < info.size(); i++) {
+                       if(info[i].score < 0) { info[i].score = 0; }
+                       outsvg << ((i*5) + 75) << "," << (600 - (info[i].score * scaler2)) << " ";
+               }
+               
+               outsvg << "\"/> ";
+               outsvg << "</g>\n</svg>\n";
+               
+               outsvg.close();
+
+       }
+       catch(exception& e) {
+               m->errorOut(e, "ChimeraCheckRDP", "makeSVGpic");
+               exit(1);
+       }
+}
+//***************************************************************************************************************
+