checking in chimera files in progress after move to michigan

author westcott <westcott>

Mon, 31 Aug 2009 14:33:57 +0000 (14:33 +0000)

committer westcott <westcott>

Mon, 31 Aug 2009 14:33:57 +0000 (14:33 +0000)
author westcott <westcott>
Mon, 31 Aug 2009 14:33:57 +0000 (14:33 +0000)
committer westcott <westcott>
Mon, 31 Aug 2009 14:33:57 +0000 (14:33 +0000)
diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj

index c4f38cbbae3c035acd003a324fdaa293b0aae419..c6d6c0ed29fa33724df4aae7934582ad63b14ddd 100644 (file)
--- a/Mothur.xcodeproj/project.pbxproj
+++ b/Mothur.xcodeproj/project.pbxproj
@@ -162,6 +162,8 @@
                 A70B53AA0F4CD7AD0064797E /* getgroupcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70B53A40F4CD7AD0064797E /* getgroupcommand.cpp */; };
                 A70B53AB0F4CD7AD0064797E /* getlabelcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70B53A60F4CD7AD0064797E /* getlabelcommand.cpp */; };
                 A70B53AC0F4CD7AD0064797E /* getlinecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70B53A80F4CD7AD0064797E /* getlinecommand.cpp */; };
+               A75B887D104C16860083C454 /* alignedsimilarity.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A75B8879104C16860083C454 /* alignedsimilarity.cpp */; };
+               A75B887E104C16860083C454 /* ccode.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A75B887B104C16860083C454 /* ccode.cpp */; };
                 EB1216880F619B83004A865F /* bergerparker.cpp in Sources */ = {isa = PBXBuildFile; fileRef = EB1216870F619B83004A865F /* bergerparker.cpp */; };
                 EB1216E50F61ACFB004A865F /* bstick.cpp in Sources */ = {isa = PBXBuildFile; fileRef = EB1216E40F61ACFB004A865F /* bstick.cpp */; };
                 EB1217230F61C9AC004A865F /* sharedkstest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = EB1217220F61C9AC004A865F /* sharedkstest.cpp */; };
@@ -513,6 +515,10 @@
                 A70B53A70F4CD7AD0064797E /* getlabelcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getlabelcommand.h; sourceTree = SOURCE_ROOT; };
                 A70B53A80F4CD7AD0064797E /* getlinecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getlinecommand.cpp; sourceTree = SOURCE_ROOT; };
                 A70B53A90F4CD7AD0064797E /* getlinecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getlinecommand.h; sourceTree = SOURCE_ROOT; };
+               A75B8879104C16860083C454 /* alignedsimilarity.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = alignedsimilarity.cpp; sourceTree = "<group>"; };
+               A75B887A104C16860083C454 /* alignedsimilarity.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = alignedsimilarity.h; sourceTree = "<group>"; };
+               A75B887B104C16860083C454 /* ccode.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ccode.cpp; sourceTree = "<group>"; };
+               A75B887C104C16860083C454 /* ccode.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ccode.h; sourceTree = "<group>"; };
                 C6859E8B029090EE04C91782 /* Mothur.1 */ = {isa = PBXFileReference; lastKnownFileType = text.man; path = Mothur.1; sourceTree = "<group>"; };
                 EB1216860F619B83004A865F /* bergerparker.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = bergerparker.h; sourceTree = SOURCE_ROOT; };
                 EB1216870F619B83004A865F /* bergerparker.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = bergerparker.cpp; sourceTree = SOURCE_ROOT; };
@@ -645,6 +651,10 @@
                                 372095C1103196D70004D347 /* chimera.cpp */,
                                 374F2FE9100634B600E97C66 /* bellerophon.h */,
                                 374F2FEA100634B600E97C66 /* bellerophon.cpp */,
+                               A75B8879104C16860083C454 /* alignedsimilarity.cpp */,
+                               A75B887A104C16860083C454 /* alignedsimilarity.h */,
+                               A75B887B104C16860083C454 /* ccode.cpp */,
+                               A75B887C104C16860083C454 /* ccode.h */,
                                 372A55531017922B00C5194B /* decalc.h */,
                                 372A55541017922B00C5194B /* decalc.cpp */,
                                 374F301110063B6F00E97C66 /* pintail.h */,
@@ -1167,6 +1177,8 @@
                                 374F301310063B6F00E97C66 /* pintail.cpp in Sources */,
                                 372A55551017922B00C5194B /* decalc.cpp in Sources */,
                                 372095C2103196D70004D347 /* chimera.cpp in Sources */,
+                               A75B887D104C16860083C454 /* alignedsimilarity.cpp in Sources */,
+                               A75B887E104C16860083C454 /* ccode.cpp in Sources */,
                         );
                         runOnlyForDeploymentPostprocessing = 0;
                 };
diff --git a/alignedsimilarity.cpp b/alignedsimilarity.cpp

new file mode 100644 (file)

index 0000000..0ed9ce3
--- /dev/null
+++ b/alignedsimilarity.cpp
@@ -0,0 +1,543 @@
+/*
+ *  alignedsimilarity.cpp
+ *  Mothur
+ *
+ *  Created by westcott on 8/18/09.
+ *  Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "alignedsimilarity.h"
+#include "ignoregaps.h"
+
+
+//***************************************************************************************************************
+AlignSim::AlignSim(string filename, string temp) {  fastafile = filename;  templateFile = temp;  }
+//***************************************************************************************************************
+
+AlignSim::~AlignSim() {
+       try {
+               for (int i = 0; i < querySeqs.size(); i++)              {  delete querySeqs[i];         }
+               for (int i = 0; i < templateSeqs.size(); i++)   {  delete templateSeqs[i];      }
+               delete distCalc;
+       }
+       catch(exception& e) {
+               errorOut(e, "AlignSim", "~AlignSim");
+               exit(1);
+       }
+}      
+//***************************************************************************************************************
+void AlignSim::print(ostream& out) {
+       try {
+               
+               mothurOutEndLine();
+               
+               for (int i = 0; i < querySeqs.size(); i++) {
+                       
+                       int j = 0;  float largest = -10;
+                       //find largest sim value
+                       for (int k = 0; k < IS[i].size(); k++) {
+                               //is this score larger
+                               if (IS[i][k].score > largest) {
+                                       j = k;
+                                       largest = IS[i][k].score;
+                               }
+                       }
+                       
+                       //find parental similarity
+                       distCalc->calcDist(*(IS[i][j].leftParent), *(IS[i][j].rightParent));
+                       float dist = distCalc->getDist();
+                       
+                       //convert to similarity
+                       dist = (1 - dist) * 100;
+
+                       //warn about parental similarity - if its above 82% may not detect a chimera 
+                       if (dist >= 82) { mothurOut("When the chimeras parental similarity is above 82%, detection rates drop signifigantly.");  mothurOutEndLine(); }
+                       
+                       int index = ceil(dist);
+                       
+                       if (index == 0) { index=1;  }
+       
+                       //is your DE value higher than the 95%
+                       string chimera;
+                       if (IS[i][j].score > quantile[index-1][4])              {       chimera = "Yes";        }
+                       else                                                                            {       chimera = "No";         }                       
+                       
+                       out << querySeqs[i]->getName() <<  "\tparental similarity: " << dist << "\tIS: " << IS[i][j].score << "\tbreakpoint: " << IS[i][j].midpoint << "\tchimera flag: " << chimera << endl;
+                       
+                       if (chimera == "Yes") {
+                               mothurOut(querySeqs[i]->getName() + "\tparental similarity: " + toString(dist) + "\tIS: " + toString(IS[i][j].score) + "\tbreakpoint: " + toString(IS[i][j].midpoint) + "\tchimera flag: " + chimera); mothurOutEndLine();
+                       }
+                       out << "Improvement Score\t";
+                       
+                       for (int r = 0; r < IS[i].size(); r++) {  out << IS[i][r].score << '\t';  }
+                       out << endl;
+               }
+       }
+       catch(exception& e) {
+               errorOut(e, "AlignSim", "print");
+               exit(1);
+       }
+}
+
+//***************************************************************************************************************
+void AlignSim::getChimeras() {
+       try {
+               
+               //read in query sequences and subject sequences
+               mothurOut("Reading sequences and template file... "); cout.flush();
+               querySeqs = readSeqs(fastafile);
+               templateSeqs = readSeqs(templateFile);
+               mothurOut("Done."); mothurOutEndLine();
+               
+               int numSeqs = querySeqs.size();
+               
+               IS.resize(numSeqs);
+               
+               //break up file if needed
+               int linesPerProcess = numSeqs / processors ;
+               
+               #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+                       //find breakup of sequences for all times we will Parallelize
+                       if (processors == 1) {   lines.push_back(new linePair(0, numSeqs));  }
+                       else {
+                               //fill line pairs
+                               for (int i = 0; i < (processors-1); i++) {                      
+                                       lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess)));
+                               }
+                               //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end
+                               int i = processors - 1;
+                               lines.push_back(new linePair((i*linesPerProcess), numSeqs));
+                       }
+                       
+                       //find breakup of templatefile for quantiles
+                       if (processors == 1) {   templateLines.push_back(new linePair(0, templateSeqs.size()));  }
+                       else { 
+                               for (int i = 0; i < processors; i++) {
+                                       templateLines.push_back(new linePair());
+                                       templateLines[i]->start = int (sqrt(float(i)/float(processors)) * templateSeqs.size());
+                                       templateLines[i]->end = int (sqrt(float(i+1)/float(processors)) * templateSeqs.size());
+                               }
+                       }
+               #else
+                       lines.push_back(new linePair(0, numSeqs));
+                       templateLines.push_back(new linePair(0, templateSeqs.size()));
+               #endif
+       
+       
+               distCalc = new ignoreGaps();
+               
+               //find window breaks
+               windowBreak = findWindows();
+//for (int i = 0; i < windowBreak.size(); i++) { cout << windowBreak[i] << '\t';  }
+//cout << endl;
+
+               mothurOut("Finding the IS values for your sequences..."); cout.flush();
+               if (processors == 1) { 
+                       IS = findIS(lines[0]->start, lines[0]->end, querySeqs);
+               }else {         IS = createProcessesIS(querySeqs, lines);               }
+               mothurOut("Done."); mothurOutEndLine();
+               
+               //quantiles are used to determine whether the de values found indicate a chimera
+               if (quanfile != "") {  quantile = readQuantiles();  }
+               else {
+                       
+                       mothurOut("Calculating quantiles for your template.  This can take a while...  I will output the quantiles to a .alignedsimilarity.quan file that you can input them using the quantiles parameter next time you run this command.  Providing the .quan file will dramatically improve speed.    "); cout.flush();
+                       //get IS quantiles as a reference to these IS scores
+                       //you are assuming the template is free of chimeras and therefore will give you a baseline as to the scores you would like to see
+                       if (processors == 1) { 
+                               templateIS = findIS(templateLines[0]->start, templateLines[0]->end, templateSeqs);
+                       }else {         templateIS = createProcessesIS(templateSeqs, templateLines);            }
+//cout << "here" << endl;                      
+                       ofstream out4;
+                       string o;
+                       
+                       o = getRootName(templateFile) + "alignedsimilarity.quan";
+                       
+                       openOutputFile(o, out4);
+                       
+                       //adjust quantiles
+                       //construct table
+                       quantile.resize(100);
+               
+                       for (int i = 0; i < templateIS.size(); i++) {
+                               
+                               if (templateIS[i].size() != 0) {
+                                       int j = 0; float score = -1000;
+                                       //find highest IS score
+       //cout << templateIS[i].size() << endl;
+                                       for (int k = 0; k < templateIS[i].size(); k++) {
+                                               if (templateIS[i][k].score > score) {
+                                                       score = templateIS[i][k].score;
+                                                       j = k;
+                                               }
+                                       }
+       //cout << j << endl;            
+                                       //find similarity of parents
+                                       distCalc->calcDist(*(templateIS[i][j].leftParent), *(templateIS[i][j].rightParent));
+                                       float dist = distCalc->getDist();
+                       
+                                       //convert to similarity
+                                       dist = (1 - dist) * 100;
+       //      cout << dist << endl;   
+                                       int index = ceil(dist);
+       //      cout << "index = " << index << endl;    
+                                       if (index == 0) {       index = 1;      }
+                                       quantile[index-1].push_back(templateIS[i][j].score);
+                               }
+                       }
+               
+               
+                       for (int i = 0; i < quantile.size(); i++) {
+                               vector<float> temp;
+                               
+                               if (quantile[i].size() == 0) {
+                                       //in case this is not a distance found in your template files
+                                       for (int g = 0; g < 6; g++) {
+                                               temp.push_back(0.0);
+                                       }
+                               }else{
+                                       
+                                       sort(quantile[i].begin(), quantile[i].end());
+                                       
+                                       //save 10%
+                                       temp.push_back(quantile[i][int(quantile[i].size() * 0.10)]);
+                                       //save 25%
+                                       temp.push_back(quantile[i][int(quantile[i].size() * 0.25)]);
+                                       //save 50%
+                                       temp.push_back(quantile[i][int(quantile[i].size() * 0.5)]);
+                                       //save 75%
+                                       temp.push_back(quantile[i][int(quantile[i].size() * 0.75)]);
+                                       //save 95%
+                                       temp.push_back(quantile[i][int(quantile[i].size() * 0.95)]);
+                                       //save 99%
+                                       temp.push_back(quantile[i][int(quantile[i].size() * 0.99)]);
+                                       
+                               }
+                               
+                               //output quan value
+                               out4 << i+1 << '\t';                            
+                               for (int u = 0; u < temp.size(); u++) {   out4 << temp[u] << '\t'; }
+                               out4 << endl;
+                               
+                               quantile[i] = temp;
+                               
+                       }
+                       
+                       out4.close();
+                       
+                       mothurOut("Done."); mothurOutEndLine();
+
+               }
+               
+               //free memory
+               for (int i = 0; i < lines.size(); i++)                                  {       delete lines[i];                                }
+               for (int i = 0; i < templateLines.size(); i++)                  {       delete templateLines[i];                }
+                       
+       }
+       catch(exception& e) {
+               errorOut(e, "AlignSim", "getChimeras");
+               exit(1);
+       }
+}
+//***************************************************************************************************************
+vector<int> AlignSim::findWindows() {
+       try {
+               
+               vector<int> win; 
+               
+               if (increment > querySeqs[0]->getAligned().length()) {  mothurOut("You have selected an increment larger than the length of your sequences.  I will use the default of 25.");  increment = 25; }
+               
+               for (int m = increment;  m < (querySeqs[0]->getAligned().length() - increment); m+=increment) {  win.push_back(m);  }
+
+               return win;
+       
+       }
+       catch(exception& e) {
+               errorOut(e, "AlignSim", "findWindows");
+               exit(1);
+       }
+}
+
+//***************************************************************************************************************
+vector< vector<sim>  > AlignSim::findIS(int start, int end, vector<Sequence*> seqs) {
+       try {
+               
+               vector< vector<sim> >  isValues;
+               isValues.resize(seqs.size());
+               
+               //for each sequence
+               for(int i = start; i < end; i++){
+                       
+                       vector<sim> temp;  temp.resize(windowBreak.size());
+                       
+                       mothurOut("Finding IS value for sequence " + toString(i)); mothurOutEndLine();
+                       
+                       //for each window
+                       for (int m = 0; m < windowBreak.size(); m++) {
+                               
+                               vector<Sequence*> closest;  //left, right, overall
+                               vector<int> similarity; //left+right and overall
+                               
+                               //find closest left, closest right and closest overall
+                               closest = findClosestSides(seqs[i], windowBreak[m], similarity, i);
+                               
+                               int totalNumBases = seqs[i]->getUnaligned().length();
+                               
+                               //IS = left+right-overall
+                               float is = ((similarity[0]+similarity[1]) / (float) totalNumBases) - (similarity[2] / (float) totalNumBases);
+                               
+                               ///save IS, leftparent, rightparent, breakpoint
+                               temp[m].leftParent = closest[0];
+                               temp[m].rightParent = closest[1];
+                               temp[m].score = is;
+                               temp[m].midpoint = windowBreak[m];
+//cout << is << '\t';
+                       }
+//cout << endl;                        
+                       isValues[i] = temp;
+               
+               }
+               
+               return isValues;
+       
+       }
+       catch(exception& e) {
+               errorOut(e, "AlignSim", "findIS");
+               exit(1);
+       }
+}
+//***************************************************************************************************************
+vector<Sequence*> AlignSim::findClosestSides(Sequence* seq, int breakpoint, vector<int>& sim, int i) {
+       try{
+       
+               vector<Sequence*> closest;
+               
+               Sequence query, queryLeft, queryRight;
+               string frag = seq->getAligned();
+               string fragLeft = frag.substr(0, breakpoint);
+               string fragRight = frag.substr(breakpoint, frag.length());
+               
+               //get pieces
+               query = *(seq);  
+               queryLeft = *(seq);  
+               queryRight = *(seq);
+               queryLeft.setAligned(fragLeft);
+               queryRight.setAligned(fragRight);
+               
+               //initialize
+               Sequence* overall = templateSeqs[0];
+               Sequence* left = templateSeqs[0];
+               Sequence* right = templateSeqs[0];
+               
+               float smallestOverall, smallestLeft, smallestRight;
+               smallestOverall = 1000;  smallestLeft = 1000;  smallestRight = 1000;
+               
+               //go through the templateSeqs and search for the closest
+               for(int j = 0; j < templateSeqs.size(); j++){
+                       
+                       //so you don't pick yourself
+                       if (j != i) {
+                               Sequence temp, tempLeft, tempRight;
+                               string fragTemp = templateSeqs[j]->getAligned();
+                               string fragTempLeft = fragTemp.substr(0, breakpoint);
+                               string fragTempRight = fragTemp.substr(breakpoint, fragTemp.length());
+                               
+                               //get pieces
+                               temp = *(templateSeqs[j]); 
+                               tempLeft = *(templateSeqs[j]); 
+                               tempRight = *(templateSeqs[j]);
+                               tempLeft.setAligned(fragTempLeft);
+                               tempRight.setAligned(fragTempRight);
+                               
+                               //find overall dist
+                               distCalc->calcDist(query, temp);
+                               float dist = distCalc->getDist();       
+                               
+                               if (dist < smallestOverall) { 
+                                       overall = templateSeqs[j];
+                                       smallestOverall = dist;
+                               }
+                               
+                               //find left dist
+                               distCalc->calcDist(queryLeft, tempLeft);
+                               dist = distCalc->getDist();
+                               
+                               if (dist < smallestLeft) { 
+                                       left = templateSeqs[j];
+                                       smallestLeft = dist;
+                               }
+                               
+                               //find left dist
+                               distCalc->calcDist(queryRight, tempRight);
+                               dist = distCalc->getDist();
+                               
+                               if (dist < smallestRight) { 
+                                       right = templateSeqs[j];
+                                       smallestRight = dist;
+                               }
+                       }
+
+               }
+                       
+               closest.push_back(left);
+               closest.push_back(right);
+               closest.push_back(overall);
+               
+               //fill sim with number of matched bases
+               Sequence tempL = *(left);
+               string tempLFrag = tempL.getAligned();
+               tempL.setAligned(tempLFrag.substr(0, breakpoint));
+               
+               int bothMatches = findNumMatchedBases(queryLeft, tempL);
+               sim.push_back(bothMatches);
+
+               Sequence tempR = *(right);
+               string tempRFrag = tempR.getAligned();
+               tempR.setAligned(tempRFrag.substr(breakpoint, tempRFrag.length()));
+               
+               bothMatches = findNumMatchedBases(queryRight, tempR);
+               sim.push_back(bothMatches);
+
+               bothMatches = findNumMatchedBases(query, *(overall));
+               sim.push_back(bothMatches);
+                       
+               return closest;
+
+
+       }
+       catch(exception& e) {
+               errorOut(e, "AlignSim", "findClosestSides");
+               exit(1);
+       }
+}
+//***************************************************************************************************************
+
+int AlignSim::findNumMatchedBases(Sequence seq, Sequence temp) {
+       try{
+       
+               int num = 0;
+               
+               string query = seq.getAligned();
+               string subject = temp.getAligned();
+//cout << seq.getName() << endl << endl;
+//cout << temp.getName()  << endl << endl;
+               
+               for (int i = 0; i < query.length(); i++) {
+                       //count matches if query[i] is a base and subject is same bases or gap
+                       if (isalpha(query[i])) {
+                               if(!isalpha(subject[i])) {  num++;  } 
+                               else if (query[i] == subject[i])  { num++;  }
+                               else {  }
+                       }
+               }
+//cout << "num = " << num << endl;             
+               return num;
+       }
+       catch(exception& e) {
+               errorOut(e, "AlignSim", "findNumMatchedBases");
+               exit(1);
+       }
+}
+
+/**************************************************************************************************/
+vector< vector<sim> > AlignSim::createProcessesIS(vector<Sequence*> seqs, vector<linePair*> linesToProcess) {
+       try {
+       vector< vector<sim> > localIs; localIs.resize(seqs.size());
+       
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+               int process = 0;
+               vector<int> processIDS;
+               
+               //loop through and create all the processes you want
+               while (process != processors) {
+                       int pid = fork();
+                       
+                       if (pid > 0) {
+                               processIDS.push_back(pid);  
+                               process++;
+                       }else if (pid == 0){
+                               
+                               mothurOut("Finding IS values for sequences " + toString(linesToProcess[process]->start) + " to " + toString(linesToProcess[process]->end)); mothurOutEndLine();
+                               localIs = findIS(linesToProcess[process]->start, linesToProcess[process]->end, seqs);
+                               mothurOut("Done finding IS values for sequences " +  toString(linesToProcess[process]->start) + " to " + toString(linesToProcess[process]->end)); mothurOutEndLine();
+                               
+                               //write out data to file so parent can read it
+                               ofstream out;
+                               string s = toString(getpid()) + ".temp";
+                               openOutputFile(s, out);
+                               
+                               //output pairs
+                               for (int i = linesToProcess[process]->start; i < linesToProcess[process]->end; i++) {
+                                        out << localIs[i].size() << endl;
+                                        for (int j = 0; j < localIs[i].size(); j++) {
+                                               localIs[i][j].leftParent->printSequence(out);
+                                               localIs[i][j].rightParent->printSequence(out);
+                                               out << ">" << '\t' << localIs[i][j].score << '\t' << localIs[i][j].midpoint << endl;
+                                        }
+                               }
+                               out.close();
+                               
+                               exit(0);
+                       }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
+               }
+               
+               //force parent to wait until all the processes are done
+               for (int i=0;i<processors;i++) { 
+                       int temp = processIDS[i];
+                       wait(&temp);
+               }
+               
+               //get data created by processes
+               for (int i=0;i<processors;i++) { 
+                       ifstream in;
+                       string s = toString(processIDS[i]) + ".temp";
+                       openInputFile(s, in);
+                       
+                       //get pairs
+                       for (int k = linesToProcess[i]->start; k < linesToProcess[i]->end; k++) {
+                               int size;
+                               in >> size;
+                               gobble(in);
+                               
+                               vector<sim> tempVector;
+                               
+                               for (int j = 0; j < size; j++) {
+                               
+                                       sim temp;
+                                       
+                                       temp.leftParent = new Sequence(in);
+                                       gobble(in);
+       //temp.leftParent->printSequence(cout);                         
+                                       temp.rightParent = new Sequence(in);
+                                       gobble(in);
+//temp.rightParent->printSequence(cout);
+                                       string throwaway;
+                                       in >> throwaway >> temp.score >> temp.midpoint;
+       //cout << temp.score << '\t' << temp.midpoint << endl;                          
+                                       gobble(in);
+                                       
+                                       tempVector.push_back(temp);
+                               }
+                               
+                               localIs[k] = tempVector;
+                       }
+                       
+                       in.close();
+                       remove(s.c_str());
+               }
+                       
+       
+#else
+               localIs = findIS(linesToProcess[0]->start, linesToProcess[0]->end, seqs);
+#endif 
+
+               return localIs;
+       }
+       catch(exception& e) {
+               errorOut(e, "AlignSim", "createProcessesIS");
+               exit(1);
+       }
+}
+
+//***************************************************************************************************************
diff --git a/alignedsimilarity.h b/alignedsimilarity.h

new file mode 100644 (file)

index 0000000..716b8b7
--- /dev/null
+++ b/alignedsimilarity.h
@@ -0,0 +1,67 @@
+#ifndef ALIGNSIM_H
+#define ALIGNSIM_H
+
+/*
+ *  alignedsimilarity.h
+ *  Mothur
+ *
+ *  Created by westcott on 8/18/09.
+ *  Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+
+#include "chimera.h"
+#include "dist.h"
+
+//This class was created using the algorythms described in the 
+//"Evaluation of Nearest Neighbor Methods for Detection of Chimeric Small-Subunit rRna Sequences" paper 
+//by J.F. Robison-Cox 1, M.M. Bateson 2 and D.M. Ward 2
+
+
+/***********************************************************/
+
+class AlignSim : public Chimera {
+       
+       public:
+               AlignSim(string, string);       
+               ~AlignSim();
+               
+               void getChimeras();
+               void print(ostream&);
+               
+               void setCons(string){};
+               void setQuantiles(string q) { quanfile = q; };
+               
+               
+       private:
+               
+               Dist* distCalc;
+               vector<linePair*> lines;
+               vector<linePair*> templateLines;
+               
+               vector<Sequence*> querySeqs;
+               vector<Sequence*> templateSeqs;
+               
+               vector< vector<sim> > IS;  //IS[0] is the vector os sim values for each window for querySeqs[0]
+               vector< vector<sim> > templateIS;  //templateIS[0] is the vector os sim values for each window for templateSeqs[0]
+               vector<int> windowBreak;
+               
+               string fastafile, templateFile;
+               int iters;
+               
+               vector< vector<sim> > findIS(int, int, vector<Sequence*>);
+               vector<Sequence*> findClosestSides(Sequence*, int, vector<int>&, int);
+               int findNumMatchedBases(Sequence, Sequence);
+               vector<int> findWindows();
+               
+               vector< vector<float> > quantile;
+               
+               vector< vector<sim> > createProcessesIS(vector<Sequence*>, vector<linePair*>);
+               
+};
+
+/***********************************************************/
+
+#endif
+
diff --git a/bellerophon.cpp b/bellerophon.cpp

index dc021c064c27bf2550da911589cc51fd5f381458..2ce395189520a83837805a8322f2619a5f227508 100644 (file)
--- a/bellerophon.cpp
+++ b/bellerophon.cpp
@@ -35,7 +35,7 @@ void Bellerophon::print(ostream& out) {
                         out << pref[i].name << '\t' << setprecision(3) << pref[i].score[0] << '\t' << pref[i].leftParent[0] << '\t' << pref[i].rightParent[0] << endl;
                         
                         //calc # of seqs with preference above 1.0
-                       if (pref[i].score[0] > 1.0) { 
+                       if (pref[i].score[0] > 1.1) { 
                                 above1++; 
                                 mothurOut(pref[i].name + " is a suspected chimera at breakpoint " + toString(pref[i].midpoint)); mothurOutEndLine();
                                 mothurOut("It's score is " + toString(pref[i].score[0]) + " with suspected left parent " + pref[i].leftParent[0] + " and right parent " + pref[i].rightParent[0]); mothurOutEndLine();
@@ -44,7 +44,7 @@ void Bellerophon::print(ostream& out) {
                 
                 //output results to screen
                 mothurOutEndLine();
-               mothurOut("Sequence with preference score above 1.0: " + toString(above1)); mothurOutEndLine();
+               mothurOut("Sequence with preference score above 1.1: " + toString(above1)); mothurOutEndLine();
                 int spot;
                 spot = pref.size()-1;
                 mothurOut("Minimum:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
@@ -115,7 +115,7 @@ void Bellerophon::getChimeras() {
                                 
                         }else{ increment = 0; }
                 }
-cout << "increment = " << increment << endl;           
+               
                 if (increment == 0) { iters = 1; }
                 else { iters = ((seqs[0]->getAlignLength() - (2*window)) / increment); }
                 
@@ -193,7 +193,6 @@ cout << "increment = " << increment << endl;
                                 delete SparseLeft;
                                 delete SparseRight;
                                 
-                               
                                 //fill preference structure
                                 generatePreferences(distMapLeft, distMapRight, midpoint);
                                 
@@ -204,7 +203,7 @@ cout << "increment = " << increment << endl;
                 delete distCalculator;
                 
                 //rank preference score to eachother
-               float dme = 0.0;
+               /*float dme = 0.0;
                 float expectedPercent = 1 / (float) (pref.size());
                 
                 for (int i = 0; i < pref.size(); i++) {  dme += pref[i].score[0];  }
@@ -216,10 +215,8 @@ cout << "increment = " << increment << endl;
                         
                         //how much higher or lower is this than expected
                         pref[i].score[0] = pref[i].score[0] / expectedPercent;
-                       
-                       
-                       
-               }
+               
+               }*/
                 
                 //sort Preferences highest to lowest
                 sort(pref.begin(), pref.end(), comparePref);
@@ -337,23 +334,25 @@ void Bellerophon::generatePreferences(vector<SeqMap> left, vector<SeqMap> right,
                 
                   
                 //calculate the dme
-               
                 int count0 = 0;
-               for (int i = 0; i < pref.size(); i++) {  dme += pref[i].score[1];  if (pref[i].score[1] == 0.0) { count0++; }  }
+               for (int i = 0; i < pref.size(); i++) {  dme += pref[i].score[1];  if (pref[i].score[1] == 0.0) {  count0++; }  }
                 
-               float expectedPercent = 1 / (float) (pref.size() - count0);
+               //float expectedPercent = 1 / (float) (pref.size() - count0);
  //cout << endl << "dme = " << dme << endl;
                 //recalculate prefernences based on dme
                 for (int i = 0; i < pref.size(); i++) {
-//cout << "unadjusted pref " << i << " = " << pref[i].score[1] << endl;        
+//cout << "unadjusted col[i] " << i << " = " << pref[i].score[1] << endl;      
+//cout << i << '\t' <<  (dme / (dme - 2 * pref[i].score[1])) << endl;
+                       
                         // gives the actual percentage of the dme this seq adds
-                       pref[i].score[1] = pref[i].score[1] / dme;
+                       //pref[i].score[1] = pref[i].score[1] / dme;
                         
                         //how much higher or lower is this than expected
-                       pref[i].score[1] = pref[i].score[1] / expectedPercent;
-                       
-                       //pref[i].score[1] = dme / (dme - 2 * pref[i].score[1]);
+                       //pref[i].score[1] = pref[i].score[1] / expectedPercent;
                         
+                       //not 2 * pref[i].score[1] because i only calulate the pref scores once.
+                       pref[i].score[1] = dme / (dme - pref[i].score[1]);
+//cout << i << '\t' << pref[i].score[1] << endl;
                         //so a non chimeric sequence would be around 1, and a chimeric would be signifigantly higher.
  //cout << "adjusted pref " << i << " = " << pref[i].score[1] << endl;                                  
                 }
diff --git a/bellerophon.h b/bellerophon.h

index bc86c183bb2da0594568dc5949a244115273bea6..b309545e29c18d3ed3669e6e70df3243754551ca 100644 (file)
--- a/bellerophon.h
+++ b/bellerophon.h
@@ -13,20 +13,12 @@
  
  #include "chimera.h"
  #include "filterseqscommand.h"
+#include "sparsematrix.hpp"
  #include "sequence.hpp"
  #include "dist.h"
  
-struct Preference {
-               string name;
-               vector<string> leftParent; //keep the name of closest left associated with the two scores
-               vector<string> rightParent; //keep the name of closest right associated with the two scores
-               vector<float> score;  //so you can keep last score and calc this score and keep whichever is bigger.
-               vector<float> closestLeft;  //keep the closest left associated with the two scores
-               vector<float> closestRight; //keep the closest right associated with the two scores
-               int midpoint;
-
-};
-
+typedef list<PCell>::iterator MatData;
+typedef map<int, float> SeqMap;  //maps sequence to all distance for that seqeunce
  
  /***********************************************************/
  
diff --git a/ccode.cpp b/ccode.cpp

new file mode 100644 (file)

index 0000000..a3ef5e6
--- /dev/null
+++ b/ccode.cpp
@@ -0,0 +1,272 @@
+/*
+ *  ccode.cpp
+ *  Mothur
+ *
+ *  Created by westcott on 8/24/09.
+ *  Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "ccode.h"
+#include "ignoregaps.h"
+
+
+//***************************************************************************************************************
+Ccode::Ccode(string filename, string temp) {  fastafile = filename;  templateFile = temp;  }
+//***************************************************************************************************************
+
+Ccode::~Ccode() {
+       try {
+               for (int i = 0; i < querySeqs.size(); i++)              {  delete querySeqs[i];         }
+               for (int i = 0; i < templateSeqs.size(); i++)   {  delete templateSeqs[i];      }
+               delete distCalc;
+       }
+       catch(exception& e) {
+               errorOut(e, "Ccode", "~Ccode");
+               exit(1);
+       }
+}      
+//***************************************************************************************************************
+void Ccode::print(ostream& out) {
+       try {
+               
+               mothurOutEndLine();
+               
+               for (int i = 0; i < querySeqs.size(); i++) {
+                       
+               }
+       }
+       catch(exception& e) {
+               errorOut(e, "Ccode", "print");
+               exit(1);
+       }
+}
+
+//***************************************************************************************************************
+void Ccode::getChimeras() {
+       try {
+               
+               //read in query sequences and subject sequences
+               mothurOut("Reading sequences and template file... "); cout.flush();
+               querySeqs = readSeqs(fastafile);
+               templateSeqs = readSeqs(templateFile);
+               mothurOut("Done."); mothurOutEndLine();
+               
+               int numSeqs = querySeqs.size();
+               
+               closest.resize(numSeqs);
+               
+               //break up file if needed
+               int linesPerProcess = numSeqs / processors ;
+               
+               #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+                       //find breakup of sequences for all times we will Parallelize
+                       if (processors == 1) {   lines.push_back(new linePair(0, numSeqs));  }
+                       else {
+                               //fill line pairs
+                               for (int i = 0; i < (processors-1); i++) {                      
+                                       lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess)));
+                               }
+                               //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end
+                               int i = processors - 1;
+                               lines.push_back(new linePair((i*linesPerProcess), numSeqs));
+                       }
+                       
+                       //find breakup of templatefile for quantiles
+                       if (processors == 1) {   templateLines.push_back(new linePair(0, templateSeqs.size()));  }
+                       else { 
+                               for (int i = 0; i < processors; i++) {
+                                       templateLines.push_back(new linePair());
+                                       templateLines[i]->start = int (sqrt(float(i)/float(processors)) * templateSeqs.size());
+                                       templateLines[i]->end = int (sqrt(float(i+1)/float(processors)) * templateSeqs.size());
+                               }
+                       }
+               #else
+                       lines.push_back(new linePair(0, numSeqs));
+                       templateLines.push_back(new linePair(0, templateSeqs.size()));
+               #endif
+       
+               distCalc = new ignoreGaps();
+               
+               //find closest
+               if (processors == 1) { 
+                       mothurOut("Finding top matches for sequences... "); cout.flush();
+                       closest = findClosest(lines[0]->start, lines[0]->end, numWanted);
+                       mothurOut("Done."); mothurOutEndLine();
+               }else {         createProcessesClosest();               }
+
+               
+               for (int i = 0; i < closest.size(); i++) {
+                       cout << querySeqs[i]->getName() << ": ";
+                       for (int j = 0; j < closest[i].size(); j++) {
+                       
+                               cout << closest[i][j]->getName() << '\t';
+                       }
+                       cout << endl;
+               }
+                       
+               //free memory
+               for (int i = 0; i < lines.size(); i++)                                  {       delete lines[i];                                }
+               for (int i = 0; i < templateLines.size(); i++)                  {       delete templateLines[i];                }
+                       
+       }
+       catch(exception& e) {
+               errorOut(e, "Ccode", "getChimeras");
+               exit(1);
+       }
+}
+/***************************************************************************************************************
+vector<int> Ccode::findWindows() {
+       try {
+               
+               vector<int> win; 
+               
+               if (increment > querySeqs[0]->getAligned().length()) {  mothurOut("You have selected an increment larger than the length of your sequences.  I will use the default of 25.");  increment = 25; }
+               
+               for (int m = increment;  m < (querySeqs[0]->getAligned().length() - increment); m+=increment) {  win.push_back(m);  }
+
+               return win;
+       
+       }
+       catch(exception& e) {
+               errorOut(e, "Ccode", "findWindows");
+               exit(1);
+       }
+}
+*/
+//***************************************************************************************************************
+vector< vector<Sequence*> > Ccode::findClosest(int start, int end, int numWanted) {
+       try{
+       
+               vector< vector<Sequence*> > topMatches;  topMatches.resize(querySeqs.size());
+       
+               float smallestOverall, smallestLeft, smallestRight;
+               smallestOverall = 1000;  smallestLeft = 1000;  smallestRight = 1000;
+               
+               //for each sequence in querySeqs - find top matches to use as reference
+               for(int j = start; j < end; j++){
+                       
+                       Sequence query = *(querySeqs[j]);
+                       
+                       vector<SeqDist> distances;
+                       
+                       //calc distance to each sequence in template seqs
+                       for (int i = 0; i < templateSeqs.size(); i++) {
+                       
+                               Sequence ref = *(templateSeqs[i]); 
+                                       
+                               //find overall dist
+                               distCalc->calcDist(query, ref);
+                               float dist = distCalc->getDist();       
+                               
+                               //save distance
+                               SeqDist temp;
+                               temp.seq = templateSeqs[i];
+                               temp.dist = dist;
+
+                               distances.push_back(temp);
+                       }
+                       
+                       sort(distances.begin(), distances.end(), compareSeqDist);
+                       
+                       //save the number of top matches wanted
+                       for (int h = 0; h < numWanted; h++) {
+                               topMatches[j].push_back(distances[h].seq);
+                       }
+               }
+                       
+               return topMatches;
+
+       }
+       catch(exception& e) {
+               errorOut(e, "Ccode", "findClosestSides");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
+void Ccode::createProcessesClosest() {
+       try {
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+               int process = 0;
+               vector<int> processIDS;
+               
+               //loop through and create all the processes you want
+               while (process != processors) {
+                       int pid = fork();
+                       
+                       if (pid > 0) {
+                               processIDS.push_back(pid);  
+                               process++;
+                       }else if (pid == 0){
+                               
+                               mothurOut("Finding top matches for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
+                               closest = findClosest(lines[process]->start, lines[process]->end, numWanted);
+                               mothurOut("Done finding top matches for sequences " +  toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
+                               
+                               //write out data to file so parent can read it
+                               ofstream out;
+                               string s = toString(getpid()) + ".temp";
+                               openOutputFile(s, out);
+                               
+                               //output pairs
+                               for (int i = lines[process]->start; i < lines[process]->end; i++) {
+                                        out << closest[i].size() << endl;
+                                        for (int j = 0; j < closest[i].size(); j++) {
+                                               closest[i][j]->printSequence(out);
+                                        }
+                               }
+                               out.close();
+                               
+                               exit(0);
+                       }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
+               }
+               
+               //force parent to wait until all the processes are done
+               for (int i=0;i<processors;i++) { 
+                       int temp = processIDS[i];
+                       wait(&temp);
+               }
+               
+               //get data created by processes
+               for (int i=0;i<processors;i++) { 
+                       ifstream in;
+                       string s = toString(processIDS[i]) + ".temp";
+                       openInputFile(s, in);
+                       
+                       //get pairs
+                       for (int k = lines[i]->start; k < lines[i]->end; k++) {
+                               int size;
+                               in >> size;
+                               gobble(in);
+                               
+                               vector<Sequence*> tempVector;
+                               
+                               for (int j = 0; j < size; j++) {
+                               
+                                       Sequence* temp = new Sequence(in);
+                                       gobble(in);
+                                               
+                                       tempVector.push_back(temp);
+                               }
+                               
+                               closest[k] = tempVector;
+                       }
+                       
+                       in.close();
+                       remove(s.c_str());
+               }
+                       
+       
+#else
+               closest = findClosest(lines[0]->start, lines[0]->end, numWanted);
+#endif 
+
+       }
+       catch(exception& e) {
+               errorOut(e, "Ccode", "createProcessesClosest");
+               exit(1);
+       }
+}
+
+//***************************************************************************************************************
+
diff --git a/ccode.h b/ccode.h

new file mode 100644 (file)

index 0000000..a4860dc
--- /dev/null
+++ b/ccode.h
@@ -0,0 +1,62 @@
+#ifndef CCODE_H
+#define CCODE_H
+
+/*
+ *  ccode.h
+ *  Mothur
+ *
+ *  Created by westcott on 8/24/09.
+ *  Copyright 2009 Schloss LAB. All rights reserved.
+ *
+ */
+
+#include "chimera.h"
+#include "dist.h"
+#include "decalc.h"
+
+//This class was created using the algorythms described in the 
+// "Evaluating putative chimeric sequences from PCR-amplified products" paper 
+//by Juan M. Gonzalez, Johannes Zimmerman and Cesareo Saiz-Jimenez.
+
+/***********************************************************/
+
+class Ccode : public Chimera {
+       
+       public:
+               Ccode(string, string);  
+               ~Ccode();
+               
+               void getChimeras();
+               void print(ostream&);
+               
+               void setCons(string c)          {}
+               void setQuantiles(string q) {}
+               
+               
+       private:
+       
+               Dist* distCalc;
+               DeCalculator* decalc;
+               int iters;
+               string fastafile, templateFile;
+               
+               
+               vector<linePair*> lines;
+               vector<linePair*> templateLines;
+               vector<Sequence*> querySeqs;
+               vector<Sequence*> templateSeqs;
+               
+               vector< vector<Sequence*> > closest;  //bestfit[0] is a vector of sequence at are closest to queryseqs[0]...
+               
+               vector< vector<Sequence*> > findClosest(int, int, int); 
+               void removeSeqs(vector<Sequence*>);  //removes sequences from closest that are to different of too similar to eachother. 
+               
+               void createProcessesClosest();
+               
+};
+
+/***********************************************************/
+
+#endif
+
+
diff --git a/chimera.cpp b/chimera.cpp

index 48e5763f2dbffe8e19a11e22cb8b897b34317865..8aaab4b12a4c5c261f8a7a4494ca7ef4f12ea923 100644 (file)
--- a/chimera.cpp
+++ b/chimera.cpp
@@ -9,6 +9,76 @@
  
  #include "chimera.h"
  
+//***************************************************************************************************************
+//this is a vertical filter
+void Chimera::createFilter(vector<Sequence*> seqs) {
+       try {
+               
+               int threshold = int (0.5 * seqs.size());
+//cout << "threshhold = " << threshold << endl;
+               
+               vector<int> gaps;       gaps.resize(seqs[0]->getAligned().length(), 0);
+               vector<int> a;          a.resize(seqs[0]->getAligned().length(), 0);
+               vector<int> t;          t.resize(seqs[0]->getAligned().length(), 0);
+               vector<int> g;          g.resize(seqs[0]->getAligned().length(), 0);
+               vector<int> c;          c.resize(seqs[0]->getAligned().length(), 0);
+               
+               filterString = (string(seqs[0]->getAligned().length(), '1'));
+               
+               //for each sequence
+               for (int i = 0; i < seqs.size(); i++) {
+               
+                       string seqAligned = seqs[i]->getAligned();
+                       
+                       for (int j = 0; j < seqAligned.length(); j++) {
+                               //if this spot is a gap
+                               if ((seqAligned[j] == '-') || (seqAligned[j] == '.'))   {       gaps[j]++;      }
+                               else if (toupper(seqAligned[j]) == 'A')                                 {       a[j]++;         }
+                               else if (toupper(seqAligned[j]) == 'T')                                 {       t[j]++;         }
+                               else if (toupper(seqAligned[j]) == 'G')                                 {       g[j]++;         }
+                               else if (toupper(seqAligned[j]) == 'C')                                 {       c[j]++;         }
+                       }
+               }
+               
+               //zero out spot where all sequences have blanks
+               for(int i = 0;i < seqs[0]->getAligned().length(); i++){
+                       if(gaps[i] == seqs.size())      {       filterString[i] = '0';  }
+                       
+                       else if (((a[i] < threshold) && (t[i] < threshold) && (g[i] < threshold) && (c[i] < threshold))) {      filterString[i] = '0';  }
+                       //cout << "a = " << a[i] <<  " t = " << t[i] <<  " g = " << g[i] <<  " c = " << c[i] << endl;
+               }
+//cout << "filter = " << filterString << endl; 
+       }
+       catch(exception& e) {
+               errorOut(e, "Chimera", "createFilter");
+               exit(1);
+       }
+}
+//***************************************************************************************************************
+void Chimera::runFilter(vector<Sequence*> seqs) {
+       try {
+               
+               //for each sequence
+               for (int i = 0; i < seqs.size(); i++) {
+               
+                       string seqAligned = seqs[i]->getAligned();
+                       string newAligned = "";
+                       
+                       for (int j = 0; j < seqAligned.length(); j++) {
+                               //if this spot is a gap
+                               if (filterString[j] == '1') { newAligned += seqAligned[j]; }
+                       }
+                       
+                       seqs[i]->setAligned(newAligned);
+               }
+               
+       }
+       catch(exception& e) {
+               errorOut(e, "Chimera", "runFilter");
+               exit(1);
+       }
+}
+
  //***************************************************************************************************************
  vector<Sequence*> Chimera::readSeqs(string file) {
         try {
@@ -61,3 +131,46 @@ void Chimera::setMask(string filename) {
         }
  }
  //***************************************************************************************************************
+
+vector< vector<float> > Chimera::readQuantiles() {
+       try {
+       
+               ifstream in;
+               openInputFile(quanfile, in);
+               
+               vector< vector<float> > quan;
+       
+               int num; float ten, twentyfive, fifty, seventyfive, ninetyfive, ninetynine; 
+               
+               while(!in.eof()){
+                       
+                       in >> num >> ten >> twentyfive >> fifty >> seventyfive >> ninetyfive >> ninetynine; 
+                       
+                       vector <float> temp;
+                       
+                       temp.push_back(ten); 
+                       temp.push_back(twentyfive);
+                       temp.push_back(fifty);
+                       temp.push_back(seventyfive);
+                       temp.push_back(ninetyfive);
+                       temp.push_back(ninetynine);
+                       
+                       quan.push_back(temp);  
+       
+                       gobble(in);
+               }
+               
+               in.close();
+               return quan;
+               
+       }
+       catch(exception& e) {
+               errorOut(e, "Chimera", "readQuantiles");
+               exit(1);
+       }
+}
+//***************************************************************************************************************
+
+
+
+
diff --git a/chimera.h b/chimera.h

index fd6b53a26778ff247ba43ee5f5a5bc6b6fa6c5c8..366068b43f77cae77f6df9ce82766e253a8789e1 100644 (file)
--- a/chimera.h
+++ b/chimera.h
@@ -12,12 +12,45 @@
  
  
  #include "mothur.h"
-#include "sparsematrix.hpp"
  #include "sequence.hpp"
  
-typedef list<PCell>::iterator MatData;
-typedef map<int, float> SeqMap;  //maps sequence to all distance for that seqeunce
  
+struct Preference {
+               string name;
+               vector<string> leftParent; //keep the name of closest left associated with the two scores
+               vector<string> rightParent; //keep the name of closest right associated with the two scores
+               vector<float> score;  //so you can keep last score and calc this score and keep whichever is bigger.
+               vector<float> closestLeft;  //keep the closest left associated with the two scores
+               vector<float> closestRight; //keep the closest right associated with the two scores
+               int midpoint;
+
+};
+
+struct SeqDist {
+       Sequence* seq;
+       float dist;
+};
+
+//********************************************************************************************************************
+//sorts lowest to highest
+inline bool compareSeqDist(SeqDist left, SeqDist right){
+       return (left.dist < right.dist);        
+} 
+//********************************************************************************************************************
+
+struct sim {
+               Sequence* leftParent;
+               Sequence* rightParent; 
+               float score;  
+               int midpoint;
+};
+
+struct linePair {
+                       int start;
+                       int end;
+                       linePair(int i, int j) : start(i), end(j) {}
+                       linePair(){}
+};
  
  /***********************************************************************/
  
@@ -29,16 +62,21 @@ class Chimera {
                 Chimera(string);
                 Chimera(string, string);
                 virtual ~Chimera(){};
-               virtual void setFilter(bool f)                  {       filter = f;                     }
+               virtual void setFilter(bool f)                  {       filter = f;                     }
                 virtual void setCorrection(bool c)              {       correction = c;         }
                 virtual void setProcessors(int p)               {       processors = p;         }
                 virtual void setWindow(int w)                   {       window = w;                     }
                 virtual void setIncrement(int i)                {       increment = i;          }
+               virtual void setNumWanted(int n)                {       numWanted = n;          }
                 
-               virtual void setCons(string) {};
-               virtual void setQuantiles(string) {};
+               virtual void setCons(string){};
+               virtual void setQuantiles(string){};
                 virtual vector<Sequence*> readSeqs(string);
+               virtual vector< vector<float> > readQuantiles();
                 virtual void setMask(string);
+               virtual void runFilter(vector<Sequence*>);
+               virtual void createFilter(vector<Sequence*>);
+               
                 
                 //pure functions
                 virtual void getChimeras() = 0; 
@@ -47,8 +85,8 @@ class Chimera {
         protected:
                 
                 bool filter, correction;
-               int processors, window, increment;
-               string seqMask;
+               int processors, window, increment, numWanted;
+               string seqMask, quanfile, filterString;
                         
  
  };
diff --git a/chimeraseqscommand.cpp b/chimeraseqscommand.cpp

index f980310008cb562c8128344e17a477db60801d76..6af8aa3f3d4f1ae3085743eb90b4472edfbfe301 100644 (file)
--- a/chimeraseqscommand.cpp
+++ b/chimeraseqscommand.cpp
@@ -10,6 +10,8 @@
  #include "chimeraseqscommand.h"
  #include "bellerophon.h"
  #include "pintail.h"
+#include "alignedsimilarity.h"
+#include "ccode.h"
  
  
  //***************************************************************************************************************
@@ -23,7 +25,7 @@ ChimeraSeqsCommand::ChimeraSeqsCommand(string option){
                 
                 else {
                         //valid paramters for this command
-                       string Array[] =  {"fasta", "filter", "correction", "processors", "method", "window", "increment", "template", "conservation", "quantile", "mask" };
+                       string Array[] =  {"fasta", "filter", "correction", "processors", "method", "window", "increment", "template", "conservation", "quantile", "mask", "numwanted" };
                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
                         
                         OptionParser parser(option);
@@ -61,9 +63,6 @@ ChimeraSeqsCommand::ChimeraSeqsCommand(string option){
                                 if (ableToOpen == 1) { abort = true; }
                                 in.close();
                         }
-               
-
-                       
  
                         string temp;
                         temp = validParameter.validFile(parameters, "filter", false);                   if (temp == "not found") { temp = "F"; }
@@ -80,10 +79,13 @@ ChimeraSeqsCommand::ChimeraSeqsCommand(string option){
                                         
                         temp = validParameter.validFile(parameters, "increment", false);                        if (temp == "not found") { temp = "25"; }
                         convert(temp, increment);
-                               
+                       
+                       temp = validParameter.validFile(parameters, "numwanted", false);                        if (temp == "not found") { temp = "20"; }
+                       convert(temp, numwanted);
+
                         method = validParameter.validFile(parameters, "method", false);         if (method == "not found") { method = "pintail"; }
                         
-                       if ((method == "pintail") && (templatefile == "")) { mothurOut("You must provide a template file with the pintail method."); mothurOutEndLine(); abort = true;  }
+                       if (((method == "pintail") || (method == "alignsim")) && (templatefile == "")) { mothurOut("You must provide a template file with the pintail and alignsim methods."); mothurOutEndLine(); abort = true;  }
                         
  
                 }
@@ -101,7 +103,7 @@ void ChimeraSeqsCommand::help(){
                 mothurOut("The chimera.seqs command reads a fastafile and creates list of potentially chimeric sequences.\n");
                 mothurOut("The chimera.seqs command parameters are fasta, filter, correction, processors, mask, method, window, increment, template, conservation and quantile.\n");
                 mothurOut("The fasta parameter is always required and template is required if using pintail.\n");
-               mothurOut("The filter parameter allows you to specify if you would like to apply a 50% soft filter this only applies when the method is bellerphon.  The default is false. \n");
+               mothurOut("The filter parameter allows you to specify if you would like to apply a vertical and 50% soft filter.  The default is false. \n");
                 mothurOut("The correction parameter allows you to put more emphasis on the distance between highly similar sequences and less emphasis on the differences between remote homologs.   The default is true. This only applies when the method is bellerphon.\n");
                 mothurOut("The processors parameter allows you to specify how many processors you would like to use.  The default is 1. \n");
                 mothurOut("The method parameter allows you to specify the method for finding chimeric sequences.  The default is pintail. Options include..... \n");
@@ -134,8 +136,10 @@ int ChimeraSeqsCommand::execute(){
                 
                 if (abort == true) { return 0; }
                 
-               if (method == "bellerophon")    {               chimera = new Bellerophon(fastafile);                   }
-               else if (method == "pintail")   {               chimera = new Pintail(fastafile, templatefile); }
+               if (method == "bellerophon")    {               chimera = new Bellerophon(fastafile);                           }
+               else if (method == "pintail")   {               chimera = new Pintail(fastafile, templatefile);         }
+               //else if (method == "alignsim")        {               chimera = new AlignSim(fastafile, templatefile);        }
+               else if (method == "ccode")             {               chimera = new Ccode(fastafile, templatefile);           }
                 else { mothurOut("Not a valid method."); mothurOutEndLine(); return 0;          }
                 
                 //set user options
@@ -155,6 +159,7 @@ int ChimeraSeqsCommand::execute(){
                 chimera->setProcessors(processors);
                 chimera->setWindow(window);
                 chimera->setIncrement(increment);
+               chimera->setNumWanted(numwanted);
                                 
                 //find chimeras
                 chimera->getChimeras();
diff --git a/chimeraseqscommand.h b/chimeraseqscommand.h

index 85d573246a56f83c73140ac393c96cdf66397097..6274c48ea2acd8849155e9fc3b8813f6bae74919 100644 (file)
--- a/chimeraseqscommand.h
+++ b/chimeraseqscommand.h
@@ -30,7 +30,7 @@ private:
         bool abort;
         string method, fastafile, templatefile, consfile, quanfile, maskfile;
         bool filter, correction;
-       int processors, midpoint, averageLeft, averageRight, window, iters, increment;
+       int processors, midpoint, averageLeft, averageRight, window, iters, increment, numwanted;
         Chimera* chimera;
         
         
diff --git a/decalc.cpp b/decalc.cpp

index 860b12f3c380088606088e99f952fd114b9ae021..05897517d061540e6548b9bf2c51c9c97443248d 100644 (file)
--- a/decalc.cpp
+++ b/decalc.cpp
@@ -251,9 +251,16 @@ float DeCalculator::calcDE(vector<float> obs, vector<float> exp) {
                 
                 //for each window
                 float sum = 0.0;  //sum = sum from 1 to m of (oi-ei)^2
-               for (int m = 0; m < obs.size(); m++) {          sum += ((obs[m] - exp[m]) * (obs[m] - exp[m]));         }
+               int numZeros = 0;
+               for (int m = 0; m < obs.size(); m++) {          
                         
-               float de = sqrt((sum / (obs.size() - 1)));
+                       //if (obs[m] != 0.0) {
+                               sum += ((obs[m] - exp[m]) * (obs[m] - exp[m])); 
+                       //}else {  numZeros++;   }
+                       
+               }
+                       
+               float de = sqrt((sum / (obs.size() - 1 - numZeros)));
                         
                 return de;
         }
@@ -366,6 +373,9 @@ vector< vector<quanMember> > DeCalculator::getQuantiles(vector<Sequence*> seqs,
                 
                 //percentage of mismatched pairs 1 to 100
                 quan.resize(100);
+//ofstream o;
+//string out = "getQuantiles.out";
+//openOutputFile(out, o);
                 
                 //for each sequence
                 for(int i = start; i < end; i++){
@@ -402,7 +412,7 @@ vector< vector<quanMember> > DeCalculator::getQuantiles(vector<Sequence*> seqs,
                                 float de = calcDE(obsi, exp);
                                                                 
                                 float dist = calcDist(query, subject, front, back); 
-                               
+       //o << i << '\t' <<  j << '\t' << dist << '\t' << de << endl;                   
                                 dist = ceil(dist);
                                 
                                 quanMember newScore(de, i, j);
@@ -424,19 +434,16 @@ vector< vector<quanMember> > DeCalculator::getQuantiles(vector<Sequence*> seqs,
                 exit(1);
         }
  }
-
+//********************************************************************************************************************
+//sorts lowest to highest
+inline bool compareQuanMembers(quanMember left, quanMember right){
+       return (left.score < right.score);      
+} 
  //***************************************************************************************************************
  //this was going to be used by pintail to increase the sensitivity of the chimera detection, but it wasn't quite right.  may want to revisit in the future...
-vector< vector<float> > DeCalculator::removeObviousOutliers(vector< vector<quanMember> >& quantiles, int num) {
+void DeCalculator::removeObviousOutliers(vector< vector<quanMember> >& quantiles, int num) {
         try {
-               vector< vector<float> > quan; 
-               quan.resize(100);
-       
-               /*vector<quanMember> contributions;  
-               vector<int> seen;  //seen[0] is the number of outliers that template seqs[0] was part of.
-               seen.resize(num,0);
-                               
-               //find contributions
+                                               
                 for (int i = 0; i < quantiles.size(); i++) {
                 
                         //find mean of this quantile score
@@ -444,22 +451,21 @@ vector< vector<float> > DeCalculator::removeObviousOutliers(vector< vector<quanM
                         
                         float high = quantiles[i][int(quantiles[i].size() * 0.99)].score;
                         float low =  quantiles[i][int(quantiles[i].size() * 0.01)].score;
-               
+                       
+                       vector<quanMember> temp;
+                       
                         //look at each value in quantiles to see if it is an outlier
                         for (int j = 0; j < quantiles[i].size(); j++) {
-                               
                                 //is this score between 1 and 99%
                                 if ((quantiles[i][j].score > low) && (quantiles[i][j].score < high)) {
-                                       
-                               }else {
-                                       //add to contributions
-                                       contributions.push_back(quantiles[i][j]);
-                                       seen[quantiles[i][j].member1]++;
-                                       seen[quantiles[i][j].member2]++;
+                                       temp.push_back(quantiles[i][j]);
                                 }
                         }
+                       
+                       quantiles[i] = temp;
                 }
  
+/*
                 //find contributer with most offending score related to it
                 int largestContrib = findLargestContrib(seen);
         
@@ -538,7 +544,7 @@ cout << "high = " << high << endl;
                         
                 }
  */
-               return quan;
+               
         }
         catch(exception& e) {
                 errorOut(e, "DeCalculator", "removeObviousOutliers");
diff --git a/decalc.h b/decalc.h

index a307497032004c5926c4c7d88f6f3e55c42232b2..da0f96cd72b69acf3350a1565f66e7586644ff12 100644 (file)
--- a/decalc.h
+++ b/decalc.h
@@ -44,7 +44,7 @@ class DeCalculator {
                 void setAlignmentLength(int l) {  alignLength = l;  }
                 void runMask(Sequence*);
                 void trimSeqs(Sequence*, Sequence*, map<int, int>&);
-               vector< vector<float> > removeObviousOutliers(vector< vector<quanMember> >&, int);
+               void removeObviousOutliers(vector< vector<quanMember> >&, int);
                 vector<float> calcFreq(vector<Sequence*>, string);
                 vector<int> findWindows(Sequence*, int, int, int&, int);
                 vector<float> calcObserved(Sequence*, Sequence*, vector<int>, int);
diff --git a/pintail.cpp b/pintail.cpp

index f46e0e290f0c99f344bb9e809c7c5b20989e9da0..04dfb0a76467ab6b030cfa60b8462dd7d30afd73 100644 (file)
--- a/pintail.cpp
+++ b/pintail.cpp
@@ -9,6 +9,7 @@
  
  #include "pintail.h"
  #include "ignoregaps.h"
+#include "eachgapdist.h"
  
  //********************************************************************************************************************
  //sorts lowest to highest
@@ -41,9 +42,11 @@ void Pintail::print(ostream& out) {
                         
                         int index = ceil(deviation[i]);
                         
+                       if (index == 0) { index=1;  }
+                                               
                         //is your DE value higher than the 95%
                         string chimera;
-                       if (DE[i] > quantiles[index][4])                {       chimera = "Yes";        }
+                       if (DE[i] > quantiles[index-1][4])              {       chimera = "Yes";        }
                         else                                                                    {       chimera = "No";         }
                         
                         out << querySeqs[i]->getName() << '\t' << "div: " << deviation[i] << "\tstDev: " << DE[i] << "\tchimera flag: " << chimera << endl;
@@ -125,7 +128,7 @@ void Pintail::getChimeras() {
                         templateLines.push_back(new linePair(0, templateSeqs.size()));
                 #endif
                 
-               distcalculator = new ignoreGaps();
+               distcalculator = new eachGapDist();
                 decalc = new DeCalculator();
                 
                 //if the user does enter a mask then you want to keep all the spots in the alignment
@@ -141,7 +144,18 @@ void Pintail::getChimeras() {
                         mothurOut("Done."); mothurOutEndLine();
                 }else {         createProcessesPairs();         }
                 
-               
+/*string o = "foronlinepintailpairs-eachgap";
+ofstream out7;
+openOutputFile(o, out7);
+
+for (int i = 0; i < bestfit.size(); i++) {
+       out7 << querySeqs[i]->getName() << endl;
+       out7 << querySeqs[i]->getUnaligned() << endl << endl;
+       
+       out7 << bestfit[i]->getName() << endl;
+       out7 << bestfit[i]->getUnaligned() << endl << endl << endl;
+}              
+out7.close();/*/       
                 //find P
                 mothurOut("Getting conservation... "); cout.flush();
                 if (consfile == "") { 
@@ -171,7 +185,19 @@ void Pintail::getChimeras() {
                         }
  
                 }
-                                               
+               
+               if (filter) {
+                       vector<Sequence*> temp = templateSeqs;
+                       for (int i = 0; i < querySeqs.size(); i++) { temp.push_back(querySeqs[i]);  }
+                       
+                       createFilter(temp);
+                       
+                       runFilter(querySeqs);
+                       runFilter(templateSeqs);
+                       runFilter(bestfit);
+               }
+                                                                               
+                                                                                                                                               
                 if (processors == 1) { 
         
                         for (int j = 0; j < bestfit.size(); j++) { 
@@ -241,26 +267,22 @@ void Pintail::getChimeras() {
                 //quantiles are used to determine whether the de values found indicate a chimera
                 //if you have to calculate them, its time intensive because you are finding the de and deviation values for each 
                 //combination of sequences in the template
-               if (quanfile != "") {  quantiles =  readQuantiles();  }
+               if (quanfile != "") {  quantiles = readQuantiles();  }
                 else {
                         
                         mothurOut("Calculating quantiles for your template.  This can take a while...  I will output the quantiles to a .quan file that you can input them using the quantiles parameter next time you run this command.  Providing the .quan file will dramatically improve speed.    "); cout.flush();
                         if (processors == 1) { 
                                 quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size());
                         }else {         createProcessesQuan();          }
+               
                         
+                       ofstream out4, out5;
+                       string noOutliers, outliers;
                         
-                                               
-                       
-                       //decided against this because we were having trouble setting the sensitivity... may want to revisit this...
-                       //quantiles = decalc->removeObviousOutliers(quantilesMembers, templateSeqs.size());
-                       
-                       ofstream out4;
-                       string o;
-                       
-                       o = getRootName(templateFile) + "quan";
+                       noOutliers = getRootName(templateFile) + "pintail.quanNOOUTLIERS";
+                       outliers = getRootName(templateFile) + "pintail.quanYESOUTLIERS";
                         
-                       openOutputFile(o, out4);
+                       openOutputFile(outliers, out4);
                         
                         //adjust quantiles
                         for (int i = 0; i < quantilesMembers.size(); i++) {
@@ -301,6 +323,47 @@ void Pintail::getChimeras() {
                         
                         out4.close();
                         
+                       decalc->removeObviousOutliers(quantilesMembers, templateSeqs.size());
+                       
+                       openOutputFile(noOutliers, out5);
+                       
+                       //adjust quantiles
+                       for (int i = 0; i < quantilesMembers.size(); i++) {
+                               vector<float> temp;
+                               
+                               if (quantilesMembers[i].size() == 0) {
+                                       //in case this is not a distance found in your template files
+                                       for (int g = 0; g < 6; g++) {
+                                               temp.push_back(0.0);
+                                       }
+                               }else{
+                                       
+                                       sort(quantilesMembers[i].begin(), quantilesMembers[i].end(), compareQuanMembers);
+                                       
+                                       //save 10%
+                                       temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.10)].score);
+                                       //save 25%
+                                       temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.25)].score);
+                                       //save 50%
+                                       temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.5)].score);
+                                       //save 75%
+                                       temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.75)].score);
+                                       //save 95%
+                                       temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.95)].score);
+                                       //save 99%
+                                       temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.99)].score);
+                                       
+                               }
+                               
+                               //output quan value
+                               out5 << i+1 << '\t';                            
+                               for (int u = 0; u < temp.size(); u++) {   out5 << temp[u] << '\t'; }
+                               out5 << endl;
+                               
+                               quantiles[i] = temp;
+                               
+                       }
+
                         mothurOut("Done."); mothurOutEndLine();
                         
                 }
@@ -360,45 +423,6 @@ vector<float> Pintail::readFreq() {
         }
  }
  
-//***************************************************************************************************************
-
-vector< vector<float> > Pintail::readQuantiles() {
-       try {
-       
-               ifstream in;
-               openInputFile(quanfile, in);
-               
-               vector< vector<float> > quan;
-       
-               int num; float ten, twentyfive, fifty, seventyfive, ninetyfive, ninetynine; 
-               
-               while(!in.eof()){
-                       
-                       in >> num >> ten >> twentyfive >> fifty >> seventyfive >> ninetyfive >> ninetynine; 
-                       
-                       vector <float> temp;
-                       
-                       temp.push_back(ten); 
-                       temp.push_back(twentyfive);
-                       temp.push_back(fifty);
-                       temp.push_back(seventyfive);
-                       temp.push_back(ninetyfive);
-                       temp.push_back(ninetynine);
-                       
-                       quan.push_back(temp);  
-                       
-                       gobble(in);
-               }
-               
-               in.close();
-               return quan;
-               
-       }
-       catch(exception& e) {
-               errorOut(e, "Pintail", "readQuantiles");
-               exit(1);
-       }
-}
  //***************************************************************************************************************
  //calculate the distances from each query sequence to all sequences in the template to find the closest sequence
  vector<Sequence*> Pintail::findPairs(int start, int end) {
diff --git a/pintail.h b/pintail.h

index ddfba85efdf4d3d424443ed1fba1ecee6ae101eb..37efd00b86793f064770784035d5e7dbfb78ee15 100644 (file)
--- a/pintail.h
+++ b/pintail.h
@@ -35,17 +35,10 @@ class Pintail : public Chimera {
                 
         private:
         
-               struct linePair {
-                       int start;
-                       int end;
-                       linePair(int i, int j) : start(i), end(j) {}
-                       linePair(){}
-               };
-
                 Dist* distcalculator;
                 DeCalculator* decalc;
                 int iters;
-               string fastafile, templateFile, consfile, quanfile;
+               string fastafile, templateFile, consfile;
                 
                 
                 vector<linePair*> lines;
@@ -77,7 +70,6 @@ class Pintail : public Chimera {
                 
                 
                 vector<float> readFreq();
-               vector< vector<float> > readQuantiles();
                 vector<Sequence*> findPairs(int, int);
                         
                 void createProcessesSpots();
author	westcott <westcott>
	Mon, 31 Aug 2009 14:33:57 +0000 (14:33 +0000)
committer	westcott <westcott>
	Mon, 31 Aug 2009 14:33:57 +0000 (14:33 +0000)
Mothur.xcodeproj/project.pbxproj		patch \| blob \| history
alignedsimilarity.cpp	[new file with mode: 0644]	patch \| blob
alignedsimilarity.h	[new file with mode: 0644]	patch \| blob
bellerophon.cpp		patch \| blob \| history
bellerophon.h		patch \| blob \| history
ccode.cpp	[new file with mode: 0644]	patch \| blob
ccode.h	[new file with mode: 0644]	patch \| blob
chimera.cpp		patch \| blob \| history
chimera.h		patch \| blob \| history
chimeraseqscommand.cpp		patch \| blob \| history
chimeraseqscommand.h		patch \| blob \| history
decalc.cpp		patch \| blob \| history
decalc.h		patch \| blob \| history
pintail.cpp		patch \| blob \| history
pintail.h		patch \| blob \| history