A70B53AA0F4CD7AD0064797E /* getgroupcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70B53A40F4CD7AD0064797E /* getgroupcommand.cpp */; };
A70B53AB0F4CD7AD0064797E /* getlabelcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70B53A60F4CD7AD0064797E /* getlabelcommand.cpp */; };
A70B53AC0F4CD7AD0064797E /* getlinecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70B53A80F4CD7AD0064797E /* getlinecommand.cpp */; };
+ A75B887D104C16860083C454 /* alignedsimilarity.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A75B8879104C16860083C454 /* alignedsimilarity.cpp */; };
+ A75B887E104C16860083C454 /* ccode.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A75B887B104C16860083C454 /* ccode.cpp */; };
EB1216880F619B83004A865F /* bergerparker.cpp in Sources */ = {isa = PBXBuildFile; fileRef = EB1216870F619B83004A865F /* bergerparker.cpp */; };
EB1216E50F61ACFB004A865F /* bstick.cpp in Sources */ = {isa = PBXBuildFile; fileRef = EB1216E40F61ACFB004A865F /* bstick.cpp */; };
EB1217230F61C9AC004A865F /* sharedkstest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = EB1217220F61C9AC004A865F /* sharedkstest.cpp */; };
A70B53A70F4CD7AD0064797E /* getlabelcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getlabelcommand.h; sourceTree = SOURCE_ROOT; };
A70B53A80F4CD7AD0064797E /* getlinecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getlinecommand.cpp; sourceTree = SOURCE_ROOT; };
A70B53A90F4CD7AD0064797E /* getlinecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getlinecommand.h; sourceTree = SOURCE_ROOT; };
+ A75B8879104C16860083C454 /* alignedsimilarity.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = alignedsimilarity.cpp; sourceTree = "<group>"; };
+ A75B887A104C16860083C454 /* alignedsimilarity.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = alignedsimilarity.h; sourceTree = "<group>"; };
+ A75B887B104C16860083C454 /* ccode.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ccode.cpp; sourceTree = "<group>"; };
+ A75B887C104C16860083C454 /* ccode.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ccode.h; sourceTree = "<group>"; };
C6859E8B029090EE04C91782 /* Mothur.1 */ = {isa = PBXFileReference; lastKnownFileType = text.man; path = Mothur.1; sourceTree = "<group>"; };
EB1216860F619B83004A865F /* bergerparker.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = bergerparker.h; sourceTree = SOURCE_ROOT; };
EB1216870F619B83004A865F /* bergerparker.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = bergerparker.cpp; sourceTree = SOURCE_ROOT; };
372095C1103196D70004D347 /* chimera.cpp */,
374F2FE9100634B600E97C66 /* bellerophon.h */,
374F2FEA100634B600E97C66 /* bellerophon.cpp */,
+ A75B8879104C16860083C454 /* alignedsimilarity.cpp */,
+ A75B887A104C16860083C454 /* alignedsimilarity.h */,
+ A75B887B104C16860083C454 /* ccode.cpp */,
+ A75B887C104C16860083C454 /* ccode.h */,
372A55531017922B00C5194B /* decalc.h */,
372A55541017922B00C5194B /* decalc.cpp */,
374F301110063B6F00E97C66 /* pintail.h */,
374F301310063B6F00E97C66 /* pintail.cpp in Sources */,
372A55551017922B00C5194B /* decalc.cpp in Sources */,
372095C2103196D70004D347 /* chimera.cpp in Sources */,
+ A75B887D104C16860083C454 /* alignedsimilarity.cpp in Sources */,
+ A75B887E104C16860083C454 /* ccode.cpp in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
--- /dev/null
+/*
+ * alignedsimilarity.cpp
+ * Mothur
+ *
+ * Created by westcott on 8/18/09.
+ * Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "alignedsimilarity.h"
+#include "ignoregaps.h"
+
+
+//***************************************************************************************************************
+AlignSim::AlignSim(string filename, string temp) { fastafile = filename; templateFile = temp; }
+//***************************************************************************************************************
+
+AlignSim::~AlignSim() {
+ try {
+ for (int i = 0; i < querySeqs.size(); i++) { delete querySeqs[i]; }
+ for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; }
+ delete distCalc;
+ }
+ catch(exception& e) {
+ errorOut(e, "AlignSim", "~AlignSim");
+ exit(1);
+ }
+}
+//***************************************************************************************************************
+void AlignSim::print(ostream& out) {
+ try {
+
+ mothurOutEndLine();
+
+ for (int i = 0; i < querySeqs.size(); i++) {
+
+ int j = 0; float largest = -10;
+ //find largest sim value
+ for (int k = 0; k < IS[i].size(); k++) {
+ //is this score larger
+ if (IS[i][k].score > largest) {
+ j = k;
+ largest = IS[i][k].score;
+ }
+ }
+
+ //find parental similarity
+ distCalc->calcDist(*(IS[i][j].leftParent), *(IS[i][j].rightParent));
+ float dist = distCalc->getDist();
+
+ //convert to similarity
+ dist = (1 - dist) * 100;
+
+ //warn about parental similarity - if its above 82% may not detect a chimera
+ if (dist >= 82) { mothurOut("When the chimeras parental similarity is above 82%, detection rates drop signifigantly."); mothurOutEndLine(); }
+
+ int index = ceil(dist);
+
+ if (index == 0) { index=1; }
+
+ //is your DE value higher than the 95%
+ string chimera;
+ if (IS[i][j].score > quantile[index-1][4]) { chimera = "Yes"; }
+ else { chimera = "No"; }
+
+ out << querySeqs[i]->getName() << "\tparental similarity: " << dist << "\tIS: " << IS[i][j].score << "\tbreakpoint: " << IS[i][j].midpoint << "\tchimera flag: " << chimera << endl;
+
+ if (chimera == "Yes") {
+ mothurOut(querySeqs[i]->getName() + "\tparental similarity: " + toString(dist) + "\tIS: " + toString(IS[i][j].score) + "\tbreakpoint: " + toString(IS[i][j].midpoint) + "\tchimera flag: " + chimera); mothurOutEndLine();
+ }
+ out << "Improvement Score\t";
+
+ for (int r = 0; r < IS[i].size(); r++) { out << IS[i][r].score << '\t'; }
+ out << endl;
+ }
+ }
+ catch(exception& e) {
+ errorOut(e, "AlignSim", "print");
+ exit(1);
+ }
+}
+
+//***************************************************************************************************************
+void AlignSim::getChimeras() {
+ try {
+
+ //read in query sequences and subject sequences
+ mothurOut("Reading sequences and template file... "); cout.flush();
+ querySeqs = readSeqs(fastafile);
+ templateSeqs = readSeqs(templateFile);
+ mothurOut("Done."); mothurOutEndLine();
+
+ int numSeqs = querySeqs.size();
+
+ IS.resize(numSeqs);
+
+ //break up file if needed
+ int linesPerProcess = numSeqs / processors ;
+
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ //find breakup of sequences for all times we will Parallelize
+ if (processors == 1) { lines.push_back(new linePair(0, numSeqs)); }
+ else {
+ //fill line pairs
+ for (int i = 0; i < (processors-1); i++) {
+ lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess)));
+ }
+ //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end
+ int i = processors - 1;
+ lines.push_back(new linePair((i*linesPerProcess), numSeqs));
+ }
+
+ //find breakup of templatefile for quantiles
+ if (processors == 1) { templateLines.push_back(new linePair(0, templateSeqs.size())); }
+ else {
+ for (int i = 0; i < processors; i++) {
+ templateLines.push_back(new linePair());
+ templateLines[i]->start = int (sqrt(float(i)/float(processors)) * templateSeqs.size());
+ templateLines[i]->end = int (sqrt(float(i+1)/float(processors)) * templateSeqs.size());
+ }
+ }
+ #else
+ lines.push_back(new linePair(0, numSeqs));
+ templateLines.push_back(new linePair(0, templateSeqs.size()));
+ #endif
+
+
+ distCalc = new ignoreGaps();
+
+ //find window breaks
+ windowBreak = findWindows();
+//for (int i = 0; i < windowBreak.size(); i++) { cout << windowBreak[i] << '\t'; }
+//cout << endl;
+
+ mothurOut("Finding the IS values for your sequences..."); cout.flush();
+ if (processors == 1) {
+ IS = findIS(lines[0]->start, lines[0]->end, querySeqs);
+ }else { IS = createProcessesIS(querySeqs, lines); }
+ mothurOut("Done."); mothurOutEndLine();
+
+ //quantiles are used to determine whether the de values found indicate a chimera
+ if (quanfile != "") { quantile = readQuantiles(); }
+ else {
+
+ mothurOut("Calculating quantiles for your template. This can take a while... I will output the quantiles to a .alignedsimilarity.quan file that you can input them using the quantiles parameter next time you run this command. Providing the .quan file will dramatically improve speed. "); cout.flush();
+ //get IS quantiles as a reference to these IS scores
+ //you are assuming the template is free of chimeras and therefore will give you a baseline as to the scores you would like to see
+ if (processors == 1) {
+ templateIS = findIS(templateLines[0]->start, templateLines[0]->end, templateSeqs);
+ }else { templateIS = createProcessesIS(templateSeqs, templateLines); }
+//cout << "here" << endl;
+ ofstream out4;
+ string o;
+
+ o = getRootName(templateFile) + "alignedsimilarity.quan";
+
+ openOutputFile(o, out4);
+
+ //adjust quantiles
+ //construct table
+ quantile.resize(100);
+
+ for (int i = 0; i < templateIS.size(); i++) {
+
+ if (templateIS[i].size() != 0) {
+ int j = 0; float score = -1000;
+ //find highest IS score
+ //cout << templateIS[i].size() << endl;
+ for (int k = 0; k < templateIS[i].size(); k++) {
+ if (templateIS[i][k].score > score) {
+ score = templateIS[i][k].score;
+ j = k;
+ }
+ }
+ //cout << j << endl;
+ //find similarity of parents
+ distCalc->calcDist(*(templateIS[i][j].leftParent), *(templateIS[i][j].rightParent));
+ float dist = distCalc->getDist();
+
+ //convert to similarity
+ dist = (1 - dist) * 100;
+ // cout << dist << endl;
+ int index = ceil(dist);
+ // cout << "index = " << index << endl;
+ if (index == 0) { index = 1; }
+ quantile[index-1].push_back(templateIS[i][j].score);
+ }
+ }
+
+
+ for (int i = 0; i < quantile.size(); i++) {
+ vector<float> temp;
+
+ if (quantile[i].size() == 0) {
+ //in case this is not a distance found in your template files
+ for (int g = 0; g < 6; g++) {
+ temp.push_back(0.0);
+ }
+ }else{
+
+ sort(quantile[i].begin(), quantile[i].end());
+
+ //save 10%
+ temp.push_back(quantile[i][int(quantile[i].size() * 0.10)]);
+ //save 25%
+ temp.push_back(quantile[i][int(quantile[i].size() * 0.25)]);
+ //save 50%
+ temp.push_back(quantile[i][int(quantile[i].size() * 0.5)]);
+ //save 75%
+ temp.push_back(quantile[i][int(quantile[i].size() * 0.75)]);
+ //save 95%
+ temp.push_back(quantile[i][int(quantile[i].size() * 0.95)]);
+ //save 99%
+ temp.push_back(quantile[i][int(quantile[i].size() * 0.99)]);
+
+ }
+
+ //output quan value
+ out4 << i+1 << '\t';
+ for (int u = 0; u < temp.size(); u++) { out4 << temp[u] << '\t'; }
+ out4 << endl;
+
+ quantile[i] = temp;
+
+ }
+
+ out4.close();
+
+ mothurOut("Done."); mothurOutEndLine();
+
+ }
+
+ //free memory
+ for (int i = 0; i < lines.size(); i++) { delete lines[i]; }
+ for (int i = 0; i < templateLines.size(); i++) { delete templateLines[i]; }
+
+ }
+ catch(exception& e) {
+ errorOut(e, "AlignSim", "getChimeras");
+ exit(1);
+ }
+}
+//***************************************************************************************************************
+vector<int> AlignSim::findWindows() {
+ try {
+
+ vector<int> win;
+
+ if (increment > querySeqs[0]->getAligned().length()) { mothurOut("You have selected an increment larger than the length of your sequences. I will use the default of 25."); increment = 25; }
+
+ for (int m = increment; m < (querySeqs[0]->getAligned().length() - increment); m+=increment) { win.push_back(m); }
+
+ return win;
+
+ }
+ catch(exception& e) {
+ errorOut(e, "AlignSim", "findWindows");
+ exit(1);
+ }
+}
+
+//***************************************************************************************************************
+vector< vector<sim> > AlignSim::findIS(int start, int end, vector<Sequence*> seqs) {
+ try {
+
+ vector< vector<sim> > isValues;
+ isValues.resize(seqs.size());
+
+ //for each sequence
+ for(int i = start; i < end; i++){
+
+ vector<sim> temp; temp.resize(windowBreak.size());
+
+ mothurOut("Finding IS value for sequence " + toString(i)); mothurOutEndLine();
+
+ //for each window
+ for (int m = 0; m < windowBreak.size(); m++) {
+
+ vector<Sequence*> closest; //left, right, overall
+ vector<int> similarity; //left+right and overall
+
+ //find closest left, closest right and closest overall
+ closest = findClosestSides(seqs[i], windowBreak[m], similarity, i);
+
+ int totalNumBases = seqs[i]->getUnaligned().length();
+
+ //IS = left+right-overall
+ float is = ((similarity[0]+similarity[1]) / (float) totalNumBases) - (similarity[2] / (float) totalNumBases);
+
+ ///save IS, leftparent, rightparent, breakpoint
+ temp[m].leftParent = closest[0];
+ temp[m].rightParent = closest[1];
+ temp[m].score = is;
+ temp[m].midpoint = windowBreak[m];
+//cout << is << '\t';
+ }
+//cout << endl;
+ isValues[i] = temp;
+
+ }
+
+ return isValues;
+
+ }
+ catch(exception& e) {
+ errorOut(e, "AlignSim", "findIS");
+ exit(1);
+ }
+}
+//***************************************************************************************************************
+vector<Sequence*> AlignSim::findClosestSides(Sequence* seq, int breakpoint, vector<int>& sim, int i) {
+ try{
+
+ vector<Sequence*> closest;
+
+ Sequence query, queryLeft, queryRight;
+ string frag = seq->getAligned();
+ string fragLeft = frag.substr(0, breakpoint);
+ string fragRight = frag.substr(breakpoint, frag.length());
+
+ //get pieces
+ query = *(seq);
+ queryLeft = *(seq);
+ queryRight = *(seq);
+ queryLeft.setAligned(fragLeft);
+ queryRight.setAligned(fragRight);
+
+ //initialize
+ Sequence* overall = templateSeqs[0];
+ Sequence* left = templateSeqs[0];
+ Sequence* right = templateSeqs[0];
+
+ float smallestOverall, smallestLeft, smallestRight;
+ smallestOverall = 1000; smallestLeft = 1000; smallestRight = 1000;
+
+ //go through the templateSeqs and search for the closest
+ for(int j = 0; j < templateSeqs.size(); j++){
+
+ //so you don't pick yourself
+ if (j != i) {
+ Sequence temp, tempLeft, tempRight;
+ string fragTemp = templateSeqs[j]->getAligned();
+ string fragTempLeft = fragTemp.substr(0, breakpoint);
+ string fragTempRight = fragTemp.substr(breakpoint, fragTemp.length());
+
+ //get pieces
+ temp = *(templateSeqs[j]);
+ tempLeft = *(templateSeqs[j]);
+ tempRight = *(templateSeqs[j]);
+ tempLeft.setAligned(fragTempLeft);
+ tempRight.setAligned(fragTempRight);
+
+ //find overall dist
+ distCalc->calcDist(query, temp);
+ float dist = distCalc->getDist();
+
+ if (dist < smallestOverall) {
+ overall = templateSeqs[j];
+ smallestOverall = dist;
+ }
+
+ //find left dist
+ distCalc->calcDist(queryLeft, tempLeft);
+ dist = distCalc->getDist();
+
+ if (dist < smallestLeft) {
+ left = templateSeqs[j];
+ smallestLeft = dist;
+ }
+
+ //find left dist
+ distCalc->calcDist(queryRight, tempRight);
+ dist = distCalc->getDist();
+
+ if (dist < smallestRight) {
+ right = templateSeqs[j];
+ smallestRight = dist;
+ }
+ }
+
+ }
+
+ closest.push_back(left);
+ closest.push_back(right);
+ closest.push_back(overall);
+
+ //fill sim with number of matched bases
+ Sequence tempL = *(left);
+ string tempLFrag = tempL.getAligned();
+ tempL.setAligned(tempLFrag.substr(0, breakpoint));
+
+ int bothMatches = findNumMatchedBases(queryLeft, tempL);
+ sim.push_back(bothMatches);
+
+ Sequence tempR = *(right);
+ string tempRFrag = tempR.getAligned();
+ tempR.setAligned(tempRFrag.substr(breakpoint, tempRFrag.length()));
+
+ bothMatches = findNumMatchedBases(queryRight, tempR);
+ sim.push_back(bothMatches);
+
+ bothMatches = findNumMatchedBases(query, *(overall));
+ sim.push_back(bothMatches);
+
+ return closest;
+
+
+ }
+ catch(exception& e) {
+ errorOut(e, "AlignSim", "findClosestSides");
+ exit(1);
+ }
+}
+//***************************************************************************************************************
+
+int AlignSim::findNumMatchedBases(Sequence seq, Sequence temp) {
+ try{
+
+ int num = 0;
+
+ string query = seq.getAligned();
+ string subject = temp.getAligned();
+//cout << seq.getName() << endl << endl;
+//cout << temp.getName() << endl << endl;
+
+ for (int i = 0; i < query.length(); i++) {
+ //count matches if query[i] is a base and subject is same bases or gap
+ if (isalpha(query[i])) {
+ if(!isalpha(subject[i])) { num++; }
+ else if (query[i] == subject[i]) { num++; }
+ else { }
+ }
+ }
+//cout << "num = " << num << endl;
+ return num;
+ }
+ catch(exception& e) {
+ errorOut(e, "AlignSim", "findNumMatchedBases");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+vector< vector<sim> > AlignSim::createProcessesIS(vector<Sequence*> seqs, vector<linePair*> linesToProcess) {
+ try {
+ vector< vector<sim> > localIs; localIs.resize(seqs.size());
+
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ int process = 0;
+ vector<int> processIDS;
+
+ //loop through and create all the processes you want
+ while (process != processors) {
+ int pid = fork();
+
+ if (pid > 0) {
+ processIDS.push_back(pid);
+ process++;
+ }else if (pid == 0){
+
+ mothurOut("Finding IS values for sequences " + toString(linesToProcess[process]->start) + " to " + toString(linesToProcess[process]->end)); mothurOutEndLine();
+ localIs = findIS(linesToProcess[process]->start, linesToProcess[process]->end, seqs);
+ mothurOut("Done finding IS values for sequences " + toString(linesToProcess[process]->start) + " to " + toString(linesToProcess[process]->end)); mothurOutEndLine();
+
+ //write out data to file so parent can read it
+ ofstream out;
+ string s = toString(getpid()) + ".temp";
+ openOutputFile(s, out);
+
+ //output pairs
+ for (int i = linesToProcess[process]->start; i < linesToProcess[process]->end; i++) {
+ out << localIs[i].size() << endl;
+ for (int j = 0; j < localIs[i].size(); j++) {
+ localIs[i][j].leftParent->printSequence(out);
+ localIs[i][j].rightParent->printSequence(out);
+ out << ">" << '\t' << localIs[i][j].score << '\t' << localIs[i][j].midpoint << endl;
+ }
+ }
+ out.close();
+
+ exit(0);
+ }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
+ }
+
+ //force parent to wait until all the processes are done
+ for (int i=0;i<processors;i++) {
+ int temp = processIDS[i];
+ wait(&temp);
+ }
+
+ //get data created by processes
+ for (int i=0;i<processors;i++) {
+ ifstream in;
+ string s = toString(processIDS[i]) + ".temp";
+ openInputFile(s, in);
+
+ //get pairs
+ for (int k = linesToProcess[i]->start; k < linesToProcess[i]->end; k++) {
+ int size;
+ in >> size;
+ gobble(in);
+
+ vector<sim> tempVector;
+
+ for (int j = 0; j < size; j++) {
+
+ sim temp;
+
+ temp.leftParent = new Sequence(in);
+ gobble(in);
+ //temp.leftParent->printSequence(cout);
+ temp.rightParent = new Sequence(in);
+ gobble(in);
+//temp.rightParent->printSequence(cout);
+ string throwaway;
+ in >> throwaway >> temp.score >> temp.midpoint;
+ //cout << temp.score << '\t' << temp.midpoint << endl;
+ gobble(in);
+
+ tempVector.push_back(temp);
+ }
+
+ localIs[k] = tempVector;
+ }
+
+ in.close();
+ remove(s.c_str());
+ }
+
+
+#else
+ localIs = findIS(linesToProcess[0]->start, linesToProcess[0]->end, seqs);
+#endif
+
+ return localIs;
+ }
+ catch(exception& e) {
+ errorOut(e, "AlignSim", "createProcessesIS");
+ exit(1);
+ }
+}
+
+//***************************************************************************************************************
--- /dev/null
+#ifndef ALIGNSIM_H
+#define ALIGNSIM_H
+
+/*
+ * alignedsimilarity.h
+ * Mothur
+ *
+ * Created by westcott on 8/18/09.
+ * Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+
+#include "chimera.h"
+#include "dist.h"
+
+//This class was created using the algorythms described in the
+//"Evaluation of Nearest Neighbor Methods for Detection of Chimeric Small-Subunit rRna Sequences" paper
+//by J.F. Robison-Cox 1, M.M. Bateson 2 and D.M. Ward 2
+
+
+/***********************************************************/
+
+class AlignSim : public Chimera {
+
+ public:
+ AlignSim(string, string);
+ ~AlignSim();
+
+ void getChimeras();
+ void print(ostream&);
+
+ void setCons(string){};
+ void setQuantiles(string q) { quanfile = q; };
+
+
+ private:
+
+ Dist* distCalc;
+ vector<linePair*> lines;
+ vector<linePair*> templateLines;
+
+ vector<Sequence*> querySeqs;
+ vector<Sequence*> templateSeqs;
+
+ vector< vector<sim> > IS; //IS[0] is the vector os sim values for each window for querySeqs[0]
+ vector< vector<sim> > templateIS; //templateIS[0] is the vector os sim values for each window for templateSeqs[0]
+ vector<int> windowBreak;
+
+ string fastafile, templateFile;
+ int iters;
+
+ vector< vector<sim> > findIS(int, int, vector<Sequence*>);
+ vector<Sequence*> findClosestSides(Sequence*, int, vector<int>&, int);
+ int findNumMatchedBases(Sequence, Sequence);
+ vector<int> findWindows();
+
+ vector< vector<float> > quantile;
+
+ vector< vector<sim> > createProcessesIS(vector<Sequence*>, vector<linePair*>);
+
+};
+
+/***********************************************************/
+
+#endif
+
out << pref[i].name << '\t' << setprecision(3) << pref[i].score[0] << '\t' << pref[i].leftParent[0] << '\t' << pref[i].rightParent[0] << endl;
//calc # of seqs with preference above 1.0
- if (pref[i].score[0] > 1.0) {
+ if (pref[i].score[0] > 1.1) {
above1++;
mothurOut(pref[i].name + " is a suspected chimera at breakpoint " + toString(pref[i].midpoint)); mothurOutEndLine();
mothurOut("It's score is " + toString(pref[i].score[0]) + " with suspected left parent " + pref[i].leftParent[0] + " and right parent " + pref[i].rightParent[0]); mothurOutEndLine();
//output results to screen
mothurOutEndLine();
- mothurOut("Sequence with preference score above 1.0: " + toString(above1)); mothurOutEndLine();
+ mothurOut("Sequence with preference score above 1.1: " + toString(above1)); mothurOutEndLine();
int spot;
spot = pref.size()-1;
mothurOut("Minimum:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
}else{ increment = 0; }
}
-cout << "increment = " << increment << endl;
+
if (increment == 0) { iters = 1; }
else { iters = ((seqs[0]->getAlignLength() - (2*window)) / increment); }
delete SparseLeft;
delete SparseRight;
-
//fill preference structure
generatePreferences(distMapLeft, distMapRight, midpoint);
delete distCalculator;
//rank preference score to eachother
- float dme = 0.0;
+ /*float dme = 0.0;
float expectedPercent = 1 / (float) (pref.size());
for (int i = 0; i < pref.size(); i++) { dme += pref[i].score[0]; }
//how much higher or lower is this than expected
pref[i].score[0] = pref[i].score[0] / expectedPercent;
-
-
-
- }
+
+ }*/
//sort Preferences highest to lowest
sort(pref.begin(), pref.end(), comparePref);
//calculate the dme
-
int count0 = 0;
- for (int i = 0; i < pref.size(); i++) { dme += pref[i].score[1]; if (pref[i].score[1] == 0.0) { count0++; } }
+ for (int i = 0; i < pref.size(); i++) { dme += pref[i].score[1]; if (pref[i].score[1] == 0.0) { count0++; } }
- float expectedPercent = 1 / (float) (pref.size() - count0);
+ //float expectedPercent = 1 / (float) (pref.size() - count0);
//cout << endl << "dme = " << dme << endl;
//recalculate prefernences based on dme
for (int i = 0; i < pref.size(); i++) {
-//cout << "unadjusted pref " << i << " = " << pref[i].score[1] << endl;
+//cout << "unadjusted col[i] " << i << " = " << pref[i].score[1] << endl;
+//cout << i << '\t' << (dme / (dme - 2 * pref[i].score[1])) << endl;
+
// gives the actual percentage of the dme this seq adds
- pref[i].score[1] = pref[i].score[1] / dme;
+ //pref[i].score[1] = pref[i].score[1] / dme;
//how much higher or lower is this than expected
- pref[i].score[1] = pref[i].score[1] / expectedPercent;
-
- //pref[i].score[1] = dme / (dme - 2 * pref[i].score[1]);
+ //pref[i].score[1] = pref[i].score[1] / expectedPercent;
+ //not 2 * pref[i].score[1] because i only calulate the pref scores once.
+ pref[i].score[1] = dme / (dme - pref[i].score[1]);
+//cout << i << '\t' << pref[i].score[1] << endl;
//so a non chimeric sequence would be around 1, and a chimeric would be signifigantly higher.
//cout << "adjusted pref " << i << " = " << pref[i].score[1] << endl;
}
#include "chimera.h"
#include "filterseqscommand.h"
+#include "sparsematrix.hpp"
#include "sequence.hpp"
#include "dist.h"
-struct Preference {
- string name;
- vector<string> leftParent; //keep the name of closest left associated with the two scores
- vector<string> rightParent; //keep the name of closest right associated with the two scores
- vector<float> score; //so you can keep last score and calc this score and keep whichever is bigger.
- vector<float> closestLeft; //keep the closest left associated with the two scores
- vector<float> closestRight; //keep the closest right associated with the two scores
- int midpoint;
-
-};
-
+typedef list<PCell>::iterator MatData;
+typedef map<int, float> SeqMap; //maps sequence to all distance for that seqeunce
/***********************************************************/
--- /dev/null
+/*
+ * ccode.cpp
+ * Mothur
+ *
+ * Created by westcott on 8/24/09.
+ * Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "ccode.h"
+#include "ignoregaps.h"
+
+
+//***************************************************************************************************************
+Ccode::Ccode(string filename, string temp) { fastafile = filename; templateFile = temp; }
+//***************************************************************************************************************
+
+Ccode::~Ccode() {
+ try {
+ for (int i = 0; i < querySeqs.size(); i++) { delete querySeqs[i]; }
+ for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; }
+ delete distCalc;
+ }
+ catch(exception& e) {
+ errorOut(e, "Ccode", "~Ccode");
+ exit(1);
+ }
+}
+//***************************************************************************************************************
+void Ccode::print(ostream& out) {
+ try {
+
+ mothurOutEndLine();
+
+ for (int i = 0; i < querySeqs.size(); i++) {
+
+ }
+ }
+ catch(exception& e) {
+ errorOut(e, "Ccode", "print");
+ exit(1);
+ }
+}
+
+//***************************************************************************************************************
+void Ccode::getChimeras() {
+ try {
+
+ //read in query sequences and subject sequences
+ mothurOut("Reading sequences and template file... "); cout.flush();
+ querySeqs = readSeqs(fastafile);
+ templateSeqs = readSeqs(templateFile);
+ mothurOut("Done."); mothurOutEndLine();
+
+ int numSeqs = querySeqs.size();
+
+ closest.resize(numSeqs);
+
+ //break up file if needed
+ int linesPerProcess = numSeqs / processors ;
+
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ //find breakup of sequences for all times we will Parallelize
+ if (processors == 1) { lines.push_back(new linePair(0, numSeqs)); }
+ else {
+ //fill line pairs
+ for (int i = 0; i < (processors-1); i++) {
+ lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess)));
+ }
+ //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end
+ int i = processors - 1;
+ lines.push_back(new linePair((i*linesPerProcess), numSeqs));
+ }
+
+ //find breakup of templatefile for quantiles
+ if (processors == 1) { templateLines.push_back(new linePair(0, templateSeqs.size())); }
+ else {
+ for (int i = 0; i < processors; i++) {
+ templateLines.push_back(new linePair());
+ templateLines[i]->start = int (sqrt(float(i)/float(processors)) * templateSeqs.size());
+ templateLines[i]->end = int (sqrt(float(i+1)/float(processors)) * templateSeqs.size());
+ }
+ }
+ #else
+ lines.push_back(new linePair(0, numSeqs));
+ templateLines.push_back(new linePair(0, templateSeqs.size()));
+ #endif
+
+ distCalc = new ignoreGaps();
+
+ //find closest
+ if (processors == 1) {
+ mothurOut("Finding top matches for sequences... "); cout.flush();
+ closest = findClosest(lines[0]->start, lines[0]->end, numWanted);
+ mothurOut("Done."); mothurOutEndLine();
+ }else { createProcessesClosest(); }
+
+
+ for (int i = 0; i < closest.size(); i++) {
+ cout << querySeqs[i]->getName() << ": ";
+ for (int j = 0; j < closest[i].size(); j++) {
+
+ cout << closest[i][j]->getName() << '\t';
+ }
+ cout << endl;
+ }
+
+ //free memory
+ for (int i = 0; i < lines.size(); i++) { delete lines[i]; }
+ for (int i = 0; i < templateLines.size(); i++) { delete templateLines[i]; }
+
+ }
+ catch(exception& e) {
+ errorOut(e, "Ccode", "getChimeras");
+ exit(1);
+ }
+}
+/***************************************************************************************************************
+vector<int> Ccode::findWindows() {
+ try {
+
+ vector<int> win;
+
+ if (increment > querySeqs[0]->getAligned().length()) { mothurOut("You have selected an increment larger than the length of your sequences. I will use the default of 25."); increment = 25; }
+
+ for (int m = increment; m < (querySeqs[0]->getAligned().length() - increment); m+=increment) { win.push_back(m); }
+
+ return win;
+
+ }
+ catch(exception& e) {
+ errorOut(e, "Ccode", "findWindows");
+ exit(1);
+ }
+}
+*/
+//***************************************************************************************************************
+vector< vector<Sequence*> > Ccode::findClosest(int start, int end, int numWanted) {
+ try{
+
+ vector< vector<Sequence*> > topMatches; topMatches.resize(querySeqs.size());
+
+ float smallestOverall, smallestLeft, smallestRight;
+ smallestOverall = 1000; smallestLeft = 1000; smallestRight = 1000;
+
+ //for each sequence in querySeqs - find top matches to use as reference
+ for(int j = start; j < end; j++){
+
+ Sequence query = *(querySeqs[j]);
+
+ vector<SeqDist> distances;
+
+ //calc distance to each sequence in template seqs
+ for (int i = 0; i < templateSeqs.size(); i++) {
+
+ Sequence ref = *(templateSeqs[i]);
+
+ //find overall dist
+ distCalc->calcDist(query, ref);
+ float dist = distCalc->getDist();
+
+ //save distance
+ SeqDist temp;
+ temp.seq = templateSeqs[i];
+ temp.dist = dist;
+
+ distances.push_back(temp);
+ }
+
+ sort(distances.begin(), distances.end(), compareSeqDist);
+
+ //save the number of top matches wanted
+ for (int h = 0; h < numWanted; h++) {
+ topMatches[j].push_back(distances[h].seq);
+ }
+ }
+
+ return topMatches;
+
+ }
+ catch(exception& e) {
+ errorOut(e, "Ccode", "findClosestSides");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+void Ccode::createProcessesClosest() {
+ try {
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ int process = 0;
+ vector<int> processIDS;
+
+ //loop through and create all the processes you want
+ while (process != processors) {
+ int pid = fork();
+
+ if (pid > 0) {
+ processIDS.push_back(pid);
+ process++;
+ }else if (pid == 0){
+
+ mothurOut("Finding top matches for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
+ closest = findClosest(lines[process]->start, lines[process]->end, numWanted);
+ mothurOut("Done finding top matches for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
+
+ //write out data to file so parent can read it
+ ofstream out;
+ string s = toString(getpid()) + ".temp";
+ openOutputFile(s, out);
+
+ //output pairs
+ for (int i = lines[process]->start; i < lines[process]->end; i++) {
+ out << closest[i].size() << endl;
+ for (int j = 0; j < closest[i].size(); j++) {
+ closest[i][j]->printSequence(out);
+ }
+ }
+ out.close();
+
+ exit(0);
+ }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
+ }
+
+ //force parent to wait until all the processes are done
+ for (int i=0;i<processors;i++) {
+ int temp = processIDS[i];
+ wait(&temp);
+ }
+
+ //get data created by processes
+ for (int i=0;i<processors;i++) {
+ ifstream in;
+ string s = toString(processIDS[i]) + ".temp";
+ openInputFile(s, in);
+
+ //get pairs
+ for (int k = lines[i]->start; k < lines[i]->end; k++) {
+ int size;
+ in >> size;
+ gobble(in);
+
+ vector<Sequence*> tempVector;
+
+ for (int j = 0; j < size; j++) {
+
+ Sequence* temp = new Sequence(in);
+ gobble(in);
+
+ tempVector.push_back(temp);
+ }
+
+ closest[k] = tempVector;
+ }
+
+ in.close();
+ remove(s.c_str());
+ }
+
+
+#else
+ closest = findClosest(lines[0]->start, lines[0]->end, numWanted);
+#endif
+
+ }
+ catch(exception& e) {
+ errorOut(e, "Ccode", "createProcessesClosest");
+ exit(1);
+ }
+}
+
+//***************************************************************************************************************
+
--- /dev/null
+#ifndef CCODE_H
+#define CCODE_H
+
+/*
+ * ccode.h
+ * Mothur
+ *
+ * Created by westcott on 8/24/09.
+ * Copyright 2009 Schloss LAB. All rights reserved.
+ *
+ */
+
+#include "chimera.h"
+#include "dist.h"
+#include "decalc.h"
+
+//This class was created using the algorythms described in the
+// "Evaluating putative chimeric sequences from PCR-amplified products" paper
+//by Juan M. Gonzalez, Johannes Zimmerman and Cesareo Saiz-Jimenez.
+
+/***********************************************************/
+
+class Ccode : public Chimera {
+
+ public:
+ Ccode(string, string);
+ ~Ccode();
+
+ void getChimeras();
+ void print(ostream&);
+
+ void setCons(string c) {}
+ void setQuantiles(string q) {}
+
+
+ private:
+
+ Dist* distCalc;
+ DeCalculator* decalc;
+ int iters;
+ string fastafile, templateFile;
+
+
+ vector<linePair*> lines;
+ vector<linePair*> templateLines;
+ vector<Sequence*> querySeqs;
+ vector<Sequence*> templateSeqs;
+
+ vector< vector<Sequence*> > closest; //bestfit[0] is a vector of sequence at are closest to queryseqs[0]...
+
+ vector< vector<Sequence*> > findClosest(int, int, int);
+ void removeSeqs(vector<Sequence*>); //removes sequences from closest that are to different of too similar to eachother.
+
+ void createProcessesClosest();
+
+};
+
+/***********************************************************/
+
+#endif
+
+
#include "chimera.h"
+//***************************************************************************************************************
+//this is a vertical filter
+void Chimera::createFilter(vector<Sequence*> seqs) {
+ try {
+
+ int threshold = int (0.5 * seqs.size());
+//cout << "threshhold = " << threshold << endl;
+
+ vector<int> gaps; gaps.resize(seqs[0]->getAligned().length(), 0);
+ vector<int> a; a.resize(seqs[0]->getAligned().length(), 0);
+ vector<int> t; t.resize(seqs[0]->getAligned().length(), 0);
+ vector<int> g; g.resize(seqs[0]->getAligned().length(), 0);
+ vector<int> c; c.resize(seqs[0]->getAligned().length(), 0);
+
+ filterString = (string(seqs[0]->getAligned().length(), '1'));
+
+ //for each sequence
+ for (int i = 0; i < seqs.size(); i++) {
+
+ string seqAligned = seqs[i]->getAligned();
+
+ for (int j = 0; j < seqAligned.length(); j++) {
+ //if this spot is a gap
+ if ((seqAligned[j] == '-') || (seqAligned[j] == '.')) { gaps[j]++; }
+ else if (toupper(seqAligned[j]) == 'A') { a[j]++; }
+ else if (toupper(seqAligned[j]) == 'T') { t[j]++; }
+ else if (toupper(seqAligned[j]) == 'G') { g[j]++; }
+ else if (toupper(seqAligned[j]) == 'C') { c[j]++; }
+ }
+ }
+
+ //zero out spot where all sequences have blanks
+ for(int i = 0;i < seqs[0]->getAligned().length(); i++){
+ if(gaps[i] == seqs.size()) { filterString[i] = '0'; }
+
+ else if (((a[i] < threshold) && (t[i] < threshold) && (g[i] < threshold) && (c[i] < threshold))) { filterString[i] = '0'; }
+ //cout << "a = " << a[i] << " t = " << t[i] << " g = " << g[i] << " c = " << c[i] << endl;
+ }
+//cout << "filter = " << filterString << endl;
+ }
+ catch(exception& e) {
+ errorOut(e, "Chimera", "createFilter");
+ exit(1);
+ }
+}
+//***************************************************************************************************************
+void Chimera::runFilter(vector<Sequence*> seqs) {
+ try {
+
+ //for each sequence
+ for (int i = 0; i < seqs.size(); i++) {
+
+ string seqAligned = seqs[i]->getAligned();
+ string newAligned = "";
+
+ for (int j = 0; j < seqAligned.length(); j++) {
+ //if this spot is a gap
+ if (filterString[j] == '1') { newAligned += seqAligned[j]; }
+ }
+
+ seqs[i]->setAligned(newAligned);
+ }
+
+ }
+ catch(exception& e) {
+ errorOut(e, "Chimera", "runFilter");
+ exit(1);
+ }
+}
+
//***************************************************************************************************************
vector<Sequence*> Chimera::readSeqs(string file) {
try {
}
}
//***************************************************************************************************************
+
+vector< vector<float> > Chimera::readQuantiles() {
+ try {
+
+ ifstream in;
+ openInputFile(quanfile, in);
+
+ vector< vector<float> > quan;
+
+ int num; float ten, twentyfive, fifty, seventyfive, ninetyfive, ninetynine;
+
+ while(!in.eof()){
+
+ in >> num >> ten >> twentyfive >> fifty >> seventyfive >> ninetyfive >> ninetynine;
+
+ vector <float> temp;
+
+ temp.push_back(ten);
+ temp.push_back(twentyfive);
+ temp.push_back(fifty);
+ temp.push_back(seventyfive);
+ temp.push_back(ninetyfive);
+ temp.push_back(ninetynine);
+
+ quan.push_back(temp);
+
+ gobble(in);
+ }
+
+ in.close();
+ return quan;
+
+ }
+ catch(exception& e) {
+ errorOut(e, "Chimera", "readQuantiles");
+ exit(1);
+ }
+}
+//***************************************************************************************************************
+
+
+
+
#include "mothur.h"
-#include "sparsematrix.hpp"
#include "sequence.hpp"
-typedef list<PCell>::iterator MatData;
-typedef map<int, float> SeqMap; //maps sequence to all distance for that seqeunce
+struct Preference {
+ string name;
+ vector<string> leftParent; //keep the name of closest left associated with the two scores
+ vector<string> rightParent; //keep the name of closest right associated with the two scores
+ vector<float> score; //so you can keep last score and calc this score and keep whichever is bigger.
+ vector<float> closestLeft; //keep the closest left associated with the two scores
+ vector<float> closestRight; //keep the closest right associated with the two scores
+ int midpoint;
+
+};
+
+struct SeqDist {
+ Sequence* seq;
+ float dist;
+};
+
+//********************************************************************************************************************
+//sorts lowest to highest
+inline bool compareSeqDist(SeqDist left, SeqDist right){
+ return (left.dist < right.dist);
+}
+//********************************************************************************************************************
+
+struct sim {
+ Sequence* leftParent;
+ Sequence* rightParent;
+ float score;
+ int midpoint;
+};
+
+struct linePair {
+ int start;
+ int end;
+ linePair(int i, int j) : start(i), end(j) {}
+ linePair(){}
+};
/***********************************************************************/
Chimera(string);
Chimera(string, string);
virtual ~Chimera(){};
- virtual void setFilter(bool f) { filter = f; }
+ virtual void setFilter(bool f) { filter = f; }
virtual void setCorrection(bool c) { correction = c; }
virtual void setProcessors(int p) { processors = p; }
virtual void setWindow(int w) { window = w; }
virtual void setIncrement(int i) { increment = i; }
+ virtual void setNumWanted(int n) { numWanted = n; }
- virtual void setCons(string) {};
- virtual void setQuantiles(string) {};
+ virtual void setCons(string){};
+ virtual void setQuantiles(string){};
virtual vector<Sequence*> readSeqs(string);
+ virtual vector< vector<float> > readQuantiles();
virtual void setMask(string);
+ virtual void runFilter(vector<Sequence*>);
+ virtual void createFilter(vector<Sequence*>);
+
//pure functions
virtual void getChimeras() = 0;
protected:
bool filter, correction;
- int processors, window, increment;
- string seqMask;
+ int processors, window, increment, numWanted;
+ string seqMask, quanfile, filterString;
};
#include "chimeraseqscommand.h"
#include "bellerophon.h"
#include "pintail.h"
+#include "alignedsimilarity.h"
+#include "ccode.h"
//***************************************************************************************************************
else {
//valid paramters for this command
- string Array[] = {"fasta", "filter", "correction", "processors", "method", "window", "increment", "template", "conservation", "quantile", "mask" };
+ string Array[] = {"fasta", "filter", "correction", "processors", "method", "window", "increment", "template", "conservation", "quantile", "mask", "numwanted" };
vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
OptionParser parser(option);
if (ableToOpen == 1) { abort = true; }
in.close();
}
-
-
-
string temp;
temp = validParameter.validFile(parameters, "filter", false); if (temp == "not found") { temp = "F"; }
temp = validParameter.validFile(parameters, "increment", false); if (temp == "not found") { temp = "25"; }
convert(temp, increment);
-
+
+ temp = validParameter.validFile(parameters, "numwanted", false); if (temp == "not found") { temp = "20"; }
+ convert(temp, numwanted);
+
method = validParameter.validFile(parameters, "method", false); if (method == "not found") { method = "pintail"; }
- if ((method == "pintail") && (templatefile == "")) { mothurOut("You must provide a template file with the pintail method."); mothurOutEndLine(); abort = true; }
+ if (((method == "pintail") || (method == "alignsim")) && (templatefile == "")) { mothurOut("You must provide a template file with the pintail and alignsim methods."); mothurOutEndLine(); abort = true; }
}
mothurOut("The chimera.seqs command reads a fastafile and creates list of potentially chimeric sequences.\n");
mothurOut("The chimera.seqs command parameters are fasta, filter, correction, processors, mask, method, window, increment, template, conservation and quantile.\n");
mothurOut("The fasta parameter is always required and template is required if using pintail.\n");
- mothurOut("The filter parameter allows you to specify if you would like to apply a 50% soft filter this only applies when the method is bellerphon. The default is false. \n");
+ mothurOut("The filter parameter allows you to specify if you would like to apply a vertical and 50% soft filter. The default is false. \n");
mothurOut("The correction parameter allows you to put more emphasis on the distance between highly similar sequences and less emphasis on the differences between remote homologs. The default is true. This only applies when the method is bellerphon.\n");
mothurOut("The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n");
mothurOut("The method parameter allows you to specify the method for finding chimeric sequences. The default is pintail. Options include..... \n");
if (abort == true) { return 0; }
- if (method == "bellerophon") { chimera = new Bellerophon(fastafile); }
- else if (method == "pintail") { chimera = new Pintail(fastafile, templatefile); }
+ if (method == "bellerophon") { chimera = new Bellerophon(fastafile); }
+ else if (method == "pintail") { chimera = new Pintail(fastafile, templatefile); }
+ //else if (method == "alignsim") { chimera = new AlignSim(fastafile, templatefile); }
+ else if (method == "ccode") { chimera = new Ccode(fastafile, templatefile); }
else { mothurOut("Not a valid method."); mothurOutEndLine(); return 0; }
//set user options
chimera->setProcessors(processors);
chimera->setWindow(window);
chimera->setIncrement(increment);
+ chimera->setNumWanted(numwanted);
//find chimeras
chimera->getChimeras();
bool abort;
string method, fastafile, templatefile, consfile, quanfile, maskfile;
bool filter, correction;
- int processors, midpoint, averageLeft, averageRight, window, iters, increment;
+ int processors, midpoint, averageLeft, averageRight, window, iters, increment, numwanted;
Chimera* chimera;
//for each window
float sum = 0.0; //sum = sum from 1 to m of (oi-ei)^2
- for (int m = 0; m < obs.size(); m++) { sum += ((obs[m] - exp[m]) * (obs[m] - exp[m])); }
+ int numZeros = 0;
+ for (int m = 0; m < obs.size(); m++) {
- float de = sqrt((sum / (obs.size() - 1)));
+ //if (obs[m] != 0.0) {
+ sum += ((obs[m] - exp[m]) * (obs[m] - exp[m]));
+ //}else { numZeros++; }
+
+ }
+
+ float de = sqrt((sum / (obs.size() - 1 - numZeros)));
return de;
}
//percentage of mismatched pairs 1 to 100
quan.resize(100);
+//ofstream o;
+//string out = "getQuantiles.out";
+//openOutputFile(out, o);
//for each sequence
for(int i = start; i < end; i++){
float de = calcDE(obsi, exp);
float dist = calcDist(query, subject, front, back);
-
+ //o << i << '\t' << j << '\t' << dist << '\t' << de << endl;
dist = ceil(dist);
quanMember newScore(de, i, j);
exit(1);
}
}
-
+//********************************************************************************************************************
+//sorts lowest to highest
+inline bool compareQuanMembers(quanMember left, quanMember right){
+ return (left.score < right.score);
+}
//***************************************************************************************************************
//this was going to be used by pintail to increase the sensitivity of the chimera detection, but it wasn't quite right. may want to revisit in the future...
-vector< vector<float> > DeCalculator::removeObviousOutliers(vector< vector<quanMember> >& quantiles, int num) {
+void DeCalculator::removeObviousOutliers(vector< vector<quanMember> >& quantiles, int num) {
try {
- vector< vector<float> > quan;
- quan.resize(100);
-
- /*vector<quanMember> contributions;
- vector<int> seen; //seen[0] is the number of outliers that template seqs[0] was part of.
- seen.resize(num,0);
-
- //find contributions
+
for (int i = 0; i < quantiles.size(); i++) {
//find mean of this quantile score
float high = quantiles[i][int(quantiles[i].size() * 0.99)].score;
float low = quantiles[i][int(quantiles[i].size() * 0.01)].score;
-
+
+ vector<quanMember> temp;
+
//look at each value in quantiles to see if it is an outlier
for (int j = 0; j < quantiles[i].size(); j++) {
-
//is this score between 1 and 99%
if ((quantiles[i][j].score > low) && (quantiles[i][j].score < high)) {
-
- }else {
- //add to contributions
- contributions.push_back(quantiles[i][j]);
- seen[quantiles[i][j].member1]++;
- seen[quantiles[i][j].member2]++;
+ temp.push_back(quantiles[i][j]);
}
}
+
+ quantiles[i] = temp;
}
+/*
//find contributer with most offending score related to it
int largestContrib = findLargestContrib(seen);
}
*/
- return quan;
+
}
catch(exception& e) {
errorOut(e, "DeCalculator", "removeObviousOutliers");
void setAlignmentLength(int l) { alignLength = l; }
void runMask(Sequence*);
void trimSeqs(Sequence*, Sequence*, map<int, int>&);
- vector< vector<float> > removeObviousOutliers(vector< vector<quanMember> >&, int);
+ void removeObviousOutliers(vector< vector<quanMember> >&, int);
vector<float> calcFreq(vector<Sequence*>, string);
vector<int> findWindows(Sequence*, int, int, int&, int);
vector<float> calcObserved(Sequence*, Sequence*, vector<int>, int);
#include "pintail.h"
#include "ignoregaps.h"
+#include "eachgapdist.h"
//********************************************************************************************************************
//sorts lowest to highest
int index = ceil(deviation[i]);
+ if (index == 0) { index=1; }
+
//is your DE value higher than the 95%
string chimera;
- if (DE[i] > quantiles[index][4]) { chimera = "Yes"; }
+ if (DE[i] > quantiles[index-1][4]) { chimera = "Yes"; }
else { chimera = "No"; }
out << querySeqs[i]->getName() << '\t' << "div: " << deviation[i] << "\tstDev: " << DE[i] << "\tchimera flag: " << chimera << endl;
templateLines.push_back(new linePair(0, templateSeqs.size()));
#endif
- distcalculator = new ignoreGaps();
+ distcalculator = new eachGapDist();
decalc = new DeCalculator();
//if the user does enter a mask then you want to keep all the spots in the alignment
mothurOut("Done."); mothurOutEndLine();
}else { createProcessesPairs(); }
-
+/*string o = "foronlinepintailpairs-eachgap";
+ofstream out7;
+openOutputFile(o, out7);
+
+for (int i = 0; i < bestfit.size(); i++) {
+ out7 << querySeqs[i]->getName() << endl;
+ out7 << querySeqs[i]->getUnaligned() << endl << endl;
+
+ out7 << bestfit[i]->getName() << endl;
+ out7 << bestfit[i]->getUnaligned() << endl << endl << endl;
+}
+out7.close();/*/
//find P
mothurOut("Getting conservation... "); cout.flush();
if (consfile == "") {
}
}
-
+
+ if (filter) {
+ vector<Sequence*> temp = templateSeqs;
+ for (int i = 0; i < querySeqs.size(); i++) { temp.push_back(querySeqs[i]); }
+
+ createFilter(temp);
+
+ runFilter(querySeqs);
+ runFilter(templateSeqs);
+ runFilter(bestfit);
+ }
+
+
if (processors == 1) {
for (int j = 0; j < bestfit.size(); j++) {
//quantiles are used to determine whether the de values found indicate a chimera
//if you have to calculate them, its time intensive because you are finding the de and deviation values for each
//combination of sequences in the template
- if (quanfile != "") { quantiles = readQuantiles(); }
+ if (quanfile != "") { quantiles = readQuantiles(); }
else {
mothurOut("Calculating quantiles for your template. This can take a while... I will output the quantiles to a .quan file that you can input them using the quantiles parameter next time you run this command. Providing the .quan file will dramatically improve speed. "); cout.flush();
if (processors == 1) {
quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size());
}else { createProcessesQuan(); }
+
+ ofstream out4, out5;
+ string noOutliers, outliers;
-
-
- //decided against this because we were having trouble setting the sensitivity... may want to revisit this...
- //quantiles = decalc->removeObviousOutliers(quantilesMembers, templateSeqs.size());
-
- ofstream out4;
- string o;
-
- o = getRootName(templateFile) + "quan";
+ noOutliers = getRootName(templateFile) + "pintail.quanNOOUTLIERS";
+ outliers = getRootName(templateFile) + "pintail.quanYESOUTLIERS";
- openOutputFile(o, out4);
+ openOutputFile(outliers, out4);
//adjust quantiles
for (int i = 0; i < quantilesMembers.size(); i++) {
out4.close();
+ decalc->removeObviousOutliers(quantilesMembers, templateSeqs.size());
+
+ openOutputFile(noOutliers, out5);
+
+ //adjust quantiles
+ for (int i = 0; i < quantilesMembers.size(); i++) {
+ vector<float> temp;
+
+ if (quantilesMembers[i].size() == 0) {
+ //in case this is not a distance found in your template files
+ for (int g = 0; g < 6; g++) {
+ temp.push_back(0.0);
+ }
+ }else{
+
+ sort(quantilesMembers[i].begin(), quantilesMembers[i].end(), compareQuanMembers);
+
+ //save 10%
+ temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.10)].score);
+ //save 25%
+ temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.25)].score);
+ //save 50%
+ temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.5)].score);
+ //save 75%
+ temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.75)].score);
+ //save 95%
+ temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.95)].score);
+ //save 99%
+ temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.99)].score);
+
+ }
+
+ //output quan value
+ out5 << i+1 << '\t';
+ for (int u = 0; u < temp.size(); u++) { out5 << temp[u] << '\t'; }
+ out5 << endl;
+
+ quantiles[i] = temp;
+
+ }
+
mothurOut("Done."); mothurOutEndLine();
}
}
}
-//***************************************************************************************************************
-
-vector< vector<float> > Pintail::readQuantiles() {
- try {
-
- ifstream in;
- openInputFile(quanfile, in);
-
- vector< vector<float> > quan;
-
- int num; float ten, twentyfive, fifty, seventyfive, ninetyfive, ninetynine;
-
- while(!in.eof()){
-
- in >> num >> ten >> twentyfive >> fifty >> seventyfive >> ninetyfive >> ninetynine;
-
- vector <float> temp;
-
- temp.push_back(ten);
- temp.push_back(twentyfive);
- temp.push_back(fifty);
- temp.push_back(seventyfive);
- temp.push_back(ninetyfive);
- temp.push_back(ninetynine);
-
- quan.push_back(temp);
-
- gobble(in);
- }
-
- in.close();
- return quan;
-
- }
- catch(exception& e) {
- errorOut(e, "Pintail", "readQuantiles");
- exit(1);
- }
-}
//***************************************************************************************************************
//calculate the distances from each query sequence to all sequences in the template to find the closest sequence
vector<Sequence*> Pintail::findPairs(int start, int end) {
private:
- struct linePair {
- int start;
- int end;
- linePair(int i, int j) : start(i), end(j) {}
- linePair(){}
- };
-
Dist* distcalculator;
DeCalculator* decalc;
int iters;
- string fastafile, templateFile, consfile, quanfile;
+ string fastafile, templateFile, consfile;
vector<linePair*> lines;
vector<float> readFreq();
- vector< vector<float> > readQuantiles();
vector<Sequence*> findPairs(int, int);
void createProcessesSpots();