From: westcott Date: Mon, 31 Aug 2009 14:33:57 +0000 (+0000) Subject: checking in chimera files in progress after move to michigan X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=f7fbd74ffedcf62217109c22e828453eaefa1458 checking in chimera files in progress after move to michigan --- diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index c4f38cb..c6d6c0e 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -162,6 +162,8 @@ A70B53AA0F4CD7AD0064797E /* getgroupcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70B53A40F4CD7AD0064797E /* getgroupcommand.cpp */; }; A70B53AB0F4CD7AD0064797E /* getlabelcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70B53A60F4CD7AD0064797E /* getlabelcommand.cpp */; }; A70B53AC0F4CD7AD0064797E /* getlinecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70B53A80F4CD7AD0064797E /* getlinecommand.cpp */; }; + A75B887D104C16860083C454 /* alignedsimilarity.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A75B8879104C16860083C454 /* alignedsimilarity.cpp */; }; + A75B887E104C16860083C454 /* ccode.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A75B887B104C16860083C454 /* ccode.cpp */; }; EB1216880F619B83004A865F /* bergerparker.cpp in Sources */ = {isa = PBXBuildFile; fileRef = EB1216870F619B83004A865F /* bergerparker.cpp */; }; EB1216E50F61ACFB004A865F /* bstick.cpp in Sources */ = {isa = PBXBuildFile; fileRef = EB1216E40F61ACFB004A865F /* bstick.cpp */; }; EB1217230F61C9AC004A865F /* sharedkstest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = EB1217220F61C9AC004A865F /* sharedkstest.cpp */; }; @@ -513,6 +515,10 @@ A70B53A70F4CD7AD0064797E /* getlabelcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getlabelcommand.h; sourceTree = SOURCE_ROOT; }; A70B53A80F4CD7AD0064797E /* getlinecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getlinecommand.cpp; sourceTree = SOURCE_ROOT; }; A70B53A90F4CD7AD0064797E /* getlinecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getlinecommand.h; sourceTree = SOURCE_ROOT; }; + A75B8879104C16860083C454 /* alignedsimilarity.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = alignedsimilarity.cpp; sourceTree = ""; }; + A75B887A104C16860083C454 /* alignedsimilarity.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = alignedsimilarity.h; sourceTree = ""; }; + A75B887B104C16860083C454 /* ccode.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ccode.cpp; sourceTree = ""; }; + A75B887C104C16860083C454 /* ccode.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ccode.h; sourceTree = ""; }; C6859E8B029090EE04C91782 /* Mothur.1 */ = {isa = PBXFileReference; lastKnownFileType = text.man; path = Mothur.1; sourceTree = ""; }; EB1216860F619B83004A865F /* bergerparker.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = bergerparker.h; sourceTree = SOURCE_ROOT; }; EB1216870F619B83004A865F /* bergerparker.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = bergerparker.cpp; sourceTree = SOURCE_ROOT; }; @@ -645,6 +651,10 @@ 372095C1103196D70004D347 /* chimera.cpp */, 374F2FE9100634B600E97C66 /* bellerophon.h */, 374F2FEA100634B600E97C66 /* bellerophon.cpp */, + A75B8879104C16860083C454 /* alignedsimilarity.cpp */, + A75B887A104C16860083C454 /* alignedsimilarity.h */, + A75B887B104C16860083C454 /* ccode.cpp */, + A75B887C104C16860083C454 /* ccode.h */, 372A55531017922B00C5194B /* decalc.h */, 372A55541017922B00C5194B /* decalc.cpp */, 374F301110063B6F00E97C66 /* pintail.h */, @@ -1167,6 +1177,8 @@ 374F301310063B6F00E97C66 /* pintail.cpp in Sources */, 372A55551017922B00C5194B /* decalc.cpp in Sources */, 372095C2103196D70004D347 /* chimera.cpp in Sources */, + A75B887D104C16860083C454 /* alignedsimilarity.cpp in Sources */, + A75B887E104C16860083C454 /* ccode.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; diff --git a/alignedsimilarity.cpp b/alignedsimilarity.cpp new file mode 100644 index 0000000..0ed9ce3 --- /dev/null +++ b/alignedsimilarity.cpp @@ -0,0 +1,543 @@ +/* + * alignedsimilarity.cpp + * Mothur + * + * Created by westcott on 8/18/09. + * Copyright 2009 Schloss Lab. All rights reserved. + * + */ + +#include "alignedsimilarity.h" +#include "ignoregaps.h" + + +//*************************************************************************************************************** +AlignSim::AlignSim(string filename, string temp) { fastafile = filename; templateFile = temp; } +//*************************************************************************************************************** + +AlignSim::~AlignSim() { + try { + for (int i = 0; i < querySeqs.size(); i++) { delete querySeqs[i]; } + for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; } + delete distCalc; + } + catch(exception& e) { + errorOut(e, "AlignSim", "~AlignSim"); + exit(1); + } +} +//*************************************************************************************************************** +void AlignSim::print(ostream& out) { + try { + + mothurOutEndLine(); + + for (int i = 0; i < querySeqs.size(); i++) { + + int j = 0; float largest = -10; + //find largest sim value + for (int k = 0; k < IS[i].size(); k++) { + //is this score larger + if (IS[i][k].score > largest) { + j = k; + largest = IS[i][k].score; + } + } + + //find parental similarity + distCalc->calcDist(*(IS[i][j].leftParent), *(IS[i][j].rightParent)); + float dist = distCalc->getDist(); + + //convert to similarity + dist = (1 - dist) * 100; + + //warn about parental similarity - if its above 82% may not detect a chimera + if (dist >= 82) { mothurOut("When the chimeras parental similarity is above 82%, detection rates drop signifigantly."); mothurOutEndLine(); } + + int index = ceil(dist); + + if (index == 0) { index=1; } + + //is your DE value higher than the 95% + string chimera; + if (IS[i][j].score > quantile[index-1][4]) { chimera = "Yes"; } + else { chimera = "No"; } + + out << querySeqs[i]->getName() << "\tparental similarity: " << dist << "\tIS: " << IS[i][j].score << "\tbreakpoint: " << IS[i][j].midpoint << "\tchimera flag: " << chimera << endl; + + if (chimera == "Yes") { + mothurOut(querySeqs[i]->getName() + "\tparental similarity: " + toString(dist) + "\tIS: " + toString(IS[i][j].score) + "\tbreakpoint: " + toString(IS[i][j].midpoint) + "\tchimera flag: " + chimera); mothurOutEndLine(); + } + out << "Improvement Score\t"; + + for (int r = 0; r < IS[i].size(); r++) { out << IS[i][r].score << '\t'; } + out << endl; + } + } + catch(exception& e) { + errorOut(e, "AlignSim", "print"); + exit(1); + } +} + +//*************************************************************************************************************** +void AlignSim::getChimeras() { + try { + + //read in query sequences and subject sequences + mothurOut("Reading sequences and template file... "); cout.flush(); + querySeqs = readSeqs(fastafile); + templateSeqs = readSeqs(templateFile); + mothurOut("Done."); mothurOutEndLine(); + + int numSeqs = querySeqs.size(); + + IS.resize(numSeqs); + + //break up file if needed + int linesPerProcess = numSeqs / processors ; + + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + //find breakup of sequences for all times we will Parallelize + if (processors == 1) { lines.push_back(new linePair(0, numSeqs)); } + else { + //fill line pairs + for (int i = 0; i < (processors-1); i++) { + lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess))); + } + //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end + int i = processors - 1; + lines.push_back(new linePair((i*linesPerProcess), numSeqs)); + } + + //find breakup of templatefile for quantiles + if (processors == 1) { templateLines.push_back(new linePair(0, templateSeqs.size())); } + else { + for (int i = 0; i < processors; i++) { + templateLines.push_back(new linePair()); + templateLines[i]->start = int (sqrt(float(i)/float(processors)) * templateSeqs.size()); + templateLines[i]->end = int (sqrt(float(i+1)/float(processors)) * templateSeqs.size()); + } + } + #else + lines.push_back(new linePair(0, numSeqs)); + templateLines.push_back(new linePair(0, templateSeqs.size())); + #endif + + + distCalc = new ignoreGaps(); + + //find window breaks + windowBreak = findWindows(); +//for (int i = 0; i < windowBreak.size(); i++) { cout << windowBreak[i] << '\t'; } +//cout << endl; + + mothurOut("Finding the IS values for your sequences..."); cout.flush(); + if (processors == 1) { + IS = findIS(lines[0]->start, lines[0]->end, querySeqs); + }else { IS = createProcessesIS(querySeqs, lines); } + mothurOut("Done."); mothurOutEndLine(); + + //quantiles are used to determine whether the de values found indicate a chimera + if (quanfile != "") { quantile = readQuantiles(); } + else { + + mothurOut("Calculating quantiles for your template. This can take a while... I will output the quantiles to a .alignedsimilarity.quan file that you can input them using the quantiles parameter next time you run this command. Providing the .quan file will dramatically improve speed. "); cout.flush(); + //get IS quantiles as a reference to these IS scores + //you are assuming the template is free of chimeras and therefore will give you a baseline as to the scores you would like to see + if (processors == 1) { + templateIS = findIS(templateLines[0]->start, templateLines[0]->end, templateSeqs); + }else { templateIS = createProcessesIS(templateSeqs, templateLines); } +//cout << "here" << endl; + ofstream out4; + string o; + + o = getRootName(templateFile) + "alignedsimilarity.quan"; + + openOutputFile(o, out4); + + //adjust quantiles + //construct table + quantile.resize(100); + + for (int i = 0; i < templateIS.size(); i++) { + + if (templateIS[i].size() != 0) { + int j = 0; float score = -1000; + //find highest IS score + //cout << templateIS[i].size() << endl; + for (int k = 0; k < templateIS[i].size(); k++) { + if (templateIS[i][k].score > score) { + score = templateIS[i][k].score; + j = k; + } + } + //cout << j << endl; + //find similarity of parents + distCalc->calcDist(*(templateIS[i][j].leftParent), *(templateIS[i][j].rightParent)); + float dist = distCalc->getDist(); + + //convert to similarity + dist = (1 - dist) * 100; + // cout << dist << endl; + int index = ceil(dist); + // cout << "index = " << index << endl; + if (index == 0) { index = 1; } + quantile[index-1].push_back(templateIS[i][j].score); + } + } + + + for (int i = 0; i < quantile.size(); i++) { + vector temp; + + if (quantile[i].size() == 0) { + //in case this is not a distance found in your template files + for (int g = 0; g < 6; g++) { + temp.push_back(0.0); + } + }else{ + + sort(quantile[i].begin(), quantile[i].end()); + + //save 10% + temp.push_back(quantile[i][int(quantile[i].size() * 0.10)]); + //save 25% + temp.push_back(quantile[i][int(quantile[i].size() * 0.25)]); + //save 50% + temp.push_back(quantile[i][int(quantile[i].size() * 0.5)]); + //save 75% + temp.push_back(quantile[i][int(quantile[i].size() * 0.75)]); + //save 95% + temp.push_back(quantile[i][int(quantile[i].size() * 0.95)]); + //save 99% + temp.push_back(quantile[i][int(quantile[i].size() * 0.99)]); + + } + + //output quan value + out4 << i+1 << '\t'; + for (int u = 0; u < temp.size(); u++) { out4 << temp[u] << '\t'; } + out4 << endl; + + quantile[i] = temp; + + } + + out4.close(); + + mothurOut("Done."); mothurOutEndLine(); + + } + + //free memory + for (int i = 0; i < lines.size(); i++) { delete lines[i]; } + for (int i = 0; i < templateLines.size(); i++) { delete templateLines[i]; } + + } + catch(exception& e) { + errorOut(e, "AlignSim", "getChimeras"); + exit(1); + } +} +//*************************************************************************************************************** +vector AlignSim::findWindows() { + try { + + vector win; + + if (increment > querySeqs[0]->getAligned().length()) { mothurOut("You have selected an increment larger than the length of your sequences. I will use the default of 25."); increment = 25; } + + for (int m = increment; m < (querySeqs[0]->getAligned().length() - increment); m+=increment) { win.push_back(m); } + + return win; + + } + catch(exception& e) { + errorOut(e, "AlignSim", "findWindows"); + exit(1); + } +} + +//*************************************************************************************************************** +vector< vector > AlignSim::findIS(int start, int end, vector seqs) { + try { + + vector< vector > isValues; + isValues.resize(seqs.size()); + + //for each sequence + for(int i = start; i < end; i++){ + + vector temp; temp.resize(windowBreak.size()); + + mothurOut("Finding IS value for sequence " + toString(i)); mothurOutEndLine(); + + //for each window + for (int m = 0; m < windowBreak.size(); m++) { + + vector closest; //left, right, overall + vector similarity; //left+right and overall + + //find closest left, closest right and closest overall + closest = findClosestSides(seqs[i], windowBreak[m], similarity, i); + + int totalNumBases = seqs[i]->getUnaligned().length(); + + //IS = left+right-overall + float is = ((similarity[0]+similarity[1]) / (float) totalNumBases) - (similarity[2] / (float) totalNumBases); + + ///save IS, leftparent, rightparent, breakpoint + temp[m].leftParent = closest[0]; + temp[m].rightParent = closest[1]; + temp[m].score = is; + temp[m].midpoint = windowBreak[m]; +//cout << is << '\t'; + } +//cout << endl; + isValues[i] = temp; + + } + + return isValues; + + } + catch(exception& e) { + errorOut(e, "AlignSim", "findIS"); + exit(1); + } +} +//*************************************************************************************************************** +vector AlignSim::findClosestSides(Sequence* seq, int breakpoint, vector& sim, int i) { + try{ + + vector closest; + + Sequence query, queryLeft, queryRight; + string frag = seq->getAligned(); + string fragLeft = frag.substr(0, breakpoint); + string fragRight = frag.substr(breakpoint, frag.length()); + + //get pieces + query = *(seq); + queryLeft = *(seq); + queryRight = *(seq); + queryLeft.setAligned(fragLeft); + queryRight.setAligned(fragRight); + + //initialize + Sequence* overall = templateSeqs[0]; + Sequence* left = templateSeqs[0]; + Sequence* right = templateSeqs[0]; + + float smallestOverall, smallestLeft, smallestRight; + smallestOverall = 1000; smallestLeft = 1000; smallestRight = 1000; + + //go through the templateSeqs and search for the closest + for(int j = 0; j < templateSeqs.size(); j++){ + + //so you don't pick yourself + if (j != i) { + Sequence temp, tempLeft, tempRight; + string fragTemp = templateSeqs[j]->getAligned(); + string fragTempLeft = fragTemp.substr(0, breakpoint); + string fragTempRight = fragTemp.substr(breakpoint, fragTemp.length()); + + //get pieces + temp = *(templateSeqs[j]); + tempLeft = *(templateSeqs[j]); + tempRight = *(templateSeqs[j]); + tempLeft.setAligned(fragTempLeft); + tempRight.setAligned(fragTempRight); + + //find overall dist + distCalc->calcDist(query, temp); + float dist = distCalc->getDist(); + + if (dist < smallestOverall) { + overall = templateSeqs[j]; + smallestOverall = dist; + } + + //find left dist + distCalc->calcDist(queryLeft, tempLeft); + dist = distCalc->getDist(); + + if (dist < smallestLeft) { + left = templateSeqs[j]; + smallestLeft = dist; + } + + //find left dist + distCalc->calcDist(queryRight, tempRight); + dist = distCalc->getDist(); + + if (dist < smallestRight) { + right = templateSeqs[j]; + smallestRight = dist; + } + } + + } + + closest.push_back(left); + closest.push_back(right); + closest.push_back(overall); + + //fill sim with number of matched bases + Sequence tempL = *(left); + string tempLFrag = tempL.getAligned(); + tempL.setAligned(tempLFrag.substr(0, breakpoint)); + + int bothMatches = findNumMatchedBases(queryLeft, tempL); + sim.push_back(bothMatches); + + Sequence tempR = *(right); + string tempRFrag = tempR.getAligned(); + tempR.setAligned(tempRFrag.substr(breakpoint, tempRFrag.length())); + + bothMatches = findNumMatchedBases(queryRight, tempR); + sim.push_back(bothMatches); + + bothMatches = findNumMatchedBases(query, *(overall)); + sim.push_back(bothMatches); + + return closest; + + + } + catch(exception& e) { + errorOut(e, "AlignSim", "findClosestSides"); + exit(1); + } +} +//*************************************************************************************************************** + +int AlignSim::findNumMatchedBases(Sequence seq, Sequence temp) { + try{ + + int num = 0; + + string query = seq.getAligned(); + string subject = temp.getAligned(); +//cout << seq.getName() << endl << endl; +//cout << temp.getName() << endl << endl; + + for (int i = 0; i < query.length(); i++) { + //count matches if query[i] is a base and subject is same bases or gap + if (isalpha(query[i])) { + if(!isalpha(subject[i])) { num++; } + else if (query[i] == subject[i]) { num++; } + else { } + } + } +//cout << "num = " << num << endl; + return num; + } + catch(exception& e) { + errorOut(e, "AlignSim", "findNumMatchedBases"); + exit(1); + } +} + +/**************************************************************************************************/ +vector< vector > AlignSim::createProcessesIS(vector seqs, vector linesToProcess) { + try { + vector< vector > localIs; localIs.resize(seqs.size()); + +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + int process = 0; + vector processIDS; + + //loop through and create all the processes you want + while (process != processors) { + int pid = fork(); + + if (pid > 0) { + processIDS.push_back(pid); + process++; + }else if (pid == 0){ + + mothurOut("Finding IS values for sequences " + toString(linesToProcess[process]->start) + " to " + toString(linesToProcess[process]->end)); mothurOutEndLine(); + localIs = findIS(linesToProcess[process]->start, linesToProcess[process]->end, seqs); + mothurOut("Done finding IS values for sequences " + toString(linesToProcess[process]->start) + " to " + toString(linesToProcess[process]->end)); mothurOutEndLine(); + + //write out data to file so parent can read it + ofstream out; + string s = toString(getpid()) + ".temp"; + openOutputFile(s, out); + + //output pairs + for (int i = linesToProcess[process]->start; i < linesToProcess[process]->end; i++) { + out << localIs[i].size() << endl; + for (int j = 0; j < localIs[i].size(); j++) { + localIs[i][j].leftParent->printSequence(out); + localIs[i][j].rightParent->printSequence(out); + out << ">" << '\t' << localIs[i][j].score << '\t' << localIs[i][j].midpoint << endl; + } + } + out.close(); + + exit(0); + }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); } + } + + //force parent to wait until all the processes are done + for (int i=0;istart; k < linesToProcess[i]->end; k++) { + int size; + in >> size; + gobble(in); + + vector tempVector; + + for (int j = 0; j < size; j++) { + + sim temp; + + temp.leftParent = new Sequence(in); + gobble(in); + //temp.leftParent->printSequence(cout); + temp.rightParent = new Sequence(in); + gobble(in); +//temp.rightParent->printSequence(cout); + string throwaway; + in >> throwaway >> temp.score >> temp.midpoint; + //cout << temp.score << '\t' << temp.midpoint << endl; + gobble(in); + + tempVector.push_back(temp); + } + + localIs[k] = tempVector; + } + + in.close(); + remove(s.c_str()); + } + + +#else + localIs = findIS(linesToProcess[0]->start, linesToProcess[0]->end, seqs); +#endif + + return localIs; + } + catch(exception& e) { + errorOut(e, "AlignSim", "createProcessesIS"); + exit(1); + } +} + +//*************************************************************************************************************** diff --git a/alignedsimilarity.h b/alignedsimilarity.h new file mode 100644 index 0000000..716b8b7 --- /dev/null +++ b/alignedsimilarity.h @@ -0,0 +1,67 @@ +#ifndef ALIGNSIM_H +#define ALIGNSIM_H + +/* + * alignedsimilarity.h + * Mothur + * + * Created by westcott on 8/18/09. + * Copyright 2009 Schloss Lab. All rights reserved. + * + */ + + +#include "chimera.h" +#include "dist.h" + +//This class was created using the algorythms described in the +//"Evaluation of Nearest Neighbor Methods for Detection of Chimeric Small-Subunit rRna Sequences" paper +//by J.F. Robison-Cox 1, M.M. Bateson 2 and D.M. Ward 2 + + +/***********************************************************/ + +class AlignSim : public Chimera { + + public: + AlignSim(string, string); + ~AlignSim(); + + void getChimeras(); + void print(ostream&); + + void setCons(string){}; + void setQuantiles(string q) { quanfile = q; }; + + + private: + + Dist* distCalc; + vector lines; + vector templateLines; + + vector querySeqs; + vector templateSeqs; + + vector< vector > IS; //IS[0] is the vector os sim values for each window for querySeqs[0] + vector< vector > templateIS; //templateIS[0] is the vector os sim values for each window for templateSeqs[0] + vector windowBreak; + + string fastafile, templateFile; + int iters; + + vector< vector > findIS(int, int, vector); + vector findClosestSides(Sequence*, int, vector&, int); + int findNumMatchedBases(Sequence, Sequence); + vector findWindows(); + + vector< vector > quantile; + + vector< vector > createProcessesIS(vector, vector); + +}; + +/***********************************************************/ + +#endif + diff --git a/bellerophon.cpp b/bellerophon.cpp index dc021c0..2ce3951 100644 --- a/bellerophon.cpp +++ b/bellerophon.cpp @@ -35,7 +35,7 @@ void Bellerophon::print(ostream& out) { out << pref[i].name << '\t' << setprecision(3) << pref[i].score[0] << '\t' << pref[i].leftParent[0] << '\t' << pref[i].rightParent[0] << endl; //calc # of seqs with preference above 1.0 - if (pref[i].score[0] > 1.0) { + if (pref[i].score[0] > 1.1) { above1++; mothurOut(pref[i].name + " is a suspected chimera at breakpoint " + toString(pref[i].midpoint)); mothurOutEndLine(); mothurOut("It's score is " + toString(pref[i].score[0]) + " with suspected left parent " + pref[i].leftParent[0] + " and right parent " + pref[i].rightParent[0]); mothurOutEndLine(); @@ -44,7 +44,7 @@ void Bellerophon::print(ostream& out) { //output results to screen mothurOutEndLine(); - mothurOut("Sequence with preference score above 1.0: " + toString(above1)); mothurOutEndLine(); + mothurOut("Sequence with preference score above 1.1: " + toString(above1)); mothurOutEndLine(); int spot; spot = pref.size()-1; mothurOut("Minimum:\t" + toString(pref[spot].score[0])); mothurOutEndLine(); @@ -115,7 +115,7 @@ void Bellerophon::getChimeras() { }else{ increment = 0; } } -cout << "increment = " << increment << endl; + if (increment == 0) { iters = 1; } else { iters = ((seqs[0]->getAlignLength() - (2*window)) / increment); } @@ -193,7 +193,6 @@ cout << "increment = " << increment << endl; delete SparseLeft; delete SparseRight; - //fill preference structure generatePreferences(distMapLeft, distMapRight, midpoint); @@ -204,7 +203,7 @@ cout << "increment = " << increment << endl; delete distCalculator; //rank preference score to eachother - float dme = 0.0; + /*float dme = 0.0; float expectedPercent = 1 / (float) (pref.size()); for (int i = 0; i < pref.size(); i++) { dme += pref[i].score[0]; } @@ -216,10 +215,8 @@ cout << "increment = " << increment << endl; //how much higher or lower is this than expected pref[i].score[0] = pref[i].score[0] / expectedPercent; - - - - } + + }*/ //sort Preferences highest to lowest sort(pref.begin(), pref.end(), comparePref); @@ -337,23 +334,25 @@ void Bellerophon::generatePreferences(vector left, vector right, //calculate the dme - int count0 = 0; - for (int i = 0; i < pref.size(); i++) { dme += pref[i].score[1]; if (pref[i].score[1] == 0.0) { count0++; } } + for (int i = 0; i < pref.size(); i++) { dme += pref[i].score[1]; if (pref[i].score[1] == 0.0) { count0++; } } - float expectedPercent = 1 / (float) (pref.size() - count0); + //float expectedPercent = 1 / (float) (pref.size() - count0); //cout << endl << "dme = " << dme << endl; //recalculate prefernences based on dme for (int i = 0; i < pref.size(); i++) { -//cout << "unadjusted pref " << i << " = " << pref[i].score[1] << endl; +//cout << "unadjusted col[i] " << i << " = " << pref[i].score[1] << endl; +//cout << i << '\t' << (dme / (dme - 2 * pref[i].score[1])) << endl; + // gives the actual percentage of the dme this seq adds - pref[i].score[1] = pref[i].score[1] / dme; + //pref[i].score[1] = pref[i].score[1] / dme; //how much higher or lower is this than expected - pref[i].score[1] = pref[i].score[1] / expectedPercent; - - //pref[i].score[1] = dme / (dme - 2 * pref[i].score[1]); + //pref[i].score[1] = pref[i].score[1] / expectedPercent; + //not 2 * pref[i].score[1] because i only calulate the pref scores once. + pref[i].score[1] = dme / (dme - pref[i].score[1]); +//cout << i << '\t' << pref[i].score[1] << endl; //so a non chimeric sequence would be around 1, and a chimeric would be signifigantly higher. //cout << "adjusted pref " << i << " = " << pref[i].score[1] << endl; } diff --git a/bellerophon.h b/bellerophon.h index bc86c18..b309545 100644 --- a/bellerophon.h +++ b/bellerophon.h @@ -13,20 +13,12 @@ #include "chimera.h" #include "filterseqscommand.h" +#include "sparsematrix.hpp" #include "sequence.hpp" #include "dist.h" -struct Preference { - string name; - vector leftParent; //keep the name of closest left associated with the two scores - vector rightParent; //keep the name of closest right associated with the two scores - vector score; //so you can keep last score and calc this score and keep whichever is bigger. - vector closestLeft; //keep the closest left associated with the two scores - vector closestRight; //keep the closest right associated with the two scores - int midpoint; - -}; - +typedef list::iterator MatData; +typedef map SeqMap; //maps sequence to all distance for that seqeunce /***********************************************************/ diff --git a/ccode.cpp b/ccode.cpp new file mode 100644 index 0000000..a3ef5e6 --- /dev/null +++ b/ccode.cpp @@ -0,0 +1,272 @@ +/* + * ccode.cpp + * Mothur + * + * Created by westcott on 8/24/09. + * Copyright 2009 Schloss Lab. All rights reserved. + * + */ + +#include "ccode.h" +#include "ignoregaps.h" + + +//*************************************************************************************************************** +Ccode::Ccode(string filename, string temp) { fastafile = filename; templateFile = temp; } +//*************************************************************************************************************** + +Ccode::~Ccode() { + try { + for (int i = 0; i < querySeqs.size(); i++) { delete querySeqs[i]; } + for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; } + delete distCalc; + } + catch(exception& e) { + errorOut(e, "Ccode", "~Ccode"); + exit(1); + } +} +//*************************************************************************************************************** +void Ccode::print(ostream& out) { + try { + + mothurOutEndLine(); + + for (int i = 0; i < querySeqs.size(); i++) { + + } + } + catch(exception& e) { + errorOut(e, "Ccode", "print"); + exit(1); + } +} + +//*************************************************************************************************************** +void Ccode::getChimeras() { + try { + + //read in query sequences and subject sequences + mothurOut("Reading sequences and template file... "); cout.flush(); + querySeqs = readSeqs(fastafile); + templateSeqs = readSeqs(templateFile); + mothurOut("Done."); mothurOutEndLine(); + + int numSeqs = querySeqs.size(); + + closest.resize(numSeqs); + + //break up file if needed + int linesPerProcess = numSeqs / processors ; + + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + //find breakup of sequences for all times we will Parallelize + if (processors == 1) { lines.push_back(new linePair(0, numSeqs)); } + else { + //fill line pairs + for (int i = 0; i < (processors-1); i++) { + lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess))); + } + //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end + int i = processors - 1; + lines.push_back(new linePair((i*linesPerProcess), numSeqs)); + } + + //find breakup of templatefile for quantiles + if (processors == 1) { templateLines.push_back(new linePair(0, templateSeqs.size())); } + else { + for (int i = 0; i < processors; i++) { + templateLines.push_back(new linePair()); + templateLines[i]->start = int (sqrt(float(i)/float(processors)) * templateSeqs.size()); + templateLines[i]->end = int (sqrt(float(i+1)/float(processors)) * templateSeqs.size()); + } + } + #else + lines.push_back(new linePair(0, numSeqs)); + templateLines.push_back(new linePair(0, templateSeqs.size())); + #endif + + distCalc = new ignoreGaps(); + + //find closest + if (processors == 1) { + mothurOut("Finding top matches for sequences... "); cout.flush(); + closest = findClosest(lines[0]->start, lines[0]->end, numWanted); + mothurOut("Done."); mothurOutEndLine(); + }else { createProcessesClosest(); } + + + for (int i = 0; i < closest.size(); i++) { + cout << querySeqs[i]->getName() << ": "; + for (int j = 0; j < closest[i].size(); j++) { + + cout << closest[i][j]->getName() << '\t'; + } + cout << endl; + } + + //free memory + for (int i = 0; i < lines.size(); i++) { delete lines[i]; } + for (int i = 0; i < templateLines.size(); i++) { delete templateLines[i]; } + + } + catch(exception& e) { + errorOut(e, "Ccode", "getChimeras"); + exit(1); + } +} +/*************************************************************************************************************** +vector Ccode::findWindows() { + try { + + vector win; + + if (increment > querySeqs[0]->getAligned().length()) { mothurOut("You have selected an increment larger than the length of your sequences. I will use the default of 25."); increment = 25; } + + for (int m = increment; m < (querySeqs[0]->getAligned().length() - increment); m+=increment) { win.push_back(m); } + + return win; + + } + catch(exception& e) { + errorOut(e, "Ccode", "findWindows"); + exit(1); + } +} +*/ +//*************************************************************************************************************** +vector< vector > Ccode::findClosest(int start, int end, int numWanted) { + try{ + + vector< vector > topMatches; topMatches.resize(querySeqs.size()); + + float smallestOverall, smallestLeft, smallestRight; + smallestOverall = 1000; smallestLeft = 1000; smallestRight = 1000; + + //for each sequence in querySeqs - find top matches to use as reference + for(int j = start; j < end; j++){ + + Sequence query = *(querySeqs[j]); + + vector distances; + + //calc distance to each sequence in template seqs + for (int i = 0; i < templateSeqs.size(); i++) { + + Sequence ref = *(templateSeqs[i]); + + //find overall dist + distCalc->calcDist(query, ref); + float dist = distCalc->getDist(); + + //save distance + SeqDist temp; + temp.seq = templateSeqs[i]; + temp.dist = dist; + + distances.push_back(temp); + } + + sort(distances.begin(), distances.end(), compareSeqDist); + + //save the number of top matches wanted + for (int h = 0; h < numWanted; h++) { + topMatches[j].push_back(distances[h].seq); + } + } + + return topMatches; + + } + catch(exception& e) { + errorOut(e, "Ccode", "findClosestSides"); + exit(1); + } +} +/**************************************************************************************************/ +void Ccode::createProcessesClosest() { + try { +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + int process = 0; + vector processIDS; + + //loop through and create all the processes you want + while (process != processors) { + int pid = fork(); + + if (pid > 0) { + processIDS.push_back(pid); + process++; + }else if (pid == 0){ + + mothurOut("Finding top matches for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine(); + closest = findClosest(lines[process]->start, lines[process]->end, numWanted); + mothurOut("Done finding top matches for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine(); + + //write out data to file so parent can read it + ofstream out; + string s = toString(getpid()) + ".temp"; + openOutputFile(s, out); + + //output pairs + for (int i = lines[process]->start; i < lines[process]->end; i++) { + out << closest[i].size() << endl; + for (int j = 0; j < closest[i].size(); j++) { + closest[i][j]->printSequence(out); + } + } + out.close(); + + exit(0); + }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); } + } + + //force parent to wait until all the processes are done + for (int i=0;istart; k < lines[i]->end; k++) { + int size; + in >> size; + gobble(in); + + vector tempVector; + + for (int j = 0; j < size; j++) { + + Sequence* temp = new Sequence(in); + gobble(in); + + tempVector.push_back(temp); + } + + closest[k] = tempVector; + } + + in.close(); + remove(s.c_str()); + } + + +#else + closest = findClosest(lines[0]->start, lines[0]->end, numWanted); +#endif + + } + catch(exception& e) { + errorOut(e, "Ccode", "createProcessesClosest"); + exit(1); + } +} + +//*************************************************************************************************************** + diff --git a/ccode.h b/ccode.h new file mode 100644 index 0000000..a4860dc --- /dev/null +++ b/ccode.h @@ -0,0 +1,62 @@ +#ifndef CCODE_H +#define CCODE_H + +/* + * ccode.h + * Mothur + * + * Created by westcott on 8/24/09. + * Copyright 2009 Schloss LAB. All rights reserved. + * + */ + +#include "chimera.h" +#include "dist.h" +#include "decalc.h" + +//This class was created using the algorythms described in the +// "Evaluating putative chimeric sequences from PCR-amplified products" paper +//by Juan M. Gonzalez, Johannes Zimmerman and Cesareo Saiz-Jimenez. + +/***********************************************************/ + +class Ccode : public Chimera { + + public: + Ccode(string, string); + ~Ccode(); + + void getChimeras(); + void print(ostream&); + + void setCons(string c) {} + void setQuantiles(string q) {} + + + private: + + Dist* distCalc; + DeCalculator* decalc; + int iters; + string fastafile, templateFile; + + + vector lines; + vector templateLines; + vector querySeqs; + vector templateSeqs; + + vector< vector > closest; //bestfit[0] is a vector of sequence at are closest to queryseqs[0]... + + vector< vector > findClosest(int, int, int); + void removeSeqs(vector); //removes sequences from closest that are to different of too similar to eachother. + + void createProcessesClosest(); + +}; + +/***********************************************************/ + +#endif + + diff --git a/chimera.cpp b/chimera.cpp index 48e5763..8aaab4b 100644 --- a/chimera.cpp +++ b/chimera.cpp @@ -9,6 +9,76 @@ #include "chimera.h" +//*************************************************************************************************************** +//this is a vertical filter +void Chimera::createFilter(vector seqs) { + try { + + int threshold = int (0.5 * seqs.size()); +//cout << "threshhold = " << threshold << endl; + + vector gaps; gaps.resize(seqs[0]->getAligned().length(), 0); + vector a; a.resize(seqs[0]->getAligned().length(), 0); + vector t; t.resize(seqs[0]->getAligned().length(), 0); + vector g; g.resize(seqs[0]->getAligned().length(), 0); + vector c; c.resize(seqs[0]->getAligned().length(), 0); + + filterString = (string(seqs[0]->getAligned().length(), '1')); + + //for each sequence + for (int i = 0; i < seqs.size(); i++) { + + string seqAligned = seqs[i]->getAligned(); + + for (int j = 0; j < seqAligned.length(); j++) { + //if this spot is a gap + if ((seqAligned[j] == '-') || (seqAligned[j] == '.')) { gaps[j]++; } + else if (toupper(seqAligned[j]) == 'A') { a[j]++; } + else if (toupper(seqAligned[j]) == 'T') { t[j]++; } + else if (toupper(seqAligned[j]) == 'G') { g[j]++; } + else if (toupper(seqAligned[j]) == 'C') { c[j]++; } + } + } + + //zero out spot where all sequences have blanks + for(int i = 0;i < seqs[0]->getAligned().length(); i++){ + if(gaps[i] == seqs.size()) { filterString[i] = '0'; } + + else if (((a[i] < threshold) && (t[i] < threshold) && (g[i] < threshold) && (c[i] < threshold))) { filterString[i] = '0'; } + //cout << "a = " << a[i] << " t = " << t[i] << " g = " << g[i] << " c = " << c[i] << endl; + } +//cout << "filter = " << filterString << endl; + } + catch(exception& e) { + errorOut(e, "Chimera", "createFilter"); + exit(1); + } +} +//*************************************************************************************************************** +void Chimera::runFilter(vector seqs) { + try { + + //for each sequence + for (int i = 0; i < seqs.size(); i++) { + + string seqAligned = seqs[i]->getAligned(); + string newAligned = ""; + + for (int j = 0; j < seqAligned.length(); j++) { + //if this spot is a gap + if (filterString[j] == '1') { newAligned += seqAligned[j]; } + } + + seqs[i]->setAligned(newAligned); + } + + } + catch(exception& e) { + errorOut(e, "Chimera", "runFilter"); + exit(1); + } +} + //*************************************************************************************************************** vector Chimera::readSeqs(string file) { try { @@ -61,3 +131,46 @@ void Chimera::setMask(string filename) { } } //*************************************************************************************************************** + +vector< vector > Chimera::readQuantiles() { + try { + + ifstream in; + openInputFile(quanfile, in); + + vector< vector > quan; + + int num; float ten, twentyfive, fifty, seventyfive, ninetyfive, ninetynine; + + while(!in.eof()){ + + in >> num >> ten >> twentyfive >> fifty >> seventyfive >> ninetyfive >> ninetynine; + + vector temp; + + temp.push_back(ten); + temp.push_back(twentyfive); + temp.push_back(fifty); + temp.push_back(seventyfive); + temp.push_back(ninetyfive); + temp.push_back(ninetynine); + + quan.push_back(temp); + + gobble(in); + } + + in.close(); + return quan; + + } + catch(exception& e) { + errorOut(e, "Chimera", "readQuantiles"); + exit(1); + } +} +//*************************************************************************************************************** + + + + diff --git a/chimera.h b/chimera.h index fd6b53a..366068b 100644 --- a/chimera.h +++ b/chimera.h @@ -12,12 +12,45 @@ #include "mothur.h" -#include "sparsematrix.hpp" #include "sequence.hpp" -typedef list::iterator MatData; -typedef map SeqMap; //maps sequence to all distance for that seqeunce +struct Preference { + string name; + vector leftParent; //keep the name of closest left associated with the two scores + vector rightParent; //keep the name of closest right associated with the two scores + vector score; //so you can keep last score and calc this score and keep whichever is bigger. + vector closestLeft; //keep the closest left associated with the two scores + vector closestRight; //keep the closest right associated with the two scores + int midpoint; + +}; + +struct SeqDist { + Sequence* seq; + float dist; +}; + +//******************************************************************************************************************** +//sorts lowest to highest +inline bool compareSeqDist(SeqDist left, SeqDist right){ + return (left.dist < right.dist); +} +//******************************************************************************************************************** + +struct sim { + Sequence* leftParent; + Sequence* rightParent; + float score; + int midpoint; +}; + +struct linePair { + int start; + int end; + linePair(int i, int j) : start(i), end(j) {} + linePair(){} +}; /***********************************************************************/ @@ -29,16 +62,21 @@ class Chimera { Chimera(string); Chimera(string, string); virtual ~Chimera(){}; - virtual void setFilter(bool f) { filter = f; } + virtual void setFilter(bool f) { filter = f; } virtual void setCorrection(bool c) { correction = c; } virtual void setProcessors(int p) { processors = p; } virtual void setWindow(int w) { window = w; } virtual void setIncrement(int i) { increment = i; } + virtual void setNumWanted(int n) { numWanted = n; } - virtual void setCons(string) {}; - virtual void setQuantiles(string) {}; + virtual void setCons(string){}; + virtual void setQuantiles(string){}; virtual vector readSeqs(string); + virtual vector< vector > readQuantiles(); virtual void setMask(string); + virtual void runFilter(vector); + virtual void createFilter(vector); + //pure functions virtual void getChimeras() = 0; @@ -47,8 +85,8 @@ class Chimera { protected: bool filter, correction; - int processors, window, increment; - string seqMask; + int processors, window, increment, numWanted; + string seqMask, quanfile, filterString; }; diff --git a/chimeraseqscommand.cpp b/chimeraseqscommand.cpp index f980310..6af8aa3 100644 --- a/chimeraseqscommand.cpp +++ b/chimeraseqscommand.cpp @@ -10,6 +10,8 @@ #include "chimeraseqscommand.h" #include "bellerophon.h" #include "pintail.h" +#include "alignedsimilarity.h" +#include "ccode.h" //*************************************************************************************************************** @@ -23,7 +25,7 @@ ChimeraSeqsCommand::ChimeraSeqsCommand(string option){ else { //valid paramters for this command - string Array[] = {"fasta", "filter", "correction", "processors", "method", "window", "increment", "template", "conservation", "quantile", "mask" }; + string Array[] = {"fasta", "filter", "correction", "processors", "method", "window", "increment", "template", "conservation", "quantile", "mask", "numwanted" }; vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); OptionParser parser(option); @@ -61,9 +63,6 @@ ChimeraSeqsCommand::ChimeraSeqsCommand(string option){ if (ableToOpen == 1) { abort = true; } in.close(); } - - - string temp; temp = validParameter.validFile(parameters, "filter", false); if (temp == "not found") { temp = "F"; } @@ -80,10 +79,13 @@ ChimeraSeqsCommand::ChimeraSeqsCommand(string option){ temp = validParameter.validFile(parameters, "increment", false); if (temp == "not found") { temp = "25"; } convert(temp, increment); - + + temp = validParameter.validFile(parameters, "numwanted", false); if (temp == "not found") { temp = "20"; } + convert(temp, numwanted); + method = validParameter.validFile(parameters, "method", false); if (method == "not found") { method = "pintail"; } - if ((method == "pintail") && (templatefile == "")) { mothurOut("You must provide a template file with the pintail method."); mothurOutEndLine(); abort = true; } + if (((method == "pintail") || (method == "alignsim")) && (templatefile == "")) { mothurOut("You must provide a template file with the pintail and alignsim methods."); mothurOutEndLine(); abort = true; } } @@ -101,7 +103,7 @@ void ChimeraSeqsCommand::help(){ mothurOut("The chimera.seqs command reads a fastafile and creates list of potentially chimeric sequences.\n"); mothurOut("The chimera.seqs command parameters are fasta, filter, correction, processors, mask, method, window, increment, template, conservation and quantile.\n"); mothurOut("The fasta parameter is always required and template is required if using pintail.\n"); - mothurOut("The filter parameter allows you to specify if you would like to apply a 50% soft filter this only applies when the method is bellerphon. The default is false. \n"); + mothurOut("The filter parameter allows you to specify if you would like to apply a vertical and 50% soft filter. The default is false. \n"); mothurOut("The correction parameter allows you to put more emphasis on the distance between highly similar sequences and less emphasis on the differences between remote homologs. The default is true. This only applies when the method is bellerphon.\n"); mothurOut("The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n"); mothurOut("The method parameter allows you to specify the method for finding chimeric sequences. The default is pintail. Options include..... \n"); @@ -134,8 +136,10 @@ int ChimeraSeqsCommand::execute(){ if (abort == true) { return 0; } - if (method == "bellerophon") { chimera = new Bellerophon(fastafile); } - else if (method == "pintail") { chimera = new Pintail(fastafile, templatefile); } + if (method == "bellerophon") { chimera = new Bellerophon(fastafile); } + else if (method == "pintail") { chimera = new Pintail(fastafile, templatefile); } + //else if (method == "alignsim") { chimera = new AlignSim(fastafile, templatefile); } + else if (method == "ccode") { chimera = new Ccode(fastafile, templatefile); } else { mothurOut("Not a valid method."); mothurOutEndLine(); return 0; } //set user options @@ -155,6 +159,7 @@ int ChimeraSeqsCommand::execute(){ chimera->setProcessors(processors); chimera->setWindow(window); chimera->setIncrement(increment); + chimera->setNumWanted(numwanted); //find chimeras chimera->getChimeras(); diff --git a/chimeraseqscommand.h b/chimeraseqscommand.h index 85d5732..6274c48 100644 --- a/chimeraseqscommand.h +++ b/chimeraseqscommand.h @@ -30,7 +30,7 @@ private: bool abort; string method, fastafile, templatefile, consfile, quanfile, maskfile; bool filter, correction; - int processors, midpoint, averageLeft, averageRight, window, iters, increment; + int processors, midpoint, averageLeft, averageRight, window, iters, increment, numwanted; Chimera* chimera; diff --git a/decalc.cpp b/decalc.cpp index 860b12f..0589751 100644 --- a/decalc.cpp +++ b/decalc.cpp @@ -251,9 +251,16 @@ float DeCalculator::calcDE(vector obs, vector exp) { //for each window float sum = 0.0; //sum = sum from 1 to m of (oi-ei)^2 - for (int m = 0; m < obs.size(); m++) { sum += ((obs[m] - exp[m]) * (obs[m] - exp[m])); } + int numZeros = 0; + for (int m = 0; m < obs.size(); m++) { - float de = sqrt((sum / (obs.size() - 1))); + //if (obs[m] != 0.0) { + sum += ((obs[m] - exp[m]) * (obs[m] - exp[m])); + //}else { numZeros++; } + + } + + float de = sqrt((sum / (obs.size() - 1 - numZeros))); return de; } @@ -366,6 +373,9 @@ vector< vector > DeCalculator::getQuantiles(vector seqs, //percentage of mismatched pairs 1 to 100 quan.resize(100); +//ofstream o; +//string out = "getQuantiles.out"; +//openOutputFile(out, o); //for each sequence for(int i = start; i < end; i++){ @@ -402,7 +412,7 @@ vector< vector > DeCalculator::getQuantiles(vector seqs, float de = calcDE(obsi, exp); float dist = calcDist(query, subject, front, back); - + //o << i << '\t' << j << '\t' << dist << '\t' << de << endl; dist = ceil(dist); quanMember newScore(de, i, j); @@ -424,19 +434,16 @@ vector< vector > DeCalculator::getQuantiles(vector seqs, exit(1); } } - +//******************************************************************************************************************** +//sorts lowest to highest +inline bool compareQuanMembers(quanMember left, quanMember right){ + return (left.score < right.score); +} //*************************************************************************************************************** //this was going to be used by pintail to increase the sensitivity of the chimera detection, but it wasn't quite right. may want to revisit in the future... -vector< vector > DeCalculator::removeObviousOutliers(vector< vector >& quantiles, int num) { +void DeCalculator::removeObviousOutliers(vector< vector >& quantiles, int num) { try { - vector< vector > quan; - quan.resize(100); - - /*vector contributions; - vector seen; //seen[0] is the number of outliers that template seqs[0] was part of. - seen.resize(num,0); - - //find contributions + for (int i = 0; i < quantiles.size(); i++) { //find mean of this quantile score @@ -444,22 +451,21 @@ vector< vector > DeCalculator::removeObviousOutliers(vector< vector temp; + //look at each value in quantiles to see if it is an outlier for (int j = 0; j < quantiles[i].size(); j++) { - //is this score between 1 and 99% if ((quantiles[i][j].score > low) && (quantiles[i][j].score < high)) { - - }else { - //add to contributions - contributions.push_back(quantiles[i][j]); - seen[quantiles[i][j].member1]++; - seen[quantiles[i][j].member2]++; + temp.push_back(quantiles[i][j]); } } + + quantiles[i] = temp; } +/* //find contributer with most offending score related to it int largestContrib = findLargestContrib(seen); @@ -538,7 +544,7 @@ cout << "high = " << high << endl; } */ - return quan; + } catch(exception& e) { errorOut(e, "DeCalculator", "removeObviousOutliers"); diff --git a/decalc.h b/decalc.h index a307497..da0f96c 100644 --- a/decalc.h +++ b/decalc.h @@ -44,7 +44,7 @@ class DeCalculator { void setAlignmentLength(int l) { alignLength = l; } void runMask(Sequence*); void trimSeqs(Sequence*, Sequence*, map&); - vector< vector > removeObviousOutliers(vector< vector >&, int); + void removeObviousOutliers(vector< vector >&, int); vector calcFreq(vector, string); vector findWindows(Sequence*, int, int, int&, int); vector calcObserved(Sequence*, Sequence*, vector, int); diff --git a/pintail.cpp b/pintail.cpp index f46e0e2..04dfb0a 100644 --- a/pintail.cpp +++ b/pintail.cpp @@ -9,6 +9,7 @@ #include "pintail.h" #include "ignoregaps.h" +#include "eachgapdist.h" //******************************************************************************************************************** //sorts lowest to highest @@ -41,9 +42,11 @@ void Pintail::print(ostream& out) { int index = ceil(deviation[i]); + if (index == 0) { index=1; } + //is your DE value higher than the 95% string chimera; - if (DE[i] > quantiles[index][4]) { chimera = "Yes"; } + if (DE[i] > quantiles[index-1][4]) { chimera = "Yes"; } else { chimera = "No"; } out << querySeqs[i]->getName() << '\t' << "div: " << deviation[i] << "\tstDev: " << DE[i] << "\tchimera flag: " << chimera << endl; @@ -125,7 +128,7 @@ void Pintail::getChimeras() { templateLines.push_back(new linePair(0, templateSeqs.size())); #endif - distcalculator = new ignoreGaps(); + distcalculator = new eachGapDist(); decalc = new DeCalculator(); //if the user does enter a mask then you want to keep all the spots in the alignment @@ -141,7 +144,18 @@ void Pintail::getChimeras() { mothurOut("Done."); mothurOutEndLine(); }else { createProcessesPairs(); } - +/*string o = "foronlinepintailpairs-eachgap"; +ofstream out7; +openOutputFile(o, out7); + +for (int i = 0; i < bestfit.size(); i++) { + out7 << querySeqs[i]->getName() << endl; + out7 << querySeqs[i]->getUnaligned() << endl << endl; + + out7 << bestfit[i]->getName() << endl; + out7 << bestfit[i]->getUnaligned() << endl << endl << endl; +} +out7.close();/*/ //find P mothurOut("Getting conservation... "); cout.flush(); if (consfile == "") { @@ -171,7 +185,19 @@ void Pintail::getChimeras() { } } - + + if (filter) { + vector temp = templateSeqs; + for (int i = 0; i < querySeqs.size(); i++) { temp.push_back(querySeqs[i]); } + + createFilter(temp); + + runFilter(querySeqs); + runFilter(templateSeqs); + runFilter(bestfit); + } + + if (processors == 1) { for (int j = 0; j < bestfit.size(); j++) { @@ -241,26 +267,22 @@ void Pintail::getChimeras() { //quantiles are used to determine whether the de values found indicate a chimera //if you have to calculate them, its time intensive because you are finding the de and deviation values for each //combination of sequences in the template - if (quanfile != "") { quantiles = readQuantiles(); } + if (quanfile != "") { quantiles = readQuantiles(); } else { mothurOut("Calculating quantiles for your template. This can take a while... I will output the quantiles to a .quan file that you can input them using the quantiles parameter next time you run this command. Providing the .quan file will dramatically improve speed. "); cout.flush(); if (processors == 1) { quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size()); }else { createProcessesQuan(); } + + ofstream out4, out5; + string noOutliers, outliers; - - - //decided against this because we were having trouble setting the sensitivity... may want to revisit this... - //quantiles = decalc->removeObviousOutliers(quantilesMembers, templateSeqs.size()); - - ofstream out4; - string o; - - o = getRootName(templateFile) + "quan"; + noOutliers = getRootName(templateFile) + "pintail.quanNOOUTLIERS"; + outliers = getRootName(templateFile) + "pintail.quanYESOUTLIERS"; - openOutputFile(o, out4); + openOutputFile(outliers, out4); //adjust quantiles for (int i = 0; i < quantilesMembers.size(); i++) { @@ -301,6 +323,47 @@ void Pintail::getChimeras() { out4.close(); + decalc->removeObviousOutliers(quantilesMembers, templateSeqs.size()); + + openOutputFile(noOutliers, out5); + + //adjust quantiles + for (int i = 0; i < quantilesMembers.size(); i++) { + vector temp; + + if (quantilesMembers[i].size() == 0) { + //in case this is not a distance found in your template files + for (int g = 0; g < 6; g++) { + temp.push_back(0.0); + } + }else{ + + sort(quantilesMembers[i].begin(), quantilesMembers[i].end(), compareQuanMembers); + + //save 10% + temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.10)].score); + //save 25% + temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.25)].score); + //save 50% + temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.5)].score); + //save 75% + temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.75)].score); + //save 95% + temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.95)].score); + //save 99% + temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.99)].score); + + } + + //output quan value + out5 << i+1 << '\t'; + for (int u = 0; u < temp.size(); u++) { out5 << temp[u] << '\t'; } + out5 << endl; + + quantiles[i] = temp; + + } + mothurOut("Done."); mothurOutEndLine(); } @@ -360,45 +423,6 @@ vector Pintail::readFreq() { } } -//*************************************************************************************************************** - -vector< vector > Pintail::readQuantiles() { - try { - - ifstream in; - openInputFile(quanfile, in); - - vector< vector > quan; - - int num; float ten, twentyfive, fifty, seventyfive, ninetyfive, ninetynine; - - while(!in.eof()){ - - in >> num >> ten >> twentyfive >> fifty >> seventyfive >> ninetyfive >> ninetynine; - - vector temp; - - temp.push_back(ten); - temp.push_back(twentyfive); - temp.push_back(fifty); - temp.push_back(seventyfive); - temp.push_back(ninetyfive); - temp.push_back(ninetynine); - - quan.push_back(temp); - - gobble(in); - } - - in.close(); - return quan; - - } - catch(exception& e) { - errorOut(e, "Pintail", "readQuantiles"); - exit(1); - } -} //*************************************************************************************************************** //calculate the distances from each query sequence to all sequences in the template to find the closest sequence vector Pintail::findPairs(int start, int end) { diff --git a/pintail.h b/pintail.h index ddfba85..37efd00 100644 --- a/pintail.h +++ b/pintail.h @@ -35,17 +35,10 @@ class Pintail : public Chimera { private: - struct linePair { - int start; - int end; - linePair(int i, int j) : start(i), end(j) {} - linePair(){} - }; - Dist* distcalculator; DeCalculator* decalc; int iters; - string fastafile, templateFile, consfile, quanfile; + string fastafile, templateFile, consfile; vector lines; @@ -77,7 +70,6 @@ class Pintail : public Chimera { vector readFreq(); - vector< vector > readQuantiles(); vector findPairs(int, int); void createProcessesSpots();