//do soft filter
if (filter) {
- string optionString = "fasta=" + fastafile + ", soft=50, vertical=F";
+ string optionString = "fasta=" + fastafile + ", soft=50";
filterSeqs = new FilterSeqsCommand(optionString);
filterSeqs->execute();
delete filterSeqs;
distCalculator = new eachGapDist();
//read in sequences
- //seqs = readSeqs(fastafile);
+ seqs = readSeqs(fastafile);
int numSeqs = seqs.size();
if (numSeqs == 0) { mothurOut("Error in reading you sequences."); mothurOutEndLine(); exit(1); }
//set default window to 25% of sequence length
- string seq0 = seqs[0].getAligned();
+ string seq0 = seqs[0]->getAligned();
if (window == 0) { window = seq0.length() / 4; }
else if (window > (seq0.length() / 2)) {
mothurOut("Your sequence length is = " + toString(seq0.length()) + ". You have selected a window size greater than the length of half your aligned sequence. I will run it with a window size of " + toString((seq0.length() / 2))); mothurOutEndLine();
window = (seq0.length() / 2);
}
- if (increment > (seqs[0].getAlignLength() - (2*window))) {
+ if (increment > (seqs[0]->getAlignLength() - (2*window))) {
if (increment != 10) {
mothurOut("You have selected a increment that is too large. I will use the default."); mothurOutEndLine();
increment = 10;
- if (increment > (seqs[0].getAlignLength() - (2*window))) { increment = 0; }
+ if (increment > (seqs[0]->getAlignLength() - (2*window))) { increment = 0; }
}else{ increment = 0; }
}
cout << "increment = " << increment << endl;
if (increment == 0) { iters = 1; }
- else { iters = ((seqs[0].getAlignLength() - (2*window)) / increment); }
+ else { iters = ((seqs[0]->getAlignLength() - (2*window)) / increment); }
//initialize pref
pref.resize(numSeqs);
for (int i = 0; i < numSeqs; i++ ) {
pref[i].leftParent.resize(2); pref[i].rightParent.resize(2); pref[i].score.resize(2); pref[i].closestLeft.resize(2); pref[i].closestRight.resize(3);
- pref[i].name = seqs[i].getName();
+ pref[i].name = seqs[i]->getName();
pref[i].score[0] = 0.0; pref[i].score[1] = 0.0;
pref[i].closestLeft[0] = 100000.0; pref[i].closestLeft[1] = 100000.0;
pref[i].closestRight[0] = 100000.0; pref[i].closestRight[1] = 100000.0;
for (int i = 0; i < seqs.size(); i++) {
//cout << "whole = " << seqs[i].getAligned() << endl;
//save left side
- string seqLeft = seqs[i].getAligned().substr(midpoint-window, window);
+ string seqLeft = seqs[i]->getAligned().substr(midpoint-window, window);
Sequence tempLeft;
- tempLeft.setName(seqs[i].getName());
+ tempLeft.setName(seqs[i]->getName());
tempLeft.setAligned(seqLeft);
left.push_back(tempLeft);
//cout << "left = " << tempLeft.getAligned() << endl;
//save right side
- string seqRight = seqs[i].getAligned().substr(midpoint, window);
+ string seqRight = seqs[i]->getAligned().substr(midpoint, window);
Sequence tempRight;
- tempRight.setName(seqs[i].getName());
+ tempRight.setName(seqs[i]->getName());
tempRight.setAligned(seqRight);
right.push_back(tempRight);
//cout << "right = " << seqRight << endl;
//how much higher or lower is this than expected
pref[i].score[0] = pref[i].score[0] / expectedPercent;
+
+
}
if (itL->second < pref[i].closestLeft[1]) {
pref[i].closestLeft[1] = itL->second;
- pref[i].leftParent[1] = seqs[j].getName();
+ pref[i].leftParent[1] = seqs[j]->getName();
//cout << "updating closest left to " << pref[i].leftParent[1] << endl;
}
//cout << "pref[" << j << "].closestLeft[1] = " << pref[j].closestLeft[1] << " parent = " << pref[j].leftParent[1] << endl;
if (itL->second < pref[j].closestLeft[1]) {
pref[j].closestLeft[1] = itL->second;
- pref[j].leftParent[1] = seqs[i].getName();
+ pref[j].leftParent[1] = seqs[i]->getName();
//cout << "updating closest left to " << pref[j].leftParent[1] << endl;
}
//are you the closest right sequence
if (itR->second < pref[i].closestRight[1]) {
pref[i].closestRight[1] = itR->second;
- pref[i].rightParent[1] = seqs[j].getName();
+ pref[i].rightParent[1] = seqs[j]->getName();
}
if (itR->second < pref[j].closestRight[1]) {
pref[j].closestRight[1] = itR->second;
- pref[j].rightParent[1] = seqs[i].getName();
+ pref[j].rightParent[1] = seqs[i]->getName();
}
}
private:
Dist* distCalculator;
FilterSeqsCommand* filterSeqs;
- vector<Sequence> seqs;
+ vector<Sequence*> seqs;
vector<Preference> pref;
string fastafile;
int iters;
exit(1);
}
}
-//********************************************************************************************************************
-//sorts lowest to highest
-inline bool compareQuanMembers(quanMember left, quanMember right){
- return (left.score < right.score);
-}
//***************************************************************************************************************
//seqs have already been masked
-vector< vector<quanMember> > DeCalculator::getQuantiles(vector<Sequence*> seqs, vector<int> windowSizesTemplate, int window, vector<float> probProfile, int increment, int start, int end, vector<float>& highestDE) {
+vector< vector<quanMember> > DeCalculator::getQuantiles(vector<Sequence*> seqs, vector<int> windowSizesTemplate, int window, vector<float> probProfile, int increment, int start, int end) {
try {
vector< vector<quanMember> > quan;
//dist-1 because vector indexes start at 0.
quan[dist-1].push_back(newScore);
- //save highestDE
- if (de > highestDE[i]) { highestDE[i] = de; }
- if(de > highestDE[j]) { highestDE[j] = de; }
-
delete subject;
}
}
}
//***************************************************************************************************************
-//follows Mallard algorythn in paper referenced from mallard class
-vector<int> DeCalculator::returnObviousOutliers(vector< vector<quanMember> > quantiles, int num) {
- try {
- vector< vector<float> > quan;
- quan.resize(100);
-
- map<quanMember*, float> contributions; //map of quanMember to distance from high or low - how bad is it.
- vector<int> marked; //marked[0] is the penalty of template seqs[0]. the higher the penalty the more likely the sequence is chimeric
- marked.resize(num,0);
-
- //find contributions
- for (int i = 0; i < quantiles.size(); i++) {
-
- //find mean of this quantile score
- sort(quantiles[i].begin(), quantiles[i].end(), compareQuanMembers);
-
- float high = quantiles[i][int(quantiles[i].size() * 0.99)].score;
-
- //look at each value in quantiles to see if it is an outlier
- for (int j = 0; j < quantiles[i].size(); j++) {
-
- //is this score between above 99%
- if (quantiles[i][j].score > high) {
- //find out how "bad" of an outlier you are - so you can rank the outliers
- float dist = quantiles[i][j].score - high;
- contributions[&(quantiles[i][j])] = dist;
-
- //penalizing sequences for being in multiple outliers
- marked[quantiles[i][j].member1]++;
- marked[quantiles[i][j].member2]++;
- }
- }
- }
-
- //find contributer with most offending score related to it
- vector<quanMember> outliers = sortContrib(contributions);
-
- //go through the outliers marking the potential chimeras
- for (int i = 0; i < outliers.size(); i++) {
-
- //who is responsible for this outlying score?
- //if member1 has greater score mark him
- //if member2 has greater score mark her
- //if they are the same mark both
- if (marked[outliers[i].member1] > marked[outliers[i].member2]) { marked[outliers[i].member1]++; }
- else if (marked[outliers[i].member2] > marked[outliers[i].member1]) { marked[outliers[i].member2]++; }
- else if (marked[outliers[i].member2] == marked[outliers[i].member1]) { marked[outliers[i].member2]++; marked[outliers[i].member1]++; }
- }
-
- return marked;
- }
- catch(exception& e) {
- errorOut(e, "DeCalculator", "removeObviousOutliers");
- exit(1);
- }
-}
-//***************************************************************************************************************
//put quanMember in the vector based on how far they are from the 99% or 1%. Biggest offenders in front.
-vector<quanMember> DeCalculator::sortContrib(map<quanMember*, float> quan) {
+/*vector<quanMember> DeCalculator::sortContrib(map<quanMember*, float> quan) {
try{
vector<quanMember> newQuan;
//***************************************************************************************************************
//used by removeObviousOutliers which was attempt to increase sensitivity of chimera detection...not currently used...
-/*int DeCalculator::findLargestContrib(vector<int> seen) {
+int DeCalculator::findLargestContrib(vector<int> seen) {
try{
int largest = 0;
float calcDE(vector<float>, vector<float>);
float calcDist(Sequence*, Sequence*, int, int);
float getCoef(vector<float>, vector<float>);
- vector< vector<quanMember> > getQuantiles(vector<Sequence*>, vector<int>, int, vector<float>, int, int, int, vector<float>&);
+ vector< vector<quanMember> > getQuantiles(vector<Sequence*>, vector<int>, int, vector<float>, int, int, int);
vector<int> returnObviousOutliers(vector< vector<quanMember> >, int);
private:
- vector<quanMember> sortContrib(map<quanMember*, float>); //used by mallard
+ //vector<quanMember> sortContrib(map<quanMember*, float>); //used by mallard
float findAverage(vector<float>);
//int findLargestContrib(vector<int>);
//void removeContrib(int, vector<quanMember>&);
//header
- mothurOut("mothur v.1.4.1");
+ mothurOut("mothur v.1.5.0");
mothurOutEndLine();
- mothurOut("Last updated: 6/23/2009");
+ mothurOut("Last updated: 8/03/2009");
mothurOutEndLine();
mothurOutEndLine();
mothurOut("by");
#include "pintail.h"
#include "ignoregaps.h"
+//********************************************************************************************************************
+//sorts lowest to highest
+inline bool compareQuanMembers(quanMember left, quanMember right){
+ return (left.score < right.score);
+}
//***************************************************************************************************************
Pintail::Pintail(string filename, string temp) { fastafile = filename; templateFile = temp; }
h.resize(numSeqs);
quantiles.resize(100); //one for every percent mismatch
quantilesMembers.resize(100); //one for every percent mismatch
- makeCompliant.resize(templateSeqs.size(), 0.0);
//break up file if needed
int linesPerProcess = numSeqs / processors ;
mothurOut("Calculating quantiles for your template. This can take a while... I will output the quantiles to a .quan file that you can input them using the quantiles parameter next time you run this command. Providing the .quan file will dramatically improve speed. "); cout.flush();
if (processors == 1) {
- quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size(), makeCompliant);
+ quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size());
}else { createProcessesQuan(); }
+
+
//decided against this because we were having trouble setting the sensitivity... may want to revisit this...
//quantiles = decalc->removeObviousOutliers(quantilesMembers, templateSeqs.size());
openOutputFile(o, out4);
//adjust quantiles
- for (int i = 0; i < quantiles.size(); i++) {
+ for (int i = 0; i < quantilesMembers.size(); i++) {
vector<float> temp;
- if (quantiles[i].size() == 0) {
+ if (quantilesMembers[i].size() == 0) {
//in case this is not a distance found in your template files
for (int g = 0; g < 6; g++) {
temp.push_back(0.0);
}
}else{
- sort(quantiles[i].begin(), quantiles[i].end());
+ sort(quantilesMembers[i].begin(), quantilesMembers[i].end(), compareQuanMembers);
//save 10%
- temp.push_back(quantiles[i][int(quantiles[i].size() * 0.10)]);
+ temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.10)].score);
//save 25%
- temp.push_back(quantiles[i][int(quantiles[i].size() * 0.25)]);
+ temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.25)].score);
//save 50%
- temp.push_back(quantiles[i][int(quantiles[i].size() * 0.5)]);
+ temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.5)].score);
//save 75%
- temp.push_back(quantiles[i][int(quantiles[i].size() * 0.75)]);
+ temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.75)].score);
//save 95%
- temp.push_back(quantiles[i][int(quantiles[i].size() * 0.95)]);
+ temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.95)].score);
//save 99%
- temp.push_back(quantiles[i][int(quantiles[i].size() * 0.99)]);
+ temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.99)].score);
}
process++;
}else if (pid == 0){
- quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, templateLines[process]->start, templateLines[process]->end, makeCompliant);
+ quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, templateLines[process]->start, templateLines[process]->end);
//write out data to file so parent can read it
ofstream out;
}
#else
- quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size(), makeCompliant);
+ quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size());
#endif
}
catch(exception& e) {
void createProcesses();
void createProcessesQuan();
- vector<float> makeCompliant; //used by decalc->getQuantiles so pintail and mallard can use same function, it contains the highest de value for each seq in the template
-
};
/***********************************************************/