From: westcott Date: Wed, 12 Aug 2009 16:34:29 +0000 (+0000) Subject: removing mallard X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=bbe4d799c02b23891ff4845fe3aad9376f739f86 removing mallard --- diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index a570003..c4f38cb 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -14,7 +14,6 @@ 370B88070F8A4EE4005AB382 /* getoturepcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 370B88060F8A4EE4005AB382 /* getoturepcommand.cpp */; }; 371B30B40FD7EE67000414CA /* screenseqscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 371B30B20FD7EE67000414CA /* screenseqscommand.cpp */; }; 372095C2103196D70004D347 /* chimera.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 372095C1103196D70004D347 /* chimera.cpp */; }; - 372095CA1031A1D50004D347 /* mallard.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 372095C91031A1D50004D347 /* mallard.cpp */; }; 372A55551017922B00C5194B /* decalc.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 372A55541017922B00C5194B /* decalc.cpp */; }; 372E12700F26365B0095CF7E /* readotucommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 372E126F0F26365B0095CF7E /* readotucommand.cpp */; }; 372E12960F263D5A0095CF7E /* readdistcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 372E12950F263D5A0095CF7E /* readdistcommand.cpp */; }; @@ -199,8 +198,6 @@ 371B30B20FD7EE67000414CA /* screenseqscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = screenseqscommand.cpp; sourceTree = SOURCE_ROOT; }; 371B30B30FD7EE67000414CA /* screenseqscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = screenseqscommand.h; sourceTree = SOURCE_ROOT; }; 372095C1103196D70004D347 /* chimera.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimera.cpp; sourceTree = ""; }; - 372095C81031A1D50004D347 /* mallard.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mallard.h; sourceTree = ""; }; - 372095C91031A1D50004D347 /* mallard.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mallard.cpp; sourceTree = ""; }; 372A55531017922B00C5194B /* decalc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = decalc.h; sourceTree = ""; }; 372A55541017922B00C5194B /* decalc.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = decalc.cpp; sourceTree = ""; }; 372E126E0F26365B0095CF7E /* readotucommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = readotucommand.h; sourceTree = SOURCE_ROOT; }; @@ -650,8 +647,6 @@ 374F2FEA100634B600E97C66 /* bellerophon.cpp */, 372A55531017922B00C5194B /* decalc.h */, 372A55541017922B00C5194B /* decalc.cpp */, - 372095C81031A1D50004D347 /* mallard.h */, - 372095C91031A1D50004D347 /* mallard.cpp */, 374F301110063B6F00E97C66 /* pintail.h */, 374F301210063B6F00E97C66 /* pintail.cpp */, ); @@ -1172,7 +1167,6 @@ 374F301310063B6F00E97C66 /* pintail.cpp in Sources */, 372A55551017922B00C5194B /* decalc.cpp in Sources */, 372095C2103196D70004D347 /* chimera.cpp in Sources */, - 372095CA1031A1D50004D347 /* mallard.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; diff --git a/chimeraseqscommand.cpp b/chimeraseqscommand.cpp index 899d6ef..f980310 100644 --- a/chimeraseqscommand.cpp +++ b/chimeraseqscommand.cpp @@ -11,6 +11,7 @@ #include "bellerophon.h" #include "pintail.h" + //*************************************************************************************************************** ChimeraSeqsCommand::ChimeraSeqsCommand(string option){ @@ -96,14 +97,15 @@ ChimeraSeqsCommand::ChimeraSeqsCommand(string option){ void ChimeraSeqsCommand::help(){ try { - mothurOut("chimera.seqs ASSUMES that your sequences are ALIGNED.\n"); - mothurOut("The chimera.seqs command reads a fastafile and creates a sorted priority score list of potentially chimeric sequences (ideally, the sequences should already be aligned).\n"); + mothurOut("chimera.seqs ASSUMES that your sequences are ALIGNED and if using a template that the template file sequences are the same length as the fasta file sequences.\n\n"); + mothurOut("The chimera.seqs command reads a fastafile and creates list of potentially chimeric sequences.\n"); mothurOut("The chimera.seqs command parameters are fasta, filter, correction, processors, mask, method, window, increment, template, conservation and quantile.\n"); + mothurOut("The fasta parameter is always required and template is required if using pintail.\n"); mothurOut("The filter parameter allows you to specify if you would like to apply a 50% soft filter this only applies when the method is bellerphon. The default is false. \n"); mothurOut("The correction parameter allows you to put more emphasis on the distance between highly similar sequences and less emphasis on the differences between remote homologs. The default is true. This only applies when the method is bellerphon.\n"); mothurOut("The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n"); mothurOut("The method parameter allows you to specify the method for finding chimeric sequences. The default is pintail. Options include..... \n"); - mothurOut("The mask parameter allows you to specify a file containing one sequence you wish to use as a mask for the pintail and mallard method. The default is no mask. If you enter mask=default, then the mask is 236627 EU009184.1 Shigella dysenteriae str. FBD013. \n"); + mothurOut("The mask parameter allows you to specify a file containing one sequence you wish to use as a mask for the pintail. The default is no mask. If you enter mask=default, then the mask is 236627 EU009184.1 Shigella dysenteriae str. FBD013. \n"); mothurOut("The window parameter allows you to specify the window size for searching for chimeras. The default is 300 is method is pintail unless the sequence length is less than 300, and 1/4 sequence length for bellerphon.\n"); mothurOut("The increment parameter allows you to specify how far you move each window while finding chimeric sequences. The default is 25.\n"); mothurOut("The template parameter allows you to enter a template file containing known non-chimeric sequences for use by the pintail algorythm. It is a required parameter if using pintail.\n"); @@ -133,21 +135,21 @@ int ChimeraSeqsCommand::execute(){ if (abort == true) { return 0; } if (method == "bellerophon") { chimera = new Bellerophon(fastafile); } - else if (method == "pintail") { chimera = new Pintail(fastafile, templatefile); - //saves time to avoid generating it - if (consfile != "") { chimera->setCons(consfile); } - else { chimera->setCons(""); } - - //saves time to avoid generating it - if (quanfile != "") { chimera->setQuantiles(quanfile); } - else { chimera->setQuantiles(""); } - - if (maskfile == "default") { mothurOut("I am using the default 236627 EU009184.1 Shigella dysenteriae str. FBD013."); mothurOutEndLine(); } - chimera->setMask(maskfile); - - }else { mothurOut("Not a valid method."); mothurOutEndLine(); return 0; } + else if (method == "pintail") { chimera = new Pintail(fastafile, templatefile); } + else { mothurOut("Not a valid method."); mothurOutEndLine(); return 0; } //set user options + if (maskfile == "default") { mothurOut("I am using the default 236627 EU009184.1 Shigella dysenteriae str. FBD013."); mothurOutEndLine(); } + + //saves time to avoid generating it + if (consfile != "") { chimera->setCons(consfile); } + else { chimera->setCons(""); } + + //saves time to avoid generating it + if (quanfile != "") { chimera->setQuantiles(quanfile); } + else { chimera->setQuantiles(""); } + + chimera->setMask(maskfile); chimera->setFilter(filter); chimera->setCorrection(correction); chimera->setProcessors(processors); diff --git a/decalc.cpp b/decalc.cpp index 5aed358..c951910 100644 --- a/decalc.cpp +++ b/decalc.cpp @@ -154,7 +154,7 @@ vector DeCalculator::calcObserved(Sequence* query, Sequence* subject, vec try { vector temp; - + //int gaps = 0; for (int m = 0; m < window.size(); m++) { string seqFrag = query->getAligned().substr(window[m], size); @@ -164,15 +164,22 @@ vector DeCalculator::calcObserved(Sequence* query, Sequence* subject, vec for (int b = 0; b < seqFrag.length(); b++) { //if at least one is a base and they are not equal if( (isalpha(seqFrag[b]) || isalpha(seqFragsub[b])) && (seqFrag[b] != seqFragsub[b]) ) { diff++; } + + //ignore gaps + //if((!isalpha(seqFrag[b])) && (!isalpha(seqFragsub[b]))) { gaps++; } } //percentage of mismatched bases float dist; //if the whole fragment is 0 distance = 0 + //if ((seqFrag.length()-gaps) == 0) { dist = 0.0; } + + //percentage of mismatched bases + //else { dist = diff / (float) (seqFrag.length()-gaps) * 100; } + + dist = diff / (float) (seqFrag.length()) * 100; - dist = diff / (float) (seqFrag.length()) * 100; - temp.push_back(dist); } @@ -368,7 +375,7 @@ vector< vector > DeCalculator::getQuantiles(vector seqs, //for each sequence for(int i = start; i < end; i++){ - mothurOut("Processing template sequence " + toString(i)); mothurOutEndLine(); + mothurOut("Processing sequence " + toString(i)); mothurOutEndLine(); Sequence* query = new Sequence(seqs[i]->getName(), seqs[i]->getAligned()); //compare to every other sequence in template @@ -409,27 +416,26 @@ vector< vector > DeCalculator::getQuantiles(vector seqs, quan[dist-1].push_back(newScore); //save highestDE - if(de > highestDE[i]) { highestDE[i] = de; } - if(de > highestDE[j]) { highestDE[j] = de; } + if (de > highestDE[i]) { highestDE[i] = de; } + if(de > highestDE[j]) { highestDE[j] = de; } delete subject; - } delete query; } - return quan; } catch(exception& e) { - errorOut(e, "DeCalculator", "findQav"); + errorOut(e, "DeCalculator", "getQuantiles"); exit(1); } } //*************************************************************************************************************** +//this was going to be used by pintail to increase the sensitivity of the chimera detection, but it wasn't quite right. may want to revisit in the future... vector< vector > DeCalculator::removeObviousOutliers(vector< vector >& quantiles, int num) { try { vector< vector > quan; @@ -487,8 +493,9 @@ vector< vector > DeCalculator::removeObviousOutliers(vector< vector marked = returnObviousOutliers(quantiles, num); vector copyMarked = marked; @@ -539,7 +546,7 @@ cout << "high = " << high << endl; quan[i] = temp; } - +*/ return quan; } catch(exception& e) { @@ -565,24 +572,19 @@ vector DeCalculator::returnObviousOutliers(vector< vector > qua sort(quantiles[i].begin(), quantiles[i].end(), compareQuanMembers); float high = quantiles[i][int(quantiles[i].size() * 0.99)].score; - float low = quantiles[i][int(quantiles[i].size() * 0.01)].score; //look at each value in quantiles to see if it is an outlier for (int j = 0; j < quantiles[i].size(); j++) { - //is this score between 1 and 99% - if ((quantiles[i][j].score > low) && (quantiles[i][j].score < high)) { + //is this score between above 99% + if (quantiles[i][j].score > high) { + //find out how "bad" of an outlier you are - so you can rank the outliers + float dist = quantiles[i][j].score - high; + contributions[&(quantiles[i][j])] = dist; - }else { - float dist; - //add to contributions - if (quantiles[i][j].score < low) { - dist = low - quantiles[i][j].score; - contributions[&(quantiles[i][j])] = dist; - }else { //you are higher than high - dist = quantiles[i][j].score - high; - contributions[&(quantiles[i][j])] = dist; - } + //penalizing sequences for being in multiple outliers + marked[quantiles[i][j].member1]++; + marked[quantiles[i][j].member2]++; } } } @@ -597,9 +599,9 @@ vector DeCalculator::returnObviousOutliers(vector< vector > qua //if member1 has greater score mark him //if member2 has greater score mark her //if they are the same mark both - if (marked[outliers[i].member1] > marked[outliers[i].member2]) { marked[outliers[i].member1]++; } - else if (marked[outliers[i].member2] > marked[outliers[i].member1]) { marked[outliers[i].member2]++; } - else if (marked[outliers[i].member2] == marked[outliers[i].member1]) { marked[outliers[i].member2]++; marked[outliers[i].member1]++; } + if (marked[outliers[i].member1] > marked[outliers[i].member2]) { marked[outliers[i].member1]++; } + else if (marked[outliers[i].member2] > marked[outliers[i].member1]) { marked[outliers[i].member2]++; } + else if (marked[outliers[i].member2] == marked[outliers[i].member1]) { marked[outliers[i].member2]++; marked[outliers[i].member1]++; } } return marked; @@ -610,6 +612,7 @@ vector DeCalculator::returnObviousOutliers(vector< vector > qua } } //*************************************************************************************************************** +//put quanMember in the vector based on how far they are from the 99% or 1%. Biggest offenders in front. vector DeCalculator::sortContrib(map quan) { try{ @@ -624,7 +627,7 @@ vector DeCalculator::sortContrib(map quan) { for (it = quan.begin(); it != quan.end(); it++) { if (it->second > largest->second) { largest = it; } } - +cout << largest->second << '\t' << largest->first->score << '\t' << largest->first->member1 << '\t' << largest->first->member2 << endl; //save it newQuan.push_back(*(largest->first)); @@ -642,7 +645,8 @@ vector DeCalculator::sortContrib(map quan) { } //*************************************************************************************************************** -int DeCalculator::findLargestContrib(vector seen) { +//used by removeObviousOutliers which was attempt to increase sensitivity of chimera detection...not currently used... +/*int DeCalculator::findLargestContrib(vector seen) { try{ int largest = 0; @@ -684,7 +688,7 @@ void DeCalculator::removeContrib(int bad, vector& quan) { exit(1); } } - +*/ //*************************************************************************************************************** float DeCalculator::findAverage(vector myVector) { try{ diff --git a/decalc.h b/decalc.h index 65a8f46..e2a6cc7 100644 --- a/decalc.h +++ b/decalc.h @@ -60,8 +60,8 @@ class DeCalculator { private: vector sortContrib(map); //used by mallard float findAverage(vector); - int findLargestContrib(vector); - void removeContrib(int, vector&); + //int findLargestContrib(vector); + //void removeContrib(int, vector&); string seqMask; set h; int alignLength; diff --git a/mallard.cpp b/mallard.cpp deleted file mode 100644 index 7f387c2..0000000 --- a/mallard.cpp +++ /dev/null @@ -1,238 +0,0 @@ -/* - * mallard.cpp - * Mothur - * - * Created by Sarah Westcott on 8/11/09. - * Copyright 2009 __MyCompanyName__. All rights reserved. - * - */ - -#include "mallard.h" - -//*************************************************************************************************************** - -Mallard::Mallard(string filename) { fastafile = filename; } -//*************************************************************************************************************** - -Mallard::~Mallard() { - try { - for (int i = 0; i < querySeqs.size(); i++) { delete querySeqs[i]; } - } - catch(exception& e) { - errorOut(e, "Mallard", "~Mallard"); - exit(1); - } -} -//*************************************************************************************************************** -void Mallard::print(ostream& out) { - try { - - for (int i = 0; i < querySeqs.size(); i++) { - - out << querySeqs[i]->getName() << "\thighest de value = " << highestDE[i] << "\tpenalty score = " << marked[i] << endl; - cout << querySeqs[i]->getName() << "\tpenalty score = " << marked[i] << endl; - - } - } - catch(exception& e) { - errorOut(e, "Mallard", "print"); - exit(1); - } -} - -//*************************************************************************************************************** -void Mallard::getChimeras() { - try { - - //read in query sequences and subject sequences - mothurOut("Reading sequences and template file... "); cout.flush(); - querySeqs = readSeqs(fastafile); - mothurOut("Done."); mothurOutEndLine(); - - int numSeqs = querySeqs.size(); - - windowSizes.resize(numSeqs, window); - quantilesMembers.resize(100); //one for every percent mismatch - highestDE.resize(numSeqs, 0.0); //contains the highest de value for each seq - - //break up file if needed - int linesPerProcess = numSeqs / processors ; - - #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - //find breakup of sequences for all times we will Parallelize - if (processors == 1) { lines.push_back(new linePair(0, numSeqs)); } - else { - //fill line pairs - for (int i = 0; i < (processors-1); i++) { - lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess))); - } - //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end - int i = processors - 1; - lines.push_back(new linePair((i*linesPerProcess), numSeqs)); - } - - #else - lines.push_back(new linePair(0, numSeqs)); - #endif - - decalc = new DeCalculator(); - - //if the user does enter a mask then you want to keep all the spots in the alignment - if (seqMask.length() == 0) { decalc->setAlignmentLength(querySeqs[0]->getAligned().length()); } - else { decalc->setAlignmentLength(seqMask.length()); } - - decalc->setMask(seqMask); - - //find P - mothurOut("Getting conservation... "); cout.flush(); - probabilityProfile = decalc->calcFreq(querySeqs, fastafile); - - //make P into Q - for (int i = 0; i < probabilityProfile.size(); i++) { probabilityProfile[i] = 1 - probabilityProfile[i]; } - mothurOut("Done."); mothurOutEndLine(); - - //mask sequences if the user wants to - if (seqMask != "") { - //mask querys - for (int i = 0; i < querySeqs.size(); i++) { - decalc->runMask(querySeqs[i]); - } - } - - mothurOut("Calculating DE values..."); cout.flush(); - if (processors == 1) { - quantilesMembers = decalc->getQuantiles(querySeqs, windowSizes, window, probabilityProfile, increment, 0, querySeqs.size(), highestDE); - }else { createProcessesQuan(); } - mothurOut("Done."); mothurOutEndLine(); - - mothurOut("Ranking outliers..."); cout.flush(); - marked = decalc->returnObviousOutliers(quantilesMembers, querySeqs.size()); - mothurOut("Done."); mothurOutEndLine(); - - - //free memory - for (int i = 0; i < lines.size(); i++) { delete lines[i]; } - delete decalc; - } - catch(exception& e) { - errorOut(e, "Mallard", "getChimeras"); - exit(1); - } -} -/**************************************************************************************************/ - -void Mallard::createProcessesQuan() { - try { -#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - int process = 0; - vector processIDS; - - //loop through and create all the processes you want - while (process != processors) { - int pid = fork(); - - if (pid > 0) { - processIDS.push_back(pid); - process++; - }else if (pid == 0){ - - quantilesMembers = decalc->getQuantiles(querySeqs, windowSizes, window, probabilityProfile, increment, lines[process]->start, lines[process]->end, highestDE); - - //write out data to file so parent can read it - ofstream out; - string s = toString(getpid()) + ".temp"; - openOutputFile(s, out); - - - //output observed distances - for (int i = 0; i < quantilesMembers.size(); i++) { - out << quantilesMembers[i].size() << '\t'; - for (int j = 0; j < quantilesMembers[i].size(); j++) { - out << quantilesMembers[i][j].score << '\t' << quantilesMembers[i][j].member1 << '\t' << quantilesMembers[i][j].member2 << '\t'; - } - out << endl; - } - - out << highestDE.size() << endl; - for (int i = 0; i < highestDE.size(); i++) { - out << highestDE[i] << '\t'; - } - out << endl; - - out.close(); - - exit(0); - }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); } - } - - //force parent to wait until all the processes are done - for (int i=0;i > quan; - quan.resize(100); - - //get quantiles - for (int m = 0; m < quan.size(); m++) { - int num; - in >> num; - - gobble(in); - - vector q; float w; int b, n; - for (int j = 0; j < num; j++) { - in >> w >> b >> n; - //cout << w << '\t' << b << '\t' n << endl; - quanMember newMember(w, b, n); - q.push_back(newMember); - } -//cout << "here" << endl; - quan[m] = q; -//cout << "now here" << endl; - gobble(in); - } - - - //save quan in quantiles - for (int j = 0; j < quan.size(); j++) { - //put all values of q[i] into quan[i] - for (int l = 0; l < quan[j].size(); l++) { quantilesMembers[j].push_back(quan[j][l]); } - //quantilesMembers[j].insert(quantilesMembers[j].begin(), quan[j].begin(), quan[j].end()); - } - - int num; - in >> num; gobble(in); - - int count = lines[process]->start; - for (int s = 0; s < num; s++) { - float high; - in >> high; - - highestDE[count] = high; - count++; - } - - in.close(); - remove(s.c_str()); - } - -#else - quantilesMembers = decalc->getQuantiles(querySeqs, windowSizes, window, probabilityProfile, increment, 0, querySeqs.size(), highestDE); -#endif - } - catch(exception& e) { - errorOut(e, "Mallard", "createProcessesQuan"); - exit(1); - } -} -//*************************************************************************************************************** - - diff --git a/pintail.cpp b/pintail.cpp index 28984c6..53bdd9e 100644 --- a/pintail.cpp +++ b/pintail.cpp @@ -89,6 +89,7 @@ void Pintail::getChimeras() { h.resize(numSeqs); quantiles.resize(100); //one for every percent mismatch quantilesMembers.resize(100); //one for every percent mismatch + makeCompliant.resize(templateSeqs.size(), 0.0); //break up file if needed int linesPerProcess = numSeqs / processors ; @@ -137,25 +138,6 @@ void Pintail::getChimeras() { }else { createProcessesPairs(); } - for (int j = 0; j < bestfit.size(); j++) { - - //chops off beginning and end of sequences so they both start and end with a base - ofstream out; - string s = querySeqs[j]->getName(); - - openOutputFile(s, out); - out << ">" << querySeqs[j]->getName() << endl; - out << querySeqs[j]->getAligned() << endl; - out.close(); - - string t =querySeqs[j]->getName() + ".ref"; - openOutputFile(t, out); - out << ">" << bestfit[j]->getName() << endl; - out << bestfit[j]->getAligned() << endl; - out.close(); - } - - //find P mothurOut("Getting conservation... "); cout.flush(); if (consfile == "") {