From 348a7bac1d3c5d17ae0e4a93b78f69f4e200a190 Mon Sep 17 00:00:00 2001 From: westcott Date: Fri, 10 Jul 2009 15:53:50 +0000 Subject: [PATCH] added sorted option to get.rabund command --- Mothur.xcodeproj/project.pbxproj | 22 ++ bellerophon.cpp | 409 +++++++++++++++++++++ bellerophon.h | 63 ++++ chimera.h | 53 +++ chimeraseqscommand.cpp | 397 ++------------------- chimeraseqscommand.h | 32 +- getrabundcommand.cpp | 31 +- getrabundcommand.h | 2 +- pintail.cpp | 587 +++++++++++++++++++++++++++++++ pintail.h | 81 +++++ rabundvector.cpp | 14 + rabundvector.hpp | 1 + systemcommand.cpp | 2 +- 13 files changed, 1290 insertions(+), 404 deletions(-) create mode 100644 bellerophon.cpp create mode 100644 bellerophon.h create mode 100644 chimera.h create mode 100644 pintail.cpp create mode 100644 pintail.h diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index 7959c79..9427eec 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -38,6 +38,8 @@ 3746109D0F40657600460C57 /* unifracunweightedcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 3746109C0F40657600460C57 /* unifracunweightedcommand.cpp */; }; 3749271D0FD58C840031C06B /* getsabundcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 3749271C0FD58C840031C06B /* getsabundcommand.cpp */; }; 3749273F0FD5956B0031C06B /* getrabundcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 3749273E0FD5956B0031C06B /* getrabundcommand.cpp */; }; + 374F2FEB100634B600E97C66 /* bellerophon.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 374F2FEA100634B600E97C66 /* bellerophon.cpp */; }; + 374F301310063B6F00E97C66 /* pintail.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 374F301210063B6F00E97C66 /* pintail.cpp */; }; 37519A6B0F80E6EB00FED5E8 /* sharedanderbergs.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 37519A6A0F80E6EB00FED5E8 /* sharedanderbergs.cpp */; }; 37519AA10F810D0200FED5E8 /* venncommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 37519AA00F810D0200FED5E8 /* venncommand.cpp */; }; 37519AB50F810FAE00FED5E8 /* venn.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 37519AB40F810FAE00FED5E8 /* venn.cpp */; }; @@ -242,6 +244,11 @@ 3749271C0FD58C840031C06B /* getsabundcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getsabundcommand.cpp; sourceTree = SOURCE_ROOT; }; 3749273D0FD5956B0031C06B /* getrabundcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getrabundcommand.h; sourceTree = SOURCE_ROOT; }; 3749273E0FD5956B0031C06B /* getrabundcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getrabundcommand.cpp; sourceTree = SOURCE_ROOT; }; + 374F2FD51006320200E97C66 /* chimera.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimera.h; sourceTree = ""; }; + 374F2FE9100634B600E97C66 /* bellerophon.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = bellerophon.h; sourceTree = ""; }; + 374F2FEA100634B600E97C66 /* bellerophon.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = bellerophon.cpp; sourceTree = ""; }; + 374F301110063B6F00E97C66 /* pintail.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = pintail.h; sourceTree = ""; }; + 374F301210063B6F00E97C66 /* pintail.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = pintail.cpp; sourceTree = ""; }; 37519A690F80E6EB00FED5E8 /* sharedanderbergs.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sharedanderbergs.h; sourceTree = SOURCE_ROOT; }; 37519A6A0F80E6EB00FED5E8 /* sharedanderbergs.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sharedanderbergs.cpp; sourceTree = SOURCE_ROOT; }; 37519A9F0F810D0200FED5E8 /* venncommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = venncommand.h; sourceTree = SOURCE_ROOT; }; @@ -546,6 +553,7 @@ 373C68DC0FC1C38D00137ACD /* blastalign.hpp */, 373C68DB0FC1C38D00137ACD /* blastalign.cpp */, 37D928A60F2133C0001D4494 /* calculators */, + 374F2FE81006346600E97C66 /* chimera */, 37D927C20F21331F001D4494 /* cluster.hpp */, 37D927C10F21331F001D4494 /* cluster.cpp */, 37D928A90F2133E5001D4494 /* commands */, @@ -625,6 +633,18 @@ name = Products; sourceTree = ""; }; + 374F2FE81006346600E97C66 /* chimera */ = { + isa = PBXGroup; + children = ( + 374F2FD51006320200E97C66 /* chimera.h */, + 374F2FE9100634B600E97C66 /* bellerophon.h */, + 374F2FEA100634B600E97C66 /* bellerophon.cpp */, + 374F301110063B6F00E97C66 /* pintail.h */, + 374F301210063B6F00E97C66 /* pintail.cpp */, + ); + name = chimera; + sourceTree = ""; + }; 3796441D0FB9B9650081FDB6 /* read */ = { isa = PBXGroup; children = ( @@ -1135,6 +1155,8 @@ 37B73CA61004D89A008C4B41 /* getseqscommand.cpp in Sources */, 37B73CC01004EB38008C4B41 /* removeseqscommand.cpp in Sources */, 37B73CCF1004F5E0008C4B41 /* systemcommand.cpp in Sources */, + 374F2FEB100634B600E97C66 /* bellerophon.cpp in Sources */, + 374F301310063B6F00E97C66 /* pintail.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; diff --git a/bellerophon.cpp b/bellerophon.cpp new file mode 100644 index 0000000..f2dd821 --- /dev/null +++ b/bellerophon.cpp @@ -0,0 +1,409 @@ +/* + * bellerophon.cpp + * Mothur + * + * Created by Sarah Westcott on 7/9/09. + * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved. + * + */ + +#include "bellerophon.h" +#include "eachgapdist.h" +#include "ignoregaps.h" +#include "onegapdist.h" + + +//*************************************************************************************************************** + +Bellerophon::Bellerophon(string name) { + try { + fastafile = name; + } + catch(exception& e) { + errorOut(e, "Bellerophon", "Bellerophon"); + exit(1); + } +} + +//*************************************************************************************************************** +void Bellerophon::print(ostream& out) { + try { + int above1 = 0; + out << "Name\tScore\tLeft\tRight\t" << endl; + //output prefenence structure to .chimeras file + for (int i = 0; i < pref.size(); i++) { + out << pref[i].name << '\t' << pref[i].score[0] << '\t' << pref[i].leftParent[0] << '\t' << pref[i].rightParent[0] << endl; + + //calc # of seqs with preference above 1.0 + if (pref[i].score[0] > 1.0) { + above1++; + mothurOut(pref[i].name + " is a suspected chimera at breakpoint " + toString(pref[i].midpoint)); mothurOutEndLine(); + mothurOut("It's score is " + toString(pref[i].score[0]) + " with suspected left parent " + pref[i].leftParent[0] + " and right parent " + pref[i].rightParent[0]); mothurOutEndLine(); + } + } + + //output results to screen + mothurOutEndLine(); + mothurOut("Sequence with preference score above 1.0: " + toString(above1)); mothurOutEndLine(); + int spot; + spot = pref.size()-1; + mothurOut("Minimum:\t" + toString(pref[spot].score[0])); mothurOutEndLine(); + spot = pref.size() * 0.975; + mothurOut("2.5%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine(); + spot = pref.size() * 0.75; + mothurOut("25%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine(); + spot = pref.size() * 0.50; + mothurOut("Median: \t" + toString(pref[spot].score[0])); mothurOutEndLine(); + spot = pref.size() * 0.25; + mothurOut("75%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine(); + spot = pref.size() * 0.025; + mothurOut("97.5%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine(); + spot = 0; + mothurOut("Maximum:\t" + toString(pref[spot].score[0])); mothurOutEndLine(); + + } + catch(exception& e) { + errorOut(e, "Bellerophon", "print"); + exit(1); + } +} + +//******************************************************************************************************************** +//sorts highest score to lowest +inline bool comparePref(Preference left, Preference right){ + return (left.score[0] > right.score[0]); +} + +//*************************************************************************************************************** +void Bellerophon::getChimeras() { + try { + + //do soft filter + if (filter) { + string optionString = "fasta=" + fastafile + ", soft=50, vertical=F"; + filterSeqs = new FilterSeqsCommand(optionString); + filterSeqs->execute(); + delete filterSeqs; + + //reset fastafile to filtered file + fastafile = getRootName(fastafile) + "filter.fasta"; + } + + distCalculator = new eachGapDist(); + + //read in sequences + readSeqs(); + + int numSeqs = seqs.size(); + + if (numSeqs == 0) { mothurOut("Error in reading you sequences."); mothurOutEndLine(); exit(1); } + + //set default window to 25% of sequence length + string seq0 = seqs[0].getAligned(); + if (window == 0) { window = seq0.length() / 4; } + else if (window > (seq0.length() / 2)) { + mothurOut("Your sequence length is = " + toString(seq0.length()) + ". You have selected a window size greater than the length of half your aligned sequence. I will run it with a window size of " + toString((seq0.length() / 2))); mothurOutEndLine(); + window = (seq0.length() / 2); + } + + if (increment > (seqs[0].getAlignLength() - (2*window))) { + if (increment != 10) { + + mothurOut("You have selected a increment that is too large. I will use the default."); mothurOutEndLine(); + increment = 10; + if (increment > (seqs[0].getAlignLength() - (2*window))) { increment = 0; } + + }else{ increment = 0; } + } +cout << "increment = " << increment << endl; + if (increment == 0) { iters = 1; } + else { iters = ((seqs[0].getAlignLength() - (2*window)) / increment); } + + //initialize pref + pref.resize(numSeqs); + + for (int i = 0; i < numSeqs; i++ ) { + pref[i].leftParent.resize(2); pref[i].rightParent.resize(2); pref[i].score.resize(2); pref[i].closestLeft.resize(2); pref[i].closestRight.resize(3); + pref[i].name = seqs[i].getName(); + pref[i].score[0] = 0.0; pref[i].score[1] = 0.0; + pref[i].closestLeft[0] = 100000.0; pref[i].closestLeft[1] = 100000.0; + pref[i].closestRight[0] = 100000.0; pref[i].closestRight[1] = 100000.0; + } + + int midpoint = window; + int count = 0; + while (count < iters) { + + //create 2 vectors of sequences, 1 for left side and one for right side + vector left; vector right; + + for (int i = 0; i < seqs.size(); i++) { +//cout << "whole = " << seqs[i].getAligned() << endl; + //save left side + string seqLeft = seqs[i].getAligned().substr(midpoint-window, window); + Sequence tempLeft; + tempLeft.setName(seqs[i].getName()); + tempLeft.setAligned(seqLeft); + left.push_back(tempLeft); +//cout << "left = " << tempLeft.getAligned() << endl; + //save right side + string seqRight = seqs[i].getAligned().substr(midpoint, window); + Sequence tempRight; + tempRight.setName(seqs[i].getName()); + tempRight.setAligned(seqRight); + right.push_back(tempRight); +//cout << "right = " << seqRight << endl; + } + + //adjust midpoint by increment + midpoint += increment; + + + //this should be parallelized + //perference = sum of (| distance of my left to sequence j's left - distance of my right to sequence j's right | ) + //create a matrix containing the distance from left to left and right to right + //calculate distances + SparseMatrix* SparseLeft = new SparseMatrix(); + SparseMatrix* SparseRight = new SparseMatrix(); + + createSparseMatrix(0, left.size(), SparseLeft, left); + createSparseMatrix(0, right.size(), SparseRight, right); + + vector distMapRight; + vector distMapLeft; + + // Create a data structure to quickly access the distance information. + // It consists of a vector of distance maps, where each map contains + // all distances of a certain sequence. Vector and maps are accessed + // via the index of a sequence in the distance matrix + distMapRight = vector(numSeqs); + distMapLeft = vector(numSeqs); + //cout << "left" << endl << endl; + for (MatData currentCell = SparseLeft->begin(); currentCell != SparseLeft->end(); currentCell++) { + distMapLeft[currentCell->row][currentCell->column] = currentCell->dist; + //cout << " i = " << currentCell->row << " j = " << currentCell->column << " dist = " << currentCell->dist << endl; + } + //cout << "right" << endl << endl; + for (MatData currentCell = SparseRight->begin(); currentCell != SparseRight->end(); currentCell++) { + distMapRight[currentCell->row][currentCell->column] = currentCell->dist; + //cout << " i = " << currentCell->row << " j = " << currentCell->column << " dist = " << currentCell->dist << endl; + } + + delete SparseLeft; + delete SparseRight; + + + //fill preference structure + generatePreferences(distMapLeft, distMapRight, midpoint); + + count++; + + } + + delete distCalculator; + + //rank preference score to eachother + float dme = 0.0; + float expectedPercent = 1 / (float) (pref.size()); + + for (int i = 0; i < pref.size(); i++) { dme += pref[i].score[0]; } + + for (int i = 0; i < pref.size(); i++) { + + //gives the actual percentage of the dme this seq adds + pref[i].score[0] = pref[i].score[0] / dme; + + //how much higher or lower is this than expected + pref[i].score[0] = pref[i].score[0] / expectedPercent; + + } + + + //sort Preferences highest to lowest + sort(pref.begin(), pref.end(), comparePref); + + + + } + catch(exception& e) { + errorOut(e, "Bellerophon", "getChimeras"); + exit(1); + } +} + +//*************************************************************************************************************** +void Bellerophon::readSeqs(){ + try { + ifstream inFASTA; + openInputFile(fastafile, inFASTA); + + //read in seqs and store in vector + while(!inFASTA.eof()){ + Sequence current(inFASTA); + + if (current.getAligned() == "") { current.setAligned(current.getUnaligned()); } + + seqs.push_back(current); + + gobble(inFASTA); + } + inFASTA.close(); + + } + catch(exception& e) { + errorOut(e, "Bellerophon", "readSeqs"); + exit(1); + } +} + +/***************************************************************************************************************/ +int Bellerophon::createSparseMatrix(int startSeq, int endSeq, SparseMatrix* sparse, vector s){ + try { + + for(int i=startSeq; icalcDist(s[i], s[j]); + float dist = distCalculator->getDist(); + + PCell temp(i, j, dist); + sparse->addCell(temp); + + } + } + + + return 1; + } + catch(exception& e) { + errorOut(e, "Bellerophon", "createSparseMatrix"); + exit(1); + } +} +/***************************************************************************************************************/ +void Bellerophon::generatePreferences(vector left, vector right, int mid){ + try { + + float dme = 0.0; + SeqMap::iterator itR; + SeqMap::iterator itL; + + //initialize pref[i] + for (int i = 0; i < pref.size(); i++) { + pref[i].score[1] = 0.0; + pref[i].closestLeft[1] = 100000.0; + pref[i].closestRight[1] = 100000.0; + pref[i].leftParent[1] = ""; + pref[i].rightParent[1] = ""; + } + + for (int i = 0; i < left.size(); i++) { + + SeqMap currentLeft = left[i]; //example i = 3; currentLeft is a map of 0 to the distance of sequence 3 to sequence 0, + // 1 to the distance of sequence 3 to sequence 1, + // 2 to the distance of sequence 3 to sequence 2. + SeqMap currentRight = right[i]; // same as left but with distances on the right side. + + for (int j = 0; j < i; j++) { + + itL = currentLeft.find(j); + itR = currentRight.find(j); +//cout << " i = " << i << " j = " << j << " distLeft = " << itL->second << endl; +//cout << " i = " << i << " j = " << j << " distright = " << itR->second << endl; + + //if you can find this entry update the preferences + if ((itL != currentLeft.end()) && (itR != currentRight.end())) { + + if (!correction) { + pref[i].score[1] += abs((itL->second - itR->second)); + pref[j].score[1] += abs((itL->second - itR->second)); +//cout << "left " << i << " " << j << " = " << itL->second << " right " << i << " " << j << " = " << itR->second << endl; +//cout << "abs = " << abs((itL->second - itR->second)) << endl; +//cout << i << " score = " << pref[i].score[1] << endl; +//cout << j << " score = " << pref[j].score[1] << endl; + }else { + pref[i].score[1] += abs((sqrt(itL->second) - sqrt(itR->second))); + pref[j].score[1] += abs((sqrt(itL->second) - sqrt(itR->second))); +//cout << "left " << i << " " << j << " = " << itL->second << " right " << i << " " << j << " = " << itR->second << endl; +//cout << "abs = " << abs((sqrt(itL->second) - sqrt(itR->second))) << endl; +//cout << i << " score = " << pref[i].score[1] << endl; +//cout << j << " score = " << pref[j].score[1] << endl; + } +//cout << "pref[" << i << "].closestLeft[1] = " << pref[i].closestLeft[1] << " parent = " << pref[i].leftParent[1] << endl; + //are you the closest left sequence + if (itL->second < pref[i].closestLeft[1]) { + + pref[i].closestLeft[1] = itL->second; + pref[i].leftParent[1] = seqs[j].getName(); +//cout << "updating closest left to " << pref[i].leftParent[1] << endl; + } +//cout << "pref[" << j << "].closestLeft[1] = " << pref[j].closestLeft[1] << " parent = " << pref[j].leftParent[1] << endl; + if (itL->second < pref[j].closestLeft[1]) { + pref[j].closestLeft[1] = itL->second; + pref[j].leftParent[1] = seqs[i].getName(); +//cout << "updating closest left to " << pref[j].leftParent[1] << endl; + } + + //are you the closest right sequence + if (itR->second < pref[i].closestRight[1]) { + pref[i].closestRight[1] = itR->second; + pref[i].rightParent[1] = seqs[j].getName(); + } + if (itR->second < pref[j].closestRight[1]) { + pref[j].closestRight[1] = itR->second; + pref[j].rightParent[1] = seqs[i].getName(); + } + + } + } + + } + + + + //calculate the dme + + int count0 = 0; + for (int i = 0; i < pref.size(); i++) { dme += pref[i].score[1]; if (pref[i].score[1] == 0.0) { count0++; } } + + float expectedPercent = 1 / (float) (pref.size() - count0); +//cout << endl << "dme = " << dme << endl; + //recalculate prefernences based on dme + for (int i = 0; i < pref.size(); i++) { +//cout << "unadjusted pref " << i << " = " << pref[i].score[1] << endl; + // gives the actual percentage of the dme this seq adds + pref[i].score[1] = pref[i].score[1] / dme; + + //how much higher or lower is this than expected + pref[i].score[1] = pref[i].score[1] / expectedPercent; + + //pref[i].score[1] = dme / (dme - 2 * pref[i].score[1]); + + //so a non chimeric sequence would be around 1, and a chimeric would be signifigantly higher. +//cout << "adjusted pref " << i << " = " << pref[i].score[1] << endl; + } + + //is this score bigger then the last score + for (int i = 0; i < pref.size(); i++) { + + //update biggest score + if (pref[i].score[1] > pref[i].score[0]) { + pref[i].score[0] = pref[i].score[1]; + pref[i].leftParent[0] = pref[i].leftParent[1]; + pref[i].rightParent[0] = pref[i].rightParent[1]; + pref[i].closestLeft[0] = pref[i].closestLeft[1]; + pref[i].closestRight[0] = pref[i].closestRight[1]; + pref[i].midpoint = mid; + } + + } + + } + catch(exception& e) { + errorOut(e, "Bellerophon", "generatePreferences"); + exit(1); + } +} +/**************************************************************************************************/ + diff --git a/bellerophon.h b/bellerophon.h new file mode 100644 index 0000000..3f28e5b --- /dev/null +++ b/bellerophon.h @@ -0,0 +1,63 @@ +#ifndef BELLEROPHON_H +#define BELLEROPHON_H + +/* + * bellerophon.h + * Mothur + * + * Created by Sarah Westcott on 7/9/09. + * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved. + * + */ + + +#include "chimera.h" +#include "filterseqscommand.h" +#include "sequence.hpp" +#include "dist.h" + +struct Preference { + string name; + vector leftParent; //keep the name of closest left associated with the two scores + vector rightParent; //keep the name of closest right associated with the two scores + vector score; //so you can keep last score and calc this score and keep whichever is bigger. + vector closestLeft; //keep the closest left associated with the two scores + vector closestRight; //keep the closest right associated with the two scores + int midpoint; + +}; + + +/***********************************************************/ + +class Bellerophon : public Chimera { + + public: + Bellerophon(string); + ~Bellerophon() {}; + + void getChimeras(); + void print(ostream&); + + + private: + Dist* distCalculator; + FilterSeqsCommand* filterSeqs; + vector seqs; + vector pref; + string fastafile; + int iters; + + void readSeqs(); + void generatePreferences(vector, vector, int); + int createSparseMatrix(int, int, SparseMatrix*, vector); + + + + +}; + +/***********************************************************/ + +#endif + diff --git a/chimera.h b/chimera.h new file mode 100644 index 0000000..07ad92f --- /dev/null +++ b/chimera.h @@ -0,0 +1,53 @@ +#ifndef CHIMERA_H +#define CHIMERA_H + +/* + * chimera.h + * Mothur + * + * Created by Sarah Westcott on 7/9/09. + * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved. + * + */ + + +#include "mothur.h" +#include "sparsematrix.hpp" + +typedef list::iterator MatData; +typedef map SeqMap; //maps sequence to all distance for that seqeunce + + +/***********************************************************************/ + +class Chimera { + + public: + + Chimera(){}; + Chimera(string); + virtual ~Chimera(){}; + virtual void setFilter(bool f) { filter = f; } + virtual void setCorrection(bool c) { correction = c; } + virtual void setProcessors(int p) { processors = p; } + virtual void setWindow(int w) { window = w; } + virtual void setIncrement(int i) { increment = i; } + virtual void setTemplate(string t) { templateFile = t; } + + //pure functions + virtual void getChimeras() = 0; + virtual void print(ostream&) = 0; + + protected: + + bool filter, correction; + int processors, window, increment; + string templateFile; + + +}; + +/***********************************************************************/ + +#endif + diff --git a/chimeraseqscommand.cpp b/chimeraseqscommand.cpp index a334ea3..105fd11 100644 --- a/chimeraseqscommand.cpp +++ b/chimeraseqscommand.cpp @@ -8,9 +8,8 @@ */ #include "chimeraseqscommand.h" -#include "eachgapdist.h" -#include "ignoregaps.h" -#include "onegapdist.h" +#include "bellerophon.h" +#include "pintail.h" //*************************************************************************************************************** @@ -23,7 +22,7 @@ ChimeraSeqsCommand::ChimeraSeqsCommand(string option){ else { //valid paramters for this command - string Array[] = {"fasta", "filter", "correction", "processors", "method", "window", "increment" }; + string Array[] = {"fasta", "filter", "correction", "processors", "method", "window", "increment", "template" }; vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); OptionParser parser(option); @@ -41,6 +40,11 @@ ChimeraSeqsCommand::ChimeraSeqsCommand(string option){ if (fastafile == "not open") { abort = true; } else if (fastafile == "not found") { fastafile = ""; mothurOut("fasta is a required parameter for the chimera.seqs command."); mothurOutEndLine(); abort = true; } + templatefile = validParameter.validFile(parameters, "template", true); + if (templatefile == "not open") { abort = true; } + else if (templatefile == "not found") { templatefile = ""; } + + string temp; temp = validParameter.validFile(parameters, "filter", false); if (temp == "not found") { temp = "T"; } filter = isTrue(temp); @@ -57,9 +61,10 @@ ChimeraSeqsCommand::ChimeraSeqsCommand(string option){ temp = validParameter.validFile(parameters, "increment", false); if (temp == "not found") { temp = "10"; } convert(temp, increment); - method = validParameter.validFile(parameters, "method", false); if (method == "not found") { method = "bellerophon"; } + method = validParameter.validFile(parameters, "method", false); if (method == "not found") { method = "pintail"; } + + if ((method == "pintail") && (templatefile == "")) { mothurOut("You must provide a template file with the pintail method."); mothurOutEndLine(); abort = true; } - if (method != "bellerophon") { mothurOut(method + " is not a valid method."); mothurOutEndLine(); abort = true; } } } @@ -75,9 +80,9 @@ void ChimeraSeqsCommand::help(){ mothurOut("The chimera.seqs command reads a fastafile and creates a sorted priority score list of potentially chimeric sequences (ideally, the sequences should already be aligned).\n"); mothurOut("The chimera.seqs command parameters are fasta, filter, correction, processors and method. fasta is required.\n"); mothurOut("The filter parameter allows you to specify if you would like to apply a 50% soft filter. The default is false. \n"); - mothurOut("The correction parameter allows you to ..... The default is true. \n"); + mothurOut("The correction parameter allows you to put more emphasis on the distance between highly similar sequences and less emphasis on the differences between remote homologs. The default is true. \n"); mothurOut("The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n"); - mothurOut("The method parameter allows you to specify the method for finding chimeric sequences. The default is bellerophon. \n"); + mothurOut("The method parameter allows you to specify the method for finding chimeric sequences. The default is pintail. \n"); mothurOut("The chimera.seqs command should be in the following format: \n"); mothurOut("chimera.seqs(fasta=yourFastaFile, filter=yourFilter, correction=yourCorrection, processors=yourProcessors, method=bellerophon) \n"); mothurOut("Example: chimera.seqs(fasta=AD.align, filter=True, correction=true, processors=2, method=yourMethod) \n"); @@ -88,11 +93,6 @@ void ChimeraSeqsCommand::help(){ exit(1); } } -//******************************************************************************************************************** -//sorts highest score to lowest -inline bool comparePref(Preference left, Preference right){ - return (left.score[0] > right.score[0]); -} //*************************************************************************************************************** @@ -105,373 +105,40 @@ int ChimeraSeqsCommand::execute(){ if (abort == true) { return 0; } + if (method == "bellerophon") { chimera = new Bellerophon(fastafile); } + else if (method == "pintail") { chimera = new Pintail(fastafile); } + else { mothurOut("Not a valid method."); mothurOutEndLine(); return 0; } - //do soft filter - if (filter) { - string optionString = "fasta=" + fastafile + ", soft=50, vertical=F"; - filterSeqs = new FilterSeqsCommand(optionString); - filterSeqs->execute(); - delete filterSeqs; - - //reset fastafile to filtered file - fastafile = getRootName(fastafile) + "filter.fasta"; - } - - distCalculator = new eachGapDist(); - - //read in sequences - readSeqs(); + //set user options + chimera->setFilter(filter); + chimera->setCorrection(correction); + chimera->setProcessors(processors); + chimera->setWindow(window); + chimera->setIncrement(increment); + chimera->setTemplate(templatefile); - int numSeqs = seqs.size(); - - if (numSeqs == 0) { mothurOut("Error in reading you sequences."); mothurOutEndLine(); return 0; } - - //set default window to 25% of sequence length - string seq0 = seqs[0].getAligned(); - if (window == 0) { window = seq0.length() / 4; } - else if (window > (seq0.length() / 2)) { - mothurOut("Your sequence length is = " + toString(seq0.length()) + ". You have selected a window size greater than the length of half your aligned sequence. I will run it with a window size of " + toString((seq0.length() / 2))); mothurOutEndLine(); - window = (seq0.length() / 2); - } - - if (increment > (seqs[0].getAlignLength() - (2*window))) { - if (increment != 10) { - - mothurOut("You have selected a increment that is too large. I will use the default."); mothurOutEndLine(); - increment = 10; - if (increment > (seqs[0].getAlignLength() - (2*window))) { increment = 0; } - - }else{ increment = 0; } - } -cout << "increment = " << increment << endl; - if (increment == 0) { iters = 1; } - else { iters = ((seqs[0].getAlignLength() - (2*window)) / increment); } + //find chimeras + chimera->getChimeras(); - //initialize pref - pref.resize(numSeqs); - - for (int i = 0; i < numSeqs; i++ ) { - pref[i].leftParent.resize(2); pref[i].rightParent.resize(2); pref[i].score.resize(2); pref[i].closestLeft.resize(2); pref[i].closestRight.resize(3); - pref[i].name = seqs[i].getName(); - pref[i].score[0] = 0.0; pref[i].score[1] = 0.0; - pref[i].closestLeft[0] = 100000.0; pref[i].closestLeft[1] = 100000.0; - pref[i].closestRight[0] = 100000.0; pref[i].closestRight[1] = 100000.0; - } - - int midpoint = window; - int count = 0; - while (count < iters) { - - //create 2 vectors of sequences, 1 for left side and one for right side - vector left; vector right; - - for (int i = 0; i < seqs.size(); i++) { -//cout << "whole = " << seqs[i].getAligned() << endl; - //save left side - string seqLeft = seqs[i].getAligned().substr(midpoint-window, window); - Sequence tempLeft; - tempLeft.setName(seqs[i].getName()); - tempLeft.setAligned(seqLeft); - left.push_back(tempLeft); -//cout << "left = " << tempLeft.getAligned() << endl; - //save right side - string seqRight = seqs[i].getAligned().substr(midpoint, window); - Sequence tempRight; - tempRight.setName(seqs[i].getName()); - tempRight.setAligned(seqRight); - right.push_back(tempRight); -//cout << "right = " << seqRight << endl; - } - - //adjust midpoint by increment - midpoint += increment; - - - //this should be parallelized - //perference = sum of (| distance of my left to sequence j's left - distance of my right to sequence j's right | ) - //create a matrix containing the distance from left to left and right to right - //calculate distances - SparseMatrix* SparseLeft = new SparseMatrix(); - SparseMatrix* SparseRight = new SparseMatrix(); - - createSparseMatrix(0, left.size(), SparseLeft, left); - createSparseMatrix(0, right.size(), SparseRight, right); - - vector distMapRight; - vector distMapLeft; - - // Create a data structure to quickly access the distance information. - // It consists of a vector of distance maps, where each map contains - // all distances of a certain sequence. Vector and maps are accessed - // via the index of a sequence in the distance matrix - distMapRight = vector(numSeqs); - distMapLeft = vector(numSeqs); - //cout << "left" << endl << endl; - for (MatData currentCell = SparseLeft->begin(); currentCell != SparseLeft->end(); currentCell++) { - distMapLeft[currentCell->row][currentCell->column] = currentCell->dist; - //cout << " i = " << currentCell->row << " j = " << currentCell->column << " dist = " << currentCell->dist << endl; - } - //cout << "right" << endl << endl; - for (MatData currentCell = SparseRight->begin(); currentCell != SparseRight->end(); currentCell++) { - distMapRight[currentCell->row][currentCell->column] = currentCell->dist; - //cout << " i = " << currentCell->row << " j = " << currentCell->column << " dist = " << currentCell->dist << endl; - } - - delete SparseLeft; - delete SparseRight; - - - //fill preference structure - generatePreferences(distMapLeft, distMapRight, midpoint); - - count++; - - } - - delete distCalculator; - - //rank preference score to eachother - float dme = 0.0; - float expectedPercent = 1 / (float) (pref.size()); - - for (int i = 0; i < pref.size(); i++) { dme += pref[i].score[0]; } - - for (int i = 0; i < pref.size(); i++) { - - //gives the actual percentage of the dme this seq adds - pref[i].score[0] = pref[i].score[0] / dme; - - //how much higher or lower is this than expected - pref[i].score[0] = pref[i].score[0] / expectedPercent; - - } - - - //sort Preferences highest to lowest - sort(pref.begin(), pref.end(), comparePref); - - string outputFileName = getRootName(fastafile) + "chimeras"; + string outputFileName = getRootName(fastafile) + method + ".chimeras"; ofstream out; openOutputFile(outputFileName, out); - int above1 = 0; - out << "Name\tScore\tLeft\tRight\t" << endl; - //output prefenence structure to .chimeras file - for (int i = 0; i < pref.size(); i++) { - out << pref[i].name << '\t' << pref[i].score[0] << '\t' << pref[i].leftParent[0] << '\t' << pref[i].rightParent[0] << endl; - - //calc # of seqs with preference above 1.0 - if (pref[i].score[0] > 1.0) { - above1++; - mothurOut(pref[i].name + " is a suspected chimera at breakpoint " + toString(pref[i].midpoint)); mothurOutEndLine(); - mothurOut("It's score is " + toString(pref[i].score[0]) + " with suspected left parent " + pref[i].leftParent[0] + " and right parent " + pref[i].rightParent[0]); mothurOutEndLine(); - } - - - } + //print results + chimera->print(out); - //output results to screen - mothurOutEndLine(); - mothurOut("Sequence with preference score above 1.0: " + toString(above1)); mothurOutEndLine(); - int spot; - spot = pref.size()-1; - mothurOut("Minimum:\t" + toString(pref[spot].score[0])); mothurOutEndLine(); - spot = pref.size() * 0.975; - mothurOut("2.5%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine(); - spot = pref.size() * 0.75; - mothurOut("25%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine(); - spot = pref.size() * 0.50; - mothurOut("Median: \t" + toString(pref[spot].score[0])); mothurOutEndLine(); - spot = pref.size() * 0.25; - mothurOut("75%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine(); - spot = pref.size() * 0.025; - mothurOut("97.5%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine(); - spot = 0; - mothurOut("Maximum:\t" + toString(pref[spot].score[0])); mothurOutEndLine(); + out.close(); - return 0; - } - catch(exception& e) { - errorOut(e, "ChimeraSeqsCommand", "execute"); - exit(1); - } -} - -//*************************************************************************************************************** -void ChimeraSeqsCommand::readSeqs(){ - try { - ifstream inFASTA; - openInputFile(fastafile, inFASTA); + delete chimera; - //read in seqs and store in vector - while(!inFASTA.eof()){ - Sequence current(inFASTA); - - if (current.getAligned() == "") { current.setAligned(current.getUnaligned()); } - - seqs.push_back(current); - - gobble(inFASTA); - } - inFASTA.close(); - - } - catch(exception& e) { - errorOut(e, "ChimeraSeqsCommand", "readSeqs"); - exit(1); - } -} - -/***************************************************************************************************************/ -int ChimeraSeqsCommand::createSparseMatrix(int startSeq, int endSeq, SparseMatrix* sparse, vector s){ - try { - - for(int i=startSeq; icalcDist(s[i], s[j]); - float dist = distCalculator->getDist(); - - PCell temp(i, j, dist); - sparse->addCell(temp); - - } - } - - - return 1; - } - catch(exception& e) { - errorOut(e, "ChimeraSeqsCommand", "createSparseMatrix"); - exit(1); - } -} -/***************************************************************************************************************/ -void ChimeraSeqsCommand::generatePreferences(vector left, vector right, int mid){ - try { - - float dme = 0.0; - SeqMap::iterator itR; - SeqMap::iterator itL; - - //initialize pref[i] - for (int i = 0; i < pref.size(); i++) { - pref[i].score[1] = 0.0; - pref[i].closestLeft[1] = 100000.0; - pref[i].closestRight[1] = 100000.0; - pref[i].leftParent[1] = ""; - pref[i].rightParent[1] = ""; - } - - for (int i = 0; i < left.size(); i++) { - - SeqMap currentLeft = left[i]; //example i = 3; currentLeft is a map of 0 to the distance of sequence 3 to sequence 0, - // 1 to the distance of sequence 3 to sequence 1, - // 2 to the distance of sequence 3 to sequence 2. - SeqMap currentRight = right[i]; // same as left but with distances on the right side. - - for (int j = 0; j < i; j++) { - - itL = currentLeft.find(j); - itR = currentRight.find(j); -//cout << " i = " << i << " j = " << j << " distLeft = " << itL->second << endl; -//cout << " i = " << i << " j = " << j << " distright = " << itR->second << endl; - - //if you can find this entry update the preferences - if ((itL != currentLeft.end()) && (itR != currentRight.end())) { - - if (!correction) { - pref[i].score[1] += abs((itL->second - itR->second)); - pref[j].score[1] += abs((itL->second - itR->second)); -//cout << "left " << i << " " << j << " = " << itL->second << " right " << i << " " << j << " = " << itR->second << endl; -//cout << "abs = " << abs((itL->second - itR->second)) << endl; -//cout << i << " score = " << pref[i].score[1] << endl; -//cout << j << " score = " << pref[j].score[1] << endl; - }else { - pref[i].score[1] += abs((sqrt(itL->second) - sqrt(itR->second))); - pref[j].score[1] += abs((sqrt(itL->second) - sqrt(itR->second))); -//cout << "left " << i << " " << j << " = " << itL->second << " right " << i << " " << j << " = " << itR->second << endl; -//cout << "abs = " << abs((sqrt(itL->second) - sqrt(itR->second))) << endl; -//cout << i << " score = " << pref[i].score[1] << endl; -//cout << j << " score = " << pref[j].score[1] << endl; - } -//cout << "pref[" << i << "].closestLeft[1] = " << pref[i].closestLeft[1] << " parent = " << pref[i].leftParent[1] << endl; - //are you the closest left sequence - if (itL->second < pref[i].closestLeft[1]) { - - pref[i].closestLeft[1] = itL->second; - pref[i].leftParent[1] = seqs[j].getName(); -//cout << "updating closest left to " << pref[i].leftParent[1] << endl; - } -//cout << "pref[" << j << "].closestLeft[1] = " << pref[j].closestLeft[1] << " parent = " << pref[j].leftParent[1] << endl; - if (itL->second < pref[j].closestLeft[1]) { - pref[j].closestLeft[1] = itL->second; - pref[j].leftParent[1] = seqs[i].getName(); -//cout << "updating closest left to " << pref[j].leftParent[1] << endl; - } - - //are you the closest right sequence - if (itR->second < pref[i].closestRight[1]) { - pref[i].closestRight[1] = itR->second; - pref[i].rightParent[1] = seqs[j].getName(); - } - if (itR->second < pref[j].closestRight[1]) { - pref[j].closestRight[1] = itR->second; - pref[j].rightParent[1] = seqs[i].getName(); - } - - } - } - - } - - - - //calculate the dme - - int count0 = 0; - for (int i = 0; i < pref.size(); i++) { dme += pref[i].score[1]; if (pref[i].score[1] == 0.0) { count0++; } } - - float expectedPercent = 1 / (float) (pref.size() - count0); -//cout << endl << "dme = " << dme << endl; - //recalculate prefernences based on dme - for (int i = 0; i < pref.size(); i++) { -//cout << "unadjusted pref " << i << " = " << pref[i].score[1] << endl; - // gives the actual percentage of the dme this seq adds - pref[i].score[1] = pref[i].score[1] / dme; - - //how much higher or lower is this than expected - pref[i].score[1] = pref[i].score[1] / expectedPercent; - - //pref[i].score[1] = dme / (dme - 2 * pref[i].score[1]); - - //so a non chimeric sequence would be around 1, and a chimeric would be signifigantly higher. -//cout << "adjusted pref " << i << " = " << pref[i].score[1] << endl; - } + return 0; - //is this score bigger then the last score - for (int i = 0; i < pref.size(); i++) { - - //update biggest score - if (pref[i].score[1] > pref[i].score[0]) { - pref[i].score[0] = pref[i].score[1]; - pref[i].leftParent[0] = pref[i].leftParent[1]; - pref[i].rightParent[0] = pref[i].rightParent[1]; - pref[i].closestLeft[0] = pref[i].closestLeft[1]; - pref[i].closestRight[0] = pref[i].closestRight[1]; - pref[i].midpoint = mid; - } - - } - } catch(exception& e) { - errorOut(e, "ChimeraSeqsCommand", "generatePreferences"); + errorOut(e, "ChimeraSeqsCommand", "execute"); exit(1); } } -/**************************************************************************************************/ /**************************************************************************************************/ diff --git a/chimeraseqscommand.h b/chimeraseqscommand.h index 2398ce6..dc7df8f 100644 --- a/chimeraseqscommand.h +++ b/chimeraseqscommand.h @@ -12,25 +12,7 @@ #include "mothur.h" #include "command.hpp" -#include "filterseqscommand.h" -#include "sequence.hpp" -#include "sparsematrix.hpp" -#include "dist.h" - -typedef list::iterator MatData; -typedef map SeqMap; //maps sequence to all distance for that seqeunce - -struct Preference { - string name; - vector leftParent; //keep the name of closest left associated with the two scores - vector rightParent; //keep the name of closest right associated with the two scores - vector score; //so you can keep last score and calc this score and keep whichever is bigger. - vector closestLeft; //keep the closest left associated with the two scores - vector closestRight; //keep the closest right associated with the two scores - int midpoint; - -}; - +#include "chimera.h" /***********************************************************/ @@ -45,21 +27,13 @@ public: private: - Dist* distCalculator; bool abort; - string method, fastafile; + string method, fastafile, templatefile; bool filter, correction; int processors, midpoint, averageLeft, averageRight, window, iters, increment; - FilterSeqsCommand* filterSeqs; - ListVector* list; - vector seqs; - vector pref; + Chimera* chimera; - void readSeqs(); - void generatePreferences(vector, vector, int); - int createSparseMatrix(int, int, SparseMatrix*, vector); - }; /***********************************************************/ diff --git a/getrabundcommand.cpp b/getrabundcommand.cpp index 73862a8..b0bf819 100644 --- a/getrabundcommand.cpp +++ b/getrabundcommand.cpp @@ -24,7 +24,7 @@ GetRAbundCommand::GetRAbundCommand(string option){ else { //valid paramters for this command - string Array[] = {"line","label"}; + string Array[] = {"line","label","sorted"}; vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); OptionParser parser(option); @@ -42,6 +42,11 @@ GetRAbundCommand::GetRAbundCommand(string option){ //check for optional parameter and set defaults // ...at some point should added some additional type checking... + + string temp; + temp = validParameter.validFile(parameters, "sorted", false); if (temp == "not found") { temp = "T"; } + sorted = isTrue(temp); + line = validParameter.validFile(parameters, "line", false); if (line == "not found") { line = ""; } else { @@ -82,10 +87,11 @@ GetRAbundCommand::GetRAbundCommand(string option){ void GetRAbundCommand::help(){ try { mothurOut("The get.rabund command can only be executed after a successful read.otu of a listfile.\n"); - mothurOut("The get.rabund command parameters are line and label. No parameters are required, and you may not use line and label at the same time.\n"); + mothurOut("The get.rabund command parameters are line, label and sorted. No parameters are required, and you may not use line and label at the same time.\n"); mothurOut("The line and label allow you to select what distance levels you would like included in your .rabund file, and are separated by dashes.\n"); - mothurOut("The get.rabund command should be in the following format: get.rabund(line=yourLines, label=yourLabels).\n"); - mothurOut("Example get.rabund(line=1-3-5).\n"); + mothurOut("The sorted parameters allows you to print the rabund results sorted by abundance or not. The default is sorted.\n"); + mothurOut("The get.rabund command should be in the following format: get.rabund(line=yourLines, label=yourLabels, sorted=yourSorted).\n"); + mothurOut("Example get.rabund(line=1-3-5, sorted=F).\n"); mothurOut("The default value for line and label are all lines in your inputfile.\n"); mothurOut("The get.rabund command outputs a .rabund file containing the lines you selected.\n"); mothurOut("Note: No spaces between parameter labels (i.e. line), '=' and parameters (i.e.yourLines).\n\n"); @@ -129,9 +135,12 @@ int GetRAbundCommand::execute(){ if(allLines == 1 || lines.count(count) == 1 || labels.count(list->getLabel()) == 1){ mothurOut(list->getLabel() + "\t" + toString(count)); mothurOutEndLine(); - rabund = new RAbundVector(); + rabund = new RAbundVector(); *rabund = (list->getRAbundVector()); - rabund->print(out); + + if(sorted) { rabund->print(out); } + else { rabund->nonSortedPrint(out); } + delete rabund; processedLabels.insert(list->getLabel()); @@ -146,7 +155,10 @@ int GetRAbundCommand::execute(){ mothurOut(list->getLabel() + "\t" + toString(count)); mothurOutEndLine(); rabund = new RAbundVector(); *rabund = (list->getRAbundVector()); - rabund->print(out); + + if(sorted) { rabund->print(out); } + else { rabund->nonSortedPrint(out); } + delete rabund; processedLabels.insert(list->getLabel()); @@ -181,7 +193,10 @@ int GetRAbundCommand::execute(){ mothurOut(list->getLabel() + "\t" + toString(count)); mothurOutEndLine(); rabund = new RAbundVector(); *rabund = (list->getRAbundVector()); - rabund->print(out); + + if(sorted) { rabund->print(out); } + else { rabund->nonSortedPrint(out); } + delete rabund; delete list; } diff --git a/getrabundcommand.h b/getrabundcommand.h index 38b5ec7..92e8475 100644 --- a/getrabundcommand.h +++ b/getrabundcommand.h @@ -34,7 +34,7 @@ private: ListVector* list; RAbundVector* rabund; - bool abort, allLines; + bool abort, allLines, sorted; set lines; //hold lines to be used set labels; //holds labels to be used string line, label; diff --git a/pintail.cpp b/pintail.cpp new file mode 100644 index 0000000..6ccccde --- /dev/null +++ b/pintail.cpp @@ -0,0 +1,587 @@ +/* + * pintail.cpp + * Mothur + * + * Created by Sarah Westcott on 7/9/09. + * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved. + * + */ + +#include "pintail.h" +#include "ignoregaps.h" + +//*************************************************************************************************************** + +Pintail::Pintail(string name) { + try { + fastafile = name; + } + catch(exception& e) { + errorOut(e, "Pintail", "Pintail"); + exit(1); + } +} +//*************************************************************************************************************** + +Pintail::~Pintail() { + try { + for (int i = 0; i < querySeqs.size(); i++) { delete querySeqs[i]; } + for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; } + } + catch(exception& e) { + errorOut(e, "Pintail", "~Pintail"); + exit(1); + } +} +//*************************************************************************************************************** +void Pintail::print(ostream& out) { + try { + + for (itCoef = DE.begin(); itCoef != DE.end(); itCoef++) { + + out << itCoef->first->getName() << '\t' << itCoef->second << endl; + out << "Observed\t"; + + itObsDist = obsDistance.find(itCoef->first); + for (int i = 0; i < itObsDist->second.size(); i++) { out << itObsDist->second[i] << '\t'; } + out << endl; + + out << "Expected\t"; + + itExpDist = expectedDistance.find(itCoef->first); + for (int i = 0; i < itExpDist->second.size(); i++) { out << itExpDist->second[i] << '\t'; } + out << endl; + + } + + } + catch(exception& e) { + errorOut(e, "Pintail", "print"); + exit(1); + } +} + +//*************************************************************************************************************** +void Pintail::getChimeras() { + try { + + distCalculator = new ignoreGaps(); + + //read in query sequences and subject sequences + mothurOut("Reading sequences and template file... "); cout.flush(); + querySeqs = readSeqs(fastafile); + templateSeqs = readSeqs(templateFile); + mothurOut("Done."); mothurOutEndLine(); + + int numSeqs = querySeqs.size(); + + //if window is set to default + if (window == 0) { if (querySeqs[0]->getAligned().length() > 800) { setWindow(200); } + else{ setWindow((querySeqs[0]->getAligned().length() / 4)); } } + + //calculate number of iters + iters = (querySeqs[0]->getAligned().length() - window + 1) / increment; + + int linesPerProcess = processors / numSeqs; + + //find breakup of sequences for all times we will Parallelize + if (processors == 1) { lines.push_back(new linePair(0, numSeqs)); } + else { + //fill line pairs + for (int i = 0; i < (processors-1); i++) { + lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess))); + } + //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end + int i = processors - 1; + lines.push_back(new linePair((i*linesPerProcess), numSeqs)); + } + + //map query sequences to their most similiar sequences in the template - Parallelized + mothurOut("Finding closest sequence in template to each sequence... "); cout.flush(); + if (processors == 1) { findPairs(lines[0]->start, lines[0]->end); } + else { createProcessesPairs(); } + mothurOut("Done."); mothurOutEndLine(); + // itBest = bestfit.begin(); + // itBest++; itBest++; +//cout << itBest->first->getName() << '\t' << itBest->second->getName() << endl; + //find Oqs for each sequence - the observed distance in each window - Parallelized + mothurOut("Calculating observed percentage differences for each sequence... "); cout.flush(); + if (processors == 1) { calcObserved(lines[0]->start, lines[0]->end); } + else { createProcessesObserved(); } + mothurOut("Done."); mothurOutEndLine(); + + //find P + mothurOut("Calculating expected percentage differences for each sequence... "); cout.flush(); + vector probabilityProfile = calcFreq(templateSeqs); + + //make P into Q + for (int i = 0; i < probabilityProfile.size(); i++) { probabilityProfile[i] = 1 - probabilityProfile[i]; } +//cout << "after p into Q" << endl; + //find Qav + averageProbability = findQav(probabilityProfile); +//cout << "after Qav" << endl; + //find Coefficient - maps a sequence to its coefficient + seqCoef = getCoef(averageProbability); +//cout << "after coef" << endl; + //find Eqs for each sequence - the expected distance in each window - Parallelized + if (processors == 1) { calcExpected(lines[0]->start, lines[0]->end); } + else { createProcessesExpected(); } + mothurOut("Done."); mothurOutEndLine(); + + //find deviation - Parallelized + mothurOut("Finding deviation from expected... "); cout.flush(); + if (processors == 1) { calcDE(lines[0]->start, lines[0]->end); } + else { createProcessesDE(); } + mothurOut("Done."); mothurOutEndLine(); + + + //free memory + for (int i = 0; i < lines.size(); i++) { delete lines[i]; } + delete distCalculator; + + } + catch(exception& e) { + errorOut(e, "Pintail", "getChimeras"); + exit(1); + } +} + +//*************************************************************************************************************** + +vector Pintail::readSeqs(string file) { + try { + + ifstream in; + openInputFile(file, in); + vector container; + + //read in seqs and store in vector + while(!in.eof()){ + Sequence* current = new Sequence(in); + + if (current->getAligned() == "") { current->setAligned(current->getUnaligned()); } + + container.push_back(current); + + gobble(in); + } + + in.close(); + return container; + + } + catch(exception& e) { + errorOut(e, "Pintail", "readSeqs"); + exit(1); + } +} + +//*************************************************************************************************************** +//calculate the distances from each query sequence to all sequences in the template to find the closest sequence +void Pintail::findPairs(int start, int end) { + try { + + for(int i = start; i < end; i++){ + + float smallest = 10000.0; + Sequence query = *(querySeqs[i]); + + for(int j = 0; j < templateSeqs.size(); j++){ + + Sequence temp = *(templateSeqs[j]); + + distCalculator->calcDist(query, temp); + float dist = distCalculator->getDist(); + + if (dist < smallest) { + + bestfit[querySeqs[i]] = templateSeqs[j]; + smallest = dist; + } + } + } + + } + catch(exception& e) { + errorOut(e, "Pintail", "findPairs"); + exit(1); + } +} +//*************************************************************************************************************** +void Pintail::calcObserved(int start, int end) { + try { + + for(int i = start; i < end; i++){ + + itBest = bestfit.find(querySeqs[i]); + Sequence* query; + Sequence* subject; + + if (itBest != bestfit.end()) { + query = itBest->first; + subject = itBest->second; + }else{ mothurOut("Error in calcObserved"); mothurOutEndLine(); } + + vector queryFragment; + vector subjectFragment; + + int startpoint = 0; + for (int m = 0; m < iters; m++) { + string seqFrag = query->getAligned().substr(startpoint, window); + Sequence* temp = new Sequence(query->getName(), seqFrag); + queryFragment.push_back(temp); + + string seqFragsub = subject->getAligned().substr(startpoint, window); + Sequence* tempsub = new Sequence(subject->getName(), seqFragsub); + subjectFragment.push_back(tempsub); + + startpoint += increment; + } + + + for(int j = 0; j < iters; j++){ + + distCalculator->calcDist(*(queryFragment[j]), *(subjectFragment[j])); + float dist = distCalculator->getDist(); + + //convert to similiarity + //dist = 1 - dist; + + //save oi + obsDistance[query].push_back(dist); + } + + //free memory + for (int i = 0; i < queryFragment.size(); i++) { delete queryFragment[i]; } + for (int i = 0; i < subjectFragment.size(); i++) { delete subjectFragment[i]; } + } + + } + catch(exception& e) { + errorOut(e, "Pintail", "calcObserved"); + exit(1); + } +} + +//*************************************************************************************************************** +void Pintail::calcExpected(int start, int end) { + try { + + + //for each sequence + for(int i = start; i < end; i++){ + + itCoef = seqCoef.find(querySeqs[i]); + float coef = itCoef->second; + + //for each window + vector queryExpected; + for (int m = 0; m < iters; m++) { + float expected = averageProbability[m] * coef; + queryExpected.push_back(expected); + } + + expectedDistance[querySeqs[i]] = queryExpected; + + } + + } + catch(exception& e) { + errorOut(e, "Pintail", "calcExpected"); + exit(1); + } +} +//*************************************************************************************************************** +void Pintail::calcDE(int start, int end) { + try { + + + //for each sequence + for(int i = start; i < end; i++){ + + itObsDist = obsDistance.find(querySeqs[i]); + vector obs = itObsDist->second; + + itExpDist = expectedDistance.find(querySeqs[i]); + vector exp = itExpDist->second; + + //for each window + float sum = 0.0; //sum = sum from 1 to m of (oi-ei)^2 + for (int m = 0; m < iters; m++) { sum += ((obs[m] - exp[m]) * (obs[m] - exp[m])); } + + float de = sqrt((sum / (iters - 1))); + + DE[querySeqs[i]] = de; + } + + } + catch(exception& e) { + errorOut(e, "Pintail", "calcDE"); + exit(1); + } +} + +//*************************************************************************************************************** + +vector Pintail::calcFreq(vector seqs) { + try { + + vector prob; + + //at each position in the sequence + for (int i = 0; i < seqs[0]->getAligned().length(); i++) { + + vector freq; freq.resize(4,0); + + //find the frequency of each nucleotide + for (int j = 0; j < seqs.size(); j++) { + + char value = seqs[j]->getAligned()[i]; + + if(toupper(value) == 'A') { freq[0]++; } + else if(toupper(value) == 'T' || toupper(value) == 'U') { freq[1]++; } + else if(toupper(value) == 'G') { freq[2]++; } + else if(toupper(value) == 'C') { freq[3]++; } + } + + //find base with highest frequency + int highest = 0; + for (int m = 0; m < freq.size(); m++) { if (freq[m] > highest) { highest = freq[m]; } } + + float highFreq = highest / (float) seqs.size(); + float Pi; + //if this is not a gap column + if (highFreq != 0) { Pi = (highFreq - 0.25) / 0.75; } + else { Pi = 0; } + + prob.push_back(Pi); + + } + + return prob; + + } + catch(exception& e) { + errorOut(e, "Pintail", "calcFreq"); + exit(1); + } +} +//*************************************************************************************************************** +vector Pintail::findQav(vector prob) { + try { + vector averages; + + //for each window find average + int startpoint = 0; + for (int m = 0; m < iters; m++) { + + float average = 0.0; + for (int i = startpoint; i < (startpoint+window); i++) { average += prob[i]; } + + average = average / window; + + //save this windows average + averages.push_back(average); + + startpoint += increment; + } + + return averages; + } + catch(exception& e) { + errorOut(e, "Pintail", "findQav"); + exit(1); + } +} +//*************************************************************************************************************** +map Pintail::getCoef(vector prob) { + try { + map coefs; + + //find average prob + float probAverage = 0.0; + for (int i = 0; i < prob.size(); i++) { probAverage += prob[i]; } + probAverage = probAverage / (float) prob.size(); + + //find a coef for each sequence + map >::iterator it; + for (it = obsDistance.begin(); it != obsDistance.end(); it++) { + + vector temp = it->second; + Sequence* tempSeq = it->first; + + //find observed average + float obsAverage = 0.0; + for (int i = 0; i < temp.size(); i++) { obsAverage += temp[i]; } + obsAverage = obsAverage / (float) temp.size(); + + float coef = obsAverage / probAverage; + + //save this sequences coefficient + coefs[tempSeq] = coef; + } + + + return coefs; + } + catch(exception& e) { + errorOut(e, "Pintail", "getCoef"); + exit(1); + } +} + +/**************************************************************************************************/ + +void Pintail::createProcessesPairs() { + try { +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + int process = 0; + vector processIDS; + + //loop through and create all the processes you want + while (process != processors) { + int pid = fork(); + + if (pid > 0) { + processIDS.push_back(pid); + process++; + }else if (pid == 0){ + findPairs(lines[process]->start, lines[process]->end); + exit(0); + }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); } + } + + //force parent to wait until all the processes are done + for (int i=0;istart, lines[0]->end); + +#endif + } + catch(exception& e) { + errorOut(e, "Pintail", "createProcessesPairs"); + exit(1); + } +} + +/**************************************************************************************************/ + +void Pintail::createProcessesObserved() { + try { +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + int process = 0; + vector processIDS; + + //loop through and create all the processes you want + while (process != processors) { + int pid = fork(); + + if (pid > 0) { + processIDS.push_back(pid); + process++; + }else if (pid == 0){ + calcObserved(lines[process]->start, lines[process]->end); + exit(0); + }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); } + } + + //force parent to wait until all the processes are done + for (int i=0;istart, lines[0]->end); + +#endif + } + catch(exception& e) { + errorOut(e, "Pintail", "createProcessesObserved"); + exit(1); + } +} + +//*************************************************************************************************************** + +void Pintail::createProcessesExpected() { + try { +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + int process = 0; + vector processIDS; + + //loop through and create all the processes you want + while (process != processors) { + int pid = fork(); + + if (pid > 0) { + processIDS.push_back(pid); + process++; + }else if (pid == 0){ + calcExpected(lines[process]->start, lines[process]->end); + exit(0); + }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); } + } + + //force parent to wait until all the processes are done + for (int i=0;istart, lines[0]->end); + +#endif + } + catch(exception& e) { + errorOut(e, "Pintail", "createProcessesExpected"); + exit(1); + } +} + +/**************************************************************************************************/ + +void Pintail::createProcessesDE() { + try { +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + int process = 0; + vector processIDS; + + //loop through and create all the processes you want + while (process != processors) { + int pid = fork(); + + if (pid > 0) { + processIDS.push_back(pid); + process++; + }else if (pid == 0){ + calcDE(lines[process]->start, lines[process]->end); + exit(0); + }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); } + } + + //force parent to wait until all the processes are done + for (int i=0;istart, lines[0]->end); + +#endif + } + catch(exception& e) { + errorOut(e, "Pintail", "createProcessesDE"); + exit(1); + } +} + +//*************************************************************************************************************** + + diff --git a/pintail.h b/pintail.h new file mode 100644 index 0000000..bae39d7 --- /dev/null +++ b/pintail.h @@ -0,0 +1,81 @@ +#ifndef PINTAIL_H +#define PINTAIL_H + +/* + * pintail.h + * Mothur + * + * Created by Sarah Westcott on 7/9/09. + * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved. + * + */ + +#include "chimera.h" +#include "dist.h" + +//This class was created using the algorythms described in the +// "At Least 1 in 20 16S rRNA Sequence Records Currently Held in the Public Repositories is Estimated To Contain Substantial Anomalies" paper +//by Kevin E. Ashelford 1, Nadia A. Chuzhanova 3, John C. Fry 1, Antonia J. Jones 2 and Andrew J. Weightman 1. + +/***********************************************************/ + +class Pintail : public Chimera { + + public: + Pintail(string); + ~Pintail(); + + void getChimeras(); + void print(ostream&); + + + private: + + struct linePair { + int start; + int end; + linePair(int i, int j) : start(i), end(j) {} + }; + + + Dist* distCalculator; + string fastafile; + int iters; + vector lines; + vector querySeqs; + vector templateSeqs; + + map bestfit; //maps a query sequence to its most similiar sequence in the template + map::iterator itBest; + + map > obsDistance; //maps a query sequence to its observed distance at each window + map > expectedDistance; //maps a query sequence to its expected distance at each window + map >::iterator itObsDist; + map >::iterator itExpDist; + + vector averageProbability; //Qav + map seqCoef; //maps a sequence to its coefficient + map DE; //maps a sequence to its deviation + map::iterator itCoef; + + vector readSeqs(string); + vector findQav(vector); + vector calcFreq(vector); + map getCoef(vector); + + void findPairs(int, int); + void calcObserved(int, int); + void calcExpected(int, int); + void calcDE(int, int); + + void createProcessesPairs(); + void createProcessesObserved(); + void createProcessesExpected(); + void createProcessesDE(); + +}; + +/***********************************************************/ + +#endif + diff --git a/rabundvector.cpp b/rabundvector.cpp index ce04df1..7d62470 100644 --- a/rabundvector.cpp +++ b/rabundvector.cpp @@ -193,6 +193,19 @@ vector::reverse_iterator RAbundVector::rend(){ return data.rend(); } +/***********************************************************************/ +void RAbundVector::nonSortedPrint(ostream& output){ + try { + output << label << '\t' << numBins << '\t'; + + for(int i=0;i