3746109D0F40657600460C57 /* unifracunweightedcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 3746109C0F40657600460C57 /* unifracunweightedcommand.cpp */; };
3749271D0FD58C840031C06B /* getsabundcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 3749271C0FD58C840031C06B /* getsabundcommand.cpp */; };
3749273F0FD5956B0031C06B /* getrabundcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 3749273E0FD5956B0031C06B /* getrabundcommand.cpp */; };
+ 374F2FEB100634B600E97C66 /* bellerophon.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 374F2FEA100634B600E97C66 /* bellerophon.cpp */; };
+ 374F301310063B6F00E97C66 /* pintail.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 374F301210063B6F00E97C66 /* pintail.cpp */; };
37519A6B0F80E6EB00FED5E8 /* sharedanderbergs.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 37519A6A0F80E6EB00FED5E8 /* sharedanderbergs.cpp */; };
37519AA10F810D0200FED5E8 /* venncommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 37519AA00F810D0200FED5E8 /* venncommand.cpp */; };
37519AB50F810FAE00FED5E8 /* venn.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 37519AB40F810FAE00FED5E8 /* venn.cpp */; };
3749271C0FD58C840031C06B /* getsabundcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getsabundcommand.cpp; sourceTree = SOURCE_ROOT; };
3749273D0FD5956B0031C06B /* getrabundcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getrabundcommand.h; sourceTree = SOURCE_ROOT; };
3749273E0FD5956B0031C06B /* getrabundcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getrabundcommand.cpp; sourceTree = SOURCE_ROOT; };
+ 374F2FD51006320200E97C66 /* chimera.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimera.h; sourceTree = "<group>"; };
+ 374F2FE9100634B600E97C66 /* bellerophon.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = bellerophon.h; sourceTree = "<group>"; };
+ 374F2FEA100634B600E97C66 /* bellerophon.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = bellerophon.cpp; sourceTree = "<group>"; };
+ 374F301110063B6F00E97C66 /* pintail.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = pintail.h; sourceTree = "<group>"; };
+ 374F301210063B6F00E97C66 /* pintail.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = pintail.cpp; sourceTree = "<group>"; };
37519A690F80E6EB00FED5E8 /* sharedanderbergs.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sharedanderbergs.h; sourceTree = SOURCE_ROOT; };
37519A6A0F80E6EB00FED5E8 /* sharedanderbergs.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sharedanderbergs.cpp; sourceTree = SOURCE_ROOT; };
37519A9F0F810D0200FED5E8 /* venncommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = venncommand.h; sourceTree = SOURCE_ROOT; };
373C68DC0FC1C38D00137ACD /* blastalign.hpp */,
373C68DB0FC1C38D00137ACD /* blastalign.cpp */,
37D928A60F2133C0001D4494 /* calculators */,
+ 374F2FE81006346600E97C66 /* chimera */,
37D927C20F21331F001D4494 /* cluster.hpp */,
37D927C10F21331F001D4494 /* cluster.cpp */,
37D928A90F2133E5001D4494 /* commands */,
name = Products;
sourceTree = "<group>";
};
+ 374F2FE81006346600E97C66 /* chimera */ = {
+ isa = PBXGroup;
+ children = (
+ 374F2FD51006320200E97C66 /* chimera.h */,
+ 374F2FE9100634B600E97C66 /* bellerophon.h */,
+ 374F2FEA100634B600E97C66 /* bellerophon.cpp */,
+ 374F301110063B6F00E97C66 /* pintail.h */,
+ 374F301210063B6F00E97C66 /* pintail.cpp */,
+ );
+ name = chimera;
+ sourceTree = "<group>";
+ };
3796441D0FB9B9650081FDB6 /* read */ = {
isa = PBXGroup;
children = (
37B73CA61004D89A008C4B41 /* getseqscommand.cpp in Sources */,
37B73CC01004EB38008C4B41 /* removeseqscommand.cpp in Sources */,
37B73CCF1004F5E0008C4B41 /* systemcommand.cpp in Sources */,
+ 374F2FEB100634B600E97C66 /* bellerophon.cpp in Sources */,
+ 374F301310063B6F00E97C66 /* pintail.cpp in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
--- /dev/null
+/*
+ * bellerophon.cpp
+ * Mothur
+ *
+ * Created by Sarah Westcott on 7/9/09.
+ * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
+ *
+ */
+
+#include "bellerophon.h"
+#include "eachgapdist.h"
+#include "ignoregaps.h"
+#include "onegapdist.h"
+
+
+//***************************************************************************************************************
+
+Bellerophon::Bellerophon(string name) {
+ try {
+ fastafile = name;
+ }
+ catch(exception& e) {
+ errorOut(e, "Bellerophon", "Bellerophon");
+ exit(1);
+ }
+}
+
+//***************************************************************************************************************
+void Bellerophon::print(ostream& out) {
+ try {
+ int above1 = 0;
+ out << "Name\tScore\tLeft\tRight\t" << endl;
+ //output prefenence structure to .chimeras file
+ for (int i = 0; i < pref.size(); i++) {
+ out << pref[i].name << '\t' << pref[i].score[0] << '\t' << pref[i].leftParent[0] << '\t' << pref[i].rightParent[0] << endl;
+
+ //calc # of seqs with preference above 1.0
+ if (pref[i].score[0] > 1.0) {
+ above1++;
+ mothurOut(pref[i].name + " is a suspected chimera at breakpoint " + toString(pref[i].midpoint)); mothurOutEndLine();
+ mothurOut("It's score is " + toString(pref[i].score[0]) + " with suspected left parent " + pref[i].leftParent[0] + " and right parent " + pref[i].rightParent[0]); mothurOutEndLine();
+ }
+ }
+
+ //output results to screen
+ mothurOutEndLine();
+ mothurOut("Sequence with preference score above 1.0: " + toString(above1)); mothurOutEndLine();
+ int spot;
+ spot = pref.size()-1;
+ mothurOut("Minimum:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
+ spot = pref.size() * 0.975;
+ mothurOut("2.5%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
+ spot = pref.size() * 0.75;
+ mothurOut("25%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
+ spot = pref.size() * 0.50;
+ mothurOut("Median: \t" + toString(pref[spot].score[0])); mothurOutEndLine();
+ spot = pref.size() * 0.25;
+ mothurOut("75%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
+ spot = pref.size() * 0.025;
+ mothurOut("97.5%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
+ spot = 0;
+ mothurOut("Maximum:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
+
+ }
+ catch(exception& e) {
+ errorOut(e, "Bellerophon", "print");
+ exit(1);
+ }
+}
+
+//********************************************************************************************************************
+//sorts highest score to lowest
+inline bool comparePref(Preference left, Preference right){
+ return (left.score[0] > right.score[0]);
+}
+
+//***************************************************************************************************************
+void Bellerophon::getChimeras() {
+ try {
+
+ //do soft filter
+ if (filter) {
+ string optionString = "fasta=" + fastafile + ", soft=50, vertical=F";
+ filterSeqs = new FilterSeqsCommand(optionString);
+ filterSeqs->execute();
+ delete filterSeqs;
+
+ //reset fastafile to filtered file
+ fastafile = getRootName(fastafile) + "filter.fasta";
+ }
+
+ distCalculator = new eachGapDist();
+
+ //read in sequences
+ readSeqs();
+
+ int numSeqs = seqs.size();
+
+ if (numSeqs == 0) { mothurOut("Error in reading you sequences."); mothurOutEndLine(); exit(1); }
+
+ //set default window to 25% of sequence length
+ string seq0 = seqs[0].getAligned();
+ if (window == 0) { window = seq0.length() / 4; }
+ else if (window > (seq0.length() / 2)) {
+ mothurOut("Your sequence length is = " + toString(seq0.length()) + ". You have selected a window size greater than the length of half your aligned sequence. I will run it with a window size of " + toString((seq0.length() / 2))); mothurOutEndLine();
+ window = (seq0.length() / 2);
+ }
+
+ if (increment > (seqs[0].getAlignLength() - (2*window))) {
+ if (increment != 10) {
+
+ mothurOut("You have selected a increment that is too large. I will use the default."); mothurOutEndLine();
+ increment = 10;
+ if (increment > (seqs[0].getAlignLength() - (2*window))) { increment = 0; }
+
+ }else{ increment = 0; }
+ }
+cout << "increment = " << increment << endl;
+ if (increment == 0) { iters = 1; }
+ else { iters = ((seqs[0].getAlignLength() - (2*window)) / increment); }
+
+ //initialize pref
+ pref.resize(numSeqs);
+
+ for (int i = 0; i < numSeqs; i++ ) {
+ pref[i].leftParent.resize(2); pref[i].rightParent.resize(2); pref[i].score.resize(2); pref[i].closestLeft.resize(2); pref[i].closestRight.resize(3);
+ pref[i].name = seqs[i].getName();
+ pref[i].score[0] = 0.0; pref[i].score[1] = 0.0;
+ pref[i].closestLeft[0] = 100000.0; pref[i].closestLeft[1] = 100000.0;
+ pref[i].closestRight[0] = 100000.0; pref[i].closestRight[1] = 100000.0;
+ }
+
+ int midpoint = window;
+ int count = 0;
+ while (count < iters) {
+
+ //create 2 vectors of sequences, 1 for left side and one for right side
+ vector<Sequence> left; vector<Sequence> right;
+
+ for (int i = 0; i < seqs.size(); i++) {
+//cout << "whole = " << seqs[i].getAligned() << endl;
+ //save left side
+ string seqLeft = seqs[i].getAligned().substr(midpoint-window, window);
+ Sequence tempLeft;
+ tempLeft.setName(seqs[i].getName());
+ tempLeft.setAligned(seqLeft);
+ left.push_back(tempLeft);
+//cout << "left = " << tempLeft.getAligned() << endl;
+ //save right side
+ string seqRight = seqs[i].getAligned().substr(midpoint, window);
+ Sequence tempRight;
+ tempRight.setName(seqs[i].getName());
+ tempRight.setAligned(seqRight);
+ right.push_back(tempRight);
+//cout << "right = " << seqRight << endl;
+ }
+
+ //adjust midpoint by increment
+ midpoint += increment;
+
+
+ //this should be parallelized
+ //perference = sum of (| distance of my left to sequence j's left - distance of my right to sequence j's right | )
+ //create a matrix containing the distance from left to left and right to right
+ //calculate distances
+ SparseMatrix* SparseLeft = new SparseMatrix();
+ SparseMatrix* SparseRight = new SparseMatrix();
+
+ createSparseMatrix(0, left.size(), SparseLeft, left);
+ createSparseMatrix(0, right.size(), SparseRight, right);
+
+ vector<SeqMap> distMapRight;
+ vector<SeqMap> distMapLeft;
+
+ // Create a data structure to quickly access the distance information.
+ // It consists of a vector of distance maps, where each map contains
+ // all distances of a certain sequence. Vector and maps are accessed
+ // via the index of a sequence in the distance matrix
+ distMapRight = vector<SeqMap>(numSeqs);
+ distMapLeft = vector<SeqMap>(numSeqs);
+ //cout << "left" << endl << endl;
+ for (MatData currentCell = SparseLeft->begin(); currentCell != SparseLeft->end(); currentCell++) {
+ distMapLeft[currentCell->row][currentCell->column] = currentCell->dist;
+ //cout << " i = " << currentCell->row << " j = " << currentCell->column << " dist = " << currentCell->dist << endl;
+ }
+ //cout << "right" << endl << endl;
+ for (MatData currentCell = SparseRight->begin(); currentCell != SparseRight->end(); currentCell++) {
+ distMapRight[currentCell->row][currentCell->column] = currentCell->dist;
+ //cout << " i = " << currentCell->row << " j = " << currentCell->column << " dist = " << currentCell->dist << endl;
+ }
+
+ delete SparseLeft;
+ delete SparseRight;
+
+
+ //fill preference structure
+ generatePreferences(distMapLeft, distMapRight, midpoint);
+
+ count++;
+
+ }
+
+ delete distCalculator;
+
+ //rank preference score to eachother
+ float dme = 0.0;
+ float expectedPercent = 1 / (float) (pref.size());
+
+ for (int i = 0; i < pref.size(); i++) { dme += pref[i].score[0]; }
+
+ for (int i = 0; i < pref.size(); i++) {
+
+ //gives the actual percentage of the dme this seq adds
+ pref[i].score[0] = pref[i].score[0] / dme;
+
+ //how much higher or lower is this than expected
+ pref[i].score[0] = pref[i].score[0] / expectedPercent;
+
+ }
+
+
+ //sort Preferences highest to lowest
+ sort(pref.begin(), pref.end(), comparePref);
+
+
+
+ }
+ catch(exception& e) {
+ errorOut(e, "Bellerophon", "getChimeras");
+ exit(1);
+ }
+}
+
+//***************************************************************************************************************
+void Bellerophon::readSeqs(){
+ try {
+ ifstream inFASTA;
+ openInputFile(fastafile, inFASTA);
+
+ //read in seqs and store in vector
+ while(!inFASTA.eof()){
+ Sequence current(inFASTA);
+
+ if (current.getAligned() == "") { current.setAligned(current.getUnaligned()); }
+
+ seqs.push_back(current);
+
+ gobble(inFASTA);
+ }
+ inFASTA.close();
+
+ }
+ catch(exception& e) {
+ errorOut(e, "Bellerophon", "readSeqs");
+ exit(1);
+ }
+}
+
+/***************************************************************************************************************/
+int Bellerophon::createSparseMatrix(int startSeq, int endSeq, SparseMatrix* sparse, vector<Sequence> s){
+ try {
+
+ for(int i=startSeq; i<endSeq; i++){
+
+ for(int j=0;j<i;j++){
+
+ distCalculator->calcDist(s[i], s[j]);
+ float dist = distCalculator->getDist();
+
+ PCell temp(i, j, dist);
+ sparse->addCell(temp);
+
+ }
+ }
+
+
+ return 1;
+ }
+ catch(exception& e) {
+ errorOut(e, "Bellerophon", "createSparseMatrix");
+ exit(1);
+ }
+}
+/***************************************************************************************************************/
+void Bellerophon::generatePreferences(vector<SeqMap> left, vector<SeqMap> right, int mid){
+ try {
+
+ float dme = 0.0;
+ SeqMap::iterator itR;
+ SeqMap::iterator itL;
+
+ //initialize pref[i]
+ for (int i = 0; i < pref.size(); i++) {
+ pref[i].score[1] = 0.0;
+ pref[i].closestLeft[1] = 100000.0;
+ pref[i].closestRight[1] = 100000.0;
+ pref[i].leftParent[1] = "";
+ pref[i].rightParent[1] = "";
+ }
+
+ for (int i = 0; i < left.size(); i++) {
+
+ SeqMap currentLeft = left[i]; //example i = 3; currentLeft is a map of 0 to the distance of sequence 3 to sequence 0,
+ // 1 to the distance of sequence 3 to sequence 1,
+ // 2 to the distance of sequence 3 to sequence 2.
+ SeqMap currentRight = right[i]; // same as left but with distances on the right side.
+
+ for (int j = 0; j < i; j++) {
+
+ itL = currentLeft.find(j);
+ itR = currentRight.find(j);
+//cout << " i = " << i << " j = " << j << " distLeft = " << itL->second << endl;
+//cout << " i = " << i << " j = " << j << " distright = " << itR->second << endl;
+
+ //if you can find this entry update the preferences
+ if ((itL != currentLeft.end()) && (itR != currentRight.end())) {
+
+ if (!correction) {
+ pref[i].score[1] += abs((itL->second - itR->second));
+ pref[j].score[1] += abs((itL->second - itR->second));
+//cout << "left " << i << " " << j << " = " << itL->second << " right " << i << " " << j << " = " << itR->second << endl;
+//cout << "abs = " << abs((itL->second - itR->second)) << endl;
+//cout << i << " score = " << pref[i].score[1] << endl;
+//cout << j << " score = " << pref[j].score[1] << endl;
+ }else {
+ pref[i].score[1] += abs((sqrt(itL->second) - sqrt(itR->second)));
+ pref[j].score[1] += abs((sqrt(itL->second) - sqrt(itR->second)));
+//cout << "left " << i << " " << j << " = " << itL->second << " right " << i << " " << j << " = " << itR->second << endl;
+//cout << "abs = " << abs((sqrt(itL->second) - sqrt(itR->second))) << endl;
+//cout << i << " score = " << pref[i].score[1] << endl;
+//cout << j << " score = " << pref[j].score[1] << endl;
+ }
+//cout << "pref[" << i << "].closestLeft[1] = " << pref[i].closestLeft[1] << " parent = " << pref[i].leftParent[1] << endl;
+ //are you the closest left sequence
+ if (itL->second < pref[i].closestLeft[1]) {
+
+ pref[i].closestLeft[1] = itL->second;
+ pref[i].leftParent[1] = seqs[j].getName();
+//cout << "updating closest left to " << pref[i].leftParent[1] << endl;
+ }
+//cout << "pref[" << j << "].closestLeft[1] = " << pref[j].closestLeft[1] << " parent = " << pref[j].leftParent[1] << endl;
+ if (itL->second < pref[j].closestLeft[1]) {
+ pref[j].closestLeft[1] = itL->second;
+ pref[j].leftParent[1] = seqs[i].getName();
+//cout << "updating closest left to " << pref[j].leftParent[1] << endl;
+ }
+
+ //are you the closest right sequence
+ if (itR->second < pref[i].closestRight[1]) {
+ pref[i].closestRight[1] = itR->second;
+ pref[i].rightParent[1] = seqs[j].getName();
+ }
+ if (itR->second < pref[j].closestRight[1]) {
+ pref[j].closestRight[1] = itR->second;
+ pref[j].rightParent[1] = seqs[i].getName();
+ }
+
+ }
+ }
+
+ }
+
+
+
+ //calculate the dme
+
+ int count0 = 0;
+ for (int i = 0; i < pref.size(); i++) { dme += pref[i].score[1]; if (pref[i].score[1] == 0.0) { count0++; } }
+
+ float expectedPercent = 1 / (float) (pref.size() - count0);
+//cout << endl << "dme = " << dme << endl;
+ //recalculate prefernences based on dme
+ for (int i = 0; i < pref.size(); i++) {
+//cout << "unadjusted pref " << i << " = " << pref[i].score[1] << endl;
+ // gives the actual percentage of the dme this seq adds
+ pref[i].score[1] = pref[i].score[1] / dme;
+
+ //how much higher or lower is this than expected
+ pref[i].score[1] = pref[i].score[1] / expectedPercent;
+
+ //pref[i].score[1] = dme / (dme - 2 * pref[i].score[1]);
+
+ //so a non chimeric sequence would be around 1, and a chimeric would be signifigantly higher.
+//cout << "adjusted pref " << i << " = " << pref[i].score[1] << endl;
+ }
+
+ //is this score bigger then the last score
+ for (int i = 0; i < pref.size(); i++) {
+
+ //update biggest score
+ if (pref[i].score[1] > pref[i].score[0]) {
+ pref[i].score[0] = pref[i].score[1];
+ pref[i].leftParent[0] = pref[i].leftParent[1];
+ pref[i].rightParent[0] = pref[i].rightParent[1];
+ pref[i].closestLeft[0] = pref[i].closestLeft[1];
+ pref[i].closestRight[0] = pref[i].closestRight[1];
+ pref[i].midpoint = mid;
+ }
+
+ }
+
+ }
+ catch(exception& e) {
+ errorOut(e, "Bellerophon", "generatePreferences");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+
--- /dev/null
+#ifndef BELLEROPHON_H
+#define BELLEROPHON_H
+
+/*
+ * bellerophon.h
+ * Mothur
+ *
+ * Created by Sarah Westcott on 7/9/09.
+ * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
+ *
+ */
+
+
+#include "chimera.h"
+#include "filterseqscommand.h"
+#include "sequence.hpp"
+#include "dist.h"
+
+struct Preference {
+ string name;
+ vector<string> leftParent; //keep the name of closest left associated with the two scores
+ vector<string> rightParent; //keep the name of closest right associated with the two scores
+ vector<float> score; //so you can keep last score and calc this score and keep whichever is bigger.
+ vector<float> closestLeft; //keep the closest left associated with the two scores
+ vector<float> closestRight; //keep the closest right associated with the two scores
+ int midpoint;
+
+};
+
+
+/***********************************************************/
+
+class Bellerophon : public Chimera {
+
+ public:
+ Bellerophon(string);
+ ~Bellerophon() {};
+
+ void getChimeras();
+ void print(ostream&);
+
+
+ private:
+ Dist* distCalculator;
+ FilterSeqsCommand* filterSeqs;
+ vector<Sequence> seqs;
+ vector<Preference> pref;
+ string fastafile;
+ int iters;
+
+ void readSeqs();
+ void generatePreferences(vector<SeqMap>, vector<SeqMap>, int);
+ int createSparseMatrix(int, int, SparseMatrix*, vector<Sequence>);
+
+
+
+
+};
+
+/***********************************************************/
+
+#endif
+
--- /dev/null
+#ifndef CHIMERA_H
+#define CHIMERA_H
+
+/*
+ * chimera.h
+ * Mothur
+ *
+ * Created by Sarah Westcott on 7/9/09.
+ * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
+ *
+ */
+
+
+#include "mothur.h"
+#include "sparsematrix.hpp"
+
+typedef list<PCell>::iterator MatData;
+typedef map<int, float> SeqMap; //maps sequence to all distance for that seqeunce
+
+
+/***********************************************************************/
+
+class Chimera {
+
+ public:
+
+ Chimera(){};
+ Chimera(string);
+ virtual ~Chimera(){};
+ virtual void setFilter(bool f) { filter = f; }
+ virtual void setCorrection(bool c) { correction = c; }
+ virtual void setProcessors(int p) { processors = p; }
+ virtual void setWindow(int w) { window = w; }
+ virtual void setIncrement(int i) { increment = i; }
+ virtual void setTemplate(string t) { templateFile = t; }
+
+ //pure functions
+ virtual void getChimeras() = 0;
+ virtual void print(ostream&) = 0;
+
+ protected:
+
+ bool filter, correction;
+ int processors, window, increment;
+ string templateFile;
+
+
+};
+
+/***********************************************************************/
+
+#endif
+
*/
#include "chimeraseqscommand.h"
-#include "eachgapdist.h"
-#include "ignoregaps.h"
-#include "onegapdist.h"
+#include "bellerophon.h"
+#include "pintail.h"
//***************************************************************************************************************
else {
//valid paramters for this command
- string Array[] = {"fasta", "filter", "correction", "processors", "method", "window", "increment" };
+ string Array[] = {"fasta", "filter", "correction", "processors", "method", "window", "increment", "template" };
vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
OptionParser parser(option);
if (fastafile == "not open") { abort = true; }
else if (fastafile == "not found") { fastafile = ""; mothurOut("fasta is a required parameter for the chimera.seqs command."); mothurOutEndLine(); abort = true; }
+ templatefile = validParameter.validFile(parameters, "template", true);
+ if (templatefile == "not open") { abort = true; }
+ else if (templatefile == "not found") { templatefile = ""; }
+
+
string temp;
temp = validParameter.validFile(parameters, "filter", false); if (temp == "not found") { temp = "T"; }
filter = isTrue(temp);
temp = validParameter.validFile(parameters, "increment", false); if (temp == "not found") { temp = "10"; }
convert(temp, increment);
- method = validParameter.validFile(parameters, "method", false); if (method == "not found") { method = "bellerophon"; }
+ method = validParameter.validFile(parameters, "method", false); if (method == "not found") { method = "pintail"; }
+
+ if ((method == "pintail") && (templatefile == "")) { mothurOut("You must provide a template file with the pintail method."); mothurOutEndLine(); abort = true; }
- if (method != "bellerophon") { mothurOut(method + " is not a valid method."); mothurOutEndLine(); abort = true; }
}
}
mothurOut("The chimera.seqs command reads a fastafile and creates a sorted priority score list of potentially chimeric sequences (ideally, the sequences should already be aligned).\n");
mothurOut("The chimera.seqs command parameters are fasta, filter, correction, processors and method. fasta is required.\n");
mothurOut("The filter parameter allows you to specify if you would like to apply a 50% soft filter. The default is false. \n");
- mothurOut("The correction parameter allows you to ..... The default is true. \n");
+ mothurOut("The correction parameter allows you to put more emphasis on the distance between highly similar sequences and less emphasis on the differences between remote homologs. The default is true. \n");
mothurOut("The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n");
- mothurOut("The method parameter allows you to specify the method for finding chimeric sequences. The default is bellerophon. \n");
+ mothurOut("The method parameter allows you to specify the method for finding chimeric sequences. The default is pintail. \n");
mothurOut("The chimera.seqs command should be in the following format: \n");
mothurOut("chimera.seqs(fasta=yourFastaFile, filter=yourFilter, correction=yourCorrection, processors=yourProcessors, method=bellerophon) \n");
mothurOut("Example: chimera.seqs(fasta=AD.align, filter=True, correction=true, processors=2, method=yourMethod) \n");
exit(1);
}
}
-//********************************************************************************************************************
-//sorts highest score to lowest
-inline bool comparePref(Preference left, Preference right){
- return (left.score[0] > right.score[0]);
-}
//***************************************************************************************************************
if (abort == true) { return 0; }
+ if (method == "bellerophon") { chimera = new Bellerophon(fastafile); }
+ else if (method == "pintail") { chimera = new Pintail(fastafile); }
+ else { mothurOut("Not a valid method."); mothurOutEndLine(); return 0; }
- //do soft filter
- if (filter) {
- string optionString = "fasta=" + fastafile + ", soft=50, vertical=F";
- filterSeqs = new FilterSeqsCommand(optionString);
- filterSeqs->execute();
- delete filterSeqs;
-
- //reset fastafile to filtered file
- fastafile = getRootName(fastafile) + "filter.fasta";
- }
-
- distCalculator = new eachGapDist();
-
- //read in sequences
- readSeqs();
+ //set user options
+ chimera->setFilter(filter);
+ chimera->setCorrection(correction);
+ chimera->setProcessors(processors);
+ chimera->setWindow(window);
+ chimera->setIncrement(increment);
+ chimera->setTemplate(templatefile);
- int numSeqs = seqs.size();
-
- if (numSeqs == 0) { mothurOut("Error in reading you sequences."); mothurOutEndLine(); return 0; }
-
- //set default window to 25% of sequence length
- string seq0 = seqs[0].getAligned();
- if (window == 0) { window = seq0.length() / 4; }
- else if (window > (seq0.length() / 2)) {
- mothurOut("Your sequence length is = " + toString(seq0.length()) + ". You have selected a window size greater than the length of half your aligned sequence. I will run it with a window size of " + toString((seq0.length() / 2))); mothurOutEndLine();
- window = (seq0.length() / 2);
- }
-
- if (increment > (seqs[0].getAlignLength() - (2*window))) {
- if (increment != 10) {
-
- mothurOut("You have selected a increment that is too large. I will use the default."); mothurOutEndLine();
- increment = 10;
- if (increment > (seqs[0].getAlignLength() - (2*window))) { increment = 0; }
-
- }else{ increment = 0; }
- }
-cout << "increment = " << increment << endl;
- if (increment == 0) { iters = 1; }
- else { iters = ((seqs[0].getAlignLength() - (2*window)) / increment); }
+ //find chimeras
+ chimera->getChimeras();
- //initialize pref
- pref.resize(numSeqs);
-
- for (int i = 0; i < numSeqs; i++ ) {
- pref[i].leftParent.resize(2); pref[i].rightParent.resize(2); pref[i].score.resize(2); pref[i].closestLeft.resize(2); pref[i].closestRight.resize(3);
- pref[i].name = seqs[i].getName();
- pref[i].score[0] = 0.0; pref[i].score[1] = 0.0;
- pref[i].closestLeft[0] = 100000.0; pref[i].closestLeft[1] = 100000.0;
- pref[i].closestRight[0] = 100000.0; pref[i].closestRight[1] = 100000.0;
- }
-
- int midpoint = window;
- int count = 0;
- while (count < iters) {
-
- //create 2 vectors of sequences, 1 for left side and one for right side
- vector<Sequence> left; vector<Sequence> right;
-
- for (int i = 0; i < seqs.size(); i++) {
-//cout << "whole = " << seqs[i].getAligned() << endl;
- //save left side
- string seqLeft = seqs[i].getAligned().substr(midpoint-window, window);
- Sequence tempLeft;
- tempLeft.setName(seqs[i].getName());
- tempLeft.setAligned(seqLeft);
- left.push_back(tempLeft);
-//cout << "left = " << tempLeft.getAligned() << endl;
- //save right side
- string seqRight = seqs[i].getAligned().substr(midpoint, window);
- Sequence tempRight;
- tempRight.setName(seqs[i].getName());
- tempRight.setAligned(seqRight);
- right.push_back(tempRight);
-//cout << "right = " << seqRight << endl;
- }
-
- //adjust midpoint by increment
- midpoint += increment;
-
-
- //this should be parallelized
- //perference = sum of (| distance of my left to sequence j's left - distance of my right to sequence j's right | )
- //create a matrix containing the distance from left to left and right to right
- //calculate distances
- SparseMatrix* SparseLeft = new SparseMatrix();
- SparseMatrix* SparseRight = new SparseMatrix();
-
- createSparseMatrix(0, left.size(), SparseLeft, left);
- createSparseMatrix(0, right.size(), SparseRight, right);
-
- vector<SeqMap> distMapRight;
- vector<SeqMap> distMapLeft;
-
- // Create a data structure to quickly access the distance information.
- // It consists of a vector of distance maps, where each map contains
- // all distances of a certain sequence. Vector and maps are accessed
- // via the index of a sequence in the distance matrix
- distMapRight = vector<SeqMap>(numSeqs);
- distMapLeft = vector<SeqMap>(numSeqs);
- //cout << "left" << endl << endl;
- for (MatData currentCell = SparseLeft->begin(); currentCell != SparseLeft->end(); currentCell++) {
- distMapLeft[currentCell->row][currentCell->column] = currentCell->dist;
- //cout << " i = " << currentCell->row << " j = " << currentCell->column << " dist = " << currentCell->dist << endl;
- }
- //cout << "right" << endl << endl;
- for (MatData currentCell = SparseRight->begin(); currentCell != SparseRight->end(); currentCell++) {
- distMapRight[currentCell->row][currentCell->column] = currentCell->dist;
- //cout << " i = " << currentCell->row << " j = " << currentCell->column << " dist = " << currentCell->dist << endl;
- }
-
- delete SparseLeft;
- delete SparseRight;
-
-
- //fill preference structure
- generatePreferences(distMapLeft, distMapRight, midpoint);
-
- count++;
-
- }
-
- delete distCalculator;
-
- //rank preference score to eachother
- float dme = 0.0;
- float expectedPercent = 1 / (float) (pref.size());
-
- for (int i = 0; i < pref.size(); i++) { dme += pref[i].score[0]; }
-
- for (int i = 0; i < pref.size(); i++) {
-
- //gives the actual percentage of the dme this seq adds
- pref[i].score[0] = pref[i].score[0] / dme;
-
- //how much higher or lower is this than expected
- pref[i].score[0] = pref[i].score[0] / expectedPercent;
-
- }
-
-
- //sort Preferences highest to lowest
- sort(pref.begin(), pref.end(), comparePref);
-
- string outputFileName = getRootName(fastafile) + "chimeras";
+ string outputFileName = getRootName(fastafile) + method + ".chimeras";
ofstream out;
openOutputFile(outputFileName, out);
- int above1 = 0;
- out << "Name\tScore\tLeft\tRight\t" << endl;
- //output prefenence structure to .chimeras file
- for (int i = 0; i < pref.size(); i++) {
- out << pref[i].name << '\t' << pref[i].score[0] << '\t' << pref[i].leftParent[0] << '\t' << pref[i].rightParent[0] << endl;
-
- //calc # of seqs with preference above 1.0
- if (pref[i].score[0] > 1.0) {
- above1++;
- mothurOut(pref[i].name + " is a suspected chimera at breakpoint " + toString(pref[i].midpoint)); mothurOutEndLine();
- mothurOut("It's score is " + toString(pref[i].score[0]) + " with suspected left parent " + pref[i].leftParent[0] + " and right parent " + pref[i].rightParent[0]); mothurOutEndLine();
- }
-
-
- }
+ //print results
+ chimera->print(out);
- //output results to screen
- mothurOutEndLine();
- mothurOut("Sequence with preference score above 1.0: " + toString(above1)); mothurOutEndLine();
- int spot;
- spot = pref.size()-1;
- mothurOut("Minimum:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
- spot = pref.size() * 0.975;
- mothurOut("2.5%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
- spot = pref.size() * 0.75;
- mothurOut("25%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
- spot = pref.size() * 0.50;
- mothurOut("Median: \t" + toString(pref[spot].score[0])); mothurOutEndLine();
- spot = pref.size() * 0.25;
- mothurOut("75%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
- spot = pref.size() * 0.025;
- mothurOut("97.5%-tile:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
- spot = 0;
- mothurOut("Maximum:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
+ out.close();
- return 0;
- }
- catch(exception& e) {
- errorOut(e, "ChimeraSeqsCommand", "execute");
- exit(1);
- }
-}
-
-//***************************************************************************************************************
-void ChimeraSeqsCommand::readSeqs(){
- try {
- ifstream inFASTA;
- openInputFile(fastafile, inFASTA);
+ delete chimera;
- //read in seqs and store in vector
- while(!inFASTA.eof()){
- Sequence current(inFASTA);
-
- if (current.getAligned() == "") { current.setAligned(current.getUnaligned()); }
-
- seqs.push_back(current);
-
- gobble(inFASTA);
- }
- inFASTA.close();
-
- }
- catch(exception& e) {
- errorOut(e, "ChimeraSeqsCommand", "readSeqs");
- exit(1);
- }
-}
-
-/***************************************************************************************************************/
-int ChimeraSeqsCommand::createSparseMatrix(int startSeq, int endSeq, SparseMatrix* sparse, vector<Sequence> s){
- try {
-
- for(int i=startSeq; i<endSeq; i++){
-
- for(int j=0;j<i;j++){
-
- distCalculator->calcDist(s[i], s[j]);
- float dist = distCalculator->getDist();
-
- PCell temp(i, j, dist);
- sparse->addCell(temp);
-
- }
- }
-
-
- return 1;
- }
- catch(exception& e) {
- errorOut(e, "ChimeraSeqsCommand", "createSparseMatrix");
- exit(1);
- }
-}
-/***************************************************************************************************************/
-void ChimeraSeqsCommand::generatePreferences(vector<SeqMap> left, vector<SeqMap> right, int mid){
- try {
-
- float dme = 0.0;
- SeqMap::iterator itR;
- SeqMap::iterator itL;
-
- //initialize pref[i]
- for (int i = 0; i < pref.size(); i++) {
- pref[i].score[1] = 0.0;
- pref[i].closestLeft[1] = 100000.0;
- pref[i].closestRight[1] = 100000.0;
- pref[i].leftParent[1] = "";
- pref[i].rightParent[1] = "";
- }
-
- for (int i = 0; i < left.size(); i++) {
-
- SeqMap currentLeft = left[i]; //example i = 3; currentLeft is a map of 0 to the distance of sequence 3 to sequence 0,
- // 1 to the distance of sequence 3 to sequence 1,
- // 2 to the distance of sequence 3 to sequence 2.
- SeqMap currentRight = right[i]; // same as left but with distances on the right side.
-
- for (int j = 0; j < i; j++) {
-
- itL = currentLeft.find(j);
- itR = currentRight.find(j);
-//cout << " i = " << i << " j = " << j << " distLeft = " << itL->second << endl;
-//cout << " i = " << i << " j = " << j << " distright = " << itR->second << endl;
-
- //if you can find this entry update the preferences
- if ((itL != currentLeft.end()) && (itR != currentRight.end())) {
-
- if (!correction) {
- pref[i].score[1] += abs((itL->second - itR->second));
- pref[j].score[1] += abs((itL->second - itR->second));
-//cout << "left " << i << " " << j << " = " << itL->second << " right " << i << " " << j << " = " << itR->second << endl;
-//cout << "abs = " << abs((itL->second - itR->second)) << endl;
-//cout << i << " score = " << pref[i].score[1] << endl;
-//cout << j << " score = " << pref[j].score[1] << endl;
- }else {
- pref[i].score[1] += abs((sqrt(itL->second) - sqrt(itR->second)));
- pref[j].score[1] += abs((sqrt(itL->second) - sqrt(itR->second)));
-//cout << "left " << i << " " << j << " = " << itL->second << " right " << i << " " << j << " = " << itR->second << endl;
-//cout << "abs = " << abs((sqrt(itL->second) - sqrt(itR->second))) << endl;
-//cout << i << " score = " << pref[i].score[1] << endl;
-//cout << j << " score = " << pref[j].score[1] << endl;
- }
-//cout << "pref[" << i << "].closestLeft[1] = " << pref[i].closestLeft[1] << " parent = " << pref[i].leftParent[1] << endl;
- //are you the closest left sequence
- if (itL->second < pref[i].closestLeft[1]) {
-
- pref[i].closestLeft[1] = itL->second;
- pref[i].leftParent[1] = seqs[j].getName();
-//cout << "updating closest left to " << pref[i].leftParent[1] << endl;
- }
-//cout << "pref[" << j << "].closestLeft[1] = " << pref[j].closestLeft[1] << " parent = " << pref[j].leftParent[1] << endl;
- if (itL->second < pref[j].closestLeft[1]) {
- pref[j].closestLeft[1] = itL->second;
- pref[j].leftParent[1] = seqs[i].getName();
-//cout << "updating closest left to " << pref[j].leftParent[1] << endl;
- }
-
- //are you the closest right sequence
- if (itR->second < pref[i].closestRight[1]) {
- pref[i].closestRight[1] = itR->second;
- pref[i].rightParent[1] = seqs[j].getName();
- }
- if (itR->second < pref[j].closestRight[1]) {
- pref[j].closestRight[1] = itR->second;
- pref[j].rightParent[1] = seqs[i].getName();
- }
-
- }
- }
-
- }
-
-
-
- //calculate the dme
-
- int count0 = 0;
- for (int i = 0; i < pref.size(); i++) { dme += pref[i].score[1]; if (pref[i].score[1] == 0.0) { count0++; } }
-
- float expectedPercent = 1 / (float) (pref.size() - count0);
-//cout << endl << "dme = " << dme << endl;
- //recalculate prefernences based on dme
- for (int i = 0; i < pref.size(); i++) {
-//cout << "unadjusted pref " << i << " = " << pref[i].score[1] << endl;
- // gives the actual percentage of the dme this seq adds
- pref[i].score[1] = pref[i].score[1] / dme;
-
- //how much higher or lower is this than expected
- pref[i].score[1] = pref[i].score[1] / expectedPercent;
-
- //pref[i].score[1] = dme / (dme - 2 * pref[i].score[1]);
-
- //so a non chimeric sequence would be around 1, and a chimeric would be signifigantly higher.
-//cout << "adjusted pref " << i << " = " << pref[i].score[1] << endl;
- }
+ return 0;
- //is this score bigger then the last score
- for (int i = 0; i < pref.size(); i++) {
-
- //update biggest score
- if (pref[i].score[1] > pref[i].score[0]) {
- pref[i].score[0] = pref[i].score[1];
- pref[i].leftParent[0] = pref[i].leftParent[1];
- pref[i].rightParent[0] = pref[i].rightParent[1];
- pref[i].closestLeft[0] = pref[i].closestLeft[1];
- pref[i].closestRight[0] = pref[i].closestRight[1];
- pref[i].midpoint = mid;
- }
-
- }
-
}
catch(exception& e) {
- errorOut(e, "ChimeraSeqsCommand", "generatePreferences");
+ errorOut(e, "ChimeraSeqsCommand", "execute");
exit(1);
}
}
-/**************************************************************************************************/
/**************************************************************************************************/
#include "mothur.h"
#include "command.hpp"
-#include "filterseqscommand.h"
-#include "sequence.hpp"
-#include "sparsematrix.hpp"
-#include "dist.h"
-
-typedef list<PCell>::iterator MatData;
-typedef map<int, float> SeqMap; //maps sequence to all distance for that seqeunce
-
-struct Preference {
- string name;
- vector<string> leftParent; //keep the name of closest left associated with the two scores
- vector<string> rightParent; //keep the name of closest right associated with the two scores
- vector<float> score; //so you can keep last score and calc this score and keep whichever is bigger.
- vector<float> closestLeft; //keep the closest left associated with the two scores
- vector<float> closestRight; //keep the closest right associated with the two scores
- int midpoint;
-
-};
-
+#include "chimera.h"
/***********************************************************/
private:
- Dist* distCalculator;
bool abort;
- string method, fastafile;
+ string method, fastafile, templatefile;
bool filter, correction;
int processors, midpoint, averageLeft, averageRight, window, iters, increment;
- FilterSeqsCommand* filterSeqs;
- ListVector* list;
- vector<Sequence> seqs;
- vector<Preference> pref;
+ Chimera* chimera;
- void readSeqs();
- void generatePreferences(vector<SeqMap>, vector<SeqMap>, int);
- int createSparseMatrix(int, int, SparseMatrix*, vector<Sequence>);
-
};
/***********************************************************/
else {
//valid paramters for this command
- string Array[] = {"line","label"};
+ string Array[] = {"line","label","sorted"};
vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
OptionParser parser(option);
//check for optional parameter and set defaults
// ...at some point should added some additional type checking...
+
+ string temp;
+ temp = validParameter.validFile(parameters, "sorted", false); if (temp == "not found") { temp = "T"; }
+ sorted = isTrue(temp);
+
line = validParameter.validFile(parameters, "line", false);
if (line == "not found") { line = ""; }
else {
void GetRAbundCommand::help(){
try {
mothurOut("The get.rabund command can only be executed after a successful read.otu of a listfile.\n");
- mothurOut("The get.rabund command parameters are line and label. No parameters are required, and you may not use line and label at the same time.\n");
+ mothurOut("The get.rabund command parameters are line, label and sorted. No parameters are required, and you may not use line and label at the same time.\n");
mothurOut("The line and label allow you to select what distance levels you would like included in your .rabund file, and are separated by dashes.\n");
- mothurOut("The get.rabund command should be in the following format: get.rabund(line=yourLines, label=yourLabels).\n");
- mothurOut("Example get.rabund(line=1-3-5).\n");
+ mothurOut("The sorted parameters allows you to print the rabund results sorted by abundance or not. The default is sorted.\n");
+ mothurOut("The get.rabund command should be in the following format: get.rabund(line=yourLines, label=yourLabels, sorted=yourSorted).\n");
+ mothurOut("Example get.rabund(line=1-3-5, sorted=F).\n");
mothurOut("The default value for line and label are all lines in your inputfile.\n");
mothurOut("The get.rabund command outputs a .rabund file containing the lines you selected.\n");
mothurOut("Note: No spaces between parameter labels (i.e. line), '=' and parameters (i.e.yourLines).\n\n");
if(allLines == 1 || lines.count(count) == 1 || labels.count(list->getLabel()) == 1){
mothurOut(list->getLabel() + "\t" + toString(count)); mothurOutEndLine();
- rabund = new RAbundVector();
+ rabund = new RAbundVector();
*rabund = (list->getRAbundVector());
- rabund->print(out);
+
+ if(sorted) { rabund->print(out); }
+ else { rabund->nonSortedPrint(out); }
+
delete rabund;
processedLabels.insert(list->getLabel());
mothurOut(list->getLabel() + "\t" + toString(count)); mothurOutEndLine();
rabund = new RAbundVector();
*rabund = (list->getRAbundVector());
- rabund->print(out);
+
+ if(sorted) { rabund->print(out); }
+ else { rabund->nonSortedPrint(out); }
+
delete rabund;
processedLabels.insert(list->getLabel());
mothurOut(list->getLabel() + "\t" + toString(count)); mothurOutEndLine();
rabund = new RAbundVector();
*rabund = (list->getRAbundVector());
- rabund->print(out);
+
+ if(sorted) { rabund->print(out); }
+ else { rabund->nonSortedPrint(out); }
+
delete rabund;
delete list;
}
ListVector* list;
RAbundVector* rabund;
- bool abort, allLines;
+ bool abort, allLines, sorted;
set<int> lines; //hold lines to be used
set<string> labels; //holds labels to be used
string line, label;
--- /dev/null
+/*
+ * pintail.cpp
+ * Mothur
+ *
+ * Created by Sarah Westcott on 7/9/09.
+ * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
+ *
+ */
+
+#include "pintail.h"
+#include "ignoregaps.h"
+
+//***************************************************************************************************************
+
+Pintail::Pintail(string name) {
+ try {
+ fastafile = name;
+ }
+ catch(exception& e) {
+ errorOut(e, "Pintail", "Pintail");
+ exit(1);
+ }
+}
+//***************************************************************************************************************
+
+Pintail::~Pintail() {
+ try {
+ for (int i = 0; i < querySeqs.size(); i++) { delete querySeqs[i]; }
+ for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; }
+ }
+ catch(exception& e) {
+ errorOut(e, "Pintail", "~Pintail");
+ exit(1);
+ }
+}
+//***************************************************************************************************************
+void Pintail::print(ostream& out) {
+ try {
+
+ for (itCoef = DE.begin(); itCoef != DE.end(); itCoef++) {
+
+ out << itCoef->first->getName() << '\t' << itCoef->second << endl;
+ out << "Observed\t";
+
+ itObsDist = obsDistance.find(itCoef->first);
+ for (int i = 0; i < itObsDist->second.size(); i++) { out << itObsDist->second[i] << '\t'; }
+ out << endl;
+
+ out << "Expected\t";
+
+ itExpDist = expectedDistance.find(itCoef->first);
+ for (int i = 0; i < itExpDist->second.size(); i++) { out << itExpDist->second[i] << '\t'; }
+ out << endl;
+
+ }
+
+ }
+ catch(exception& e) {
+ errorOut(e, "Pintail", "print");
+ exit(1);
+ }
+}
+
+//***************************************************************************************************************
+void Pintail::getChimeras() {
+ try {
+
+ distCalculator = new ignoreGaps();
+
+ //read in query sequences and subject sequences
+ mothurOut("Reading sequences and template file... "); cout.flush();
+ querySeqs = readSeqs(fastafile);
+ templateSeqs = readSeqs(templateFile);
+ mothurOut("Done."); mothurOutEndLine();
+
+ int numSeqs = querySeqs.size();
+
+ //if window is set to default
+ if (window == 0) { if (querySeqs[0]->getAligned().length() > 800) { setWindow(200); }
+ else{ setWindow((querySeqs[0]->getAligned().length() / 4)); } }
+
+ //calculate number of iters
+ iters = (querySeqs[0]->getAligned().length() - window + 1) / increment;
+
+ int linesPerProcess = processors / numSeqs;
+
+ //find breakup of sequences for all times we will Parallelize
+ if (processors == 1) { lines.push_back(new linePair(0, numSeqs)); }
+ else {
+ //fill line pairs
+ for (int i = 0; i < (processors-1); i++) {
+ lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess)));
+ }
+ //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end
+ int i = processors - 1;
+ lines.push_back(new linePair((i*linesPerProcess), numSeqs));
+ }
+
+ //map query sequences to their most similiar sequences in the template - Parallelized
+ mothurOut("Finding closest sequence in template to each sequence... "); cout.flush();
+ if (processors == 1) { findPairs(lines[0]->start, lines[0]->end); }
+ else { createProcessesPairs(); }
+ mothurOut("Done."); mothurOutEndLine();
+ // itBest = bestfit.begin();
+ // itBest++; itBest++;
+//cout << itBest->first->getName() << '\t' << itBest->second->getName() << endl;
+ //find Oqs for each sequence - the observed distance in each window - Parallelized
+ mothurOut("Calculating observed percentage differences for each sequence... "); cout.flush();
+ if (processors == 1) { calcObserved(lines[0]->start, lines[0]->end); }
+ else { createProcessesObserved(); }
+ mothurOut("Done."); mothurOutEndLine();
+
+ //find P
+ mothurOut("Calculating expected percentage differences for each sequence... "); cout.flush();
+ vector<float> probabilityProfile = calcFreq(templateSeqs);
+
+ //make P into Q
+ for (int i = 0; i < probabilityProfile.size(); i++) { probabilityProfile[i] = 1 - probabilityProfile[i]; }
+//cout << "after p into Q" << endl;
+ //find Qav
+ averageProbability = findQav(probabilityProfile);
+//cout << "after Qav" << endl;
+ //find Coefficient - maps a sequence to its coefficient
+ seqCoef = getCoef(averageProbability);
+//cout << "after coef" << endl;
+ //find Eqs for each sequence - the expected distance in each window - Parallelized
+ if (processors == 1) { calcExpected(lines[0]->start, lines[0]->end); }
+ else { createProcessesExpected(); }
+ mothurOut("Done."); mothurOutEndLine();
+
+ //find deviation - Parallelized
+ mothurOut("Finding deviation from expected... "); cout.flush();
+ if (processors == 1) { calcDE(lines[0]->start, lines[0]->end); }
+ else { createProcessesDE(); }
+ mothurOut("Done."); mothurOutEndLine();
+
+
+ //free memory
+ for (int i = 0; i < lines.size(); i++) { delete lines[i]; }
+ delete distCalculator;
+
+ }
+ catch(exception& e) {
+ errorOut(e, "Pintail", "getChimeras");
+ exit(1);
+ }
+}
+
+//***************************************************************************************************************
+
+vector<Sequence*> Pintail::readSeqs(string file) {
+ try {
+
+ ifstream in;
+ openInputFile(file, in);
+ vector<Sequence*> container;
+
+ //read in seqs and store in vector
+ while(!in.eof()){
+ Sequence* current = new Sequence(in);
+
+ if (current->getAligned() == "") { current->setAligned(current->getUnaligned()); }
+
+ container.push_back(current);
+
+ gobble(in);
+ }
+
+ in.close();
+ return container;
+
+ }
+ catch(exception& e) {
+ errorOut(e, "Pintail", "readSeqs");
+ exit(1);
+ }
+}
+
+//***************************************************************************************************************
+//calculate the distances from each query sequence to all sequences in the template to find the closest sequence
+void Pintail::findPairs(int start, int end) {
+ try {
+
+ for(int i = start; i < end; i++){
+
+ float smallest = 10000.0;
+ Sequence query = *(querySeqs[i]);
+
+ for(int j = 0; j < templateSeqs.size(); j++){
+
+ Sequence temp = *(templateSeqs[j]);
+
+ distCalculator->calcDist(query, temp);
+ float dist = distCalculator->getDist();
+
+ if (dist < smallest) {
+
+ bestfit[querySeqs[i]] = templateSeqs[j];
+ smallest = dist;
+ }
+ }
+ }
+
+ }
+ catch(exception& e) {
+ errorOut(e, "Pintail", "findPairs");
+ exit(1);
+ }
+}
+//***************************************************************************************************************
+void Pintail::calcObserved(int start, int end) {
+ try {
+
+ for(int i = start; i < end; i++){
+
+ itBest = bestfit.find(querySeqs[i]);
+ Sequence* query;
+ Sequence* subject;
+
+ if (itBest != bestfit.end()) {
+ query = itBest->first;
+ subject = itBest->second;
+ }else{ mothurOut("Error in calcObserved"); mothurOutEndLine(); }
+
+ vector<Sequence*> queryFragment;
+ vector<Sequence*> subjectFragment;
+
+ int startpoint = 0;
+ for (int m = 0; m < iters; m++) {
+ string seqFrag = query->getAligned().substr(startpoint, window);
+ Sequence* temp = new Sequence(query->getName(), seqFrag);
+ queryFragment.push_back(temp);
+
+ string seqFragsub = subject->getAligned().substr(startpoint, window);
+ Sequence* tempsub = new Sequence(subject->getName(), seqFragsub);
+ subjectFragment.push_back(tempsub);
+
+ startpoint += increment;
+ }
+
+
+ for(int j = 0; j < iters; j++){
+
+ distCalculator->calcDist(*(queryFragment[j]), *(subjectFragment[j]));
+ float dist = distCalculator->getDist();
+
+ //convert to similiarity
+ //dist = 1 - dist;
+
+ //save oi
+ obsDistance[query].push_back(dist);
+ }
+
+ //free memory
+ for (int i = 0; i < queryFragment.size(); i++) { delete queryFragment[i]; }
+ for (int i = 0; i < subjectFragment.size(); i++) { delete subjectFragment[i]; }
+ }
+
+ }
+ catch(exception& e) {
+ errorOut(e, "Pintail", "calcObserved");
+ exit(1);
+ }
+}
+
+//***************************************************************************************************************
+void Pintail::calcExpected(int start, int end) {
+ try {
+
+
+ //for each sequence
+ for(int i = start; i < end; i++){
+
+ itCoef = seqCoef.find(querySeqs[i]);
+ float coef = itCoef->second;
+
+ //for each window
+ vector<float> queryExpected;
+ for (int m = 0; m < iters; m++) {
+ float expected = averageProbability[m] * coef;
+ queryExpected.push_back(expected);
+ }
+
+ expectedDistance[querySeqs[i]] = queryExpected;
+
+ }
+
+ }
+ catch(exception& e) {
+ errorOut(e, "Pintail", "calcExpected");
+ exit(1);
+ }
+}
+//***************************************************************************************************************
+void Pintail::calcDE(int start, int end) {
+ try {
+
+
+ //for each sequence
+ for(int i = start; i < end; i++){
+
+ itObsDist = obsDistance.find(querySeqs[i]);
+ vector<float> obs = itObsDist->second;
+
+ itExpDist = expectedDistance.find(querySeqs[i]);
+ vector<float> exp = itExpDist->second;
+
+ //for each window
+ float sum = 0.0; //sum = sum from 1 to m of (oi-ei)^2
+ for (int m = 0; m < iters; m++) { sum += ((obs[m] - exp[m]) * (obs[m] - exp[m])); }
+
+ float de = sqrt((sum / (iters - 1)));
+
+ DE[querySeqs[i]] = de;
+ }
+
+ }
+ catch(exception& e) {
+ errorOut(e, "Pintail", "calcDE");
+ exit(1);
+ }
+}
+
+//***************************************************************************************************************
+
+vector<float> Pintail::calcFreq(vector<Sequence*> seqs) {
+ try {
+
+ vector<float> prob;
+
+ //at each position in the sequence
+ for (int i = 0; i < seqs[0]->getAligned().length(); i++) {
+
+ vector<int> freq; freq.resize(4,0);
+
+ //find the frequency of each nucleotide
+ for (int j = 0; j < seqs.size(); j++) {
+
+ char value = seqs[j]->getAligned()[i];
+
+ if(toupper(value) == 'A') { freq[0]++; }
+ else if(toupper(value) == 'T' || toupper(value) == 'U') { freq[1]++; }
+ else if(toupper(value) == 'G') { freq[2]++; }
+ else if(toupper(value) == 'C') { freq[3]++; }
+ }
+
+ //find base with highest frequency
+ int highest = 0;
+ for (int m = 0; m < freq.size(); m++) { if (freq[m] > highest) { highest = freq[m]; } }
+
+ float highFreq = highest / (float) seqs.size();
+ float Pi;
+ //if this is not a gap column
+ if (highFreq != 0) { Pi = (highFreq - 0.25) / 0.75; }
+ else { Pi = 0; }
+
+ prob.push_back(Pi);
+
+ }
+
+ return prob;
+
+ }
+ catch(exception& e) {
+ errorOut(e, "Pintail", "calcFreq");
+ exit(1);
+ }
+}
+//***************************************************************************************************************
+vector<float> Pintail::findQav(vector<float> prob) {
+ try {
+ vector<float> averages;
+
+ //for each window find average
+ int startpoint = 0;
+ for (int m = 0; m < iters; m++) {
+
+ float average = 0.0;
+ for (int i = startpoint; i < (startpoint+window); i++) { average += prob[i]; }
+
+ average = average / window;
+
+ //save this windows average
+ averages.push_back(average);
+
+ startpoint += increment;
+ }
+
+ return averages;
+ }
+ catch(exception& e) {
+ errorOut(e, "Pintail", "findQav");
+ exit(1);
+ }
+}
+//***************************************************************************************************************
+map<Sequence*, float> Pintail::getCoef(vector<float> prob) {
+ try {
+ map<Sequence*, float> coefs;
+
+ //find average prob
+ float probAverage = 0.0;
+ for (int i = 0; i < prob.size(); i++) { probAverage += prob[i]; }
+ probAverage = probAverage / (float) prob.size();
+
+ //find a coef for each sequence
+ map<Sequence*, vector<float> >::iterator it;
+ for (it = obsDistance.begin(); it != obsDistance.end(); it++) {
+
+ vector<float> temp = it->second;
+ Sequence* tempSeq = it->first;
+
+ //find observed average
+ float obsAverage = 0.0;
+ for (int i = 0; i < temp.size(); i++) { obsAverage += temp[i]; }
+ obsAverage = obsAverage / (float) temp.size();
+
+ float coef = obsAverage / probAverage;
+
+ //save this sequences coefficient
+ coefs[tempSeq] = coef;
+ }
+
+
+ return coefs;
+ }
+ catch(exception& e) {
+ errorOut(e, "Pintail", "getCoef");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+void Pintail::createProcessesPairs() {
+ try {
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ int process = 0;
+ vector<int> processIDS;
+
+ //loop through and create all the processes you want
+ while (process != processors) {
+ int pid = fork();
+
+ if (pid > 0) {
+ processIDS.push_back(pid);
+ process++;
+ }else if (pid == 0){
+ findPairs(lines[process]->start, lines[process]->end);
+ exit(0);
+ }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
+ }
+
+ //force parent to wait until all the processes are done
+ for (int i=0;i<processors;i++) {
+ int temp = processIDS[i];
+ wait(&temp);
+ }
+
+#else
+ findPairs(lines[0]->start, lines[0]->end);
+
+#endif
+ }
+ catch(exception& e) {
+ errorOut(e, "Pintail", "createProcessesPairs");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+void Pintail::createProcessesObserved() {
+ try {
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ int process = 0;
+ vector<int> processIDS;
+
+ //loop through and create all the processes you want
+ while (process != processors) {
+ int pid = fork();
+
+ if (pid > 0) {
+ processIDS.push_back(pid);
+ process++;
+ }else if (pid == 0){
+ calcObserved(lines[process]->start, lines[process]->end);
+ exit(0);
+ }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
+ }
+
+ //force parent to wait until all the processes are done
+ for (int i=0;i<processors;i++) {
+ int temp = processIDS[i];
+ wait(&temp);
+ }
+
+#else
+ calcObserved(lines[0]->start, lines[0]->end);
+
+#endif
+ }
+ catch(exception& e) {
+ errorOut(e, "Pintail", "createProcessesObserved");
+ exit(1);
+ }
+}
+
+//***************************************************************************************************************
+
+void Pintail::createProcessesExpected() {
+ try {
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ int process = 0;
+ vector<int> processIDS;
+
+ //loop through and create all the processes you want
+ while (process != processors) {
+ int pid = fork();
+
+ if (pid > 0) {
+ processIDS.push_back(pid);
+ process++;
+ }else if (pid == 0){
+ calcExpected(lines[process]->start, lines[process]->end);
+ exit(0);
+ }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
+ }
+
+ //force parent to wait until all the processes are done
+ for (int i=0;i<processors;i++) {
+ int temp = processIDS[i];
+ wait(&temp);
+ }
+
+#else
+ calcExpected(lines[0]->start, lines[0]->end);
+
+#endif
+ }
+ catch(exception& e) {
+ errorOut(e, "Pintail", "createProcessesExpected");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+void Pintail::createProcessesDE() {
+ try {
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ int process = 0;
+ vector<int> processIDS;
+
+ //loop through and create all the processes you want
+ while (process != processors) {
+ int pid = fork();
+
+ if (pid > 0) {
+ processIDS.push_back(pid);
+ process++;
+ }else if (pid == 0){
+ calcDE(lines[process]->start, lines[process]->end);
+ exit(0);
+ }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
+ }
+
+ //force parent to wait until all the processes are done
+ for (int i=0;i<processors;i++) {
+ int temp = processIDS[i];
+ wait(&temp);
+ }
+
+#else
+ calcDE(lines[0]->start, lines[0]->end);
+
+#endif
+ }
+ catch(exception& e) {
+ errorOut(e, "Pintail", "createProcessesDE");
+ exit(1);
+ }
+}
+
+//***************************************************************************************************************
+
+
--- /dev/null
+#ifndef PINTAIL_H
+#define PINTAIL_H
+
+/*
+ * pintail.h
+ * Mothur
+ *
+ * Created by Sarah Westcott on 7/9/09.
+ * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
+ *
+ */
+
+#include "chimera.h"
+#include "dist.h"
+
+//This class was created using the algorythms described in the
+// "At Least 1 in 20 16S rRNA Sequence Records Currently Held in the Public Repositories is Estimated To Contain Substantial Anomalies" paper
+//by Kevin E. Ashelford 1, Nadia A. Chuzhanova 3, John C. Fry 1, Antonia J. Jones 2 and Andrew J. Weightman 1.
+
+/***********************************************************/
+
+class Pintail : public Chimera {
+
+ public:
+ Pintail(string);
+ ~Pintail();
+
+ void getChimeras();
+ void print(ostream&);
+
+
+ private:
+
+ struct linePair {
+ int start;
+ int end;
+ linePair(int i, int j) : start(i), end(j) {}
+ };
+
+
+ Dist* distCalculator;
+ string fastafile;
+ int iters;
+ vector<linePair*> lines;
+ vector<Sequence*> querySeqs;
+ vector<Sequence*> templateSeqs;
+
+ map<Sequence*, Sequence*> bestfit; //maps a query sequence to its most similiar sequence in the template
+ map<Sequence*, Sequence*>::iterator itBest;
+
+ map<Sequence*, vector<float> > obsDistance; //maps a query sequence to its observed distance at each window
+ map<Sequence*, vector<float> > expectedDistance; //maps a query sequence to its expected distance at each window
+ map<Sequence*, vector<float> >::iterator itObsDist;
+ map<Sequence*, vector<float> >::iterator itExpDist;
+
+ vector<float> averageProbability; //Qav
+ map<Sequence*, float> seqCoef; //maps a sequence to its coefficient
+ map<Sequence*, float> DE; //maps a sequence to its deviation
+ map<Sequence*, float>::iterator itCoef;
+
+ vector<Sequence*> readSeqs(string);
+ vector<float> findQav(vector<float>);
+ vector<float> calcFreq(vector<Sequence*>);
+ map<Sequence*, float> getCoef(vector<float>);
+
+ void findPairs(int, int);
+ void calcObserved(int, int);
+ void calcExpected(int, int);
+ void calcDE(int, int);
+
+ void createProcessesPairs();
+ void createProcessesObserved();
+ void createProcessesExpected();
+ void createProcessesDE();
+
+};
+
+/***********************************************************/
+
+#endif
+
return data.rend();
}
+/***********************************************************************/
+void RAbundVector::nonSortedPrint(ostream& output){
+ try {
+ output << label << '\t' << numBins << '\t';
+
+ for(int i=0;i<numBins;i++){ output << data[i] << '\t'; }
+ output << endl;
+ }
+ catch(exception& e) {
+ errorOut(e, "RAbundVector", "nonSortedPrint");
+ exit(1);
+ }
+}
/***********************************************************************/
void RAbundVector::print(string prefix, ostream& output){
try {
}
}
+
/***********************************************************************/
void RAbundVector::print(ostream& output){
try {
void print(ostream&);
void print(string, ostream&);
+ void nonSortedPrint(ostream&);
RAbundVector getRAbundVector();
SAbundVector getSAbundVector();
if(option == "help") { help(); abort = true; }
else {
- if (option = "") { mothurOut("You must enter a command to run."); mothurOutEndLine(); abort = true; }
+ if (option == "") { mothurOut("You must enter a command to run."); mothurOutEndLine(); abort = true; }
else { command = option; }
}