From: westcott <westcott>
Date: Mon, 31 Aug 2009 14:33:57 +0000 (+0000)
Subject: checking in chimera files in progress after move to michigan
X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=f7fbd74ffedcf62217109c22e828453eaefa1458

checking in chimera files in progress after move to michigan
---

diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj
index c4f38cb..c6d6c0e 100644
--- a/Mothur.xcodeproj/project.pbxproj
+++ b/Mothur.xcodeproj/project.pbxproj
@@ -162,6 +162,8 @@
 		A70B53AA0F4CD7AD0064797E /* getgroupcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70B53A40F4CD7AD0064797E /* getgroupcommand.cpp */; };
 		A70B53AB0F4CD7AD0064797E /* getlabelcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70B53A60F4CD7AD0064797E /* getlabelcommand.cpp */; };
 		A70B53AC0F4CD7AD0064797E /* getlinecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70B53A80F4CD7AD0064797E /* getlinecommand.cpp */; };
+		A75B887D104C16860083C454 /* alignedsimilarity.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A75B8879104C16860083C454 /* alignedsimilarity.cpp */; };
+		A75B887E104C16860083C454 /* ccode.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A75B887B104C16860083C454 /* ccode.cpp */; };
 		EB1216880F619B83004A865F /* bergerparker.cpp in Sources */ = {isa = PBXBuildFile; fileRef = EB1216870F619B83004A865F /* bergerparker.cpp */; };
 		EB1216E50F61ACFB004A865F /* bstick.cpp in Sources */ = {isa = PBXBuildFile; fileRef = EB1216E40F61ACFB004A865F /* bstick.cpp */; };
 		EB1217230F61C9AC004A865F /* sharedkstest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = EB1217220F61C9AC004A865F /* sharedkstest.cpp */; };
@@ -513,6 +515,10 @@
 		A70B53A70F4CD7AD0064797E /* getlabelcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getlabelcommand.h; sourceTree = SOURCE_ROOT; };
 		A70B53A80F4CD7AD0064797E /* getlinecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getlinecommand.cpp; sourceTree = SOURCE_ROOT; };
 		A70B53A90F4CD7AD0064797E /* getlinecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getlinecommand.h; sourceTree = SOURCE_ROOT; };
+		A75B8879104C16860083C454 /* alignedsimilarity.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = alignedsimilarity.cpp; sourceTree = "<group>"; };
+		A75B887A104C16860083C454 /* alignedsimilarity.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = alignedsimilarity.h; sourceTree = "<group>"; };
+		A75B887B104C16860083C454 /* ccode.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ccode.cpp; sourceTree = "<group>"; };
+		A75B887C104C16860083C454 /* ccode.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ccode.h; sourceTree = "<group>"; };
 		C6859E8B029090EE04C91782 /* Mothur.1 */ = {isa = PBXFileReference; lastKnownFileType = text.man; path = Mothur.1; sourceTree = "<group>"; };
 		EB1216860F619B83004A865F /* bergerparker.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = bergerparker.h; sourceTree = SOURCE_ROOT; };
 		EB1216870F619B83004A865F /* bergerparker.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = bergerparker.cpp; sourceTree = SOURCE_ROOT; };
@@ -645,6 +651,10 @@
 				372095C1103196D70004D347 /* chimera.cpp */,
 				374F2FE9100634B600E97C66 /* bellerophon.h */,
 				374F2FEA100634B600E97C66 /* bellerophon.cpp */,
+				A75B8879104C16860083C454 /* alignedsimilarity.cpp */,
+				A75B887A104C16860083C454 /* alignedsimilarity.h */,
+				A75B887B104C16860083C454 /* ccode.cpp */,
+				A75B887C104C16860083C454 /* ccode.h */,
 				372A55531017922B00C5194B /* decalc.h */,
 				372A55541017922B00C5194B /* decalc.cpp */,
 				374F301110063B6F00E97C66 /* pintail.h */,
@@ -1167,6 +1177,8 @@
 				374F301310063B6F00E97C66 /* pintail.cpp in Sources */,
 				372A55551017922B00C5194B /* decalc.cpp in Sources */,
 				372095C2103196D70004D347 /* chimera.cpp in Sources */,
+				A75B887D104C16860083C454 /* alignedsimilarity.cpp in Sources */,
+				A75B887E104C16860083C454 /* ccode.cpp in Sources */,
 			);
 			runOnlyForDeploymentPostprocessing = 0;
 		};
diff --git a/alignedsimilarity.cpp b/alignedsimilarity.cpp
new file mode 100644
index 0000000..0ed9ce3
--- /dev/null
+++ b/alignedsimilarity.cpp
@@ -0,0 +1,543 @@
+/*
+ *  alignedsimilarity.cpp
+ *  Mothur
+ *
+ *  Created by westcott on 8/18/09.
+ *  Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "alignedsimilarity.h"
+#include "ignoregaps.h"
+
+
+//***************************************************************************************************************
+AlignSim::AlignSim(string filename, string temp) {  fastafile = filename;  templateFile = temp;  }
+//***************************************************************************************************************
+
+AlignSim::~AlignSim() {
+	try {
+		for (int i = 0; i < querySeqs.size(); i++)		{  delete querySeqs[i];		}
+		for (int i = 0; i < templateSeqs.size(); i++)	{  delete templateSeqs[i];	}
+		delete distCalc;
+	}
+	catch(exception& e) {
+		errorOut(e, "AlignSim", "~AlignSim");
+		exit(1);
+	}
+}	
+//***************************************************************************************************************
+void AlignSim::print(ostream& out) {
+	try {
+		
+		mothurOutEndLine();
+		
+		for (int i = 0; i < querySeqs.size(); i++) {
+			
+			int j = 0;  float largest = -10;
+			//find largest sim value
+			for (int k = 0; k < IS[i].size(); k++) {
+				//is this score larger
+				if (IS[i][k].score > largest) {
+					j = k;
+					largest = IS[i][k].score;
+				}
+			}
+			
+			//find parental similarity
+			distCalc->calcDist(*(IS[i][j].leftParent), *(IS[i][j].rightParent));
+			float dist = distCalc->getDist();
+			
+			//convert to similarity
+			dist = (1 - dist) * 100;
+
+			//warn about parental similarity - if its above 82% may not detect a chimera 
+			if (dist >= 82) { mothurOut("When the chimeras parental similarity is above 82%, detection rates drop signifigantly.");  mothurOutEndLine(); }
+			
+			int index = ceil(dist);
+			
+			if (index == 0) { index=1;  }
+	
+			//is your DE value higher than the 95%
+			string chimera;
+			if (IS[i][j].score > quantile[index-1][4])		{	chimera = "Yes";	}
+			else										{	chimera = "No";		}			
+			
+			out << querySeqs[i]->getName() <<  "\tparental similarity: " << dist << "\tIS: " << IS[i][j].score << "\tbreakpoint: " << IS[i][j].midpoint << "\tchimera flag: " << chimera << endl;
+			
+			if (chimera == "Yes") {
+				mothurOut(querySeqs[i]->getName() + "\tparental similarity: " + toString(dist) + "\tIS: " + toString(IS[i][j].score) + "\tbreakpoint: " + toString(IS[i][j].midpoint) + "\tchimera flag: " + chimera); mothurOutEndLine();
+			}
+			out << "Improvement Score\t";
+			
+			for (int r = 0; r < IS[i].size(); r++) {  out << IS[i][r].score << '\t';  }
+			out << endl;
+		}
+	}
+	catch(exception& e) {
+		errorOut(e, "AlignSim", "print");
+		exit(1);
+	}
+}
+
+//***************************************************************************************************************
+void AlignSim::getChimeras() {
+	try {
+		
+		//read in query sequences and subject sequences
+		mothurOut("Reading sequences and template file... "); cout.flush();
+		querySeqs = readSeqs(fastafile);
+		templateSeqs = readSeqs(templateFile);
+		mothurOut("Done."); mothurOutEndLine();
+		
+		int numSeqs = querySeqs.size();
+		
+		IS.resize(numSeqs);
+		
+		//break up file if needed
+		int linesPerProcess = numSeqs / processors ;
+		
+		#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+			//find breakup of sequences for all times we will Parallelize
+			if (processors == 1) {   lines.push_back(new linePair(0, numSeqs));  }
+			else {
+				//fill line pairs
+				for (int i = 0; i < (processors-1); i++) {			
+					lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess)));
+				}
+				//this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end
+				int i = processors - 1;
+				lines.push_back(new linePair((i*linesPerProcess), numSeqs));
+			}
+			
+			//find breakup of templatefile for quantiles
+			if (processors == 1) {   templateLines.push_back(new linePair(0, templateSeqs.size()));  }
+			else { 
+				for (int i = 0; i < processors; i++) {
+					templateLines.push_back(new linePair());
+					templateLines[i]->start = int (sqrt(float(i)/float(processors)) * templateSeqs.size());
+					templateLines[i]->end = int (sqrt(float(i+1)/float(processors)) * templateSeqs.size());
+				}
+			}
+		#else
+			lines.push_back(new linePair(0, numSeqs));
+			templateLines.push_back(new linePair(0, templateSeqs.size()));
+		#endif
+	
+	
+		distCalc = new ignoreGaps();
+		
+		//find window breaks
+		windowBreak = findWindows();
+//for (int i = 0; i < windowBreak.size(); i++) { cout << windowBreak[i] << '\t';  }
+//cout << endl;
+
+		mothurOut("Finding the IS values for your sequences..."); cout.flush();
+		if (processors == 1) { 
+			IS = findIS(lines[0]->start, lines[0]->end, querySeqs);
+		}else {		IS = createProcessesIS(querySeqs, lines);		}
+		mothurOut("Done."); mothurOutEndLine();
+		
+		//quantiles are used to determine whether the de values found indicate a chimera
+		if (quanfile != "") {  quantile = readQuantiles();  }
+		else {
+			
+			mothurOut("Calculating quantiles for your template.  This can take a while...  I will output the quantiles to a .alignedsimilarity.quan file that you can input them using the quantiles parameter next time you run this command.  Providing the .quan file will dramatically improve speed.    "); cout.flush();
+			//get IS quantiles as a reference to these IS scores
+			//you are assuming the template is free of chimeras and therefore will give you a baseline as to the scores you would like to see
+			if (processors == 1) { 
+				templateIS = findIS(templateLines[0]->start, templateLines[0]->end, templateSeqs);
+			}else {		templateIS = createProcessesIS(templateSeqs, templateLines);		}
+//cout << "here" << endl;			
+			ofstream out4;
+			string o;
+			
+			o = getRootName(templateFile) + "alignedsimilarity.quan";
+			
+			openOutputFile(o, out4);
+			
+			//adjust quantiles
+			//construct table
+			quantile.resize(100);
+		
+			for (int i = 0; i < templateIS.size(); i++) {
+				
+				if (templateIS[i].size() != 0) {
+					int j = 0; float score = -1000;
+					//find highest IS score
+	//cout << templateIS[i].size() << endl;
+					for (int k = 0; k < templateIS[i].size(); k++) {
+						if (templateIS[i][k].score > score) {
+							score = templateIS[i][k].score;
+							j = k;
+						}
+					}
+	//cout << j << endl;		
+					//find similarity of parents
+					distCalc->calcDist(*(templateIS[i][j].leftParent), *(templateIS[i][j].rightParent));
+					float dist = distCalc->getDist();
+			
+					//convert to similarity
+					dist = (1 - dist) * 100;
+	//	cout << dist << endl;	
+					int index = ceil(dist);
+	//	cout << "index = " << index << endl;	
+					if (index == 0) {	index = 1;	}
+					quantile[index-1].push_back(templateIS[i][j].score);
+				}
+			}
+		
+		
+			for (int i = 0; i < quantile.size(); i++) {
+				vector<float> temp;
+				
+				if (quantile[i].size() == 0) {
+					//in case this is not a distance found in your template files
+					for (int g = 0; g < 6; g++) {
+						temp.push_back(0.0);
+					}
+				}else{
+					
+					sort(quantile[i].begin(), quantile[i].end());
+					
+					//save 10%
+					temp.push_back(quantile[i][int(quantile[i].size() * 0.10)]);
+					//save 25%
+					temp.push_back(quantile[i][int(quantile[i].size() * 0.25)]);
+					//save 50%
+					temp.push_back(quantile[i][int(quantile[i].size() * 0.5)]);
+					//save 75%
+					temp.push_back(quantile[i][int(quantile[i].size() * 0.75)]);
+					//save 95%
+					temp.push_back(quantile[i][int(quantile[i].size() * 0.95)]);
+					//save 99%
+					temp.push_back(quantile[i][int(quantile[i].size() * 0.99)]);
+					
+				}
+				
+				//output quan value
+				out4 << i+1 << '\t';				
+				for (int u = 0; u < temp.size(); u++) {   out4 << temp[u] << '\t'; }
+				out4 << endl;
+				
+				quantile[i] = temp;
+				
+			}
+			
+			out4.close();
+			
+			mothurOut("Done."); mothurOutEndLine();
+
+		}
+		
+		//free memory
+		for (int i = 0; i < lines.size(); i++)					{	delete lines[i];				}
+		for (int i = 0; i < templateLines.size(); i++)			{	delete templateLines[i];		}
+			
+	}
+	catch(exception& e) {
+		errorOut(e, "AlignSim", "getChimeras");
+		exit(1);
+	}
+}
+//***************************************************************************************************************
+vector<int> AlignSim::findWindows() {
+	try {
+		
+		vector<int> win; 
+		
+		if (increment > querySeqs[0]->getAligned().length()) {  mothurOut("You have selected an increment larger than the length of your sequences.  I will use the default of 25.");  increment = 25; }
+		
+		for (int m = increment;  m < (querySeqs[0]->getAligned().length() - increment); m+=increment) {  win.push_back(m);  }
+
+		return win;
+	
+	}
+	catch(exception& e) {
+		errorOut(e, "AlignSim", "findWindows");
+		exit(1);
+	}
+}
+
+//***************************************************************************************************************
+vector< vector<sim>  > AlignSim::findIS(int start, int end, vector<Sequence*> seqs) {
+	try {
+		
+		vector< vector<sim> >  isValues;
+		isValues.resize(seqs.size());
+		
+		//for each sequence
+		for(int i = start; i < end; i++){
+			
+			vector<sim> temp;  temp.resize(windowBreak.size());
+			
+			mothurOut("Finding IS value for sequence " + toString(i)); mothurOutEndLine();
+			
+			//for each window
+			for (int m = 0; m < windowBreak.size(); m++) {
+				
+				vector<Sequence*> closest;  //left, right, overall
+				vector<int> similarity; //left+right and overall
+				
+				//find closest left, closest right and closest overall
+				closest = findClosestSides(seqs[i], windowBreak[m], similarity, i);
+				
+				int totalNumBases = seqs[i]->getUnaligned().length();
+				
+				//IS = left+right-overall
+				float is = ((similarity[0]+similarity[1]) / (float) totalNumBases) - (similarity[2] / (float) totalNumBases);
+				
+				///save IS, leftparent, rightparent, breakpoint
+				temp[m].leftParent = closest[0];
+				temp[m].rightParent = closest[1];
+				temp[m].score = is;
+				temp[m].midpoint = windowBreak[m];
+//cout << is << '\t';
+			}
+//cout << endl;			
+			isValues[i] = temp;
+		
+		}
+		
+		return isValues;
+	
+	}
+	catch(exception& e) {
+		errorOut(e, "AlignSim", "findIS");
+		exit(1);
+	}
+}
+//***************************************************************************************************************
+vector<Sequence*> AlignSim::findClosestSides(Sequence* seq, int breakpoint, vector<int>& sim, int i) {
+	try{
+	
+		vector<Sequence*> closest;
+		
+		Sequence query, queryLeft, queryRight;
+		string frag = seq->getAligned();
+		string fragLeft = frag.substr(0, breakpoint);
+		string fragRight = frag.substr(breakpoint, frag.length());
+		
+		//get pieces
+		query = *(seq);  
+		queryLeft = *(seq);  
+		queryRight = *(seq);
+		queryLeft.setAligned(fragLeft);
+		queryRight.setAligned(fragRight);
+		
+		//initialize
+		Sequence* overall = templateSeqs[0];
+		Sequence* left = templateSeqs[0];
+		Sequence* right = templateSeqs[0];
+		
+		float smallestOverall, smallestLeft, smallestRight;
+		smallestOverall = 1000;  smallestLeft = 1000;  smallestRight = 1000;
+		
+		//go through the templateSeqs and search for the closest
+		for(int j = 0; j < templateSeqs.size(); j++){
+			
+			//so you don't pick yourself
+			if (j != i) {
+				Sequence temp, tempLeft, tempRight;
+				string fragTemp = templateSeqs[j]->getAligned();
+				string fragTempLeft = fragTemp.substr(0, breakpoint);
+				string fragTempRight = fragTemp.substr(breakpoint, fragTemp.length());
+				
+				//get pieces
+				temp = *(templateSeqs[j]); 
+				tempLeft = *(templateSeqs[j]); 
+				tempRight = *(templateSeqs[j]);
+				tempLeft.setAligned(fragTempLeft);
+				tempRight.setAligned(fragTempRight);
+				
+				//find overall dist
+				distCalc->calcDist(query, temp);
+				float dist = distCalc->getDist();	
+				
+				if (dist < smallestOverall) { 
+					overall = templateSeqs[j];
+					smallestOverall = dist;
+				}
+				
+				//find left dist
+				distCalc->calcDist(queryLeft, tempLeft);
+				dist = distCalc->getDist();
+				
+				if (dist < smallestLeft) { 
+					left = templateSeqs[j];
+					smallestLeft = dist;
+				}
+				
+				//find left dist
+				distCalc->calcDist(queryRight, tempRight);
+				dist = distCalc->getDist();
+				
+				if (dist < smallestRight) { 
+					right = templateSeqs[j];
+					smallestRight = dist;
+				}
+			}
+
+		}
+			
+		closest.push_back(left);
+		closest.push_back(right);
+		closest.push_back(overall);
+		
+		//fill sim with number of matched bases
+		Sequence tempL = *(left);
+		string tempLFrag = tempL.getAligned();
+		tempL.setAligned(tempLFrag.substr(0, breakpoint));
+		
+		int bothMatches = findNumMatchedBases(queryLeft, tempL);
+		sim.push_back(bothMatches);
+
+		Sequence tempR = *(right);
+		string tempRFrag = tempR.getAligned();
+		tempR.setAligned(tempRFrag.substr(breakpoint, tempRFrag.length()));
+		
+		bothMatches = findNumMatchedBases(queryRight, tempR);
+		sim.push_back(bothMatches);
+
+		bothMatches = findNumMatchedBases(query, *(overall));
+		sim.push_back(bothMatches);
+			
+		return closest;
+
+
+	}
+	catch(exception& e) {
+		errorOut(e, "AlignSim", "findClosestSides");
+		exit(1);
+	}
+}
+//***************************************************************************************************************
+
+int AlignSim::findNumMatchedBases(Sequence seq, Sequence temp) {
+	try{
+	
+		int num = 0;
+		
+		string query = seq.getAligned();
+		string subject = temp.getAligned();
+//cout << seq.getName() << endl << endl;
+//cout << temp.getName()  << endl << endl;
+		
+		for (int i = 0; i < query.length(); i++) {
+			//count matches if query[i] is a base and subject is same bases or gap
+			if (isalpha(query[i])) {
+				if(!isalpha(subject[i])) {  num++;  } 
+				else if (query[i] == subject[i])  { num++;  }
+				else {  }
+			}
+		}
+//cout << "num = " << num << endl;		
+		return num;
+	}
+	catch(exception& e) {
+		errorOut(e, "AlignSim", "findNumMatchedBases");
+		exit(1);
+	}
+}
+
+/**************************************************************************************************/
+vector< vector<sim> > AlignSim::createProcessesIS(vector<Sequence*> seqs, vector<linePair*> linesToProcess) {
+	try {
+	vector< vector<sim> > localIs; localIs.resize(seqs.size());
+	
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+		int process = 0;
+		vector<int> processIDS;
+		
+		//loop through and create all the processes you want
+		while (process != processors) {
+			int pid = fork();
+			
+			if (pid > 0) {
+				processIDS.push_back(pid);  
+				process++;
+			}else if (pid == 0){
+				
+				mothurOut("Finding IS values for sequences " + toString(linesToProcess[process]->start) + " to " + toString(linesToProcess[process]->end)); mothurOutEndLine();
+				localIs = findIS(linesToProcess[process]->start, linesToProcess[process]->end, seqs);
+				mothurOut("Done finding IS values for sequences " +  toString(linesToProcess[process]->start) + " to " + toString(linesToProcess[process]->end)); mothurOutEndLine();
+				
+				//write out data to file so parent can read it
+				ofstream out;
+				string s = toString(getpid()) + ".temp";
+				openOutputFile(s, out);
+				
+				//output pairs
+				for (int i = linesToProcess[process]->start; i < linesToProcess[process]->end; i++) {
+					 out << localIs[i].size() << endl;
+					 for (int j = 0; j < localIs[i].size(); j++) {
+						localIs[i][j].leftParent->printSequence(out);
+						localIs[i][j].rightParent->printSequence(out);
+						out << ">" << '\t' << localIs[i][j].score << '\t' << localIs[i][j].midpoint << endl;
+					 }
+				}
+				out.close();
+				
+				exit(0);
+			}else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
+		}
+		
+		//force parent to wait until all the processes are done
+		for (int i=0;i<processors;i++) { 
+			int temp = processIDS[i];
+			wait(&temp);
+		}
+		
+		//get data created by processes
+		for (int i=0;i<processors;i++) { 
+			ifstream in;
+			string s = toString(processIDS[i]) + ".temp";
+			openInputFile(s, in);
+			
+			//get pairs
+			for (int k = linesToProcess[i]->start; k < linesToProcess[i]->end; k++) {
+				int size;
+				in >> size;
+				gobble(in);
+				
+				vector<sim> tempVector;
+				
+				for (int j = 0; j < size; j++) {
+				
+					sim temp;
+					
+					temp.leftParent = new Sequence(in);
+					gobble(in);
+	//temp.leftParent->printSequence(cout);				
+					temp.rightParent = new Sequence(in);
+					gobble(in);
+//temp.rightParent->printSequence(cout);
+					string throwaway;
+					in >> throwaway >> temp.score >> temp.midpoint;
+	//cout << temp.score << '\t' << temp.midpoint << endl;				
+					gobble(in);
+					
+					tempVector.push_back(temp);
+				}
+				
+				localIs[k] = tempVector;
+			}
+			
+			in.close();
+			remove(s.c_str());
+		}
+			
+	
+#else
+		localIs = findIS(linesToProcess[0]->start, linesToProcess[0]->end, seqs);
+#endif	
+
+		return localIs;
+	}
+	catch(exception& e) {
+		errorOut(e, "AlignSim", "createProcessesIS");
+		exit(1);
+	}
+}
+
+//***************************************************************************************************************
diff --git a/alignedsimilarity.h b/alignedsimilarity.h
new file mode 100644
index 0000000..716b8b7
--- /dev/null
+++ b/alignedsimilarity.h
@@ -0,0 +1,67 @@
+#ifndef ALIGNSIM_H
+#define ALIGNSIM_H
+
+/*
+ *  alignedsimilarity.h
+ *  Mothur
+ *
+ *  Created by westcott on 8/18/09.
+ *  Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+
+#include "chimera.h"
+#include "dist.h"
+
+//This class was created using the algorythms described in the 
+//"Evaluation of Nearest Neighbor Methods for Detection of Chimeric Small-Subunit rRna Sequences" paper 
+//by J.F. Robison-Cox 1, M.M. Bateson 2 and D.M. Ward 2
+
+
+/***********************************************************/
+
+class AlignSim : public Chimera {
+	
+	public:
+		AlignSim(string, string);	
+		~AlignSim();
+		
+		void getChimeras();
+		void print(ostream&);
+		
+		void setCons(string){};
+		void setQuantiles(string q) { quanfile = q; };
+		
+		
+	private:
+		
+		Dist* distCalc;
+		vector<linePair*> lines;
+		vector<linePair*> templateLines;
+		
+		vector<Sequence*> querySeqs;
+		vector<Sequence*> templateSeqs;
+		
+		vector< vector<sim> > IS;  //IS[0] is the vector os sim values for each window for querySeqs[0]
+		vector< vector<sim> > templateIS;  //templateIS[0] is the vector os sim values for each window for templateSeqs[0]
+		vector<int> windowBreak;
+		
+		string fastafile, templateFile;
+		int iters;
+		
+		vector< vector<sim> > findIS(int, int, vector<Sequence*>);
+		vector<Sequence*> findClosestSides(Sequence*, int, vector<int>&, int);
+		int findNumMatchedBases(Sequence, Sequence);
+		vector<int> findWindows();
+		
+		vector< vector<float> > quantile;
+		
+		vector< vector<sim> > createProcessesIS(vector<Sequence*>, vector<linePair*>);
+		
+};
+
+/***********************************************************/
+
+#endif
+
diff --git a/bellerophon.cpp b/bellerophon.cpp
index dc021c0..2ce3951 100644
--- a/bellerophon.cpp
+++ b/bellerophon.cpp
@@ -35,7 +35,7 @@ void Bellerophon::print(ostream& out) {
 			out << pref[i].name << '\t' << setprecision(3) << pref[i].score[0] << '\t' << pref[i].leftParent[0] << '\t' << pref[i].rightParent[0] << endl;
 			
 			//calc # of seqs with preference above 1.0
-			if (pref[i].score[0] > 1.0) { 
+			if (pref[i].score[0] > 1.1) { 
 				above1++; 
 				mothurOut(pref[i].name + " is a suspected chimera at breakpoint " + toString(pref[i].midpoint)); mothurOutEndLine();
 				mothurOut("It's score is " + toString(pref[i].score[0]) + " with suspected left parent " + pref[i].leftParent[0] + " and right parent " + pref[i].rightParent[0]); mothurOutEndLine();
@@ -44,7 +44,7 @@ void Bellerophon::print(ostream& out) {
 		
 		//output results to screen
 		mothurOutEndLine();
-		mothurOut("Sequence with preference score above 1.0: " + toString(above1)); mothurOutEndLine();
+		mothurOut("Sequence with preference score above 1.1: " + toString(above1)); mothurOutEndLine();
 		int spot;
 		spot = pref.size()-1;
 		mothurOut("Minimum:\t" + toString(pref[spot].score[0])); mothurOutEndLine();
@@ -115,7 +115,7 @@ void Bellerophon::getChimeras() {
 				
 			}else{ increment = 0; }
 		}
-cout << "increment = " << increment << endl;		
+		
 		if (increment == 0) { iters = 1; }
 		else { iters = ((seqs[0]->getAlignLength() - (2*window)) / increment); }
 		
@@ -193,7 +193,6 @@ cout << "increment = " << increment << endl;
 				delete SparseLeft;
 				delete SparseRight;
 				
-				
 				//fill preference structure
 				generatePreferences(distMapLeft, distMapRight, midpoint);
 				
@@ -204,7 +203,7 @@ cout << "increment = " << increment << endl;
 		delete distCalculator;
 		
 		//rank preference score to eachother
-		float dme = 0.0;
+		/*float dme = 0.0;
 		float expectedPercent = 1 / (float) (pref.size());
 		
 		for (int i = 0; i < pref.size(); i++) {	 dme += pref[i].score[0];  }
@@ -216,10 +215,8 @@ cout << "increment = " << increment << endl;
 			
 			//how much higher or lower is this than expected
 			pref[i].score[0] = pref[i].score[0] / expectedPercent;
-			
-			
-			
-		}
+		
+		}*/
 		
 		//sort Preferences highest to lowest
 		sort(pref.begin(), pref.end(), comparePref);
@@ -337,23 +334,25 @@ void Bellerophon::generatePreferences(vector<SeqMap> left, vector<SeqMap> right,
 		
 		  
 		//calculate the dme
-		
 		int count0 = 0;
-		for (int i = 0; i < pref.size(); i++) {	 dme += pref[i].score[1];  if (pref[i].score[1] == 0.0) { count0++; }  }
+		for (int i = 0; i < pref.size(); i++) {	 dme += pref[i].score[1];  if (pref[i].score[1] == 0.0) {  count0++; }  }
 		
-		float expectedPercent = 1 / (float) (pref.size() - count0);
+		//float expectedPercent = 1 / (float) (pref.size() - count0);
 //cout << endl << "dme = " << dme << endl;
 		//recalculate prefernences based on dme
 		for (int i = 0; i < pref.size(); i++) {
-//cout << "unadjusted pref " << i << " = " << pref[i].score[1] << endl;	
+//cout << "unadjusted col[i] " << i << " = " << pref[i].score[1] << endl;	
+//cout << i << '\t' <<  (dme / (dme - 2 * pref[i].score[1])) << endl;
+			
 			// gives the actual percentage of the dme this seq adds
-			pref[i].score[1] = pref[i].score[1] / dme;
+			//pref[i].score[1] = pref[i].score[1] / dme;
 			
 			//how much higher or lower is this than expected
-			pref[i].score[1] = pref[i].score[1] / expectedPercent;
-			
-			//pref[i].score[1] = dme / (dme - 2 * pref[i].score[1]);
+			//pref[i].score[1] = pref[i].score[1] / expectedPercent;
 			
+			//not 2 * pref[i].score[1] because i only calulate the pref scores once.
+			pref[i].score[1] = dme / (dme - pref[i].score[1]);
+//cout << i << '\t' << pref[i].score[1] << endl;
 			//so a non chimeric sequence would be around 1, and a chimeric would be signifigantly higher.
 //cout << "adjusted pref " << i << " = " << pref[i].score[1] << endl;					
 		}
diff --git a/bellerophon.h b/bellerophon.h
index bc86c18..b309545 100644
--- a/bellerophon.h
+++ b/bellerophon.h
@@ -13,20 +13,12 @@
 
 #include "chimera.h"
 #include "filterseqscommand.h"
+#include "sparsematrix.hpp"
 #include "sequence.hpp"
 #include "dist.h"
 
-struct Preference {
-		string name;
-		vector<string> leftParent; //keep the name of closest left associated with the two scores
-		vector<string> rightParent; //keep the name of closest right associated with the two scores
-		vector<float> score;  //so you can keep last score and calc this score and keep whichever is bigger.
-		vector<float> closestLeft;  //keep the closest left associated with the two scores
-		vector<float> closestRight; //keep the closest right associated with the two scores
-		int midpoint;
-
-};
-
+typedef list<PCell>::iterator MatData;
+typedef map<int, float> SeqMap;  //maps sequence to all distance for that seqeunce
 
 /***********************************************************/
 
diff --git a/ccode.cpp b/ccode.cpp
new file mode 100644
index 0000000..a3ef5e6
--- /dev/null
+++ b/ccode.cpp
@@ -0,0 +1,272 @@
+/*
+ *  ccode.cpp
+ *  Mothur
+ *
+ *  Created by westcott on 8/24/09.
+ *  Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "ccode.h"
+#include "ignoregaps.h"
+
+
+//***************************************************************************************************************
+Ccode::Ccode(string filename, string temp) {  fastafile = filename;  templateFile = temp;  }
+//***************************************************************************************************************
+
+Ccode::~Ccode() {
+	try {
+		for (int i = 0; i < querySeqs.size(); i++)		{  delete querySeqs[i];		}
+		for (int i = 0; i < templateSeqs.size(); i++)	{  delete templateSeqs[i];	}
+		delete distCalc;
+	}
+	catch(exception& e) {
+		errorOut(e, "Ccode", "~Ccode");
+		exit(1);
+	}
+}	
+//***************************************************************************************************************
+void Ccode::print(ostream& out) {
+	try {
+		
+		mothurOutEndLine();
+		
+		for (int i = 0; i < querySeqs.size(); i++) {
+			
+		}
+	}
+	catch(exception& e) {
+		errorOut(e, "Ccode", "print");
+		exit(1);
+	}
+}
+
+//***************************************************************************************************************
+void Ccode::getChimeras() {
+	try {
+		
+		//read in query sequences and subject sequences
+		mothurOut("Reading sequences and template file... "); cout.flush();
+		querySeqs = readSeqs(fastafile);
+		templateSeqs = readSeqs(templateFile);
+		mothurOut("Done."); mothurOutEndLine();
+		
+		int numSeqs = querySeqs.size();
+		
+		closest.resize(numSeqs);
+		
+		//break up file if needed
+		int linesPerProcess = numSeqs / processors ;
+		
+		#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+			//find breakup of sequences for all times we will Parallelize
+			if (processors == 1) {   lines.push_back(new linePair(0, numSeqs));  }
+			else {
+				//fill line pairs
+				for (int i = 0; i < (processors-1); i++) {			
+					lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess)));
+				}
+				//this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end
+				int i = processors - 1;
+				lines.push_back(new linePair((i*linesPerProcess), numSeqs));
+			}
+			
+			//find breakup of templatefile for quantiles
+			if (processors == 1) {   templateLines.push_back(new linePair(0, templateSeqs.size()));  }
+			else { 
+				for (int i = 0; i < processors; i++) {
+					templateLines.push_back(new linePair());
+					templateLines[i]->start = int (sqrt(float(i)/float(processors)) * templateSeqs.size());
+					templateLines[i]->end = int (sqrt(float(i+1)/float(processors)) * templateSeqs.size());
+				}
+			}
+		#else
+			lines.push_back(new linePair(0, numSeqs));
+			templateLines.push_back(new linePair(0, templateSeqs.size()));
+		#endif
+	
+		distCalc = new ignoreGaps();
+		
+		//find closest
+		if (processors == 1) { 
+			mothurOut("Finding top matches for sequences... "); cout.flush();
+			closest = findClosest(lines[0]->start, lines[0]->end, numWanted);
+			mothurOut("Done."); mothurOutEndLine();
+		}else {		createProcessesClosest();		}
+
+		
+		for (int i = 0; i < closest.size(); i++) {
+			cout << querySeqs[i]->getName() << ": ";
+			for (int j = 0; j < closest[i].size(); j++) {
+			
+				cout << closest[i][j]->getName() << '\t';
+			}
+			cout << endl;
+		}
+			
+		//free memory
+		for (int i = 0; i < lines.size(); i++)					{	delete lines[i];				}
+		for (int i = 0; i < templateLines.size(); i++)			{	delete templateLines[i];		}
+			
+	}
+	catch(exception& e) {
+		errorOut(e, "Ccode", "getChimeras");
+		exit(1);
+	}
+}
+/***************************************************************************************************************
+vector<int> Ccode::findWindows() {
+	try {
+		
+		vector<int> win; 
+		
+		if (increment > querySeqs[0]->getAligned().length()) {  mothurOut("You have selected an increment larger than the length of your sequences.  I will use the default of 25.");  increment = 25; }
+		
+		for (int m = increment;  m < (querySeqs[0]->getAligned().length() - increment); m+=increment) {  win.push_back(m);  }
+
+		return win;
+	
+	}
+	catch(exception& e) {
+		errorOut(e, "Ccode", "findWindows");
+		exit(1);
+	}
+}
+*/
+//***************************************************************************************************************
+vector< vector<Sequence*> > Ccode::findClosest(int start, int end, int numWanted) {
+	try{
+	
+		vector< vector<Sequence*> > topMatches;  topMatches.resize(querySeqs.size());
+	
+		float smallestOverall, smallestLeft, smallestRight;
+		smallestOverall = 1000;  smallestLeft = 1000;  smallestRight = 1000;
+		
+		//for each sequence in querySeqs - find top matches to use as reference
+		for(int j = start; j < end; j++){
+			
+			Sequence query = *(querySeqs[j]);
+			
+			vector<SeqDist> distances;
+			
+			//calc distance to each sequence in template seqs
+			for (int i = 0; i < templateSeqs.size(); i++) {
+			
+				Sequence ref = *(templateSeqs[i]); 
+					
+				//find overall dist
+				distCalc->calcDist(query, ref);
+				float dist = distCalc->getDist();	
+				
+				//save distance
+				SeqDist temp;
+				temp.seq = templateSeqs[i];
+				temp.dist = dist;
+
+				distances.push_back(temp);
+			}
+			
+			sort(distances.begin(), distances.end(), compareSeqDist);
+			
+			//save the number of top matches wanted
+			for (int h = 0; h < numWanted; h++) {
+				topMatches[j].push_back(distances[h].seq);
+			}
+		}
+			
+		return topMatches;
+
+	}
+	catch(exception& e) {
+		errorOut(e, "Ccode", "findClosestSides");
+		exit(1);
+	}
+}
+/**************************************************************************************************/
+void Ccode::createProcessesClosest() {
+	try {
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+		int process = 0;
+		vector<int> processIDS;
+		
+		//loop through and create all the processes you want
+		while (process != processors) {
+			int pid = fork();
+			
+			if (pid > 0) {
+				processIDS.push_back(pid);  
+				process++;
+			}else if (pid == 0){
+				
+				mothurOut("Finding top matches for sequences " + toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
+				closest = findClosest(lines[process]->start, lines[process]->end, numWanted);
+				mothurOut("Done finding top matches for sequences " +  toString(lines[process]->start) + " to " + toString(lines[process]->end)); mothurOutEndLine();
+				
+				//write out data to file so parent can read it
+				ofstream out;
+				string s = toString(getpid()) + ".temp";
+				openOutputFile(s, out);
+				
+				//output pairs
+				for (int i = lines[process]->start; i < lines[process]->end; i++) {
+					 out << closest[i].size() << endl;
+					 for (int j = 0; j < closest[i].size(); j++) {
+						closest[i][j]->printSequence(out);
+					 }
+				}
+				out.close();
+				
+				exit(0);
+			}else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
+		}
+		
+		//force parent to wait until all the processes are done
+		for (int i=0;i<processors;i++) { 
+			int temp = processIDS[i];
+			wait(&temp);
+		}
+		
+		//get data created by processes
+		for (int i=0;i<processors;i++) { 
+			ifstream in;
+			string s = toString(processIDS[i]) + ".temp";
+			openInputFile(s, in);
+			
+			//get pairs
+			for (int k = lines[i]->start; k < lines[i]->end; k++) {
+				int size;
+				in >> size;
+				gobble(in);
+				
+				vector<Sequence*> tempVector;
+				
+				for (int j = 0; j < size; j++) {
+				
+					Sequence* temp = new Sequence(in);
+					gobble(in);
+						
+					tempVector.push_back(temp);
+				}
+				
+				closest[k] = tempVector;
+			}
+			
+			in.close();
+			remove(s.c_str());
+		}
+			
+	
+#else
+		closest = findClosest(lines[0]->start, lines[0]->end, numWanted);
+#endif	
+
+	}
+	catch(exception& e) {
+		errorOut(e, "Ccode", "createProcessesClosest");
+		exit(1);
+	}
+}
+
+//***************************************************************************************************************
+
diff --git a/ccode.h b/ccode.h
new file mode 100644
index 0000000..a4860dc
--- /dev/null
+++ b/ccode.h
@@ -0,0 +1,62 @@
+#ifndef CCODE_H
+#define CCODE_H
+
+/*
+ *  ccode.h
+ *  Mothur
+ *
+ *  Created by westcott on 8/24/09.
+ *  Copyright 2009 Schloss LAB. All rights reserved.
+ *
+ */
+
+#include "chimera.h"
+#include "dist.h"
+#include "decalc.h"
+
+//This class was created using the algorythms described in the 
+// "Evaluating putative chimeric sequences from PCR-amplified products" paper 
+//by Juan M. Gonzalez, Johannes Zimmerman and Cesareo Saiz-Jimenez.
+
+/***********************************************************/
+
+class Ccode : public Chimera {
+	
+	public:
+		Ccode(string, string);	
+		~Ccode();
+		
+		void getChimeras();
+		void print(ostream&);
+		
+		void setCons(string c)		{}
+		void setQuantiles(string q) {}
+		
+		
+	private:
+	
+		Dist* distCalc;
+		DeCalculator* decalc;
+		int iters;
+		string fastafile, templateFile;
+		
+		
+		vector<linePair*> lines;
+		vector<linePair*> templateLines;
+		vector<Sequence*> querySeqs;
+		vector<Sequence*> templateSeqs;
+		
+		vector< vector<Sequence*> > closest;  //bestfit[0] is a vector of sequence at are closest to queryseqs[0]...
+		
+		vector< vector<Sequence*> > findClosest(int, int, int); 
+		void removeSeqs(vector<Sequence*>);  //removes sequences from closest that are to different of too similar to eachother. 
+		
+		void createProcessesClosest();
+		
+};
+
+/***********************************************************/
+
+#endif
+
+
diff --git a/chimera.cpp b/chimera.cpp
index 48e5763..8aaab4b 100644
--- a/chimera.cpp
+++ b/chimera.cpp
@@ -9,6 +9,76 @@
 
 #include "chimera.h"
 
+//***************************************************************************************************************
+//this is a vertical filter
+void Chimera::createFilter(vector<Sequence*> seqs) {
+	try {
+		
+		int threshold = int (0.5 * seqs.size());
+//cout << "threshhold = " << threshold << endl;
+		
+		vector<int> gaps;	gaps.resize(seqs[0]->getAligned().length(), 0);
+		vector<int> a;		a.resize(seqs[0]->getAligned().length(), 0);
+		vector<int> t;		t.resize(seqs[0]->getAligned().length(), 0);
+		vector<int> g;		g.resize(seqs[0]->getAligned().length(), 0);
+		vector<int> c;		c.resize(seqs[0]->getAligned().length(), 0);
+		
+		filterString = (string(seqs[0]->getAligned().length(), '1'));
+		
+		//for each sequence
+		for (int i = 0; i < seqs.size(); i++) {
+		
+			string seqAligned = seqs[i]->getAligned();
+			
+			for (int j = 0; j < seqAligned.length(); j++) {
+				//if this spot is a gap
+				if ((seqAligned[j] == '-') || (seqAligned[j] == '.'))	{	gaps[j]++;	}
+				else if (toupper(seqAligned[j]) == 'A')					{	a[j]++;		}
+				else if (toupper(seqAligned[j]) == 'T')					{	t[j]++;		}
+				else if (toupper(seqAligned[j]) == 'G')					{	g[j]++;		}
+				else if (toupper(seqAligned[j]) == 'C')					{	c[j]++;		}
+			}
+		}
+		
+		//zero out spot where all sequences have blanks
+		for(int i = 0;i < seqs[0]->getAligned().length(); i++){
+			if(gaps[i] == seqs.size())	{	filterString[i] = '0'; 	}
+			
+			else if (((a[i] < threshold) && (t[i] < threshold) && (g[i] < threshold) && (c[i] < threshold))) {	filterString[i] = '0';	}
+			//cout << "a = " << a[i] <<  " t = " << t[i] <<  " g = " << g[i] <<  " c = " << c[i] << endl;
+		}
+//cout << "filter = " << filterString << endl;	
+	}
+	catch(exception& e) {
+		errorOut(e, "Chimera", "createFilter");
+		exit(1);
+	}
+}
+//***************************************************************************************************************
+void Chimera::runFilter(vector<Sequence*> seqs) {
+	try {
+		
+		//for each sequence
+		for (int i = 0; i < seqs.size(); i++) {
+		
+			string seqAligned = seqs[i]->getAligned();
+			string newAligned = "";
+			
+			for (int j = 0; j < seqAligned.length(); j++) {
+				//if this spot is a gap
+				if (filterString[j] == '1') { newAligned += seqAligned[j]; }
+			}
+			
+			seqs[i]->setAligned(newAligned);
+		}
+		
+	}
+	catch(exception& e) {
+		errorOut(e, "Chimera", "runFilter");
+		exit(1);
+	}
+}
+
 //***************************************************************************************************************
 vector<Sequence*> Chimera::readSeqs(string file) {
 	try {
@@ -61,3 +131,46 @@ void Chimera::setMask(string filename) {
 	}
 }
 //***************************************************************************************************************
+
+vector< vector<float> > Chimera::readQuantiles() {
+	try {
+	
+		ifstream in;
+		openInputFile(quanfile, in);
+		
+		vector< vector<float> > quan;
+	
+		int num; float ten, twentyfive, fifty, seventyfive, ninetyfive, ninetynine; 
+		
+		while(!in.eof()){
+			
+			in >> num >> ten >> twentyfive >> fifty >> seventyfive >> ninetyfive >> ninetynine; 
+			
+			vector <float> temp;
+			
+			temp.push_back(ten); 
+			temp.push_back(twentyfive);
+			temp.push_back(fifty);
+			temp.push_back(seventyfive);
+			temp.push_back(ninetyfive);
+			temp.push_back(ninetynine);
+			
+			quan.push_back(temp);  
+	
+			gobble(in);
+		}
+		
+		in.close();
+		return quan;
+		
+	}
+	catch(exception& e) {
+		errorOut(e, "Chimera", "readQuantiles");
+		exit(1);
+	}
+}
+//***************************************************************************************************************
+
+
+
+
diff --git a/chimera.h b/chimera.h
index fd6b53a..366068b 100644
--- a/chimera.h
+++ b/chimera.h
@@ -12,12 +12,45 @@
 
 
 #include "mothur.h"
-#include "sparsematrix.hpp"
 #include "sequence.hpp"
 
-typedef list<PCell>::iterator MatData;
-typedef map<int, float> SeqMap;  //maps sequence to all distance for that seqeunce
 
+struct Preference {
+		string name;
+		vector<string> leftParent; //keep the name of closest left associated with the two scores
+		vector<string> rightParent; //keep the name of closest right associated with the two scores
+		vector<float> score;  //so you can keep last score and calc this score and keep whichever is bigger.
+		vector<float> closestLeft;  //keep the closest left associated with the two scores
+		vector<float> closestRight; //keep the closest right associated with the two scores
+		int midpoint;
+
+};
+
+struct SeqDist {
+	Sequence* seq;
+	float dist;
+};
+
+//********************************************************************************************************************
+//sorts lowest to highest
+inline bool compareSeqDist(SeqDist left, SeqDist right){
+	return (left.dist < right.dist);	
+} 
+//********************************************************************************************************************
+
+struct sim {
+		Sequence* leftParent;
+		Sequence* rightParent; 
+		float score;  
+		int midpoint;
+};
+
+struct linePair {
+			int start;
+			int end;
+			linePair(int i, int j) : start(i), end(j) {}
+			linePair(){}
+};
 
 /***********************************************************************/
 
@@ -29,16 +62,21 @@ class Chimera {
 		Chimera(string);
 		Chimera(string, string);
 		virtual ~Chimera(){};
-		virtual void setFilter(bool f)			{	filter = f;			}
+		virtual void setFilter(bool f)			{	filter = f;	 		}
 		virtual void setCorrection(bool c)		{	correction = c;		}
 		virtual void setProcessors(int p)		{	processors = p;		}
 		virtual void setWindow(int w)			{	window = w;			}
 		virtual void setIncrement(int i)		{	increment = i;		}
+		virtual void setNumWanted(int n)		{	numWanted = n;		}
 		
-		virtual void setCons(string) {};
-		virtual void setQuantiles(string) {};
+		virtual void setCons(string){};
+		virtual void setQuantiles(string){};
 		virtual vector<Sequence*> readSeqs(string);
+		virtual vector< vector<float> > readQuantiles();
 		virtual void setMask(string);
+		virtual void runFilter(vector<Sequence*>);
+		virtual void createFilter(vector<Sequence*>);
+		
 		
 		//pure functions
 		virtual void getChimeras() = 0;	
@@ -47,8 +85,8 @@ class Chimera {
 	protected:
 		
 		bool filter, correction;
-		int processors, window, increment;
-		string seqMask;
+		int processors, window, increment, numWanted;
+		string seqMask, quanfile, filterString;
 			
 
 };
diff --git a/chimeraseqscommand.cpp b/chimeraseqscommand.cpp
index f980310..6af8aa3 100644
--- a/chimeraseqscommand.cpp
+++ b/chimeraseqscommand.cpp
@@ -10,6 +10,8 @@
 #include "chimeraseqscommand.h"
 #include "bellerophon.h"
 #include "pintail.h"
+#include "alignedsimilarity.h"
+#include "ccode.h"
 
 
 //***************************************************************************************************************
@@ -23,7 +25,7 @@ ChimeraSeqsCommand::ChimeraSeqsCommand(string option){
 		
 		else {
 			//valid paramters for this command
-			string Array[] =  {"fasta", "filter", "correction", "processors", "method", "window", "increment", "template", "conservation", "quantile", "mask" };
+			string Array[] =  {"fasta", "filter", "correction", "processors", "method", "window", "increment", "template", "conservation", "quantile", "mask", "numwanted" };
 			vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
 			
 			OptionParser parser(option);
@@ -61,9 +63,6 @@ ChimeraSeqsCommand::ChimeraSeqsCommand(string option){
 				if (ableToOpen == 1) { abort = true; }
 				in.close();
 			}
-		
-
-			
 
 			string temp;
 			temp = validParameter.validFile(parameters, "filter", false);			if (temp == "not found") { temp = "F"; }
@@ -80,10 +79,13 @@ ChimeraSeqsCommand::ChimeraSeqsCommand(string option){
 					
 			temp = validParameter.validFile(parameters, "increment", false);			if (temp == "not found") { temp = "25"; }
 			convert(temp, increment);
-				
+			
+			temp = validParameter.validFile(parameters, "numwanted", false);			if (temp == "not found") { temp = "20"; }
+			convert(temp, numwanted);
+
 			method = validParameter.validFile(parameters, "method", false);		if (method == "not found") { method = "pintail"; }
 			
-			if ((method == "pintail") && (templatefile == "")) { mothurOut("You must provide a template file with the pintail method."); mothurOutEndLine(); abort = true;  }
+			if (((method == "pintail") || (method == "alignsim")) && (templatefile == "")) { mothurOut("You must provide a template file with the pintail and alignsim methods."); mothurOutEndLine(); abort = true;  }
 			
 
 		}
@@ -101,7 +103,7 @@ void ChimeraSeqsCommand::help(){
 		mothurOut("The chimera.seqs command reads a fastafile and creates list of potentially chimeric sequences.\n");
 		mothurOut("The chimera.seqs command parameters are fasta, filter, correction, processors, mask, method, window, increment, template, conservation and quantile.\n");
 		mothurOut("The fasta parameter is always required and template is required if using pintail.\n");
-		mothurOut("The filter parameter allows you to specify if you would like to apply a 50% soft filter this only applies when the method is bellerphon.  The default is false. \n");
+		mothurOut("The filter parameter allows you to specify if you would like to apply a vertical and 50% soft filter.  The default is false. \n");
 		mothurOut("The correction parameter allows you to put more emphasis on the distance between highly similar sequences and less emphasis on the differences between remote homologs.   The default is true. This only applies when the method is bellerphon.\n");
 		mothurOut("The processors parameter allows you to specify how many processors you would like to use.  The default is 1. \n");
 		mothurOut("The method parameter allows you to specify the method for finding chimeric sequences.  The default is pintail. Options include..... \n");
@@ -134,8 +136,10 @@ int ChimeraSeqsCommand::execute(){
 		
 		if (abort == true) { return 0; }
 		
-		if (method == "bellerophon")	{		chimera = new Bellerophon(fastafile);			}
-		else if (method == "pintail")	{		chimera = new Pintail(fastafile, templatefile);	}
+		if (method == "bellerophon")	{		chimera = new Bellerophon(fastafile);				}
+		else if (method == "pintail")	{		chimera = new Pintail(fastafile, templatefile);		}
+		//else if (method == "alignsim")	{		chimera = new AlignSim(fastafile, templatefile);	}
+		else if (method == "ccode")		{		chimera = new Ccode(fastafile, templatefile);		}
 		else { mothurOut("Not a valid method."); mothurOutEndLine(); return 0;		}
 		
 		//set user options
@@ -155,6 +159,7 @@ int ChimeraSeqsCommand::execute(){
 		chimera->setProcessors(processors);
 		chimera->setWindow(window);
 		chimera->setIncrement(increment);
+		chimera->setNumWanted(numwanted);
 				
 		//find chimeras
 		chimera->getChimeras();
diff --git a/chimeraseqscommand.h b/chimeraseqscommand.h
index 85d5732..6274c48 100644
--- a/chimeraseqscommand.h
+++ b/chimeraseqscommand.h
@@ -30,7 +30,7 @@ private:
 	bool abort;
 	string method, fastafile, templatefile, consfile, quanfile, maskfile;
 	bool filter, correction;
-	int processors, midpoint, averageLeft, averageRight, window, iters, increment;
+	int processors, midpoint, averageLeft, averageRight, window, iters, increment, numwanted;
 	Chimera* chimera;
 	
 	
diff --git a/decalc.cpp b/decalc.cpp
index 860b12f..0589751 100644
--- a/decalc.cpp
+++ b/decalc.cpp
@@ -251,9 +251,16 @@ float DeCalculator::calcDE(vector<float> obs, vector<float> exp) {
 		
 		//for each window
 		float sum = 0.0;  //sum = sum from 1 to m of (oi-ei)^2
-		for (int m = 0; m < obs.size(); m++) { 		sum += ((obs[m] - exp[m]) * (obs[m] - exp[m]));		}
+		int numZeros = 0;
+		for (int m = 0; m < obs.size(); m++) { 		
 			
-		float de = sqrt((sum / (obs.size() - 1)));
+			//if (obs[m] != 0.0) {
+				sum += ((obs[m] - exp[m]) * (obs[m] - exp[m]));	
+			//}else {  numZeros++;   }
+			
+		}
+			
+		float de = sqrt((sum / (obs.size() - 1 - numZeros)));
 			
 		return de;
 	}
@@ -366,6 +373,9 @@ vector< vector<quanMember> > DeCalculator::getQuantiles(vector<Sequence*> seqs,
 		
 		//percentage of mismatched pairs 1 to 100
 		quan.resize(100);
+//ofstream o;
+//string out = "getQuantiles.out";
+//openOutputFile(out, o);
 		
 		//for each sequence
 		for(int i = start; i < end; i++){
@@ -402,7 +412,7 @@ vector< vector<quanMember> > DeCalculator::getQuantiles(vector<Sequence*> seqs,
 				float de = calcDE(obsi, exp);
 								
 				float dist = calcDist(query, subject, front, back); 
-				
+	//o << i << '\t' <<  j << '\t' << dist << '\t' << de << endl;			
 				dist = ceil(dist);
 				
 				quanMember newScore(de, i, j);
@@ -424,19 +434,16 @@ vector< vector<quanMember> > DeCalculator::getQuantiles(vector<Sequence*> seqs,
 		exit(1);
 	}
 }
-
+//********************************************************************************************************************
+//sorts lowest to highest
+inline bool compareQuanMembers(quanMember left, quanMember right){
+	return (left.score < right.score);	
+} 
 //***************************************************************************************************************
 //this was going to be used by pintail to increase the sensitivity of the chimera detection, but it wasn't quite right.  may want to revisit in the future...
-vector< vector<float> > DeCalculator::removeObviousOutliers(vector< vector<quanMember> >& quantiles, int num) {
+void DeCalculator::removeObviousOutliers(vector< vector<quanMember> >& quantiles, int num) {
 	try {
-		vector< vector<float> > quan; 
-		quan.resize(100);
-	
-		/*vector<quanMember> contributions;  
-		vector<int> seen;  //seen[0] is the number of outliers that template seqs[0] was part of.
-		seen.resize(num,0);
-				
-		//find contributions
+						
 		for (int i = 0; i < quantiles.size(); i++) {
 		
 			//find mean of this quantile score
@@ -444,22 +451,21 @@ vector< vector<float> > DeCalculator::removeObviousOutliers(vector< vector<quanM
 			
 			float high = quantiles[i][int(quantiles[i].size() * 0.99)].score;
 			float low =  quantiles[i][int(quantiles[i].size() * 0.01)].score;
-		
+			
+			vector<quanMember> temp;
+			
 			//look at each value in quantiles to see if it is an outlier
 			for (int j = 0; j < quantiles[i].size(); j++) {
-				
 				//is this score between 1 and 99%
 				if ((quantiles[i][j].score > low) && (quantiles[i][j].score < high)) {
-					
-				}else {
-					//add to contributions
-					contributions.push_back(quantiles[i][j]);
-					seen[quantiles[i][j].member1]++;
-					seen[quantiles[i][j].member2]++;
+					temp.push_back(quantiles[i][j]);
 				}
 			}
+			
+			quantiles[i] = temp;
 		}
 
+/*
 		//find contributer with most offending score related to it
 		int largestContrib = findLargestContrib(seen);
 	
@@ -538,7 +544,7 @@ cout << "high = " << high << endl;
 			
 		}
 */
-		return quan;
+		
 	}
 	catch(exception& e) {
 		errorOut(e, "DeCalculator", "removeObviousOutliers");
diff --git a/decalc.h b/decalc.h
index a307497..da0f96c 100644
--- a/decalc.h
+++ b/decalc.h
@@ -44,7 +44,7 @@ class DeCalculator {
 		void setAlignmentLength(int l) {  alignLength = l;  }
 		void runMask(Sequence*);
 		void trimSeqs(Sequence*, Sequence*, map<int, int>&);
-		vector< vector<float> > removeObviousOutliers(vector< vector<quanMember> >&, int);
+		void removeObviousOutliers(vector< vector<quanMember> >&, int);
 		vector<float> calcFreq(vector<Sequence*>, string);
 		vector<int> findWindows(Sequence*, int, int, int&, int);
 		vector<float> calcObserved(Sequence*, Sequence*, vector<int>, int);
diff --git a/pintail.cpp b/pintail.cpp
index f46e0e2..04dfb0a 100644
--- a/pintail.cpp
+++ b/pintail.cpp
@@ -9,6 +9,7 @@
 
 #include "pintail.h"
 #include "ignoregaps.h"
+#include "eachgapdist.h"
 
 //********************************************************************************************************************
 //sorts lowest to highest
@@ -41,9 +42,11 @@ void Pintail::print(ostream& out) {
 			
 			int index = ceil(deviation[i]);
 			
+			if (index == 0) { index=1;  }
+						
 			//is your DE value higher than the 95%
 			string chimera;
-			if (DE[i] > quantiles[index][4])		{	chimera = "Yes";	}
+			if (DE[i] > quantiles[index-1][4])		{	chimera = "Yes";	}
 			else									{	chimera = "No";		}
 			
 			out << querySeqs[i]->getName() << '\t' << "div: " << deviation[i] << "\tstDev: " << DE[i] << "\tchimera flag: " << chimera << endl;
@@ -125,7 +128,7 @@ void Pintail::getChimeras() {
 			templateLines.push_back(new linePair(0, templateSeqs.size()));
 		#endif
 		
-		distcalculator = new ignoreGaps();
+		distcalculator = new eachGapDist();
 		decalc = new DeCalculator();
 		
 		//if the user does enter a mask then you want to keep all the spots in the alignment
@@ -141,7 +144,18 @@ void Pintail::getChimeras() {
 			mothurOut("Done."); mothurOutEndLine();
 		}else {		createProcessesPairs();		}
 		
-		
+/*string o = "foronlinepintailpairs-eachgap";
+ofstream out7;
+openOutputFile(o, out7);
+
+for (int i = 0; i < bestfit.size(); i++) {
+	out7 << querySeqs[i]->getName() << endl;
+	out7 << querySeqs[i]->getUnaligned() << endl << endl;
+	
+	out7 << bestfit[i]->getName() << endl;
+	out7 << bestfit[i]->getUnaligned() << endl << endl << endl;
+}		
+out7.close();/*/	
 		//find P
 		mothurOut("Getting conservation... "); cout.flush();
 		if (consfile == "") { 
@@ -171,7 +185,19 @@ void Pintail::getChimeras() {
 			}
 
 		}
-						
+		
+		if (filter) {
+			vector<Sequence*> temp = templateSeqs;
+			for (int i = 0; i < querySeqs.size(); i++) { temp.push_back(querySeqs[i]);  }
+			
+			createFilter(temp);
+			
+			runFilter(querySeqs);
+			runFilter(templateSeqs);
+			runFilter(bestfit);
+		}
+										
+																		
 		if (processors == 1) { 
 	
 			for (int j = 0; j < bestfit.size(); j++) { 
@@ -241,26 +267,22 @@ void Pintail::getChimeras() {
 		//quantiles are used to determine whether the de values found indicate a chimera
 		//if you have to calculate them, its time intensive because you are finding the de and deviation values for each 
 		//combination of sequences in the template
-		if (quanfile != "") {  quantiles =  readQuantiles();  }
+		if (quanfile != "") {  quantiles = readQuantiles();  }
 		else {
 			
 			mothurOut("Calculating quantiles for your template.  This can take a while...  I will output the quantiles to a .quan file that you can input them using the quantiles parameter next time you run this command.  Providing the .quan file will dramatically improve speed.    "); cout.flush();
 			if (processors == 1) { 
 				quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size());
 			}else {		createProcessesQuan();		}
+		
 			
+			ofstream out4, out5;
+			string noOutliers, outliers;
 			
-						
-			
-			//decided against this because we were having trouble setting the sensitivity... may want to revisit this...
-			//quantiles = decalc->removeObviousOutliers(quantilesMembers, templateSeqs.size());
-			
-			ofstream out4;
-			string o;
-			
-			o = getRootName(templateFile) + "quan";
+			noOutliers = getRootName(templateFile) + "pintail.quanNOOUTLIERS";
+			outliers = getRootName(templateFile) + "pintail.quanYESOUTLIERS";
 			
-			openOutputFile(o, out4);
+			openOutputFile(outliers, out4);
 			
 			//adjust quantiles
 			for (int i = 0; i < quantilesMembers.size(); i++) {
@@ -301,6 +323,47 @@ void Pintail::getChimeras() {
 			
 			out4.close();
 			
+			decalc->removeObviousOutliers(quantilesMembers, templateSeqs.size());
+			
+			openOutputFile(noOutliers, out5);
+			
+			//adjust quantiles
+			for (int i = 0; i < quantilesMembers.size(); i++) {
+				vector<float> temp;
+				
+				if (quantilesMembers[i].size() == 0) {
+					//in case this is not a distance found in your template files
+					for (int g = 0; g < 6; g++) {
+						temp.push_back(0.0);
+					}
+				}else{
+					
+					sort(quantilesMembers[i].begin(), quantilesMembers[i].end(), compareQuanMembers);
+					
+					//save 10%
+					temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.10)].score);
+					//save 25%
+					temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.25)].score);
+					//save 50%
+					temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.5)].score);
+					//save 75%
+					temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.75)].score);
+					//save 95%
+					temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.95)].score);
+					//save 99%
+					temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.99)].score);
+					
+				}
+				
+				//output quan value
+				out5 << i+1 << '\t';				
+				for (int u = 0; u < temp.size(); u++) {   out5 << temp[u] << '\t'; }
+				out5 << endl;
+				
+				quantiles[i] = temp;
+				
+			}
+
 			mothurOut("Done."); mothurOutEndLine();
 			
 		}
@@ -360,45 +423,6 @@ vector<float> Pintail::readFreq() {
 	}
 }
 
-//***************************************************************************************************************
-
-vector< vector<float> > Pintail::readQuantiles() {
-	try {
-	
-		ifstream in;
-		openInputFile(quanfile, in);
-		
-		vector< vector<float> > quan;
-	
-		int num; float ten, twentyfive, fifty, seventyfive, ninetyfive, ninetynine; 
-		
-		while(!in.eof()){
-			
-			in >> num >> ten >> twentyfive >> fifty >> seventyfive >> ninetyfive >> ninetynine; 
-			
-			vector <float> temp;
-			
-			temp.push_back(ten); 
-			temp.push_back(twentyfive);
-			temp.push_back(fifty);
-			temp.push_back(seventyfive);
-			temp.push_back(ninetyfive);
-			temp.push_back(ninetynine);
-			
-			quan.push_back(temp);  
-			
-			gobble(in);
-		}
-		
-		in.close();
-		return quan;
-		
-	}
-	catch(exception& e) {
-		errorOut(e, "Pintail", "readQuantiles");
-		exit(1);
-	}
-}
 //***************************************************************************************************************
 //calculate the distances from each query sequence to all sequences in the template to find the closest sequence
 vector<Sequence*> Pintail::findPairs(int start, int end) {
diff --git a/pintail.h b/pintail.h
index ddfba85..37efd00 100644
--- a/pintail.h
+++ b/pintail.h
@@ -35,17 +35,10 @@ class Pintail : public Chimera {
 		
 	private:
 	
-		struct linePair {
-			int start;
-			int end;
-			linePair(int i, int j) : start(i), end(j) {}
-			linePair(){}
-		};
-
 		Dist* distcalculator;
 		DeCalculator* decalc;
 		int iters;
-		string fastafile, templateFile, consfile, quanfile;
+		string fastafile, templateFile, consfile;
 		
 		
 		vector<linePair*> lines;
@@ -77,7 +70,6 @@ class Pintail : public Chimera {
 		
 		
 		vector<float> readFreq();
-		vector< vector<float> > readQuantiles();
 		vector<Sequence*> findPairs(int, int);
 			
 		void createProcessesSpots();