From 81276c241b984898f8d30ad123c00592ee6db7b8 Mon Sep 17 00:00:00 2001
From: westcott <westcott>
Date: Mon, 22 Feb 2010 17:22:22 +0000
Subject: [PATCH] chimeras, fix to sabundvector and sharedsabundvector that
 caused getRabundVector to step out of bounds

---
 averagelinkage.cpp     |   7 -
 blastdb.cpp            |   5 +-
 ccode.cpp              |  22 ++--
 chimera.cpp            |  18 ++-
 chimera.h              |   8 +-
 chimeraseqscommand.cpp |  11 +-
 chimeraslayer.cpp      | 292 +++++++++++++++++++++++++++--------------
 chimeraslayer.h        |   9 +-
 cluster.cpp            |   4 +
 decalc.cpp             | 142 +++++++++++++++++---
 decalc.h               |   3 +-
 maligner.cpp           | 124 ++++++++++++++---
 maligner.h             |   7 +-
 mothur.cpp             |   9 ++
 pintail.cpp            |  93 ++++++++-----
 pintail.h              |   1 +
 removeseqscommand.cpp  |  64 ++++++---
 removeseqscommand.h    |   2 +-
 sabundvector.cpp       |   6 +-
 sharedsabundvector.cpp |   4 +-
 20 files changed, 596 insertions(+), 235 deletions(-)
diff --git a/averagelinkage.cpp b/averagelinkage.cpp
index db2c51e..7522d2a 100644
--- a/averagelinkage.cpp
+++ b/averagelinkage.cpp
@@ -38,15 +38,8 @@ bool AverageLinkage::updateDistance(MatData& colCell, MatData& rowCell) {
 			saveCol = smallCol;
 		}
 		
-		float oldColCell = colCell->dist;
-		
 		colCell->dist = (colBin * colCell->dist + rowBin * rowCell->dist) / totalBin;
 		
-		//warn user if merge with value above cutoff produces a value below cutoff
-		if ((colCell->dist < cutoff) && ((oldColCell > cutoff) || (rowCell->dist > cutoff)) ) {
-			mothurOut("Warning: merging " + toString(oldColCell) + " with " + toString(rowCell->dist) + ", new value = " + toString(colCell->dist) + ". Results will differ from those if cutoff was used in the read.dist command."); mothurOutEndLine();
-		}
-
 		return(true);
 	}
 	catch(exception& e) {
diff --git a/blastdb.cpp b/blastdb.cpp
index dff1074..396a6b4 100644
--- a/blastdb.cpp
+++ b/blastdb.cpp
@@ -118,7 +118,7 @@ vector<int> BlastDB::findClosestMegaBlast(Sequence* seq, int n) {
 		system(blastCommand.c_str());
 		
 		ifstream m8FileHandle;
-		openInputFile(blastFileName, m8FileHandle);
+		openInputFile(blastFileName, m8FileHandle, "no error");
 	
 		string dummy;
 		int templateAccession;
@@ -132,9 +132,10 @@ vector<int> BlastDB::findClosestMegaBlast(Sequence* seq, int n) {
 			
 			gobble(m8FileHandle);
 			topMatches.push_back(templateAccession);
+//cout << templateAccession << endl;
 		}
 		m8FileHandle.close();
-		
+//cout << "\n\n" ;		
 		return topMatches;
 	}
 	catch(exception& e) {
diff --git a/ccode.cpp b/ccode.cpp
index bfcd44d..326af7c 100644
--- a/ccode.cpp
+++ b/ccode.cpp
@@ -48,13 +48,10 @@ void Ccode::print(ostream& out) {
 			out2 << it->first << '\t' << it->second << endl;
 		}
 		out2.close();
-		
-		out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint);
-		
 		out << querySeq->getName() << endl << endl << "Reference sequences used and distance to query:" << endl;
 			
 		for (int j = 0; j < closest.size(); j++) {
-			out << setprecision(3) << closest[j].seq->getName() << '\t' << closest[j].dist << endl;
+			out << closest[j].seq->getName() << '\t' << closest[j].dist << endl;
 		}
 		out << endl << endl;
 		
@@ -72,14 +69,12 @@ void Ccode::print(ostream& out) {
 			
 		out << endl << "Window\tStartPos\tEndPos" << endl;
 		it = trim.begin();
-			
 		for (int k = 0; k < windows.size()-1; k++) {
 			out << k+1 << '\t' << spotMap[windows[k]-it->first] << '\t' << spotMap[windows[k]-it->first+windowSizes] << endl;
 		}
 			
 		out << windows.size() << '\t' << spotMap[windows[windows.size()-1]-it->first] << '\t' << spotMap[it->second-it->first-1] << endl;
 		out << endl;
-			
 		out << "Window\tAvgQ\t(sdQ)\tAvgR\t(sdR)\tRatio\tAnova" << endl;
 		for (int k = 0; k < windows.size(); k++) {
 			float ds = averageQuery[k] / averageRef[k]; 
@@ -94,7 +89,7 @@ void Ccode::print(ostream& out) {
 		//float fs = varQuery[query] / varRef[query];	/* F-Snedecor, test for differences of variances */
 			
 		bool results = false;	
-						
+					
 		//confidence limit, t - Student, anova
 		out << "Window\tConfidenceLimit\tt-Student\tAnova" << endl;
 			
@@ -118,6 +113,10 @@ void Ccode::print(ostream& out) {
 		if (results) {
 			mothurOut(querySeq->getName() + " was found have at least one chimeric window."); mothurOutEndLine();
 		}
+
+		//free memory
+		for (int i = 0; i < closest.size(); i++) {  delete closest[i].seq;  }
+
 		
 	}
 	catch(exception& e) {
@@ -176,7 +175,7 @@ int Ccode::getChimeras(Sequence* query) {
 			for (int i = 0; i < closest.size(); i++) { temp.push_back(closest[i].seq);  }
 			temp.push_back(query);  
 			
-			createFilter(temp);
+			createFilter(temp, 0.5);
 		
 			for (int i = 0; i < temp.size(); i++) { runFilter(temp[i]);  }
 			
@@ -221,9 +220,6 @@ int Ccode::getChimeras(Sequence* query) {
 					
 		determineChimeras();  //fills anova, isChimericConfidence, isChimericTStudent and isChimericANOVA. 
 		
-		//free memory
-		for (int i = 0; i < closest.size(); i++) {  delete closest[i].seq;  }
-		
 		return 0;
 	}
 	catch(exception& e) {
@@ -525,6 +521,10 @@ vector<SeqDist>  Ccode::findClosest(Sequence* q, int numWanted) {
 		}
 			
 		sort(topMatches.begin(), topMatches.end(), compareSeqDist);
+		
+		for (int i = numWanted; i < topMatches.size(); i++) {  delete topMatches[i].seq;  }
+		
+		topMatches.resize(numWanted);
 			
 		return topMatches;
 
diff --git a/chimera.cpp b/chimera.cpp
index bf16de4..9ad287b 100644
--- a/chimera.cpp
+++ b/chimera.cpp
@@ -11,10 +11,10 @@
 
 //***************************************************************************************************************
 //this is a vertical soft filter
-void Chimera::createFilter(vector<Sequence*> seqs) {
+string Chimera::createFilter(vector<Sequence*> seqs, float t) {
 	try {
 		filterString = "";
-		int threshold = int (0.5 * seqs.size());
+		int threshold = int (t * seqs.size());
 //cout << "threshhold = " << threshold << endl;
 		
 		vector<int> gaps;	gaps.resize(seqs[0]->getAligned().length(), 0);
@@ -53,6 +53,7 @@ void Chimera::createFilter(vector<Sequence*> seqs) {
 //cout << "filter = " << filterString << endl;	
 
 		mothurOut("Filter removed " + toString(numColRemoved) + " columns.");  mothurOutEndLine();
+		return filterString;
 	}
 	catch(exception& e) {
 		errorOut(e, "Chimera", "createFilter");
@@ -60,18 +61,25 @@ void Chimera::createFilter(vector<Sequence*> seqs) {
 	}
 }
 //***************************************************************************************************************
-void Chimera::runFilter(Sequence* seq) {
+map<int, int> Chimera::runFilter(Sequence* seq) {
 	try {
-		
+		map<int, int> maskMap;
 		string seqAligned = seq->getAligned();
 		string newAligned = "";
+		int count = 0;
 			
 		for (int j = 0; j < seqAligned.length(); j++) {
 			//if this spot is a gap
-			if (filterString[j] == '1') { newAligned += seqAligned[j]; }
+			if (filterString[j] == '1') { 
+				newAligned += seqAligned[j]; 
+				maskMap[count] = j;
+				count++;
+			}
 		}
 			
 		seq->setAligned(newAligned);
+		
+		return maskMap;
 	}
 	catch(exception& e) {
 		errorOut(e, "Chimera", "runFilter");
diff --git a/chimera.h b/chimera.h
index 4161573..da908ce 100644
--- a/chimera.h
+++ b/chimera.h
@@ -45,6 +45,7 @@ struct results {
 	int nastRegionStart;
 	int nastRegionEnd;
 	string parent;
+	string parentAligned;
 	float queryToParent;
 	float queryToParentLocal;
 	float divR;
@@ -53,6 +54,7 @@ struct results {
 struct SeqDist {
 	Sequence* seq;
 	float dist;
+	int index;
 };
 //********************************************************************************************************************
 //sorts lowest to highest
@@ -88,7 +90,7 @@ class Chimera {
 	
 		Chimera(){};
 		Chimera(string);
-		Chimera(string, bool);
+		Chimera(string, bool, string);
 		Chimera(string, string);
 		virtual ~Chimera(){};
 		virtual void setFilter(bool f)			{	filter = f;	 		}
@@ -119,8 +121,8 @@ class Chimera {
 		virtual vector<Sequence*> readSeqs(string);
 		virtual vector< vector<float> > readQuantiles();
 		virtual void setMask(string);
-		virtual void runFilter(Sequence*);
-		virtual void createFilter(vector<Sequence*>);
+		virtual map<int, int> runFilter(Sequence*);
+		virtual string createFilter(vector<Sequence*>, float);
 		
 		virtual void printHeader(ostream&){};
 		virtual int getChimeras(Sequence*){ return 0; }
diff --git a/chimeraseqscommand.cpp b/chimeraseqscommand.cpp
index 13adb52..2ec6ea9 100644
--- a/chimeraseqscommand.cpp
+++ b/chimeraseqscommand.cpp
@@ -180,7 +180,7 @@ ChimeraSeqsCommand::ChimeraSeqsCommand(string option){
 			temp = validParameter.validFile(parameters, "realign", false);			if (temp == "not found") { temp = "f"; }
 			realign = isTrue(temp); 
 			
-			search = validParameter.validFile(parameters, "search", false);			if (search == "not found") { temp = "distance"; }
+			search = validParameter.validFile(parameters, "search", false);			if (search == "not found") { search = "distance"; }
 			
 			temp = validParameter.validFile(parameters, "iters", false);	
 			if ((temp == "not found") && (method == "chimeraslayer")) { temp = "100"; }		
@@ -198,7 +198,7 @@ ChimeraSeqsCommand::ChimeraSeqsCommand(string option){
 			else if (temp == "not found") { temp = "20"; }
 			convert(temp, numwanted);
 
-			
+			if ((search != "distance") && (search != "blast") && (search != "kmer")) { mothurOut(search + " is not a valid search."); mothurOutEndLine(); abort = true;  }
 			
 			if (((method != "bellerophon")) && (templatefile == "")) { mothurOut("You must provide a template file with the pintail, ccode, chimeraslayer or chimeracheck methods."); mothurOutEndLine(); abort = true;  }
 			
@@ -239,7 +239,7 @@ void ChimeraSeqsCommand::help(){
 		mothurOut("The mincov parameter allows you to specify minimum coverage by closest matches found in template. Default is 70, meaning 70%. \n");
 		mothurOut("The minbs parameter allows you to specify minimum bootstrap support for calling a sequence chimeric. Default is 90, meaning 90%. \n");
 		mothurOut("The minsnp parameter allows you to specify percent of SNPs to sample on each side of breakpoint for computing bootstrap support (default: 10) \n");
-		mothurOut("The search parameter allows you to specify search method for finding the closest parent. Choices are distance and blast, default distance.  -used only by chimeraslayer. \n");
+		mothurOut("The search parameter allows you to specify search method for finding the closest parent. Choices are distance, blast, and kmer, default distance.  -used only by chimeraslayer. \n");
 		mothurOut("The realign parameter allows you to realign the query to the potential paretns. Choices are true or false, default false.  -used only by chimeraslayer. \n");
 		mothurOut("NOT ALL PARAMETERS ARE USED BY ALL METHODS. Please look below for method specifics.\n\n");
 		mothurOut("Details for each method: \n"); 
@@ -282,7 +282,7 @@ int ChimeraSeqsCommand::execute(){
 		else if (method == "pintail")			{		chimera = new Pintail(fastafile, outputDir);				}
 		else if (method == "ccode")				{		chimera = new Ccode(fastafile, outputDir);					}
 		else if (method == "chimeracheck")		{		chimera = new ChimeraCheckRDP(fastafile, outputDir);		}
-		else if (method == "chimeraslayer")		{		chimera = new ChimeraSlayer(search, realign);				}
+		else if (method == "chimeraslayer")		{		chimera = new ChimeraSlayer(search, realign, fastafile);	}
 		else { mothurOut("Not a valid method."); mothurOutEndLine(); return 0;		}
 		
 		//set user options
@@ -418,7 +418,8 @@ int ChimeraSeqsCommand::execute(){
 		//	else				 { fastafile = outputDir + getRootName(getSimpleName(fastafile)) + "filter.fasta"; }
 		
 		appendOutputFiles(tempHeader, outputFileName);
-		remove(tempHeader.c_str());
+		remove(outputFileName.c_str());
+		rename(tempHeader.c_str(), outputFileName.c_str());
 
 		for (int i = 0; i < templateSeqs.size(); i++)		{   delete templateSeqs[i];	}
 		
diff --git a/chimeraslayer.cpp b/chimeraslayer.cpp
index 8a0cbd0..7690328 100644
--- a/chimeraslayer.cpp
+++ b/chimeraslayer.cpp
@@ -9,16 +9,113 @@
 
 #include "chimeraslayer.h"
 #include "chimerarealigner.h"
+#include "kmerdb.hpp"
 
 //***************************************************************************************************************
-ChimeraSlayer::ChimeraSlayer(string mode, bool r) : searchMethod(mode), realign(r) {  	decalc = new DeCalculator();	  }
+ChimeraSlayer::ChimeraSlayer(string mode, bool r, string f) : searchMethod(mode), realign(r), fastafile(f) {  	
+	decalc = new DeCalculator();	
+}
 //***************************************************************************************************************
-ChimeraSlayer::~ChimeraSlayer() { 	delete decalc;	 }
+void ChimeraSlayer::doPrep() {
+	try {
+	
+		string 	kmerDBNameLeft;
+		string 	kmerDBNameRight;
+		
+		//generate the kmerdb to pass to maligner
+		if (searchMethod == "kmer") { 
+			//leftside
+			string leftTemplateFileName = "left." + templateFileName;
+			databaseLeft = new KmerDB(leftTemplateFileName, kmerSize);			
+			kmerDBNameLeft = leftTemplateFileName.substr(0,leftTemplateFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
+			ifstream kmerFileTestLeft(kmerDBNameLeft.c_str());
+			
+			if(!kmerFileTestLeft){	
+			
+				for (int i = 0; i < templateSeqs.size(); i++) {
+					string leftFrag = templateSeqs[i]->getUnaligned();
+					leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33));
+					
+					Sequence leftTemp(templateSeqs[i]->getName(), leftFrag);
+					databaseLeft->addSequence(leftTemp);	
+				}
+				databaseLeft->generateDB();
+				
+			}else {	
+				databaseLeft->readKmerDB(kmerFileTestLeft);	
+			}
+			kmerFileTestLeft.close();
+			
+			databaseLeft->setNumSeqs(templateSeqs.size());
+			
+			//rightside
+			string rightTemplateFileName = "right." + templateFileName;
+			databaseRight = new KmerDB(rightTemplateFileName, kmerSize);			
+			kmerDBNameRight = rightTemplateFileName.substr(0,rightTemplateFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
+			ifstream kmerFileTestRight(kmerDBNameRight.c_str());
+			
+			if(!kmerFileTestRight){	
+			
+				for (int i = 0; i < templateSeqs.size(); i++) {
+					string rightFrag = templateSeqs[i]->getUnaligned();
+					rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66));
+					
+					Sequence rightTemp(templateSeqs[i]->getName(), rightFrag);
+					databaseRight->addSequence(rightTemp);	
+				}
+				databaseRight->generateDB();
+				
+			}else {	
+				databaseRight->readKmerDB(kmerFileTestRight);	
+			}
+			kmerFileTestRight.close();
+			
+			databaseRight->setNumSeqs(templateSeqs.size());
+
+		}
+		
+		int start = time(NULL);	
+		//filter the sequences
+		//read in all query seqs
+		ifstream in; 
+		openInputFile(fastafile, in);
+		
+		vector<Sequence*> tempQuerySeqs;
+		while(!in.eof()){
+			Sequence* s = new Sequence(in);
+			gobble(in);
+			
+			if (s->getName() != "") { tempQuerySeqs.push_back(s); }
+		}
+		in.close();
+		
+		vector<Sequence*> temp = templateSeqs;
+		for (int i = 0; i < tempQuerySeqs.size(); i++) {  temp.push_back(tempQuerySeqs[i]);  }
+				
+		createFilter(temp, 0.0); //just removed columns where all seqs have a gap
+				
+		for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i];  }
+		
+		//run filter on template
+		for (int i = 0; i < templateSeqs.size(); i++) {  runFilter(templateSeqs[i]);  }
+		
+		mothurOutEndLine(); mothurOut("It took " + toString(time(NULL) - start) + " secs to filter.");	mothurOutEndLine();
+
+	}
+	catch(exception& e) {
+		errorOut(e, "ChimeraSlayer", "doprep");
+		exit(1);
+	}
+}
+//***************************************************************************************************************
+ChimeraSlayer::~ChimeraSlayer() { 	delete decalc;  if (searchMethod == "kmer") {  delete databaseRight;  delete databaseLeft;  }	 }
 //***************************************************************************************************************
 void ChimeraSlayer::printHeader(ostream& out) {
 	mothurOutEndLine();
-	mothurOut("Only reporting sequence supported by 90% of bootstrapped results.");
+	mothurOut("Only reporting sequence supported by " + toString(minBS) + "% of bootstrapped results.");
 	mothurOutEndLine();
+	
+	out << "Name\tLeftParent\tRightParent\tDivQLAQRB\tPerIDQLAQRB\tBootStrapA\tDivQLBQRA\tPerIDQLBQRA\tBootStrapB\tFlag\tLeftWindow\tRightWindow\n";
 }
 //***************************************************************************************************************
 void ChimeraSlayer::print(ostream& out) {
@@ -35,7 +132,7 @@ void ChimeraSlayer::print(ostream& out) {
 					mothurOut(querySeq->getName() + "\tyes"); mothurOutEndLine();
 				}
 			}
-			out << querySeq->getName() << "\tyes" << endl;
+			
 			printBlock(chimeraResults[0], out);
 			out << endl;
 		}else {  out << querySeq->getName() << "\tno" << endl;  }
@@ -50,114 +147,102 @@ void ChimeraSlayer::print(ostream& out) {
 int ChimeraSlayer::getChimeras(Sequence* query) {
 	try {
 		chimeraFlags = "no";
-		querySeq = query;
 		
-		for (int i = 0; i < query->getAligned().length(); i++) {
-			spotMap[i] = i;
-		}
+		//filter query
+		spotMap = runFilter(query);
+		
+		querySeq = query;
 		
 		//referenceSeqs, numWanted, matchScore, misMatchPenalty, divR, minSimilarity
-		maligner = new Maligner(templateSeqs, numWanted, match, misMatch, divR, minSim, minCov, searchMethod);
+		maligner = new Maligner(templateSeqs, numWanted, match, misMatch, divR, minSim, minCov, searchMethod, databaseLeft, databaseRight);
 		slayer = new Slayer(window, increment, minSim, divR, iters, minSNP);
 		
 		string chimeraFlag = maligner->getResults(query, decalc);
 		vector<results> Results = maligner->getOutput();
-
-		//realign query to parents to improve slayers detection rate???
+				
+		//found in testing realigning only made things worse
 		if (realign) {
 			ChimeraReAligner realigner(templateSeqs, match, misMatch);
 			realigner.reAlign(query, Results);
 		}
 
-			//if (chimeraFlag == "yes") {
-			
-		//get sequence that were given from maligner results
-		vector<SeqDist> seqs;
-		map<string, float> removeDups;
-		map<string, float>::iterator itDup;
-		for (int j = 0; j < Results.size(); j++) {
-			float dist = (Results[j].regionEnd - Results[j].regionStart + 1) * Results[j].queryToParentLocal;
-			//only add if you are not a duplicate
-			itDup = removeDups.find(Results[j].parent);
-			if (itDup == removeDups.end()) { //this is not duplicate
-				removeDups[Results[j].parent] = dist;
-			}else if (dist > itDup->second) { //is this a stronger number for this parent
-				removeDups[Results[j].parent] = dist;
-			}
-		}
-		
-		for (itDup = removeDups.begin(); itDup != removeDups.end(); itDup++) {
-			Sequence* seq = getSequence(itDup->first); //makes copy so you can filter and mask and not effect template
-			
-			SeqDist member;
-			member.seq = seq;
-			member.dist = itDup->second;
+		if (chimeraFlag == "yes") {
 			
-			seqs.push_back(member);
-		}
-		
-		//limit number of parents to explore - default 3
-		if (Results.size() > parents) {
-			//sort by distance
-			sort(seqs.begin(), seqs.end(), compareSeqDist);
-			//prioritize larger more similiar sequence fragments
-			reverse(seqs.begin(), seqs.end());
+			//get sequence that were given from maligner results
+			vector<SeqDist> seqs;
+			map<string, float> removeDups;
+			map<string, float>::iterator itDup;
+			map<string, string> parentNameSeq;
+			map<string, string>::iterator itSeq;
+			for (int j = 0; j < Results.size(); j++) {
+				float dist = (Results[j].regionEnd - Results[j].regionStart + 1) * Results[j].queryToParentLocal;
+				//only add if you are not a duplicate
+				itDup = removeDups.find(Results[j].parent);
+				if (itDup == removeDups.end()) { //this is not duplicate
+					removeDups[Results[j].parent] = dist;
+					parentNameSeq[Results[j].parent] = Results[j].parentAligned;
+				}else if (dist > itDup->second) { //is this a stronger number for this parent
+					removeDups[Results[j].parent] = dist;
+					parentNameSeq[Results[j].parent] = Results[j].parentAligned;
+				}
+			}
 			
-			for (int k = seqs.size()-1; k > (parents-1); k--)  {  
-				delete seqs[k].seq;
-				seqs.pop_back();	
+			for (itDup = removeDups.begin(); itDup != removeDups.end(); itDup++) {
+				//Sequence* seq = getSequence(itDup->first); //makes copy so you can filter and mask and not effect template
+				itSeq = parentNameSeq.find(itDup->first);
+//cout << itDup->first << itSeq->second << endl;
+				Sequence* seq = new Sequence(itDup->first, itSeq->second);
+				
+				SeqDist member;
+				member.seq = seq;
+				member.dist = itDup->second;
+				
+				seqs.push_back(member);
 			}
-		}
-		
-		//put seqs into vector to send to slayer
-		vector<Sequence*> seqsForSlayer;
-		for (int k = 0; k < seqs.size(); k++) {  seqsForSlayer.push_back(seqs[k].seq);	}
-		//cout << i+1 << "num parents = " << seqsForSlayer.size() << '\t' << chimeraFlag << endl;
-//ofstream out;
-//string name = querySeqs[i]->getName();
-//cout << name << endl;
-//name = name.substr(name.find_first_of("|")+1);
-//cout << name << endl;
-//name = name.substr(name.find_first_of("|")+1);
-//cout << name << endl;
-//name = name.substr(0, name.find_last_of("|"));
-//cout << name << endl;
-//string filename = toString(i+1) + ".seqsforslayer";
-//openOutputFile(filename, out);	
-//cout << querySeqs[i]->getName() << endl;
-//for (int u = 0; u < seqsForSlayer.size(); u++) { cout << seqsForSlayer[u]->getName() << '\t'; seqsForSlayer[u]->printSequence(out);	}
-//cout << endl;
-//out.close();
-//filename = toString(i+1) + ".fasta";
-//openOutputFile(filename, out);	
-//querySeqs[i]->printSequence(out);
-//out.close();
-
-
 			
-		//mask then send to slayer...
-		if (seqMask != "") {
-			decalc->setMask(seqMask);
+			//limit number of parents to explore - default 3
+			if (Results.size() > parents) {
+				//sort by distance
+				sort(seqs.begin(), seqs.end(), compareSeqDist);
+				//prioritize larger more similiar sequence fragments
+				reverse(seqs.begin(), seqs.end());
+				
+				for (int k = seqs.size()-1; k > (parents-1); k--)  {  
+					delete seqs[k].seq;
+					seqs.pop_back();	
+				}
+			}
 			
-			//mask querys
-			decalc->runMask(query);
+			//put seqs into vector to send to slayer
+			vector<Sequence*> seqsForSlayer;
+			for (int k = 0; k < seqs.size(); k++) {  seqsForSlayer.push_back(seqs[k].seq);	}
 			
-			//mask parents
-			for (int k = 0; k < seqsForSlayer.size(); k++) {
-				decalc->runMask(seqsForSlayer[k]);
+			//mask then send to slayer...
+			if (seqMask != "") {
+				decalc->setMask(seqMask);
+				
+				//mask querys
+				decalc->runMask(query);
+				
+				//mask parents
+				for (int k = 0; k < seqsForSlayer.size(); k++) {
+					decalc->runMask(seqsForSlayer[k]);
+				}
+				
+				spotMap = decalc->getMaskMap();
 			}
 			
-			spotMap = decalc->getMaskMap();
+			//send to slayer
+			chimeraFlags = slayer->getResults(query, seqsForSlayer);
+			chimeraResults = slayer->getOutput();
+			
+			//free memory
+			for (int k = 0; k < seqs.size(); k++) {  delete seqs[k].seq;   }
 		}
 		
-		//send to slayer
-		chimeraFlags = slayer->getResults(query, seqsForSlayer);
-		chimeraResults = slayer->getOutput();
-	
-		//free memory
-		for (int k = 0; k < seqs.size(); k++) {  delete seqs[k].seq;   }
-		//}
-			
+		delete maligner;
+		delete slayer;
+		
 		return 0;
 	}
 	catch(exception& e) {
@@ -168,28 +253,31 @@ int ChimeraSlayer::getChimeras(Sequence* query) {
 //***************************************************************************************************************
 void ChimeraSlayer::printBlock(data_struct data, ostream& out){
 	try {
+	//out << "Name\tParentA\tParentB\tDivQLAQRB\tPerIDQLAQRB\tBootStrapA\tDivQLBQRA\tPerIDQLBQRA\tBootStrapB\tFlag\tLeftWindow\tRightWindow\n";
+		
+		out << querySeq->getName() << '\t';
+		out << data.parentA.getName() << "\t" << data.parentB.getName()  << '\t';
+		//out << "Left Window: " << spotMap[data.winLStart] << " " << spotMap[data.winLEnd] << endl;
+		//out << "Right Window: " << spotMap[data.winRStart] << " " << spotMap[data.winREnd] << endl;
+		
+		out << data.divr_qla_qrb << '\t' << data.qla_qrb << '\t' << data.bsa << '\t';
+		out << data.divr_qlb_qra << '\t' << data.qlb_qra << '\t' << data.bsb << '\t';
 		
-		out << "parentA: " << data.parentA.getName() << "  parentB: " << data.parentB.getName()  << endl;
-		out << "Left Window: " << spotMap[data.winLStart] << " " << spotMap[data.winLEnd] << endl;
-		out << "Right Window: " << spotMap[data.winRStart] << " " << spotMap[data.winREnd] << endl;
+		out << "yes\t" << spotMap[data.winLStart] << "-" << spotMap[data.winLEnd] << '\t' << spotMap[data.winRStart] << "-" << spotMap[data.winREnd] << '\t';
 		
-		out << "Divergence of Query to Leftside ParentA and Rightside ParentB: " << data.divr_qla_qrb << '\t' << "Bootstrap: " << data.bsa << endl;
-		out << "Divergence of Query to Rightside ParentA and Leftside ParentB: " << data.divr_qlb_qra << '\t' << "Bootstrap: " << data.bsb << endl;
+		//out << "Similarity of parents: " << data.ab << endl;
+		//out << "Similarity of query to parentA: " << data.qa << endl;
+		//out << "Similarity of query to parentB: " << data.qb << endl;
 		
-		out << "Similarity of parents: " << data.ab << endl;
-		out << "Similarity of query to parentA: " << data.qa << endl;
-		out << "Similarity of query to parentB: " << data.qb << endl;
 		
-		out << "Percent_ID QLA_QRB: " << data.qla_qrb << endl;
-		out << "Percent_ID QLB_QRA: " << data.qlb_qra << endl;
 		//out << "Per_id(QL,A): " << data.qla << endl;
 		//out << "Per_id(QL,B): " << data.qlb << endl;
 		//out << "Per_id(QR,A): " << data.qra << endl;
 		//out << "Per_id(QR,B): " << data.qrb << endl;
 
 		
-		out << "DeltaL: " << (data.qla - data.qlb) << endl;
-		out << "DeltaR: " << (data.qra - data.qrb) << endl;
+		//out << "DeltaL: " << (data.qla - data.qlb) << endl;
+		//out << "DeltaR: " << (data.qra - data.qrb) << endl;
 
 	}
 	catch(exception& e) {
diff --git a/chimeraslayer.h b/chimeraslayer.h
index f8006c4..bf9cf7f 100644
--- a/chimeraslayer.h
+++ b/chimeraslayer.h
@@ -23,12 +23,13 @@
 class ChimeraSlayer : public Chimera {
 	
 	public:
-		ChimeraSlayer(string, bool);	
+		ChimeraSlayer(string, bool, string);	
 		~ChimeraSlayer();
 		
 		int getChimeras(Sequence*);
 		void print(ostream&);
 		void printHeader(ostream&);
+		void doPrep();
 		
 	private:
 		Sequence* querySeq;
@@ -36,13 +37,15 @@ class ChimeraSlayer : public Chimera {
 		Maligner* maligner;
 		Slayer* slayer;
 		map<int, int>  spotMap;
+		Database* databaseRight;
+		Database* databaseLeft;
 		
 		vector<data_struct>  chimeraResults;
-		string chimeraFlags, searchMethod;
-		string fastafile;
+		string chimeraFlags, searchMethod, fastafile;
 		bool realign;
 	
 		void printBlock(data_struct, ostream&);
+		
 };
 
 /************************************************************************/
diff --git a/cluster.cpp b/cluster.cpp
index 00d8083..931b44d 100644
--- a/cluster.cpp
+++ b/cluster.cpp
@@ -216,6 +216,10 @@ void Cluster::update(){
 		for (int i=nColCells-1;i>=0;i--) {
 			if (found[i] == 0) {
 				removeCell(colCells[i], -1, i);
+cout << "smallRow = " << smallRow+1 << " smallCol = " << smallCol+1 << endl;
+				if (!((colCells[i]->row == smallRow) && (colCells[i]->column == smallCol))) {
+					mothurOut("Warning: merging " + toString(colCells[i]->row+1)  + " " + toString(colCells[i]->column+1) + " distance " + toString(colCells[i]->dist) + " value above cutoff. Results will differ from those if cutoff was used in the read.dist command."); mothurOutEndLine();
+				}
 			}
 		}
 	}
diff --git a/decalc.cpp b/decalc.cpp
index 61de7c8..c5f0c26 100644
--- a/decalc.cpp
+++ b/decalc.cpp
@@ -11,6 +11,7 @@
 #include "chimera.h"
 #include "dist.h"
 #include "eachgapdist.h"
+#include "ignoregaps.h"
 
 
 //***************************************************************************************************************
@@ -680,11 +681,13 @@ float DeCalculator::getCoef(vector<float> obs, vector<float> qav) {
 }
 //***************************************************************************************************************
 //gets closest matches to each end, since chimeras will most likely have different parents on each end
-vector<Sequence*> DeCalculator::findClosest(Sequence* querySeq, vector<Sequence*> db, int numWanted) {
+vector<Sequence*> DeCalculator::findClosest(Sequence* querySeq, vector<Sequence*> db, int& numWanted, vector<int>& indexes) {
 	try {
+		indexes.clear();
 		
 		vector<Sequence*> seqsMatches;  
-		vector<SeqDist> dists;
+		vector<SeqDist> distsLeft;
+		vector<SeqDist> distsRight;
 		
 		Dist* distcalculator = new eachGapDist();
 		
@@ -696,12 +699,19 @@ vector<Sequence*> DeCalculator::findClosest(Sequence* querySeq, vector<Sequence*
 		string queryAligned = querySeq->getAligned();
 		
 		//left side
+		bool foundFirstBase = false;
 		int baseCount = 0;
 		int leftSpot = 0;
+		int firstBaseSpot = 0;
 		for (int i = 0; i < queryAligned.length(); i++) {
-			leftQuery += queryAligned[i];
 			//if you are a base
-			if ((queryAligned[i] != '.') && (queryAligned[i] != '-')) {		baseCount++;	}
+			if (isalpha(queryAligned[i])) {		
+				baseCount++; 
+				if (!foundFirstBase) {   foundFirstBase = true;  firstBaseSpot = i;  }
+			}
+			
+			//eliminate opening .'s
+			if (foundFirstBase) {   leftQuery += queryAligned[i];  }
 			//if you have 1/3
 			if (baseCount >= numBases) {  leftSpot = i; break; } //first 1/3
 		}
@@ -711,21 +721,31 @@ vector<Sequence*> DeCalculator::findClosest(Sequence* querySeq, vector<Sequence*
 		int rightSpot = 0;
 		for (int i = leftSpot; i < queryAligned.length(); i++) {
 			//if you are a base
-			if ((queryAligned[i] != '.') && (queryAligned[i] != '-')) {		baseCount++;	}
+			if (isalpha(queryAligned[i])) {		baseCount++;	}
 			//if you have 1/3
 			if (baseCount >= numBases) { rightSpot = i;  break; } //last 1/3
 		}
-		rightQuery = queryAligned.substr(rightSpot); //sequence from pos spot to end
+		
+		//trim end
+		//find last position in query that is a non gap character
+		int lastBaseSpot = queryAligned.length()-1;
+		for (int j = queryAligned.length()-1; j >= 0; j--) {
+			if (isalpha(queryAligned[j])) {
+				lastBaseSpot = j;
+				break;
+			}
+		}
+		rightQuery = queryAligned.substr(rightSpot, (lastBaseSpot-rightSpot)); //sequence from pos spot to end
 		
 		Sequence queryLeft(querySeq->getName(), leftQuery);
 		Sequence queryRight(querySeq->getName(), rightQuery);
-//cout << querySeq->getName() << '\t' << leftSpot << '\t' << rightSpot << endl;
+//cout << querySeq->getName() << '\t' << leftSpot << '\t' << rightSpot << '\t' << firstBaseSpot << '\t' << lastBaseSpot << endl;
 //cout << queryUnAligned.length() << '\t' << queryLeft.getUnaligned().length() << '\t' << queryRight.getUnaligned().length() << endl;
 		for(int j = 0; j < db.size(); j++){
 			
 			string dbAligned = db[j]->getAligned();
-			string leftDB = dbAligned.substr(0, leftSpot+1); //first 1/3 of the sequence
-			string rightDB = dbAligned.substr(rightSpot); //last 1/3 of the sequence
+			string leftDB = dbAligned.substr(firstBaseSpot, (leftSpot-firstBaseSpot+1)); //first 1/3 of the sequence
+			string rightDB = dbAligned.substr(rightSpot, (lastBaseSpot-rightSpot)); //last 1/3 of the sequence
 			
 			Sequence dbLeft(db[j]->getName(), leftDB);
 			Sequence dbRight(db[j]->getName(), rightDB);
@@ -736,22 +756,78 @@ vector<Sequence*> DeCalculator::findClosest(Sequence* querySeq, vector<Sequence*
 			distcalculator->calcDist(queryRight, dbRight);
 			float distRight = distcalculator->getDist();
 			
-			float dist = min(distLeft, distRight); //keep smallest distance
+			SeqDist subjectLeft;
+			subjectLeft.seq = db[j];
+			subjectLeft.dist = distLeft;
+			subjectLeft.index = j;
+			
+			distsLeft.push_back(subjectLeft);
 			
-			SeqDist subject;
-			subject.seq = db[j];
-			subject.dist = dist;
+			SeqDist subjectRight;
+			subjectRight.seq = db[j];
+			subjectRight.dist = distRight;
+			subjectRight.index = j;
 			
-			dists.push_back(subject);
+			distsRight.push_back(subjectRight);
+
 		}
 		
 		delete distcalculator;
 		
-		sort(dists.begin(), dists.end(), compareSeqDist);
+		//sort by smallest distance
+		sort(distsRight.begin(), distsRight.end(), compareSeqDist);
+		sort(distsLeft.begin(), distsLeft.end(), compareSeqDist);
+		
+		//merge results		
+		map<string, string> seen;
+		map<string, string>::iterator it;
 		
+		vector<SeqDist> dists;
+		float lastRight = distsRight[0].dist;
+		float lastLeft = distsLeft[0].dist;
+		int lasti = 0;
+		for (int i = 0; i < distsLeft.size(); i++) {
+			//add left if you havent already
+			it = seen.find(distsLeft[i].seq->getName());
+			if (it == seen.end()) {  
+				dists.push_back(distsLeft[i]);
+				seen[distsLeft[i].seq->getName()] = distsLeft[i].seq->getName();
+				lastLeft =  distsLeft[i].dist;
+			}
+
+			//add right if you havent already
+			it = seen.find(distsRight[i].seq->getName());
+			if (it == seen.end()) {  
+				dists.push_back(distsRight[i]);
+				seen[distsRight[i].seq->getName()] = distsRight[i].seq->getName();
+				lastRight =  distsRight[i].dist;
+			}
+			
+			if (dists.size() > numWanted) { lasti = i; break; } //you have enough results
+		}
+		
+		//add in dups
+		lasti++;
+		int i = lasti;
+		while (i < distsLeft.size()) {  
+			if (distsLeft[i].dist == lastLeft) {  dists.push_back(distsLeft[i]);  numWanted++; }
+			else { break; }
+			i++;
+		}
+		
+		i = lasti;
+		while (i < distsRight.size()) {  
+			if (distsRight[i].dist == lastRight) {  dists.push_back(distsRight[i]);  numWanted++; }
+			else { break; }
+			i++;
+		}
+
+//cout << numWanted << endl;
 		for (int i = 0; i < numWanted; i++) {
+//cout << dists[i].seq->getName() << '\t' << dists[i].dist << endl;
 			Sequence* temp = new Sequence(dists[i].seq->getName(), dists[i].seq->getAligned()); //have to make a copy so you can trim and filter without stepping on eachother.
 			seqsMatches.push_back(temp);
+			indexes.push_back(dists[i].index);
 		}
 		
 		return seqsMatches;
@@ -761,6 +837,38 @@ vector<Sequence*> DeCalculator::findClosest(Sequence* querySeq, vector<Sequence*
 		exit(1);
 	}
 }
+//***************************************************************************************************************
+Sequence* DeCalculator::findClosest(Sequence* querySeq, vector<Sequence*> db) {
+	try {
+		
+		Sequence* seqsMatch;  
+		
+		Dist* distcalculator = new eachGapDist();
+		int index = 0;
+		int smallest = 1000000;
+		
+		for(int j = 0; j < db.size(); j++){
+			
+			distcalculator->calcDist(*querySeq, *db[j]);
+			float dist = distcalculator->getDist();
+			
+			if (dist < smallest) { 
+				smallest = dist;
+				index = j;
+			}
+		}
+		
+		delete distcalculator;
+		
+		seqsMatch = new Sequence(db[index]->getName(), db[index]->getAligned()); //have to make a copy so you can trim and filter without stepping on eachother.
+			
+		return seqsMatch;
+	}
+	catch(exception& e) {
+		errorOut(e, "DeCalculator", "findClosest");
+		exit(1);
+	}
+}
 /***************************************************************************************************************/
 map<int, int> DeCalculator::trimSeqs(Sequence* query, vector<Sequence*> topMatches) {
 	try {
@@ -838,10 +946,10 @@ map<int, int> DeCalculator::trimSeqs(Sequence* query, vector<Sequence*> topMatch
 
 		//check to make sure that is not whole seq
 		if ((rearPos - frontPos - 1) <= 0) {  mothurOut("Error, when I trim your sequences, the entire sequence is trimmed."); mothurOutEndLine(); exit(1);  }
-//cout << "front = " << frontPos << " rear = " << rearPos << endl;		
+//cout << query->getName() << " front = " << frontPos << " rear = " << rearPos << endl;		
 		//trim query
 		string newAligned = query->getAligned();
-		newAligned = newAligned.substr(frontPos, (rearPos-frontPos));
+		newAligned = newAligned.substr(frontPos, (rearPos-frontPos+1));
 		query->setAligned(newAligned);
 		
 		//trim topMatches
diff --git a/decalc.h b/decalc.h
index 0119936..48a72bb 100644
--- a/decalc.h
+++ b/decalc.h
@@ -39,7 +39,8 @@ class DeCalculator {
 		DeCalculator() {};
 		~DeCalculator() {};
 		
-		vector<Sequence*> findClosest(Sequence*, vector<Sequence*>, int);  //takes querySeq, a reference db and numWanted - returns indexes to closest seqs in db
+		vector<Sequence*> findClosest(Sequence*, vector<Sequence*>, int&, vector<int>&);  //takes querySeq, a reference db, numWanted and indexes 
+		Sequence* findClosest(Sequence*, vector<Sequence*>);
 		set<int> getPos() {  return h;  }
 		void setMask(string); 
 		void setAlignmentLength(int l) {  alignLength = l;  }
diff --git a/maligner.cpp b/maligner.cpp
index b393194..91b0fa7 100644
--- a/maligner.cpp
+++ b/maligner.cpp
@@ -8,12 +8,12 @@
  */
 
 #include "maligner.h"
-#include "database.hpp"
+#include "kmerdb.hpp"
 #include "blastdb.hpp"
 
 /***********************************************************************/
-Maligner::Maligner(vector<Sequence*> temp, int num, int match, int misMatch, float div, int ms, int minCov, string mode) :
-		db(temp), numWanted(num), matchScore(match), misMatchPenalty(misMatch), minDivR(div), minSimilarity(ms), minCoverage(minCov), searchMethod(mode) {}
+Maligner::Maligner(vector<Sequence*> temp, int num, int match, int misMatch, float div, int ms, int minCov, string mode, Database* dataLeft, Database* dataRight) :
+		db(temp), numWanted(num), matchScore(match), misMatchPenalty(misMatch), minDivR(div), minSimilarity(ms), minCoverage(minCov), searchMethod(mode), databaseLeft(dataLeft), databaseRight(dataRight) {}
 /***********************************************************************/
 string Maligner::getResults(Sequence* q, DeCalculator* decalc) {
 	try {
@@ -25,15 +25,17 @@ string Maligner::getResults(Sequence* q, DeCalculator* decalc) {
 		
 		string chimera;
 		
-		if (searchMethod != "blast") {
+		if (searchMethod == "distance") {
 			//find closest seqs to query in template - returns copies of seqs so trim does not destroy - remember to deallocate
-			refSeqs = decalc->findClosest(query, db, numWanted);
-		}else{
-			refSeqs = getBlastSeqs(query, numWanted);
-		}
+			refSeqs = decalc->findClosest(query, db, numWanted, indexes);
+		}else if (searchMethod == "blast")  {
+			refSeqs = getBlastSeqs(query, numWanted); //fills indexes
+		}else if (searchMethod == "kmer") {
+			refSeqs = getKmerSeqs(query, numWanted); //fills indexes
+		}else { mothurOut("not valid search."); exit(1);  } //should never get here
 		
 		refSeqs = minCoverageFilter(refSeqs);
-		
+
 		if (refSeqs.size() < 2)  { 
 			for (int i = 0; i < refSeqs.size(); i++) {  delete refSeqs[i];	}
 			percentIdenticalQueryChimera = 0.0;
@@ -41,13 +43,14 @@ string Maligner::getResults(Sequence* q, DeCalculator* decalc) {
 		}
 		
 		int chimeraPenalty = computeChimeraPenalty();
-	
+
 		//fills outputResults
 		chimera = chimeraMaligner(chimeraPenalty, decalc);
-		
+	
 				
 		//free memory
 		delete query;
+
 		for (int i = 0; i < refSeqs.size(); i++) {  delete refSeqs[i];	}
 		
 		return chimera;
@@ -65,31 +68,31 @@ string Maligner::chimeraMaligner(int chimeraPenalty, DeCalculator* decalc) {
 		
 		//trims seqs to first non gap char in all seqs and last non gap char in all seqs
 		spotMap = decalc->trimSeqs(query, refSeqs);
-		
+
 		vector<Sequence*> temp = refSeqs;
 		temp.push_back(query);
 		
 		verticalFilter(temp);
+//for (int i = 0; i < temp.size(); i++) { cout << temp[i]->getName() << '\n' << temp[i]->getAligned() << endl; } return "no";
 
 		vector< vector<score_struct> > matrix = buildScoreMatrix(query->getAligned().length(), refSeqs.size()); //builds and initializes
 		
 		fillScoreMatrix(matrix, refSeqs, chimeraPenalty);
-		
+	
 		vector<score_struct> path = extractHighestPath(matrix);
 		
 		vector<trace_struct> trace = mapTraceRegionsToAlignment(path, refSeqs);
-		
+	
 		if (trace.size() > 1) {		chimera = "yes";	}
 		else { chimera = "no";	}
 		
 		int traceStart = path[0].col;
-		int traceEnd = path[path.size()-1].col;
-		
+		int traceEnd = path[path.size()-1].col;	
 		string queryInRange = query->getAligned();
 		queryInRange = queryInRange.substr(traceStart, (traceEnd-traceStart+1));
 	
 		string chimeraSeq = constructChimericSeq(trace, refSeqs);
-		
+	
 		percentIdenticalQueryChimera = computePercentID(queryInRange, chimeraSeq);
 		
 		//save output results
@@ -101,6 +104,7 @@ string Maligner::chimeraMaligner(int chimeraPenalty, DeCalculator* decalc) {
 			results temp;
 			
 			temp.parent = refSeqs[seqIndex]->getName();
+			temp.parentAligned = db[indexes[seqIndex]]->getAligned();
 			temp.nastRegionStart = spotMap[regionStart];
 			temp.nastRegionEnd = spotMap[regionEnd];
 			temp.regionStart = regionStart;
@@ -481,6 +485,8 @@ float Maligner::computePercentID(string queryAlign, string chimera) {
 //***************************************************************************************************************
 vector<Sequence*> Maligner::getBlastSeqs(Sequence* q, int num) {
 	try {	
+		indexes.clear();
+		vector<Sequence*> refResults;
 		//generate blastdb
 		Database* database = new BlastDB(-2.0, -1.0, matchScore, misMatchPenalty);
 		for (int i = 0; i < db.size(); i++) { 	database->addSequence(*db[i]);	}
@@ -498,6 +504,82 @@ vector<Sequence*> Maligner::getBlastSeqs(Sequence* q, int num) {
 		vector<int> tempIndexesRight = database->findClosestMegaBlast(queryRight, num+1);
 		vector<int> tempIndexesLeft = database->findClosestMegaBlast(queryLeft, num+1);
 		
+		//if ((tempIndexesRight.size() != (num+1)) || (tempIndexesLeft.size() != (num+1)))  {  mothurOut("megablast returned " + toString(tempIndexesRight.size()) + " results for the right end, and " + toString(tempIndexesLeft.size()) + " for the left end. Needed " + toString(num+1) + ". Unable to porcess sequence " + q->getName()); mothurOutEndLine(); return refResults; }
+		
+		vector<int> smaller;
+		vector<int> larger;
+		
+		if (tempIndexesRight.size() < tempIndexesLeft.size()) { smaller = tempIndexesRight;  larger = tempIndexesLeft; }
+		else { smaller = tempIndexesLeft; larger = tempIndexesRight; } 
+		
+		//merge results		
+		map<int, int> seen;
+		map<int, int>::iterator it;
+		
+		vector<int> mergedResults;
+		for (int i = 0; i < smaller.size(); i++) {
+			//add left if you havent already
+			it = seen.find(smaller[i]);
+			if (it == seen.end()) {  
+				mergedResults.push_back(smaller[i]);
+				seen[smaller[i]] = smaller[i];
+			}
+
+			//add right if you havent already
+			it = seen.find(larger[i]);
+			if (it == seen.end()) {  
+				mergedResults.push_back(larger[i]);
+				seen[larger[i]] = larger[i];
+			}
+		}
+		
+		for (int i = smaller.size(); i < larger.size(); i++) {
+			it = seen.find(larger[i]);
+			if (it == seen.end()) {  
+				mergedResults.push_back(larger[i]);
+				seen[larger[i]] = larger[i];
+			}
+		}
+		
+		if (mergedResults.size() < numWanted) { numWanted = mergedResults.size(); }
+//cout << q->getName() <<  endl;		
+		for (int i = 0; i < numWanted; i++) {
+//cout << db[mergedResults[i]]->getName() << endl;	
+			if (db[mergedResults[i]]->getName() != q->getName()) { 
+				Sequence* temp = new Sequence(db[mergedResults[i]]->getName(), db[mergedResults[i]]->getAligned());
+				refResults.push_back(temp);
+				indexes.push_back(mergedResults[i]);
+			}
+//cout << mergedResults[i] << endl;
+		}
+//cout << endl;		
+		delete queryRight;
+		delete queryLeft;
+		delete database;
+		
+		return refResults;
+	}
+	catch(exception& e) {
+		errorOut(e, "Maligner", "getBlastSeqs");
+		exit(1);
+	}
+}
+//***************************************************************************************************************
+vector<Sequence*> Maligner::getKmerSeqs(Sequence* q, int num) {
+	try {	
+		indexes.clear();
+		
+		//get parts of query
+		string queryUnAligned = q->getUnaligned();
+		string leftQuery = queryUnAligned.substr(0, int(queryUnAligned.length() * 0.33)); //first 1/3 of the sequence
+		string rightQuery = queryUnAligned.substr(int(queryUnAligned.length() * 0.66)); //last 1/3 of the sequence
+
+		Sequence* queryLeft = new Sequence(q->getName(), leftQuery);
+		Sequence* queryRight = new Sequence(q->getName(), rightQuery);
+		
+		vector<int> tempIndexesLeft = databaseLeft->findClosestSequences(queryLeft, numWanted);
+		vector<int> tempIndexesRight = databaseRight->findClosestSequences(queryRight, numWanted);
+		
 		//merge results		
 		map<int, int> seen;
 		map<int, int>::iterator it;
@@ -519,16 +601,17 @@ vector<Sequence*> Maligner::getBlastSeqs(Sequence* q, int num) {
 			}
 		}
 		
-		
+//cout << q->getName() << endl;		
 		vector<Sequence*> refResults;
 		for (int i = 0; i < numWanted; i++) {
+//cout << db[mergedResults[i]]->getName() << endl;	
 			Sequence* temp = new Sequence(db[mergedResults[i]]->getName(), db[mergedResults[i]]->getAligned());
 			refResults.push_back(temp);
+			indexes.push_back(mergedResults[i]);
 		}
-		
+//cout << endl;		
 		delete queryRight;
 		delete queryLeft;
-		delete database;
 		
 		return refResults;
 	}
@@ -537,6 +620,5 @@ vector<Sequence*> Maligner::getBlastSeqs(Sequence* q, int num) {
 		exit(1);
 	}
 }
-
 //***************************************************************************************************************
 
diff --git a/maligner.h b/maligner.h
index abe18a3..13aa917 100644
--- a/maligner.h
+++ b/maligner.h
@@ -11,6 +11,7 @@
  
 #include "decalc.h"
 #include "chimera.h"
+#include "database.hpp"
 
 /***********************************************************************/
 //This class was modeled after the chimeraMaligner written by the Broad Institute
@@ -19,7 +20,7 @@ class Maligner {
 
 	public:
 		
-		Maligner(vector<Sequence*>, int, int, int, float, int, int, string);
+		Maligner(vector<Sequence*>, int, int, int, float, int, int, string, Database*, Database*);
 		~Maligner() {};
 		
 		string getResults(Sequence*, DeCalculator*);
@@ -35,7 +36,10 @@ class Maligner {
 		string searchMethod;
 		float minDivR, percentIdenticalQueryChimera;
 		vector<results> outputResults;
+		vector<int> indexes;  //stores index into template seqs of the refSeqs, so we can return the whole sequence rather than the trimmed and filtered one
 		map<int, int> spotMap;
+		Database* databaseLeft;
+		Database* databaseRight;
 		
 		vector<Sequence*> minCoverageFilter(vector<Sequence*>);  //removes top matches that do not have minimum coverage with query.
 		int computeChimeraPenalty();
@@ -49,6 +53,7 @@ class Maligner {
 		float computePercentID(string, string);
 		string chimeraMaligner(int, DeCalculator*);
 		vector<Sequence*> getBlastSeqs(Sequence*, int);
+		vector<Sequence*> getKmerSeqs(Sequence*, int);
 		
 };
 
diff --git a/mothur.cpp b/mothur.cpp
index 8203c57..7acf389 100644
--- a/mothur.cpp
+++ b/mothur.cpp
@@ -93,12 +93,21 @@ int main(int argc, char *argv[]){
 			input = argv[1];
 
 			if (input[0] == '#') {
+				mothurOutJustToLog("Script Mode");
+				mothurOutEndLine(); mothurOutEndLine();
+
 				mothur = new ScriptEngine(argv[0], argv[1]);
 			}else{
+				mothurOutJustToLog("Batch Mode");
+				mothurOutEndLine(); mothurOutEndLine();
+				
 				mothur = new BatchEngine(argv[0], argv[1]);
 			}
 		}
 		else{
+			mothurOutJustToLog("Interactive Mode");
+			mothurOutEndLine(); mothurOutEndLine();
+			
 			mothur = new InteractEngine(argv[0]);	
 		}
 		
diff --git a/pintail.cpp b/pintail.cpp
index 74e1ce4..46da913 100644
--- a/pintail.cpp
+++ b/pintail.cpp
@@ -39,7 +39,8 @@ Pintail::~Pintail() {
 //***************************************************************************************************************
 void Pintail::doPrep() {
 	try {
-	
+		
+		mergedFilterString = "";
 		windowSizesTemplate.resize(templateSeqs.size(), window);
 		quantiles.resize(100);  //one for every percent mismatch
 		quantilesMembers.resize(100);  //one for every percent mismatch
@@ -72,14 +73,15 @@ void Pintail::doPrep() {
 			mothurOut("Done."); mothurOutEndLine();
 		}else				{   probabilityProfile = readFreq();	mothurOut("Done.");		  }
 		mothurOutEndLine();
-	
-		//quantiles are used to determine whether the de values found indicate a chimera
-		//if you have to calculate them, its time intensive because you are finding the de and deviation values for each 
-		//combination of sequences in the template
-		if (quanfile != "") {  quantiles = readQuantiles();  }
-		else {
+		
+		//make P into Q
+		for (int i = 0; i < probabilityProfile.size(); i++)  {	probabilityProfile[i] = 1 - probabilityProfile[i];  }  //cout << i << '\t' << probabilityProfile[i] << endl;
+		
+		bool reRead = false;
+		//create filter if needed for later
+		if (filter) {
+			reRead = true;
 			
-			//if user has not provided the quantiles and wants the mask we need to mask, but then delete and reread or we will mess up the best match process in getChimeras
 			if (seqMask != "") {
 				//mask templates
 				for (int i = 0; i < templateSeqs.size(); i++) {
@@ -87,11 +89,44 @@ void Pintail::doPrep() {
 				}
 			}
 			
-			if (filter) {
-				createFilter(templateSeqs);
-				for (int i = 0; i < templateSeqs.size(); i++) {  runFilter(templateSeqs[i]);  }
+			//read in all query seqs
+			ifstream in; 
+			openInputFile(fastafile, in);
+			
+			vector<Sequence*> tempQuerySeqs;
+			while(!in.eof()){
+				Sequence* s = new Sequence(in);
+				gobble(in);
+				
+				if (s->getName() != "") { tempQuerySeqs.push_back(s); }
+			}
+			in.close();
+			
+			vector<Sequence*> temp;
+			//merge query seqs and template seqs
+			temp = templateSeqs;
+			for (int i = 0; i < tempQuerySeqs.size(); i++) {  temp.push_back(tempQuerySeqs[i]);  }
+			
+			mergedFilterString = createFilter(temp, 0.5);
+			
+			//reread template seqs
+			for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i];  }
+		}
+		
+		
+		//quantiles are used to determine whether the de values found indicate a chimera
+		//if you have to calculate them, its time intensive because you are finding the de and deviation values for each 
+		//combination of sequences in the template
+		if (quanfile != "") {  
+			quantiles = readQuantiles(); 
+		}else {
+			if ((!filter) && (seqMask != "")) { //if you didn't filter but you want to mask. if you filtered then you did mask first above.
+				reRead = true;
+				//mask templates
+				for (int i = 0; i < templateSeqs.size(); i++) {
+					decalc->runMask(templateSeqs[i]);
+				}
 			}
-
 			
 			mothurOut("Calculating quantiles for your template.  This can take a while...  I will output the quantiles to a .quan file that you can input them using the quantiles parameter next time you run this command.  Providing the .quan file will dramatically improve speed.    "); cout.flush();
 			if (processors == 1) { 
@@ -155,15 +190,15 @@ void Pintail::doPrep() {
 			}
 
 			mothurOut("Done."); mothurOutEndLine();
-			
-			//if mask reread template
-			if ((seqMask != "") || (filter)) {
-				for (int i = 0; i < templateSeqs.size(); i++)		{  delete templateSeqs[i];		}
-				templateSeqs = readSeqs(templateFileName);
-			}
-			
 		}
 		
+		if (reRead) {
+			for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i];  }
+			templateSeqs.clear();
+			templateSeqs = readSeqs(templateFileName);
+		}
+
+		
 		//free memory
 		for (int i = 0; i < templateLines.size(); i++) { delete templateLines[i];  }
 		
@@ -217,24 +252,22 @@ int Pintail::getChimeras(Sequence* query) {
 		querySeq = query;
 		trimmed.clear();
 		windowSizes = window;
-				
-		//find pairs
-		bestfit = findPairs(query);
 							
-		//make P into Q
-		for (int i = 0; i < probabilityProfile.size(); i++)  {	probabilityProfile[i] = 1 - probabilityProfile[i];  }  //cout << i << '\t' << probabilityProfile[i] << endl;
-	
-		//mask sequences if the user wants to 
+		//find pairs has to be done before a mask
+		bestfit = findPairs(query);
+		
+		//if they mask  
 		if (seqMask != "") {
 			decalc->runMask(query);
 			decalc->runMask(bestfit);
 		}
-		
-		if (filter) {
+
+		if (filter) { //must be done after a mask
 			runFilter(query);
 			runFilter(bestfit);
 		}
 		
+				
 		//trim seq
 		decalc->trimSeqs(query, bestfit, trimmed);  
 		
@@ -319,9 +352,7 @@ Sequence* Pintail::findPairs(Sequence* q) {
 		
 		Sequence* seqsMatches;  
 		
-		vector<Sequence*> copy = decalc->findClosest(q, templateSeqs, 1);
-		seqsMatches = copy[0];
-		
+		seqsMatches = decalc->findClosest(q, templateSeqs);
 		return seqsMatches;
 	
 	}
diff --git a/pintail.h b/pintail.h
index ed52c3a..efb8ba6 100644
--- a/pintail.h
+++ b/pintail.h
@@ -65,6 +65,7 @@ class Pintail : public Chimera {
 		vector< vector<float> > quantiles;  //quantiles[0] is the vector of deviations with ceiling score of 1, quantiles[1] is the vector of deviations with ceiling score of 2...
 		vector< vector<quanMember> > quantilesMembers;  //quantiles[0] is the vector of deviations with ceiling score of 1, quantiles[1] is the vector of deviations with ceiling score of 2...
 		set<int>  h;
+		string mergedFilterString;
 		
 		
 		vector<float> readFreq();
diff --git a/removeseqscommand.cpp b/removeseqscommand.cpp
index a8f03e0..19a4bc0 100644
--- a/removeseqscommand.cpp
+++ b/removeseqscommand.cpp
@@ -22,7 +22,7 @@ RemoveSeqsCommand::RemoveSeqsCommand(string option){
 		
 		else {
 			//valid paramters for this command
-			string Array[] =  {"fasta","name", "group", "alignreport", "accnos", "list","outputdir","inputdir" };
+			string Array[] =  {"fasta","name", "group", "alignreport", "accnos", "list","outputdir","inputdir", "dups" };
 			vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
 			
 			OptionParser parser(option);
@@ -119,10 +119,17 @@ RemoveSeqsCommand::RemoveSeqsCommand(string option){
 			if (listfile == "not open") { abort = true; }
 			else if (listfile == "not found") {  listfile = "";  }
 			
+			string usedDups = "true";
+			string temp = validParameter.validFile(parameters, "dups", false);	if (temp == "not found") { temp = "false"; usedDups = ""; }
+			dups = isTrue(temp);
+			
 			if ((fastafile == "") && (namefile == "") && (groupfile == "") && (alignfile == "") && (listfile == ""))  { mothurOut("You must provide one of the following: fasta, name, group, alignreport or list."); mothurOutEndLine(); abort = true; }
 			
 			int okay = 2;
 			if (outputDir != "") { okay++; }
+			if (usedDups != "") { okay++;  }
+			
+			if ((usedDups != "") && (namefile == "")) {  mothurOut("You may only use dups with the name option."); mothurOutEndLine();  abort = true; }
 			
 			if (parameters.size() > okay) { mothurOut("You may only enter one of the following: fasta, name, group, alignreport, or list."); mothurOutEndLine(); abort = true;  }
 		}
@@ -139,7 +146,8 @@ void RemoveSeqsCommand::help(){
 	try {
 		mothurOut("The remove.seqs command reads an .accnos file and one of the following file types: fasta, name, group, list or alignreport file.\n");
 		mothurOut("It outputs a file containing the sequences NOT in the .accnos file.\n");
-		mothurOut("The remove.seqs command parameters are accnos, fasta, name, group, list and alignreport.  You must provide accnos and one of the other parameters.\n");
+		mothurOut("The remove.seqs command parameters are accnos, fasta, name, group, list, alignreport and dups.  You must provide accnos and one of the file parameters.\n");
+		mothurOut("The dups parameter allows you to remove the entire line from a name file if you remove any name from the line. default=false. If dups=true, then remove.seqs outputs a new .accnos file containing all the sequences removed. \n");
 		mothurOut("The remove.seqs command should be in the following format: remove.seqs(accnos=yourAccnos, fasta=yourFasta).\n");
 		mothurOut("Example remove.seqs(accnos=amazon.accnos, fasta=amazon.fasta).\n");
 		mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n\n");
@@ -288,7 +296,12 @@ void RemoveSeqsCommand::readName(){
 	try {
 		if (outputDir == "") {  outputDir += hasPath(namefile);  }
 		string outputFileName = getRootName(namefile) + "pick" + getExtension(namefile);
+		string outputFileName2 = getRootName(namefile) + "dups.accnos";
 
+		ofstream out2;
+		if (dups) {	 openOutputFile(outputFileName2, out2);	}
+		bool wroteDups = false;
+		
 		ofstream out;
 		openOutputFile(outputFileName, out);
 
@@ -322,38 +335,47 @@ void RemoveSeqsCommand::readName(){
 				}
 			}
 			
-			//if the name in the first column is in the set then print it and any other names in second column also in set
-			if (names.count(firstCol) == 0) {
-			
-				wroteSomething = true;
-				
-				out << firstCol << '\t';
-				
-				//you know you have at least one valid second since first column is valid
-				for (int i = 0; i < validSecond.size()-1; i++) {  out << validSecond[i] << ',';  }
-				out << validSecond[validSecond.size()-1] << endl;
-			
-			//make first name in set you come to first column and then add the remaining names to second column
+			if ((dups) && (validSecond.size() != parsedNames.size())) { 
+				wroteDups = true;
+				for (int i = 0; i < parsedNames.size(); i++) {  out2 << parsedNames[i] << endl;  }
 			}else {
-				
-				//you want part of this row
-				if (validSecond.size() != 0) {
-				
+					//if the name in the first column is in the set then print it and any other names in second column also in set
+				if (names.count(firstCol) == 0) {
+					
 					wroteSomething = true;
 					
-					out << validSecond[0] << '\t';
-				
+					out << firstCol << '\t';
+					
 					//you know you have at least one valid second since first column is valid
 					for (int i = 0; i < validSecond.size()-1; i++) {  out << validSecond[i] << ',';  }
 					out << validSecond[validSecond.size()-1] << endl;
+					
+					//make first name in set you come to first column and then add the remaining names to second column
+				}else {
+					
+					//you want part of this row
+					if (validSecond.size() != 0) {
+						
+						wroteSomething = true;
+						
+						out << validSecond[0] << '\t';
+						
+						//you know you have at least one valid second since first column is valid
+						for (int i = 0; i < validSecond.size()-1; i++) {  out << validSecond[i] << ',';  }
+						out << validSecond[validSecond.size()-1] << endl;
+					}
 				}
 			}
-			
 			gobble(in);
 		}
 		in.close();
 		out.close();
 		
+		if (dups) { out2.close();  }
+		if (wroteDups == false) {
+			remove(outputFileName2.c_str()); 
+		}
+		
 		if (wroteSomething == false) {
 			mothurOut("Your file contains only sequences from the .accnos file."); mothurOutEndLine();
 			remove(outputFileName.c_str()); 
diff --git a/removeseqscommand.h b/removeseqscommand.h
index 1dcb0fc..647cb76 100644
--- a/removeseqscommand.h
+++ b/removeseqscommand.h
@@ -24,7 +24,7 @@ class RemoveSeqsCommand : public Command {
 	private:
 		set<string> names;
 		string accnosfile, fastafile, namefile, groupfile, alignfile, listfile, outputDir;
-		bool abort;
+		bool abort, dups;
 		
 		void readFasta();
 		void readName();
diff --git a/sabundvector.cpp b/sabundvector.cpp
index b70c642..2bb8d69 100644
--- a/sabundvector.cpp
+++ b/sabundvector.cpp
@@ -60,8 +60,10 @@ SAbundVector::SAbundVector(ifstream& f): DataVector(), maxRank(0), numBins(0), n
 	
 		for(int i=1;i<=hold;i++){
 			f >> inputData;
+
 			set(i, inputData);
 		}
+
 	}
 	catch(exception& e) {
 		errorOut(e, "SAbundVector", "SAbundVector");
@@ -192,13 +194,13 @@ RAbundVector SAbundVector::getRAbundVector(){
 	try {
 		RAbundVector rav;
 	
-		for(int i=1;i<=data.size();i++){		
+		for(int i=1;i < data.size();i++){
 			for(int j=0;j<data[i];j++){
 				rav.push_back(i);
 			}
 		}
 		sort(rav.rbegin(), rav.rend());
-	
+
 		rav.setLabel(label);
 		return rav;
 	}
diff --git a/sharedsabundvector.cpp b/sharedsabundvector.cpp
index 9b89921..a7dfd66 100644
--- a/sharedsabundvector.cpp
+++ b/sharedsabundvector.cpp
@@ -143,7 +143,7 @@ RAbundVector SharedSAbundVector::getRAbundVector(){
 	try {
 		RAbundVector rav;
 	
-		for(int i=1;i<=data.size();i++){		
+		for(int i=1;i<data.size();i++){		
 			for(int j=0;j<data[i].abundance;j++){
 				rav.push_back(i);
 			}
@@ -191,7 +191,7 @@ SharedRAbundVector SharedSAbundVector::getSharedRAbundVector(){
 	try {
 		SharedRAbundVector rav;
 		
-		for(int i=1;i<=data.size();i++){		
+		for(int i=1;i<data.size();i++){		
 			for(int j=0;j<data[i].abundance;j++){
 				rav.push_back(i, data[i].group);
 			}
-- 
2.39.2