/* Begin PBXFileReference section */
7E6BE10812F710D8007ADDBE /* refchimeratest.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = refchimeratest.h; sourceTree = "<group>"; };
7E6BE10912F710D8007ADDBE /* refchimeratest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = refchimeratest.cpp; sourceTree = "<group>"; };
+ 7E78911B135F3E8600E725D2 /* eachgapdistignorens.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = eachgapdistignorens.h; sourceTree = "<group>"; };
8DD76FB20486AB0100D96B5E /* Mothur */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = Mothur; sourceTree = BUILT_PRODUCTS_DIR; };
A70332B512D3A13400761E33 /* makefile */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.make; path = makefile; sourceTree = "<group>"; };
A713EBAA12DC7613000092AC /* readphylipvector.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = readphylipvector.h; sourceTree = "<group>"; };
A7E9BA3F12D395F700DA6239 /* calculators */ = {
isa = PBXGroup;
children = (
+ 7E78911B135F3E8600E725D2 /* eachgapdistignorens.h */,
A7E9B64F12D37EC300DA6239 /* ace.cpp */,
A7E9B65012D37EC300DA6239 /* ace.h */,
A7E9B65E12D37EC300DA6239 /* bergerparker.cpp */,
CommandParameter pminbs("minbs", "Number", "", "90", "", "", "",false,false); parameters.push_back(pminbs);
CommandParameter psearch("search", "Multiple", "kmer-blast-distance", "distance", "", "", "",false,false); parameters.push_back(psearch);
CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
- CommandParameter prealign("realign", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(prealign);
+ CommandParameter prealign("realign", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(prealign);
CommandParameter ptrim("trim", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(ptrim);
CommandParameter psplit("split", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(psplit);
CommandParameter pnumwanted("numwanted", "Number", "", "15", "", "", "",false,false); parameters.push_back(pnumwanted);
string helpString = "";
helpString += "The chimera.slayer command reads a fastafile and referencefile and outputs potentially chimeric sequences.\n";
helpString += "This command was modeled after the chimeraSlayer written by the Broad Institute.\n";
- helpString += "The chimera.slayer command parameters are fasta, name, template, processors, trim, ksize, window, match, mismatch, divergence. minsim, mincov, minbs, minsnp, parents, search, iters, increment and numwanted.\n"; //realign,
+ helpString += "The chimera.slayer command parameters are fasta, name, template, processors, trim, ksize, window, match, mismatch, divergence. minsim, mincov, minbs, minsnp, parents, search, iters, increment, numwanted, and realign.\n";
helpString += "The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required, unless you have a valid current fasta file. \n";
helpString += "The name parameter allows you to provide a name file, if you are using template=self. \n";
helpString += "You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amazon.fasta \n";
helpString += "The minbs parameter allows you to specify minimum bootstrap support for calling a sequence chimeric. Default is 90, meaning 90%. \n";
helpString += "The minsnp parameter allows you to specify percent of SNPs to sample on each side of breakpoint for computing bootstrap support (default: 100) \n";
helpString += "The search parameter allows you to specify search method for finding the closest parent. Choices are distance, blast, and kmer, default distance. \n";
- helpString += "The realign parameter allows you to realign the query to the potential parents. Choices are true or false, default false. \n";
+ helpString += "The realign parameter allows you to realign the query to the potential parents. Choices are true or false, default true. \n";
helpString += "The chimera.slayer command should be in the following format: \n";
helpString += "chimera.slayer(fasta=yourFastaFile, reference=yourTemplate, search=yourSearch) \n";
helpString += "Example: chimera.slayer(fasta=AD.align, reference=core_set_aligned.imputed.fasta, search=kmer) \n";
#include "dist.h"
#include "eachgapdist.h"
#include "ignoregaps.h"
-
+#include "eachgapdistignorens.h"
//***************************************************************************************************************
void DeCalculator::setMask(string ms) {
vector<SeqDist> distsLeft;
vector<SeqDist> distsRight;
- Dist* distcalculator = new eachGapDist();
+ Dist* distcalculator = new eachGapDistIgnoreNs();
string queryUnAligned = querySeq->getUnaligned();
int numBases = int(queryUnAligned.length() * 0.33);
//sort by smallest distance
sort(distsRight.begin(), distsRight.end(), compareSeqDist);
sort(distsLeft.begin(), distsLeft.end(), compareSeqDist);
-
+// cout << distsLeft.size() << '\t' << distsRight.size() << endl;
+// for(int i=0;i<15;i++){
+// cout << "left\t" << db[distsLeft[i].index]->getName() << '\t' << distsLeft[i].dist << endl;
+// }
+// for(int i=0;i<15;i++){
+// cout << "right\t" << db[distsLeft[i].index]->getName() << '\t' << distsRight[i].dist << endl;
+// }
+
+
//merge results
map<string, string> seen;
map<string, string>::iterator it;
dists.push_back(distsLeft[i]);
seen[db[distsLeft[i].index]->getName()] = db[distsLeft[i].index]->getName();
lastLeft = distsLeft[i].dist;
+// cout << "loop-left\t" << db[distsLeft[i].index]->getName() << '\t' << distsLeft[i].dist << endl;
}
//add right if you havent already
dists.push_back(distsRight[i]);
seen[db[distsRight[i].index]->getName()] = db[distsRight[i].index]->getName();
lastRight = distsRight[i].dist;
+// cout << "loop-right\t" << db[distsRight[i].index]->getName() << '\t' << distsRight[i].dist << endl;
}
if (dists.size() > numWanted) { lasti = i; break; } //you have enough results
}
+// cout << "lastLeft\t" << lastLeft << endl;
+
//add in dups
lasti++;
int i = lasti;
while (i < distsLeft.size()) {
- if (distsLeft[i].dist == lastLeft) { dists.push_back(distsLeft[i]); numWanted++; }
+ if (distsLeft[i].dist == lastLeft) {
+ it = seen.find(db[distsLeft[i].index]->getName());
+
+ if (it == seen.end()) {
+// cout << "newLoop-left\t" << db[distsLeft[i].index]->getName() << '\t' << distsLeft[i].dist << endl;
+ dists.push_back(distsLeft[i]);
+ seen[db[distsRight[i].index]->getName()] = db[distsLeft[i].index]->getName();
+// numWanted++;
+ }
+ }
else { break; }
i++;
}
+// cout << "lastRight\t" << lastRight << endl;
+
i = lasti;
while (i < distsRight.size()) {
- if (distsRight[i].dist == lastRight) { dists.push_back(distsRight[i]); numWanted++; }
+ if (distsRight[i].dist == lastRight) {
+ it = seen.find(db[distsRight[i].index]->getName());
+
+ if (it == seen.end()) {
+// cout << "newLoop-right\t" << db[distsRight[i].index]->getName() << '\t' << distsRight[i].dist << endl;
+ dists.push_back(distsRight[i]);
+ seen[db[distsRight[i].index]->getName()] = db[distsRight[i].index]->getName();
+// numWanted++;
+ }
+ }
else { break; }
i++;
}
+ numWanted = seen.size();
+
if (numWanted > dists.size()) {
//m->mothurOut("numwanted is larger than the number of template sequences, adjusting numwanted."); m->mothurOutEndLine();
numWanted = dists.size();
//cout << numWanted << endl;
for (int i = 0; i < numWanted; i++) {
-//cout << dists[i].seq->getName() << '\t' << dists[i].dist << endl;
+// cout << db[dists[i].index]->getName() << '\t' << dists[i].dist << endl;
Sequence* temp = new Sequence(db[dists[i].index]->getName(), db[dists[i].index]->getAligned()); //have to make a copy so you can trim and filter without stepping on eachother.
Sequence* seqsMatch;
- Dist* distcalculator = new eachGapDist();
+ Dist* distcalculator = new eachGapDistIgnoreNs();
int index = 0;
int smallest = 1000000;
--- /dev/null
+#ifndef EACHGAPDISTIGNORENS_H
+#define EACHGAPDISTIGNORENS_H
+
+/*
+ * eachgapdistignorens.h
+ * Mothur
+ *
+ * Created by Pat Schloss on 4/20/11.
+ * Copyright 2011 Schloss Lab. All rights reserved.
+ *
+ */
+
+
+#include "dist.h"
+
+/**************************************************************************************************/
+
+class eachGapDistIgnoreNs : public Dist {
+
+public:
+ void calcDist(Sequence A, Sequence B){
+ int diff = 0;
+ int length = 0;
+ int start = 0;
+
+ string seqA = A.getAligned();
+ string seqB = B.getAligned();
+
+ int alignLength = seqA.length();
+
+ for(int i=0; i<alignLength; i++){
+ if(seqA[i] != '.' || seqB[i] != '.'){
+ start = i;
+ break;
+ }
+ }
+
+ for(int i=start;i<alignLength;i++){
+ if(seqA[i] == '.' && seqB[i] == '.'){
+ break;
+ }
+ else if((seqA[i] == '-' && seqB[i] == '-') || (seqA[i] == '-' && seqB[i] == '.') || (seqA[i] == '.' && seqB[i] == '-') || seqA[i] == 'N' || seqB[i] == 'N'){;}
+ else{
+ if(seqA[i] != seqB[i]){
+ diff++;
+ }
+ length++;
+ }
+ }
+
+ if(length == 0) { dist = 1.0000; }
+ else { dist = ((double)diff / (double)length); }
+ }
+};
+
+/**************************************************************************************************/
+
+#endif
vector<Sequence*> temp = refSeqs;
temp.push_back(query);
-
+
+
verticalFilter(temp);
vector< vector<score_struct> > matrix = buildScoreMatrix(query->getAligned().length(), refSeqs.size()); //builds and initializes
int numAllowable = ((1.0 - (1.0/minDivR)) * query->getNumBases());
-// if(numAllowable < 2){ numAllowable = 2; }
+// if(numAllowable < 1){ numAllowable = 1; }
int penalty = int(numAllowable + 1) * misMatchPenalty;
if ((!isalpha(queryAligned[0])) && (!isalpha(subjectAligned[0]))) {
ms[i][0].score = 0;
// ms[i][0].mismatches = 0;
- }else if (queryAligned[0] == subjectAligned[0]) {
+ }else if (queryAligned[0] == subjectAligned[0] || subjectAligned[0] == 'N') {
ms[i][0].score = matchScore;
// ms[i][0].mismatches = 0;
}else{
if ((!isalpha(queryAligned[j])) && (!isalpha(subjectAligned[j]))) {
//leave the same
}else if ((toupper(queryAligned[j]) == 'N') || (toupper(subjectAligned[j]) == 'N')) {
+ matchMisMatchScore = matchScore;
//leave the same
}else if (queryAligned[j] == subjectAligned[j]) {
matchMisMatchScore = matchScore;
}
}
-// cout << highestScore << '\t' << highestStruct.size() << endl;
+// cout << endl << highestScore << '\t' << highestStruct.size() << '\t' << highestStruct[0].row << endl;
vector<trace_struct> maxTrace;
double maxPercentIdenticalQueryAntiChimera = 0;
- int maxIndex = -1;
for(int i=0;i<highestStruct.size();i++){
// for(int j=0;j<trace.size();j++){
// cout << trace[j].col << '\t' << trace[j].oldCol << '\t' << refSeqs[trace[j].row]->getName() << endl;
// }
-
-// Need to do something with this in a bit...
-// if (trace.size() > 1) { chimera = "yes"; }
-// else { chimera = "no"; }
-
+
int traceStart = path[0].col;
int traceEnd = path[path.size()-1].col;
// cout << "traceStart/End\t" << traceStart << '\t' << traceEnd << endl;
if(percentIdenticalQueryAntiChimera > maxPercentIdenticalQueryAntiChimera){
maxPercentIdenticalQueryAntiChimera = percentIdenticalQueryAntiChimera;
maxTrace = trace;
- maxIndex = i;
}
}
-// cout << maxPercentIdenticalQueryAntiChimera << '\t' << maxIndex << endl;
+// cout << maxPercentIdenticalQueryAntiChimera << endl;
return maxTrace;
}
for(int i=0;i<numRefSeqs;i++){
int lDiffs = 0;
+
for(int l=0;l<alignLength;l++){
// if(querySeq[l] != '.' && querySeq[l] != referenceSeqs[i][l]){
- if(querySeq[l] != '.' && referenceSeqs[i][l] != '.' && querySeq[l] != referenceSeqs[i][l]){
+
+ if(querySeq[l] != '.' && referenceSeqs[i][l] != '.' && querySeq[l] != referenceSeqs[i][l] && referenceSeqs[i][l] != 'N'){
lDiffs++;
}
left[i][l] = lDiffs;
int index = 0;
for(int l=alignLength-1;l>=0;l--){
// if(querySeq[l] != '.' && querySeq[l] != referenceSeqs[i][l]){
- if(querySeq[l] != '.' && referenceSeqs[i][l] != '.' && querySeq[l] != referenceSeqs[i][l]){
+ if(querySeq[l] != '.' && referenceSeqs[i][l] != '.' && querySeq[l] != referenceSeqs[i][l] && referenceSeqs[i][l] != 'N'){
rDiffs++;
}
right[i][index++] = rDiffs;
}
+
if(lDiffs < bestSequenceMismatch){
bestSequenceMismatch = lDiffs;
bestRefSeq = i;
for(int i=0;i<alignLength;i++){
// if(querySeq[i] != '.' && chimeraRefSeq[i] != '.'){
- if(chimeraRefSeq[i] != '.'){
- if(querySeq[i] == '-' && chimeraRefSeq[i] == '-'){ /* do nothing */ }
+ if(chimeraRefSeq[i] != '.' && querySeq[i] != '.'){
+ if(querySeq[i] == '-' && chimeraRefSeq[i] == '-' && chimeraRefSeq[i] != 'N'){ /* do nothing */ }
else if(querySeq[i] == chimeraRefSeq[i]){
match++;
}
else{ minCompare.weight = 1; }
printErrorData(minCompare, numParentSeqs);
-
+
if(!ignoreSeq){
-
- for(int i=0;i<minCompare.total;i++){
+
+ for(int i=0;i<minCompare.sequence.length();i++){
char letter = minCompare.sequence[i];
errorForward[letter][i] += minCompare.weight;
int numAmbigs = currentSeq.getAmbigBases();
if(numAmbigs > 0){ numAmbigSeqs++; }
- int startPos = currentSeq.getStartPos();
- if(startPos > maxStartPos) { maxStartPos = startPos; }
-
- int endPos = currentSeq.getEndPos();
- if(endPos < minEndPos) { minEndPos = endPos; }
+// int startPos = currentSeq.getStartPos();
+// if(startPos > maxStartPos) { maxStartPos = startPos; }
+//
+// int endPos = currentSeq.getEndPos();
+// if(endPos < minEndPos) { minEndPos = endPos; }
referenceSeqs.push_back(currentSeq);
+
m->gobble(referenceFile);
}
referenceFile.close();
else if(isprint(letter)){
letter = toupper(letter);
if(letter == 'U'){letter = 'T';}
+ if(letter != '.' && letter != '-' && letter != 'A' && letter != 'T' && letter != 'G' && letter != 'C'){
+ letter = 'N';
+ }
sequence += letter;
}
}