From 0d4b21e5ccc56276b0c18d17d8e75d94ce1df4e7 Mon Sep 17 00:00:00 2001 From: pschloss Date: Wed, 20 Apr 2011 16:54:49 +0000 Subject: [PATCH] some alterations to chimera.slayer --- Mothur.xcodeproj/project.pbxproj | 2 ++ chimeraslayercommand.cpp | 6 ++-- decalc.cpp | 48 ++++++++++++++++++++++---- eachgapdistignorens.h | 58 ++++++++++++++++++++++++++++++++ maligner.cpp | 20 +++++------ refchimeratest.cpp | 11 +++--- seqerrorcommand.cpp | 17 +++++----- sequence.cpp | 3 ++ 8 files changed, 131 insertions(+), 34 deletions(-) create mode 100644 eachgapdistignorens.h diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index d3b9cfc..ab82683 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -310,6 +310,7 @@ /* Begin PBXFileReference section */ 7E6BE10812F710D8007ADDBE /* refchimeratest.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = refchimeratest.h; sourceTree = ""; }; 7E6BE10912F710D8007ADDBE /* refchimeratest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = refchimeratest.cpp; sourceTree = ""; }; + 7E78911B135F3E8600E725D2 /* eachgapdistignorens.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = eachgapdistignorens.h; sourceTree = ""; }; 8DD76FB20486AB0100D96B5E /* Mothur */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = Mothur; sourceTree = BUILT_PRODUCTS_DIR; }; A70332B512D3A13400761E33 /* makefile */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.make; path = makefile; sourceTree = ""; }; A713EBAA12DC7613000092AC /* readphylipvector.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = readphylipvector.h; sourceTree = ""; }; @@ -1259,6 +1260,7 @@ A7E9BA3F12D395F700DA6239 /* calculators */ = { isa = PBXGroup; children = ( + 7E78911B135F3E8600E725D2 /* eachgapdistignorens.h */, A7E9B64F12D37EC300DA6239 /* ace.cpp */, A7E9B65012D37EC300DA6239 /* ace.h */, A7E9B65E12D37EC300DA6239 /* bergerparker.cpp */, diff --git a/chimeraslayercommand.cpp b/chimeraslayercommand.cpp index f6baa0a..cf082f8 100644 --- a/chimeraslayercommand.cpp +++ b/chimeraslayercommand.cpp @@ -27,7 +27,7 @@ vector ChimeraSlayerCommand::setParameters(){ CommandParameter pminbs("minbs", "Number", "", "90", "", "", "",false,false); parameters.push_back(pminbs); CommandParameter psearch("search", "Multiple", "kmer-blast-distance", "distance", "", "", "",false,false); parameters.push_back(psearch); CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors); - CommandParameter prealign("realign", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(prealign); + CommandParameter prealign("realign", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(prealign); CommandParameter ptrim("trim", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(ptrim); CommandParameter psplit("split", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(psplit); CommandParameter pnumwanted("numwanted", "Number", "", "15", "", "", "",false,false); parameters.push_back(pnumwanted); @@ -53,7 +53,7 @@ string ChimeraSlayerCommand::getHelpString(){ string helpString = ""; helpString += "The chimera.slayer command reads a fastafile and referencefile and outputs potentially chimeric sequences.\n"; helpString += "This command was modeled after the chimeraSlayer written by the Broad Institute.\n"; - helpString += "The chimera.slayer command parameters are fasta, name, template, processors, trim, ksize, window, match, mismatch, divergence. minsim, mincov, minbs, minsnp, parents, search, iters, increment and numwanted.\n"; //realign, + helpString += "The chimera.slayer command parameters are fasta, name, template, processors, trim, ksize, window, match, mismatch, divergence. minsim, mincov, minbs, minsnp, parents, search, iters, increment, numwanted, and realign.\n"; helpString += "The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required, unless you have a valid current fasta file. \n"; helpString += "The name parameter allows you to provide a name file, if you are using template=self. \n"; helpString += "You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amazon.fasta \n"; @@ -78,7 +78,7 @@ string ChimeraSlayerCommand::getHelpString(){ helpString += "The minbs parameter allows you to specify minimum bootstrap support for calling a sequence chimeric. Default is 90, meaning 90%. \n"; helpString += "The minsnp parameter allows you to specify percent of SNPs to sample on each side of breakpoint for computing bootstrap support (default: 100) \n"; helpString += "The search parameter allows you to specify search method for finding the closest parent. Choices are distance, blast, and kmer, default distance. \n"; - helpString += "The realign parameter allows you to realign the query to the potential parents. Choices are true or false, default false. \n"; + helpString += "The realign parameter allows you to realign the query to the potential parents. Choices are true or false, default true. \n"; helpString += "The chimera.slayer command should be in the following format: \n"; helpString += "chimera.slayer(fasta=yourFastaFile, reference=yourTemplate, search=yourSearch) \n"; helpString += "Example: chimera.slayer(fasta=AD.align, reference=core_set_aligned.imputed.fasta, search=kmer) \n"; diff --git a/decalc.cpp b/decalc.cpp index f0b1ecc..1442bd8 100644 --- a/decalc.cpp +++ b/decalc.cpp @@ -12,7 +12,7 @@ #include "dist.h" #include "eachgapdist.h" #include "ignoregaps.h" - +#include "eachgapdistignorens.h" //*************************************************************************************************************** void DeCalculator::setMask(string ms) { @@ -692,7 +692,7 @@ vector DeCalculator::findClosest(Sequence* querySeq, vector distsLeft; vector distsRight; - Dist* distcalculator = new eachGapDist(); + Dist* distcalculator = new eachGapDistIgnoreNs(); string queryUnAligned = querySeq->getUnaligned(); int numBases = int(queryUnAligned.length() * 0.33); @@ -780,7 +780,15 @@ vector DeCalculator::findClosest(Sequence* querySeq, vectorgetName() << '\t' << distsLeft[i].dist << endl; +// } +// for(int i=0;i<15;i++){ +// cout << "right\t" << db[distsLeft[i].index]->getName() << '\t' << distsRight[i].dist << endl; +// } + + //merge results map seen; map::iterator it; @@ -796,6 +804,7 @@ vector DeCalculator::findClosest(Sequence* querySeq, vectorgetName()] = db[distsLeft[i].index]->getName(); lastLeft = distsLeft[i].dist; +// cout << "loop-left\t" << db[distsLeft[i].index]->getName() << '\t' << distsLeft[i].dist << endl; } //add right if you havent already @@ -804,27 +813,52 @@ vector DeCalculator::findClosest(Sequence* querySeq, vectorgetName()] = db[distsRight[i].index]->getName(); lastRight = distsRight[i].dist; +// cout << "loop-right\t" << db[distsRight[i].index]->getName() << '\t' << distsRight[i].dist << endl; } if (dists.size() > numWanted) { lasti = i; break; } //you have enough results } +// cout << "lastLeft\t" << lastLeft << endl; + //add in dups lasti++; int i = lasti; while (i < distsLeft.size()) { - if (distsLeft[i].dist == lastLeft) { dists.push_back(distsLeft[i]); numWanted++; } + if (distsLeft[i].dist == lastLeft) { + it = seen.find(db[distsLeft[i].index]->getName()); + + if (it == seen.end()) { +// cout << "newLoop-left\t" << db[distsLeft[i].index]->getName() << '\t' << distsLeft[i].dist << endl; + dists.push_back(distsLeft[i]); + seen[db[distsRight[i].index]->getName()] = db[distsLeft[i].index]->getName(); +// numWanted++; + } + } else { break; } i++; } +// cout << "lastRight\t" << lastRight << endl; + i = lasti; while (i < distsRight.size()) { - if (distsRight[i].dist == lastRight) { dists.push_back(distsRight[i]); numWanted++; } + if (distsRight[i].dist == lastRight) { + it = seen.find(db[distsRight[i].index]->getName()); + + if (it == seen.end()) { +// cout << "newLoop-right\t" << db[distsRight[i].index]->getName() << '\t' << distsRight[i].dist << endl; + dists.push_back(distsRight[i]); + seen[db[distsRight[i].index]->getName()] = db[distsRight[i].index]->getName(); +// numWanted++; + } + } else { break; } i++; } + numWanted = seen.size(); + if (numWanted > dists.size()) { //m->mothurOut("numwanted is larger than the number of template sequences, adjusting numwanted."); m->mothurOutEndLine(); numWanted = dists.size(); @@ -832,7 +866,7 @@ vector DeCalculator::findClosest(Sequence* querySeq, vectorgetName() << '\t' << dists[i].dist << endl; +// cout << db[dists[i].index]->getName() << '\t' << dists[i].dist << endl; Sequence* temp = new Sequence(db[dists[i].index]->getName(), db[dists[i].index]->getAligned()); //have to make a copy so you can trim and filter without stepping on eachother. @@ -853,7 +887,7 @@ Sequence* DeCalculator::findClosest(Sequence* querySeq, vector db) { Sequence* seqsMatch; - Dist* distcalculator = new eachGapDist(); + Dist* distcalculator = new eachGapDistIgnoreNs(); int index = 0; int smallest = 1000000; diff --git a/eachgapdistignorens.h b/eachgapdistignorens.h new file mode 100644 index 0000000..9605a00 --- /dev/null +++ b/eachgapdistignorens.h @@ -0,0 +1,58 @@ +#ifndef EACHGAPDISTIGNORENS_H +#define EACHGAPDISTIGNORENS_H + +/* + * eachgapdistignorens.h + * Mothur + * + * Created by Pat Schloss on 4/20/11. + * Copyright 2011 Schloss Lab. All rights reserved. + * + */ + + +#include "dist.h" + +/**************************************************************************************************/ + +class eachGapDistIgnoreNs : public Dist { + +public: + void calcDist(Sequence A, Sequence B){ + int diff = 0; + int length = 0; + int start = 0; + + string seqA = A.getAligned(); + string seqB = B.getAligned(); + + int alignLength = seqA.length(); + + for(int i=0; i temp = refSeqs; temp.push_back(query); - + + verticalFilter(temp); vector< vector > matrix = buildScoreMatrix(query->getAligned().length(), refSeqs.size()); //builds and initializes @@ -204,7 +205,7 @@ int Maligner::computeChimeraPenalty() { int numAllowable = ((1.0 - (1.0/minDivR)) * query->getNumBases()); -// if(numAllowable < 2){ numAllowable = 2; } +// if(numAllowable < 1){ numAllowable = 1; } int penalty = int(numAllowable + 1) * misMatchPenalty; @@ -314,7 +315,7 @@ void Maligner::fillScoreMatrix(vector >& ms, vector >& ms, vector Maligner::extractHighestPath(vector > } } -// cout << highestScore << '\t' << highestStruct.size() << endl; +// cout << endl << highestScore << '\t' << highestStruct.size() << '\t' << highestStruct[0].row << endl; vector maxTrace; double maxPercentIdenticalQueryAntiChimera = 0; - int maxIndex = -1; for(int i=0;i Maligner::extractHighestPath(vector > // for(int j=0;jgetName() << endl; // } - -// Need to do something with this in a bit... -// if (trace.size() > 1) { chimera = "yes"; } -// else { chimera = "no"; } - + int traceStart = path[0].col; int traceEnd = path[path.size()-1].col; // cout << "traceStart/End\t" << traceStart << '\t' << traceEnd << endl; @@ -475,10 +472,9 @@ vector Maligner::extractHighestPath(vector > if(percentIdenticalQueryAntiChimera > maxPercentIdenticalQueryAntiChimera){ maxPercentIdenticalQueryAntiChimera = percentIdenticalQueryAntiChimera; maxTrace = trace; - maxIndex = i; } } -// cout << maxPercentIdenticalQueryAntiChimera << '\t' << maxIndex << endl; +// cout << maxPercentIdenticalQueryAntiChimera << endl; return maxTrace; } diff --git a/refchimeratest.cpp b/refchimeratest.cpp index 7af9ecd..28e6728 100644 --- a/refchimeratest.cpp +++ b/refchimeratest.cpp @@ -112,9 +112,11 @@ int RefChimeraTest::getMismatches(string& querySeq, vector >& left, for(int i=0;i >& left, int index = 0; for(int l=alignLength-1;l>=0;l--){ // if(querySeq[l] != '.' && querySeq[l] != referenceSeqs[i][l]){ - if(querySeq[l] != '.' && referenceSeqs[i][l] != '.' && querySeq[l] != referenceSeqs[i][l]){ + if(querySeq[l] != '.' && referenceSeqs[i][l] != '.' && querySeq[l] != referenceSeqs[i][l] && referenceSeqs[i][l] != 'N'){ rDiffs++; } right[i][index++] = rDiffs; } + if(lDiffs < bestSequenceMismatch){ bestSequenceMismatch = lDiffs; bestRefSeq = i; @@ -259,8 +262,8 @@ double RefChimeraTest::calcDistToChimera(string& querySeq, string& chimeraRefSeq for(int i=0;i 0){ numAmbigSeqs++; } - int startPos = currentSeq.getStartPos(); - if(startPos > maxStartPos) { maxStartPos = startPos; } - - int endPos = currentSeq.getEndPos(); - if(endPos < minEndPos) { minEndPos = endPos; } +// int startPos = currentSeq.getStartPos(); +// if(startPos > maxStartPos) { maxStartPos = startPos; } +// +// int endPos = currentSeq.getEndPos(); +// if(endPos < minEndPos) { minEndPos = endPos; } referenceSeqs.push_back(currentSeq); + m->gobble(referenceFile); } referenceFile.close(); diff --git a/sequence.cpp b/sequence.cpp index 6aa0c0f..69e905e 100644 --- a/sequence.cpp +++ b/sequence.cpp @@ -233,6 +233,9 @@ string Sequence::getSequenceString(ifstream& fastaFile) { else if(isprint(letter)){ letter = toupper(letter); if(letter == 'U'){letter = 'T';} + if(letter != '.' && letter != '-' && letter != 'A' && letter != 'T' && letter != 'G' && letter != 'C'){ + letter = 'N'; + } sequence += letter; } } -- 2.39.2