some alterations to chimera.slayer

author pschloss <pschloss>

Wed, 20 Apr 2011 16:54:49 +0000 (16:54 +0000)

committer pschloss <pschloss>

Wed, 20 Apr 2011 16:54:49 +0000 (16:54 +0000)
author pschloss <pschloss>
Wed, 20 Apr 2011 16:54:49 +0000 (16:54 +0000)
committer pschloss <pschloss>
Wed, 20 Apr 2011 16:54:49 +0000 (16:54 +0000)
diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj

index d3b9cfc04051882ed9f2260dfd5143b6192714c9..ab8268391b1d39181ee47645f582bd75f2e1e43f 100644 (file)
--- a/Mothur.xcodeproj/project.pbxproj
+++ b/Mothur.xcodeproj/project.pbxproj
@@ -310,6 +310,7 @@
  /* Begin PBXFileReference section */
                 7E6BE10812F710D8007ADDBE /* refchimeratest.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = refchimeratest.h; sourceTree = "<group>"; };
                 7E6BE10912F710D8007ADDBE /* refchimeratest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = refchimeratest.cpp; sourceTree = "<group>"; };
+               7E78911B135F3E8600E725D2 /* eachgapdistignorens.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = eachgapdistignorens.h; sourceTree = "<group>"; };
                 8DD76FB20486AB0100D96B5E /* Mothur */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = Mothur; sourceTree = BUILT_PRODUCTS_DIR; };
                 A70332B512D3A13400761E33 /* makefile */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.make; path = makefile; sourceTree = "<group>"; };
                 A713EBAA12DC7613000092AC /* readphylipvector.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = readphylipvector.h; sourceTree = "<group>"; };
@@ -1259,6 +1260,7 @@
                 A7E9BA3F12D395F700DA6239 /* calculators */ = {
                         isa = PBXGroup;
                         children = (
+                               7E78911B135F3E8600E725D2 /* eachgapdistignorens.h */,
                                 A7E9B64F12D37EC300DA6239 /* ace.cpp */,
                                 A7E9B65012D37EC300DA6239 /* ace.h */,
                                 A7E9B65E12D37EC300DA6239 /* bergerparker.cpp */,
diff --git a/chimeraslayercommand.cpp b/chimeraslayercommand.cpp

index f6baa0a2074d75e01c73eeb50cae1f77f291ee20..cf082f87a2721650d50ffed24c4a0a55a3ffa469 100644 (file)
--- a/chimeraslayercommand.cpp
+++ b/chimeraslayercommand.cpp
@@ -27,7 +27,7 @@ vector<string> ChimeraSlayerCommand::setParameters(){
                 CommandParameter pminbs("minbs", "Number", "", "90", "", "", "",false,false); parameters.push_back(pminbs);
                 CommandParameter psearch("search", "Multiple", "kmer-blast-distance", "distance", "", "", "",false,false); parameters.push_back(psearch);
                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
-               CommandParameter prealign("realign", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(prealign);
+               CommandParameter prealign("realign", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(prealign);
                 CommandParameter ptrim("trim", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(ptrim);
                 CommandParameter psplit("split", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(psplit);
                 CommandParameter pnumwanted("numwanted", "Number", "", "15", "", "", "",false,false); parameters.push_back(pnumwanted);
@@ -53,7 +53,7 @@ string ChimeraSlayerCommand::getHelpString(){
                 string helpString = "";
                 helpString += "The chimera.slayer command reads a fastafile and referencefile and outputs potentially chimeric sequences.\n";
                 helpString += "This command was modeled after the chimeraSlayer written by the Broad Institute.\n";
-               helpString += "The chimera.slayer command parameters are fasta, name, template, processors, trim, ksize, window, match, mismatch, divergence. minsim, mincov, minbs, minsnp, parents, search, iters, increment and numwanted.\n"; //realign,
+               helpString += "The chimera.slayer command parameters are fasta, name, template, processors, trim, ksize, window, match, mismatch, divergence. minsim, mincov, minbs, minsnp, parents, search, iters, increment, numwanted, and realign.\n";
                 helpString += "The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required, unless you have a valid current fasta file. \n";
                 helpString += "The name parameter allows you to provide a name file, if you are using template=self. \n";
                 helpString += "You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amazon.fasta \n";
@@ -78,7 +78,7 @@ string ChimeraSlayerCommand::getHelpString(){
                 helpString += "The minbs parameter allows you to specify minimum bootstrap support for calling a sequence chimeric. Default is 90, meaning 90%. \n";
                 helpString += "The minsnp parameter allows you to specify percent of SNPs to sample on each side of breakpoint for computing bootstrap support (default: 100) \n";
                 helpString += "The search parameter allows you to specify search method for finding the closest parent. Choices are distance, blast, and kmer, default distance. \n";
-               helpString += "The realign parameter allows you to realign the query to the potential parents. Choices are true or false, default false.  \n";
+               helpString += "The realign parameter allows you to realign the query to the potential parents. Choices are true or false, default true.  \n";
                 helpString += "The chimera.slayer command should be in the following format: \n";
                 helpString += "chimera.slayer(fasta=yourFastaFile, reference=yourTemplate, search=yourSearch) \n";
                 helpString += "Example: chimera.slayer(fasta=AD.align, reference=core_set_aligned.imputed.fasta, search=kmer) \n";
diff --git a/decalc.cpp b/decalc.cpp

index f0b1ecc4fb1d31cbb79837579604cffaf84863e9..1442bd8ec8865f0574d338675d5899332814d8a8 100644 (file)
--- a/decalc.cpp
+++ b/decalc.cpp
@@ -12,7 +12,7 @@
  #include "dist.h"
  #include "eachgapdist.h"
  #include "ignoregaps.h"
-
+#include "eachgapdistignorens.h"
  
  //***************************************************************************************************************
  void DeCalculator::setMask(string ms) { 
@@ -692,7 +692,7 @@ vector<Sequence*> DeCalculator::findClosest(Sequence* querySeq, vector<Sequence*
                 vector<SeqDist> distsLeft;
                 vector<SeqDist> distsRight;
                 
-               Dist* distcalculator = new eachGapDist();
+               Dist* distcalculator = new eachGapDistIgnoreNs();
                 
                 string queryUnAligned = querySeq->getUnaligned();
                 int numBases = int(queryUnAligned.length() * 0.33);
@@ -780,7 +780,15 @@ vector<Sequence*> DeCalculator::findClosest(Sequence* querySeq, vector<Sequence*
                 //sort by smallest distance
                 sort(distsRight.begin(), distsRight.end(), compareSeqDist);
                 sort(distsLeft.begin(), distsLeft.end(), compareSeqDist);
-                               
+//             cout << distsLeft.size() << '\t' << distsRight.size() << endl;
+//             for(int i=0;i<15;i++){
+//                     cout << "left\t" << db[distsLeft[i].index]->getName() << '\t' << distsLeft[i].dist << endl;
+//             }
+//             for(int i=0;i<15;i++){
+//                     cout << "right\t" << db[distsLeft[i].index]->getName() << '\t' << distsRight[i].dist << endl;
+//             }
+               
+               
                 //merge results         
                 map<string, string> seen;
                 map<string, string>::iterator it;
@@ -796,6 +804,7 @@ vector<Sequence*> DeCalculator::findClosest(Sequence* querySeq, vector<Sequence*
                                 dists.push_back(distsLeft[i]);
                                 seen[db[distsLeft[i].index]->getName()] = db[distsLeft[i].index]->getName();
                                 lastLeft =  distsLeft[i].dist;
+//                             cout << "loop-left\t" << db[distsLeft[i].index]->getName() << '\t' << distsLeft[i].dist << endl;
                         }
  
                         //add right if you havent already
@@ -804,27 +813,52 @@ vector<Sequence*> DeCalculator::findClosest(Sequence* querySeq, vector<Sequence*
                                 dists.push_back(distsRight[i]);
                                 seen[db[distsRight[i].index]->getName()] = db[distsRight[i].index]->getName();
                                 lastRight =  distsRight[i].dist;
+//                             cout << "loop-right\t" << db[distsRight[i].index]->getName() << '\t' << distsRight[i].dist << endl;
                         }
                         
                         if (dists.size() > numWanted) { lasti = i; break; } //you have enough results
                 }
                 
+//             cout << "lastLeft\t" << lastLeft << endl;
+               
                 //add in dups
                 lasti++;
                 int i = lasti;
                 while (i < distsLeft.size()) {  
-                       if (distsLeft[i].dist == lastLeft) {  dists.push_back(distsLeft[i]);  numWanted++; }
+                       if (distsLeft[i].dist == lastLeft) {
+                               it = seen.find(db[distsLeft[i].index]->getName());
+
+                               if (it == seen.end()) {  
+//                                     cout << "newLoop-left\t" << db[distsLeft[i].index]->getName() << '\t' << distsLeft[i].dist <<  endl;
+                                       dists.push_back(distsLeft[i]);
+                                       seen[db[distsRight[i].index]->getName()] = db[distsLeft[i].index]->getName();
+//                                     numWanted++; 
+                               }
+                       }
                         else { break; }
                         i++;
                 }
                 
+//             cout << "lastRight\t" << lastRight << endl;
+
                 i = lasti;
                 while (i < distsRight.size()) {  
-                       if (distsRight[i].dist == lastRight) {  dists.push_back(distsRight[i]);  numWanted++; }
+                       if (distsRight[i].dist == lastRight) {
+                               it = seen.find(db[distsRight[i].index]->getName());
+                               
+                               if (it == seen.end()) {  
+//                                     cout << "newLoop-right\t" << db[distsRight[i].index]->getName() << '\t' << distsRight[i].dist << endl;
+                                       dists.push_back(distsRight[i]);
+                                       seen[db[distsRight[i].index]->getName()] = db[distsRight[i].index]->getName();
+//                                     numWanted++; 
+                               }
+                       }
                         else { break; }
                         i++;
                 }
  
+               numWanted = seen.size();
+               
                 if (numWanted > dists.size()) { 
                         //m->mothurOut("numwanted is larger than the number of template sequences, adjusting numwanted."); m->mothurOutEndLine(); 
                         numWanted = dists.size();
@@ -832,7 +866,7 @@ vector<Sequence*> DeCalculator::findClosest(Sequence* querySeq, vector<Sequence*
  
  //cout << numWanted << endl;
                 for (int i = 0; i < numWanted; i++) {
-//cout << dists[i].seq->getName() << '\t' << dists[i].dist << endl;
+//                     cout << db[dists[i].index]->getName() << '\t' << dists[i].dist << endl;
  
                         Sequence* temp = new Sequence(db[dists[i].index]->getName(), db[dists[i].index]->getAligned()); //have to make a copy so you can trim and filter without stepping on eachother.
                         
@@ -853,7 +887,7 @@ Sequence* DeCalculator::findClosest(Sequence* querySeq, vector<Sequence*> db) {
                 
                 Sequence* seqsMatch;  
                 
-               Dist* distcalculator = new eachGapDist();
+               Dist* distcalculator = new eachGapDistIgnoreNs();
                 int index = 0;
                 int smallest = 1000000;
                 
diff --git a/eachgapdistignorens.h b/eachgapdistignorens.h

new file mode 100644 (file)

index 0000000..9605a00
--- /dev/null
+++ b/eachgapdistignorens.h
@@ -0,0 +1,58 @@
+#ifndef EACHGAPDISTIGNORENS_H
+#define EACHGAPDISTIGNORENS_H
+
+/*
+ *  eachgapdistignorens.h
+ *  Mothur
+ *
+ *  Created by Pat Schloss on 4/20/11.
+ *  Copyright 2011 Schloss Lab. All rights reserved.
+ *
+ */
+
+
+#include "dist.h"
+
+/**************************************************************************************************/
+
+class eachGapDistIgnoreNs : public Dist {
+       
+public:
+       void calcDist(Sequence A, Sequence B){          
+               int diff = 0;
+               int length = 0;
+               int start = 0;
+               
+               string seqA = A.getAligned();
+               string seqB = B.getAligned();
+               
+               int alignLength = seqA.length();
+               
+               for(int i=0; i<alignLength; i++){
+                       if(seqA[i] != '.' || seqB[i] != '.'){
+                               start = i;
+                               break;
+                       }
+               }
+               
+               for(int i=start;i<alignLength;i++){
+                       if(seqA[i] == '.' && seqB[i] == '.'){
+                               break;  
+                       }
+                       else if((seqA[i] == '-' && seqB[i] == '-') || (seqA[i] == '-' && seqB[i] == '.') || (seqA[i] == '.' && seqB[i] == '-') || seqA[i] == 'N' || seqB[i] == 'N'){;}
+                       else{
+                               if(seqA[i] != seqB[i]){
+                                       diff++;
+                               }
+                               length++;
+                       }
+               }
+               
+               if(length == 0) {       dist = 1.0000;                                                          }
+               else                    {       dist = ((double)diff  / (double)length);        }
+       }
+};
+
+/**************************************************************************************************/
+
+#endif
diff --git a/maligner.cpp b/maligner.cpp

index eaaaa8789961d4938a1aba793fb32e7ac331a105..b3cb991299fbf29d595318e613f8654cc7f3426f 100644 (file)
--- a/maligner.cpp
+++ b/maligner.cpp
@@ -88,7 +88,8 @@ string Maligner::chimeraMaligner(int chimeraPenalty, DeCalculator* decalc) {
  
                 vector<Sequence*> temp = refSeqs;
                 temp.push_back(query);
-               
+                       
+                       
                 verticalFilter(temp);
  
                 vector< vector<score_struct> > matrix = buildScoreMatrix(query->getAligned().length(), refSeqs.size()); //builds and initializes
@@ -204,7 +205,7 @@ int Maligner::computeChimeraPenalty() {
                 
                 int numAllowable = ((1.0 - (1.0/minDivR)) * query->getNumBases());
  
-//             if(numAllowable < 2){   numAllowable = 2;       }
+//             if(numAllowable < 1){   numAllowable = 1;       }
                 
                 int penalty = int(numAllowable + 1) * misMatchPenalty;
  
@@ -314,7 +315,7 @@ void Maligner::fillScoreMatrix(vector<vector<score_struct> >& ms, vector<Sequenc
                         if ((!isalpha(queryAligned[0])) && (!isalpha(subjectAligned[0]))) {
                                 ms[i][0].score = 0;
  //                             ms[i][0].mismatches = 0;
-                       }else if (queryAligned[0] == subjectAligned[0]) {
+                       }else if (queryAligned[0] == subjectAligned[0] || subjectAligned[0] == 'N') {
                                 ms[i][0].score = matchScore;
  //                             ms[i][0].mismatches = 0;
                         }else{
@@ -335,6 +336,7 @@ void Maligner::fillScoreMatrix(vector<vector<score_struct> >& ms, vector<Sequenc
                                 if ((!isalpha(queryAligned[j])) && (!isalpha(subjectAligned[j]))) {
                                         //leave the same
                                 }else if ((toupper(queryAligned[j]) == 'N') || (toupper(subjectAligned[j]) == 'N')) {
+                                       matchMisMatchScore = matchScore;
                                         //leave the same
                                 }else if (queryAligned[j] == subjectAligned[j]) {
                                         matchMisMatchScore = matchScore;
@@ -421,11 +423,10 @@ vector<trace_struct> Maligner::extractHighestPath(vector<vector<score_struct> >
                         }
                 }
                         
-//             cout << highestScore << '\t' << highestStruct.size() << endl;   
+//             cout << endl << highestScore << '\t' << highestStruct.size() << '\t' << highestStruct[0].row << endl;   
                 
                 vector<trace_struct> maxTrace;
                 double maxPercentIdenticalQueryAntiChimera = 0;
-               int maxIndex = -1;
                 
                 for(int i=0;i<highestStruct.size();i++){
                         
@@ -453,11 +454,7 @@ vector<trace_struct> Maligner::extractHighestPath(vector<vector<score_struct> >
  //                     for(int j=0;j<trace.size();j++){
  //                             cout << trace[j].col << '\t' << trace[j].oldCol << '\t' << refSeqs[trace[j].row]->getName() << endl;
  //                     }
-                       
-//                     Need to do something with this in a bit...
-//                     if (trace.size() > 1) {         chimera = "yes";        }
-//                     else { chimera = "no";  }
-                       
+                                               
                         int traceStart = path[0].col;
                         int traceEnd = path[path.size()-1].col; 
  //                     cout << "traceStart/End\t" << traceStart << '\t' << traceEnd << endl;
@@ -475,10 +472,9 @@ vector<trace_struct> Maligner::extractHighestPath(vector<vector<score_struct> >
                         if(percentIdenticalQueryAntiChimera > maxPercentIdenticalQueryAntiChimera){
                                 maxPercentIdenticalQueryAntiChimera = percentIdenticalQueryAntiChimera;
                                 maxTrace = trace;
-                               maxIndex = i;
                         }
                 }
-//             cout << maxPercentIdenticalQueryAntiChimera << '\t' << maxIndex << endl;
+//             cout << maxPercentIdenticalQueryAntiChimera << endl;
                 return maxTrace;
                 
         }
diff --git a/refchimeratest.cpp b/refchimeratest.cpp

index 7af9ecd0a07ec27c65b287de98e127c53a29b513..28e6728907470c85b97538b097180e0db71170dc 100644 (file)
--- a/refchimeratest.cpp
+++ b/refchimeratest.cpp
@@ -112,9 +112,11 @@ int RefChimeraTest::getMismatches(string& querySeq, vector<vector<int> >& left,
         for(int i=0;i<numRefSeqs;i++){
                 
                 int lDiffs = 0;
+               
                 for(int l=0;l<alignLength;l++){
  //                     if(querySeq[l] != '.' && querySeq[l] != referenceSeqs[i][l]){
-                       if(querySeq[l] != '.' && referenceSeqs[i][l] != '.' && querySeq[l] != referenceSeqs[i][l]){
+                                               
+                       if(querySeq[l] != '.' && referenceSeqs[i][l] != '.' && querySeq[l] != referenceSeqs[i][l] && referenceSeqs[i][l] != 'N'){
                                 lDiffs++;
                         }
                         left[i][l] = lDiffs;
@@ -124,11 +126,12 @@ int RefChimeraTest::getMismatches(string& querySeq, vector<vector<int> >& left,
                 int index = 0;
                 for(int l=alignLength-1;l>=0;l--){
  //                     if(querySeq[l] != '.' && querySeq[l] != referenceSeqs[i][l]){
-                       if(querySeq[l] != '.' && referenceSeqs[i][l] != '.' && querySeq[l] != referenceSeqs[i][l]){
+                       if(querySeq[l] != '.' && referenceSeqs[i][l] != '.' && querySeq[l] != referenceSeqs[i][l] && referenceSeqs[i][l] != 'N'){
                                 rDiffs++;
                         }                       
                         right[i][index++] = rDiffs;
                 }
+               
                 if(lDiffs < bestSequenceMismatch){
                         bestSequenceMismatch = lDiffs;
                         bestRefSeq = i;
@@ -259,8 +262,8 @@ double RefChimeraTest::calcDistToChimera(string& querySeq, string& chimeraRefSeq
         
         for(int i=0;i<alignLength;i++){
  //             if(querySeq[i] != '.' && chimeraRefSeq[i] != '.'){
-               if(chimeraRefSeq[i] != '.'){
-                       if(querySeq[i] == '-' && chimeraRefSeq[i] == '-'){      /*      do nothing      */      }
+               if(chimeraRefSeq[i] != '.' && querySeq[i] != '.'){
+                       if(querySeq[i] == '-' && chimeraRefSeq[i] == '-' && chimeraRefSeq[i] != 'N'){   /*      do nothing      */      }
                         else if(querySeq[i] == chimeraRefSeq[i]){
                                 match++;
                         }
diff --git a/seqerrorcommand.cpp b/seqerrorcommand.cpp

index b4259bfddfa565d1a6982e6583009c3690c90089..392f5877d9099308593d0cba26a8ff7baddd1d3b 100644 (file)
--- a/seqerrorcommand.cpp
+++ b/seqerrorcommand.cpp
@@ -313,10 +313,10 @@ int SeqErrorCommand::execute(){
                         else{   minCompare.weight = 1;  }
  
                         printErrorData(minCompare, numParentSeqs);
-
+               
                         if(!ignoreSeq){
-                               
-                               for(int i=0;i<minCompare.total;i++){
+
+                               for(int i=0;i<minCompare.sequence.length();i++){
                                         char letter = minCompare.sequence[i];
  
                                         errorForward[letter][i] += minCompare.weight;
@@ -429,12 +429,13 @@ void SeqErrorCommand::getReferences(){
                         int numAmbigs = currentSeq.getAmbigBases();
                         if(numAmbigs > 0){      numAmbigSeqs++; }
                         
-                       int startPos = currentSeq.getStartPos();
-                       if(startPos > maxStartPos)      {       maxStartPos = startPos; }
-
-                       int endPos = currentSeq.getEndPos();
-                       if(endPos < minEndPos)          {       minEndPos = endPos;             }
+//                     int startPos = currentSeq.getStartPos();
+//                     if(startPos > maxStartPos)      {       maxStartPos = startPos; }
+//
+//                     int endPos = currentSeq.getEndPos();
+//                     if(endPos < minEndPos)          {       minEndPos = endPos;             }
                         referenceSeqs.push_back(currentSeq);
+                               
                         m->gobble(referenceFile);
                 }
                 referenceFile.close();
diff --git a/sequence.cpp b/sequence.cpp

index 6aa0c0f8ae2c46a32edde8a67276ccb966bc0003..69e905e6bc93e401a242f695577b95019be191c3 100644 (file)
--- a/sequence.cpp
+++ b/sequence.cpp
@@ -233,6 +233,9 @@ string Sequence::getSequenceString(ifstream& fastaFile) {
                         else if(isprint(letter)){
                                 letter = toupper(letter);
                                 if(letter == 'U'){letter = 'T';}
+                               if(letter != '.' && letter != '-' && letter != 'A' && letter != 'T' && letter != 'G'  && letter != 'C'){
+                                       letter = 'N';
+                               }
                                 sequence += letter;
                         }
                 }
author	pschloss <pschloss>
	Wed, 20 Apr 2011 16:54:49 +0000 (16:54 +0000)
committer	pschloss <pschloss>
	Wed, 20 Apr 2011 16:54:49 +0000 (16:54 +0000)
Mothur.xcodeproj/project.pbxproj		patch \| blob \| history
chimeraslayercommand.cpp		patch \| blob \| history
decalc.cpp		patch \| blob \| history
eachgapdistignorens.h	[new file with mode: 0644]	patch \| blob
maligner.cpp		patch \| blob \| history
refchimeratest.cpp		patch \| blob \| history
seqerrorcommand.cpp		patch \| blob \| history
sequence.cpp		patch \| blob \| history