]> git.donarmstrong.com Git - mothur.git/commitdiff
continued work on chimeras and fixed bug in trim.seqs and reverse.seqs that was due...
authorwestcott <westcott>
Wed, 9 Sep 2009 20:20:23 +0000 (20:20 +0000)
committerwestcott <westcott>
Wed, 9 Sep 2009 20:20:23 +0000 (20:20 +0000)
22 files changed:
Mothur.xcodeproj/project.pbxproj
aligncommand.cpp
alignedsimilarity.cpp [deleted file]
alignedsimilarity.h [deleted file]
alignment.cpp
alignment.hpp
ccode.cpp
ccode.h
chimera.h
chimeracheckrdp.cpp [new file with mode: 0644]
chimeracheckrdp.h [new file with mode: 0644]
chimeraseqscommand.cpp
chimeraseqscommand.h
database.cpp
kmer.cpp
kmer.hpp
kmerdb.hpp
nast.cpp
needlemanoverlap.cpp
needlemanoverlap.hpp
sequence.cpp
trimseqscommand.cpp

index 75ef0d57dda26943c7a2da0c295369cd501c8ed1..6aa6b2774a80080173293dc3af259e01952f1d49 100644 (file)
                A70B53AA0F4CD7AD0064797E /* getgroupcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70B53A40F4CD7AD0064797E /* getgroupcommand.cpp */; };
                A70B53AB0F4CD7AD0064797E /* getlabelcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70B53A60F4CD7AD0064797E /* getlabelcommand.cpp */; };
                A70B53AC0F4CD7AD0064797E /* getlinecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70B53A80F4CD7AD0064797E /* getlinecommand.cpp */; };
-               A75B887D104C16860083C454 /* alignedsimilarity.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A75B8879104C16860083C454 /* alignedsimilarity.cpp */; };
+               A7283FF81056CAE100D0CC69 /* chimeracheckrdp.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7283FF71056CAE100D0CC69 /* chimeracheckrdp.cpp */; };
                A75B887E104C16860083C454 /* ccode.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A75B887B104C16860083C454 /* ccode.cpp */; };
                EB1216880F619B83004A865F /* bergerparker.cpp in Sources */ = {isa = PBXBuildFile; fileRef = EB1216870F619B83004A865F /* bergerparker.cpp */; };
                EB1216E50F61ACFB004A865F /* bstick.cpp in Sources */ = {isa = PBXBuildFile; fileRef = EB1216E40F61ACFB004A865F /* bstick.cpp */; };
                A70B53A70F4CD7AD0064797E /* getlabelcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getlabelcommand.h; sourceTree = SOURCE_ROOT; };
                A70B53A80F4CD7AD0064797E /* getlinecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getlinecommand.cpp; sourceTree = SOURCE_ROOT; };
                A70B53A90F4CD7AD0064797E /* getlinecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getlinecommand.h; sourceTree = SOURCE_ROOT; };
-               A75B8879104C16860083C454 /* alignedsimilarity.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = alignedsimilarity.cpp; sourceTree = SOURCE_ROOT; };
-               A75B887A104C16860083C454 /* alignedsimilarity.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = alignedsimilarity.h; sourceTree = SOURCE_ROOT; };
+               A7283FF61056CAE100D0CC69 /* chimeracheckrdp.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimeracheckrdp.h; sourceTree = "<group>"; };
+               A7283FF71056CAE100D0CC69 /* chimeracheckrdp.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimeracheckrdp.cpp; sourceTree = "<group>"; };
                A75B887B104C16860083C454 /* ccode.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ccode.cpp; sourceTree = SOURCE_ROOT; };
                A75B887C104C16860083C454 /* ccode.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ccode.h; sourceTree = SOURCE_ROOT; };
                C6859E8B029090EE04C91782 /* Mothur.1 */ = {isa = PBXFileReference; lastKnownFileType = text.man; path = Mothur.1; sourceTree = "<group>"; };
                        children = (
                                374F2FD51006320200E97C66 /* chimera.h */,
                                372095C1103196D70004D347 /* chimera.cpp */,
-                               A75B887A104C16860083C454 /* alignedsimilarity.h */,
-                               A75B8879104C16860083C454 /* alignedsimilarity.cpp */,
                                374F2FE9100634B600E97C66 /* bellerophon.h */,
                                374F2FEA100634B600E97C66 /* bellerophon.cpp */,
                                A75B887C104C16860083C454 /* ccode.h */,
                                A75B887B104C16860083C454 /* ccode.cpp */,
+                               A7283FF61056CAE100D0CC69 /* chimeracheckrdp.h */,
+                               A7283FF71056CAE100D0CC69 /* chimeracheckrdp.cpp */,
                                372A55531017922B00C5194B /* decalc.h */,
                                372A55541017922B00C5194B /* decalc.cpp */,
                                374F301110063B6F00E97C66 /* pintail.h */,
                                374F301310063B6F00E97C66 /* pintail.cpp in Sources */,
                                372A55551017922B00C5194B /* decalc.cpp in Sources */,
                                372095C2103196D70004D347 /* chimera.cpp in Sources */,
-                               A75B887D104C16860083C454 /* alignedsimilarity.cpp in Sources */,
                                A75B887E104C16860083C454 /* ccode.cpp in Sources */,
+                               A7283FF81056CAE100D0CC69 /* chimeracheckrdp.cpp in Sources */,
                        );
                        runOnlyForDeploymentPostprocessing = 0;
                };
index 958f29b39213c3fda29e810af8832fef4afa83e1..da807bdd5103e5f8f8024217e12b15e4ba1b59ce 100644 (file)
@@ -272,6 +272,11 @@ int AlignCommand::driver(linePair* line, string alignFName, string reportFName){
                for(int i=0;i<line->numSeqs;i++){
                        
                        Sequence* candidateSeq = new Sequence(inFASTA);
+
+                       if (candidateSeq->getUnaligned().length() > alignment->getnRows()) {
+                               alignment->resize(candidateSeq->getUnaligned().length()+1);
+                       }
+                       
                        report.setCandidate(candidateSeq);
        
                        Sequence temp = templateDB->findClosestSequence(candidateSeq);
diff --git a/alignedsimilarity.cpp b/alignedsimilarity.cpp
deleted file mode 100644 (file)
index 0ed9ce3..0000000
+++ /dev/null
@@ -1,543 +0,0 @@
-/*
- *  alignedsimilarity.cpp
- *  Mothur
- *
- *  Created by westcott on 8/18/09.
- *  Copyright 2009 Schloss Lab. All rights reserved.
- *
- */
-
-#include "alignedsimilarity.h"
-#include "ignoregaps.h"
-
-
-//***************************************************************************************************************
-AlignSim::AlignSim(string filename, string temp) {  fastafile = filename;  templateFile = temp;  }
-//***************************************************************************************************************
-
-AlignSim::~AlignSim() {
-       try {
-               for (int i = 0; i < querySeqs.size(); i++)              {  delete querySeqs[i];         }
-               for (int i = 0; i < templateSeqs.size(); i++)   {  delete templateSeqs[i];      }
-               delete distCalc;
-       }
-       catch(exception& e) {
-               errorOut(e, "AlignSim", "~AlignSim");
-               exit(1);
-       }
-}      
-//***************************************************************************************************************
-void AlignSim::print(ostream& out) {
-       try {
-               
-               mothurOutEndLine();
-               
-               for (int i = 0; i < querySeqs.size(); i++) {
-                       
-                       int j = 0;  float largest = -10;
-                       //find largest sim value
-                       for (int k = 0; k < IS[i].size(); k++) {
-                               //is this score larger
-                               if (IS[i][k].score > largest) {
-                                       j = k;
-                                       largest = IS[i][k].score;
-                               }
-                       }
-                       
-                       //find parental similarity
-                       distCalc->calcDist(*(IS[i][j].leftParent), *(IS[i][j].rightParent));
-                       float dist = distCalc->getDist();
-                       
-                       //convert to similarity
-                       dist = (1 - dist) * 100;
-
-                       //warn about parental similarity - if its above 82% may not detect a chimera 
-                       if (dist >= 82) { mothurOut("When the chimeras parental similarity is above 82%, detection rates drop signifigantly.");  mothurOutEndLine(); }
-                       
-                       int index = ceil(dist);
-                       
-                       if (index == 0) { index=1;  }
-       
-                       //is your DE value higher than the 95%
-                       string chimera;
-                       if (IS[i][j].score > quantile[index-1][4])              {       chimera = "Yes";        }
-                       else                                                                            {       chimera = "No";         }                       
-                       
-                       out << querySeqs[i]->getName() <<  "\tparental similarity: " << dist << "\tIS: " << IS[i][j].score << "\tbreakpoint: " << IS[i][j].midpoint << "\tchimera flag: " << chimera << endl;
-                       
-                       if (chimera == "Yes") {
-                               mothurOut(querySeqs[i]->getName() + "\tparental similarity: " + toString(dist) + "\tIS: " + toString(IS[i][j].score) + "\tbreakpoint: " + toString(IS[i][j].midpoint) + "\tchimera flag: " + chimera); mothurOutEndLine();
-                       }
-                       out << "Improvement Score\t";
-                       
-                       for (int r = 0; r < IS[i].size(); r++) {  out << IS[i][r].score << '\t';  }
-                       out << endl;
-               }
-       }
-       catch(exception& e) {
-               errorOut(e, "AlignSim", "print");
-               exit(1);
-       }
-}
-
-//***************************************************************************************************************
-void AlignSim::getChimeras() {
-       try {
-               
-               //read in query sequences and subject sequences
-               mothurOut("Reading sequences and template file... "); cout.flush();
-               querySeqs = readSeqs(fastafile);
-               templateSeqs = readSeqs(templateFile);
-               mothurOut("Done."); mothurOutEndLine();
-               
-               int numSeqs = querySeqs.size();
-               
-               IS.resize(numSeqs);
-               
-               //break up file if needed
-               int linesPerProcess = numSeqs / processors ;
-               
-               #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
-                       //find breakup of sequences for all times we will Parallelize
-                       if (processors == 1) {   lines.push_back(new linePair(0, numSeqs));  }
-                       else {
-                               //fill line pairs
-                               for (int i = 0; i < (processors-1); i++) {                      
-                                       lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess)));
-                               }
-                               //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end
-                               int i = processors - 1;
-                               lines.push_back(new linePair((i*linesPerProcess), numSeqs));
-                       }
-                       
-                       //find breakup of templatefile for quantiles
-                       if (processors == 1) {   templateLines.push_back(new linePair(0, templateSeqs.size()));  }
-                       else { 
-                               for (int i = 0; i < processors; i++) {
-                                       templateLines.push_back(new linePair());
-                                       templateLines[i]->start = int (sqrt(float(i)/float(processors)) * templateSeqs.size());
-                                       templateLines[i]->end = int (sqrt(float(i+1)/float(processors)) * templateSeqs.size());
-                               }
-                       }
-               #else
-                       lines.push_back(new linePair(0, numSeqs));
-                       templateLines.push_back(new linePair(0, templateSeqs.size()));
-               #endif
-       
-       
-               distCalc = new ignoreGaps();
-               
-               //find window breaks
-               windowBreak = findWindows();
-//for (int i = 0; i < windowBreak.size(); i++) { cout << windowBreak[i] << '\t';  }
-//cout << endl;
-
-               mothurOut("Finding the IS values for your sequences..."); cout.flush();
-               if (processors == 1) { 
-                       IS = findIS(lines[0]->start, lines[0]->end, querySeqs);
-               }else {         IS = createProcessesIS(querySeqs, lines);               }
-               mothurOut("Done."); mothurOutEndLine();
-               
-               //quantiles are used to determine whether the de values found indicate a chimera
-               if (quanfile != "") {  quantile = readQuantiles();  }
-               else {
-                       
-                       mothurOut("Calculating quantiles for your template.  This can take a while...  I will output the quantiles to a .alignedsimilarity.quan file that you can input them using the quantiles parameter next time you run this command.  Providing the .quan file will dramatically improve speed.    "); cout.flush();
-                       //get IS quantiles as a reference to these IS scores
-                       //you are assuming the template is free of chimeras and therefore will give you a baseline as to the scores you would like to see
-                       if (processors == 1) { 
-                               templateIS = findIS(templateLines[0]->start, templateLines[0]->end, templateSeqs);
-                       }else {         templateIS = createProcessesIS(templateSeqs, templateLines);            }
-//cout << "here" << endl;                      
-                       ofstream out4;
-                       string o;
-                       
-                       o = getRootName(templateFile) + "alignedsimilarity.quan";
-                       
-                       openOutputFile(o, out4);
-                       
-                       //adjust quantiles
-                       //construct table
-                       quantile.resize(100);
-               
-                       for (int i = 0; i < templateIS.size(); i++) {
-                               
-                               if (templateIS[i].size() != 0) {
-                                       int j = 0; float score = -1000;
-                                       //find highest IS score
-       //cout << templateIS[i].size() << endl;
-                                       for (int k = 0; k < templateIS[i].size(); k++) {
-                                               if (templateIS[i][k].score > score) {
-                                                       score = templateIS[i][k].score;
-                                                       j = k;
-                                               }
-                                       }
-       //cout << j << endl;            
-                                       //find similarity of parents
-                                       distCalc->calcDist(*(templateIS[i][j].leftParent), *(templateIS[i][j].rightParent));
-                                       float dist = distCalc->getDist();
-                       
-                                       //convert to similarity
-                                       dist = (1 - dist) * 100;
-       //      cout << dist << endl;   
-                                       int index = ceil(dist);
-       //      cout << "index = " << index << endl;    
-                                       if (index == 0) {       index = 1;      }
-                                       quantile[index-1].push_back(templateIS[i][j].score);
-                               }
-                       }
-               
-               
-                       for (int i = 0; i < quantile.size(); i++) {
-                               vector<float> temp;
-                               
-                               if (quantile[i].size() == 0) {
-                                       //in case this is not a distance found in your template files
-                                       for (int g = 0; g < 6; g++) {
-                                               temp.push_back(0.0);
-                                       }
-                               }else{
-                                       
-                                       sort(quantile[i].begin(), quantile[i].end());
-                                       
-                                       //save 10%
-                                       temp.push_back(quantile[i][int(quantile[i].size() * 0.10)]);
-                                       //save 25%
-                                       temp.push_back(quantile[i][int(quantile[i].size() * 0.25)]);
-                                       //save 50%
-                                       temp.push_back(quantile[i][int(quantile[i].size() * 0.5)]);
-                                       //save 75%
-                                       temp.push_back(quantile[i][int(quantile[i].size() * 0.75)]);
-                                       //save 95%
-                                       temp.push_back(quantile[i][int(quantile[i].size() * 0.95)]);
-                                       //save 99%
-                                       temp.push_back(quantile[i][int(quantile[i].size() * 0.99)]);
-                                       
-                               }
-                               
-                               //output quan value
-                               out4 << i+1 << '\t';                            
-                               for (int u = 0; u < temp.size(); u++) {   out4 << temp[u] << '\t'; }
-                               out4 << endl;
-                               
-                               quantile[i] = temp;
-                               
-                       }
-                       
-                       out4.close();
-                       
-                       mothurOut("Done."); mothurOutEndLine();
-
-               }
-               
-               //free memory
-               for (int i = 0; i < lines.size(); i++)                                  {       delete lines[i];                                }
-               for (int i = 0; i < templateLines.size(); i++)                  {       delete templateLines[i];                }
-                       
-       }
-       catch(exception& e) {
-               errorOut(e, "AlignSim", "getChimeras");
-               exit(1);
-       }
-}
-//***************************************************************************************************************
-vector<int> AlignSim::findWindows() {
-       try {
-               
-               vector<int> win; 
-               
-               if (increment > querySeqs[0]->getAligned().length()) {  mothurOut("You have selected an increment larger than the length of your sequences.  I will use the default of 25.");  increment = 25; }
-               
-               for (int m = increment;  m < (querySeqs[0]->getAligned().length() - increment); m+=increment) {  win.push_back(m);  }
-
-               return win;
-       
-       }
-       catch(exception& e) {
-               errorOut(e, "AlignSim", "findWindows");
-               exit(1);
-       }
-}
-
-//***************************************************************************************************************
-vector< vector<sim>  > AlignSim::findIS(int start, int end, vector<Sequence*> seqs) {
-       try {
-               
-               vector< vector<sim> >  isValues;
-               isValues.resize(seqs.size());
-               
-               //for each sequence
-               for(int i = start; i < end; i++){
-                       
-                       vector<sim> temp;  temp.resize(windowBreak.size());
-                       
-                       mothurOut("Finding IS value for sequence " + toString(i)); mothurOutEndLine();
-                       
-                       //for each window
-                       for (int m = 0; m < windowBreak.size(); m++) {
-                               
-                               vector<Sequence*> closest;  //left, right, overall
-                               vector<int> similarity; //left+right and overall
-                               
-                               //find closest left, closest right and closest overall
-                               closest = findClosestSides(seqs[i], windowBreak[m], similarity, i);
-                               
-                               int totalNumBases = seqs[i]->getUnaligned().length();
-                               
-                               //IS = left+right-overall
-                               float is = ((similarity[0]+similarity[1]) / (float) totalNumBases) - (similarity[2] / (float) totalNumBases);
-                               
-                               ///save IS, leftparent, rightparent, breakpoint
-                               temp[m].leftParent = closest[0];
-                               temp[m].rightParent = closest[1];
-                               temp[m].score = is;
-                               temp[m].midpoint = windowBreak[m];
-//cout << is << '\t';
-                       }
-//cout << endl;                        
-                       isValues[i] = temp;
-               
-               }
-               
-               return isValues;
-       
-       }
-       catch(exception& e) {
-               errorOut(e, "AlignSim", "findIS");
-               exit(1);
-       }
-}
-//***************************************************************************************************************
-vector<Sequence*> AlignSim::findClosestSides(Sequence* seq, int breakpoint, vector<int>& sim, int i) {
-       try{
-       
-               vector<Sequence*> closest;
-               
-               Sequence query, queryLeft, queryRight;
-               string frag = seq->getAligned();
-               string fragLeft = frag.substr(0, breakpoint);
-               string fragRight = frag.substr(breakpoint, frag.length());
-               
-               //get pieces
-               query = *(seq);  
-               queryLeft = *(seq);  
-               queryRight = *(seq);
-               queryLeft.setAligned(fragLeft);
-               queryRight.setAligned(fragRight);
-               
-               //initialize
-               Sequence* overall = templateSeqs[0];
-               Sequence* left = templateSeqs[0];
-               Sequence* right = templateSeqs[0];
-               
-               float smallestOverall, smallestLeft, smallestRight;
-               smallestOverall = 1000;  smallestLeft = 1000;  smallestRight = 1000;
-               
-               //go through the templateSeqs and search for the closest
-               for(int j = 0; j < templateSeqs.size(); j++){
-                       
-                       //so you don't pick yourself
-                       if (j != i) {
-                               Sequence temp, tempLeft, tempRight;
-                               string fragTemp = templateSeqs[j]->getAligned();
-                               string fragTempLeft = fragTemp.substr(0, breakpoint);
-                               string fragTempRight = fragTemp.substr(breakpoint, fragTemp.length());
-                               
-                               //get pieces
-                               temp = *(templateSeqs[j]); 
-                               tempLeft = *(templateSeqs[j]); 
-                               tempRight = *(templateSeqs[j]);
-                               tempLeft.setAligned(fragTempLeft);
-                               tempRight.setAligned(fragTempRight);
-                               
-                               //find overall dist
-                               distCalc->calcDist(query, temp);
-                               float dist = distCalc->getDist();       
-                               
-                               if (dist < smallestOverall) { 
-                                       overall = templateSeqs[j];
-                                       smallestOverall = dist;
-                               }
-                               
-                               //find left dist
-                               distCalc->calcDist(queryLeft, tempLeft);
-                               dist = distCalc->getDist();
-                               
-                               if (dist < smallestLeft) { 
-                                       left = templateSeqs[j];
-                                       smallestLeft = dist;
-                               }
-                               
-                               //find left dist
-                               distCalc->calcDist(queryRight, tempRight);
-                               dist = distCalc->getDist();
-                               
-                               if (dist < smallestRight) { 
-                                       right = templateSeqs[j];
-                                       smallestRight = dist;
-                               }
-                       }
-
-               }
-                       
-               closest.push_back(left);
-               closest.push_back(right);
-               closest.push_back(overall);
-               
-               //fill sim with number of matched bases
-               Sequence tempL = *(left);
-               string tempLFrag = tempL.getAligned();
-               tempL.setAligned(tempLFrag.substr(0, breakpoint));
-               
-               int bothMatches = findNumMatchedBases(queryLeft, tempL);
-               sim.push_back(bothMatches);
-
-               Sequence tempR = *(right);
-               string tempRFrag = tempR.getAligned();
-               tempR.setAligned(tempRFrag.substr(breakpoint, tempRFrag.length()));
-               
-               bothMatches = findNumMatchedBases(queryRight, tempR);
-               sim.push_back(bothMatches);
-
-               bothMatches = findNumMatchedBases(query, *(overall));
-               sim.push_back(bothMatches);
-                       
-               return closest;
-
-
-       }
-       catch(exception& e) {
-               errorOut(e, "AlignSim", "findClosestSides");
-               exit(1);
-       }
-}
-//***************************************************************************************************************
-
-int AlignSim::findNumMatchedBases(Sequence seq, Sequence temp) {
-       try{
-       
-               int num = 0;
-               
-               string query = seq.getAligned();
-               string subject = temp.getAligned();
-//cout << seq.getName() << endl << endl;
-//cout << temp.getName()  << endl << endl;
-               
-               for (int i = 0; i < query.length(); i++) {
-                       //count matches if query[i] is a base and subject is same bases or gap
-                       if (isalpha(query[i])) {
-                               if(!isalpha(subject[i])) {  num++;  } 
-                               else if (query[i] == subject[i])  { num++;  }
-                               else {  }
-                       }
-               }
-//cout << "num = " << num << endl;             
-               return num;
-       }
-       catch(exception& e) {
-               errorOut(e, "AlignSim", "findNumMatchedBases");
-               exit(1);
-       }
-}
-
-/**************************************************************************************************/
-vector< vector<sim> > AlignSim::createProcessesIS(vector<Sequence*> seqs, vector<linePair*> linesToProcess) {
-       try {
-       vector< vector<sim> > localIs; localIs.resize(seqs.size());
-       
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
-               int process = 0;
-               vector<int> processIDS;
-               
-               //loop through and create all the processes you want
-               while (process != processors) {
-                       int pid = fork();
-                       
-                       if (pid > 0) {
-                               processIDS.push_back(pid);  
-                               process++;
-                       }else if (pid == 0){
-                               
-                               mothurOut("Finding IS values for sequences " + toString(linesToProcess[process]->start) + " to " + toString(linesToProcess[process]->end)); mothurOutEndLine();
-                               localIs = findIS(linesToProcess[process]->start, linesToProcess[process]->end, seqs);
-                               mothurOut("Done finding IS values for sequences " +  toString(linesToProcess[process]->start) + " to " + toString(linesToProcess[process]->end)); mothurOutEndLine();
-                               
-                               //write out data to file so parent can read it
-                               ofstream out;
-                               string s = toString(getpid()) + ".temp";
-                               openOutputFile(s, out);
-                               
-                               //output pairs
-                               for (int i = linesToProcess[process]->start; i < linesToProcess[process]->end; i++) {
-                                        out << localIs[i].size() << endl;
-                                        for (int j = 0; j < localIs[i].size(); j++) {
-                                               localIs[i][j].leftParent->printSequence(out);
-                                               localIs[i][j].rightParent->printSequence(out);
-                                               out << ">" << '\t' << localIs[i][j].score << '\t' << localIs[i][j].midpoint << endl;
-                                        }
-                               }
-                               out.close();
-                               
-                               exit(0);
-                       }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
-               }
-               
-               //force parent to wait until all the processes are done
-               for (int i=0;i<processors;i++) { 
-                       int temp = processIDS[i];
-                       wait(&temp);
-               }
-               
-               //get data created by processes
-               for (int i=0;i<processors;i++) { 
-                       ifstream in;
-                       string s = toString(processIDS[i]) + ".temp";
-                       openInputFile(s, in);
-                       
-                       //get pairs
-                       for (int k = linesToProcess[i]->start; k < linesToProcess[i]->end; k++) {
-                               int size;
-                               in >> size;
-                               gobble(in);
-                               
-                               vector<sim> tempVector;
-                               
-                               for (int j = 0; j < size; j++) {
-                               
-                                       sim temp;
-                                       
-                                       temp.leftParent = new Sequence(in);
-                                       gobble(in);
-       //temp.leftParent->printSequence(cout);                         
-                                       temp.rightParent = new Sequence(in);
-                                       gobble(in);
-//temp.rightParent->printSequence(cout);
-                                       string throwaway;
-                                       in >> throwaway >> temp.score >> temp.midpoint;
-       //cout << temp.score << '\t' << temp.midpoint << endl;                          
-                                       gobble(in);
-                                       
-                                       tempVector.push_back(temp);
-                               }
-                               
-                               localIs[k] = tempVector;
-                       }
-                       
-                       in.close();
-                       remove(s.c_str());
-               }
-                       
-       
-#else
-               localIs = findIS(linesToProcess[0]->start, linesToProcess[0]->end, seqs);
-#endif 
-
-               return localIs;
-       }
-       catch(exception& e) {
-               errorOut(e, "AlignSim", "createProcessesIS");
-               exit(1);
-       }
-}
-
-//***************************************************************************************************************
diff --git a/alignedsimilarity.h b/alignedsimilarity.h
deleted file mode 100644 (file)
index 716b8b7..0000000
+++ /dev/null
@@ -1,67 +0,0 @@
-#ifndef ALIGNSIM_H
-#define ALIGNSIM_H
-
-/*
- *  alignedsimilarity.h
- *  Mothur
- *
- *  Created by westcott on 8/18/09.
- *  Copyright 2009 Schloss Lab. All rights reserved.
- *
- */
-
-
-#include "chimera.h"
-#include "dist.h"
-
-//This class was created using the algorythms described in the 
-//"Evaluation of Nearest Neighbor Methods for Detection of Chimeric Small-Subunit rRna Sequences" paper 
-//by J.F. Robison-Cox 1, M.M. Bateson 2 and D.M. Ward 2
-
-
-/***********************************************************/
-
-class AlignSim : public Chimera {
-       
-       public:
-               AlignSim(string, string);       
-               ~AlignSim();
-               
-               void getChimeras();
-               void print(ostream&);
-               
-               void setCons(string){};
-               void setQuantiles(string q) { quanfile = q; };
-               
-               
-       private:
-               
-               Dist* distCalc;
-               vector<linePair*> lines;
-               vector<linePair*> templateLines;
-               
-               vector<Sequence*> querySeqs;
-               vector<Sequence*> templateSeqs;
-               
-               vector< vector<sim> > IS;  //IS[0] is the vector os sim values for each window for querySeqs[0]
-               vector< vector<sim> > templateIS;  //templateIS[0] is the vector os sim values for each window for templateSeqs[0]
-               vector<int> windowBreak;
-               
-               string fastafile, templateFile;
-               int iters;
-               
-               vector< vector<sim> > findIS(int, int, vector<Sequence*>);
-               vector<Sequence*> findClosestSides(Sequence*, int, vector<int>&, int);
-               int findNumMatchedBases(Sequence, Sequence);
-               vector<int> findWindows();
-               
-               vector< vector<float> > quantile;
-               
-               vector< vector<sim> > createProcessesIS(vector<Sequence*>, vector<linePair*>);
-               
-};
-
-/***********************************************************/
-
-#endif
-
index 25b27604533cbefa4ba1748001a45622e9712547..2734c389cd0122add50ee1b3cc23cd879788ca0b 100644 (file)
@@ -31,7 +31,22 @@ Alignment::Alignment(int A) : nCols(A), nRows(A) {
                exit(1);
        }
 }
+/**************************************************************************************************/
+void Alignment::resize(int A) {
+       try {
+               nCols = A;
+               nRows = A;
 
+               alignment.resize(nRows);                        
+               for(int i=0;i<nRows;i++){                       
+                       alignment[i].resize(nCols);             
+               }       
+       }
+       catch(exception& e) {
+               errorOut(e, "Alignment", "resize");
+               exit(1);
+       }
+}
 /**************************************************************************************************/
 
 void Alignment::traceBack(){                   //      This traceback routine is used by the dynamic programming algorithms
index 77b650cab75b8f9bbbd4241f851ccf48da556c3f..6dd7f0aed42938bd94a50bc6d4ebf88eda1c2b40 100644 (file)
@@ -36,6 +36,8 @@ public:
        int getTemplateEndPos();
        
        int getPairwiseLength();
+       void resize(int);
+       int getnRows() { return nRows; }
 //     int getLongestTemplateGap();
 
 protected:
index ec2e355f882d86a60aba6f520604ef06ead6e6c8..0f1dd50aba5fec78ddd3aff3667533fcdef500b4 100644 (file)
--- a/ccode.cpp
+++ b/ccode.cpp
@@ -177,18 +177,8 @@ void Ccode::getChimeras() {
                                lines.push_back(new linePair((i*linesPerProcess), numSeqs));
                        }
                        
-                       //find breakup of templatefile for quantiles
-                       if (processors == 1) {   templateLines.push_back(new linePair(0, templateSeqs.size()));  }
-                       else { 
-                               for (int i = 0; i < processors; i++) {
-                                       templateLines.push_back(new linePair());
-                                       templateLines[i]->start = int (sqrt(float(i)/float(processors)) * templateSeqs.size());
-                                       templateLines[i]->end = int (sqrt(float(i+1)/float(processors)) * templateSeqs.size());
-                               }
-                       }
                #else
                        lines.push_back(new linePair(0, numSeqs));
-                       templateLines.push_back(new linePair(0, templateSeqs.size()));
                #endif
        
                distCalc = new eachGapDist();
@@ -305,7 +295,6 @@ void Ccode::getChimeras() {
                
                //free memory
                for (int i = 0; i < lines.size(); i++)                                  {       delete lines[i];                                }
-               for (int i = 0; i < templateLines.size(); i++)                  {       delete templateLines[i];                }
                delete distCalc;
                delete decalc;
                        
diff --git a/ccode.h b/ccode.h
index 2888aea3b7cc8bf2e063c078b7abbaae43f1d62c..bc1aad8b2b6776c57922ad644e40ef325ea78d4d 100644 (file)
--- a/ccode.h
+++ b/ccode.h
@@ -42,7 +42,6 @@ class Ccode : public Chimera {
                
                
                vector<linePair*> lines;
-               vector<linePair*> templateLines;
                vector<Sequence*> querySeqs;
                vector<Sequence*> templateSeqs;
                vector< map<int, int> > spotMap;
index 366068b43f77cae77f6df9ce82766e253a8789e1..8d7e08d50b37fb92049f52d4d9083030a6acbd5b 100644 (file)
--- a/chimera.h
+++ b/chimera.h
@@ -39,8 +39,8 @@ inline bool compareSeqDist(SeqDist left, SeqDist right){
 //********************************************************************************************************************
 
 struct sim {
-               Sequence* leftParent;
-               Sequence* rightParent; 
+               string leftParent;
+               string rightParent; 
                float score;  
                int midpoint;
 };
@@ -68,6 +68,7 @@ class Chimera {
                virtual void setWindow(int w)                   {       window = w;                     }
                virtual void setIncrement(int i)                {       increment = i;          }
                virtual void setNumWanted(int n)                {       numWanted = n;          }
+               virtual void setKmerSize(int k)                 {       kmerSize = k;           }
                
                virtual void setCons(string){};
                virtual void setQuantiles(string){};
@@ -85,7 +86,7 @@ class Chimera {
        protected:
                
                bool filter, correction;
-               int processors, window, increment, numWanted;
+               int processors, window, increment, numWanted, kmerSize;
                string seqMask, quanfile, filterString;
                        
 
diff --git a/chimeracheckrdp.cpp b/chimeracheckrdp.cpp
new file mode 100644 (file)
index 0000000..73bc1a0
--- /dev/null
@@ -0,0 +1,295 @@
+/*
+ *  chimeracheckrdp.cpp
+ *  Mothur
+ *
+ *  Created by westcott on 9/8/09.
+ *  Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "chimeracheckrdp.h"
+               
+//***************************************************************************************************************
+ChimeraCheckRDP::ChimeraCheckRDP(string filename, string temp) {  fastafile = filename;  templateFile = temp;  }
+//***************************************************************************************************************
+
+ChimeraCheckRDP::~ChimeraCheckRDP() {
+       try {
+               for (int i = 0; i < querySeqs.size(); i++)              {  delete querySeqs[i];         }
+               delete templateDB;
+               delete kmer;
+       }
+       catch(exception& e) {
+               errorOut(e, "ChimeraCheckRDP", "~AlignSim");
+               exit(1);
+       }
+}      
+//***************************************************************************************************************
+void ChimeraCheckRDP::print(ostream& out) {
+       try {
+               
+               mothurOutEndLine();
+       /*      
+               for (int i = 0; i < querySeqs.size(); i++) {
+                       
+                       int j = 0;  float largest = -10;
+                       //find largest sim value
+                       for (int k = 0; k < IS[i].size(); k++) {
+                               //is this score larger
+                               if (IS[i][k].score > largest) {
+                                       j = k;
+                                       largest = IS[i][k].score;
+                               }
+                       }
+                       
+                       //find parental similarity
+                       distCalc->calcDist(*(IS[i][j].leftParent), *(IS[i][j].rightParent));
+                       float dist = distCalc->getDist();
+                       
+                       //convert to similarity
+                       dist = (1 - dist) * 100;
+
+                       //warn about parental similarity - if its above 82% may not detect a chimera 
+                       if (dist >= 82) { mothurOut("When the chimeras parental similarity is above 82%, detection rates drop signifigantly.");  mothurOutEndLine(); }
+                       
+                       int index = ceil(dist);
+                       
+                       if (index == 0) { index=1;  }
+       
+                       //is your DE value higher than the 95%
+                       string chimera;
+                       if (IS[i][j].score > quantile[index-1][4])              {       chimera = "Yes";        }
+                       else                                                                            {       chimera = "No";         }                       
+                       
+                       out << querySeqs[i]->getName() <<  "\tparental similarity: " << dist << "\tIS: " << IS[i][j].score << "\tbreakpoint: " << IS[i][j].midpoint << "\tchimera flag: " << chimera << endl;
+                       
+                       if (chimera == "Yes") {
+                               mothurOut(querySeqs[i]->getName() + "\tparental similarity: " + toString(dist) + "\tIS: " + toString(IS[i][j].score) + "\tbreakpoint: " + toString(IS[i][j].midpoint) + "\tchimera flag: " + chimera); mothurOutEndLine();
+                       }
+                       out << "Improvement Score\t";
+                       
+                       for (int r = 0; r < IS[i].size(); r++) {  out << IS[i][r].score << '\t';  }
+                       out << endl;
+               }*/
+       }
+       catch(exception& e) {
+               errorOut(e, "ChimeraCheckRDP", "print");
+               exit(1);
+       }
+}
+
+//***************************************************************************************************************
+void ChimeraCheckRDP::getChimeras() {
+       try {
+               
+               //read in query sequences and subject sequences
+               mothurOut("Reading sequences and template file... "); cout.flush();
+               querySeqs = readSeqs(fastafile);
+               //templateSeqs = readSeqs(templateFile);
+               templateDB = new KmerDB(templateFile, kmerSize);
+               mothurOut("Done."); mothurOutEndLine();
+               
+               int numSeqs = querySeqs.size();
+               
+               IS.resize(numSeqs);
+               closest.resize(numSeqs);
+               
+               //break up file if needed
+               int linesPerProcess = numSeqs / processors ;
+               
+               #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+                       //find breakup of sequences for all times we will Parallelize
+                       if (processors == 1) {   lines.push_back(new linePair(0, numSeqs));  }
+                       else {
+                               //fill line pairs
+                               for (int i = 0; i < (processors-1); i++) {                      
+                                       lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess)));
+                               }
+                               //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end
+                               int i = processors - 1;
+                               lines.push_back(new linePair((i*linesPerProcess), numSeqs));
+                       }
+                       
+               #else
+                       lines.push_back(new linePair(0, numSeqs));
+               #endif
+               
+               kmer = new Kmer(kmerSize);
+               
+               //find closest seq to each querySeq
+               for (int i = 0; i < querySeqs.size(); i++) {
+                       closest[i] = templateDB->findClosestSequence(querySeqs[i]);  
+               }
+               
+               //fill seqKmerInfo for query seqs
+               for (int i = 0; i < querySeqs.size(); i++) {
+                       seqKmerInfo[querySeqs[i]->getName()] = kmer->getKmerCounts(querySeqs[i]->getUnaligned());
+               }
+               
+               //fill seqKmerInfo for closest
+               for (int i = 0; i < closest.size(); i++) {
+                       seqKmerInfo[closest[i].getName()] = kmer->getKmerCounts(closest[i].getUnaligned());
+               }
+
+               
+               //for each query find IS value - this should be paralellized, 
+               //but paralellizing may cause you to have to recalculate some seqKmerInfo since the separate processes don't share memory after they split
+               for (int i = 0; i < querySeqs.size(); i++) {
+                       IS[i] = findIS(i);  //fills seqKmerInfo
+               }
+               
+               
+               //free memory
+               for (int i = 0; i < lines.size(); i++)                                  {       delete lines[i];        }
+       
+                       
+       }
+       catch(exception& e) {
+               errorOut(e, "ChimeraCheckRDP", "getChimeras");
+               exit(1);
+       }
+}
+//***************************************************************************************************************
+vector<sim> ChimeraCheckRDP::findIS(int query) {
+       try {
+               
+               vector<sim>  isValues;
+               string queryName = querySeqs[query]->getName();
+               string seq = querySeqs[query]->getUnaligned();
+               
+               mothurOut("Finding IS values for sequence " + query); mothurOutEndLine();
+               
+               //find total kmers you have in common with closest[query] by looking at the last entry in the vector of maps for each
+               int nTotal = calcKmers(seqKmerInfo[queryName][(seqKmerInfo[queryName].size()-1)], seqKmerInfo[closest[query].getName()][(seqKmerInfo[closest[query].getName()].size()-1)]);
+               
+               //you don't want the starting point to be virtually at hte end so move it in 10%
+               int start = seq.length() / 10;
+                       
+               //for each window
+               for (int m = start; m < (seq.length() - start); m+=increment) {
+                       
+                       sim temp;
+                       
+                       string fragLeft = seq.substr(0, m);  //left side of breakpoint
+                       string fragRight = seq.substr(m, seq.length());  //right side of breakpoint
+                       
+                       //make a sequence of the left side and right side
+                       Sequence* left = new Sequence(queryName, fragLeft);
+                       Sequence* right = new Sequence(queryName, fragRight);
+                       
+                       //find seqs closest to each fragment
+                       Sequence closestLeft = templateDB->findClosestSequence(left); 
+                       Sequence closestRight = templateDB->findClosestSequence(right); 
+                       
+                       map<int, int>::iterator itleft;
+                       map<int, int>::iterator itleftclose;
+                       
+                       //get kmer in the closest seqs
+                       //if it's not found calc kmer info and save, otherwise use already calculated data
+                       //left
+                       it = seqKmerInfo.find(closestLeft.getName());
+                       if (it == seqKmerInfo.end()) {  //you have to calc it
+                               seqKmerInfo[closestLeft.getName()] = kmer->getKmerCounts(closestLeft.getUnaligned());
+                       }
+                       
+                       //right
+                       it = seqKmerInfo.find(closestRight.getName());
+                       if (it == seqKmerInfo.end()) {  //you have to calc it
+                               seqKmerInfo[closestRight.getName()] = kmer->getKmerCounts(closestRight.getUnaligned());
+                       }
+                       
+                       //right side is tricky - since the counts grow on eachother to find the correct counts of only the right side you must subtract the counts of the left side
+                       //iterate through left sides map to subtract the number of times you saw things before you got the the right side
+                       map<int, int> rightside;
+                       for (itleft = seqKmerInfo[queryName][m-kmerSize].begin(); itleft != seqKmerInfo[queryName][m-kmerSize].end(); itleft++) {
+                               int howManyTotal = seqKmerInfo[queryName][seqKmerInfo[queryName].size()-1][itleft->first];   //times that kmer was seen in total
+                               
+                               //itleft->second is times it was seen in left side, so howmanytotal - leftside should give you right side
+                               int howmanyright = howManyTotal - itleft->second;
+                               
+                               //if any were seen just on the right add that ammount to map
+                               if (howmanyright > 0) {
+                                       rightside[itleft->first] = howmanyright;
+                               }
+                       }
+                       
+                       //iterate through left side of the seq closest to the right fragment of query to subtract the number you saw before you reached the right side of the closest right
+                       //this way you can get the map for just the fragment you want to compare and not hte whole sequence
+                       map<int, int> rightsideclose;
+                       for (itleftclose = seqKmerInfo[closestRight.getName()][m-kmerSize].begin(); itleftclose != seqKmerInfo[closestRight.getName()][m-kmerSize].end(); itleftclose++) {
+                               int howManyTotal = seqKmerInfo[closestRight.getName()][seqKmerInfo[closestRight.getName()].size()-1][itleftclose->first];   //times that kmer was seen in total
+                               
+                               //itleft->second is times it was seen in left side, so howmanytotal - leftside should give you right side
+                               int howmanyright = howManyTotal - itleftclose->second;
+                               
+                               //if any were seen just on the right add that ammount to map
+                               if (howmanyright > 0) {
+                                       rightsideclose[itleftclose->first] = howmanyright;
+                               }
+                       }
+
+                       int nLeft = calcKmers(seqKmerInfo[closestLeft.getName()][m-kmerSize], seqKmerInfo[queryName][m-kmerSize]);
+                       int nRight = calcKmers(rightsideclose, rightside);
+                       
+                       int is = nLeft + nRight - nTotal;
+                       
+                       //save IS, leftparent, rightparent, breakpoint
+                       temp.leftParent = closestLeft.getName();
+                       temp.rightParent = closestRight.getName();
+                       temp.score = is;
+                       temp.midpoint = m;
+                       
+                       isValues.push_back(temp);
+                       
+                       delete left;
+                       delete right;
+               }       
+               
+               return isValues;
+       
+       }
+       catch(exception& e) {
+               errorOut(e, "ChimeraCheckRDP", "findIS");
+               exit(1);
+       }
+}
+
+//***************************************************************************************************************
+//find the smaller map and iterate through it and count kmers in common
+int ChimeraCheckRDP::calcKmers(map<int, int> query, map<int, int> subject) {
+       try{
+               
+               int common = 0;
+               map<int, int>::iterator small;
+               map<int, int>::iterator large;
+               
+               if (query.size() < subject.size()) {
+               
+                       for (small = query.begin(); small != query.end(); small++) {
+                               large = subject.find(small->first);
+                               
+                               //if you found it they have that kmer in common
+                               if (large != subject.end()) {   common++;       }
+                       }
+                       
+               }else { 
+                
+                       for (small = subject.begin(); small != subject.end(); small++) {
+                               large = query.find(small->first);
+                               
+                               //if you found it they have that kmer in common
+                               if (large != query.end()) {             common++;       }
+                       }
+               }
+               
+               return common;
+               
+       }
+       catch(exception& e) {
+               errorOut(e, "ChimeraCheckRDP", "calcKmers");
+               exit(1);
+       }
+}
+
+//***************************************************************************************************************
+
diff --git a/chimeracheckrdp.h b/chimeracheckrdp.h
new file mode 100644 (file)
index 0000000..e31cae7
--- /dev/null
@@ -0,0 +1,72 @@
+#ifndef CHIMERACHECK_H
+#define CHIMERACHECK_H
+
+/*
+ *  chimeracheckrdp.h
+ *  Mothur
+ *
+ *  Created by westcott on 9/8/09.
+ *  Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+
+#include "chimera.h"
+#include "kmer.hpp"
+#include "kmerdb.hpp"
+#include "database.hpp"
+
+//This class was created using the algorythms described in 
+//CHIMERA_CHECK version 2.7 written by Niels Larsen. 
+
+/***********************************************************/
+
+class ChimeraCheckRDP : public Chimera {
+       
+       public:
+               ChimeraCheckRDP(string, string);        
+               ~ChimeraCheckRDP();
+               
+               void getChimeras();
+               void print(ostream&);
+               
+               void setCons(string){};
+               void setQuantiles(string q) {};
+               
+               
+       private:
+               
+               vector<linePair*> lines;
+               vector<Sequence*> querySeqs;
+               Database* templateDB;
+               Kmer* kmer;
+               
+               vector< vector<sim> > IS;  //IS[0] is the vector of IS values for each window for querySeqs[0]
+               
+               //map of vector of maps- I know its a little convaluted but I am trying to save time 
+               //I think that since the window is only sliding 10 bases there is a good probability that the closest seq to each fragment
+               //will be the same for several windows so I want to save the vector of maps containing its kmer info rather than regenerating it.
+               //So...
+               map<string, vector< map<int, int> > > seqKmerInfo;  // outer map - sequence name -> kmer info 
+                                                                                                                       // kmer info: inner vector of maps - each entry in the vector is a map of the kmers up to that spot in the unaligned seq
+                                                                                                                       //example:  seqKmerInfo["ecoli"][50] = map containing the kmers found in the first 50 + kmersize characters of ecoli.
+                                                                                                                       //i chose to store the kmers numbers in a map so you wouldn't have to check for dupilcate entries and could easily find the 
+                                                                                                                       //kmers 2 seqs had in common.  There may be a better way to do this thats why I am leaving so many comments...
+               map<string, vector< map<int, int> > >:: iterator it;
+               map<int, int>::iterator it2;
+               
+               vector<Sequence> closest;               //closest[0] is the closest overall seq to querySeqs[0].
+               
+               string fastafile, templateFile;
+               
+               
+               vector<sim> findIS(int);
+               int calcKmers(map<int, int>, map<int, int>);            
+               vector< vector<sim> > createProcessesIS(vector<Sequence*>, vector<linePair*>);
+               
+};
+
+/***********************************************************/
+
+#endif
+
index 6af8aa3f3d4f1ae3085743eb90b4472edfbfe301..e8fa34df2e8f681bf158ef80b384a93e9ef49b4c 100644 (file)
@@ -10,8 +10,8 @@
 #include "chimeraseqscommand.h"
 #include "bellerophon.h"
 #include "pintail.h"
-#include "alignedsimilarity.h"
 #include "ccode.h"
+#include "chimeracheckrdp.h"
 
 
 //***************************************************************************************************************
@@ -25,7 +25,7 @@ ChimeraSeqsCommand::ChimeraSeqsCommand(string option){
                
                else {
                        //valid paramters for this command
-                       string Array[] =  {"fasta", "filter", "correction", "processors", "method", "window", "increment", "template", "conservation", "quantile", "mask", "numwanted" };
+                       string Array[] =  {"fasta", "filter", "correction", "processors", "method", "window", "increment", "template", "conservation", "quantile", "mask", "numwanted", "ksize" };
                        vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
                        
                        OptionParser parser(option);
@@ -63,7 +63,9 @@ ChimeraSeqsCommand::ChimeraSeqsCommand(string option){
                                if (ableToOpen == 1) { abort = true; }
                                in.close();
                        }
-
+                       
+                       method = validParameter.validFile(parameters, "method", false);                 if (method == "not found") { method = "pintail"; }
+                       
                        string temp;
                        temp = validParameter.validFile(parameters, "filter", false);                   if (temp == "not found") { temp = "F"; }
                        filter = isTrue(temp);
@@ -74,16 +76,21 @@ ChimeraSeqsCommand::ChimeraSeqsCommand(string option){
                        temp = validParameter.validFile(parameters, "processors", false);               if (temp == "not found") { temp = "1"; }
                        convert(temp, processors);
                        
+                       temp = validParameter.validFile(parameters, "ksize", false);                    if (temp == "not found") { temp = "7"; }
+                       convert(temp, ksize);
+                       
                        temp = validParameter.validFile(parameters, "window", false);                   if (temp == "not found") { temp = "0"; }
                        convert(temp, window);
                                        
-                       temp = validParameter.validFile(parameters, "increment", false);                        if (temp == "not found") { temp = "25"; }
+                       temp = validParameter.validFile(parameters, "increment", false);                
+                       if ((temp == "not found") && (method == "chimeracheck")) { temp = "10"; }
+                       else if (temp == "not found") { temp = "25"; }
                        convert(temp, increment);
                        
-                       temp = validParameter.validFile(parameters, "numwanted", false);                        if (temp == "not found") { temp = "20"; }
+                       temp = validParameter.validFile(parameters, "numwanted", false);                if (temp == "not found") { temp = "20"; }
                        convert(temp, numwanted);
 
-                       method = validParameter.validFile(parameters, "method", false);         if (method == "not found") { method = "pintail"; }
+                       
                        
                        if (((method == "pintail") || (method == "alignsim")) && (templatefile == "")) { mothurOut("You must provide a template file with the pintail and alignsim methods."); mothurOutEndLine(); abort = true;  }
                        
@@ -136,10 +143,10 @@ int ChimeraSeqsCommand::execute(){
                
                if (abort == true) { return 0; }
                
-               if (method == "bellerophon")    {               chimera = new Bellerophon(fastafile);                           }
-               else if (method == "pintail")   {               chimera = new Pintail(fastafile, templatefile);         }
-               //else if (method == "alignsim")        {               chimera = new AlignSim(fastafile, templatefile);        }
-               else if (method == "ccode")             {               chimera = new Ccode(fastafile, templatefile);           }
+               if (method == "bellerophon")                    {               chimera = new Bellerophon(fastafile);                                           }
+               else if (method == "pintail")                   {               chimera = new Pintail(fastafile, templatefile);                         }
+               else if (method == "ccode")                             {               chimera = new Ccode(fastafile, templatefile);                           }
+               else if (method == "chimeracheck")              {               chimera = new ChimeraCheckRDP(fastafile, templatefile);         }
                else { mothurOut("Not a valid method."); mothurOutEndLine(); return 0;          }
                
                //set user options
@@ -160,6 +167,7 @@ int ChimeraSeqsCommand::execute(){
                chimera->setWindow(window);
                chimera->setIncrement(increment);
                chimera->setNumWanted(numwanted);
+               chimera->setKmerSize(ksize);
                                
                //find chimeras
                chimera->getChimeras();
index 6274c48ea2acd8849155e9fc3b8813f6bae74919..b2bce8107590e3d4906ea68b7e758cd5bf15b951 100644 (file)
@@ -30,7 +30,7 @@ private:
        bool abort;
        string method, fastafile, templatefile, consfile, quanfile, maskfile;
        bool filter, correction;
-       int processors, midpoint, averageLeft, averageRight, window, iters, increment, numwanted;
+       int processors, midpoint, averageLeft, averageRight, window, iters, increment, numwanted, ksize;
        Chimera* chimera;
        
        
index 9784ef93035bbf896eb3ec2c7090cbda7a426227..f84c9e1bd8f3c3f2111cde09657966f5c787b1eb 100644 (file)
@@ -92,3 +92,4 @@ int Database::getLongestBase()        {       return longest+1;               }
 
 /**************************************************************************************************/
 
+
index b3bf02247df2c6c047aea8a30293876c2f715cc7..6ee84d3da3e8917c8c02d9c2d37deb68e08397b5 100644 (file)
--- a/kmer.cpp
+++ b/kmer.cpp
@@ -38,6 +38,32 @@ string Kmer::getKmerString(string sequence){ //      Calculate kmer for each position
        
        return kmerString;      
 }
+/**************************************************************************************************/
+
+vector< map<int, int> > Kmer::getKmerCounts(string sequence){  //      Calculate kmer for each position in the sequence, save info in a map
+       int length = sequence.length();                         //      so you know at each spot in the sequence what kmers were found
+       int nKmers = length - kmerSize + 1;                     //      
+       vector< map<int, int> > counts; counts.resize(nKmers);  // a map kmer counts for each spot
+        map<int, int>::iterator it;
+       
+       for(int i=0;i<nKmers;i++){                                      //      Go though sequence and get the number between 0 and maxKmer for that
+               if (i == 0) {
+                       int kmerNumber = getKmerNumber(sequence, i);//  kmer. 
+                       counts[i][kmerNumber] = 1;                      // add this kmer if not already there
+               }else { 
+                       //your count is everything that came before and whatever you find now
+                       counts[i] = counts[i-1];
+                       int kmerNumber = getKmerNumber(sequence, i);//  kmer.  
+                       
+                       it = counts[i].find(kmerNumber);
+                       if (it!= counts[i].end()) {   counts[i][kmerNumber]++;  }  //increment number of times you have seen this kmer
+                       else {   counts[i][kmerNumber] = 1;   }  // add this kmer since not already there
+               }
+       }
+       
+       return counts;  
+}
+       
        
 /**************************************************************************************************/
 
index 31ca9fb4b7442a03757fc7893efa201a7d3f24be..d5b67d66d173cb24c179569d44886f2591ea2530 100644 (file)
--- a/kmer.hpp
+++ b/kmer.hpp
@@ -21,7 +21,7 @@ public:
        string getKmerString(string);
        int getKmerNumber(string, int);
        string getKmerBases(int);
-       
+       vector< map<int, int> > getKmerCounts(string sequence);  //for use in chimeraCheck
        
 private:
        char getASCII(int);
index 5019858dcf477c2e27bcc9ad3007617e26d3ce96..d50c8b2607347aa337b8cc57fc0566b40ccbb01b 100644 (file)
@@ -20,6 +20,7 @@
  */
 
 #include "mothur.h"
+#include "database.hpp"
 
 class KmerDB : public Database {
        
index 804e3c174d4408c2ae55963ba391098002978c18..294940cd6aabd49f67aca028311d21529f0912ee 100644 (file)
--- a/nast.cpp
+++ b/nast.cpp
@@ -94,7 +94,7 @@ void Nast::removeExtraGaps(string& candAln, string tempAln, string newTemplateAl
        try {
        
                int longAlignmentLength = newTemplateAlign.length();    
-               
+       
                for(int i=0; i<longAlignmentLength; i++){                               //      use the long alignment as the standard
                        int rightIndex, rightRoom, leftIndex, leftRoom;
                        
@@ -123,7 +123,7 @@ void Nast::removeExtraGaps(string& candAln, string tempAln, string newTemplateAl
                                int insertLength = 0;                                                   //      figure out how long the anomaly is
                                while(!isalpha(newTemplateAlign[i + insertLength]))     {       insertLength++; }
                                if(insertLength > maxInsertLength){     maxInsertLength = insertLength; }
-                                       
+               
                                if((leftRoom + rightRoom) >= insertLength){
        
                                        //      Parts D & E from Fig. 2 of DeSantis et al.
@@ -168,6 +168,7 @@ void Nast::removeExtraGaps(string& candAln, string tempAln, string newTemplateAl
                                                                        
                                                }
                                                else{                                                                   //      not enough room to the right, have to steal some 
+                       
                                                        //      space to the left lets move left and then right...
                                                        string leftTemplateString = newTemplateAlign.substr(0,i);
                                                        string rightTemplateString = newTemplateAlign.substr(i+insertLength);
@@ -183,7 +184,7 @@ void Nast::removeExtraGaps(string& candAln, string tempAln, string newTemplateAl
                                        }
                                }
                                else{
-                                                                                                       //      there could be a case where there isn't enough room in
+                       //      there could be a case where there isn't enough room in
                                        string leftTemplateString = newTemplateAlign.substr(0,i);                       //      either direction to move stuff
                                        string rightTemplateString = newTemplateAlign.substr(i+leftRoom+rightRoom);
                                        newTemplateAlign = leftTemplateString + rightTemplateString;
@@ -193,9 +194,9 @@ void Nast::removeExtraGaps(string& candAln, string tempAln, string newTemplateAl
                                        string insertString = candAln.substr(leftIndex+1,rightIndex-leftIndex-1);
                                        string rightCandidateString = candAln.substr(rightIndex+rightRoom);
                                        candAln = leftCandidateString + insertString + rightCandidateString;
-       
+
                                }
-                               
+                       
                                i -= insertLength;
                        } 
                }
@@ -337,12 +338,11 @@ void Nast::regapSequences(){      //This is essentially part B in Fig 2. of DeSantis
                        candAln[i] = toupper(candAln[i]);                       //      everything is upper case
                }
                
-               
+       
                if(candAln.length() != tempAln.length()){               //      if the regapped candidate sequence is longer than the official
                        removeExtraGaps(candAln, tempAln, newTemplateAlign);//  template alignment then we need to do steps C-F in Fig.
                }                                                                                               //      2 of Desantis et al.
-               
-               
+                       
                candidateSeq->setAligned(candAln);
        }
        catch(exception& e) {
index b14d8e3d10e42ee9c7a21f5595315b29ad172ff5..80ba4ca0aec06ea796a5a9bd7fbe7522f362157a 100644 (file)
@@ -42,8 +42,6 @@ gap(gO), match(m), mismatch(mm), Alignment(r) {                                                       //      the gap openning penalt
                errorOut(e, "NeedlemanOverlap", "NeedlemanOverlap");
                exit(1);
        }
-                                                                                                                               
-                                                                                                                                               
 }
 /**************************************************************************************************/
 
@@ -56,7 +54,7 @@ void NeedlemanOverlap::align(string A, string B){
                seqA = ' ' + A; lA = seqA.length();             //      algorithm requires a dummy space at the beginning of each string
                seqB = ' ' + B; lB = seqB.length();             //      algorithm requires a dummy space at the beginning of each string
 
-               if (lA > nRows) { mothurOut("Your one of your candidate sequences is longer than you longest template sequence."); mothurOutEndLine();  }
+               if (lA > nRows) { mothurOut("One of your candidate sequences is longer than you longest template sequence. Your longest template sequence is " + toString(nRows) + ". Your candidate is " + toString(lA) + "."); mothurOutEndLine();  }
                
                for(int i=1;i<lB;i++){                                  //      This code was largely translated from Perl code provided in Ex 3.1 
                        for(int j=1;j<lA;j++){                          //      of the O'Reilly BLAST book.  I found that the example output had a
@@ -90,10 +88,11 @@ void NeedlemanOverlap::align(string A, string B){
                                }
                        }
                }
+
                Overlap over;                                           
                over.setOverlap(alignment, lA, lB, 0);          //      Fix gaps at the beginning and end of the sequences
                traceBack();                                                            //      Traceback the alignment to populate seqAaln and seqBaln
-               
+       
        }
        catch(exception& e) {
                errorOut(e, "NeedlemanOverlap", "align");
index a5e21406be2743d8709c9ef5cb81aaa95b49824d..ea7bee515bff32685f03b195506fa4416814dddf 100644 (file)
@@ -32,6 +32,7 @@ public:
        NeedlemanOverlap(float, float, float, int);
        ~NeedlemanOverlap();
        void align(string, string);
+
        
 private:       
        float gap;
index 10a964848bd24f81a2f7658b0f1711427fbe3897..b2c2b53d0b4078e6e296312662dcdfe0ffe5fbec 100644 (file)
@@ -291,6 +291,7 @@ void Sequence::reverseComplement(){
                else                                            {       temp += 'N';    }
        }
        unaligned = temp;
+       aligned = temp;
        
 }
 
index 2a982c7135e5b6fc83ed49dbe89cf3804dbc540b..455cdc29aa47ecb1ca26a7f9b4021bef151d6314 100644 (file)
@@ -160,7 +160,7 @@ int TrimSeqsCommand::execute(){
                if(qFileName != "")     {       openInputFile(qFileName, qFile);        }
                
                bool success;
-               
+                       
                while(!inFASTA.eof()){
                        Sequence currSeq(inFASTA);
                        string origSeq = currSeq.getUnaligned();
@@ -173,6 +173,7 @@ int TrimSeqsCommand::execute(){
                                if(!success)                    {       trashCode += 'q';                                                               }
                        }
                        if(barcodes.size() != 0){
+       
                                success = stripBarcode(currSeq, group);
                                if(!success){   trashCode += 'b';       }
                        }
@@ -186,7 +187,8 @@ int TrimSeqsCommand::execute(){
                        }
                        if(minLength > 0 || maxLength > 0){
                                success = cullLength(currSeq);
-                               if(!success){   trashCode += 'l';       }
+                       if ((currSeq.getUnaligned().length() > 300) && (success)) {  cout << "too long " << currSeq.getUnaligned().length() << endl;  }
+                               if(!success){   trashCode += 'l'; }
                        }
                        if(maxHomoP > 0){
                                success = cullHomoP(currSeq);
@@ -200,6 +202,7 @@ int TrimSeqsCommand::execute(){
                        if(flip){       currSeq.reverseComplement();    }               // should go last                       
                        
                        if(trashCode.length() == 0){
+                               currSeq.setAligned(currSeq.getUnaligned());  //this is because of a modification we made to the sequence class to fix a bug.  all seqs have an aligned version, which is the version that gets printed.
                                currSeq.printSequence(outFASTA);
                                if(barcodes.size() != 0){
                                        outGroups << currSeq.getName() << '\t' << groupVector[group] << endl;