]> git.donarmstrong.com Git - mothur.git/commitdiff
added template=self capability to chimers.slayer, worked on corr.axes and added accno...
authorwestcott <westcott>
Mon, 3 Jan 2011 19:16:52 +0000 (19:16 +0000)
committerwestcott <westcott>
Mon, 3 Jan 2011 19:16:52 +0000 (19:16 +0000)
chimera.h
chimeraslayer.cpp
chimeraslayer.h
chimeraslayercommand.cpp
chimeraslayercommand.h
corraxescommand.cpp
corraxescommand.h
getseqscommand.cpp
getseqscommand.h
mothur
sequence.cpp

index 24ab3f74e5b783becc8106beffeef09a22191cb8..6ac6191692a10c7c445b442721fc289784ff549d 100644 (file)
--- a/chimera.h
+++ b/chimera.h
@@ -83,6 +83,7 @@ struct linePair {
                        linePair(){}
 };
 
+
 /***********************************************************************/
 
 class Chimera {
index 1336876966f110dc25e0e18eee8bd13ec4e8d6fd..ee449cc9fd30f7ec521a03311473a805739159d1 100644 (file)
@@ -43,6 +43,97 @@ int minsim, int mincov, int minbs, int minsnp, int par, int it, int inc, int num
                exit(1);
        }
 }
+//***************************************************************************************************************
+ChimeraSlayer::ChimeraSlayer(string file, string temp, string name, string mode, int k, int ms, int mms, int win, float div, 
+                                                        int minsim, int mincov, int minbs, int minsnp, int par, int it, int inc, int numw, bool r) : Chimera()  {      
+       try {
+               fastafile = file; templateSeqs = readSeqs(fastafile);
+               templateFileName = temp; 
+               searchMethod = mode;
+               kmerSize = k;
+               match = ms;
+               misMatch = mms;
+               window = win;
+               divR = div;
+               minSim = minsim;
+               minCov = mincov;
+               minBS = minbs;
+               minSNP = minsnp;
+               parents = par;
+               iters = it;
+               increment = inc;
+               numWanted = numw;
+               realign = r; 
+               
+               //read name file and create nameMapRank
+               readNameFile(name);
+               
+               decalc = new DeCalculator();    
+               
+               createFilter(templateSeqs, 0.0); //just removed columns where all seqs have a gap
+               
+               //run filter on template
+               for (int i = 0; i < templateSeqs.size(); i++) { runFilter(templateSeqs[i]);  }
+               
+       }
+       catch(exception& e) {
+               m->errorOut(e, "ChimeraSlayer", "ChimeraSlayer");
+               exit(1);
+       }
+}
+//***************************************************************************************************************
+int ChimeraSlayer::readNameFile(string name) {
+       try {
+               ifstream in;
+               m->openInputFile(name, in);
+               
+               int maxRank = 0;
+               int minRank = 10000000;
+               
+               while(!in.eof()){
+                       
+                       if (m->control_pressed) { in.close(); return 0; }
+                       
+                       string thisname, repnames;
+                       
+                       in >> thisname;         m->gobble(in);          //read from first column
+                       in >> repnames;                 //read from second column
+                       
+                       map<string, vector<string> >::iterator it = nameMapRank.find(thisname);
+                       if (it == nameMapRank.end()) {
+                               
+                               vector<string> splitRepNames;
+                               m->splitAtComma(repnames, splitRepNames);
+                               
+                               nameMapRank[thisname] = splitRepNames;  
+                               
+                               if (splitRepNames.size() > maxRank) { maxRank = splitRepNames.size(); }
+                               if (splitRepNames.size() < minRank) { minRank = splitRepNames.size(); }
+                               
+                       }else{  m->mothurOut(thisname + " is already in namesfile. I will use first definition."); m->mothurOutEndLine();  }
+                       
+                       m->gobble(in);
+               }
+               in.close();     
+               
+               //sanity check to make sure files match
+               for (int i = 0; i < templateSeqs.size(); i++) {
+                       map<string, vector<string> >::iterator it = nameMapRank.find(templateSeqs[i]->getName());
+                       
+                       if (it == nameMapRank.end()) { m->mothurOut("[ERROR]: " + templateSeqs[i]->getName() + " is not in namesfile, but is in fastafile. Every name in fasta file must be in first column of names file."); m->mothurOutEndLine(); m->control_pressed = true;  }
+               }
+               
+               if (maxRank == minRank) { m->mothurOut("[ERROR]: all sequences in namesfile have the same abundance, aborting."); m->mothurOutEndLine(); m->control_pressed = true;  }
+               
+               return 0;
+               
+       }
+       catch(exception& e) {
+               m->errorOut(e, "ChimeraSlayer", "readNameFile");
+               exit(1);
+       }
+}
+
 //***************************************************************************************************************
 int ChimeraSlayer::doPrep() {
        try {
@@ -178,11 +269,124 @@ int ChimeraSlayer::doPrep() {
                exit(1);
        }
 }
+//***************************************************************************************************************
+vector<Sequence*> ChimeraSlayer::getTemplate(Sequence* q) {
+       try {
+               
+               vector<Sequence*> thisTemplate;
+               
+               int thisRank;
+               string thisName = q->getName();
+               map<string, vector<string> >::iterator itRank = nameMapRank.find(thisName); // you will find it because we already sanity checked
+               thisRank = (itRank->second).size();
+               
+               //create list of names we want to put into the template
+               set<string> namesToAdd;
+               for (itRank = nameMapRank.begin(); itRank != nameMapRank.end(); itRank++) {
+                       if ((itRank->second).size() > thisRank) {
+                               //you are more abundant than me
+                               for (int i = 0; i < (itRank->second).size(); i++) {
+                                       namesToAdd.insert((itRank->second)[i]);
+                               }
+                       }
+               }
+               
+               for (int i = 0; i < templateSeqs.size(); i++) {  
+                       if (namesToAdd.count(templateSeqs[i]->getName()) != 0) { 
+                               thisTemplate.push_back(templateSeqs[i]);
+                       }
+               }
+               
+               string  kmerDBNameLeft;
+               string  kmerDBNameRight;
+               
+               //generate the kmerdb to pass to maligner
+               if (searchMethod == "kmer") { 
+                       string templatePath = m->hasPath(templateFileName);
+                       string rightTemplateFileName = templatePath + "right." + m->getRootName(m->getSimpleName(templateFileName));
+                       databaseRight = new KmerDB(rightTemplateFileName, kmerSize);
+                       
+                       string leftTemplateFileName = templatePath + "left." + m->getRootName(m->getSimpleName(templateFileName));
+                       databaseLeft = new KmerDB(leftTemplateFileName, kmerSize);      
+#ifdef USE_MPI
+                       for (int i = 0; i < thisTemplate.size(); i++) {
+                               
+                               if (m->control_pressed) { return thisTemplate; } 
+                               
+                               string leftFrag = thisTemplate[i]->getUnaligned();
+                               leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33));
+                               
+                               Sequence leftTemp(thisTemplate[i]->getName(), leftFrag);
+                               databaseLeft->addSequence(leftTemp);    
+                       }
+                       databaseLeft->generateDB();
+                       databaseLeft->setNumSeqs(thisTemplate.size());
+                       
+                       for (int i = 0; i < thisTemplate.size(); i++) {
+                               if (m->control_pressed) { return thisTemplate;  } 
+                               
+                               string rightFrag = thisTemplate[i]->getUnaligned();
+                               rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66));
+                               
+                               Sequence rightTemp(thisTemplate[i]->getName(), rightFrag);
+                               databaseRight->addSequence(rightTemp);  
+                       }
+                       databaseRight->generateDB();
+                       databaseRight->setNumSeqs(thisTemplate.size());
+                       
+#else  
+                       
+                       
+                       for (int i = 0; i < thisTemplate.size(); i++) {
+                               
+                               if (m->control_pressed) { return thisTemplate; } 
+                               
+                               string leftFrag = thisTemplate[i]->getUnaligned();
+                               leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33));
+                               
+                               Sequence leftTemp(thisTemplate[i]->getName(), leftFrag);
+                               databaseLeft->addSequence(leftTemp);    
+                       }
+                       databaseLeft->generateDB();
+                       databaseLeft->setNumSeqs(thisTemplate.size());
+                               
+                       for (int i = 0; i < thisTemplate.size(); i++) {
+                               if (m->control_pressed) { return thisTemplate; } 
+                                       
+                               string rightFrag = thisTemplate[i]->getUnaligned();
+                               rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66));
+                                       
+                               Sequence rightTemp(thisTemplate[i]->getName(), rightFrag);
+                               databaseRight->addSequence(rightTemp);  
+                       }
+                       databaseRight->generateDB();
+                       databaseRight->setNumSeqs(thisTemplate.size());
+#endif 
+               }else if (searchMethod == "blast") {
+                       
+                       //generate blastdb
+                       databaseLeft = new BlastDB(-2.0, -1.0, match, misMatch);
+                       for (int i = 0; i < thisTemplate.size(); i++) { if (m->control_pressed) { return thisTemplate; }  databaseLeft->addSequence(*thisTemplate[i]);  }
+                       databaseLeft->generateDB();
+                       databaseLeft->setNumSeqs(thisTemplate.size());
+               }
+               
+               return thisTemplate;
+               
+       }
+       catch(exception& e) {
+               m->errorOut(e, "ChimeraSlayer", "getTemplate");
+               exit(1);
+       }
+}
+
 //***************************************************************************************************************
 ChimeraSlayer::~ChimeraSlayer() {      
        delete decalc;  
-       if (searchMethod == "kmer") {  delete databaseRight;  delete databaseLeft;  }   
-       else if (searchMethod == "blast") {  delete databaseLeft; }
+       if (templateFileName != "self") {
+               if (searchMethod == "kmer") {  delete databaseRight;  delete databaseLeft;  }   
+               else if (searchMethod == "blast") {  delete databaseLeft; }
+       }
 }
 //***************************************************************************************************************
 void ChimeraSlayer::printHeader(ostream& out) {
@@ -296,9 +500,23 @@ int ChimeraSlayer::getChimeras(Sequence* query) {
                
                querySeq = query;
                
+               //you must create a template
+               vector<Sequence*> thisTemplate;
+               if (templateFileName != "self") { thisTemplate = templateSeqs; }
+               else { thisTemplate = getTemplate(query); } //fills thistemplate and creates the databases
+               
+               if (m->control_pressed) {  return 0;  }
+               
+               if (thisTemplate.size() == 0) {  return 0; } //not chimeric
+               
                //referenceSeqs, numWanted, matchScore, misMatchPenalty, divR, minSimilarity
-               Maligner maligner(templateSeqs, numWanted, match, misMatch, divR, minSim, minCov, searchMethod, databaseLeft, databaseRight);
+               Maligner maligner(thisTemplate, numWanted, match, misMatch, divR, minSim, minCov, searchMethod, databaseLeft, databaseRight);
                Slayer slayer(window, increment, minSim, divR, iters, minSNP);
+               
+               if (templateFileName == "self") {
+                       if (searchMethod == "kmer") {  delete databaseRight;  delete databaseLeft;  }   
+                       else if (searchMethod == "blast") {  delete databaseLeft; }
+               }
        
                if (m->control_pressed) {  return 0;  }
                
@@ -308,7 +526,7 @@ int ChimeraSlayer::getChimeras(Sequence* query) {
                        
                //found in testing realigning only made things worse
                if (realign) {
-                       ChimeraReAligner realigner(templateSeqs, match, misMatch);
+                       ChimeraReAligner realigner(thisTemplate, match, misMatch);
                        realigner.reAlign(query, Results);
                }
 
index 6d57474829e51f44a9eca05021c937ba537cb610..b6cae49f0709922ac9067ef1d39effab77d3d5be 100644 (file)
@@ -15,6 +15,8 @@
 #include "maligner.h"
 #include "slayer.h"
 
+
+
 //***********************************************************************/
 //This class was modeled after the chimeraSlayer written by the Broad Institute
 /***********************************************************************/
@@ -24,6 +26,8 @@ class ChimeraSlayer : public Chimera {
        
        public:
                ChimeraSlayer(string, string, string, int, int, int, int, float, int, int, int, int, int, int, int, int, bool);
+               ChimeraSlayer(string, string, string, string, int, int, int, int, float, int, int, int, int, int, int, int, int, bool);
+
                ~ChimeraSlayer();
                
                int getChimeras(Sequence*);
@@ -41,6 +45,7 @@ class ChimeraSlayer : public Chimera {
                map<int, int>  spotMap;
                Database* databaseRight;
                Database* databaseLeft;
+               map<string, vector<string> > nameMapRank;  //sequence name to rank so you can construct a template of the abundant sequences if the user uses itself as template
                
                vector<data_struct>  chimeraResults;
                string chimeraFlags, searchMethod, fastafile;
@@ -50,6 +55,8 @@ class ChimeraSlayer : public Chimera {
        
                void printBlock(data_struct, string, ostream&);
                string getBlock(data_struct, string);
+               int readNameFile(string);
+               vector<Sequence*> getTemplate(Sequence*);
                
 };
 
index cd84bdca5f27726b9547b58c429afe74112fee84..394172c34cc80feaa85f4ced1b6883d63058bb87 100644 (file)
@@ -9,11 +9,12 @@
 
 #include "chimeraslayercommand.h"
 #include "chimeraslayer.h"
+#include "deconvolutecommand.h"
 
 //**********************************************************************************************************************
 vector<string> ChimeraSlayerCommand::getValidParameters(){     
        try {
-               string AlignArray[] =  {"fasta", "processors", "window", "template","numwanted", "ksize", "match","mismatch", 
+               string AlignArray[] =  {"fasta", "processors", "name","window", "template","numwanted", "ksize", "match","mismatch", 
                        "divergence", "minsim","mincov","minbs", "minsnp","parents", "iters","outputdir","inputdir", "search","realign" };
                vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
                return myArray;
@@ -68,7 +69,7 @@ ChimeraSlayerCommand::ChimeraSlayerCommand(string option)  {
                
                else {
                        //valid paramters for this command
-                       string Array[] =  {"fasta", "processors", "window", "template","numwanted", "ksize", "match","mismatch", 
+                       string Array[] =  {"fasta", "processors","name", "window", "template","numwanted", "ksize", "match","mismatch", 
                        "divergence", "minsim","mincov","minbs", "minsnp","parents", "iters","outputdir","inputdir", "search","realign" };
                        vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
                        
@@ -90,21 +91,10 @@ ChimeraSlayerCommand::ChimeraSlayerCommand(string option)  {
                        //if the user changes the input directory command factory will send this info to us in the output parameter 
                        string inputDir = validParameter.validFile(parameters, "inputdir", false);              
                        if (inputDir == "not found"){   inputDir = "";          }
-                       else {
-                               string path;
-                               it = parameters.find("template");
-                               //user has given a template file
-                               if(it != parameters.end()){ 
-                                       path = m->hasPath(it->second);
-                                       //if the user has not given a path then, add inputdir. else leave path alone.
-                                       if (path == "") {       parameters["template"] = inputDir + it->second;         }
-                               }
-                       }
-
-                       
+                                               
                        //check for required parameters
                        fastafile = validParameter.validFile(parameters, "fasta", false);
-                       if (fastafile == "not found") { fastafile = ""; m->mothurOut("fasta is a required parameter for the chimera.slayer command."); m->mothurOutEndLine(); abort = true;  }
+                       if (fastafile == "not found") { fastafile = ""; m->mothurOut("[ERROR]: fasta is a required parameter for the chimera.slayer command."); m->mothurOutEndLine(); abort = true;  }
                        else { 
                                m->splitAtDash(fastafile, fastaFileNames);
                                
@@ -155,16 +145,89 @@ ChimeraSlayerCommand::ChimeraSlayerCommand(string option)  {
                                }
                                
                                //make sure there is at least one valid file left
-                               if (fastaFileNames.size() == 0) { m->mothurOut("no valid files."); m->mothurOutEndLine(); abort = true; }
+                               if (fastaFileNames.size() == 0) { m->mothurOut("[ERROR]: no valid files."); m->mothurOutEndLine(); abort = true; }
+                       }
+                       
+                       
+                       //check for required parameters
+                       bool hasName = true;
+                       namefile = validParameter.validFile(parameters, "name", false);
+                       if (namefile == "not found") { namefile = "";  hasName = false; }
+                       else { 
+                               m->splitAtDash(namefile, nameFileNames);
+                               
+                               //go through files and make sure they are good, if not, then disregard them
+                               for (int i = 0; i < nameFileNames.size(); i++) {
+                                       if (inputDir != "") {
+                                               string path = m->hasPath(nameFileNames[i]);
+                                               //if the user has not given a path then, add inputdir. else leave path alone.
+                                               if (path == "") {       nameFileNames[i] = inputDir + nameFileNames[i];         }
+                                       }
+                                       
+                                       int ableToOpen;
+                                       ifstream in;
+                                       
+                                       ableToOpen = m->openInputFile(nameFileNames[i], in, "noerror");
+                                       
+                                       //if you can't open it, try default location
+                                       if (ableToOpen == 1) {
+                                               if (m->getDefaultPath() != "") { //default path is set
+                                                       string tryPath = m->getDefaultPath() + m->getSimpleName(nameFileNames[i]);
+                                                       m->mothurOut("Unable to open " + nameFileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine();
+                                                       ifstream in2;
+                                                       ableToOpen = m->openInputFile(tryPath, in2, "noerror");
+                                                       in2.close();
+                                                       nameFileNames[i] = tryPath;
+                                               }
+                                       }
+                                       
+                                       if (ableToOpen == 1) {
+                                               if (m->getOutputDir() != "") { //default path is set
+                                                       string tryPath = m->getOutputDir() + m->getSimpleName(nameFileNames[i]);
+                                                       m->mothurOut("Unable to open " + nameFileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine();
+                                                       ifstream in2;
+                                                       ableToOpen = m->openInputFile(tryPath, in2, "noerror");
+                                                       in2.close();
+                                                       nameFileNames[i] = tryPath;
+                                               }
+                                       }
+                                       
+                                       in.close();
+                                       
+                                       if (ableToOpen == 1) { 
+                                               m->mothurOut("Unable to open " + nameFileNames[i] + ". It will be disregarded."); m->mothurOutEndLine(); 
+                                               //erase from file list
+                                               nameFileNames.erase(nameFileNames.begin()+i);
+                                               i--;
+                                       }
+                               }
+                               
+                               //make sure there is at least one valid file left
+                               if (nameFileNames.size() == 0) { m->mothurOut("[ERROR]: no valid name files."); m->mothurOutEndLine(); abort = true; }
                        }
                        
+                       if (hasName && (nameFileNames.size() != fastaFileNames.size())) { m->mothurOut("[ERROR]: The number of namefiles does not match the number of fastafiles, please correct."); m->mothurOutEndLine(); abort=true; }
+                       
                        //if the user changes the output directory command factory will send this info to us in the output parameter 
                        outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = ""; }
-
-                       templatefile = validParameter.validFile(parameters, "template", true);
-                       if (templatefile == "not open") { abort = true; }
-                       else if (templatefile == "not found") { templatefile = "";  m->mothurOut("template is a required parameter for the chimera.slayer command."); m->mothurOutEndLine(); abort = true;  }   
-                                               
+                       
+                       
+                       string path;
+                       it = parameters.find("template");
+                       //user has given a template file
+                       if(it != parameters.end()){ 
+                               if (it->second == "self") { templatefile = "self"; }
+                               else {
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["template"] = inputDir + it->second;         }
+                                       
+                                       templatefile = validParameter.validFile(parameters, "template", true);
+                                       if (templatefile == "not open") { abort = true; }
+                                       else if (templatefile == "not found") { templatefile = "";  m->mothurOut("template is a required parameter for the chimera.slayer command."); m->mothurOutEndLine(); abort = true;  }   
+                               }
+                       }
+                       
                        string temp = validParameter.validFile(parameters, "processors", false);                if (temp == "not found") { temp = "1"; }
                        convert(temp, processors);
                        
@@ -227,10 +290,11 @@ void ChimeraSlayerCommand::help(){
        
                m->mothurOut("The chimera.slayer command reads a fastafile and templatefile and outputs potentially chimeric sequences.\n");
                m->mothurOut("This command was modeled after the chimeraSlayer written by the Broad Institute.\n");
-               m->mothurOut("The chimera.slayer command parameters are fasta, template, processors, ksize, window, match, mismatch, divergence. minsim, mincov, minbs, minsnp, parents, search, iters, increment and numwanted.\n"); //realign,
+               m->mothurOut("The chimera.slayer command parameters are fasta, name, template, processors, ksize, window, match, mismatch, divergence. minsim, mincov, minbs, minsnp, parents, search, iters, increment and numwanted.\n"); //realign,
                m->mothurOut("The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required. \n");
+               m->mothurOut("The name parameter allows you to provide a name file, if you are using template=self. \n");
                m->mothurOut("You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amazon.fasta \n");
-               m->mothurOut("The template parameter allows you to enter a template file containing known non-chimeric sequences, and is required. \n");
+               m->mothurOut("The template parameter allows you to enter a template file containing known non-chimeric sequences, and is required. You may also set template=self, in this case the abundant sequences will be used as potential parents. \n");
                m->mothurOut("The processors parameter allows you to specify how many processors you would like to use.  The default is 1. \n");
                #ifdef USE_MPI
                m->mothurOut("When using MPI, the processors parameter is set to the number of MPI processes running. \n");
@@ -278,8 +342,36 @@ int ChimeraSlayerCommand::execute(){
                
                        int start = time(NULL); 
                        
-                       chimera = new ChimeraSlayer(fastaFileNames[s], templatefile, search, ksize, match, mismatch, window, divR, minSimilarity, minCoverage, minBS, minSNP, parents, iters, increment, numwanted, realign);   
+                       if (templatefile != "self") { //you want to run slayer with a refernce template
+                               chimera = new ChimeraSlayer(fastaFileNames[s], templatefile, search, ksize, match, mismatch, window, divR, minSimilarity, minCoverage, minBS, minSNP, parents, iters, increment, numwanted, realign);   
+                       }else {
+                               if (nameFileNames.size() != 0) { //you provided a namefile and we don't need to create one
+                                       chimera = new ChimeraSlayer(fastaFileNames[s], templatefile, nameFileNames[s], search, ksize, match, mismatch, window, divR, minSimilarity, minCoverage, minBS, minSNP, parents, iters, increment, numwanted, realign); 
+                               }else {
+                                       
+                                       m->mothurOutEndLine(); m->mothurOut("No namesfile given, running unique.seqs command to generate one."); m->mothurOutEndLine(); m->mothurOutEndLine();
+                                       
+                                       //use unique.seqs to create new name and fastafile
+                                       string inputString = "fasta=" + fastaFileNames[s];
+                                       m->mothurOut("/******************************************/"); m->mothurOutEndLine(); 
+                                       m->mothurOut("Running command: unique.seqs(" + inputString + ")"); m->mothurOutEndLine(); 
+                                                                
+                                       Command* uniqueCommand = new DeconvoluteCommand(inputString);
+                                       uniqueCommand->execute();
+                                       
+                                       map<string, vector<string> > filenames = uniqueCommand->getOutputFiles();
+                                       
+                                       delete uniqueCommand;
+                                       
+                                       m->mothurOut("/******************************************/"); m->mothurOutEndLine(); 
+                                       
+                                       string nameFile = filenames["name"][0];
+                                       fastaFileNames[s] = filenames["fasta"][0];
                        
+                                       chimera = new ChimeraSlayer(fastaFileNames[s], templatefile, nameFile, search, ksize, match, mismatch, window, divR, minSimilarity, minCoverage, minBS, minSNP, parents, iters, increment, numwanted, realign); 
+                               }
+                       }
+                               
                        if (outputDir == "") { outputDir = m->hasPath(fastaFileNames[s]);  }//if user entered a file with a path then preserve it                               
                        string outputFileName = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + "slayer.chimeras";
                        string accnosFileName = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s]))  + "slayer.accnos";
index aa9b5e7102a324bce97095c2fce7d78f470ab3fc..9ff0c8ffb0753f799133b3ee54424f0a7aff8345 100644 (file)
@@ -49,7 +49,7 @@ private:
        #endif
 
        bool abort, realign;
-       string fastafile, templatefile, outputDir, search;
+       string fastafile, templatefile, outputDir, search, namefile;
        int processors, window, iters, increment, numwanted, ksize, match, mismatch, parents, minSimilarity, minCoverage, minBS, minSNP, numSeqs, templateSeqsLength;
        float divR;
        Chimera* chimera;
@@ -57,6 +57,7 @@ private:
        vector<string> outputNames;
        map<string, vector<string> > outputTypes;
        vector<string> fastaFileNames;
+       vector<string> nameFileNames;
        
 };
 
index a36f95d1ac310638a31f38e3e172e17ba5aafe09..ccdd7402389d0f761e069ff1d3171c5cdfcb3824 100644 (file)
@@ -9,6 +9,11 @@
 
 #include "corraxescommand.h"
 
+//********************************************************************************************************************
+//sorts highes to lowest
+inline bool compareSpearman(spearmanRank left, spearmanRank right){
+       return (left.score > right.score);      
+} 
 //**********************************************************************************************************************
 vector<string> CorrAxesCommand::getValidParameters(){  
        try {
@@ -262,8 +267,8 @@ int CorrAxesCommand::execute(){
                
                if (method == "pearson")                {  calcPearson(axes, out);      }
                else if (method == "spearman")  {  calcSpearman(axes, out); }
-               //else if (method == "kendall") {  calcKendal(axes, out);       }
-               //else { m->mothurOut("[ERROR]: Invalid method."); m->mothurOutEndLine(); }
+               else if (method == "kendall")   {  calcKendall(axes, out);      }
+               else { m->mothurOut("[ERROR]: Invalid method."); m->mothurOutEndLine(); }
                
                out.close();
                for (int i = 0; i < lookupFloat.size(); i++) {  delete lookupFloat[i];  }
@@ -346,52 +351,106 @@ int CorrAxesCommand::calcPearson(map<string, vector<float> >& axes, ofstream& ou
 int CorrAxesCommand::calcSpearman(map<string, vector<float> >& axes, ofstream& out) {
        try {
                
-               //find average of each axis - X
-               vector<float> averageAxes; averageAxes.resize(numaxes, 0.0);
+               //format data
+               vector< vector<spearmanRank> > scores; scores.resize(numaxes);
                for (map<string, vector<float> >::iterator it = axes.begin(); it != axes.end(); it++) {
                        vector<float> temp = it->second;
                        for (int i = 0; i < temp.size(); i++) {
-                               averageAxes[i] += temp[i];  
+                               spearmanRank member(it->first, temp[i]);
+                               scores[i].push_back(member);  
                        }
                }
                
-               for (int i = 0; i < averageAxes.size(); i++) {  averageAxes[i] = averageAxes[i] / (float) axes.size(); }
+               //sort each axis
+               for (int i = 0; i < numaxes; i++) {  sort(scores[i].begin(), scores[i].end(), compareSpearman); }
+               
+               //find ranks of xi in each axis
+               map<string, vector<float> > rankAxes;
+               for (int i = 0; i < numaxes; i++) {
+                       
+                       vector<spearmanRank> ties;
+                       int rankTotal = 0;
+                       for (int j = 0; j < scores[i].size(); j++) {
+                               rankTotal += j;
+                               ties.push_back(scores[i][j]);
+                               
+                               if (j != scores.size()-1) { // you are not the last so you can look ahead
+                                       if (scores[i][j].score != scores[i][j+1].score) { // you are done with ties, rank them and continue
+                                               for (int k = 0; k < ties.size(); k++) {
+                                                       float thisrank = rankTotal / (float) ties.size();
+                                                       rankAxes[ties[k].name].push_back(thisrank);
+                                               }
+                                               ties.clear();
+                                               rankTotal = 0;
+                                       }
+                               }else { // you are the last one
+                                       for (int k = 0; k < ties.size(); k++) {
+                                               float thisrank = rankTotal / (float) ties.size();
+                                               rankAxes[ties[k].name].push_back(thisrank);
+                                       }
+                               }
+                       }
+               }
+               
+                               
                
                //for each otu
                for (int i = 0; i < lookupFloat[0]->getNumBins(); i++) {
                        
                        out << i+1 << '\t';
                        
-                       //find the averages this otu - Y
-                       float sumOtu = 0.0;
+                       //find the ranks of this otu - Y
+                       vector<spearmanRank> otuScores;
                        for (int j = 0; j < lookupFloat.size(); j++) {
-                               sumOtu += lookupFloat[j]->getAbundance(i);
+                               spearmanRank member(lookupFloat[j]->getGroup(), lookupFloat[j]->getAbundance(i));
+                               otuScores.push_back(member);
                        }
-                       float Ybar = sumOtu / (float) lookupFloat.size();
                        
-                       //find r value for each axis
-                       for (int k = 0; k < averageAxes.size(); k++) {
+                       sort(otuScores.begin(), otuScores.end(), compareSpearman);
+                       
+                       map<string, float> rankOtus;
+                       vector<spearmanRank> ties;
+                       int rankTotal = 0;
+                       for (int j = 0; j < otuScores.size(); j++) {
+                               rankTotal += j;
+                               ties.push_back(otuScores[j]);
                                
-                               double r = 0.0;
-                               double numerator = 0.0;
-                               double denomTerm1 = 0.0;
-                               double denomTerm2 = 0.0;
-                               for (int j = 0; j < lookupFloat.size(); j++) {
-                                       float Yi = lookupFloat[j]->getAbundance(i);
-                                       float Xi = axes[lookupFloat[j]->getGroup()][k];
-                                       
-                                       numerator += ((Xi - averageAxes[k]) * (Yi - Ybar));
-                                       denomTerm1 += ((Xi - averageAxes[k]) * (Xi - averageAxes[k]));
-                                       denomTerm2 += ((Yi - Ybar) * (Yi - Ybar));
+                               if (j != scores.size()-1) { // you are not the last so you can look ahead
+                                       if (otuScores[j].score != otuScores[j+1].score) { // you are done with ties, rank them and continue
+                                               for (int k = 0; k < ties.size(); k++) {
+                                                       float thisrank = rankTotal / (float) ties.size();
+                                                       rankOtus[ties[k].name] = thisrank;
+                                               }
+                                               ties.clear();
+                                               rankTotal = 0;
+                                       }
+                               }else { // you are the last one
+                                       for (int k = 0; k < ties.size(); k++) {
+                                               float thisrank = rankTotal / (float) ties.size();
+                                               rankOtus[ties[k].name] = thisrank;
+                                       }
                                }
+                       }
+                       
+                       //calc spearman ranks for each axis for this otu
+                       for (int j = 0; j < numaxes; j++) {
                                
-                               double denom = (sqrt(denomTerm1 * denomTerm2));
+                               double di = 0.0;
+                               for (int k = 0; k < lookupFloat.size(); k++) {
+                                       
+                                       float xi = rankAxes[lookupFloat[k]->getGroup()][j];
+                                       float yi = rankOtus[lookupFloat[k]->getGroup()];
+                                       
+                                       di += ((xi - yi) * (xi - yi));
+                               }
                                
-                               r = numerator / denom;
+                               int n = lookupFloat.size();
+                               double p = 1.0 - ((6 * di) / (float) (n * ((n*n) - 1)));
                                
-                               out << r << '\t'; 
+                               out << p << '\t';
                        }
                        
+                       
                        out << endl;
                }
                
@@ -403,6 +462,132 @@ int CorrAxesCommand::calcSpearman(map<string, vector<float> >& axes, ofstream& o
        }
 }
 //**********************************************************************************************************************
+int CorrAxesCommand::calcKendall(map<string, vector<float> >& axes, ofstream& out) {
+       try {
+               
+               //format data
+               vector< vector<spearmanRank> > scores; scores.resize(numaxes);
+               for (map<string, vector<float> >::iterator it = axes.begin(); it != axes.end(); it++) {
+                       vector<float> temp = it->second;
+                       for (int i = 0; i < temp.size(); i++) {
+                               spearmanRank member(it->first, temp[i]);
+                               scores[i].push_back(member);  
+                       }
+               }
+               
+               //sort each axis
+               for (int i = 0; i < numaxes; i++) {  sort(scores[i].begin(), scores[i].end(), compareSpearman); }
+               
+               //find ranks of xi in each axis
+               map<string, vector<float> > rankAxes;
+               for (int i = 0; i < numaxes; i++) {
+                       
+                       vector<spearmanRank> ties;
+                       int rankTotal = 0;
+                       for (int j = 0; j < scores[i].size(); j++) {
+                               rankTotal += j;
+                               ties.push_back(scores[i][j]);
+                               
+                               if (j != scores.size()-1) { // you are not the last so you can look ahead
+                                       if (scores[i][j].score != scores[i][j+1].score) { // you are done with ties, rank them and continue
+                                               for (int k = 0; k < ties.size(); k++) {
+                                                       float thisrank = rankTotal / (float) ties.size();
+                                                       rankAxes[ties[k].name].push_back(thisrank);
+                                               }
+                                               ties.clear();
+                                               rankTotal = 0;
+                                       }
+                               }else { // you are the last one
+                                       for (int k = 0; k < ties.size(); k++) {
+                                               float thisrank = rankTotal / (float) ties.size();
+                                               rankAxes[ties[k].name].push_back(thisrank);
+                                       }
+                               }
+                       }
+               }
+               
+               
+               
+               //for each otu
+               for (int i = 0; i < lookupFloat[0]->getNumBins(); i++) {
+                       
+                       out << i+1 << '\t';
+                       
+                       //find the ranks of this otu - Y
+                       vector<spearmanRank> otuScores;
+                       for (int j = 0; j < lookupFloat.size(); j++) {
+                               spearmanRank member(lookupFloat[j]->getGroup(), lookupFloat[j]->getAbundance(i));
+                               otuScores.push_back(member);
+                       }
+                       
+                       sort(otuScores.begin(), otuScores.end(), compareSpearman);
+                       
+                       map<string, float> rankOtus;
+                       vector<spearmanRank> ties;
+                       int rankTotal = 0;
+                       for (int j = 0; j < otuScores.size(); j++) {
+                               rankTotal += j;
+                               ties.push_back(otuScores[j]);
+                               
+                               if (j != scores.size()-1) { // you are not the last so you can look ahead
+                                       if (otuScores[j].score != otuScores[j+1].score) { // you are done with ties, rank them and continue
+                                               for (int k = 0; k < ties.size(); k++) {
+                                                       float thisrank = rankTotal / (float) ties.size();
+                                                       rankOtus[ties[k].name] = thisrank;
+                                               }
+                                               ties.clear();
+                                               rankTotal = 0;
+                                       }
+                               }else { // you are the last one
+                                       for (int k = 0; k < ties.size(); k++) {
+                                               float thisrank = rankTotal / (float) ties.size();
+                                               rankOtus[ties[k].name] = thisrank;
+                                       }
+                               }
+                       }
+                       
+                       //calc kendall coeffient for each axis for this otu
+                       for (int j = 0; j < numaxes; j++) {
+                               
+                               int numConcordant = 0;
+                               int numDiscordant = 0;
+                               
+                               for (int f = 0; f < j; f++) {
+                                       
+                                       for (int k = 0; k < lookupFloat.size(); k++) {
+                                               
+                                               float xi = rankAxes[lookupFloat[k]->getGroup()][j];
+                                               float yi = rankOtus[lookupFloat[k]->getGroup()];
+                                               
+                                               for (int h = 0; h < k; h++) {
+                                                       
+                                                       float xj = rankAxes[lookupFloat[k]->getGroup()][f];
+                                                       float yj = rankOtus[lookupFloat[h]->getGroup()];
+                                                       
+                                                       if ( ((xi > xj) && (yi < yj)) || ((xi < xj) && (yi > yj)) ){  numDiscordant++;  }
+                                                       if ( ((xi > xj) && (yi > yj)) || ((xi < xj) && (yi < yj)) ){  numConcordant++;  }
+                                               }
+                                       }
+                               }
+                               
+                               int n = lookupFloat.size();
+                               double p = (numConcordant - numDiscordant) / (float) (0.5 * n * (n - 1));
+                               
+                               out << p << '\t';
+                       }
+                       
+                       
+                       out << endl;
+               }
+               
+               return 0;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "CorrAxesCommand", "calcKendall");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
 int CorrAxesCommand::getShared(){
        try {
                InputData* input = new InputData(sharedfile, "sharedfile");
index 01c8d2bacd223d7c742a28a78fcfa65f91827192..45a2b785401ead3f70bd87ea43777e8db05c5fbf 100644 (file)
 #include "sharedrabundfloatvector.h"
 #include "inputdata.h"
 
+/***************************************************************/
+struct spearmanRank {
+       string name;
+       float score;
+       
+       spearmanRank(string n, float s) : name(n), score(s) {}
+};
+/***************************************************************/
+
 class CorrAxesCommand : public Command {
 public:
        CorrAxesCommand(string);
@@ -28,6 +37,8 @@ public:
        void help();
        
 private:
+       
+       
        GlobalData* globaldata;
        string axesfile, sharedfile, relabundfile, metadatafile, groups, label, inputFileName, outputDir, method;
        bool abort, pickedGroups;
@@ -46,6 +57,7 @@ private:
        map<string, vector<float> > readAxes();
        int calcPearson(map<string, vector<float> >&, ofstream&);
        int calcSpearman(map<string, vector<float> >&, ofstream&);
+       int calcKendall(map<string, vector<float> >&, ofstream&);
        
 };
 
index 39c4fe001c8b9b19ef31c5a9a7f3ad98213301d6..d40dcc6a30d20080770ac806855295f98ed83df3 100644 (file)
@@ -14,7 +14,7 @@
 //**********************************************************************************************************************
 vector<string> GetSeqsCommand::getValidParameters(){   
        try {
-               string Array[] =  {"fasta","name", "group", "qfile","alignreport", "accnos", "dups", "list","taxonomy","outputdir","inputdir"};
+               string Array[] =  {"fasta","name", "group", "qfile","alignreport", "accnos", "accnos2","dups", "list","taxonomy","outputdir","inputdir"};
                vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
                return myArray;
        }
@@ -36,6 +36,7 @@ GetSeqsCommand::GetSeqsCommand(){
                outputTypes["alignreport"] = tempOutNames;
                outputTypes["list"] = tempOutNames;
                outputTypes["qfile"] = tempOutNames;
+               outputTypes["accnosreport"] = tempOutNames;
        }
        catch(exception& e) {
                m->errorOut(e, "GetSeqsCommand", "GetSeqsCommand");
@@ -75,7 +76,7 @@ GetSeqsCommand::GetSeqsCommand(string option)  {
                
                else {
                        //valid paramters for this command
-                       string Array[] =  {"fasta","name", "group", "alignreport", "qfile", "accnos", "dups", "list","taxonomy","outputdir","inputdir"};
+                       string Array[] =  {"fasta","name", "group", "alignreport", "qfile", "accnos", "accnos2","dups", "list","taxonomy","outputdir","inputdir"};
                        vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
                        
                        OptionParser parser(option);
@@ -98,6 +99,7 @@ GetSeqsCommand::GetSeqsCommand(string option)  {
                        outputTypes["alignreport"] = tempOutNames;
                        outputTypes["list"] = tempOutNames;
                        outputTypes["qfile"] = tempOutNames;
+                       outputTypes["accnosreport"] = tempOutNames;
                        
                        //if the user changes the output directory command factory will send this info to us in the output parameter 
                        outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
@@ -131,6 +133,14 @@ GetSeqsCommand::GetSeqsCommand(string option)  {
                                        if (path == "") {       parameters["accnos"] = inputDir + it->second;           }
                                }
                                
+                               it = parameters.find("accnos2");
+                               //user has given a template file
+                               if(it != parameters.end()){ 
+                                       path = m->hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["accnos2"] = inputDir + it->second;          }
+                               }
+                               
                                it = parameters.find("list");
                                //user has given a template file
                                if(it != parameters.end()){ 
@@ -178,6 +188,9 @@ GetSeqsCommand::GetSeqsCommand(string option)  {
                        if (accnosfile == "not open") { abort = true; }
                        else if (accnosfile == "not found") {  accnosfile = "";  m->mothurOut("You must provide an accnos file."); m->mothurOutEndLine(); abort = true; }       
                        
+                       accnosfile2 = validParameter.validFile(parameters, "accnos2", true);
+                       if (accnosfile2 == "not open") { abort = true; }
+                       
                        fastafile = validParameter.validFile(parameters, "fasta", true);
                        if (fastafile == "not open") { abort = true; }
                        else if (fastafile == "not found") {  fastafile = "";  }        
@@ -210,7 +223,7 @@ GetSeqsCommand::GetSeqsCommand(string option)  {
                        string temp = validParameter.validFile(parameters, "dups", false);      if (temp == "not found") { temp = "false"; usedDups = ""; }
                        dups = m->isTrue(temp);
                        
-                       if ((fastafile == "") && (namefile == "") && (groupfile == "") && (alignfile == "") && (listfile == "") && (taxfile == "") && (qualfile == ""))  { m->mothurOut("You must provide one of the following: fasta, name, group, alignreport, taxonomy, quality or listfile."); m->mothurOutEndLine(); abort = true; }
+                       if ((fastafile == "") && (namefile == "") && (groupfile == "") && (alignfile == "") && (listfile == "") && (taxfile == "") && (qualfile == "") && (accnosfile2 == ""))  { m->mothurOut("You must provide one of the following: fasta, name, group, alignreport, taxonomy, quality or listfile."); m->mothurOutEndLine(); abort = true; }
                
                        if ((usedDups != "") && (namefile == "")) {  m->mothurOut("You may only use dups with the name option."); m->mothurOutEndLine();  abort = true; }                       
 
@@ -253,13 +266,14 @@ int GetSeqsCommand::execute(){
                if (m->control_pressed) { return 0; }
                
                //read through the correct file and output lines you want to keep
-               if (namefile != "")                     {               readName();             }
-               if (fastafile != "")            {               readFasta();    }
-               if (groupfile != "")            {               readGroup();    }
-               if (alignfile != "")            {               readAlign();    }
-               if (listfile != "")                     {               readList();             }
-               if (taxfile != "")                      {               readTax();              }
-               if (qualfile != "")                     {               readQual();             }
+               if (namefile != "")                     {               readName();                     }
+               if (fastafile != "")            {               readFasta();            }
+               if (groupfile != "")            {               readGroup();            }
+               if (alignfile != "")            {               readAlign();            }
+               if (listfile != "")                     {               readList();                     }
+               if (taxfile != "")                      {               readTax();                      }
+               if (qualfile != "")                     {               readQual();                     }
+               if (accnosfile2 != "")          {               compareAccnos();        }
                
                if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) {   remove(outputNames[i].c_str());  } return 0; }
                
@@ -746,6 +760,73 @@ int GetSeqsCommand::readAccnos(){
                exit(1);
        }
 }
+//**********************************************************************************************************************
+
+int GetSeqsCommand::compareAccnos(){
+       try {
+               
+               string thisOutputDir = outputDir;
+               if (outputDir == "") {  thisOutputDir += m->hasPath(accnosfile);  }
+               string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(accnosfile)) + "accnos.report";
+               ofstream out;
+               m->openOutputFile(outputFileName, out);
+               
+               ifstream in;
+               m->openInputFile(accnosfile2, in);
+               string name;
+               
+               set<string> namesAccnos2;
+               set<string> namesDups;
+               set<string> namesAccnos = names;
+               
+               while(!in.eof()){
+                       in >> name;
+                       
+                       if (namesAccnos.count(name) == 0){ //name unique to accnos2
+                               namesAccnos2.insert(name);
+                       }else { //you are in both so erase
+                               namesAccnos.erase(name);
+                               namesDups.insert(name);
+                       }
+                       
+                       m->gobble(in);
+               }
+               in.close();     
+               
+               out << "Names in both files : " + toString(namesDups.size()) << endl;
+               m->mothurOut("Names in both files : " + toString(namesDups.size())); m->mothurOutEndLine();
+               
+               for (set<string>::iterator it = namesDups.begin(); it != namesDups.end(); it++) {
+                       out << (*it) << endl;
+               }
+               
+               out << "Names unique to " + accnosfile + " : " + toString(namesAccnos.size()) << endl;
+               m->mothurOut("Names unique to " + accnosfile + " : " + toString(namesAccnos.size())); m->mothurOutEndLine();
+               
+               for (set<string>::iterator it = namesAccnos.begin(); it != namesAccnos.end(); it++) {
+                       out << (*it) << endl;
+               }
+               
+               out << "Names unique to " + accnosfile2 + " : " + toString(namesAccnos2.size()) << endl;
+               m->mothurOut("Names unique to " + accnosfile2 + " : " + toString(namesAccnos2.size())); m->mothurOutEndLine();
+               
+               for (set<string>::iterator it = namesAccnos2.begin(); it != namesAccnos2.end(); it++) {
+                       out << (*it) << endl;
+               }
+
+               out.close(); 
+               
+               outputNames.push_back(outputFileName);  outputTypes["accnosreport"].push_back(outputFileName);
+               
+               return 0;
+               
+       }
+       catch(exception& e) {
+               m->errorOut(e, "GetSeqsCommand", "readAccnos");
+               exit(1);
+       }
+}
+
 
 //**********************************************************************************************************************
 
index b249072e41aba8b7ff0501c1fc946b90c48dfd22..aef789aa7dd80a656a66c3061a2e8cadd1fe081b 100644 (file)
@@ -29,7 +29,7 @@ class GetSeqsCommand : public Command {
        private:
                set<string> names;
                vector<string> outputNames;
-               string accnosfile, fastafile, namefile, groupfile, alignfile, listfile, taxfile, qualfile, outputDir;
+               string accnosfile, accnosfile2, fastafile, namefile, groupfile, alignfile, listfile, taxfile, qualfile, outputDir;
                bool abort, dups;
                map<string, vector<string> > outputTypes;
                
@@ -41,6 +41,7 @@ class GetSeqsCommand : public Command {
                int readList();
                int readTax();
                int readQual();
+               int compareAccnos();
                
 };
 
diff --git a/mothur b/mothur
index 90a4275202b2cc20ab12e3ee783fc7f91e9322bb..e36592e89bc100dea7c6fa4d156974d27838c63b 100755 (executable)
Binary files a/mothur and b/mothur differ
index 42c26d59ab4df8cd9e0d75937a476617135adbd7..872a0f091d49fddac359d528c53c799a50ef3c66 100644 (file)
@@ -139,7 +139,7 @@ Sequence::Sequence(ifstream& fastaFile){
                m = MothurOut::getInstance();
                initialize();
                fastaFile >> name;
-
+               
                if (name.length() != 0) { 
                
                        name = name.substr(1);