From: westcott Date: Mon, 3 Jan 2011 19:16:52 +0000 (+0000) Subject: added template=self capability to chimers.slayer, worked on corr.axes and added accno... X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=de67504f85e091a3049ef4c5df8e77f7dcb1d814 added template=self capability to chimers.slayer, worked on corr.axes and added accnos compare function to get.seqs --- diff --git a/chimera.h b/chimera.h index 24ab3f7..6ac6191 100644 --- a/chimera.h +++ b/chimera.h @@ -83,6 +83,7 @@ struct linePair { linePair(){} }; + /***********************************************************************/ class Chimera { diff --git a/chimeraslayer.cpp b/chimeraslayer.cpp index 1336876..ee449cc 100644 --- a/chimeraslayer.cpp +++ b/chimeraslayer.cpp @@ -43,6 +43,97 @@ int minsim, int mincov, int minbs, int minsnp, int par, int it, int inc, int num exit(1); } } +//*************************************************************************************************************** +ChimeraSlayer::ChimeraSlayer(string file, string temp, string name, string mode, int k, int ms, int mms, int win, float div, + int minsim, int mincov, int minbs, int minsnp, int par, int it, int inc, int numw, bool r) : Chimera() { + try { + fastafile = file; templateSeqs = readSeqs(fastafile); + templateFileName = temp; + searchMethod = mode; + kmerSize = k; + match = ms; + misMatch = mms; + window = win; + divR = div; + minSim = minsim; + minCov = mincov; + minBS = minbs; + minSNP = minsnp; + parents = par; + iters = it; + increment = inc; + numWanted = numw; + realign = r; + + //read name file and create nameMapRank + readNameFile(name); + + decalc = new DeCalculator(); + + createFilter(templateSeqs, 0.0); //just removed columns where all seqs have a gap + + //run filter on template + for (int i = 0; i < templateSeqs.size(); i++) { runFilter(templateSeqs[i]); } + + } + catch(exception& e) { + m->errorOut(e, "ChimeraSlayer", "ChimeraSlayer"); + exit(1); + } +} +//*************************************************************************************************************** +int ChimeraSlayer::readNameFile(string name) { + try { + ifstream in; + m->openInputFile(name, in); + + int maxRank = 0; + int minRank = 10000000; + + while(!in.eof()){ + + if (m->control_pressed) { in.close(); return 0; } + + string thisname, repnames; + + in >> thisname; m->gobble(in); //read from first column + in >> repnames; //read from second column + + map >::iterator it = nameMapRank.find(thisname); + if (it == nameMapRank.end()) { + + vector splitRepNames; + m->splitAtComma(repnames, splitRepNames); + + nameMapRank[thisname] = splitRepNames; + + if (splitRepNames.size() > maxRank) { maxRank = splitRepNames.size(); } + if (splitRepNames.size() < minRank) { minRank = splitRepNames.size(); } + + }else{ m->mothurOut(thisname + " is already in namesfile. I will use first definition."); m->mothurOutEndLine(); } + + m->gobble(in); + } + in.close(); + + //sanity check to make sure files match + for (int i = 0; i < templateSeqs.size(); i++) { + map >::iterator it = nameMapRank.find(templateSeqs[i]->getName()); + + if (it == nameMapRank.end()) { m->mothurOut("[ERROR]: " + templateSeqs[i]->getName() + " is not in namesfile, but is in fastafile. Every name in fasta file must be in first column of names file."); m->mothurOutEndLine(); m->control_pressed = true; } + } + + if (maxRank == minRank) { m->mothurOut("[ERROR]: all sequences in namesfile have the same abundance, aborting."); m->mothurOutEndLine(); m->control_pressed = true; } + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "ChimeraSlayer", "readNameFile"); + exit(1); + } +} + //*************************************************************************************************************** int ChimeraSlayer::doPrep() { try { @@ -178,11 +269,124 @@ int ChimeraSlayer::doPrep() { exit(1); } } +//*************************************************************************************************************** +vector ChimeraSlayer::getTemplate(Sequence* q) { + try { + + vector thisTemplate; + + int thisRank; + string thisName = q->getName(); + map >::iterator itRank = nameMapRank.find(thisName); // you will find it because we already sanity checked + thisRank = (itRank->second).size(); + + //create list of names we want to put into the template + set namesToAdd; + for (itRank = nameMapRank.begin(); itRank != nameMapRank.end(); itRank++) { + if ((itRank->second).size() > thisRank) { + //you are more abundant than me + for (int i = 0; i < (itRank->second).size(); i++) { + namesToAdd.insert((itRank->second)[i]); + } + } + } + + for (int i = 0; i < templateSeqs.size(); i++) { + if (namesToAdd.count(templateSeqs[i]->getName()) != 0) { + thisTemplate.push_back(templateSeqs[i]); + } + } + + string kmerDBNameLeft; + string kmerDBNameRight; + + //generate the kmerdb to pass to maligner + if (searchMethod == "kmer") { + string templatePath = m->hasPath(templateFileName); + string rightTemplateFileName = templatePath + "right." + m->getRootName(m->getSimpleName(templateFileName)); + databaseRight = new KmerDB(rightTemplateFileName, kmerSize); + + string leftTemplateFileName = templatePath + "left." + m->getRootName(m->getSimpleName(templateFileName)); + databaseLeft = new KmerDB(leftTemplateFileName, kmerSize); +#ifdef USE_MPI + for (int i = 0; i < thisTemplate.size(); i++) { + + if (m->control_pressed) { return thisTemplate; } + + string leftFrag = thisTemplate[i]->getUnaligned(); + leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33)); + + Sequence leftTemp(thisTemplate[i]->getName(), leftFrag); + databaseLeft->addSequence(leftTemp); + } + databaseLeft->generateDB(); + databaseLeft->setNumSeqs(thisTemplate.size()); + + for (int i = 0; i < thisTemplate.size(); i++) { + if (m->control_pressed) { return thisTemplate; } + + string rightFrag = thisTemplate[i]->getUnaligned(); + rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66)); + + Sequence rightTemp(thisTemplate[i]->getName(), rightFrag); + databaseRight->addSequence(rightTemp); + } + databaseRight->generateDB(); + databaseRight->setNumSeqs(thisTemplate.size()); + +#else + + + for (int i = 0; i < thisTemplate.size(); i++) { + + if (m->control_pressed) { return thisTemplate; } + + string leftFrag = thisTemplate[i]->getUnaligned(); + leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33)); + + Sequence leftTemp(thisTemplate[i]->getName(), leftFrag); + databaseLeft->addSequence(leftTemp); + } + databaseLeft->generateDB(); + databaseLeft->setNumSeqs(thisTemplate.size()); + + for (int i = 0; i < thisTemplate.size(); i++) { + if (m->control_pressed) { return thisTemplate; } + + string rightFrag = thisTemplate[i]->getUnaligned(); + rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66)); + + Sequence rightTemp(thisTemplate[i]->getName(), rightFrag); + databaseRight->addSequence(rightTemp); + } + databaseRight->generateDB(); + databaseRight->setNumSeqs(thisTemplate.size()); +#endif + }else if (searchMethod == "blast") { + + //generate blastdb + databaseLeft = new BlastDB(-2.0, -1.0, match, misMatch); + for (int i = 0; i < thisTemplate.size(); i++) { if (m->control_pressed) { return thisTemplate; } databaseLeft->addSequence(*thisTemplate[i]); } + databaseLeft->generateDB(); + databaseLeft->setNumSeqs(thisTemplate.size()); + } + + return thisTemplate; + + } + catch(exception& e) { + m->errorOut(e, "ChimeraSlayer", "getTemplate"); + exit(1); + } +} + //*************************************************************************************************************** ChimeraSlayer::~ChimeraSlayer() { delete decalc; - if (searchMethod == "kmer") { delete databaseRight; delete databaseLeft; } - else if (searchMethod == "blast") { delete databaseLeft; } + if (templateFileName != "self") { + if (searchMethod == "kmer") { delete databaseRight; delete databaseLeft; } + else if (searchMethod == "blast") { delete databaseLeft; } + } } //*************************************************************************************************************** void ChimeraSlayer::printHeader(ostream& out) { @@ -296,9 +500,23 @@ int ChimeraSlayer::getChimeras(Sequence* query) { querySeq = query; + //you must create a template + vector thisTemplate; + if (templateFileName != "self") { thisTemplate = templateSeqs; } + else { thisTemplate = getTemplate(query); } //fills thistemplate and creates the databases + + if (m->control_pressed) { return 0; } + + if (thisTemplate.size() == 0) { return 0; } //not chimeric + //referenceSeqs, numWanted, matchScore, misMatchPenalty, divR, minSimilarity - Maligner maligner(templateSeqs, numWanted, match, misMatch, divR, minSim, minCov, searchMethod, databaseLeft, databaseRight); + Maligner maligner(thisTemplate, numWanted, match, misMatch, divR, minSim, minCov, searchMethod, databaseLeft, databaseRight); Slayer slayer(window, increment, minSim, divR, iters, minSNP); + + if (templateFileName == "self") { + if (searchMethod == "kmer") { delete databaseRight; delete databaseLeft; } + else if (searchMethod == "blast") { delete databaseLeft; } + } if (m->control_pressed) { return 0; } @@ -308,7 +526,7 @@ int ChimeraSlayer::getChimeras(Sequence* query) { //found in testing realigning only made things worse if (realign) { - ChimeraReAligner realigner(templateSeqs, match, misMatch); + ChimeraReAligner realigner(thisTemplate, match, misMatch); realigner.reAlign(query, Results); } diff --git a/chimeraslayer.h b/chimeraslayer.h index 6d57474..b6cae49 100644 --- a/chimeraslayer.h +++ b/chimeraslayer.h @@ -15,6 +15,8 @@ #include "maligner.h" #include "slayer.h" + + //***********************************************************************/ //This class was modeled after the chimeraSlayer written by the Broad Institute /***********************************************************************/ @@ -24,6 +26,8 @@ class ChimeraSlayer : public Chimera { public: ChimeraSlayer(string, string, string, int, int, int, int, float, int, int, int, int, int, int, int, int, bool); + ChimeraSlayer(string, string, string, string, int, int, int, int, float, int, int, int, int, int, int, int, int, bool); + ~ChimeraSlayer(); int getChimeras(Sequence*); @@ -41,6 +45,7 @@ class ChimeraSlayer : public Chimera { map spotMap; Database* databaseRight; Database* databaseLeft; + map > nameMapRank; //sequence name to rank so you can construct a template of the abundant sequences if the user uses itself as template vector chimeraResults; string chimeraFlags, searchMethod, fastafile; @@ -50,6 +55,8 @@ class ChimeraSlayer : public Chimera { void printBlock(data_struct, string, ostream&); string getBlock(data_struct, string); + int readNameFile(string); + vector getTemplate(Sequence*); }; diff --git a/chimeraslayercommand.cpp b/chimeraslayercommand.cpp index cd84bdc..394172c 100644 --- a/chimeraslayercommand.cpp +++ b/chimeraslayercommand.cpp @@ -9,11 +9,12 @@ #include "chimeraslayercommand.h" #include "chimeraslayer.h" +#include "deconvolutecommand.h" //********************************************************************************************************************** vector ChimeraSlayerCommand::getValidParameters(){ try { - string AlignArray[] = {"fasta", "processors", "window", "template","numwanted", "ksize", "match","mismatch", + string AlignArray[] = {"fasta", "processors", "name","window", "template","numwanted", "ksize", "match","mismatch", "divergence", "minsim","mincov","minbs", "minsnp","parents", "iters","outputdir","inputdir", "search","realign" }; vector myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string))); return myArray; @@ -68,7 +69,7 @@ ChimeraSlayerCommand::ChimeraSlayerCommand(string option) { else { //valid paramters for this command - string Array[] = {"fasta", "processors", "window", "template","numwanted", "ksize", "match","mismatch", + string Array[] = {"fasta", "processors","name", "window", "template","numwanted", "ksize", "match","mismatch", "divergence", "minsim","mincov","minbs", "minsnp","parents", "iters","outputdir","inputdir", "search","realign" }; vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); @@ -90,21 +91,10 @@ ChimeraSlayerCommand::ChimeraSlayerCommand(string option) { //if the user changes the input directory command factory will send this info to us in the output parameter string inputDir = validParameter.validFile(parameters, "inputdir", false); if (inputDir == "not found"){ inputDir = ""; } - else { - string path; - it = parameters.find("template"); - //user has given a template file - if(it != parameters.end()){ - path = m->hasPath(it->second); - //if the user has not given a path then, add inputdir. else leave path alone. - if (path == "") { parameters["template"] = inputDir + it->second; } - } - } - - + //check for required parameters fastafile = validParameter.validFile(parameters, "fasta", false); - if (fastafile == "not found") { fastafile = ""; m->mothurOut("fasta is a required parameter for the chimera.slayer command."); m->mothurOutEndLine(); abort = true; } + if (fastafile == "not found") { fastafile = ""; m->mothurOut("[ERROR]: fasta is a required parameter for the chimera.slayer command."); m->mothurOutEndLine(); abort = true; } else { m->splitAtDash(fastafile, fastaFileNames); @@ -155,16 +145,89 @@ ChimeraSlayerCommand::ChimeraSlayerCommand(string option) { } //make sure there is at least one valid file left - if (fastaFileNames.size() == 0) { m->mothurOut("no valid files."); m->mothurOutEndLine(); abort = true; } + if (fastaFileNames.size() == 0) { m->mothurOut("[ERROR]: no valid files."); m->mothurOutEndLine(); abort = true; } + } + + + //check for required parameters + bool hasName = true; + namefile = validParameter.validFile(parameters, "name", false); + if (namefile == "not found") { namefile = ""; hasName = false; } + else { + m->splitAtDash(namefile, nameFileNames); + + //go through files and make sure they are good, if not, then disregard them + for (int i = 0; i < nameFileNames.size(); i++) { + if (inputDir != "") { + string path = m->hasPath(nameFileNames[i]); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { nameFileNames[i] = inputDir + nameFileNames[i]; } + } + + int ableToOpen; + ifstream in; + + ableToOpen = m->openInputFile(nameFileNames[i], in, "noerror"); + + //if you can't open it, try default location + if (ableToOpen == 1) { + if (m->getDefaultPath() != "") { //default path is set + string tryPath = m->getDefaultPath() + m->getSimpleName(nameFileNames[i]); + m->mothurOut("Unable to open " + nameFileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine(); + ifstream in2; + ableToOpen = m->openInputFile(tryPath, in2, "noerror"); + in2.close(); + nameFileNames[i] = tryPath; + } + } + + if (ableToOpen == 1) { + if (m->getOutputDir() != "") { //default path is set + string tryPath = m->getOutputDir() + m->getSimpleName(nameFileNames[i]); + m->mothurOut("Unable to open " + nameFileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine(); + ifstream in2; + ableToOpen = m->openInputFile(tryPath, in2, "noerror"); + in2.close(); + nameFileNames[i] = tryPath; + } + } + + in.close(); + + if (ableToOpen == 1) { + m->mothurOut("Unable to open " + nameFileNames[i] + ". It will be disregarded."); m->mothurOutEndLine(); + //erase from file list + nameFileNames.erase(nameFileNames.begin()+i); + i--; + } + } + + //make sure there is at least one valid file left + if (nameFileNames.size() == 0) { m->mothurOut("[ERROR]: no valid name files."); m->mothurOutEndLine(); abort = true; } } + if (hasName && (nameFileNames.size() != fastaFileNames.size())) { m->mothurOut("[ERROR]: The number of namefiles does not match the number of fastafiles, please correct."); m->mothurOutEndLine(); abort=true; } + //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } - - templatefile = validParameter.validFile(parameters, "template", true); - if (templatefile == "not open") { abort = true; } - else if (templatefile == "not found") { templatefile = ""; m->mothurOut("template is a required parameter for the chimera.slayer command."); m->mothurOutEndLine(); abort = true; } - + + + string path; + it = parameters.find("template"); + //user has given a template file + if(it != parameters.end()){ + if (it->second == "self") { templatefile = "self"; } + else { + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["template"] = inputDir + it->second; } + + templatefile = validParameter.validFile(parameters, "template", true); + if (templatefile == "not open") { abort = true; } + else if (templatefile == "not found") { templatefile = ""; m->mothurOut("template is a required parameter for the chimera.slayer command."); m->mothurOutEndLine(); abort = true; } + } + } + string temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found") { temp = "1"; } convert(temp, processors); @@ -227,10 +290,11 @@ void ChimeraSlayerCommand::help(){ m->mothurOut("The chimera.slayer command reads a fastafile and templatefile and outputs potentially chimeric sequences.\n"); m->mothurOut("This command was modeled after the chimeraSlayer written by the Broad Institute.\n"); - m->mothurOut("The chimera.slayer command parameters are fasta, template, processors, ksize, window, match, mismatch, divergence. minsim, mincov, minbs, minsnp, parents, search, iters, increment and numwanted.\n"); //realign, + m->mothurOut("The chimera.slayer command parameters are fasta, name, template, processors, ksize, window, match, mismatch, divergence. minsim, mincov, minbs, minsnp, parents, search, iters, increment and numwanted.\n"); //realign, m->mothurOut("The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required. \n"); + m->mothurOut("The name parameter allows you to provide a name file, if you are using template=self. \n"); m->mothurOut("You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amazon.fasta \n"); - m->mothurOut("The template parameter allows you to enter a template file containing known non-chimeric sequences, and is required. \n"); + m->mothurOut("The template parameter allows you to enter a template file containing known non-chimeric sequences, and is required. You may also set template=self, in this case the abundant sequences will be used as potential parents. \n"); m->mothurOut("The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n"); #ifdef USE_MPI m->mothurOut("When using MPI, the processors parameter is set to the number of MPI processes running. \n"); @@ -278,8 +342,36 @@ int ChimeraSlayerCommand::execute(){ int start = time(NULL); - chimera = new ChimeraSlayer(fastaFileNames[s], templatefile, search, ksize, match, mismatch, window, divR, minSimilarity, minCoverage, minBS, minSNP, parents, iters, increment, numwanted, realign); + if (templatefile != "self") { //you want to run slayer with a refernce template + chimera = new ChimeraSlayer(fastaFileNames[s], templatefile, search, ksize, match, mismatch, window, divR, minSimilarity, minCoverage, minBS, minSNP, parents, iters, increment, numwanted, realign); + }else { + if (nameFileNames.size() != 0) { //you provided a namefile and we don't need to create one + chimera = new ChimeraSlayer(fastaFileNames[s], templatefile, nameFileNames[s], search, ksize, match, mismatch, window, divR, minSimilarity, minCoverage, minBS, minSNP, parents, iters, increment, numwanted, realign); + }else { + + m->mothurOutEndLine(); m->mothurOut("No namesfile given, running unique.seqs command to generate one."); m->mothurOutEndLine(); m->mothurOutEndLine(); + + //use unique.seqs to create new name and fastafile + string inputString = "fasta=" + fastaFileNames[s]; + m->mothurOut("/******************************************/"); m->mothurOutEndLine(); + m->mothurOut("Running command: unique.seqs(" + inputString + ")"); m->mothurOutEndLine(); + + Command* uniqueCommand = new DeconvoluteCommand(inputString); + uniqueCommand->execute(); + + map > filenames = uniqueCommand->getOutputFiles(); + + delete uniqueCommand; + + m->mothurOut("/******************************************/"); m->mothurOutEndLine(); + + string nameFile = filenames["name"][0]; + fastaFileNames[s] = filenames["fasta"][0]; + chimera = new ChimeraSlayer(fastaFileNames[s], templatefile, nameFile, search, ksize, match, mismatch, window, divR, minSimilarity, minCoverage, minBS, minSNP, parents, iters, increment, numwanted, realign); + } + } + if (outputDir == "") { outputDir = m->hasPath(fastaFileNames[s]); }//if user entered a file with a path then preserve it string outputFileName = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + "slayer.chimeras"; string accnosFileName = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + "slayer.accnos"; diff --git a/chimeraslayercommand.h b/chimeraslayercommand.h index aa9b5e7..9ff0c8f 100644 --- a/chimeraslayercommand.h +++ b/chimeraslayercommand.h @@ -49,7 +49,7 @@ private: #endif bool abort, realign; - string fastafile, templatefile, outputDir, search; + string fastafile, templatefile, outputDir, search, namefile; int processors, window, iters, increment, numwanted, ksize, match, mismatch, parents, minSimilarity, minCoverage, minBS, minSNP, numSeqs, templateSeqsLength; float divR; Chimera* chimera; @@ -57,6 +57,7 @@ private: vector outputNames; map > outputTypes; vector fastaFileNames; + vector nameFileNames; }; diff --git a/corraxescommand.cpp b/corraxescommand.cpp index a36f95d..ccdd740 100644 --- a/corraxescommand.cpp +++ b/corraxescommand.cpp @@ -9,6 +9,11 @@ #include "corraxescommand.h" +//******************************************************************************************************************** +//sorts highes to lowest +inline bool compareSpearman(spearmanRank left, spearmanRank right){ + return (left.score > right.score); +} //********************************************************************************************************************** vector CorrAxesCommand::getValidParameters(){ try { @@ -262,8 +267,8 @@ int CorrAxesCommand::execute(){ if (method == "pearson") { calcPearson(axes, out); } else if (method == "spearman") { calcSpearman(axes, out); } - //else if (method == "kendall") { calcKendal(axes, out); } - //else { m->mothurOut("[ERROR]: Invalid method."); m->mothurOutEndLine(); } + else if (method == "kendall") { calcKendall(axes, out); } + else { m->mothurOut("[ERROR]: Invalid method."); m->mothurOutEndLine(); } out.close(); for (int i = 0; i < lookupFloat.size(); i++) { delete lookupFloat[i]; } @@ -346,52 +351,106 @@ int CorrAxesCommand::calcPearson(map >& axes, ofstream& ou int CorrAxesCommand::calcSpearman(map >& axes, ofstream& out) { try { - //find average of each axis - X - vector averageAxes; averageAxes.resize(numaxes, 0.0); + //format data + vector< vector > scores; scores.resize(numaxes); for (map >::iterator it = axes.begin(); it != axes.end(); it++) { vector temp = it->second; for (int i = 0; i < temp.size(); i++) { - averageAxes[i] += temp[i]; + spearmanRank member(it->first, temp[i]); + scores[i].push_back(member); } } - for (int i = 0; i < averageAxes.size(); i++) { averageAxes[i] = averageAxes[i] / (float) axes.size(); } + //sort each axis + for (int i = 0; i < numaxes; i++) { sort(scores[i].begin(), scores[i].end(), compareSpearman); } + + //find ranks of xi in each axis + map > rankAxes; + for (int i = 0; i < numaxes; i++) { + + vector ties; + int rankTotal = 0; + for (int j = 0; j < scores[i].size(); j++) { + rankTotal += j; + ties.push_back(scores[i][j]); + + if (j != scores.size()-1) { // you are not the last so you can look ahead + if (scores[i][j].score != scores[i][j+1].score) { // you are done with ties, rank them and continue + for (int k = 0; k < ties.size(); k++) { + float thisrank = rankTotal / (float) ties.size(); + rankAxes[ties[k].name].push_back(thisrank); + } + ties.clear(); + rankTotal = 0; + } + }else { // you are the last one + for (int k = 0; k < ties.size(); k++) { + float thisrank = rankTotal / (float) ties.size(); + rankAxes[ties[k].name].push_back(thisrank); + } + } + } + } + + //for each otu for (int i = 0; i < lookupFloat[0]->getNumBins(); i++) { out << i+1 << '\t'; - //find the averages this otu - Y - float sumOtu = 0.0; + //find the ranks of this otu - Y + vector otuScores; for (int j = 0; j < lookupFloat.size(); j++) { - sumOtu += lookupFloat[j]->getAbundance(i); + spearmanRank member(lookupFloat[j]->getGroup(), lookupFloat[j]->getAbundance(i)); + otuScores.push_back(member); } - float Ybar = sumOtu / (float) lookupFloat.size(); - //find r value for each axis - for (int k = 0; k < averageAxes.size(); k++) { + sort(otuScores.begin(), otuScores.end(), compareSpearman); + + map rankOtus; + vector ties; + int rankTotal = 0; + for (int j = 0; j < otuScores.size(); j++) { + rankTotal += j; + ties.push_back(otuScores[j]); - double r = 0.0; - double numerator = 0.0; - double denomTerm1 = 0.0; - double denomTerm2 = 0.0; - for (int j = 0; j < lookupFloat.size(); j++) { - float Yi = lookupFloat[j]->getAbundance(i); - float Xi = axes[lookupFloat[j]->getGroup()][k]; - - numerator += ((Xi - averageAxes[k]) * (Yi - Ybar)); - denomTerm1 += ((Xi - averageAxes[k]) * (Xi - averageAxes[k])); - denomTerm2 += ((Yi - Ybar) * (Yi - Ybar)); + if (j != scores.size()-1) { // you are not the last so you can look ahead + if (otuScores[j].score != otuScores[j+1].score) { // you are done with ties, rank them and continue + for (int k = 0; k < ties.size(); k++) { + float thisrank = rankTotal / (float) ties.size(); + rankOtus[ties[k].name] = thisrank; + } + ties.clear(); + rankTotal = 0; + } + }else { // you are the last one + for (int k = 0; k < ties.size(); k++) { + float thisrank = rankTotal / (float) ties.size(); + rankOtus[ties[k].name] = thisrank; + } } + } + + //calc spearman ranks for each axis for this otu + for (int j = 0; j < numaxes; j++) { - double denom = (sqrt(denomTerm1 * denomTerm2)); + double di = 0.0; + for (int k = 0; k < lookupFloat.size(); k++) { + + float xi = rankAxes[lookupFloat[k]->getGroup()][j]; + float yi = rankOtus[lookupFloat[k]->getGroup()]; + + di += ((xi - yi) * (xi - yi)); + } - r = numerator / denom; + int n = lookupFloat.size(); + double p = 1.0 - ((6 * di) / (float) (n * ((n*n) - 1))); - out << r << '\t'; + out << p << '\t'; } + out << endl; } @@ -403,6 +462,132 @@ int CorrAxesCommand::calcSpearman(map >& axes, ofstream& o } } //********************************************************************************************************************** +int CorrAxesCommand::calcKendall(map >& axes, ofstream& out) { + try { + + //format data + vector< vector > scores; scores.resize(numaxes); + for (map >::iterator it = axes.begin(); it != axes.end(); it++) { + vector temp = it->second; + for (int i = 0; i < temp.size(); i++) { + spearmanRank member(it->first, temp[i]); + scores[i].push_back(member); + } + } + + //sort each axis + for (int i = 0; i < numaxes; i++) { sort(scores[i].begin(), scores[i].end(), compareSpearman); } + + //find ranks of xi in each axis + map > rankAxes; + for (int i = 0; i < numaxes; i++) { + + vector ties; + int rankTotal = 0; + for (int j = 0; j < scores[i].size(); j++) { + rankTotal += j; + ties.push_back(scores[i][j]); + + if (j != scores.size()-1) { // you are not the last so you can look ahead + if (scores[i][j].score != scores[i][j+1].score) { // you are done with ties, rank them and continue + for (int k = 0; k < ties.size(); k++) { + float thisrank = rankTotal / (float) ties.size(); + rankAxes[ties[k].name].push_back(thisrank); + } + ties.clear(); + rankTotal = 0; + } + }else { // you are the last one + for (int k = 0; k < ties.size(); k++) { + float thisrank = rankTotal / (float) ties.size(); + rankAxes[ties[k].name].push_back(thisrank); + } + } + } + } + + + + //for each otu + for (int i = 0; i < lookupFloat[0]->getNumBins(); i++) { + + out << i+1 << '\t'; + + //find the ranks of this otu - Y + vector otuScores; + for (int j = 0; j < lookupFloat.size(); j++) { + spearmanRank member(lookupFloat[j]->getGroup(), lookupFloat[j]->getAbundance(i)); + otuScores.push_back(member); + } + + sort(otuScores.begin(), otuScores.end(), compareSpearman); + + map rankOtus; + vector ties; + int rankTotal = 0; + for (int j = 0; j < otuScores.size(); j++) { + rankTotal += j; + ties.push_back(otuScores[j]); + + if (j != scores.size()-1) { // you are not the last so you can look ahead + if (otuScores[j].score != otuScores[j+1].score) { // you are done with ties, rank them and continue + for (int k = 0; k < ties.size(); k++) { + float thisrank = rankTotal / (float) ties.size(); + rankOtus[ties[k].name] = thisrank; + } + ties.clear(); + rankTotal = 0; + } + }else { // you are the last one + for (int k = 0; k < ties.size(); k++) { + float thisrank = rankTotal / (float) ties.size(); + rankOtus[ties[k].name] = thisrank; + } + } + } + + //calc kendall coeffient for each axis for this otu + for (int j = 0; j < numaxes; j++) { + + int numConcordant = 0; + int numDiscordant = 0; + + for (int f = 0; f < j; f++) { + + for (int k = 0; k < lookupFloat.size(); k++) { + + float xi = rankAxes[lookupFloat[k]->getGroup()][j]; + float yi = rankOtus[lookupFloat[k]->getGroup()]; + + for (int h = 0; h < k; h++) { + + float xj = rankAxes[lookupFloat[k]->getGroup()][f]; + float yj = rankOtus[lookupFloat[h]->getGroup()]; + + if ( ((xi > xj) && (yi < yj)) || ((xi < xj) && (yi > yj)) ){ numDiscordant++; } + if ( ((xi > xj) && (yi > yj)) || ((xi < xj) && (yi < yj)) ){ numConcordant++; } + } + } + } + + int n = lookupFloat.size(); + double p = (numConcordant - numDiscordant) / (float) (0.5 * n * (n - 1)); + + out << p << '\t'; + } + + + out << endl; + } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "CorrAxesCommand", "calcKendall"); + exit(1); + } +} +//********************************************************************************************************************** int CorrAxesCommand::getShared(){ try { InputData* input = new InputData(sharedfile, "sharedfile"); diff --git a/corraxescommand.h b/corraxescommand.h index 01c8d2b..45a2b78 100644 --- a/corraxescommand.h +++ b/corraxescommand.h @@ -15,6 +15,15 @@ #include "sharedrabundfloatvector.h" #include "inputdata.h" +/***************************************************************/ +struct spearmanRank { + string name; + float score; + + spearmanRank(string n, float s) : name(n), score(s) {} +}; +/***************************************************************/ + class CorrAxesCommand : public Command { public: CorrAxesCommand(string); @@ -28,6 +37,8 @@ public: void help(); private: + + GlobalData* globaldata; string axesfile, sharedfile, relabundfile, metadatafile, groups, label, inputFileName, outputDir, method; bool abort, pickedGroups; @@ -46,6 +57,7 @@ private: map > readAxes(); int calcPearson(map >&, ofstream&); int calcSpearman(map >&, ofstream&); + int calcKendall(map >&, ofstream&); }; diff --git a/getseqscommand.cpp b/getseqscommand.cpp index 39c4fe0..d40dcc6 100644 --- a/getseqscommand.cpp +++ b/getseqscommand.cpp @@ -14,7 +14,7 @@ //********************************************************************************************************************** vector GetSeqsCommand::getValidParameters(){ try { - string Array[] = {"fasta","name", "group", "qfile","alignreport", "accnos", "dups", "list","taxonomy","outputdir","inputdir"}; + string Array[] = {"fasta","name", "group", "qfile","alignreport", "accnos", "accnos2","dups", "list","taxonomy","outputdir","inputdir"}; vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); return myArray; } @@ -36,6 +36,7 @@ GetSeqsCommand::GetSeqsCommand(){ outputTypes["alignreport"] = tempOutNames; outputTypes["list"] = tempOutNames; outputTypes["qfile"] = tempOutNames; + outputTypes["accnosreport"] = tempOutNames; } catch(exception& e) { m->errorOut(e, "GetSeqsCommand", "GetSeqsCommand"); @@ -75,7 +76,7 @@ GetSeqsCommand::GetSeqsCommand(string option) { else { //valid paramters for this command - string Array[] = {"fasta","name", "group", "alignreport", "qfile", "accnos", "dups", "list","taxonomy","outputdir","inputdir"}; + string Array[] = {"fasta","name", "group", "alignreport", "qfile", "accnos", "accnos2","dups", "list","taxonomy","outputdir","inputdir"}; vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); OptionParser parser(option); @@ -98,6 +99,7 @@ GetSeqsCommand::GetSeqsCommand(string option) { outputTypes["alignreport"] = tempOutNames; outputTypes["list"] = tempOutNames; outputTypes["qfile"] = tempOutNames; + outputTypes["accnosreport"] = tempOutNames; //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } @@ -131,6 +133,14 @@ GetSeqsCommand::GetSeqsCommand(string option) { if (path == "") { parameters["accnos"] = inputDir + it->second; } } + it = parameters.find("accnos2"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["accnos2"] = inputDir + it->second; } + } + it = parameters.find("list"); //user has given a template file if(it != parameters.end()){ @@ -178,6 +188,9 @@ GetSeqsCommand::GetSeqsCommand(string option) { if (accnosfile == "not open") { abort = true; } else if (accnosfile == "not found") { accnosfile = ""; m->mothurOut("You must provide an accnos file."); m->mothurOutEndLine(); abort = true; } + accnosfile2 = validParameter.validFile(parameters, "accnos2", true); + if (accnosfile2 == "not open") { abort = true; } + fastafile = validParameter.validFile(parameters, "fasta", true); if (fastafile == "not open") { abort = true; } else if (fastafile == "not found") { fastafile = ""; } @@ -210,7 +223,7 @@ GetSeqsCommand::GetSeqsCommand(string option) { string temp = validParameter.validFile(parameters, "dups", false); if (temp == "not found") { temp = "false"; usedDups = ""; } dups = m->isTrue(temp); - if ((fastafile == "") && (namefile == "") && (groupfile == "") && (alignfile == "") && (listfile == "") && (taxfile == "") && (qualfile == "")) { m->mothurOut("You must provide one of the following: fasta, name, group, alignreport, taxonomy, quality or listfile."); m->mothurOutEndLine(); abort = true; } + if ((fastafile == "") && (namefile == "") && (groupfile == "") && (alignfile == "") && (listfile == "") && (taxfile == "") && (qualfile == "") && (accnosfile2 == "")) { m->mothurOut("You must provide one of the following: fasta, name, group, alignreport, taxonomy, quality or listfile."); m->mothurOutEndLine(); abort = true; } if ((usedDups != "") && (namefile == "")) { m->mothurOut("You may only use dups with the name option."); m->mothurOutEndLine(); abort = true; } @@ -253,13 +266,14 @@ int GetSeqsCommand::execute(){ if (m->control_pressed) { return 0; } //read through the correct file and output lines you want to keep - if (namefile != "") { readName(); } - if (fastafile != "") { readFasta(); } - if (groupfile != "") { readGroup(); } - if (alignfile != "") { readAlign(); } - if (listfile != "") { readList(); } - if (taxfile != "") { readTax(); } - if (qualfile != "") { readQual(); } + if (namefile != "") { readName(); } + if (fastafile != "") { readFasta(); } + if (groupfile != "") { readGroup(); } + if (alignfile != "") { readAlign(); } + if (listfile != "") { readList(); } + if (taxfile != "") { readTax(); } + if (qualfile != "") { readQual(); } + if (accnosfile2 != "") { compareAccnos(); } if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; } @@ -746,6 +760,73 @@ int GetSeqsCommand::readAccnos(){ exit(1); } } +//********************************************************************************************************************** + +int GetSeqsCommand::compareAccnos(){ + try { + + string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(accnosfile); } + string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(accnosfile)) + "accnos.report"; + ofstream out; + m->openOutputFile(outputFileName, out); + + ifstream in; + m->openInputFile(accnosfile2, in); + string name; + + set namesAccnos2; + set namesDups; + set namesAccnos = names; + + while(!in.eof()){ + in >> name; + + if (namesAccnos.count(name) == 0){ //name unique to accnos2 + namesAccnos2.insert(name); + }else { //you are in both so erase + namesAccnos.erase(name); + namesDups.insert(name); + } + + m->gobble(in); + } + in.close(); + + out << "Names in both files : " + toString(namesDups.size()) << endl; + m->mothurOut("Names in both files : " + toString(namesDups.size())); m->mothurOutEndLine(); + + for (set::iterator it = namesDups.begin(); it != namesDups.end(); it++) { + out << (*it) << endl; + } + + out << "Names unique to " + accnosfile + " : " + toString(namesAccnos.size()) << endl; + m->mothurOut("Names unique to " + accnosfile + " : " + toString(namesAccnos.size())); m->mothurOutEndLine(); + + for (set::iterator it = namesAccnos.begin(); it != namesAccnos.end(); it++) { + out << (*it) << endl; + } + + out << "Names unique to " + accnosfile2 + " : " + toString(namesAccnos2.size()) << endl; + m->mothurOut("Names unique to " + accnosfile2 + " : " + toString(namesAccnos2.size())); m->mothurOutEndLine(); + + for (set::iterator it = namesAccnos2.begin(); it != namesAccnos2.end(); it++) { + out << (*it) << endl; + } + + out.close(); + + outputNames.push_back(outputFileName); outputTypes["accnosreport"].push_back(outputFileName); + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "GetSeqsCommand", "readAccnos"); + exit(1); + } +} + //********************************************************************************************************************** diff --git a/getseqscommand.h b/getseqscommand.h index b249072..aef789a 100644 --- a/getseqscommand.h +++ b/getseqscommand.h @@ -29,7 +29,7 @@ class GetSeqsCommand : public Command { private: set names; vector outputNames; - string accnosfile, fastafile, namefile, groupfile, alignfile, listfile, taxfile, qualfile, outputDir; + string accnosfile, accnosfile2, fastafile, namefile, groupfile, alignfile, listfile, taxfile, qualfile, outputDir; bool abort, dups; map > outputTypes; @@ -41,6 +41,7 @@ class GetSeqsCommand : public Command { int readList(); int readTax(); int readQual(); + int compareAccnos(); }; diff --git a/mothur b/mothur index 90a4275..e36592e 100755 Binary files a/mothur and b/mothur differ diff --git a/sequence.cpp b/sequence.cpp index 42c26d5..872a0f0 100644 --- a/sequence.cpp +++ b/sequence.cpp @@ -139,7 +139,7 @@ Sequence::Sequence(ifstream& fastaFile){ m = MothurOut::getInstance(); initialize(); fastaFile >> name; - + if (name.length() != 0) { name = name.substr(1);