linePair(){}
};
+
/***********************************************************************/
class Chimera {
exit(1);
}
}
+//***************************************************************************************************************
+ChimeraSlayer::ChimeraSlayer(string file, string temp, string name, string mode, int k, int ms, int mms, int win, float div,
+ int minsim, int mincov, int minbs, int minsnp, int par, int it, int inc, int numw, bool r) : Chimera() {
+ try {
+ fastafile = file; templateSeqs = readSeqs(fastafile);
+ templateFileName = temp;
+ searchMethod = mode;
+ kmerSize = k;
+ match = ms;
+ misMatch = mms;
+ window = win;
+ divR = div;
+ minSim = minsim;
+ minCov = mincov;
+ minBS = minbs;
+ minSNP = minsnp;
+ parents = par;
+ iters = it;
+ increment = inc;
+ numWanted = numw;
+ realign = r;
+
+ //read name file and create nameMapRank
+ readNameFile(name);
+
+ decalc = new DeCalculator();
+
+ createFilter(templateSeqs, 0.0); //just removed columns where all seqs have a gap
+
+ //run filter on template
+ for (int i = 0; i < templateSeqs.size(); i++) { runFilter(templateSeqs[i]); }
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ChimeraSlayer", "ChimeraSlayer");
+ exit(1);
+ }
+}
+//***************************************************************************************************************
+int ChimeraSlayer::readNameFile(string name) {
+ try {
+ ifstream in;
+ m->openInputFile(name, in);
+
+ int maxRank = 0;
+ int minRank = 10000000;
+
+ while(!in.eof()){
+
+ if (m->control_pressed) { in.close(); return 0; }
+
+ string thisname, repnames;
+
+ in >> thisname; m->gobble(in); //read from first column
+ in >> repnames; //read from second column
+
+ map<string, vector<string> >::iterator it = nameMapRank.find(thisname);
+ if (it == nameMapRank.end()) {
+
+ vector<string> splitRepNames;
+ m->splitAtComma(repnames, splitRepNames);
+
+ nameMapRank[thisname] = splitRepNames;
+
+ if (splitRepNames.size() > maxRank) { maxRank = splitRepNames.size(); }
+ if (splitRepNames.size() < minRank) { minRank = splitRepNames.size(); }
+
+ }else{ m->mothurOut(thisname + " is already in namesfile. I will use first definition."); m->mothurOutEndLine(); }
+
+ m->gobble(in);
+ }
+ in.close();
+
+ //sanity check to make sure files match
+ for (int i = 0; i < templateSeqs.size(); i++) {
+ map<string, vector<string> >::iterator it = nameMapRank.find(templateSeqs[i]->getName());
+
+ if (it == nameMapRank.end()) { m->mothurOut("[ERROR]: " + templateSeqs[i]->getName() + " is not in namesfile, but is in fastafile. Every name in fasta file must be in first column of names file."); m->mothurOutEndLine(); m->control_pressed = true; }
+ }
+
+ if (maxRank == minRank) { m->mothurOut("[ERROR]: all sequences in namesfile have the same abundance, aborting."); m->mothurOutEndLine(); m->control_pressed = true; }
+
+ return 0;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ChimeraSlayer", "readNameFile");
+ exit(1);
+ }
+}
+
//***************************************************************************************************************
int ChimeraSlayer::doPrep() {
try {
exit(1);
}
}
+//***************************************************************************************************************
+vector<Sequence*> ChimeraSlayer::getTemplate(Sequence* q) {
+ try {
+
+ vector<Sequence*> thisTemplate;
+
+ int thisRank;
+ string thisName = q->getName();
+ map<string, vector<string> >::iterator itRank = nameMapRank.find(thisName); // you will find it because we already sanity checked
+ thisRank = (itRank->second).size();
+
+ //create list of names we want to put into the template
+ set<string> namesToAdd;
+ for (itRank = nameMapRank.begin(); itRank != nameMapRank.end(); itRank++) {
+ if ((itRank->second).size() > thisRank) {
+ //you are more abundant than me
+ for (int i = 0; i < (itRank->second).size(); i++) {
+ namesToAdd.insert((itRank->second)[i]);
+ }
+ }
+ }
+
+ for (int i = 0; i < templateSeqs.size(); i++) {
+ if (namesToAdd.count(templateSeqs[i]->getName()) != 0) {
+ thisTemplate.push_back(templateSeqs[i]);
+ }
+ }
+
+ string kmerDBNameLeft;
+ string kmerDBNameRight;
+
+ //generate the kmerdb to pass to maligner
+ if (searchMethod == "kmer") {
+ string templatePath = m->hasPath(templateFileName);
+ string rightTemplateFileName = templatePath + "right." + m->getRootName(m->getSimpleName(templateFileName));
+ databaseRight = new KmerDB(rightTemplateFileName, kmerSize);
+
+ string leftTemplateFileName = templatePath + "left." + m->getRootName(m->getSimpleName(templateFileName));
+ databaseLeft = new KmerDB(leftTemplateFileName, kmerSize);
+#ifdef USE_MPI
+ for (int i = 0; i < thisTemplate.size(); i++) {
+
+ if (m->control_pressed) { return thisTemplate; }
+
+ string leftFrag = thisTemplate[i]->getUnaligned();
+ leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33));
+
+ Sequence leftTemp(thisTemplate[i]->getName(), leftFrag);
+ databaseLeft->addSequence(leftTemp);
+ }
+ databaseLeft->generateDB();
+ databaseLeft->setNumSeqs(thisTemplate.size());
+
+ for (int i = 0; i < thisTemplate.size(); i++) {
+ if (m->control_pressed) { return thisTemplate; }
+
+ string rightFrag = thisTemplate[i]->getUnaligned();
+ rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66));
+
+ Sequence rightTemp(thisTemplate[i]->getName(), rightFrag);
+ databaseRight->addSequence(rightTemp);
+ }
+ databaseRight->generateDB();
+ databaseRight->setNumSeqs(thisTemplate.size());
+
+#else
+
+
+ for (int i = 0; i < thisTemplate.size(); i++) {
+
+ if (m->control_pressed) { return thisTemplate; }
+
+ string leftFrag = thisTemplate[i]->getUnaligned();
+ leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33));
+
+ Sequence leftTemp(thisTemplate[i]->getName(), leftFrag);
+ databaseLeft->addSequence(leftTemp);
+ }
+ databaseLeft->generateDB();
+ databaseLeft->setNumSeqs(thisTemplate.size());
+
+ for (int i = 0; i < thisTemplate.size(); i++) {
+ if (m->control_pressed) { return thisTemplate; }
+
+ string rightFrag = thisTemplate[i]->getUnaligned();
+ rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66));
+
+ Sequence rightTemp(thisTemplate[i]->getName(), rightFrag);
+ databaseRight->addSequence(rightTemp);
+ }
+ databaseRight->generateDB();
+ databaseRight->setNumSeqs(thisTemplate.size());
+#endif
+ }else if (searchMethod == "blast") {
+
+ //generate blastdb
+ databaseLeft = new BlastDB(-2.0, -1.0, match, misMatch);
+ for (int i = 0; i < thisTemplate.size(); i++) { if (m->control_pressed) { return thisTemplate; } databaseLeft->addSequence(*thisTemplate[i]); }
+ databaseLeft->generateDB();
+ databaseLeft->setNumSeqs(thisTemplate.size());
+ }
+
+ return thisTemplate;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ChimeraSlayer", "getTemplate");
+ exit(1);
+ }
+}
+
//***************************************************************************************************************
ChimeraSlayer::~ChimeraSlayer() {
delete decalc;
- if (searchMethod == "kmer") { delete databaseRight; delete databaseLeft; }
- else if (searchMethod == "blast") { delete databaseLeft; }
+ if (templateFileName != "self") {
+ if (searchMethod == "kmer") { delete databaseRight; delete databaseLeft; }
+ else if (searchMethod == "blast") { delete databaseLeft; }
+ }
}
//***************************************************************************************************************
void ChimeraSlayer::printHeader(ostream& out) {
querySeq = query;
+ //you must create a template
+ vector<Sequence*> thisTemplate;
+ if (templateFileName != "self") { thisTemplate = templateSeqs; }
+ else { thisTemplate = getTemplate(query); } //fills thistemplate and creates the databases
+
+ if (m->control_pressed) { return 0; }
+
+ if (thisTemplate.size() == 0) { return 0; } //not chimeric
+
//referenceSeqs, numWanted, matchScore, misMatchPenalty, divR, minSimilarity
- Maligner maligner(templateSeqs, numWanted, match, misMatch, divR, minSim, minCov, searchMethod, databaseLeft, databaseRight);
+ Maligner maligner(thisTemplate, numWanted, match, misMatch, divR, minSim, minCov, searchMethod, databaseLeft, databaseRight);
Slayer slayer(window, increment, minSim, divR, iters, minSNP);
+
+ if (templateFileName == "self") {
+ if (searchMethod == "kmer") { delete databaseRight; delete databaseLeft; }
+ else if (searchMethod == "blast") { delete databaseLeft; }
+ }
if (m->control_pressed) { return 0; }
//found in testing realigning only made things worse
if (realign) {
- ChimeraReAligner realigner(templateSeqs, match, misMatch);
+ ChimeraReAligner realigner(thisTemplate, match, misMatch);
realigner.reAlign(query, Results);
}
#include "maligner.h"
#include "slayer.h"
+
+
//***********************************************************************/
//This class was modeled after the chimeraSlayer written by the Broad Institute
/***********************************************************************/
public:
ChimeraSlayer(string, string, string, int, int, int, int, float, int, int, int, int, int, int, int, int, bool);
+ ChimeraSlayer(string, string, string, string, int, int, int, int, float, int, int, int, int, int, int, int, int, bool);
+
~ChimeraSlayer();
int getChimeras(Sequence*);
map<int, int> spotMap;
Database* databaseRight;
Database* databaseLeft;
+ map<string, vector<string> > nameMapRank; //sequence name to rank so you can construct a template of the abundant sequences if the user uses itself as template
vector<data_struct> chimeraResults;
string chimeraFlags, searchMethod, fastafile;
void printBlock(data_struct, string, ostream&);
string getBlock(data_struct, string);
+ int readNameFile(string);
+ vector<Sequence*> getTemplate(Sequence*);
};
#include "chimeraslayercommand.h"
#include "chimeraslayer.h"
+#include "deconvolutecommand.h"
//**********************************************************************************************************************
vector<string> ChimeraSlayerCommand::getValidParameters(){
try {
- string AlignArray[] = {"fasta", "processors", "window", "template","numwanted", "ksize", "match","mismatch",
+ string AlignArray[] = {"fasta", "processors", "name","window", "template","numwanted", "ksize", "match","mismatch",
"divergence", "minsim","mincov","minbs", "minsnp","parents", "iters","outputdir","inputdir", "search","realign" };
vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
return myArray;
else {
//valid paramters for this command
- string Array[] = {"fasta", "processors", "window", "template","numwanted", "ksize", "match","mismatch",
+ string Array[] = {"fasta", "processors","name", "window", "template","numwanted", "ksize", "match","mismatch",
"divergence", "minsim","mincov","minbs", "minsnp","parents", "iters","outputdir","inputdir", "search","realign" };
vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
//if the user changes the input directory command factory will send this info to us in the output parameter
string inputDir = validParameter.validFile(parameters, "inputdir", false);
if (inputDir == "not found"){ inputDir = ""; }
- else {
- string path;
- it = parameters.find("template");
- //user has given a template file
- if(it != parameters.end()){
- path = m->hasPath(it->second);
- //if the user has not given a path then, add inputdir. else leave path alone.
- if (path == "") { parameters["template"] = inputDir + it->second; }
- }
- }
-
-
+
//check for required parameters
fastafile = validParameter.validFile(parameters, "fasta", false);
- if (fastafile == "not found") { fastafile = ""; m->mothurOut("fasta is a required parameter for the chimera.slayer command."); m->mothurOutEndLine(); abort = true; }
+ if (fastafile == "not found") { fastafile = ""; m->mothurOut("[ERROR]: fasta is a required parameter for the chimera.slayer command."); m->mothurOutEndLine(); abort = true; }
else {
m->splitAtDash(fastafile, fastaFileNames);
}
//make sure there is at least one valid file left
- if (fastaFileNames.size() == 0) { m->mothurOut("no valid files."); m->mothurOutEndLine(); abort = true; }
+ if (fastaFileNames.size() == 0) { m->mothurOut("[ERROR]: no valid files."); m->mothurOutEndLine(); abort = true; }
+ }
+
+
+ //check for required parameters
+ bool hasName = true;
+ namefile = validParameter.validFile(parameters, "name", false);
+ if (namefile == "not found") { namefile = ""; hasName = false; }
+ else {
+ m->splitAtDash(namefile, nameFileNames);
+
+ //go through files and make sure they are good, if not, then disregard them
+ for (int i = 0; i < nameFileNames.size(); i++) {
+ if (inputDir != "") {
+ string path = m->hasPath(nameFileNames[i]);
+ //if the user has not given a path then, add inputdir. else leave path alone.
+ if (path == "") { nameFileNames[i] = inputDir + nameFileNames[i]; }
+ }
+
+ int ableToOpen;
+ ifstream in;
+
+ ableToOpen = m->openInputFile(nameFileNames[i], in, "noerror");
+
+ //if you can't open it, try default location
+ if (ableToOpen == 1) {
+ if (m->getDefaultPath() != "") { //default path is set
+ string tryPath = m->getDefaultPath() + m->getSimpleName(nameFileNames[i]);
+ m->mothurOut("Unable to open " + nameFileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine();
+ ifstream in2;
+ ableToOpen = m->openInputFile(tryPath, in2, "noerror");
+ in2.close();
+ nameFileNames[i] = tryPath;
+ }
+ }
+
+ if (ableToOpen == 1) {
+ if (m->getOutputDir() != "") { //default path is set
+ string tryPath = m->getOutputDir() + m->getSimpleName(nameFileNames[i]);
+ m->mothurOut("Unable to open " + nameFileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine();
+ ifstream in2;
+ ableToOpen = m->openInputFile(tryPath, in2, "noerror");
+ in2.close();
+ nameFileNames[i] = tryPath;
+ }
+ }
+
+ in.close();
+
+ if (ableToOpen == 1) {
+ m->mothurOut("Unable to open " + nameFileNames[i] + ". It will be disregarded."); m->mothurOutEndLine();
+ //erase from file list
+ nameFileNames.erase(nameFileNames.begin()+i);
+ i--;
+ }
+ }
+
+ //make sure there is at least one valid file left
+ if (nameFileNames.size() == 0) { m->mothurOut("[ERROR]: no valid name files."); m->mothurOutEndLine(); abort = true; }
}
+ if (hasName && (nameFileNames.size() != fastaFileNames.size())) { m->mothurOut("[ERROR]: The number of namefiles does not match the number of fastafiles, please correct."); m->mothurOutEndLine(); abort=true; }
+
//if the user changes the output directory command factory will send this info to us in the output parameter
outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
-
- templatefile = validParameter.validFile(parameters, "template", true);
- if (templatefile == "not open") { abort = true; }
- else if (templatefile == "not found") { templatefile = ""; m->mothurOut("template is a required parameter for the chimera.slayer command."); m->mothurOutEndLine(); abort = true; }
-
+
+
+ string path;
+ it = parameters.find("template");
+ //user has given a template file
+ if(it != parameters.end()){
+ if (it->second == "self") { templatefile = "self"; }
+ else {
+ path = m->hasPath(it->second);
+ //if the user has not given a path then, add inputdir. else leave path alone.
+ if (path == "") { parameters["template"] = inputDir + it->second; }
+
+ templatefile = validParameter.validFile(parameters, "template", true);
+ if (templatefile == "not open") { abort = true; }
+ else if (templatefile == "not found") { templatefile = ""; m->mothurOut("template is a required parameter for the chimera.slayer command."); m->mothurOutEndLine(); abort = true; }
+ }
+ }
+
string temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found") { temp = "1"; }
convert(temp, processors);
m->mothurOut("The chimera.slayer command reads a fastafile and templatefile and outputs potentially chimeric sequences.\n");
m->mothurOut("This command was modeled after the chimeraSlayer written by the Broad Institute.\n");
- m->mothurOut("The chimera.slayer command parameters are fasta, template, processors, ksize, window, match, mismatch, divergence. minsim, mincov, minbs, minsnp, parents, search, iters, increment and numwanted.\n"); //realign,
+ m->mothurOut("The chimera.slayer command parameters are fasta, name, template, processors, ksize, window, match, mismatch, divergence. minsim, mincov, minbs, minsnp, parents, search, iters, increment and numwanted.\n"); //realign,
m->mothurOut("The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required. \n");
+ m->mothurOut("The name parameter allows you to provide a name file, if you are using template=self. \n");
m->mothurOut("You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amazon.fasta \n");
- m->mothurOut("The template parameter allows you to enter a template file containing known non-chimeric sequences, and is required. \n");
+ m->mothurOut("The template parameter allows you to enter a template file containing known non-chimeric sequences, and is required. You may also set template=self, in this case the abundant sequences will be used as potential parents. \n");
m->mothurOut("The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n");
#ifdef USE_MPI
m->mothurOut("When using MPI, the processors parameter is set to the number of MPI processes running. \n");
int start = time(NULL);
- chimera = new ChimeraSlayer(fastaFileNames[s], templatefile, search, ksize, match, mismatch, window, divR, minSimilarity, minCoverage, minBS, minSNP, parents, iters, increment, numwanted, realign);
+ if (templatefile != "self") { //you want to run slayer with a refernce template
+ chimera = new ChimeraSlayer(fastaFileNames[s], templatefile, search, ksize, match, mismatch, window, divR, minSimilarity, minCoverage, minBS, minSNP, parents, iters, increment, numwanted, realign);
+ }else {
+ if (nameFileNames.size() != 0) { //you provided a namefile and we don't need to create one
+ chimera = new ChimeraSlayer(fastaFileNames[s], templatefile, nameFileNames[s], search, ksize, match, mismatch, window, divR, minSimilarity, minCoverage, minBS, minSNP, parents, iters, increment, numwanted, realign);
+ }else {
+
+ m->mothurOutEndLine(); m->mothurOut("No namesfile given, running unique.seqs command to generate one."); m->mothurOutEndLine(); m->mothurOutEndLine();
+
+ //use unique.seqs to create new name and fastafile
+ string inputString = "fasta=" + fastaFileNames[s];
+ m->mothurOut("/******************************************/"); m->mothurOutEndLine();
+ m->mothurOut("Running command: unique.seqs(" + inputString + ")"); m->mothurOutEndLine();
+
+ Command* uniqueCommand = new DeconvoluteCommand(inputString);
+ uniqueCommand->execute();
+
+ map<string, vector<string> > filenames = uniqueCommand->getOutputFiles();
+
+ delete uniqueCommand;
+
+ m->mothurOut("/******************************************/"); m->mothurOutEndLine();
+
+ string nameFile = filenames["name"][0];
+ fastaFileNames[s] = filenames["fasta"][0];
+ chimera = new ChimeraSlayer(fastaFileNames[s], templatefile, nameFile, search, ksize, match, mismatch, window, divR, minSimilarity, minCoverage, minBS, minSNP, parents, iters, increment, numwanted, realign);
+ }
+ }
+
if (outputDir == "") { outputDir = m->hasPath(fastaFileNames[s]); }//if user entered a file with a path then preserve it
string outputFileName = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + "slayer.chimeras";
string accnosFileName = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + "slayer.accnos";
#endif
bool abort, realign;
- string fastafile, templatefile, outputDir, search;
+ string fastafile, templatefile, outputDir, search, namefile;
int processors, window, iters, increment, numwanted, ksize, match, mismatch, parents, minSimilarity, minCoverage, minBS, minSNP, numSeqs, templateSeqsLength;
float divR;
Chimera* chimera;
vector<string> outputNames;
map<string, vector<string> > outputTypes;
vector<string> fastaFileNames;
+ vector<string> nameFileNames;
};
#include "corraxescommand.h"
+//********************************************************************************************************************
+//sorts highes to lowest
+inline bool compareSpearman(spearmanRank left, spearmanRank right){
+ return (left.score > right.score);
+}
//**********************************************************************************************************************
vector<string> CorrAxesCommand::getValidParameters(){
try {
if (method == "pearson") { calcPearson(axes, out); }
else if (method == "spearman") { calcSpearman(axes, out); }
- //else if (method == "kendall") { calcKendal(axes, out); }
- //else { m->mothurOut("[ERROR]: Invalid method."); m->mothurOutEndLine(); }
+ else if (method == "kendall") { calcKendall(axes, out); }
+ else { m->mothurOut("[ERROR]: Invalid method."); m->mothurOutEndLine(); }
out.close();
for (int i = 0; i < lookupFloat.size(); i++) { delete lookupFloat[i]; }
int CorrAxesCommand::calcSpearman(map<string, vector<float> >& axes, ofstream& out) {
try {
- //find average of each axis - X
- vector<float> averageAxes; averageAxes.resize(numaxes, 0.0);
+ //format data
+ vector< vector<spearmanRank> > scores; scores.resize(numaxes);
for (map<string, vector<float> >::iterator it = axes.begin(); it != axes.end(); it++) {
vector<float> temp = it->second;
for (int i = 0; i < temp.size(); i++) {
- averageAxes[i] += temp[i];
+ spearmanRank member(it->first, temp[i]);
+ scores[i].push_back(member);
}
}
- for (int i = 0; i < averageAxes.size(); i++) { averageAxes[i] = averageAxes[i] / (float) axes.size(); }
+ //sort each axis
+ for (int i = 0; i < numaxes; i++) { sort(scores[i].begin(), scores[i].end(), compareSpearman); }
+
+ //find ranks of xi in each axis
+ map<string, vector<float> > rankAxes;
+ for (int i = 0; i < numaxes; i++) {
+
+ vector<spearmanRank> ties;
+ int rankTotal = 0;
+ for (int j = 0; j < scores[i].size(); j++) {
+ rankTotal += j;
+ ties.push_back(scores[i][j]);
+
+ if (j != scores.size()-1) { // you are not the last so you can look ahead
+ if (scores[i][j].score != scores[i][j+1].score) { // you are done with ties, rank them and continue
+ for (int k = 0; k < ties.size(); k++) {
+ float thisrank = rankTotal / (float) ties.size();
+ rankAxes[ties[k].name].push_back(thisrank);
+ }
+ ties.clear();
+ rankTotal = 0;
+ }
+ }else { // you are the last one
+ for (int k = 0; k < ties.size(); k++) {
+ float thisrank = rankTotal / (float) ties.size();
+ rankAxes[ties[k].name].push_back(thisrank);
+ }
+ }
+ }
+ }
+
+
//for each otu
for (int i = 0; i < lookupFloat[0]->getNumBins(); i++) {
out << i+1 << '\t';
- //find the averages this otu - Y
- float sumOtu = 0.0;
+ //find the ranks of this otu - Y
+ vector<spearmanRank> otuScores;
for (int j = 0; j < lookupFloat.size(); j++) {
- sumOtu += lookupFloat[j]->getAbundance(i);
+ spearmanRank member(lookupFloat[j]->getGroup(), lookupFloat[j]->getAbundance(i));
+ otuScores.push_back(member);
}
- float Ybar = sumOtu / (float) lookupFloat.size();
- //find r value for each axis
- for (int k = 0; k < averageAxes.size(); k++) {
+ sort(otuScores.begin(), otuScores.end(), compareSpearman);
+
+ map<string, float> rankOtus;
+ vector<spearmanRank> ties;
+ int rankTotal = 0;
+ for (int j = 0; j < otuScores.size(); j++) {
+ rankTotal += j;
+ ties.push_back(otuScores[j]);
- double r = 0.0;
- double numerator = 0.0;
- double denomTerm1 = 0.0;
- double denomTerm2 = 0.0;
- for (int j = 0; j < lookupFloat.size(); j++) {
- float Yi = lookupFloat[j]->getAbundance(i);
- float Xi = axes[lookupFloat[j]->getGroup()][k];
-
- numerator += ((Xi - averageAxes[k]) * (Yi - Ybar));
- denomTerm1 += ((Xi - averageAxes[k]) * (Xi - averageAxes[k]));
- denomTerm2 += ((Yi - Ybar) * (Yi - Ybar));
+ if (j != scores.size()-1) { // you are not the last so you can look ahead
+ if (otuScores[j].score != otuScores[j+1].score) { // you are done with ties, rank them and continue
+ for (int k = 0; k < ties.size(); k++) {
+ float thisrank = rankTotal / (float) ties.size();
+ rankOtus[ties[k].name] = thisrank;
+ }
+ ties.clear();
+ rankTotal = 0;
+ }
+ }else { // you are the last one
+ for (int k = 0; k < ties.size(); k++) {
+ float thisrank = rankTotal / (float) ties.size();
+ rankOtus[ties[k].name] = thisrank;
+ }
}
+ }
+
+ //calc spearman ranks for each axis for this otu
+ for (int j = 0; j < numaxes; j++) {
- double denom = (sqrt(denomTerm1 * denomTerm2));
+ double di = 0.0;
+ for (int k = 0; k < lookupFloat.size(); k++) {
+
+ float xi = rankAxes[lookupFloat[k]->getGroup()][j];
+ float yi = rankOtus[lookupFloat[k]->getGroup()];
+
+ di += ((xi - yi) * (xi - yi));
+ }
- r = numerator / denom;
+ int n = lookupFloat.size();
+ double p = 1.0 - ((6 * di) / (float) (n * ((n*n) - 1)));
- out << r << '\t';
+ out << p << '\t';
}
+
out << endl;
}
}
}
//**********************************************************************************************************************
+int CorrAxesCommand::calcKendall(map<string, vector<float> >& axes, ofstream& out) {
+ try {
+
+ //format data
+ vector< vector<spearmanRank> > scores; scores.resize(numaxes);
+ for (map<string, vector<float> >::iterator it = axes.begin(); it != axes.end(); it++) {
+ vector<float> temp = it->second;
+ for (int i = 0; i < temp.size(); i++) {
+ spearmanRank member(it->first, temp[i]);
+ scores[i].push_back(member);
+ }
+ }
+
+ //sort each axis
+ for (int i = 0; i < numaxes; i++) { sort(scores[i].begin(), scores[i].end(), compareSpearman); }
+
+ //find ranks of xi in each axis
+ map<string, vector<float> > rankAxes;
+ for (int i = 0; i < numaxes; i++) {
+
+ vector<spearmanRank> ties;
+ int rankTotal = 0;
+ for (int j = 0; j < scores[i].size(); j++) {
+ rankTotal += j;
+ ties.push_back(scores[i][j]);
+
+ if (j != scores.size()-1) { // you are not the last so you can look ahead
+ if (scores[i][j].score != scores[i][j+1].score) { // you are done with ties, rank them and continue
+ for (int k = 0; k < ties.size(); k++) {
+ float thisrank = rankTotal / (float) ties.size();
+ rankAxes[ties[k].name].push_back(thisrank);
+ }
+ ties.clear();
+ rankTotal = 0;
+ }
+ }else { // you are the last one
+ for (int k = 0; k < ties.size(); k++) {
+ float thisrank = rankTotal / (float) ties.size();
+ rankAxes[ties[k].name].push_back(thisrank);
+ }
+ }
+ }
+ }
+
+
+
+ //for each otu
+ for (int i = 0; i < lookupFloat[0]->getNumBins(); i++) {
+
+ out << i+1 << '\t';
+
+ //find the ranks of this otu - Y
+ vector<spearmanRank> otuScores;
+ for (int j = 0; j < lookupFloat.size(); j++) {
+ spearmanRank member(lookupFloat[j]->getGroup(), lookupFloat[j]->getAbundance(i));
+ otuScores.push_back(member);
+ }
+
+ sort(otuScores.begin(), otuScores.end(), compareSpearman);
+
+ map<string, float> rankOtus;
+ vector<spearmanRank> ties;
+ int rankTotal = 0;
+ for (int j = 0; j < otuScores.size(); j++) {
+ rankTotal += j;
+ ties.push_back(otuScores[j]);
+
+ if (j != scores.size()-1) { // you are not the last so you can look ahead
+ if (otuScores[j].score != otuScores[j+1].score) { // you are done with ties, rank them and continue
+ for (int k = 0; k < ties.size(); k++) {
+ float thisrank = rankTotal / (float) ties.size();
+ rankOtus[ties[k].name] = thisrank;
+ }
+ ties.clear();
+ rankTotal = 0;
+ }
+ }else { // you are the last one
+ for (int k = 0; k < ties.size(); k++) {
+ float thisrank = rankTotal / (float) ties.size();
+ rankOtus[ties[k].name] = thisrank;
+ }
+ }
+ }
+
+ //calc kendall coeffient for each axis for this otu
+ for (int j = 0; j < numaxes; j++) {
+
+ int numConcordant = 0;
+ int numDiscordant = 0;
+
+ for (int f = 0; f < j; f++) {
+
+ for (int k = 0; k < lookupFloat.size(); k++) {
+
+ float xi = rankAxes[lookupFloat[k]->getGroup()][j];
+ float yi = rankOtus[lookupFloat[k]->getGroup()];
+
+ for (int h = 0; h < k; h++) {
+
+ float xj = rankAxes[lookupFloat[k]->getGroup()][f];
+ float yj = rankOtus[lookupFloat[h]->getGroup()];
+
+ if ( ((xi > xj) && (yi < yj)) || ((xi < xj) && (yi > yj)) ){ numDiscordant++; }
+ if ( ((xi > xj) && (yi > yj)) || ((xi < xj) && (yi < yj)) ){ numConcordant++; }
+ }
+ }
+ }
+
+ int n = lookupFloat.size();
+ double p = (numConcordant - numDiscordant) / (float) (0.5 * n * (n - 1));
+
+ out << p << '\t';
+ }
+
+
+ out << endl;
+ }
+
+ return 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "CorrAxesCommand", "calcKendall");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
int CorrAxesCommand::getShared(){
try {
InputData* input = new InputData(sharedfile, "sharedfile");
#include "sharedrabundfloatvector.h"
#include "inputdata.h"
+/***************************************************************/
+struct spearmanRank {
+ string name;
+ float score;
+
+ spearmanRank(string n, float s) : name(n), score(s) {}
+};
+/***************************************************************/
+
class CorrAxesCommand : public Command {
public:
CorrAxesCommand(string);
void help();
private:
+
+
GlobalData* globaldata;
string axesfile, sharedfile, relabundfile, metadatafile, groups, label, inputFileName, outputDir, method;
bool abort, pickedGroups;
map<string, vector<float> > readAxes();
int calcPearson(map<string, vector<float> >&, ofstream&);
int calcSpearman(map<string, vector<float> >&, ofstream&);
+ int calcKendall(map<string, vector<float> >&, ofstream&);
};
//**********************************************************************************************************************
vector<string> GetSeqsCommand::getValidParameters(){
try {
- string Array[] = {"fasta","name", "group", "qfile","alignreport", "accnos", "dups", "list","taxonomy","outputdir","inputdir"};
+ string Array[] = {"fasta","name", "group", "qfile","alignreport", "accnos", "accnos2","dups", "list","taxonomy","outputdir","inputdir"};
vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
return myArray;
}
outputTypes["alignreport"] = tempOutNames;
outputTypes["list"] = tempOutNames;
outputTypes["qfile"] = tempOutNames;
+ outputTypes["accnosreport"] = tempOutNames;
}
catch(exception& e) {
m->errorOut(e, "GetSeqsCommand", "GetSeqsCommand");
else {
//valid paramters for this command
- string Array[] = {"fasta","name", "group", "alignreport", "qfile", "accnos", "dups", "list","taxonomy","outputdir","inputdir"};
+ string Array[] = {"fasta","name", "group", "alignreport", "qfile", "accnos", "accnos2","dups", "list","taxonomy","outputdir","inputdir"};
vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
OptionParser parser(option);
outputTypes["alignreport"] = tempOutNames;
outputTypes["list"] = tempOutNames;
outputTypes["qfile"] = tempOutNames;
+ outputTypes["accnosreport"] = tempOutNames;
//if the user changes the output directory command factory will send this info to us in the output parameter
outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
if (path == "") { parameters["accnos"] = inputDir + it->second; }
}
+ it = parameters.find("accnos2");
+ //user has given a template file
+ if(it != parameters.end()){
+ path = m->hasPath(it->second);
+ //if the user has not given a path then, add inputdir. else leave path alone.
+ if (path == "") { parameters["accnos2"] = inputDir + it->second; }
+ }
+
it = parameters.find("list");
//user has given a template file
if(it != parameters.end()){
if (accnosfile == "not open") { abort = true; }
else if (accnosfile == "not found") { accnosfile = ""; m->mothurOut("You must provide an accnos file."); m->mothurOutEndLine(); abort = true; }
+ accnosfile2 = validParameter.validFile(parameters, "accnos2", true);
+ if (accnosfile2 == "not open") { abort = true; }
+
fastafile = validParameter.validFile(parameters, "fasta", true);
if (fastafile == "not open") { abort = true; }
else if (fastafile == "not found") { fastafile = ""; }
string temp = validParameter.validFile(parameters, "dups", false); if (temp == "not found") { temp = "false"; usedDups = ""; }
dups = m->isTrue(temp);
- if ((fastafile == "") && (namefile == "") && (groupfile == "") && (alignfile == "") && (listfile == "") && (taxfile == "") && (qualfile == "")) { m->mothurOut("You must provide one of the following: fasta, name, group, alignreport, taxonomy, quality or listfile."); m->mothurOutEndLine(); abort = true; }
+ if ((fastafile == "") && (namefile == "") && (groupfile == "") && (alignfile == "") && (listfile == "") && (taxfile == "") && (qualfile == "") && (accnosfile2 == "")) { m->mothurOut("You must provide one of the following: fasta, name, group, alignreport, taxonomy, quality or listfile."); m->mothurOutEndLine(); abort = true; }
if ((usedDups != "") && (namefile == "")) { m->mothurOut("You may only use dups with the name option."); m->mothurOutEndLine(); abort = true; }
if (m->control_pressed) { return 0; }
//read through the correct file and output lines you want to keep
- if (namefile != "") { readName(); }
- if (fastafile != "") { readFasta(); }
- if (groupfile != "") { readGroup(); }
- if (alignfile != "") { readAlign(); }
- if (listfile != "") { readList(); }
- if (taxfile != "") { readTax(); }
- if (qualfile != "") { readQual(); }
+ if (namefile != "") { readName(); }
+ if (fastafile != "") { readFasta(); }
+ if (groupfile != "") { readGroup(); }
+ if (alignfile != "") { readAlign(); }
+ if (listfile != "") { readList(); }
+ if (taxfile != "") { readTax(); }
+ if (qualfile != "") { readQual(); }
+ if (accnosfile2 != "") { compareAccnos(); }
if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
exit(1);
}
}
+//**********************************************************************************************************************
+
+int GetSeqsCommand::compareAccnos(){
+ try {
+
+ string thisOutputDir = outputDir;
+ if (outputDir == "") { thisOutputDir += m->hasPath(accnosfile); }
+ string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(accnosfile)) + "accnos.report";
+ ofstream out;
+ m->openOutputFile(outputFileName, out);
+
+ ifstream in;
+ m->openInputFile(accnosfile2, in);
+ string name;
+
+ set<string> namesAccnos2;
+ set<string> namesDups;
+ set<string> namesAccnos = names;
+
+ while(!in.eof()){
+ in >> name;
+
+ if (namesAccnos.count(name) == 0){ //name unique to accnos2
+ namesAccnos2.insert(name);
+ }else { //you are in both so erase
+ namesAccnos.erase(name);
+ namesDups.insert(name);
+ }
+
+ m->gobble(in);
+ }
+ in.close();
+
+ out << "Names in both files : " + toString(namesDups.size()) << endl;
+ m->mothurOut("Names in both files : " + toString(namesDups.size())); m->mothurOutEndLine();
+
+ for (set<string>::iterator it = namesDups.begin(); it != namesDups.end(); it++) {
+ out << (*it) << endl;
+ }
+
+ out << "Names unique to " + accnosfile + " : " + toString(namesAccnos.size()) << endl;
+ m->mothurOut("Names unique to " + accnosfile + " : " + toString(namesAccnos.size())); m->mothurOutEndLine();
+
+ for (set<string>::iterator it = namesAccnos.begin(); it != namesAccnos.end(); it++) {
+ out << (*it) << endl;
+ }
+
+ out << "Names unique to " + accnosfile2 + " : " + toString(namesAccnos2.size()) << endl;
+ m->mothurOut("Names unique to " + accnosfile2 + " : " + toString(namesAccnos2.size())); m->mothurOutEndLine();
+
+ for (set<string>::iterator it = namesAccnos2.begin(); it != namesAccnos2.end(); it++) {
+ out << (*it) << endl;
+ }
+
+ out.close();
+
+ outputNames.push_back(outputFileName); outputTypes["accnosreport"].push_back(outputFileName);
+
+ return 0;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "GetSeqsCommand", "readAccnos");
+ exit(1);
+ }
+}
+
//**********************************************************************************************************************
private:
set<string> names;
vector<string> outputNames;
- string accnosfile, fastafile, namefile, groupfile, alignfile, listfile, taxfile, qualfile, outputDir;
+ string accnosfile, accnosfile2, fastafile, namefile, groupfile, alignfile, listfile, taxfile, qualfile, outputDir;
bool abort, dups;
map<string, vector<string> > outputTypes;
int readList();
int readTax();
int readQual();
+ int compareAccnos();
};
m = MothurOut::getInstance();
initialize();
fastaFile >> name;
-
+
if (name.length() != 0) {
name = name.substr(1);