From: westcott Date: Tue, 23 Feb 2010 19:31:30 +0000 (+0000) Subject: added warning about merging with something above cutoff to cluster. working on chimeras X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=9c23307c583d4e8595f75278c13e708788f2f058 added warning about merging with something above cutoff to cluster. working on chimeras --- diff --git a/bellerophon.cpp b/bellerophon.cpp index afb88d2..17859dc 100644 --- a/bellerophon.cpp +++ b/bellerophon.cpp @@ -27,7 +27,7 @@ Bellerophon::Bellerophon(string name, string o) { } //*************************************************************************************************************** -void Bellerophon::print(ostream& out) { +void Bellerophon::print(ostream& out, ostream& outAcc) { try { int above1 = 0; out << "Name\tScore\tLeft\tRight\t" << endl; @@ -38,6 +38,7 @@ void Bellerophon::print(ostream& out) { //calc # of seqs with preference above 1.0 if (pref[i].score[0] > 1.0) { above1++; + outAcc << pref[i].name << endl; mothurOut(pref[i].name + " is a suspected chimera at breakpoint " + toString(pref[i].midpoint)); mothurOutEndLine(); mothurOut("It's score is " + toString(pref[i].score[0]) + " with suspected left parent " + pref[i].leftParent[0] + " and right parent " + pref[i].rightParent[0]); mothurOutEndLine(); } diff --git a/bellerophon.h b/bellerophon.h index aa53f98..dd5b3da 100644 --- a/bellerophon.h +++ b/bellerophon.h @@ -29,7 +29,7 @@ class Bellerophon : public Chimera { ~Bellerophon() {}; int getChimeras(); - void print(ostream&); + void print(ostream&, ostream&); void setCons(string){}; void setQuantiles(string) {}; diff --git a/ccode.cpp b/ccode.cpp index 326af7c..28e9723 100644 --- a/ccode.cpp +++ b/ccode.cpp @@ -35,7 +35,7 @@ void Ccode::printHeader(ostream& out) { out << "For full window mapping info refer to " << mapInfo << endl << endl; } //*************************************************************************************************************** -void Ccode::print(ostream& out) { +void Ccode::print(ostream& out, ostream& outAcc) { try { mothurOutEndLine(); @@ -112,6 +112,7 @@ void Ccode::print(ostream& out) { if (results) { mothurOut(querySeq->getName() + " was found have at least one chimeric window."); mothurOutEndLine(); + outAcc << querySeq->getName() << endl; } //free memory diff --git a/ccode.h b/ccode.h index c60b130..5d05cd6 100644 --- a/ccode.h +++ b/ccode.h @@ -28,7 +28,7 @@ class Ccode : public Chimera { ~Ccode(); int getChimeras(Sequence* query); - void print(ostream&); + void print(ostream&, ostream&); void printHeader(ostream&); private: diff --git a/chimera.cpp b/chimera.cpp index 9ad287b..85b0b38 100644 --- a/chimera.cpp +++ b/chimera.cpp @@ -22,14 +22,14 @@ string Chimera::createFilter(vector seqs, float t) { vector t; t.resize(seqs[0]->getAligned().length(), 0); vector g; g.resize(seqs[0]->getAligned().length(), 0); vector c; c.resize(seqs[0]->getAligned().length(), 0); - + filterString = (string(seqs[0]->getAligned().length(), '1')); //for each sequence for (int i = 0; i < seqs.size(); i++) { string seqAligned = seqs[i]->getAligned(); - + for (int j = 0; j < seqAligned.length(); j++) { //if this spot is a gap if ((seqAligned[j] == '-') || (seqAligned[j] == '.')) { gaps[j]++; } @@ -47,10 +47,8 @@ string Chimera::createFilter(vector seqs, float t) { if(gaps[i] == seqs.size()) { filterString[i] = '0'; numColRemoved++; } else if (((a[i] < threshold) && (t[i] < threshold) && (g[i] < threshold) && (c[i] < threshold))) { filterString[i] = '0'; numColRemoved++; } - //cout << "a = " << a[i] << " t = " << t[i] << " g = " << g[i] << " c = " << c[i] << endl; + cout << "a = " << a[i] << " t = " << t[i] << " g = " << g[i] << " c = " << c[i] << endl; } - -//cout << "filter = " << filterString << endl; mothurOut("Filter removed " + toString(numColRemoved) + " columns."); mothurOutEndLine(); return filterString; @@ -95,7 +93,7 @@ vector Chimera::readSeqs(string file) { openInputFile(file, in); vector container; int count = 0; - int length = 0; + length = 0; unaligned = false; //read in seqs and store in vector diff --git a/chimera.h b/chimera.h index da908ce..f893d63 100644 --- a/chimera.h +++ b/chimera.h @@ -114,6 +114,7 @@ class Chimera { virtual void setTemplateSeqs(vector t) { templateSeqs = t; } virtual bool getUnaligned() { return unaligned; } virtual void setTemplateFile(string t) { templateFileName = t; } + virtual int getLength() { return length; } virtual void setCons(string){}; virtual void setQuantiles(string){}; @@ -127,14 +128,14 @@ class Chimera { virtual void printHeader(ostream&){}; virtual int getChimeras(Sequence*){ return 0; } virtual int getChimeras(){ return 0; } - virtual void print(ostream&){}; + virtual void print(ostream&, ostream&){}; protected: vector templateSeqs; bool filter, correction, svg, unaligned; - int processors, window, increment, numWanted, kmerSize, match, misMatch, minSim, minCov, minBS, minSNP, parents, iters; + int processors, window, increment, numWanted, kmerSize, match, misMatch, minSim, minCov, minBS, minSNP, parents, iters, length; float divR; string seqMask, quanfile, filterString, name, outputDir, templateFileName; Sequence* getSequence(string); //find sequence from name diff --git a/chimeracheckrdp.cpp b/chimeracheckrdp.cpp index 39ed487..0a2ea59 100644 --- a/chimeracheckrdp.cpp +++ b/chimeracheckrdp.cpp @@ -24,7 +24,7 @@ ChimeraCheckRDP::~ChimeraCheckRDP() { } } //*************************************************************************************************************** -void ChimeraCheckRDP::print(ostream& out) { +void ChimeraCheckRDP::print(ostream& out, ostream& outAcc) { try { mothurOut("Processing: " + querySeq->getName()); mothurOutEndLine(); diff --git a/chimeracheckrdp.h b/chimeracheckrdp.h index e12e7d3..0ba851a 100644 --- a/chimeracheckrdp.h +++ b/chimeracheckrdp.h @@ -29,7 +29,7 @@ class ChimeraCheckRDP : public Chimera { ~ChimeraCheckRDP(); int getChimeras(Sequence*); - void print(ostream&); + void print(ostream&, ostream&); void doPrep(); private: diff --git a/chimeraseqscommand.cpp b/chimeraseqscommand.cpp index 2ec6ea9..d34149d 100644 --- a/chimeraseqscommand.cpp +++ b/chimeraseqscommand.cpp @@ -311,15 +311,18 @@ int ChimeraSeqsCommand::execute(){ chimera->setIters(iters); chimera->setTemplateFile(templatefile); - + string outputFileName = outputDir + getRootName(getSimpleName(fastafile)) + method + maskfile + ".chimeras"; + string accnosFileName = outputDir + getRootName(getSimpleName(fastafile)) + method + maskfile + ".accnos"; + vector templateSeqs; if ((method != "bellerophon") && (method != "chimeracheck")) { templateSeqs = chimera->readSeqs(templatefile); if (chimera->getUnaligned()) { - mothurOut("Your sequences need to be aligned when you use the chimeraslayer method."); mothurOutEndLine(); + mothurOut("Your template sequences are different lengths, please correct."); mothurOutEndLine(); //free memory for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; } + delete chimera; return 0; } @@ -329,18 +332,24 @@ int ChimeraSeqsCommand::execute(){ }else if (method == "bellerophon") {//run bellerophon separately since you need to read entire fastafile to run it chimera->getChimeras(); - string outputFName = outputDir + getRootName(getSimpleName(fastafile)) + method + maskfile + ".chimeras"; ofstream out; - openOutputFile(outputFName, out); + openOutputFile(outputFileName, out); - chimera->print(out); + ofstream out2; + openOutputFile(accnosFileName, out2); + + chimera->print(out, out2); out.close(); + out2.close(); + return 0; } //some methods need to do prep work before processing the chimeras chimera->doPrep(); + templateSeqsLength = chimera->getLength(); + ofstream outHeader; string tempHeader = outputDir + getRootName(getSimpleName(fastafile)) + method + maskfile + ".chimeras.tempHeader"; openOutputFile(tempHeader, outHeader); @@ -348,7 +357,6 @@ int ChimeraSeqsCommand::execute(){ chimera->printHeader(outHeader); outHeader.close(); - string outputFileName = outputDir + getRootName(getSimpleName(fastafile)) + method + maskfile + ".chimeras"; //break up file #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) @@ -360,7 +368,7 @@ int ChimeraSeqsCommand::execute(){ lines.push_back(new linePair(0, numSeqs)); - driver(lines[0], outputFileName, fastafile); + driver(lines[0], outputFileName, fastafile, accnosFileName); }else{ vector positions; @@ -391,14 +399,19 @@ int ChimeraSeqsCommand::execute(){ } - createProcesses(outputFileName, fastafile); - + createProcesses(outputFileName, fastafile, accnosFileName); + rename((outputFileName + toString(processIDS[0]) + ".temp").c_str(), outputFileName.c_str()); - + rename((accnosFileName + toString(processIDS[0]) + ".temp").c_str(), accnosFileName.c_str()); + //append alignment and report files for(int i=1;igetName() != "") { //incase there is a commented sequence at the end of a file - - //find chimeras - chimera->getChimeras(candidateSeq); + + if ((candidateSeq->getAligned().length() != templateSeqsLength) && (method != "chimeracheck")) { //chimeracheck does not require seqs to be aligned + mothurOut(candidateSeq->getName() + " is not the same length as the template sequences. Skipping."); mothurOutEndLine(); + }else{ + //find chimeras + chimera->getChimeras(candidateSeq); - //print results - chimera->print(out); + //print results + chimera->print(out, out2); + } } delete candidateSeq; @@ -467,6 +489,7 @@ int ChimeraSeqsCommand::driver(linePair* line, string outputFName, string filena if((line->numSeqs) % 100 != 0){ mothurOut("Processing sequence: " + toString(line->numSeqs)); mothurOutEndLine(); } out.close(); + out2.close(); inFASTA.close(); return 1; @@ -479,7 +502,7 @@ int ChimeraSeqsCommand::driver(linePair* line, string outputFName, string filena /**************************************************************************************************/ -void ChimeraSeqsCommand::createProcesses(string outputFileName, string filename) { +void ChimeraSeqsCommand::createProcesses(string outputFileName, string filename, string accnos) { try { #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) int process = 0; @@ -493,7 +516,7 @@ void ChimeraSeqsCommand::createProcesses(string outputFileName, string filename) processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later process++; }else if (pid == 0){ - driver(lines[process], outputFileName + toString(getpid()) + ".temp", filename); + driver(lines[process], outputFileName + toString(getpid()) + ".temp", filename, accnos + toString(getpid()) + ".temp"); exit(0); }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); } } @@ -520,7 +543,7 @@ void ChimeraSeqsCommand::appendOutputFiles(string temp, string filename) { ifstream input; openOutputFileAppend(temp, output); - openInputFile(filename, input); + openInputFile(filename, input, "noerror"); while(char c = input.get()){ if(input.eof()) { break; } diff --git a/chimeraseqscommand.h b/chimeraseqscommand.h index 3e437ec..56477a1 100644 --- a/chimeraseqscommand.h +++ b/chimeraseqscommand.h @@ -35,14 +35,14 @@ private: vector processIDS; //processid vector lines; - int driver(linePair*, string, string); - void createProcesses(string, string); + int driver(linePair*, string, string, string); + void createProcesses(string, string, string); void appendOutputFiles(string, string); bool abort; string method, fastafile, templatefile, consfile, quanfile, maskfile, namefile, outputDir, search; bool filter, correction, svg, printAll, realign; - int processors, midpoint, averageLeft, averageRight, window, iters, increment, numwanted, ksize, match, mismatch, parents, minSimilarity, minCoverage, minBS, minSNP, numSeqs; + int processors, midpoint, averageLeft, averageRight, window, iters, increment, numwanted, ksize, match, mismatch, parents, minSimilarity, minCoverage, minBS, minSNP, numSeqs, templateSeqsLength; float divR; Chimera* chimera; diff --git a/chimeraslayer.cpp b/chimeraslayer.cpp index 7690328..6ca8c29 100644 --- a/chimeraslayer.cpp +++ b/chimeraslayer.cpp @@ -118,7 +118,7 @@ void ChimeraSlayer::printHeader(ostream& out) { out << "Name\tLeftParent\tRightParent\tDivQLAQRB\tPerIDQLAQRB\tBootStrapA\tDivQLBQRA\tPerIDQLBQRA\tBootStrapB\tFlag\tLeftWindow\tRightWindow\n"; } //*************************************************************************************************************** -void ChimeraSlayer::print(ostream& out) { +void ChimeraSlayer::print(ostream& out, ostream& outAcc) { try { if (chimeraFlags == "yes") { string chimeraFlag = "no"; @@ -130,6 +130,7 @@ void ChimeraSlayer::print(ostream& out) { if (chimeraFlag == "yes") { if ((chimeraResults[0].bsa >= minBS) || (chimeraResults[0].bsb >= minBS)) { mothurOut(querySeq->getName() + "\tyes"); mothurOutEndLine(); + outAcc << querySeq->getName() << endl; } } diff --git a/chimeraslayer.h b/chimeraslayer.h index bf9cf7f..14a22be 100644 --- a/chimeraslayer.h +++ b/chimeraslayer.h @@ -27,7 +27,7 @@ class ChimeraSlayer : public Chimera { ~ChimeraSlayer(); int getChimeras(Sequence*); - void print(ostream&); + void print(ostream&, ostream&); void printHeader(ostream&); void doPrep(); diff --git a/cluster.cpp b/cluster.cpp index 931b44d..4009ed0 100644 --- a/cluster.cpp +++ b/cluster.cpp @@ -15,7 +15,7 @@ /***********************************************************************/ Cluster::Cluster(RAbundVector* rav, ListVector* lv, SparseMatrix* dm, float c) : -rabund(rav), list(lv), dMatrix(dm), cutoff(c) +rabund(rav), list(lv), dMatrix(dm) { /* cout << "sizeof(MatData): " << sizeof(MatData) << endl; @@ -56,6 +56,9 @@ rabund(rav), list(lv), dMatrix(dm), cutoff(c) seqVec[currentCell->column].push_back(currentCell); } mapWanted = false; //set to true by mgcluster to speed up overlap merge + + //save so you can modify as it changes in average neighbor + cutoff = c; } /***********************************************************************/ @@ -170,11 +173,12 @@ void Cluster::clusterNames(){ //This function clusters based on the method of the derived class //At the moment only average and complete linkage are covered, because //single linkage uses a different approach. -void Cluster::update(){ +void Cluster::update(double& cutOFF){ try { getRowColCells(); - vector found(nColCells, 0); + vector foundCol(nColCells, 0); + int search; bool changed; @@ -187,11 +191,13 @@ void Cluster::update(){ } else { search = rowCells[i]->row; } - + + bool merged = false; for (int j=0;jrow == smallRow) && (colCells[j]->column == smallCol))) { + if (!((colCells[j]->row == smallRow) && (colCells[j]->column == smallCol))) { //if you are not hte smallest distance if (colCells[j]->row == search || colCells[j]->column == search) { - found[j] = 1; + foundCol[j] = 1; + merged = true; changed = updateDistance(colCells[j], rowCells[i]); // If the cell's distance changed and it had the same distance as // the smallest distance, invalidate the mins vector in SparseMatrix @@ -203,9 +209,16 @@ void Cluster::update(){ } break; } - } + } } - removeCell(rowCells[i], i , -1); + //if not merged it you need it for warning + if (!merged) { + mothurOut("Warning: trying to merge cell " + toString(rowCells[i]->row+1) + " " + toString(rowCells[i]->column+1) + " distance " + toString(rowCells[i]->dist) + " with value above cutoff. Results may vary from using cutoff at cluster command instead of read.dist."); mothurOutEndLine(); + if (cutOFF > rowCells[i]->dist) { cutOFF = rowCells[i]->dist; mothurOut("changing cutoff to " + toString(cutOFF)); mothurOutEndLine(); } + + } + removeCell(rowCells[i], i , -1); + } } clusterBins(); @@ -214,12 +227,12 @@ void Cluster::update(){ // Special handling for singlelinkage case, not sure whether this // could be avoided for (int i=nColCells-1;i>=0;i--) { - if (found[i] == 0) { - removeCell(colCells[i], -1, i); -cout << "smallRow = " << smallRow+1 << " smallCol = " << smallCol+1 << endl; + if (foundCol[i] == 0) { if (!((colCells[i]->row == smallRow) && (colCells[i]->column == smallCol))) { - mothurOut("Warning: merging " + toString(colCells[i]->row+1) + " " + toString(colCells[i]->column+1) + " distance " + toString(colCells[i]->dist) + " value above cutoff. Results will differ from those if cutoff was used in the read.dist command."); mothurOutEndLine(); + mothurOut("Warning: merging cell " + toString(colCells[i]->row+1) + " " + toString(colCells[i]->column+1) + " distance " + toString(colCells[i]->dist) + " value above cutoff. Results may vary from using cutoff at cluster command instead of read.dist."); mothurOutEndLine(); + if (cutOFF > colCells[i]->dist) { cutOFF = colCells[i]->dist; mothurOut("changing cutoff to " + toString(cutOFF)); mothurOutEndLine(); } } + removeCell(colCells[i], -1, i); } } } diff --git a/cluster.hpp b/cluster.hpp index 38897c9..685c843 100644 --- a/cluster.hpp +++ b/cluster.hpp @@ -14,7 +14,7 @@ class Cluster { public: Cluster(RAbundVector*, ListVector*, SparseMatrix*, float); - virtual void update(); + virtual void update(double&); virtual string getTag() = 0; virtual void setMapWanted(bool m); virtual map getSeqtoBin() { return seq2Bin; } diff --git a/clustercommand.cpp b/clustercommand.cpp index 9d09d2c..be7cc51 100644 --- a/clustercommand.cpp +++ b/clustercommand.cpp @@ -158,7 +158,7 @@ int ClusterCommand::execute(){ loops++; - cluster->update(); + cluster->update(cutoff); float dist = matrix->getSmallDist(); float rndDist = roundDist(dist, precision); diff --git a/distancecommand.cpp b/distancecommand.cpp index 5364cbf..c615b80 100644 --- a/distancecommand.cpp +++ b/distancecommand.cpp @@ -26,7 +26,7 @@ DistanceCommand::DistanceCommand(string option){ else { //valid paramters for this command - string Array[] = {"fasta", "phylip", "calc", "countends", "cutoff", "processors", "outputdir","inputdir"}; + string Array[] = {"fasta", "output", "calc", "countends", "cutoff", "processors", "outputdir","inputdir"}; vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); OptionParser parser(option); @@ -90,8 +90,9 @@ DistanceCommand::DistanceCommand(string option){ temp = validParameter.validFile(parameters, "processors", false); if(temp == "not found"){ temp = "1"; } convert(temp, processors); - phylip = validParameter.validFile(parameters, "phylip", false); if(phylip == "not found"){ phylip = "F"; } - + output = validParameter.validFile(parameters, "output", false); if(output == "not found"){ output = "column"; } + + if ((output != "column") && (output != "lt") && (output != "square")) { mothurOut(output + " is not a valid output form. Options are column, lt and square. I will use column."); mothurOutEndLine(); output = "column"; } ValidCalculators validCalculator; @@ -137,11 +138,12 @@ DistanceCommand::~DistanceCommand(){ void DistanceCommand::help(){ try { mothurOut("The dist.seqs command reads a file containing sequences and creates a distance file.\n"); - mothurOut("The dist.seqs command parameters are fasta, calc, countends, cutoff and processors. \n"); + mothurOut("The dist.seqs command parameters are fasta, calc, countends, output, cutoff and processors. \n"); mothurOut("The fasta parameter is required.\n"); mothurOut("The calc parameter allows you to specify the method of calculating the distances. Your options are: nogaps, onegap or eachgap. The default is onegap.\n"); mothurOut("The countends parameter allows you to specify whether to include terminal gaps in distance. Your options are: T or F. The default is T.\n"); mothurOut("The cutoff parameter allows you to specify maximum distance to keep. The default is 1.0.\n"); + mothurOut("The output parameter allows you to specify format of your distance matrix. Options are column, lt, and square. The default is column.\n"); mothurOut("The processors parameter allows you to specify number of processors to use. The default is 1.\n"); mothurOut("The dist.seqs command should be in the following format: \n"); mothurOut("dist.seqs(fasta=yourFastaFile, calc=yourCalc, countends=yourEnds, cutoff= yourCutOff, processors=yourProcessors) \n"); @@ -165,15 +167,17 @@ int DistanceCommand::execute(){ string outputFile; - //doses the user want the phylip formatted file as well - if (isTrue(phylip) == true) { + if (output == "lt") { //does the user want lower triangle phylip formatted file outputFile = outputDir + getRootName(getSimpleName(fastafile)) + "phylip.dist"; remove(outputFile.c_str()); //output numSeqs to phylip formatted dist file - }else { //user wants column format + }else if (output == "column") { //user wants column format outputFile = outputDir + getRootName(getSimpleName(fastafile)) + "dist"; remove(outputFile.c_str()); + }else { //assume square + outputFile = outputDir + getRootName(getSimpleName(fastafile)) + "square.dist"; + remove(outputFile.c_str()); } #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) @@ -205,6 +209,8 @@ int DistanceCommand::execute(){ driver(0, numSeqs, outputFile, cutoff); #endif + if (output == "square") { convertMatrix(outputFile); } + delete distCalculator; return 0; @@ -260,10 +266,10 @@ int DistanceCommand::driver(int startLine, int endLine, string dFileName, float outFile.setf(ios::fixed, ios::showpoint); outFile << setprecision(4); - if(isTrue(phylip) && startLine == 0){ outFile << alignDB.getNumSeqs() << endl; } + if((output == "lt") && startLine == 0){ outFile << alignDB.getNumSeqs() << endl; } for(int i=startLine;igetDist(); if(dist <= cutoff){ - if (!isTrue(phylip)) { outFile << alignDB.get(i).getName() << ' ' << alignDB.get(j).getName() << ' ' << dist << endl; } + if (output == "column") { outFile << alignDB.get(i).getName() << ' ' << alignDB.get(j).getName() << ' ' << dist << endl; } } - if (isTrue(phylip)) { outFile << dist << '\t'; } + if (output == "lt") { outFile << dist << '\t'; } + if (output == "square") { //make a square column you can convert to square phylip + outFile << alignDB.get(i).getName() << '\t' << alignDB.get(j).getName() << '\t' << dist << endl; + outFile << alignDB.get(j).getName() << '\t' << alignDB.get(i).getName() << '\t' << dist << endl; + } + } - if (isTrue(phylip) == true) { outFile << endl; } + if (output == "lt") { outFile << endl; } if(i % 100 == 0){ mothurOut(toString(i) + "\t" + toString(time(NULL) - startTime)); mothurOutEndLine(); @@ -299,7 +310,90 @@ int DistanceCommand::driver(int startLine, int endLine, string dFileName, float exit(1); } } +/**************************************************************************************************/ +void DistanceCommand::convertMatrix(string outputFile) { + try{ + //sort file by first column so the distances for each row are together + string outfile = getRootName(outputFile) + "sorted.dist.temp"; + + //use the unix sort + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + string command = "sort -n " + outputFile + " -o " + outfile; + system(command.c_str()); + #else //sort using windows sort + string command = "sort " + outputFile + " /O " + outfile; + system(command.c_str()); + #endif + + + //output to new file distance for each row and save positions in file where new row begins + ifstream in; + openInputFile(outfile, in); + + ofstream out; + openOutputFile(outputFile, out); + + out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint); + + out << alignDB.getNumSeqs() << endl; + + //get first currentRow + string first, currentRow, second; + float dist; + map rowDists; //take advantage of the fact that maps are already sorted by key + map::iterator it; + + in >> first; + currentRow = first; + + rowDists[first] = 0.00; //distance to yourself is 0.0 + + in.seekg(0); + //openInputFile(outfile, in); + + while(!in.eof()) { + in >> first >> second >> dist; gobble(in); + + if (first != currentRow) { + //print out last row + out << currentRow << '\t'; //print name + + //print dists + for (it = rowDists.begin(); it != rowDists.end(); it++) { + out << it->second << '\t'; + } + out << endl; + + //start new row + currentRow = first; + rowDists.clear(); + rowDists[first] = 0.00; + rowDists[second] = dist; + }else{ + rowDists[second] = dist; + } + } + //print out last row + out << currentRow << '\t'; //print name + + //print dists + for (it = rowDists.begin(); it != rowDists.end(); it++) { + out << it->second << '\t'; + } + out << endl; + + in.close(); + out.close(); + + remove(outfile.c_str()); + + } + catch(exception& e) { + errorOut(e, "DistanceCommand", "convertMatrix"); + exit(1); + } +} /************************************************************************************************** void DistanceCommand::appendFiles(string temp, string filename) { try{ @@ -324,3 +418,5 @@ void DistanceCommand::appendFiles(string temp, string filename) { } } /**************************************************************************************************/ + + diff --git a/distancecommand.h b/distancecommand.h index c7aaf46..7521ac7 100644 --- a/distancecommand.h +++ b/distancecommand.h @@ -34,7 +34,7 @@ private: Dist* distCalculator; SequenceDB alignDB; - string countends, phylip, fastafile, calc, outputDir; + string countends, output, fastafile, calc, outputDir; int processors; float cutoff; map processIDS; //end line, processid @@ -46,6 +46,7 @@ private: //void appendFiles(string, string); void createProcesses(string); int driver(/*Dist*, SequenceDB, */int, int, string, float); + void convertMatrix(string); }; diff --git a/matrixoutputcommand.cpp b/matrixoutputcommand.cpp index fb74875..87e6489 100644 --- a/matrixoutputcommand.cpp +++ b/matrixoutputcommand.cpp @@ -36,7 +36,7 @@ MatrixOutputCommand::MatrixOutputCommand(string option){ else { //valid paramters for this command - string Array[] = {"label","calc","groups","outputdir","inputdir"}; + string Array[] = {"label","calc","groups","outputdir","inputdir", "output"}; vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); OptionParser parser(option); @@ -70,6 +70,9 @@ MatrixOutputCommand::MatrixOutputCommand(string option){ else { allLines = 1; } } + output = validParameter.validFile(parameters, "output", false); if(output == "not found"){ output = "lt"; } + if ((output != "lt") && (output != "square")) { mothurOut(output + " is not a valid output form. Options are lt and square. I will use lt."); mothurOutEndLine(); output = "lt"; } + //if the user has not specified any labels use the ones from read.otu if (label == "") { allLines = globaldata->allLines; @@ -136,10 +139,11 @@ MatrixOutputCommand::MatrixOutputCommand(string option){ void MatrixOutputCommand::help(){ try { mothurOut("The dist.shared command can only be executed after a successful read.otu command.\n"); - mothurOut("The dist.shared command parameters are groups, calc and label. No parameters are required.\n"); + mothurOut("The dist.shared command parameters are groups, calc, output and label. No parameters are required.\n"); mothurOut("The groups parameter allows you to specify which of the groups in your groupfile you would like included used.\n"); mothurOut("The group names are separated by dashes. The label parameter allows you to select what distance levels you would like distance matrices created for, and is also separated by dashes.\n"); mothurOut("The dist.shared command should be in the following format: dist.shared(groups=yourGroups, calc=yourCalcs, label=yourLabels).\n"); + mothurOut("The output parameter allows you to specify format of your distance matrix. Options are lt, and square. The default is lt.\n"); mothurOut("Example dist.shared(groups=A-B-C, calc=jabund-sorabund).\n"); mothurOut("The default value for groups is all the groups in your groupfile.\n"); mothurOut("The default value for calc is jclass and thetayc.\n"); @@ -262,17 +266,28 @@ int MatrixOutputCommand::execute(){ void MatrixOutputCommand::printSims(ostream& out) { try { - //output column headers + out.setf(ios::fixed, ios::floatfield); out.setf(ios::showpoint); + + //output num seqs out << simMatrix.size() << endl; - for (int m = 0; m < simMatrix.size(); m++) { - out << lookup[m]->getGroup() << '\t'; - for (int n = 0; n < m; n++) { - out << simMatrix[m][n] << '\t'; + if (output == "lt") { + for (int m = 0; m < simMatrix.size(); m++) { + out << lookup[m]->getGroup() << '\t'; + for (int n = 0; n < m; n++) { + out << simMatrix[m][n] << '\t'; + } + out << endl; + } + }else{ + for (int m = 0; m < simMatrix.size(); m++) { + out << lookup[m]->getGroup() << '\t'; + for (int n = 0; n < simMatrix[m].size(); n++) { + out << simMatrix[m][n] << '\t'; + } + out << endl; } - out << endl; } - } catch(exception& e) { errorOut(e, "MatrixOutputCommand", "printSims"); @@ -315,7 +330,7 @@ void MatrixOutputCommand::process(vector thisLookup){ } } - exportFileName = outputDir + getRootName(getSimpleName(globaldata->inputFileName)) + matrixCalculators[i]->getName() + "." + thisLookup[0]->getLabel() + ".dist"; + exportFileName = outputDir + getRootName(getSimpleName(globaldata->inputFileName)) + matrixCalculators[i]->getName() + "." + thisLookup[0]->getLabel() + "." + output + ".dist"; openOutputFile(exportFileName, out); printSims(out); out.close(); diff --git a/matrixoutputcommand.h b/matrixoutputcommand.h index 6c1ce89..ff7519e 100644 --- a/matrixoutputcommand.h +++ b/matrixoutputcommand.h @@ -42,7 +42,7 @@ private: InputData* input; ValidCalculators* validCalculator; vector lookup; - string exportFileName; + string exportFileName, output; int numGroups; ofstream out; diff --git a/mgclustercommand.cpp b/mgclustercommand.cpp index 07c5147..fba2959 100644 --- a/mgclustercommand.cpp +++ b/mgclustercommand.cpp @@ -191,7 +191,7 @@ int MGClusterCommand::execute(){ //cluster using cluster classes while (distMatrix->getSmallDist() < cutoff && distMatrix->getNNodes() > 0){ - cluster->update(); + cluster->update(cutoff); float dist = distMatrix->getSmallDist(); float rndDist = roundDist(dist, precision); diff --git a/mgclustercommand.h b/mgclustercommand.h index 89b72fc..a1dac22 100644 --- a/mgclustercommand.h +++ b/mgclustercommand.h @@ -40,7 +40,8 @@ private: string blastfile, method, namefile, overlapFile, distFile, outputDir; ofstream sabundFile, rabundFile, listFile; - float cutoff, penalty; + double cutoff; + float penalty; int precision, length, precisionLength; bool abort, minWanted, hclusterWanted, merge; diff --git a/pintail.cpp b/pintail.cpp index 46da913..63a9014 100644 --- a/pintail.cpp +++ b/pintail.cpp @@ -80,15 +80,9 @@ void Pintail::doPrep() { bool reRead = false; //create filter if needed for later if (filter) { - reRead = true; - - if (seqMask != "") { - //mask templates - for (int i = 0; i < templateSeqs.size(); i++) { - decalc->runMask(templateSeqs[i]); - } - } +cout << "filter" << endl; + //read in all query seqs ifstream in; openInputFile(fastafile, in); @@ -106,11 +100,20 @@ void Pintail::doPrep() { //merge query seqs and template seqs temp = templateSeqs; for (int i = 0; i < tempQuerySeqs.size(); i++) { temp.push_back(tempQuerySeqs[i]); } - +cout << temp.size() << endl; + if (seqMask != "") { + reRead = true; + cout << "masked " << seqMask << endl; + //mask templates + for (int i = 0; i < temp.size(); i++) { + decalc->runMask(temp[i]); + } + } + mergedFilterString = createFilter(temp, 0.5); - + cout << "here" << endl; //reread template seqs - for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i]; } + //for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i]; } } @@ -128,6 +131,13 @@ void Pintail::doPrep() { } } + if (filter) { + reRead = true; + for (int i = 0; i < templateSeqs.size(); i++) { + runFilter(templateSeqs[i]); + } + } + mothurOut("Calculating quantiles for your template. This can take a while... I will output the quantiles to a .quan file that you can input them using the quantiles parameter next time you run this command. Providing the .quan file will dramatically improve speed. "); cout.flush(); if (processors == 1) { quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size()); @@ -139,19 +149,20 @@ void Pintail::doPrep() { if ((!filter) && (seqMask == "")) { noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.quan"; - }else if ((filter) && (seqMask == "")) { - noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.filtered.quan"; }else if ((!filter) && (seqMask != "")) { noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.masked.quan"; }else if ((filter) && (seqMask != "")) { - noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.filtered.masked.quan"; + noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.filtered." + getSimpleName(getRootName(fastafile)) + "masked.quan"; + }else if ((filter) && (seqMask == "")) { + noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.filtered." + getSimpleName(getRootName(fastafile)) + "quan"; } + + decalc->removeObviousOutliers(quantilesMembers, templateSeqs.size()); - openOutputFile(noOutliers, out5); - + openOutputFile(noOutliers, out5); //adjust quantiles for (int i = 0; i < quantilesMembers.size(); i++) { vector temp; @@ -209,7 +220,7 @@ void Pintail::doPrep() { } } //*************************************************************************************************************** -void Pintail::print(ostream& out) { +void Pintail::print(ostream& out, ostream& outAcc) { try { int index = ceil(deviation); @@ -227,6 +238,7 @@ void Pintail::print(ostream& out) { out << querySeq->getName() << '\t' << "div: " << deviation << "\tstDev: " << DE << "\tchimera flag: " << chimera << endl; if (chimera == "Yes") { mothurOut(querySeq->getName() + "\tdiv: " + toString(deviation) + "\tstDev: " + toString(DE) + "\tchimera flag: " + chimera); mothurOutEndLine(); + outAcc << querySeq->getName() << endl; } out << "Observed\t"; diff --git a/pintail.h b/pintail.h index efb8ba6..2364c86 100644 --- a/pintail.h +++ b/pintail.h @@ -28,7 +28,7 @@ class Pintail : public Chimera { ~Pintail(); int getChimeras(Sequence*); - void print(ostream&); + void print(ostream&, ostream&); void setCons(string c) { consfile = c; } void setQuantiles(string q) { quanfile = q; } @@ -42,7 +42,7 @@ class Pintail : public Chimera { string fastafile, consfile; vector templateLines; - Sequence*querySeq; + Sequence* querySeq; Sequence* bestfit; //closest match to query in template