From: westcott
Date: Tue, 12 Jan 2010 19:00:34 +0000 (+0000)
Subject: Worked on hcluster. Made the .single command run using a shared file, and various other...
X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=92f998cc7debc4bf3e8594848586b8153d96db16

Worked on hcluster. Made the .single command run using a shared file, and made various other small changes.
---

diff --git a/aligncommand.cpp b/aligncommand.cpp
index f0f7243..3b3afc1 100644
--- a/aligncommand.cpp
+++ b/aligncommand.cpp
@@ -61,13 +61,29 @@ AlignCommand::AlignCommand(string option){
 			}
 			else if (templateFileName == "not open") { abort = true; }
-			candidateFileName = validParameter.validFile(parameters, "candidate", true);
-			if (candidateFileName == "not found") {
-				mothurOut("candidate is a required parameter for the align.seqs command.");
-				mothurOutEndLine();
-				abort = true;
+			candidateFileName = validParameter.validFile(parameters, "candidate", false);
+			if (candidateFileName == "not found") { mothurOut("candidate is a required parameter for the align.seqs command."); mothurOutEndLine(); abort = true; }
+			else {
+				splitAtDash(candidateFileName, candidateFileNames);
+
+				//go through files and make sure they are good, if not, then disregard them
+				for (int i = 0; i < candidateFileNames.size(); i++) {
+					int ableToOpen;
+					ifstream in;
+					ableToOpen = openInputFile(candidateFileNames[i], in);
+					if (ableToOpen == 1) {
+						mothurOut(candidateFileNames[i] + " will be disregarded."); mothurOutEndLine();
+						//erase from file list
+						candidateFileNames.erase(candidateFileNames.begin()+i);
+						i--;
+					}
+					in.close();
+				}
+
+				//make sure there is at least one valid file left
+				if (candidateFileNames.size() == 0) { mothurOut("no valid files."); mothurOutEndLine(); abort = true; }
 			}
-			else if (candidateFileName == "not open") { abort = true; }
+
 			//check for optional parameter and set defaults
 			// ...at some point should added some additional type checking...
@@ -125,7 +141,7 @@ void AlignCommand::help(){
 	try {
 		mothurOut("The align.seqs command reads a file containing sequences and creates an alignment file and a report file.\n");
 		mothurOut("The align.seqs command parameters are template, candidate, search, ksize, align, match, mismatch, gapopen and gapextend.\n");
-		mothurOut("The template and candidate parameters are required.\n");
+		mothurOut("The template and candidate parameters are required. You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amzon.fasta \n");
 		mothurOut("The search parameter allows you to specify the method to find most similar template. Your options are: suffix, kmer and blast. The default is kmer.\n");
 		mothurOut("The align parameter allows you to specify the alignment method to use. Your options are: gotoh, needleman, blast and noalign. The default is needleman.\n");
 		mothurOut("The ksize parameter allows you to specify the kmer size for finding most similar template to candidate. 
The default is 8.\n"); @@ -167,25 +183,112 @@ int AlignCommand::execute(){ mothurOutEndLine(); alignment = new NeedlemanOverlap(gapOpen, match, misMatch, longestBase); } - mothurOut("Aligning sequences..."); - mothurOutEndLine(); - - string alignFileName = candidateFileName.substr(0,candidateFileName.find_last_of(".")+1) + "align"; - string reportFileName = candidateFileName.substr(0,candidateFileName.find_last_of(".")+1) + "align.report"; - string accnosFileName = candidateFileName.substr(0,candidateFileName.find_last_of(".")+1) + "flip.accnos"; - - int numFastaSeqs = 0; - int start = time(NULL); + for (int s = 0; s < candidateFileNames.size(); s++) { + mothurOut("Aligning sequences from " + candidateFileNames[s] + " ..." ); mothurOutEndLine(); + + string alignFileName = candidateFileNames[s].substr(0,candidateFileNames[s].find_last_of(".")+1) + "align"; + string reportFileName = candidateFileNames[s].substr(0,candidateFileNames[s].find_last_of(".")+1) + "align.report"; + string accnosFileName = candidateFileNames[s].substr(0,candidateFileNames[s].find_last_of(".")+1) + "flip.accnos"; + + int numFastaSeqs = 0; + for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear(); + int start = time(NULL); + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - if(processors == 1){ + if(processors == 1){ + ifstream inFASTA; + openInputFile(candidateFileNames[s], inFASTA); + numFastaSeqs=count(istreambuf_iterator(inFASTA),istreambuf_iterator(), '>'); + inFASTA.close(); + + lines.push_back(new linePair(0, numFastaSeqs)); + + driver(lines[0], alignFileName, reportFileName, accnosFileName, candidateFileNames[s]); + + //delete accnos file if its blank else report to user + if (isBlank(accnosFileName)) { remove(accnosFileName.c_str()); } + else { + mothurOut("Some of you sequences generated alignments that eliminated too many bases, a list is provided in " + accnosFileName + "."); + if (!flip) { + mothurOut(" If you set the flip parameter to true mothur will try aligning the reverse compliment as well."); + }else{ mothurOut(" If the reverse compliment proved to be better it was reported."); } + mothurOutEndLine(); + } + } + else{ + vector positions; + processIDS.resize(0); + + ifstream inFASTA; + openInputFile(candidateFileNames[s], inFASTA); + + string input; + while(!inFASTA.eof()){ + input = getline(inFASTA); + if (input.length() != 0) { + if(input[0] == '>'){ long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); } + } + } + inFASTA.close(); + + numFastaSeqs = positions.size(); + + int numSeqsPerProcessor = numFastaSeqs / processors; + + for (int i = 0; i < processors; i++) { + long int startPos = positions[ i * numSeqsPerProcessor ]; + if(i == processors - 1){ + numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; + } + lines.push_back(new linePair(startPos, numSeqsPerProcessor)); + } + + createProcesses(alignFileName, reportFileName, accnosFileName, candidateFileNames[s]); + + rename((alignFileName + toString(processIDS[0]) + ".temp").c_str(), alignFileName.c_str()); + rename((reportFileName + toString(processIDS[0]) + ".temp").c_str(), reportFileName.c_str()); + + //append alignment and report files + for(int i=1;i nonBlankAccnosFiles; + //delete blank accnos files generated with multiple processes + for(int i=0;i(inFASTA),istreambuf_iterator(), '>'); inFASTA.close(); lines.push_back(new linePair(0, numFastaSeqs)); - + driver(lines[0], alignFileName, reportFileName, accnosFileName); //delete accnos file if its blank else report to user @@ 
-197,100 +300,16 @@ int AlignCommand::execute(){ }else{ mothurOut(" If the reverse compliment proved to be better it was reported."); } mothurOutEndLine(); } - } - else{ - vector positions; - processIDS.resize(0); - - ifstream inFASTA; - openInputFile(candidateFileName, inFASTA); - string input; - while(!inFASTA.eof()){ - input = getline(inFASTA); - if (input.length() != 0) { - if(input[0] == '>'){ long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); } - } - } - inFASTA.close(); - - numFastaSeqs = positions.size(); - - int numSeqsPerProcessor = numFastaSeqs / processors; - - for (int i = 0; i < processors; i++) { - long int startPos = positions[ i * numSeqsPerProcessor ]; - if(i == processors - 1){ - numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; - } - lines.push_back(new linePair(startPos, numSeqsPerProcessor)); - } - - createProcesses(alignFileName, reportFileName, accnosFileName); - - rename((alignFileName + toString(processIDS[0]) + ".temp").c_str(), alignFileName.c_str()); - rename((reportFileName + toString(processIDS[0]) + ".temp").c_str(), reportFileName.c_str()); +#endif - //append alignment and report files - for(int i=1;i nonBlankAccnosFiles; - //delete blank accnos files generated with multiple processes - for(int i=0;i(inFASTA),istreambuf_iterator(), '>'); - inFASTA.close(); - - lines.push_back(new linePair(0, numFastaSeqs)); - - driver(lines[0], alignFileName, reportFileName, accnosFileName); - - //delete accnos file if its blank else report to user - if (isBlank(accnosFileName)) { remove(accnosFileName.c_str()); } - else { - mothurOut("Some of you sequences generated alignments that eliminated too many bases, a list is provided in " + accnosFileName + "."); - if (!flip) { - mothurOut(" If you set the flip parameter to true mothur will try aligning the reverse compliment as well."); - }else{ mothurOut(" If the reverse compliment proved to be better it was reported."); } + mothurOut("It took " + toString(time(NULL) - start) + " secs to align " + toString(numFastaSeqs) + " sequences."); + mothurOutEndLine(); mothurOutEndLine(); } -#endif - - - - mothurOut("It took " + toString(time(NULL) - start) + " secs to align " + toString(numFastaSeqs) + " sequences."); - mothurOutEndLine(); - mothurOutEndLine(); - return 0; } catch(exception& e) { @@ -301,7 +320,7 @@ int AlignCommand::execute(){ //********************************************************************************************************************** -int AlignCommand::driver(linePair* line, string alignFName, string reportFName, string accnosFName){ +int AlignCommand::driver(linePair* line, string alignFName, string reportFName, string accnosFName, string filename){ try { ofstream alignmentFile; openOutputFile(alignFName, alignmentFile); @@ -312,12 +331,12 @@ int AlignCommand::driver(linePair* line, string alignFName, string reportFName, NastReport report(reportFName); ifstream inFASTA; - openInputFile(candidateFileName, inFASTA); + openInputFile(filename, inFASTA); inFASTA.seekg(line->start); for(int i=0;inumSeqs;i++){ - + Sequence* candidateSeq = new Sequence(inFASTA); gobble(inFASTA); int origNumBases = candidateSeq->getNumBases(); string originalUnaligned = candidateSeq->getUnaligned(); @@ -394,7 +413,6 @@ int AlignCommand::driver(linePair* line, string alignFName, string reportFName, //report progress if((i+1) % 100 == 0){ mothurOut(toString(i+1)); mothurOutEndLine(); } - } //report progress if((line->numSeqs) % 100 != 0){ mothurOut(toString(line->numSeqs)); mothurOutEndLine(); 
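The multi-processor branch above divides the work by scanning the candidate fasta file once, recording the byte offset of every '>' header, and handing each child a start offset plus a sequence count (the linePair objects), with the last processor absorbing the remainder. A minimal standalone sketch of that partitioning idea, assuming a plain fasta file; the Chunk struct and splitFasta name are illustrative stand-ins, not part of mothur:

    #include <fstream>
    #include <string>
    #include <vector>

    struct Chunk { long start; int numSeqs; };   // plays the role of linePair(start, numSeqs)

    // Record where each '>' record begins, then split the records as evenly as
    // possible across "processors" chunks; the last chunk takes the remainder.
    std::vector<Chunk> splitFasta(const std::string& path, int processors) {
        std::ifstream in(path.c_str());
        std::vector<long> positions;
        std::string line;
        while (std::getline(in, line)) {
            if (!line.empty() && line[0] == '>') {
                long pos = (long)in.tellg();
                positions.push_back(pos - (long)line.length() - 1);   // offset of the '>' itself
            }
        }
        std::vector<Chunk> chunks;
        int numSeqs = (int)positions.size();
        if (numSeqs == 0 || processors < 1) { return chunks; }
        int perProc = numSeqs / processors;
        for (int i = 0; i < processors; i++) {
            Chunk c;
            c.start = positions[i * perProc];
            c.numSeqs = (i == processors - 1) ? (numSeqs - i * perProc) : perProc;
            chunks.push_back(c);
        }
        return chunks;
    }

Each child can then seekg() to its chunk's start and read exactly numSeqs sequences, which is what driver() does with line->start and line->numSeqs.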
} @@ -413,7 +431,7 @@ int AlignCommand::driver(linePair* line, string alignFName, string reportFName, /**************************************************************************************************/ -void AlignCommand::createProcesses(string alignFileName, string reportFileName, string accnosFName) { +void AlignCommand::createProcesses(string alignFileName, string reportFileName, string accnosFName, string filename) { try { #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) int process = 0; @@ -427,7 +445,7 @@ void AlignCommand::createProcesses(string alignFileName, string reportFileName, processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later process++; }else if (pid == 0){ - driver(lines[process], alignFileName + toString(getpid()) + ".temp", reportFileName + toString(getpid()) + ".temp", accnosFName + toString(getpid()) + ".temp"); + driver(lines[process], alignFileName + toString(getpid()) + ".temp", reportFileName + toString(getpid()) + ".temp", accnosFName + toString(getpid()) + ".temp", filename); exit(0); }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); } } diff --git a/aligncommand.h b/aligncommand.h index ec0c80f..0a2e7ba 100644 --- a/aligncommand.h +++ b/aligncommand.h @@ -36,14 +36,15 @@ private: AlignmentDB* templateDB; Alignment* alignment; - int driver(linePair*, string, string, string); - void createProcesses(string, string, string); + int driver(linePair*, string, string, string, string); + void createProcesses(string, string, string, string); void appendAlignFiles(string, string); void appendReportFiles(string, string); string candidateFileName, templateFileName, distanceFileName, search, align; float match, misMatch, gapOpen, gapExtend, threshold; int processors, kmerSize; + vector candidateFileNames; bool abort, flip; }; diff --git a/bayesian.cpp b/bayesian.cpp index e59545a..a83f1ed 100644 --- a/bayesian.cpp +++ b/bayesian.cpp @@ -92,7 +92,7 @@ Classify(tfile, tempFile, method, ksize, 0.0, 0.0, 0.0, 0.0), kmerSize(ksize), c mothurOut("It took " + toString(time(NULL) - start) + " seconds get probabilities. "); mothurOutEndLine(); } catch(exception& e) { - errorOut(e, "Bayesian", "getTaxonomy"); + errorOut(e, "Bayesian", "Bayesian"); exit(1); } } diff --git a/bayesian.h b/bayesian.h index 7100bf3..6e35e9f 100644 --- a/bayesian.h +++ b/bayesian.h @@ -22,13 +22,8 @@ public: ~Bayesian() {}; string getTaxonomy(Sequence*); - //map getConfidenceScores() { return taxConfidenceScore; } private: - //map > taxonomyProbability; //probability of a word being in a given taxonomy. - //taxonomyProbability[bacteria;][0] = probabtility that a sequence within bacteria; would contain kmer 0; - //taxonomyProbability[bacteria;][1] = probabtility that a sequence within bacteria; would contain kmer 1... 
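The createProcesses() signature change above goes with the fork pattern shown in that hunk: the parent forks one child per linePair and remembers each pid, each child runs driver() against output names of the form "<root><pid>.temp", and the parent later renames the first child's temp files and appends the rest. A rough sketch of that fork-and-temp-file pattern; runChunks and doWork are made-up names standing in for the mothur calls, and the waitpid loop is an addition for completeness, not something shown in this hunk:

    #include <fstream>
    #include <iostream>
    #include <sstream>
    #include <string>
    #include <vector>
    #include <sys/wait.h>
    #include <unistd.h>

    // Hypothetical per-chunk worker; in mothur this corresponds to driver(lines[p], ...).
    void doWork(int chunk, const std::string& outFile) {
        std::ofstream out(outFile.c_str());
        out << "processed chunk " << chunk << std::endl;   // stand-in for the real alignment/classification work
    }

    void runChunks(int processors, const std::string& outRoot) {
        std::vector<pid_t> pids;
        for (int p = 0; p < processors; p++) {
            pid_t pid = fork();
            if (pid > 0) {
                pids.push_back(pid);                       // parent: remember the child so its temp files can be appended in order later
            } else if (pid == 0) {
                std::ostringstream name;
                name << outRoot << getpid() << ".temp";    // child writes to its own pid-keyed temp file
                doWork(p, name.str());
                _exit(0);
            } else {
                std::cerr << "unable to spawn the necessary processes." << std::endl;
                _exit(0);
            }
        }
        for (size_t i = 0; i < pids.size(); i++) { waitpid(pids[i], NULL, 0); }   // wait before merging temp files
    }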
- vector< vector > wordGenusProb; //vector of maps from genus to probability //wordGenusProb[0][392] = probability that a sequence within genus that's index in the tree is 392 would contain kmer 0; @@ -40,7 +35,6 @@ private: string bootstrapResults(vector, int, int); int getMostProbableTaxonomy(vector); void readProbFile(ifstream&, ifstream&); - //map parseTaxMap(string); }; diff --git a/classify.cpp b/classify.cpp index 9431c6c..c0f8a9c 100644 --- a/classify.cpp +++ b/classify.cpp @@ -14,9 +14,9 @@ #include "blastdb.hpp" /**************************************************************************************************/ - Classify::Classify(string tfile, string tempFile, string method, int kmerSize, float gapOpen, float gapExtend, float match, float misMatch) : taxFile(tfile), templateFile(tempFile) { - try { + try { + readTaxonomy(taxFile); int start = time(NULL); @@ -111,10 +111,6 @@ void Classify::readTaxonomy(string file) { taxonomy[name] = taxInfo; - //itTax = taxList.find(taxInfo); - //if (itTax == taxList.end()) { //this is new taxonomy - //taxList[taxInfo] = 1; - //}else { taxList[taxInfo]++; } phyloTree->addSeqToTree(name, taxInfo); gobble(inTax); diff --git a/classify.h b/classify.h index a897873..5c5a7ac 100644 --- a/classify.h +++ b/classify.h @@ -45,7 +45,6 @@ protected: string taxFile, templateFile, simpleTax; vector names; - //map taxConfidenceScore; void readTaxonomy(string); vector parseTax(string); diff --git a/classifyseqscommand.cpp b/classifyseqscommand.cpp index 768cb0d..5224897 100644 --- a/classifyseqscommand.cpp +++ b/classifyseqscommand.cpp @@ -25,7 +25,7 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option){ else { //valid paramters for this command - string AlignArray[] = {"template","fasta","search","ksize","method","processors","taxonomy","match","mismatch","gapopen","gapextend","numwanted","cutoff","probs","iters"}; + string AlignArray[] = {"template","fasta","name","search","ksize","method","processors","taxonomy","match","mismatch","gapopen","gapextend","numwanted","cutoff","probs","iters"}; vector myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string))); OptionParser parser(option); @@ -47,13 +47,29 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option){ } else if (templateFileName == "not open") { abort = true; } - fastaFileName = validParameter.validFile(parameters, "fasta", true); - if (fastaFileName == "not found") { - mothurOut("fasta is a required parameter for the classify.seqs command."); - mothurOutEndLine(); - abort = true; + fastaFileName = validParameter.validFile(parameters, "fasta", false); + if (fastaFileName == "not found") { mothurOut("fasta is a required parameter for the classify.seqs command."); mothurOutEndLine(); abort = true; } + else { + splitAtDash(fastaFileName, fastaFileNames); + + //go through files and make sure they are good, if not, then disregard them + for (int i = 0; i < fastaFileNames.size(); i++) { + int ableToOpen; + ifstream in; + ableToOpen = openInputFile(fastaFileNames[i], in); + if (ableToOpen == 1) { + mothurOut(fastaFileNames[i] + " will be disregarded."); mothurOutEndLine(); + //erase from file list + fastaFileNames.erase(fastaFileNames.begin()+i); + i--; + } + in.close(); + } + + //make sure there is at least one valid file left + if (fastaFileNames.size() == 0) { mothurOut("no valid files."); mothurOutEndLine(); abort = true; } } - else if (fastaFileName == "not open") { abort = true; } + taxonomyFileName = validParameter.validFile(parameters, "taxonomy", true); if (taxonomyFileName == 
"not found") { @@ -62,7 +78,26 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option){ abort = true; } else if (taxonomyFileName == "not open") { abort = true; } + + + namefile = validParameter.validFile(parameters, "name", false); + if (fastaFileName == "not found") { namefile = ""; } + else { + splitAtDash(namefile, namefileNames); + + //go through files and make sure they are good, if not, then disregard them + for (int i = 0; i < namefileNames.size(); i++) { + int ableToOpen; + ifstream in; + ableToOpen = openInputFile(namefileNames[i], in); + if (ableToOpen == 1) { mothurOut("Unable to match name file with fasta file."); mothurOutEndLine(); abort = true; } + in.close(); + } + } + if (namefile != "") { + if (namefileNames.size() != fastaFileNames.size()) { abort = true; mothurOut("If you provide a name file, you must have one for each fasta file."); mothurOutEndLine(); } + } //check for optional parameter and set defaults // ...at some point should added some additional type checking... @@ -131,7 +166,7 @@ void ClassifySeqsCommand::help(){ try { mothurOut("The classify.seqs command reads a fasta file containing sequences and creates a .taxonomy file and a .tax.summary file.\n"); mothurOut("The classify.seqs command parameters are template, fasta, search, ksize, method, taxonomy, processors, match, mismatch, gapopen, gapextend, numwanted and probs.\n"); - mothurOut("The template, fasta and taxonomy parameters are required.\n"); + mothurOut("The template, fasta and taxonomy parameters are required. You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amzon.fasta \n"); mothurOut("The search parameter allows you to specify the method to find most similar template. Your options are: suffix, kmer and blast. The default is kmer.\n"); mothurOut("The method parameter allows you to specify classification method to use. Your options are: bayesian and knn. The default is bayesian.\n"); mothurOut("The ksize parameter allows you to specify the kmer size for finding most similar template to candidate. The default is 8.\n"); @@ -172,117 +207,201 @@ int ClassifySeqsCommand::execute(){ classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters); } - int numFastaSeqs = 0; + + for (int s = 0; s < fastaFileNames.size(); s++) { - string newTaxonomyFile = getRootName(fastaFileName) + getRootName(taxonomyFileName) + "taxonomy"; - string tempTaxonomyFile = getRootName(fastaFileName) + "taxonomy.temp"; - string taxSummary = getRootName(fastaFileName) + getRootName(taxonomyFileName) + "tax.summary"; + //read namefile + if(namefile != "") { + nameMap.clear(); //remove old names + + ifstream inNames; + openInputFile(namefileNames[s], inNames); + + string firstCol, secondCol; + while(!inNames.eof()) { + inNames >> firstCol >> secondCol; gobble(inNames); + nameMap[firstCol] = getNumNames(secondCol); //ex. seq1 seq1,seq3,seq5 -> seq1 = 3. + } + inNames.close(); + } - int start = time(NULL); + mothurOut("Classifying sequences from " + fastaFileNames[s] + " ..." 
); mothurOutEndLine(); + string newTaxonomyFile = getRootName(fastaFileNames[s]) + getRootName(taxonomyFileName) + "taxonomy"; + string tempTaxonomyFile = getRootName(fastaFileNames[s]) + "taxonomy.temp"; + string taxSummary = getRootName(fastaFileNames[s]) + getRootName(taxonomyFileName) + "tax.summary"; + + int start = time(NULL); + int numFastaSeqs = 0; + for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear(); + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - if(processors == 1){ + if(processors == 1){ + ifstream inFASTA; + openInputFile(fastaFileNames[s], inFASTA); + numFastaSeqs=count(istreambuf_iterator(inFASTA),istreambuf_iterator(), '>'); + inFASTA.close(); + + lines.push_back(new linePair(0, numFastaSeqs)); + + driver(lines[0], newTaxonomyFile, tempTaxonomyFile, fastaFileNames[s]); + } + else{ + vector positions; + processIDS.resize(0); + + ifstream inFASTA; + openInputFile(fastaFileNames[s], inFASTA); + + string input; + while(!inFASTA.eof()){ + input = getline(inFASTA); + if (input.length() != 0) { + if(input[0] == '>'){ int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); } + } + } + inFASTA.close(); + + numFastaSeqs = positions.size(); + + int numSeqsPerProcessor = numFastaSeqs / processors; + + for (int i = 0; i < processors; i++) { + int startPos = positions[ i * numSeqsPerProcessor ]; + if(i == processors - 1){ + numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; + } + lines.push_back(new linePair(startPos, numSeqsPerProcessor)); + } + createProcesses(newTaxonomyFile, tempTaxonomyFile, fastaFileNames[s]); + + rename((newTaxonomyFile + toString(processIDS[0]) + ".temp").c_str(), newTaxonomyFile.c_str()); + rename((tempTaxonomyFile + toString(processIDS[0]) + ".temp").c_str(), tempTaxonomyFile.c_str()); + + for(int i=1;i(inFASTA),istreambuf_iterator(), '>'); inFASTA.close(); lines.push_back(new linePair(0, numFastaSeqs)); - - driver(lines[0], newTaxonomyFile, tempTaxonomyFile); - } - else{ - vector positions; - processIDS.resize(0); - ifstream inFASTA; - openInputFile(fastaFileName, inFASTA); + driver(lines[0], newTaxonomyFile, tempTaxonomyFile, fastaFileNames[s]); +#endif + //make taxonomy tree from new taxonomy file + PhyloTree taxaBrowser; - string input; - while(!inFASTA.eof()){ - input = getline(inFASTA); - if (input.length() != 0) { - if(input[0] == '>'){ int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); } - } + ifstream in; + openInputFile(tempTaxonomyFile, in); + + //read in users taxonomy file and add sequences to tree + string name, taxon; + while(!in.eof()){ + in >> name >> taxon; gobble(in); + + if (namefile != "") { + itNames = nameMap.find(name); + + if (itNames == nameMap.end()) { + mothurOut(name + " is not in your name file please correct."); mothurOutEndLine(); exit(1); + }else{ + for (int i = 0; i < itNames->second; i++) { + taxaBrowser.addSeqToTree(name+toString(i), taxon); //add it as many times as there are identical seqs + } + } + }else { taxaBrowser.addSeqToTree(name, taxon); } //add it once } - inFASTA.close(); + in.close(); + + taxaBrowser.assignHeirarchyIDs(0); + + taxaBrowser.binUnclassified(); - numFastaSeqs = positions.size(); + remove(tempTaxonomyFile.c_str()); - int numSeqsPerProcessor = numFastaSeqs / processors; + //print summary file + ofstream outTaxTree; + openOutputFile(taxSummary, outTaxTree); + taxaBrowser.print(outTaxTree); + outTaxTree.close(); - for (int i = 0; i < processors; i++) { - int startPos = positions[ i * numSeqsPerProcessor ]; - if(i 
== processors - 1){ - numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; - } - lines.push_back(new linePair(startPos, numSeqsPerProcessor)); - } - createProcesses(newTaxonomyFile, tempTaxonomyFile); + //output taxonomy with the unclassified bins added + ifstream inTax; + openInputFile(newTaxonomyFile, inTax); - rename((newTaxonomyFile + toString(processIDS[0]) + ".temp").c_str(), newTaxonomyFile.c_str()); - rename((tempTaxonomyFile + toString(processIDS[0]) + ".temp").c_str(), tempTaxonomyFile.c_str()); + ofstream outTax; + string unclass = newTaxonomyFile + ".unclass.temp"; + openOutputFile(unclass, outTax); - for(int i=1;i> name >> taxon; gobble(inTax); + + string newTax = addUnclassifieds(taxon, maxLevel); + + outTax << name << '\t' << newTax << endl; } + inTax.close(); + outTax.close(); + remove(newTaxonomyFile.c_str()); + rename(unclass.c_str(), newTaxonomyFile.c_str()); + + mothurOutEndLine(); + mothurOut("It took " + toString(time(NULL) - start) + " secs to classify " + toString(numFastaSeqs) + " sequences."); mothurOutEndLine(); mothurOutEndLine(); } -#else - ifstream inFASTA; - openInputFile(fastaFileName, inFASTA); - numFastaSeqs=count(istreambuf_iterator(inFASTA),istreambuf_iterator(), '>'); - inFASTA.close(); - lines.push_back(new linePair(0, numFastaSeqs)); - - driver(lines[0], newTaxonomyFile, tempTaxonomyFile); -#endif delete classify; + return 0; + } + catch(exception& e) { + errorOut(e, "ClassifySeqsCommand", "execute"); + exit(1); + } +} + +/**************************************************************************************************/ +string ClassifySeqsCommand::addUnclassifieds(string tax, int maxlevel) { + try{ + string newTax, taxon; + int level = 0; - //make taxonomy tree from new taxonomy file - ifstream inTaxonomy; - openInputFile(tempTaxonomyFile, inTaxonomy); - - string accession, taxaList; - PhyloTree taxaBrowser; - - //read in users taxonomy file and add sequences to tree - while(!inTaxonomy.eof()){ - inTaxonomy >> accession >> taxaList; - - taxaBrowser.addSeqToTree(accession, taxaList); - - gobble(inTaxonomy); + //keep what you have counting the levels + while (tax.find_first_of(';') != -1) { + //get taxon + taxon = tax.substr(0,tax.find_first_of(';')); + tax = tax.substr(tax.find_first_of(';')+1, tax.length()); + newTax += taxon; + level++; } - inTaxonomy.close(); - remove(tempTaxonomyFile.c_str()); - - taxaBrowser.assignHeirarchyIDs(0); - taxaBrowser.binUnclassified(); - - ofstream outTaxTree; - openOutputFile(taxSummary, outTaxTree); - taxaBrowser.print(outTaxTree); - - mothurOutEndLine(); - mothurOut("It took " + toString(time(NULL) - start) + " secs to classify " + toString(numFastaSeqs) + " sequences."); - mothurOutEndLine(); - mothurOutEndLine(); + //add "unclassified" until you reach maxLevel + while (level < maxlevel) { + newTax += "unclassified;"; + level++; + } - return 0; + return newTax; } catch(exception& e) { - errorOut(e, "ClassifySeqsCommand", "execute"); + errorOut(e, "ClassifySeqsCommand", "addUnclassifieds"); exit(1); } } + /**************************************************************************************************/ -void ClassifySeqsCommand::createProcesses(string taxFileName, string tempTaxFile) { +void ClassifySeqsCommand::createProcesses(string taxFileName, string tempTaxFile, string filename) { try { #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) int process = 0; @@ -296,7 +415,7 @@ void ClassifySeqsCommand::createProcesses(string taxFileName, string tempTaxFile processIDS.push_back(pid); //create 
map from line number to pid so you can append files in correct order later process++; }else if (pid == 0){ - driver(lines[process], taxFileName + toString(getpid()) + ".temp", tempTaxFile + toString(getpid()) + ".temp"); + driver(lines[process], taxFileName + toString(getpid()) + ".temp", tempTaxFile + toString(getpid()) + ".temp", filename); exit(0); }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); } } @@ -339,7 +458,7 @@ void ClassifySeqsCommand::appendTaxFiles(string temp, string filename) { //********************************************************************************************************************** -int ClassifySeqsCommand::driver(linePair* line, string taxFName, string tempTFName){ +int ClassifySeqsCommand::driver(linePair* line, string taxFName, string tempTFName, string filename){ try { ofstream outTax; openOutputFile(taxFName, outTax); @@ -348,7 +467,7 @@ int ClassifySeqsCommand::driver(linePair* line, string taxFName, string tempTFNa openOutputFile(tempTFName, outTaxSimple); ifstream inFASTA; - openInputFile(fastaFileName, inFASTA); + openInputFile(filename, inFASTA); inFASTA.seekg(line->start); diff --git a/classifyseqscommand.h b/classifyseqscommand.h index 2c6390c..0255bd2 100644 --- a/classifyseqscommand.h +++ b/classifyseqscommand.h @@ -41,17 +41,22 @@ private: }; vector processIDS; //processid vector lines; + vector fastaFileNames; + vector namefileNames; + map nameMap; + map::iterator itNames; Classify* classify; - string fastaFileName, templateFileName, distanceFileName, search, method, taxonomyFileName; + string fastaFileName, templateFileName, distanceFileName, namefile, search, method, taxonomyFileName; int processors, kmerSize, numWanted, cutoff, iters; float match, misMatch, gapOpen, gapExtend; bool abort, probs; - int driver(linePair*, string, string); + int driver(linePair*, string, string, string); void appendTaxFiles(string, string); - void createProcesses(string, string); + void createProcesses(string, string, string); + string addUnclassifieds(string, int); }; #endif diff --git a/collectcommand.cpp b/collectcommand.cpp index ea2d558..beb70eb 100644 --- a/collectcommand.cpp +++ b/collectcommand.cpp @@ -40,7 +40,7 @@ CollectCommand::CollectCommand(string option){ Estimators.clear(); //allow user to run help - if(option == "help") { validCalculator = new ValidCalculators(); help(); abort = true; } + if(option == "help") { validCalculator = new ValidCalculators(); help(); delete validCalculator; abort = true; } else { //valid paramters for this command @@ -58,7 +58,7 @@ CollectCommand::CollectCommand(string option){ } //make sure the user has already run the read.otu command - if ((globaldata->getListFile() == "") && (globaldata->getRabundFile() == "") && (globaldata->getSabundFile() == "")) { mothurOut("You must read a list, sabund or rabund before you can use the collect.single command."); mothurOutEndLine(); abort = true; } + if ((globaldata->getSharedFile() == "") && (globaldata->getListFile() == "") && (globaldata->getRabundFile() == "") && (globaldata->getSabundFile() == "")) { mothurOut("You must read a list, sabund, rabund or shared file before you can use the collect.single command."); mothurOutEndLine(); abort = true; } //check for optional parameter and set defaults // ...at some point should added some additional type checking... 
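classify.seqs now post-processes its output with the addUnclassifieds() helper shown earlier: each taxonomy string is padded with "unclassified;" entries until it reaches a common maxLevel, so every line of the rewritten .taxonomy file has the same number of levels. A self-contained sketch of that padding step; it keeps the ';' separators, and padUnclassified is an illustrative name rather than the mothur function:

    #include <iostream>
    #include <string>

    // Pad a semicolon-delimited taxonomy out to maxLevel levels with "unclassified;".
    std::string padUnclassified(std::string tax, int maxLevel) {
        std::string newTax;
        int level = 0;
        // keep the taxons that are already there, counting levels as we go
        while (tax.find_first_of(';') != std::string::npos) {
            newTax += tax.substr(0, tax.find_first_of(';') + 1);   // the taxon plus its ';'
            tax = tax.substr(tax.find_first_of(';') + 1);
            level++;
        }
        // add "unclassified" until you reach maxLevel
        while (level < maxLevel) { newTax += "unclassified;"; level++; }
        return newTax;
    }

    int main() {
        // prints Bacteria;Firmicutes;unclassified;unclassified;unclassified;unclassified;
        std::cout << padUnclassified("Bacteria;Firmicutes;", 6) << std::endl;
        return 0;
    }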
@@ -91,58 +91,6 @@ CollectCommand::CollectCommand(string option){ temp = validParameter.validFile(parameters, "size", false); if (temp == "not found") { temp = "0"; } convert(temp, size); - - if (abort == false) { - string fileNameRoot = getRootName(globaldata->inputFileName); - int i; - validCalculator = new ValidCalculators(); - - for (i=0; iisValidCalculator("single", Estimators[i]) == true) { - if (Estimators[i] == "sobs") { - cDisplays.push_back(new CollectDisplay(new Sobs(), new OneColumnFile(fileNameRoot+"sobs"))); - }else if (Estimators[i] == "chao") { - cDisplays.push_back(new CollectDisplay(new Chao1(), new ThreeColumnFile(fileNameRoot+"chao"))); - }else if (Estimators[i] == "nseqs") { - cDisplays.push_back(new CollectDisplay(new NSeqs(), new OneColumnFile(fileNameRoot+"nseqs"))); - }else if (Estimators[i] == "coverage") { - cDisplays.push_back(new CollectDisplay(new Coverage(), new OneColumnFile(fileNameRoot+"coverage"))); - }else if (Estimators[i] == "ace") { - cDisplays.push_back(new CollectDisplay(new Ace(abund), new ThreeColumnFile(fileNameRoot+"ace"))); - }else if (Estimators[i] == "jack") { - cDisplays.push_back(new CollectDisplay(new Jackknife(), new ThreeColumnFile(fileNameRoot+"jack"))); - }else if (Estimators[i] == "shannon") { - cDisplays.push_back(new CollectDisplay(new Shannon(), new ThreeColumnFile(fileNameRoot+"shannon"))); - }else if (Estimators[i] == "npshannon") { - cDisplays.push_back(new CollectDisplay(new NPShannon(), new OneColumnFile(fileNameRoot+"np_shannon"))); - }else if (Estimators[i] == "simpson") { - cDisplays.push_back(new CollectDisplay(new Simpson(), new ThreeColumnFile(fileNameRoot+"simpson"))); - }else if (Estimators[i] == "bootstrap") { - cDisplays.push_back(new CollectDisplay(new Bootstrap(), new OneColumnFile(fileNameRoot+"bootstrap"))); - }else if (Estimators[i] == "geometric") { - cDisplays.push_back(new CollectDisplay(new Geom(), new OneColumnFile(fileNameRoot+"geometric"))); - }else if (Estimators[i] == "qstat") { - cDisplays.push_back(new CollectDisplay(new QStat(), new OneColumnFile(fileNameRoot+"qstat"))); - }else if (Estimators[i] == "logseries") { - cDisplays.push_back(new CollectDisplay(new LogSD(), new OneColumnFile(fileNameRoot+"logseries"))); - }else if (Estimators[i] == "bergerparker") { - cDisplays.push_back(new CollectDisplay(new BergerParker(), new OneColumnFile(fileNameRoot+"bergerparker"))); - }else if (Estimators[i] == "bstick") { - cDisplays.push_back(new CollectDisplay(new BStick(), new ThreeColumnFile(fileNameRoot+"bstick"))); - }else if (Estimators[i] == "goodscoverage") { - cDisplays.push_back(new CollectDisplay(new GoodsCoverage(), new OneColumnFile(fileNameRoot+"goodscoverage"))); - }else if (Estimators[i] == "efron") { - cDisplays.push_back(new CollectDisplay(new Efron(size), new OneColumnFile(fileNameRoot+"efron"))); - }else if (Estimators[i] == "boneh") { - cDisplays.push_back(new CollectDisplay(new Boneh(size), new OneColumnFile(fileNameRoot+"boneh"))); - }else if (Estimators[i] == "solow") { - cDisplays.push_back(new CollectDisplay(new Solow(size), new OneColumnFile(fileNameRoot+"solow"))); - }else if (Estimators[i] == "shen") { - cDisplays.push_back(new CollectDisplay(new Shen(size, abund), new OneColumnFile(fileNameRoot+"shen"))); - } - } - } - } } } @@ -174,15 +122,7 @@ void CollectCommand::help(){ //********************************************************************************************************************** -CollectCommand::~CollectCommand(){ - if (abort == false) { - //delete order; - delete 
input; globaldata->ginput = NULL; - delete read; - delete validCalculator; - globaldata->gorder = NULL; - } -} +CollectCommand::~CollectCommand(){} //********************************************************************************************************************** @@ -191,87 +131,156 @@ int CollectCommand::execute(){ if (abort == true) { return 0; } - //if the users entered no valid calculators don't execute command - if (cDisplays.size() == 0) { return 0; } - - read = new ReadOTUFile(globaldata->inputFileName); - read->read(&*globaldata); - - order = globaldata->gorder; - string lastLabel = order->getLabel(); - input = globaldata->ginput; - - //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label. - set processedLabels; - set userLabels = labels; + if ((globaldata->getFormat() != "sharedfile")) { inputFileNames.push_back(globaldata->inputFileName); } + else { inputFileNames = parseSharedFile(globaldata->getSharedFile()); globaldata->setFormat("rabund"); } - while((order != NULL) && ((allLines == 1) || (userLabels.size() != 0))) { - - if(allLines == 1 || labels.count(order->getLabel()) == 1){ - - cCurve = new Collect(order, cDisplays); - cCurve->getCurve(freq); - delete cCurve; + for (int p = 0; p < inputFileNames.size(); p++) { - mothurOut(order->getLabel()); mothurOutEndLine(); - processedLabels.insert(order->getLabel()); - userLabels.erase(order->getLabel()); + string fileNameRoot = getRootName(inputFileNames[p]); + globaldata->inputFileName = inputFileNames[p]; + + if (inputFileNames.size() > 1) { + mothurOutEndLine(); mothurOut("Processing group " + groups[p]); mothurOutEndLine(); mothurOutEndLine(); + } + validCalculator = new ValidCalculators(); + for (int i=0; iisValidCalculator("single", Estimators[i]) == true) { + if (Estimators[i] == "sobs") { + cDisplays.push_back(new CollectDisplay(new Sobs(), new OneColumnFile(fileNameRoot+"sobs"))); + }else if (Estimators[i] == "chao") { + cDisplays.push_back(new CollectDisplay(new Chao1(), new ThreeColumnFile(fileNameRoot+"chao"))); + }else if (Estimators[i] == "nseqs") { + cDisplays.push_back(new CollectDisplay(new NSeqs(), new OneColumnFile(fileNameRoot+"nseqs"))); + }else if (Estimators[i] == "coverage") { + cDisplays.push_back(new CollectDisplay(new Coverage(), new OneColumnFile(fileNameRoot+"coverage"))); + }else if (Estimators[i] == "ace") { + cDisplays.push_back(new CollectDisplay(new Ace(abund), new ThreeColumnFile(fileNameRoot+"ace"))); + }else if (Estimators[i] == "jack") { + cDisplays.push_back(new CollectDisplay(new Jackknife(), new ThreeColumnFile(fileNameRoot+"jack"))); + }else if (Estimators[i] == "shannon") { + cDisplays.push_back(new CollectDisplay(new Shannon(), new ThreeColumnFile(fileNameRoot+"shannon"))); + }else if (Estimators[i] == "npshannon") { + cDisplays.push_back(new CollectDisplay(new NPShannon(), new OneColumnFile(fileNameRoot+"np_shannon"))); + }else if (Estimators[i] == "simpson") { + cDisplays.push_back(new CollectDisplay(new Simpson(), new ThreeColumnFile(fileNameRoot+"simpson"))); + }else if (Estimators[i] == "bootstrap") { + cDisplays.push_back(new CollectDisplay(new Bootstrap(), new OneColumnFile(fileNameRoot+"bootstrap"))); + }else if (Estimators[i] == "geometric") { + cDisplays.push_back(new CollectDisplay(new Geom(), new OneColumnFile(fileNameRoot+"geometric"))); + }else if (Estimators[i] == "qstat") { + cDisplays.push_back(new CollectDisplay(new QStat(), new OneColumnFile(fileNameRoot+"qstat"))); + }else if (Estimators[i] == "logseries") { + 
cDisplays.push_back(new CollectDisplay(new LogSD(), new OneColumnFile(fileNameRoot+"logseries"))); + }else if (Estimators[i] == "bergerparker") { + cDisplays.push_back(new CollectDisplay(new BergerParker(), new OneColumnFile(fileNameRoot+"bergerparker"))); + }else if (Estimators[i] == "bstick") { + cDisplays.push_back(new CollectDisplay(new BStick(), new ThreeColumnFile(fileNameRoot+"bstick"))); + }else if (Estimators[i] == "goodscoverage") { + cDisplays.push_back(new CollectDisplay(new GoodsCoverage(), new OneColumnFile(fileNameRoot+"goodscoverage"))); + }else if (Estimators[i] == "efron") { + cDisplays.push_back(new CollectDisplay(new Efron(size), new OneColumnFile(fileNameRoot+"efron"))); + }else if (Estimators[i] == "boneh") { + cDisplays.push_back(new CollectDisplay(new Boneh(size), new OneColumnFile(fileNameRoot+"boneh"))); + }else if (Estimators[i] == "solow") { + cDisplays.push_back(new CollectDisplay(new Solow(size), new OneColumnFile(fileNameRoot+"solow"))); + }else if (Estimators[i] == "shen") { + cDisplays.push_back(new CollectDisplay(new Shen(size, abund), new OneColumnFile(fileNameRoot+"shen"))); + } + } } - //you have a label the user want that is smaller than this label and the last label has not already been processed - if ((anyLabelsToProcess(order->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) { - string saveLabel = order->getLabel(); + + + //if the users entered no valid calculators don't execute command + if (cDisplays.size() == 0) { return 0; } + + read = new ReadOTUFile(inputFileNames[p]); + read->read(&*globaldata); - delete order; + order = globaldata->gorder; + string lastLabel = order->getLabel(); + input = globaldata->ginput; + + //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label. + set processedLabels; + set userLabels = labels; + + while((order != NULL) && ((allLines == 1) || (userLabels.size() != 0))) { + + if(allLines == 1 || labels.count(order->getLabel()) == 1){ + + cCurve = new Collect(order, cDisplays); + cCurve->getCurve(freq); + delete cCurve; + + mothurOut(order->getLabel()); mothurOutEndLine(); + processedLabels.insert(order->getLabel()); + userLabels.erase(order->getLabel()); + + + } + //you have a label the user want that is smaller than this label and the last label has not already been processed + if ((anyLabelsToProcess(order->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) { + string saveLabel = order->getLabel(); + + delete order; + order = (input->getOrderVector(lastLabel)); + + cCurve = new Collect(order, cDisplays); + cCurve->getCurve(freq); + delete cCurve; + + mothurOut(order->getLabel()); mothurOutEndLine(); + processedLabels.insert(order->getLabel()); + userLabels.erase(order->getLabel()); + + //restore real lastlabel to save below + order->setLabel(saveLabel); + } + + lastLabel = order->getLabel(); + + delete order; + order = (input->getOrderVector()); + } + + //output error messages about any remaining user labels + set::iterator it; + bool needToRun = false; + for (it = userLabels.begin(); it != userLabels.end(); it++) { + mothurOut("Your file does not include the label " + *it); + if (processedLabels.count(lastLabel) != 1) { + mothurOut(". I will use " + lastLabel + "."); mothurOutEndLine(); + needToRun = true; + }else { + mothurOut(". 
Please refer to " + lastLabel + "."); mothurOutEndLine(); + } + } + + //run last label if you need to + if (needToRun == true) { + if (order != NULL) { delete order; } order = (input->getOrderVector(lastLabel)); + mothurOut(order->getLabel()); mothurOutEndLine(); + cCurve = new Collect(order, cDisplays); cCurve->getCurve(freq); delete cCurve; - - mothurOut(order->getLabel()); mothurOutEndLine(); - processedLabels.insert(order->getLabel()); - userLabels.erase(order->getLabel()); - - //restore real lastlabel to save below - order->setLabel(saveLabel); + delete order; } - lastLabel = order->getLabel(); - - delete order; - order = (input->getOrderVector()); + for(int i=0;iginput = NULL; + delete read; + globaldata->gorder = NULL; + delete validCalculator; } - //output error messages about any remaining user labels - set::iterator it; - bool needToRun = false; - for (it = userLabels.begin(); it != userLabels.end(); it++) { - mothurOut("Your file does not include the label " + *it); - if (processedLabels.count(lastLabel) != 1) { - mothurOut(". I will use " + lastLabel + "."); mothurOutEndLine(); - needToRun = true; - }else { - mothurOut(". Please refer to " + lastLabel + "."); mothurOutEndLine(); - } - } - - //run last label if you need to - if (needToRun == true) { - if (order != NULL) { delete order; } - order = (input->getOrderVector(lastLabel)); - - mothurOut(order->getLabel()); mothurOutEndLine(); - - cCurve = new Collect(order, cDisplays); - cCurve->getCurve(freq); - delete cCurve; - delete order; - } - for(int i=0;i CollectCommand::parseSharedFile(string filename) { + try { + vector filenames; + + map filehandles; + map::iterator it3; + + + //read first line + read = new ReadOTUFile(filename); + read->read(&*globaldata); + + input = globaldata->ginput; + vector lookup = input->getSharedRAbundVectors(); + + string sharedFileRoot = getRootName(filename); + + //clears file before we start to write to it below + for (int i=0; igetGroup() + ".rabund").c_str()); + filenames.push_back((sharedFileRoot + lookup[i]->getGroup() + ".rabund")); + } + + ofstream* temp; + for (int i=0; igetGroup()] = temp; + groups.push_back(lookup[i]->getGroup()); + } + + while(lookup[0] != NULL) { + + for (int i = 0; i < lookup.size(); i++) { + RAbundVector rav = lookup[i]->getRAbundVector(); + openOutputFileAppend(sharedFileRoot + lookup[i]->getGroup() + ".rabund", *(filehandles[lookup[i]->getGroup()])); + rav.print(*(filehandles[lookup[i]->getGroup()])); + (*(filehandles[lookup[i]->getGroup()])).close(); + } + + for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; } + lookup = input->getSharedRAbundVectors(); + } + + //free memory + for (it3 = filehandles.begin(); it3 != filehandles.end(); it3++) { + delete it3->second; + } + delete read; + delete input; + globaldata->ginput = NULL; + + return filenames; + } + catch(exception& e) { + errorOut(e, "CollectCommand", "parseSharedFile"); + exit(1); + } +} +//********************************************************************************************************************** + diff --git a/collectcommand.h b/collectcommand.h index 10894e7..46dd9dc 100644 --- a/collectcommand.h +++ b/collectcommand.h @@ -56,6 +56,10 @@ private: set labels; //holds labels to be used string label, calc; vector Estimators; + vector inputFileNames; + vector groups; + + vector parseSharedFile(string); }; diff --git a/distancecommand.cpp b/distancecommand.cpp index 3ac3969..0422532 100644 --- a/distancecommand.cpp +++ b/distancecommand.cpp @@ -246,8 +246,15 @@ int 
DistanceCommand::driver(int startLine, int endLine, string dFileName, float outFile << setprecision(4); if(isTrue(phylip) && startLine == 0){ outFile << alignDB.getNumSeqs() << endl; } + for(int i=startLine;icalcDist(alignDB.get(i), alignDB.get(j)); double dist = distCalculator->getDist(); diff --git a/hcluster.cpp b/hcluster.cpp index 72af7c8..3b972c0 100644 --- a/hcluster.cpp +++ b/hcluster.cpp @@ -12,10 +12,8 @@ #include "listvector.hpp" #include "sparsematrix.hpp" - /***********************************************************************/ - -HCluster::HCluster(RAbundVector* rav, ListVector* lv, string m) : rabund(rav), list(lv), method(m){ +HCluster::HCluster(RAbundVector* rav, ListVector* lv, string m, string d, NameAssignment* n, float c) : rabund(rav), list(lv), method(m), distfile(d), nameMap(n), cutoff(c) { try { mapWanted = false; exitedBreak = false; @@ -27,6 +25,9 @@ HCluster::HCluster(RAbundVector* rav, ListVector* lv, string m) : rabund(rav), clusterArray.push_back(temp); } + if (method != "average") { + openInputFile(distfile, filehandle); + }else{ firstRead = true; } } catch(exception& e) { errorOut(e, "HCluster", "HCluster"); @@ -76,7 +77,6 @@ void HCluster::clusterNames(){ /***********************************************************************/ int HCluster::getUpmostParent(int node){ try { - while (clusterArray[node].parent != -1) { node = clusterArray[node].parent; } @@ -93,14 +93,14 @@ void HCluster::printInfo(){ try { cout << "link table" << endl; - for (it = activeLinks.begin(); it!= activeLinks.end(); it++) { - cout << it->first << " = " << it->second << endl; + for (itActive = activeLinks.begin(); itActive!= activeLinks.end(); itActive++) { + cout << itActive->first << " = " << itActive->second << endl; } cout << endl; for (int i = 0; i < linkTable.size(); i++) { cout << i << '\t'; for (it = linkTable[i].begin(); it != linkTable[i].end(); it++) { - cout << it->first << '-' << it->second << '\t'; + cout << it->first << '-' << it->second << '\t' ; } cout << endl; } @@ -121,64 +121,62 @@ void HCluster::printInfo(){ /***********************************************************************/ int HCluster::makeActive() { try { - int linkValue = 1; -//cout << "active - here" << endl; - it = activeLinks.find(smallRow); - it2 = activeLinks.find(smallCol); + + itActive = activeLinks.find(smallRow); + it2Active = activeLinks.find(smallCol); - if ((it == activeLinks.end()) && (it2 == activeLinks.end())) { //both are not active so add them + if ((itActive == activeLinks.end()) && (it2Active == activeLinks.end())) { //both are not active so add them int size = linkTable.size(); map temp; map temp2; //add link to eachother - temp[smallRow] = 1; // 1 2 - temp2[smallCol] = 1; // 1 0 1 - // 2 1 0 + temp[smallRow] = 1; // 1 2 + temp2[smallCol] = 1; // 1 0 1 + // 2 1 0 linkTable.push_back(temp); linkTable.push_back(temp2); //add to activeLinks activeLinks[smallRow] = size; activeLinks[smallCol] = size+1; -//cout << "active - here1" << endl; - }else if ((it != activeLinks.end()) && (it2 == activeLinks.end())) { //smallRow is active, smallCol is not + + }else if ((itActive != activeLinks.end()) && (it2Active == activeLinks.end())) { //smallRow is active, smallCol is not int size = linkTable.size(); - int alreadyActiveRow = it->second; + int alreadyActiveRow = itActive->second; map temp; //add link to eachother - temp[smallRow] = 1; // 6 2 3 5 - linkTable.push_back(temp); // 6 0 1 2 0 - linkTable[alreadyActiveRow][smallCol] = 1; // 2 1 0 1 1 - // 3 2 1 0 0 - // 5 0 1 0 0 + temp[smallRow] 
= 1; // 6 2 3 5 + linkTable.push_back(temp); // 6 0 1 2 0 + linkTable[alreadyActiveRow][smallCol] = 1; // 2 1 0 1 1 + // 3 2 1 0 0 + // 5 0 1 0 0 //add to activeLinks activeLinks[smallCol] = size; -//cout << "active - here2" << endl; - }else if ((it == activeLinks.end()) && (it2 != activeLinks.end())) { //smallCol is active, smallRow is not + + }else if ((itActive == activeLinks.end()) && (it2Active != activeLinks.end())) { //smallCol is active, smallRow is not int size = linkTable.size(); - int alreadyActiveCol = it2->second; + int alreadyActiveCol = it2Active->second; map temp; //add link to eachother - temp[smallCol] = 1; // 6 2 3 5 - linkTable.push_back(temp); // 6 0 1 2 0 - linkTable[alreadyActiveCol][smallRow] = 1; // 2 1 0 1 1 - // 3 2 1 0 0 - // 5 0 1 0 0 + temp[smallCol] = 1; // 6 2 3 5 + linkTable.push_back(temp); // 6 0 1 2 0 + linkTable[alreadyActiveCol][smallRow] = 1; // 2 1 0 1 1 + // 3 2 1 0 0 + // 5 0 1 0 0 //add to activeLinks activeLinks[smallRow] = size; -//cout << "active - here3" << endl; + }else { //both are active so add one - int row = it->second; - int col = it2->second; -//cout << row << '\t' << col << endl; + int row = itActive->second; + int col = it2Active->second; + linkTable[row][smallCol]++; linkTable[col][smallRow]++; linkValue = linkTable[row][smallCol]; -//cout << "active - here4" << endl; } return linkValue; @@ -191,21 +189,23 @@ int HCluster::makeActive() { /***********************************************************************/ void HCluster::updateArrayandLinkTable() { try { - //if cluster was made update clusterArray and linkTable - int size = clusterArray.size(); - - //add new node - clusterNode temp(clusterArray[smallRow].numSeq + clusterArray[smallCol].numSeq, -1, clusterArray[smallCol].smallChild); - clusterArray.push_back(temp); - - //update child nodes - clusterArray[smallRow].parent = size; - clusterArray[smallCol].parent = size; + //if cluster was made update clusterArray and linkTable + int size = clusterArray.size(); + + //add new node + clusterNode temp(clusterArray[smallRow].numSeq + clusterArray[smallCol].numSeq, -1, clusterArray[smallCol].smallChild); + clusterArray.push_back(temp); + + //update child nodes + clusterArray[smallRow].parent = size; + clusterArray[smallCol].parent = size; + + if (method == "furthest") { //update linkTable by merging clustered rows and columns int rowSpot = activeLinks[smallRow]; int colSpot = activeLinks[smallCol]; - //cout << "here" << endl; + //fix old rows for (int i = 0; i < linkTable.size(); i++) { //check if they are in map @@ -224,8 +224,7 @@ void HCluster::updateArrayandLinkTable() { linkTable[i].erase(smallRow); //delete col } } - //printInfo(); -//cout << "here2" << endl; + //merge their values for (it = linkTable[rowSpot].begin(); it != linkTable[rowSpot].end(); it++) { it2 = linkTable[colSpot].find(it->first); //does the col also have this @@ -233,25 +232,23 @@ void HCluster::updateArrayandLinkTable() { if (it2 == linkTable[colSpot].end()) { //not there so add it linkTable[colSpot][it->first] = it->second; }else { //merge them - linkTable[colSpot][it->first] = it->second+it2->second; + linkTable[colSpot][it->first] = it->second + it2->second; } } -//cout << "here3" << endl; + linkTable[colSpot].erase(size); linkTable.erase(linkTable.begin()+rowSpot); //delete row - //printInfo(); + //update activerows activeLinks.erase(smallRow); activeLinks.erase(smallCol); activeLinks[size] = colSpot; //adjust everybody elses spot since you deleted - time vs. 
space - for (it = activeLinks.begin(); it != activeLinks.end(); it++) { - if (it->second > rowSpot) { activeLinks[it->first]--; } + for (itActive = activeLinks.begin(); itActive != activeLinks.end(); itActive++) { + if (itActive->second > rowSpot) { activeLinks[itActive->first]--; } } - -//cout << "here4" << endl; - + } } catch(exception& e) { errorOut(e, "HCluster", "updateArrayandLinkTable"); @@ -259,9 +256,9 @@ void HCluster::updateArrayandLinkTable() { } } /***********************************************************************/ -void HCluster::update(int row, int col, float distance){ +bool HCluster::update(int row, int col, float distance){ try { - + bool cluster = false; smallRow = row; smallCol = col; smallDist = distance; @@ -269,29 +266,34 @@ void HCluster::update(int row, int col, float distance){ //find upmost parent of row and col smallRow = getUpmostParent(smallRow); smallCol = getUpmostParent(smallCol); - //cout << "row = " << row << " smallRow = " << smallRow << " col = " << col << " smallCol = " << smallCol << " dist = " << distance << endl; + //you don't want to cluster with yourself if (smallRow != smallCol) { - //are they active in the link table - int linkValue = makeActive(); //after this point this nodes info is active in linkTable - //printInfo(); - //cout << "linkValue = " << linkValue << " times = " << (clusterArray[smallRow].numSeq * clusterArray[smallCol].numSeq) << endl; - //can we cluster??? - bool cluster = false; - - if (method == "nearest") { cluster = true; } - else if (method == "average") { - if (linkValue == (ceil(((clusterArray[smallRow].numSeq * clusterArray[smallCol].numSeq)) / (float) 2.0))) { cluster = true; } - }else{ //assume furthest - if (linkValue == (clusterArray[smallRow].numSeq * clusterArray[smallCol].numSeq)) { cluster = true; } - } - if (cluster) { + if (method != "average") { + //can we cluster??? 
+ if (method == "nearest") { cluster = true; } + else{ //assume furthest + //are they active in the link table + int linkValue = makeActive(); //after this point this nodes info is active in linkTable + if (linkValue == (clusterArray[smallRow].numSeq * clusterArray[smallCol].numSeq)) { cluster = true; } + } + + if (cluster) { + updateArrayandLinkTable(); + clusterBins(); + clusterNames(); + } + }else { + cluster = true; updateArrayandLinkTable(); clusterBins(); clusterNames(); + combineFile(); } } + + return cluster; //printInfo(); } catch(exception& e) { @@ -349,7 +351,26 @@ try { } } //********************************************************************************************************************** -vector HCluster::getSeqs(ifstream& filehandle, NameAssignment* nameMap, float cutoff){ +vector HCluster::getSeqs(){ + try { + vector sameSeqs; + + if(method != "average") { + sameSeqs = getSeqsFNNN(); + }else{ + if (firstRead) { processFile(); } + sameSeqs = getSeqsAN(); + } + + return sameSeqs; + } + catch(exception& e) { + errorOut(e, "HCluster", "getSeqs"); + exit(1); + } +} +//********************************************************************************************************************** +vector HCluster::getSeqsFNNN(){ try { string firstName, secondName; float distance, prevDistance; @@ -402,11 +423,352 @@ vector HCluster::getSeqs(ifstream& filehandle, NameAssignment* nameMap, return sameSeqs; } catch(exception& e) { - errorOut(e, "HCluster", "getSeqs"); + errorOut(e, "HCluster", "getSeqsFNNN"); exit(1); } } +//********************************************************************************************************************** +//don't need cutoff since processFile removes all distance above cutoff and changes names to indexes +vector HCluster::getSeqsAN(){ + try { + int firstName, secondName; + float prevDistance; + vector sameSeqs; + prevDistance = -1; + + openInputFile(distfile, filehandle, "no error"); + + //is the smallest value in mergedMin or the distfile? 
+ float mergedMinDist = 10000; + float distance = 10000; + if (mergedMin.size() > 0) { mergedMinDist = mergedMin[0].dist; } + + if (!filehandle.eof()) { + filehandle >> firstName >> secondName >> distance; gobble(filehandle); + //save first one + if (prevDistance == -1) { prevDistance = distance; } + if (distance != -1) { //-1 means skip me + seqDist temp(firstName, secondName, distance); + sameSeqs.push_back(temp); + } + } + + if (mergedMinDist < distance) { //get minimum distance from mergedMin + //remove distance we saved from file + sameSeqs.clear(); + prevDistance = mergedMinDist; + + for (int i = 0; i < mergedMin.size(); i++) { + if (mergedMin[i].dist == prevDistance) { + sameSeqs.push_back(mergedMin[i]); + }else { break; } + } + }else{ //get minimum from file + //get entry + while (!filehandle.eof()) { + + filehandle >> firstName >> secondName >> distance; gobble(filehandle); + + if (prevDistance == -1) { prevDistance = distance; } + + if (distance != -1) { //-1 means skip me + //are the distances the same + if (distance == prevDistance) { //save in vector + seqDist temp(firstName, secondName, distance); + sameSeqs.push_back(temp); + }else{ + break; + } + } + } + } + filehandle.close(); + + //randomize matching dists + random_shuffle(sameSeqs.begin(), sameSeqs.end()); + + //can only return one value since once these are merged the other distances in sameSeqs may have changed + vector temp; + if (sameSeqs.size() > 0) { temp.push_back(sameSeqs[0]); } + + return temp; + } + catch(exception& e) { + errorOut(e, "HCluster", "getSeqsAN"); + exit(1); + } +} + /***********************************************************************/ +void HCluster::combineFile() { + try { + int bufferSize = 64000; //512k - this should be a variable that the user can set to optimize code to their hardware + char* inputBuffer; + inputBuffer = new char[bufferSize]; + size_t numRead; + + string tempDistFile = distfile + ".temp"; + ofstream out; + openOutputFile(tempDistFile, out); + + FILE* in; + in = fopen(distfile.c_str(), "rb"); + + int first, second; + float dist; + + vector< map > smallRowColValues; + smallRowColValues.resize(2); //0 = row, 1 = col + int count = 0; + + //go through file pulling out distances related to rows merging + //if mergedMin contains distances add those back into file + bool done = false; + partialDist = ""; + while ((numRead = fread(inputBuffer, 1, bufferSize, in)) != 0) { +//cout << "number of char read = " << numRead << endl; +//cout << inputBuffer << endl; + if (numRead < bufferSize) { done = true; } + + //parse input into individual distances + int spot = 0; + string outputString = ""; + while(spot < numRead) { + //cout << "spot = " << spot << endl; + seqDist nextDist = getNextDist(inputBuffer, spot, bufferSize); + + //you read a partial distance + if (nextDist.seq1 == -1) { break; } + + first = nextDist.seq1; second = nextDist.seq2; dist = nextDist.dist; + //cout << "next distance = " << first << '\t' << second << '\t' << dist << endl; + //since file is sorted and mergedMin is sorted + //you can put the smallest distance from each through the code below and keep the file sorted + + //while there are still values in mergedMin that are smaller than the distance read from file + while (count < mergedMin.size()) { + + //is the distance in mergedMin smaller than from the file + if (mergedMin[count].dist < dist) { + //is this a distance related to the columns merging? 
+ //if yes, save in memory + if ((mergedMin[count].seq1 == smallRow) && (mergedMin[count].seq2 == smallCol)) { //do nothing this is the smallest distance from last time + }else if (mergedMin[count].seq1 == smallCol) { + smallRowColValues[1][mergedMin[count].seq2] = mergedMin[count].dist; + }else if (mergedMin[count].seq2 == smallCol) { + smallRowColValues[1][mergedMin[count].seq1] = mergedMin[count].dist; + }else if (mergedMin[count].seq1 == smallRow) { + smallRowColValues[0][mergedMin[count].seq2] = mergedMin[count].dist; + }else if (mergedMin[count].seq2 == smallRow) { + smallRowColValues[0][mergedMin[count].seq1] = mergedMin[count].dist; + }else { //if no, write to temp file + outputString += toString(mergedMin[count].seq1) + '\t' + toString(mergedMin[count].seq2) + '\t' + toString(mergedMin[count].dist) + '\n'; + } + count++; + }else{ break; } + } + + //is this a distance related to the columns merging? + //if yes, save in memory + if ((first == smallRow) && (second == smallCol)) { //do nothing this is the smallest distance from last time + }else if (first == smallCol) { + smallRowColValues[1][second] = dist; + }else if (second == smallCol) { + smallRowColValues[1][first] = dist; + }else if (first == smallRow) { + smallRowColValues[0][second] = dist; + }else if (second == smallRow) { + smallRowColValues[0][first] = dist; + + }else { //if no, write to temp file + outputString += toString(first) + '\t' + toString(second) + '\t' + toString(dist) + '\n'; + } + } + + out << outputString; + if(done) { break; } + } + fclose(in); + + //if values in mergedMin are larger than the the largest in file then + while (count < mergedMin.size()) { + //is this a distance related to the columns merging? + //if yes, save in memory + if ((mergedMin[count].seq1 == smallRow) && (mergedMin[count].seq2 == smallCol)) { //do nothing this is the smallest distance from last time + }else if (mergedMin[count].seq1 == smallCol) { + smallRowColValues[1][mergedMin[count].seq2] = mergedMin[count].dist; + }else if (mergedMin[count].seq2 == smallCol) { + smallRowColValues[1][mergedMin[count].seq1] = mergedMin[count].dist; + }else if (mergedMin[count].seq1 == smallRow) { + smallRowColValues[0][mergedMin[count].seq2] = mergedMin[count].dist; + }else if (mergedMin[count].seq2 == smallRow) { + smallRowColValues[0][mergedMin[count].seq1] = mergedMin[count].dist; + + }else { //if no, write to temp file + out << mergedMin[count].seq1 << '\t' << mergedMin[count].seq2 << '\t' << mergedMin[count].dist << endl; + } + count++; + } + out.close(); + mergedMin.clear(); + + //rename tempfile to distfile + remove(distfile.c_str()); + rename(tempDistFile.c_str(), distfile.c_str()); + + //merge clustered rows averaging the distances + map::iterator itMerge; + map::iterator it2Merge; + for(itMerge = smallRowColValues[0].begin(); itMerge != smallRowColValues[0].end(); itMerge++) { + //does smallRowColValues[1] have a distance to this seq too? 
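The merge that follows replaces the two old distances to every shared neighbour with a single size-weighted mean, so larger clusters pull the new value toward their own distance. A small self-contained sketch of the same formula (hypothetical helper, not part of the patch):

    // weighted average used when clusters of size nRow and nCol are joined
    static float weightedMergeDist(int nRow, float dRow, int nCol, float dCol) {
        // e.g. nRow = 3, dRow = 0.02, nCol = 1, dCol = 0.06  ->  (0.06 + 0.06) / 4 = 0.03
        return ((nRow * dRow) + (nCol * dCol)) / (float)(nRow + nCol);
    }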
+ it2Merge = smallRowColValues[1].find(itMerge->first); + + float average; + if (it2Merge != smallRowColValues[1].end()) { //if yes, then average + //weighted average + int total = clusterArray[smallRow].numSeq + clusterArray[smallCol].numSeq; + average = ((clusterArray[smallRow].numSeq * itMerge->second) + (clusterArray[smallCol].numSeq * it2Merge->second)) / (float) total; + smallRowColValues[1].erase(it2Merge); + + seqDist temp(clusterArray[smallRow].parent, itMerge->first, average); + mergedMin.push_back(temp); + } + } + + //sort merged values + sort(mergedMin.begin(), mergedMin.end(), compareSequenceDistance); + } + catch(exception& e) { + errorOut(e, "HCluster", "combineFile"); + exit(1); + } +} +/***********************************************************************/ +seqDist HCluster::getNextDist(char* buffer, int& index, int size){ + try { + seqDist next; + int indexBefore = index; + string first, second, distance; + first = ""; second = ""; distance = ""; + int tabCount = 0; +//cout << "partial = " << partialDist << endl; + if (partialDist != "") { //read what you can, you know it is less than a whole distance. + for (int i = 0; i < partialDist.size(); i++) { + if (tabCount == 0) { + if (partialDist[i] == '\t') { tabCount++; } + else { first += partialDist[i]; } + }else if (tabCount == 1) { + if (partialDist[i] == '\t') { tabCount++; } + else { second += partialDist[i]; } + }else if (tabCount == 2) { + distance += partialDist[i]; + } + } + partialDist = ""; + } + + //try to get another distance + bool gotDist = false; + while (index < size) { + if ((buffer[index] == 10) || (buffer[index] == 13)) { //newline in unix or windows + gotDist = true; + + //gobble space + while (index < size) { + if (isspace(buffer[index])) { index++; } + else { break; } + } + break; + }else{ + if (tabCount == 0) { + if (buffer[index] == '\t') { tabCount++; } + else { first += buffer[index]; } + }else if (tabCount == 1) { + if (buffer[index] == '\t') { tabCount++; } + else { second += buffer[index]; } + }else if (tabCount == 2) { + distance += buffer[index]; + } + index++; + } + } + + //there was not a whole distance in the buffer, ie. buffer = "1 2 0.01 2 3 0." + //then you want to save the partial distance. 
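Because the file is consumed in fixed-size fread() chunks, a "row<TAB>col<TAB>dist" record can be cut in half at a buffer boundary; the leftover characters are stashed in partialDist and used to complete the record on the next call. A minimal illustration of the stitch (hypothetical helper and sample buffers, not part of the patch):

    #include <cstddef>
    #include <string>
    // prepend the saved tail of the previous buffer before parsing the new one
    static std::string stitchBuffers(const std::string& partialDist, const char* buffer, std::size_t numRead) {
        return partialDist + std::string(buffer, numRead);
    }
    // buffer 1 ends with  "...\n2\t3\t0."       -> partialDist = "2\t3\t0."
    // buffer 2 starts with "01\n4\t5\t0.05\n"   -> stitched parse yields (2, 3, 0.01) then (4, 5, 0.05)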
+ if (!gotDist) { + for (int i = indexBefore; i < size; i++) { + partialDist += buffer[i]; + } + index = size + 1; + next.seq1 = -1; next.seq2 = -1; next.dist = 0.0; + }else{ + int firstname, secondname; + float dist; + + convert(first, firstname); + convert(second, secondname); + convert(distance, dist); + + next.seq1 = firstname; next.seq2 = secondname; next.dist = dist; + } + + return next; + } + catch(exception& e) { + errorOut(e, "HCluster", "getNextDist"); + exit(1); + } +} +/***********************************************************************/ +void HCluster::processFile() { + try { + string firstName, secondName; + float distance; + + ifstream in; + openInputFile(distfile, in); + + ofstream out; + string outTemp = distfile + ".temp"; + openOutputFile(outTemp, out); + + //get entry + while (!in.eof()) { + + in >> firstName >> secondName >> distance; gobble(in); + + map::iterator itA = nameMap->find(firstName); + map::iterator itB = nameMap->find(secondName); + if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); } + if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); } + + //using cutoff + if (distance > cutoff) { break; } + + if (distance != -1) { //-1 means skip me + out << itA->second << '\t' << itB->second << '\t' << distance << endl; + } + } + + in.close(); + out.close(); + + remove(distfile.c_str()); + rename(outTemp.c_str(), distfile.c_str()); + + firstRead = false; + } + catch(exception& e) { + errorOut(e, "HCluster", "processFile"); + exit(1); + } +} +/***********************************************************************/ + + + + + diff --git a/hcluster.h b/hcluster.h index 065d559..679abbc 100644 --- a/hcluster.h +++ b/hcluster.h @@ -17,16 +17,26 @@ class RAbundVector; class ListVector; +/***********************************************************************/ +struct linkNode { + int links; + float dist; + + linkNode() {}; + linkNode(int l, float a) : links(l), dist(a) {}; + ~linkNode() {}; +}; + /***********************************************************************/ class HCluster { public: - HCluster(RAbundVector*, ListVector*, string); + HCluster(RAbundVector*, ListVector*, string, string, NameAssignment*, float); ~HCluster(){}; - void update(int, int, float); + bool update(int, int, float); void setMapWanted(bool m); map getSeqtoBin() { return seq2Bin; } - vector getSeqs(ifstream&, NameAssignment*, float); + vector getSeqs(); protected: void clusterBins(); @@ -36,24 +46,39 @@ protected: void printInfo(); void updateArrayandLinkTable(); void updateMap(); + vector getSeqsFNNN(); + vector getSeqsAN(); + void combineFile(); + void processFile(); + seqDist getNextDist(char*, int&, int); RAbundVector* rabund; ListVector* list; + NameAssignment* nameMap; vector clusterArray; + + //note: the nearest and average neighbor method do not use the link table or active links vector< map > linkTable; // vector of maps - linkTable[1][6] = 2 would mean sequence in spot 1 has 2 links with sequence in 6 map activeLinks; //maps sequence to index in linkTable map::iterator it; + map::iterator itActive; + map::iterator it2Active; map::iterator it2; int numSeqs; int smallRow; int smallCol; - float smallDist; + float smallDist, cutoff; map seq2Bin; - bool mapWanted, exitedBreak; + bool mapWanted, exitedBreak, firstRead; seqDist next; - string method; + string method, distfile; + ifstream filehandle; + + vector 
mergedMin; + string partialDist; + }; diff --git a/hclustercommand.cpp b/hclustercommand.cpp index 506ba54..bdec48e 100644 --- a/hclustercommand.cpp +++ b/hclustercommand.cpp @@ -179,39 +179,30 @@ int HClusterCommand::execute(){ print_start = true; start = time(NULL); - ifstream in; - openInputFile(distfile, in); - string firstName, secondName; - float distance; - - cluster = new HCluster(rabund, list, method); + //ifstream in; + //openInputFile(distfile, in); + + cluster = new HCluster(rabund, list, method, distfile, globaldata->nameMap, cutoff); vector seqs; seqs.resize(1); // to start loop while (seqs.size() != 0){ - seqs = cluster->getSeqs(in, globaldata->nameMap, cutoff); + seqs = cluster->getSeqs(); for (int i = 0; i < seqs.size(); i++) { //-1 means skip me - if (print_start && isTrue(timing)) { - mothurOut("Clustering (" + tag + ") dist " + toString(distance) + "/" - + toString(roundDist(distance, precision)) - + "\t(precision: " + toString(precision) + ")"); - cout.flush(); - print_start = false; - } - - if (seqs[i].seq1 != seqs[i].seq2) { - cluster->update(seqs[i].seq1, seqs[i].seq2, seqs[i].dist); + bool clustered = cluster->update(seqs[i].seq1, seqs[i].seq2, seqs[i].dist); float rndDist = roundDist(seqs[i].dist, precision); - - if((previousDist <= 0.0000) && (seqs[i].dist != previousDist)){ - printData("unique"); - } - else if((rndDist != rndPreviousDist)){ - printData(toString(rndPreviousDist, length-1)); + + if (clustered) { + if((previousDist <= 0.0000) && (seqs[i].dist != previousDist)){ + printData("unique"); + } + else if((rndDist != rndPreviousDist)){ + printData(toString(rndPreviousDist, length-1)); + } } previousDist = seqs[i].dist; @@ -222,15 +213,8 @@ int HClusterCommand::execute(){ } } - in.close(); + //in.close(); - if (print_start && isTrue(timing)) { - //mothurOut("Clustering (" + tag + ") for distance " + toString(previousDist) + "/" + toString(rndPreviousDist) - //+ "\t(precision: " + toString(precision) + ", Nodes: " + toString(matrix->getNNodes()) + ")"); - cout.flush(); - print_start = false; - } - if(previousDist <= 0.0000){ printData("unique"); } diff --git a/mergefilecommand.cpp b/mergefilecommand.cpp index 44cb03d..fcf6389 100644 --- a/mergefilecommand.cpp +++ b/mergefilecommand.cpp @@ -75,9 +75,10 @@ int MergeFileCommand::execute(){ ofstream outputFile; openOutputFile(outputFileName, outputFile); - ifstream inputFile; char c; for(int i=0;isetMapWanted(true); vector seqs; seqs.resize(1); // to start loop - ifstream inHcluster; - openInputFile(distFile, inHcluster); + //ifstream inHcluster; + //openInputFile(distFile, inHcluster); while (seqs.size() != 0){ - seqs = hcluster->getSeqs(inHcluster, nameMap, cutoff); + seqs = hcluster->getSeqs(); for (int i = 0; i < seqs.size(); i++) { //-1 means skip me @@ -262,7 +262,7 @@ int MGClusterCommand::execute(){ } } } - inHcluster.close(); + //inHcluster.close(); if(previousDist <= 0.0000){ oldList.setLabel("unique"); diff --git a/mothur.h b/mothur.h index cf6ae69..8f9f94f 100644 --- a/mothur.h +++ b/mothur.h @@ -109,6 +109,11 @@ struct seqDist { seqDist(int s1, int s2, float d) : seq1(s1), seq2(s2), dist(d) {} ~seqDist() {} }; +//******************************************************************************************************************** +//sorts lowest to highest +inline bool compareSequenceDistance(seqDist left, seqDist right){ + return (left.dist < right.dist); +} /***********************************************************************/ // snagged from 
http://www.parashift.com/c++-faq-lite/misc-technical-issues.html#faq-39.2 @@ -458,6 +463,22 @@ inline bool isBlank(string fileName){ } /***********************************************************************/ +inline int openInputFile(string fileName, ifstream& fileHandle, string m){ + + fileHandle.open(fileName.c_str()); + if(!fileHandle) { + mothurOut("Error: Could not open " + fileName); mothurOutEndLine(); + return 1; + } + else { + //check for blank file + gobble(fileHandle); + return 0; + } + +} +/***********************************************************************/ + inline int openInputFile(string fileName, ifstream& fileHandle){ fileHandle.open(fileName.c_str()); diff --git a/phylotree.cpp b/phylotree.cpp index 8d03607..a07cfde 100644 --- a/phylotree.cpp +++ b/phylotree.cpp @@ -17,6 +17,7 @@ PhyloTree::PhyloTree(){ numSeqs = 0; tree.push_back(TaxNode("Root")); tree[0].heirarchyID = "0"; + maxLevel = 0; } catch(exception& e) { errorOut(e, "PhyloTree", "PhyloTree"); @@ -219,6 +220,25 @@ void PhyloTree::binUnclassified(){ } } /**************************************************************************************************/ +string PhyloTree::getFullTaxonomy(string seqName) { + try { + string tax = ""; + + int currentNode = name2Taxonomy[seqName]; + + while (tree[currentNode].parent != -1) { + tax = tree[currentNode].name + ";" + tax; + currentNode = tree[currentNode].parent; + } + + return tax; + } + catch(exception& e) { + errorOut(e, "PhyloTree", "getFullTaxonomy"); + exit(1); + } +} +/**************************************************************************************************/ void PhyloTree::print(ofstream& out){ try { diff --git a/phylotree.h b/phylotree.h index 6e5b58d..cae0a08 100644 --- a/phylotree.h +++ b/phylotree.h @@ -38,10 +38,13 @@ public: vector getGenusNodes(); void binUnclassified(); - TaxNode get(int i) { return tree[i]; } + TaxNode get(int i) { return tree[i]; } TaxNode get(string seqName) { return tree[name2Taxonomy[seqName]]; } - int getIndex(string seqName) { return name2Taxonomy[seqName]; } - string getName(int i) { return tree[i].name; } + int getIndex(string seqName) { return name2Taxonomy[seqName]; } + string getName(int i) { return tree[i].name; } + string getFullTaxonomy(string); //pass a sequence name return taxonomy + int getMaxLevel() { return maxLevel; } + private: string getNextTaxon(string&); vector tree; diff --git a/rarefactcommand.cpp b/rarefactcommand.cpp index 4417d5c..f2de77b 100644 --- a/rarefactcommand.cpp +++ b/rarefactcommand.cpp @@ -31,7 +31,7 @@ RareFactCommand::RareFactCommand(string option){ Estimators.clear(); //allow user to run help - if(option == "help") { validCalculator = new ValidCalculators(); help(); abort = true; } + if(option == "help") { validCalculator = new ValidCalculators(); help(); delete validCalculator; abort = true; } else { //valid paramters for this command @@ -49,7 +49,7 @@ RareFactCommand::RareFactCommand(string option){ } //make sure the user has already run the read.otu command - if ((globaldata->getListFile() == "") && (globaldata->getRabundFile() == "") && (globaldata->getSabundFile() == "")) { mothurOut("You must read a list, sabund or rabund before you can use the rarefaction.single command."); mothurOutEndLine(); abort = true; } + if ((globaldata->getSharedFile() == "") && (globaldata->getListFile() == "") && (globaldata->getRabundFile() == "") && (globaldata->getSabundFile() == "")) { mothurOut("You must read a list, sabund, rabund or shared file before you can use the rarefact.single 
command."); mothurOutEndLine(); abort = true; } //check for optional parameter and set defaults // ...at some point should added some additional type checking... @@ -82,43 +82,6 @@ RareFactCommand::RareFactCommand(string option){ temp = validParameter.validFile(parameters, "iters", false); if (temp == "not found") { temp = "1000"; } convert(temp, nIters); - - if (abort == false) { - - string fileNameRoot = getRootName(globaldata->inputFileName); - int i; - validCalculator = new ValidCalculators(); - - - for (i=0; iisValidCalculator("rarefaction", Estimators[i]) == true) { - if (Estimators[i] == "sobs") { - rDisplays.push_back(new RareDisplay(new Sobs(), new ThreeColumnFile(fileNameRoot+"rarefaction"))); - }else if (Estimators[i] == "chao") { - rDisplays.push_back(new RareDisplay(new Chao1(), new ThreeColumnFile(fileNameRoot+"r_chao"))); - }else if (Estimators[i] == "ace") { - if(abund < 5) - abund = 10; - rDisplays.push_back(new RareDisplay(new Ace(abund), new ThreeColumnFile(fileNameRoot+"r_ace"))); - }else if (Estimators[i] == "jack") { - rDisplays.push_back(new RareDisplay(new Jackknife(), new ThreeColumnFile(fileNameRoot+"r_jack"))); - }else if (Estimators[i] == "shannon") { - rDisplays.push_back(new RareDisplay(new Shannon(), new ThreeColumnFile(fileNameRoot+"r_shannon"))); - }else if (Estimators[i] == "npshannon") { - rDisplays.push_back(new RareDisplay(new NPShannon(), new ThreeColumnFile(fileNameRoot+"r_npshannon"))); - }else if (Estimators[i] == "simpson") { - rDisplays.push_back(new RareDisplay(new Simpson(), new ThreeColumnFile(fileNameRoot+"r_simpson"))); - }else if (Estimators[i] == "bootstrap") { - rDisplays.push_back(new RareDisplay(new Bootstrap(), new ThreeColumnFile(fileNameRoot+"r_bootstrap"))); - }else if (Estimators[i] == "coverage") { - rDisplays.push_back(new RareDisplay(new Coverage(), new ThreeColumnFile(fileNameRoot+"r_coverage"))); - }else if (Estimators[i] == "nseqs") { - rDisplays.push_back(new RareDisplay(new NSeqs(), new ThreeColumnFile(fileNameRoot+"r_nseqs"))); - } - } - } - } - } } @@ -150,14 +113,7 @@ void RareFactCommand::help(){ //********************************************************************************************************************** -RareFactCommand::~RareFactCommand(){ - if (abort == false) { - globaldata->gorder = NULL; - delete input; globaldata->ginput = NULL; - delete read; - delete validCalculator; - } -} +RareFactCommand::~RareFactCommand(){} //********************************************************************************************************************** @@ -166,92 +122,205 @@ int RareFactCommand::execute(){ if (abort == true) { return 0; } - //if the users entered no valid calculators don't execute command - if (rDisplays.size() == 0) { return 0; } - - read = new ReadOTUFile(globaldata->inputFileName); - read->read(&*globaldata); - - order = globaldata->gorder; - string lastLabel = order->getLabel(); - input = globaldata->ginput; + if ((globaldata->getFormat() != "sharedfile")) { inputFileNames.push_back(globaldata->inputFileName); } + else { inputFileNames = parseSharedFile(globaldata->getSharedFile()); globaldata->setFormat("rabund"); } - //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label. 
- set processedLabels; - set userLabels = labels; - - //as long as you are not at the end of the file or done wih the lines you want - while((order != NULL) && ((allLines == 1) || (userLabels.size() != 0))) { + for (int p = 0; p < inputFileNames.size(); p++) { - if(allLines == 1 || labels.count(order->getLabel()) == 1){ + string fileNameRoot = getRootName(inputFileNames[p]); + globaldata->inputFileName = inputFileNames[p]; - rCurve = new Rarefact(order, rDisplays); - rCurve->getCurve(freq, nIters); - delete rCurve; + if (inputFileNames.size() > 1) { + mothurOutEndLine(); mothurOut("Processing group " + groups[p]); mothurOutEndLine(); mothurOutEndLine(); + } + int i; + validCalculator = new ValidCalculators(); - mothurOut(order->getLabel()); mothurOutEndLine(); - processedLabels.insert(order->getLabel()); - userLabels.erase(order->getLabel()); + + for (i=0; iisValidCalculator("rarefaction", Estimators[i]) == true) { + if (Estimators[i] == "sobs") { + rDisplays.push_back(new RareDisplay(new Sobs(), new ThreeColumnFile(fileNameRoot+"rarefaction"))); + }else if (Estimators[i] == "chao") { + rDisplays.push_back(new RareDisplay(new Chao1(), new ThreeColumnFile(fileNameRoot+"r_chao"))); + }else if (Estimators[i] == "ace") { + if(abund < 5) + abund = 10; + rDisplays.push_back(new RareDisplay(new Ace(abund), new ThreeColumnFile(fileNameRoot+"r_ace"))); + }else if (Estimators[i] == "jack") { + rDisplays.push_back(new RareDisplay(new Jackknife(), new ThreeColumnFile(fileNameRoot+"r_jack"))); + }else if (Estimators[i] == "shannon") { + rDisplays.push_back(new RareDisplay(new Shannon(), new ThreeColumnFile(fileNameRoot+"r_shannon"))); + }else if (Estimators[i] == "npshannon") { + rDisplays.push_back(new RareDisplay(new NPShannon(), new ThreeColumnFile(fileNameRoot+"r_npshannon"))); + }else if (Estimators[i] == "simpson") { + rDisplays.push_back(new RareDisplay(new Simpson(), new ThreeColumnFile(fileNameRoot+"r_simpson"))); + }else if (Estimators[i] == "bootstrap") { + rDisplays.push_back(new RareDisplay(new Bootstrap(), new ThreeColumnFile(fileNameRoot+"r_bootstrap"))); + }else if (Estimators[i] == "coverage") { + rDisplays.push_back(new RareDisplay(new Coverage(), new ThreeColumnFile(fileNameRoot+"r_coverage"))); + }else if (Estimators[i] == "nseqs") { + rDisplays.push_back(new RareDisplay(new NSeqs(), new ThreeColumnFile(fileNameRoot+"r_nseqs"))); + } + } } - if ((anyLabelsToProcess(order->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) { - string saveLabel = order->getLabel(); + + //if the users entered no valid calculators don't execute command + if (rDisplays.size() == 0) { return 0; } + + read = new ReadOTUFile(globaldata->inputFileName); + read->read(&*globaldata); + + order = globaldata->gorder; + string lastLabel = order->getLabel(); + input = globaldata->ginput; + + //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label. 
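The loop below walks the file once and only runs the calculators for the labels the user asked for; when a requested label (for example 0.06) is not present, the command falls back to the closest smaller label via anyLabelsToProcess and getOrderVector(lastLabel). A rough sketch of that intent as a standalone helper (hypothetical, the real check is anyLabelsToProcess):

    #include <cstdlib>
    #include <set>
    #include <string>
    // pick the largest available label that does not exceed the requested one
    static std::string nextLowestLabel(const std::set<std::string>& present, const std::string& wanted) {
        std::string best = "";
        double w = std::atof(wanted.c_str());
        for (std::set<std::string>::const_iterator it = present.begin(); it != present.end(); ++it) {
            double v = std::atof(it->c_str());                            // "unique" parses as 0
            if (v <= w && (best == "" || v > std::atof(best.c_str()))) { best = *it; }
        }
        return best;  // e.g. {"unique","0.03","0.05","0.10"}, wanted "0.06" -> "0.05"
    }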
+ set processedLabels; + set userLabels = labels; + + //as long as you are not at the end of the file or done wih the lines you want + while((order != NULL) && ((allLines == 1) || (userLabels.size() != 0))) { + + if(allLines == 1 || labels.count(order->getLabel()) == 1){ + + rCurve = new Rarefact(order, rDisplays); + rCurve->getCurve(freq, nIters); + delete rCurve; + + mothurOut(order->getLabel()); mothurOutEndLine(); + processedLabels.insert(order->getLabel()); + userLabels.erase(order->getLabel()); + } + + if ((anyLabelsToProcess(order->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) { + string saveLabel = order->getLabel(); + + delete order; + order = (input->getOrderVector(lastLabel)); + + rCurve = new Rarefact(order, rDisplays); + rCurve->getCurve(freq, nIters); + delete rCurve; + + mothurOut(order->getLabel()); mothurOutEndLine(); + processedLabels.insert(order->getLabel()); + userLabels.erase(order->getLabel()); + + //restore real lastlabel to save below + order->setLabel(saveLabel); + } + + lastLabel = order->getLabel(); delete order; + order = (input->getOrderVector()); + } + + //output error messages about any remaining user labels + set::iterator it; + bool needToRun = false; + for (it = userLabels.begin(); it != userLabels.end(); it++) { + mothurOut("Your file does not include the label " + *it); + if (processedLabels.count(lastLabel) != 1) { + mothurOut(". I will use " + lastLabel + "."); mothurOutEndLine(); + needToRun = true; + }else { + mothurOut(". Please refer to " + lastLabel + "."); mothurOutEndLine(); + } + } + + //run last label if you need to + if (needToRun == true) { + if (order != NULL) { delete order; } order = (input->getOrderVector(lastLabel)); rCurve = new Rarefact(order, rDisplays); rCurve->getCurve(freq, nIters); delete rCurve; - - mothurOut(order->getLabel()); mothurOutEndLine(); - processedLabels.insert(order->getLabel()); - userLabels.erase(order->getLabel()); - //restore real lastlabel to save below - order->setLabel(saveLabel); + mothurOut(order->getLabel()); mothurOutEndLine(); + delete order; } - lastLabel = order->getLabel(); - delete order; - order = (input->getOrderVector()); + for(int i=0;igorder = NULL; + delete input; globaldata->ginput = NULL; + delete read; + delete validCalculator; + } - //output error messages about any remaining user labels - set::iterator it; - bool needToRun = false; - for (it = userLabels.begin(); it != userLabels.end(); it++) { - mothurOut("Your file does not include the label " + *it); - if (processedLabels.count(lastLabel) != 1) { - mothurOut(". I will use " + lastLabel + "."); mothurOutEndLine(); - needToRun = true; - }else { - mothurOut(". 
Please refer to " + lastLabel + "."); mothurOutEndLine(); - } - } + return 0; + } + catch(exception& e) { + errorOut(e, "RareFactCommand", "execute"); + exit(1); + } +} +//********************************************************************************************************************** +vector RareFactCommand::parseSharedFile(string filename) { + try { + vector filenames; + + map filehandles; + map::iterator it3; - //run last label if you need to - if (needToRun == true) { - if (order != NULL) { delete order; } - order = (input->getOrderVector(lastLabel)); - rCurve = new Rarefact(order, rDisplays); - rCurve->getCurve(freq, nIters); - delete rCurve; + //read first line + read = new ReadOTUFile(filename); + read->read(&*globaldata); - mothurOut(order->getLabel()); mothurOutEndLine(); - delete order; + input = globaldata->ginput; + vector lookup = input->getSharedRAbundVectors(); + + string sharedFileRoot = getRootName(filename); + + //clears file before we start to write to it below + for (int i=0; igetGroup() + ".rabund").c_str()); + filenames.push_back((sharedFileRoot + lookup[i]->getGroup() + ".rabund")); } + ofstream* temp; + for (int i=0; igetGroup()] = temp; + groups.push_back(lookup[i]->getGroup()); + } - for(int i=0;igetRAbundVector(); + openOutputFileAppend(sharedFileRoot + lookup[i]->getGroup() + ".rabund", *(filehandles[lookup[i]->getGroup()])); + rav.print(*(filehandles[lookup[i]->getGroup()])); + (*(filehandles[lookup[i]->getGroup()])).close(); + } + + for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; } + lookup = input->getSharedRAbundVectors(); + } + + //free memory + for (it3 = filehandles.begin(); it3 != filehandles.end(); it3++) { + delete it3->second; + } + delete read; + delete input; + globaldata->ginput = NULL; + + return filenames; } catch(exception& e) { - errorOut(e, "RareFactCommand", "execute"); + errorOut(e, "RareFactCommand", "parseSharedFile"); exit(1); } } - //********************************************************************************************************************** + + + diff --git a/rarefactcommand.h b/rarefactcommand.h index 88b0ade..3213af4 100644 --- a/rarefactcommand.h +++ b/rarefactcommand.h @@ -42,6 +42,11 @@ private: set labels; //holds labels to be used string label, calc; vector Estimators; + vector inputFileNames; + vector groups; + + vector parseSharedFile(string); + }; diff --git a/summarycommand.cpp b/summarycommand.cpp index 51fabf8..afb4758 100644 --- a/summarycommand.cpp +++ b/summarycommand.cpp @@ -40,7 +40,7 @@ SummaryCommand::SummaryCommand(string option){ Estimators.clear(); //allow user to run help - if(option == "help") { validCalculator = new ValidCalculators(); help(); abort = true; } + if(option == "help") { validCalculator = new ValidCalculators(); help(); delete validCalculator; abort = true; } else { //valid paramters for this command @@ -58,8 +58,8 @@ SummaryCommand::SummaryCommand(string option){ } //make sure the user has already run the read.otu command - if ((globaldata->getListFile() == "") && (globaldata->getRabundFile() == "") && (globaldata->getSabundFile() == "")) { mothurOut("You must read a list, sabund or rabund before you can use the summary.single command."); mothurOutEndLine(); abort = true; } - + if ((globaldata->getSharedFile() == "") && (globaldata->getListFile() == "") && (globaldata->getRabundFile() == "") && (globaldata->getSabundFile() == "")) { mothurOut("You must read a list, sabund, rabund or shared file before you can use the summary.single command."); mothurOutEndLine(); abort = 
true; } + //check for optional parameter and set defaults // ...at some point should added some additional type checking... label = validParameter.validFile(parameters, "label", false); @@ -89,62 +89,7 @@ SummaryCommand::SummaryCommand(string option){ temp = validParameter.validFile(parameters, "size", false); if (temp == "not found") { temp = "0"; } convert(temp, size); - if (abort == false) { - - validCalculator = new ValidCalculators(); - int i; - - for (i=0; iisValidCalculator("summary", Estimators[i]) == true) { - if(Estimators[i] == "sobs"){ - sumCalculators.push_back(new Sobs()); - }else if(Estimators[i] == "chao"){ - sumCalculators.push_back(new Chao1()); - }else if(Estimators[i] == "coverage"){ - sumCalculators.push_back(new Coverage()); - }else if(Estimators[i] == "geometric"){ - sumCalculators.push_back(new Geom()); - }else if(Estimators[i] == "logseries"){ - sumCalculators.push_back(new LogSD()); - }else if(Estimators[i] == "qstat"){ - sumCalculators.push_back(new QStat()); - }else if(Estimators[i] == "bergerparker"){ - sumCalculators.push_back(new BergerParker()); - }else if(Estimators[i] == "bstick"){ - sumCalculators.push_back(new BStick()); - }else if(Estimators[i] == "ace"){ - if(abund < 5) - abund = 10; - sumCalculators.push_back(new Ace(abund)); - }else if(Estimators[i] == "jack"){ - sumCalculators.push_back(new Jackknife()); - }else if(Estimators[i] == "shannon"){ - sumCalculators.push_back(new Shannon()); - }else if(Estimators[i] == "npshannon"){ - sumCalculators.push_back(new NPShannon()); - }else if(Estimators[i] == "simpson"){ - sumCalculators.push_back(new Simpson()); - }else if(Estimators[i] == "bootstrap"){ - sumCalculators.push_back(new Bootstrap()); - }else if (Estimators[i] == "nseqs") { - sumCalculators.push_back(new NSeqs()); - }else if (Estimators[i] == "goodscoverage") { - sumCalculators.push_back(new GoodsCoverage()); - }else if (Estimators[i] == "efron") { - sumCalculators.push_back(new Efron(size)); - }else if (Estimators[i] == "boneh") { - sumCalculators.push_back(new Boneh(size)); - }else if (Estimators[i] == "solow") { - sumCalculators.push_back(new Solow(size)); - }else if (Estimators[i] == "shen") { - sumCalculators.push_back(new Shen(size, abund)); - } - } - } - } } - - } catch(exception& e) { errorOut(e, "SummaryCommand", "SummaryCommand"); @@ -174,14 +119,7 @@ void SummaryCommand::help(){ //********************************************************************************************************************** -SummaryCommand::~SummaryCommand(){ - if (abort == false) { - delete input; globaldata->ginput = NULL; - delete read; - delete validCalculator; - globaldata->sabund = NULL; - } -} +SummaryCommand::~SummaryCommand(){} //********************************************************************************************************************** @@ -190,61 +128,161 @@ int SummaryCommand::execute(){ if (abort == true) { return 0; } - //if the users entered no valid calculators don't execute command - if (sumCalculators.size() == 0) { return 0; } - - outputFileName = ((getRootName(globaldata->inputFileName)) + "summary"); - openOutputFile(outputFileName, outputFileHandle); - outputFileHandle << "label"; - - read = new ReadOTUFile(globaldata->inputFileName); - read->read(&*globaldata); - - sabund = globaldata->sabund; - string lastLabel = sabund->getLabel(); - input = globaldata->ginput; + if ((globaldata->getFormat() != "sharedfile")) { inputFileNames.push_back(globaldata->inputFileName); } + else { inputFileNames = 
parseSharedFile(globaldata->getSharedFile()); globaldata->setFormat("rabund"); } - for(int i=0;igetCols() == 1){ - outputFileHandle << '\t' << sumCalculators[i]->getName(); + for (int p = 0; p < inputFileNames.size(); p++) { + + string fileNameRoot = getRootName(inputFileNames[p]) + "summary"; + globaldata->inputFileName = inputFileNames[p]; + + if (inputFileNames.size() > 1) { + mothurOutEndLine(); mothurOut("Processing group " + groups[p]); mothurOutEndLine(); mothurOutEndLine(); } - else{ - outputFileHandle << '\t' << sumCalculators[i]->getName() << "\t" << sumCalculators[i]->getName() << "_lci\t" << sumCalculators[i]->getName() << "_hci"; + + + validCalculator = new ValidCalculators(); + + for (int i=0; iisValidCalculator("summary", Estimators[i]) == true) { + if(Estimators[i] == "sobs"){ + sumCalculators.push_back(new Sobs()); + }else if(Estimators[i] == "chao"){ + sumCalculators.push_back(new Chao1()); + }else if(Estimators[i] == "coverage"){ + sumCalculators.push_back(new Coverage()); + }else if(Estimators[i] == "geometric"){ + sumCalculators.push_back(new Geom()); + }else if(Estimators[i] == "logseries"){ + sumCalculators.push_back(new LogSD()); + }else if(Estimators[i] == "qstat"){ + sumCalculators.push_back(new QStat()); + }else if(Estimators[i] == "bergerparker"){ + sumCalculators.push_back(new BergerParker()); + }else if(Estimators[i] == "bstick"){ + sumCalculators.push_back(new BStick()); + }else if(Estimators[i] == "ace"){ + if(abund < 5) + abund = 10; + sumCalculators.push_back(new Ace(abund)); + }else if(Estimators[i] == "jack"){ + sumCalculators.push_back(new Jackknife()); + }else if(Estimators[i] == "shannon"){ + sumCalculators.push_back(new Shannon()); + }else if(Estimators[i] == "npshannon"){ + sumCalculators.push_back(new NPShannon()); + }else if(Estimators[i] == "simpson"){ + sumCalculators.push_back(new Simpson()); + }else if(Estimators[i] == "bootstrap"){ + sumCalculators.push_back(new Bootstrap()); + }else if (Estimators[i] == "nseqs") { + sumCalculators.push_back(new NSeqs()); + }else if (Estimators[i] == "goodscoverage") { + sumCalculators.push_back(new GoodsCoverage()); + }else if (Estimators[i] == "efron") { + sumCalculators.push_back(new Efron(size)); + }else if (Estimators[i] == "boneh") { + sumCalculators.push_back(new Boneh(size)); + }else if (Estimators[i] == "solow") { + sumCalculators.push_back(new Solow(size)); + }else if (Estimators[i] == "shen") { + sumCalculators.push_back(new Shen(size, abund)); + } + } } - } - outputFileHandle << endl; - - //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label. 
- set processedLabels; - set userLabels = labels; - while((sabund != NULL) && ((allLines == 1) || (userLabels.size() != 0))) { + //if the users entered no valid calculators don't execute command + if (sumCalculators.size() == 0) { return 0; } - if(allLines == 1 || labels.count(sabund->getLabel()) == 1){ - - mothurOut(sabund->getLabel()); mothurOutEndLine(); - processedLabels.insert(sabund->getLabel()); - userLabels.erase(sabund->getLabel()); - - outputFileHandle << sabund->getLabel(); - for(int i=0;i data = sumCalculators[i]->getValues(sabund); - outputFileHandle << '\t'; - sumCalculators[i]->print(outputFileHandle); + ofstream outputFileHandle; + openOutputFile(fileNameRoot, outputFileHandle); + outputFileHandle << "label"; + + read = new ReadOTUFile(globaldata->inputFileName); + read->read(&*globaldata); + + sabund = globaldata->sabund; + string lastLabel = sabund->getLabel(); + input = globaldata->ginput; + + for(int i=0;igetCols() == 1){ + outputFileHandle << '\t' << sumCalculators[i]->getName(); + } + else{ + outputFileHandle << '\t' << sumCalculators[i]->getName() << "\t" << sumCalculators[i]->getName() << "_lci\t" << sumCalculators[i]->getName() << "_hci"; } - outputFileHandle << endl; } + outputFileHandle << endl; + + //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label. + set processedLabels; + set userLabels = labels; - if ((anyLabelsToProcess(sabund->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) { - string saveLabel = sabund->getLabel(); + while((sabund != NULL) && ((allLines == 1) || (userLabels.size() != 0))) { + + if(allLines == 1 || labels.count(sabund->getLabel()) == 1){ + + mothurOut(sabund->getLabel()); mothurOutEndLine(); + processedLabels.insert(sabund->getLabel()); + userLabels.erase(sabund->getLabel()); + + outputFileHandle << sabund->getLabel(); + for(int i=0;i data = sumCalculators[i]->getValues(sabund); + outputFileHandle << '\t'; + sumCalculators[i]->print(outputFileHandle); + } + outputFileHandle << endl; + } + + if ((anyLabelsToProcess(sabund->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) { + string saveLabel = sabund->getLabel(); + + delete sabund; + sabund = input->getSAbundVector(lastLabel); + + mothurOut(sabund->getLabel()); mothurOutEndLine(); + processedLabels.insert(sabund->getLabel()); + userLabels.erase(sabund->getLabel()); + + outputFileHandle << sabund->getLabel(); + for(int i=0;i data = sumCalculators[i]->getValues(sabund); + outputFileHandle << '\t'; + sumCalculators[i]->print(outputFileHandle); + } + outputFileHandle << endl; + + //restore real lastlabel to save below + sabund->setLabel(saveLabel); + } + + lastLabel = sabund->getLabel(); delete sabund; + sabund = input->getSAbundVector(); + } + + //output error messages about any remaining user labels + set::iterator it; + bool needToRun = false; + for (it = userLabels.begin(); it != userLabels.end(); it++) { + mothurOut("Your file does not include the label " + *it); + if (processedLabels.count(lastLabel) != 1) { + mothurOut(". I will use " + lastLabel + "."); mothurOutEndLine(); + needToRun = true; + }else { + mothurOut(". 
Please refer to " + lastLabel + "."); mothurOutEndLine(); + } + } + + //run last label if you need to + if (needToRun == true) { + if (sabund != NULL) { delete sabund; } sabund = input->getSAbundVector(lastLabel); mothurOut(sabund->getLabel()); mothurOutEndLine(); - processedLabels.insert(sabund->getLabel()); - userLabels.erase(sabund->getLabel()); - outputFileHandle << sabund->getLabel(); for(int i=0;i data = sumCalculators[i]->getValues(sabund); @@ -252,54 +290,81 @@ int SummaryCommand::execute(){ sumCalculators[i]->print(outputFileHandle); } outputFileHandle << endl; + delete sabund; + } + + outputFileHandle.close(); + + delete input; globaldata->ginput = NULL; + delete read; + delete validCalculator; + globaldata->sabund = NULL; + } + + return 0; + } + catch(exception& e) { + errorOut(e, "SummaryCommand", "execute"); + exit(1); + } +} +//********************************************************************************************************************** +vector SummaryCommand::parseSharedFile(string filename) { + try { + vector filenames; + + map filehandles; + map::iterator it3; + - //restore real lastlabel to save below - sabund->setLabel(saveLabel); - } - - lastLabel = sabund->getLabel(); + //read first line + read = new ReadOTUFile(filename); + read->read(&*globaldata); - delete sabund; - sabund = input->getSAbundVector(); + input = globaldata->ginput; + vector lookup = input->getSharedRAbundVectors(); + + string sharedFileRoot = getRootName(filename); + + //clears file before we start to write to it below + for (int i=0; igetGroup() + ".rabund").c_str()); + filenames.push_back((sharedFileRoot + lookup[i]->getGroup() + ".rabund")); } - //output error messages about any remaining user labels - set::iterator it; - bool needToRun = false; - for (it = userLabels.begin(); it != userLabels.end(); it++) { - mothurOut("Your file does not include the label " + *it); - if (processedLabels.count(lastLabel) != 1) { - mothurOut(". I will use " + lastLabel + "."); mothurOutEndLine(); - needToRun = true; - }else { - mothurOut(". 
Please refer to " + lastLabel + "."); mothurOutEndLine(); - } + ofstream* temp; + for (int i=0; igetGroup()] = temp; + groups.push_back(lookup[i]->getGroup()); } + + while(lookup[0] != NULL) { - //run last label if you need to - if (needToRun == true) { - if (sabund != NULL) { delete sabund; } - sabund = input->getSAbundVector(lastLabel); - - mothurOut(sabund->getLabel()); mothurOutEndLine(); - outputFileHandle << sabund->getLabel(); - for(int i=0;i data = sumCalculators[i]->getValues(sabund); - outputFileHandle << '\t'; - sumCalculators[i]->print(outputFileHandle); + for (int i = 0; i < lookup.size(); i++) { + RAbundVector rav = lookup[i]->getRAbundVector(); + openOutputFileAppend(sharedFileRoot + lookup[i]->getGroup() + ".rabund", *(filehandles[lookup[i]->getGroup()])); + rav.print(*(filehandles[lookup[i]->getGroup()])); + (*(filehandles[lookup[i]->getGroup()])).close(); } - outputFileHandle << endl; - delete sabund; - } - outputFileHandle.close(); + for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; } + lookup = input->getSharedRAbundVectors(); + } - return 0; + //free memory + for (it3 = filehandles.begin(); it3 != filehandles.end(); it3++) { + delete it3->second; + } + delete read; + delete input; + globaldata->ginput = NULL; + + return filenames; } catch(exception& e) { - errorOut(e, "SummaryCommand", "execute"); + errorOut(e, "SummaryCommand", "parseSharedFile"); exit(1); } } - //********************************************************************************************************************** diff --git a/summarycommand.h b/summarycommand.h index 6ee8c4a..1258c65 100644 --- a/summarycommand.h +++ b/summarycommand.h @@ -33,14 +33,17 @@ private: InputData* input; ValidCalculators* validCalculator; SAbundVector* sabund; - string outputFileName; - ofstream outputFileHandle; int abund, size; bool abort, allLines; set labels; //holds labels to be used string label, calc; vector Estimators; + vector inputFileNames; + vector groups; + + vector parseSharedFile(string); + }; #endif diff --git a/unifracunweightedcommand.cpp b/unifracunweightedcommand.cpp index b551e5f..1512284 100644 --- a/unifracunweightedcommand.cpp +++ b/unifracunweightedcommand.cpp @@ -298,7 +298,11 @@ void UnifracUnweightedCommand::createPhylipFile(int i) { //output to file for (int r=0; rGroups.size(); r++) { //output name - out << globaldata->Groups[r] << '\t'; + string name = globaldata->Groups[r]; + if (name.length() < 10) { //pad with spaces to make compatible + while (name.length() < 10) { name += " "; } + } + out << name << '\t'; //output distances for (int l = 0; l < r; l++) { out << dists[r][l] << '\t'; } diff --git a/unifracweightedcommand.cpp b/unifracweightedcommand.cpp index f9cdd5a..ea79475 100644 --- a/unifracweightedcommand.cpp +++ b/unifracweightedcommand.cpp @@ -308,7 +308,11 @@ void UnifracWeightedCommand::createPhylipFile() { //output to file for (int r=0; rGroups.size(); r++) { //output name - out << globaldata->Groups[r] << '\t'; + string name = globaldata->Groups[r]; + if (name.length() < 10) { //pad with spaces to make compatible + while (name.length() < 10) { name += " "; } + } + out << name << '\t'; //output distances for (int l = 0; l < r; l++) { out << dists[r][l] << '\t'; }
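Both unifrac commands now pad short group names with trailing spaces out to 10 characters before writing the phylip-formatted distance matrix, so the name column matches the fixed-width field that phylip-style parsers traditionally expect. The same padding as a standalone sketch (hypothetical helper name):

    #include <string>
    // pad a group name to the 10-character name field of a phylip-formatted matrix
    static std::string padGroupName(std::string name) {
        while (name.length() < 10) { name += " "; }   // shorter names get trailing spaces
        return name;                                  // names of 10 or more characters are written unchanged
    }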