}
else if (templateFileName == "not open") { abort = true; }
- candidateFileName = validParameter.validFile(parameters, "candidate", true);
- if (candidateFileName == "not found") {
- mothurOut("candidate is a required parameter for the align.seqs command.");
- mothurOutEndLine();
- abort = true;
+ candidateFileName = validParameter.validFile(parameters, "candidate", false);
+ if (candidateFileName == "not found") { mothurOut("candidate is a required parameter for the align.seqs command."); mothurOutEndLine(); abort = true; }
+ else {
+ splitAtDash(candidateFileName, candidateFileNames);
+
+ //go through files and make sure they are good, if not, then disregard them
+ for (int i = 0; i < candidateFileNames.size(); i++) {
+ int ableToOpen;
+ ifstream in;
+ ableToOpen = openInputFile(candidateFileNames[i], in);
+ if (ableToOpen == 1) {
+ mothurOut(candidateFileNames[i] + " will be disregarded."); mothurOutEndLine();
+ //erase from file list
+ candidateFileNames.erase(candidateFileNames.begin()+i);
+ i--;
+ }
+ in.close();
+ }
+
+ //make sure there is at least one valid file left
+ if (candidateFileNames.size() == 0) { mothurOut("no valid files."); mothurOutEndLine(); abort = true; }
}
- else if (candidateFileName == "not open") { abort = true; }
+
//check for optional parameter and set defaults
// ...at some point should added some additional type checking...
try {
mothurOut("The align.seqs command reads a file containing sequences and creates an alignment file and a report file.\n");
mothurOut("The align.seqs command parameters are template, candidate, search, ksize, align, match, mismatch, gapopen and gapextend.\n");
- mothurOut("The template and candidate parameters are required.\n");
+		mothurOut("The template and candidate parameters are required. You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amazon.fasta \n");
mothurOut("The search parameter allows you to specify the method to find most similar template. Your options are: suffix, kmer and blast. The default is kmer.\n");
mothurOut("The align parameter allows you to specify the alignment method to use. Your options are: gotoh, needleman, blast and noalign. The default is needleman.\n");
mothurOut("The ksize parameter allows you to specify the kmer size for finding most similar template to candidate. The default is 8.\n");
mothurOutEndLine();
alignment = new NeedlemanOverlap(gapOpen, match, misMatch, longestBase);
}
- mothurOut("Aligning sequences...");
- mothurOutEndLine();
-
- string alignFileName = candidateFileName.substr(0,candidateFileName.find_last_of(".")+1) + "align";
- string reportFileName = candidateFileName.substr(0,candidateFileName.find_last_of(".")+1) + "align.report";
- string accnosFileName = candidateFileName.substr(0,candidateFileName.find_last_of(".")+1) + "flip.accnos";
-
- int numFastaSeqs = 0;
- int start = time(NULL);
+ for (int s = 0; s < candidateFileNames.size(); s++) {
+ mothurOut("Aligning sequences from " + candidateFileNames[s] + " ..." ); mothurOutEndLine();
+
+ string alignFileName = candidateFileNames[s].substr(0,candidateFileNames[s].find_last_of(".")+1) + "align";
+ string reportFileName = candidateFileNames[s].substr(0,candidateFileNames[s].find_last_of(".")+1) + "align.report";
+ string accnosFileName = candidateFileNames[s].substr(0,candidateFileNames[s].find_last_of(".")+1) + "flip.accnos";
+
+ int numFastaSeqs = 0;
+ for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear();
+ int start = time(NULL);
+
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
- if(processors == 1){
+ if(processors == 1){
+ ifstream inFASTA;
+ openInputFile(candidateFileNames[s], inFASTA);
+ numFastaSeqs=count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
+ inFASTA.close();
+
+ lines.push_back(new linePair(0, numFastaSeqs));
+
+ driver(lines[0], alignFileName, reportFileName, accnosFileName, candidateFileNames[s]);
+
+ //delete accnos file if its blank else report to user
+ if (isBlank(accnosFileName)) { remove(accnosFileName.c_str()); }
+ else {
+				mothurOut("Some of your sequences generated alignments that eliminated too many bases, a list is provided in " + accnosFileName + ".");
+				if (!flip) {
+					mothurOut(" If you set the flip parameter to true mothur will try aligning the reverse complement as well.");
+				}else{ mothurOut(" If the reverse complement proved to be better it was reported."); }
+				mothurOutEndLine();
+ }
+ }
+ else{
+ vector<int> positions;
+ processIDS.resize(0);
+
+ ifstream inFASTA;
+ openInputFile(candidateFileNames[s], inFASTA);
+
+ string input;
+ while(!inFASTA.eof()){
+ input = getline(inFASTA);
+ if (input.length() != 0) {
+ if(input[0] == '>'){ long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); }
+ }
+ }
+ inFASTA.close();
+
+ numFastaSeqs = positions.size();
+
+ int numSeqsPerProcessor = numFastaSeqs / processors;
+
+ for (int i = 0; i < processors; i++) {
+ long int startPos = positions[ i * numSeqsPerProcessor ];
+ if(i == processors - 1){
+ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor;
+ }
+ lines.push_back(new linePair(startPos, numSeqsPerProcessor));
+ }
+
+ createProcesses(alignFileName, reportFileName, accnosFileName, candidateFileNames[s]);
+
+ rename((alignFileName + toString(processIDS[0]) + ".temp").c_str(), alignFileName.c_str());
+ rename((reportFileName + toString(processIDS[0]) + ".temp").c_str(), reportFileName.c_str());
+
+ //append alignment and report files
+ for(int i=1;i<processors;i++){
+ appendAlignFiles((alignFileName + toString(processIDS[i]) + ".temp"), alignFileName);
+ remove((alignFileName + toString(processIDS[i]) + ".temp").c_str());
+
+ appendReportFiles((reportFileName + toString(processIDS[i]) + ".temp"), reportFileName);
+ remove((reportFileName + toString(processIDS[i]) + ".temp").c_str());
+ }
+
+ vector<string> nonBlankAccnosFiles;
+ //delete blank accnos files generated with multiple processes
+ for(int i=0;i<processors;i++){
+ if (!(isBlank(accnosFileName + toString(processIDS[i]) + ".temp"))) {
+ nonBlankAccnosFiles.push_back(accnosFileName + toString(processIDS[i]) + ".temp");
+ }else { remove((accnosFileName + toString(processIDS[i]) + ".temp").c_str()); }
+ }
+
+ //append accnos files
+ if (nonBlankAccnosFiles.size() != 0) {
+ rename(nonBlankAccnosFiles[0].c_str(), accnosFileName.c_str());
+
+ for (int h=1; h < nonBlankAccnosFiles.size(); h++) {
+ appendAlignFiles(nonBlankAccnosFiles[h], accnosFileName);
+ remove(nonBlankAccnosFiles[h].c_str());
+ }
+				mothurOut("Some of your sequences generated alignments that eliminated too many bases, a list is provided in " + accnosFileName + ".");
+				if (!flip) {
+					mothurOut(" If you set the flip parameter to true mothur will try aligning the reverse complement as well.");
+				}else{ mothurOut(" If the reverse complement proved to be better it was reported."); }
+				mothurOutEndLine();
+ }
+ }
+#else
ifstream inFASTA;
- openInputFile(candidateFileName, inFASTA);
+		openInputFile(candidateFileNames[s], inFASTA);
numFastaSeqs=count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
inFASTA.close();
lines.push_back(new linePair(0, numFastaSeqs));
-
+
-		driver(lines[0], alignFileName, reportFileName, accnosFileName);
+		driver(lines[0], alignFileName, reportFileName, accnosFileName, candidateFileNames[s]);
//delete accnos file if its blank else report to user
}else{ mothurOut(" If the reverse compliment proved to be better it was reported."); }
mothurOutEndLine();
}
- }
- else{
- vector<int> positions;
- processIDS.resize(0);
-
- ifstream inFASTA;
- openInputFile(candidateFileName, inFASTA);
- string input;
- while(!inFASTA.eof()){
- input = getline(inFASTA);
- if (input.length() != 0) {
- if(input[0] == '>'){ long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); }
- }
- }
- inFASTA.close();
-
- numFastaSeqs = positions.size();
-
- int numSeqsPerProcessor = numFastaSeqs / processors;
-
- for (int i = 0; i < processors; i++) {
- long int startPos = positions[ i * numSeqsPerProcessor ];
- if(i == processors - 1){
- numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor;
- }
- lines.push_back(new linePair(startPos, numSeqsPerProcessor));
- }
-
- createProcesses(alignFileName, reportFileName, accnosFileName);
-
- rename((alignFileName + toString(processIDS[0]) + ".temp").c_str(), alignFileName.c_str());
- rename((reportFileName + toString(processIDS[0]) + ".temp").c_str(), reportFileName.c_str());
+#endif
- //append alignment and report files
- for(int i=1;i<processors;i++){
- appendAlignFiles((alignFileName + toString(processIDS[i]) + ".temp"), alignFileName);
- remove((alignFileName + toString(processIDS[i]) + ".temp").c_str());
-
- appendReportFiles((reportFileName + toString(processIDS[i]) + ".temp"), reportFileName);
- remove((reportFileName + toString(processIDS[i]) + ".temp").c_str());
- }
- vector<string> nonBlankAccnosFiles;
- //delete blank accnos files generated with multiple processes
- for(int i=0;i<processors;i++){
- if (!(isBlank(accnosFileName + toString(processIDS[i]) + ".temp"))) {
- nonBlankAccnosFiles.push_back(accnosFileName + toString(processIDS[i]) + ".temp");
- }else { remove((accnosFileName + toString(processIDS[i]) + ".temp").c_str()); }
- }
- //append accnos files
- if (nonBlankAccnosFiles.size() != 0) {
- rename(nonBlankAccnosFiles[0].c_str(), accnosFileName.c_str());
-
- for (int h=1; h < nonBlankAccnosFiles.size(); h++) {
- appendAlignFiles(nonBlankAccnosFiles[h], accnosFileName);
- remove(nonBlankAccnosFiles[h].c_str());
- }
- mothurOut("Some of you sequences generated alignments that eliminated too many bases, a list is provided in " + accnosFileName + ".");
- if (!flip) {
- mothurOut(" If you set the flip parameter to true mothur will try aligning the reverse compliment as well.");
- }else{ mothurOut(" If the reverse compliment proved to be better it was reported."); }
- mothurOutEndLine();
- }
- }
-#else
- ifstream inFASTA;
- openInputFile(candidateFileName, inFASTA);
- numFastaSeqs=count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
- inFASTA.close();
-
- lines.push_back(new linePair(0, numFastaSeqs));
-
- driver(lines[0], alignFileName, reportFileName, accnosFileName);
-
- //delete accnos file if its blank else report to user
- if (isBlank(accnosFileName)) { remove(accnosFileName.c_str()); }
- else {
- mothurOut("Some of you sequences generated alignments that eliminated too many bases, a list is provided in " + accnosFileName + ".");
- if (!flip) {
- mothurOut(" If you set the flip parameter to true mothur will try aligning the reverse compliment as well.");
- }else{ mothurOut(" If the reverse compliment proved to be better it was reported."); }
+ mothurOut("It took " + toString(time(NULL) - start) + " secs to align " + toString(numFastaSeqs) + " sequences.");
+ mothurOutEndLine();
mothurOutEndLine();
}
-#endif
-
-
-
- mothurOut("It took " + toString(time(NULL) - start) + " secs to align " + toString(numFastaSeqs) + " sequences.");
- mothurOutEndLine();
- mothurOutEndLine();
-
return 0;
}
catch(exception& e) {
//**********************************************************************************************************************
-int AlignCommand::driver(linePair* line, string alignFName, string reportFName, string accnosFName){
+int AlignCommand::driver(linePair* line, string alignFName, string reportFName, string accnosFName, string filename){
try {
ofstream alignmentFile;
openOutputFile(alignFName, alignmentFile);
NastReport report(reportFName);
ifstream inFASTA;
- openInputFile(candidateFileName, inFASTA);
+ openInputFile(filename, inFASTA);
inFASTA.seekg(line->start);
for(int i=0;i<line->numSeqs;i++){
-
+
Sequence* candidateSeq = new Sequence(inFASTA); gobble(inFASTA);
int origNumBases = candidateSeq->getNumBases();
string originalUnaligned = candidateSeq->getUnaligned();
//report progress
if((i+1) % 100 == 0){ mothurOut(toString(i+1)); mothurOutEndLine(); }
-
}
//report progress
if((line->numSeqs) % 100 != 0){ mothurOut(toString(line->numSeqs)); mothurOutEndLine(); }
/**************************************************************************************************/
-void AlignCommand::createProcesses(string alignFileName, string reportFileName, string accnosFName) {
+void AlignCommand::createProcesses(string alignFileName, string reportFileName, string accnosFName, string filename) {
try {
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
int process = 0;
processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
process++;
}else if (pid == 0){
- driver(lines[process], alignFileName + toString(getpid()) + ".temp", reportFileName + toString(getpid()) + ".temp", accnosFName + toString(getpid()) + ".temp");
+ driver(lines[process], alignFileName + toString(getpid()) + ".temp", reportFileName + toString(getpid()) + ".temp", accnosFName + toString(getpid()) + ".temp", filename);
exit(0);
}else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
}
AlignmentDB* templateDB;
Alignment* alignment;
- int driver(linePair*, string, string, string);
- void createProcesses(string, string, string);
+ int driver(linePair*, string, string, string, string);
+ void createProcesses(string, string, string, string);
void appendAlignFiles(string, string);
void appendReportFiles(string, string);
string candidateFileName, templateFileName, distanceFileName, search, align;
float match, misMatch, gapOpen, gapExtend, threshold;
int processors, kmerSize;
+ vector<string> candidateFileNames;
bool abort, flip;
};
mothurOut("It took " + toString(time(NULL) - start) + " seconds get probabilities. "); mothurOutEndLine();
}
catch(exception& e) {
- errorOut(e, "Bayesian", "getTaxonomy");
+ errorOut(e, "Bayesian", "Bayesian");
exit(1);
}
}
~Bayesian() {};
string getTaxonomy(Sequence*);
- //map<string, int> getConfidenceScores() { return taxConfidenceScore; }
private:
- //map<string, vector<double> > taxonomyProbability; //probability of a word being in a given taxonomy.
- //taxonomyProbability[bacteria;][0] = probabtility that a sequence within bacteria; would contain kmer 0;
- //taxonomyProbability[bacteria;][1] = probabtility that a sequence within bacteria; would contain kmer 1...
-
vector< vector<float> > wordGenusProb; //vector of maps from genus to probability
//wordGenusProb[0][392] = probability that a sequence within genus that's index in the tree is 392 would contain kmer 0;
string bootstrapResults(vector<int>, int, int);
int getMostProbableTaxonomy(vector<int>);
void readProbFile(ifstream&, ifstream&);
- //map<string, int> parseTaxMap(string);
};
#include "blastdb.hpp"
/**************************************************************************************************/
-
Classify::Classify(string tfile, string tempFile, string method, int kmerSize, float gapOpen, float gapExtend, float match, float misMatch) : taxFile(tfile), templateFile(tempFile) {
- try {
+ try {
+
readTaxonomy(taxFile);
int start = time(NULL);
taxonomy[name] = taxInfo;
- //itTax = taxList.find(taxInfo);
- //if (itTax == taxList.end()) { //this is new taxonomy
- //taxList[taxInfo] = 1;
- //}else { taxList[taxInfo]++; }
phyloTree->addSeqToTree(name, taxInfo);
gobble(inTax);
string taxFile, templateFile, simpleTax;
vector<string> names;
- //map<string, int> taxConfidenceScore;
void readTaxonomy(string);
vector<string> parseTax(string);
else {
//valid paramters for this command
- string AlignArray[] = {"template","fasta","search","ksize","method","processors","taxonomy","match","mismatch","gapopen","gapextend","numwanted","cutoff","probs","iters"};
+ string AlignArray[] = {"template","fasta","name","search","ksize","method","processors","taxonomy","match","mismatch","gapopen","gapextend","numwanted","cutoff","probs","iters"};
vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
OptionParser parser(option);
}
else if (templateFileName == "not open") { abort = true; }
- fastaFileName = validParameter.validFile(parameters, "fasta", true);
- if (fastaFileName == "not found") {
- mothurOut("fasta is a required parameter for the classify.seqs command.");
- mothurOutEndLine();
- abort = true;
+ fastaFileName = validParameter.validFile(parameters, "fasta", false);
+ if (fastaFileName == "not found") { mothurOut("fasta is a required parameter for the classify.seqs command."); mothurOutEndLine(); abort = true; }
+ else {
+ splitAtDash(fastaFileName, fastaFileNames);
+
+ //go through files and make sure they are good, if not, then disregard them
+ for (int i = 0; i < fastaFileNames.size(); i++) {
+ int ableToOpen;
+ ifstream in;
+ ableToOpen = openInputFile(fastaFileNames[i], in);
+ if (ableToOpen == 1) {
+ mothurOut(fastaFileNames[i] + " will be disregarded."); mothurOutEndLine();
+ //erase from file list
+ fastaFileNames.erase(fastaFileNames.begin()+i);
+ i--;
+ }
+ in.close();
+ }
+
+ //make sure there is at least one valid file left
+ if (fastaFileNames.size() == 0) { mothurOut("no valid files."); mothurOutEndLine(); abort = true; }
}
- else if (fastaFileName == "not open") { abort = true; }
+
taxonomyFileName = validParameter.validFile(parameters, "taxonomy", true);
if (taxonomyFileName == "not found") {
abort = true;
}
else if (taxonomyFileName == "not open") { abort = true; }
+
+
+ namefile = validParameter.validFile(parameters, "name", false);
+	if (namefile == "not found") { namefile = ""; }
+ else {
+ splitAtDash(namefile, namefileNames);
+
+ //go through files and make sure they are good, if not, then disregard them
+ for (int i = 0; i < namefileNames.size(); i++) {
+ int ableToOpen;
+ ifstream in;
+ ableToOpen = openInputFile(namefileNames[i], in);
+ if (ableToOpen == 1) { mothurOut("Unable to match name file with fasta file."); mothurOutEndLine(); abort = true; }
+ in.close();
+ }
+ }
+ if (namefile != "") {
+ if (namefileNames.size() != fastaFileNames.size()) { abort = true; mothurOut("If you provide a name file, you must have one for each fasta file."); mothurOutEndLine(); }
+ }
//check for optional parameter and set defaults
// ...at some point should added some additional type checking...
try {
mothurOut("The classify.seqs command reads a fasta file containing sequences and creates a .taxonomy file and a .tax.summary file.\n");
mothurOut("The classify.seqs command parameters are template, fasta, search, ksize, method, taxonomy, processors, match, mismatch, gapopen, gapextend, numwanted and probs.\n");
- mothurOut("The template, fasta and taxonomy parameters are required.\n");
+		mothurOut("The template, fasta and taxonomy parameters are required. You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amazon.fasta \n");
mothurOut("The search parameter allows you to specify the method to find most similar template. Your options are: suffix, kmer and blast. The default is kmer.\n");
mothurOut("The method parameter allows you to specify classification method to use. Your options are: bayesian and knn. The default is bayesian.\n");
mothurOut("The ksize parameter allows you to specify the kmer size for finding most similar template to candidate. The default is 8.\n");
classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters);
}
- int numFastaSeqs = 0;
+
+ for (int s = 0; s < fastaFileNames.size(); s++) {
- string newTaxonomyFile = getRootName(fastaFileName) + getRootName(taxonomyFileName) + "taxonomy";
- string tempTaxonomyFile = getRootName(fastaFileName) + "taxonomy.temp";
- string taxSummary = getRootName(fastaFileName) + getRootName(taxonomyFileName) + "tax.summary";
+ //read namefile
+ if(namefile != "") {
+ nameMap.clear(); //remove old names
+
+ ifstream inNames;
+ openInputFile(namefileNames[s], inNames);
+
+ string firstCol, secondCol;
+ while(!inNames.eof()) {
+ inNames >> firstCol >> secondCol; gobble(inNames);
+ nameMap[firstCol] = getNumNames(secondCol); //ex. seq1 seq1,seq3,seq5 -> seq1 = 3.
+ }
+ inNames.close();
+ }
- int start = time(NULL);
+ mothurOut("Classifying sequences from " + fastaFileNames[s] + " ..." ); mothurOutEndLine();
+ string newTaxonomyFile = getRootName(fastaFileNames[s]) + getRootName(taxonomyFileName) + "taxonomy";
+ string tempTaxonomyFile = getRootName(fastaFileNames[s]) + "taxonomy.temp";
+ string taxSummary = getRootName(fastaFileNames[s]) + getRootName(taxonomyFileName) + "tax.summary";
+
+ int start = time(NULL);
+ int numFastaSeqs = 0;
+ for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear();
+
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
- if(processors == 1){
+ if(processors == 1){
+ ifstream inFASTA;
+ openInputFile(fastaFileNames[s], inFASTA);
+ numFastaSeqs=count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
+ inFASTA.close();
+
+ lines.push_back(new linePair(0, numFastaSeqs));
+
+ driver(lines[0], newTaxonomyFile, tempTaxonomyFile, fastaFileNames[s]);
+ }
+ else{
+ vector<int> positions;
+ processIDS.resize(0);
+
+ ifstream inFASTA;
+ openInputFile(fastaFileNames[s], inFASTA);
+
+ string input;
+ while(!inFASTA.eof()){
+ input = getline(inFASTA);
+ if (input.length() != 0) {
+ if(input[0] == '>'){ int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); }
+ }
+ }
+ inFASTA.close();
+
+ numFastaSeqs = positions.size();
+
+ int numSeqsPerProcessor = numFastaSeqs / processors;
+
+ for (int i = 0; i < processors; i++) {
+ int startPos = positions[ i * numSeqsPerProcessor ];
+ if(i == processors - 1){
+ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor;
+ }
+ lines.push_back(new linePair(startPos, numSeqsPerProcessor));
+ }
+ createProcesses(newTaxonomyFile, tempTaxonomyFile, fastaFileNames[s]);
+
+ rename((newTaxonomyFile + toString(processIDS[0]) + ".temp").c_str(), newTaxonomyFile.c_str());
+ rename((tempTaxonomyFile + toString(processIDS[0]) + ".temp").c_str(), tempTaxonomyFile.c_str());
+
+ for(int i=1;i<processors;i++){
+ appendTaxFiles((newTaxonomyFile + toString(processIDS[i]) + ".temp"), newTaxonomyFile);
+ appendTaxFiles((tempTaxonomyFile + toString(processIDS[i]) + ".temp"), tempTaxonomyFile);
+ remove((newTaxonomyFile + toString(processIDS[i]) + ".temp").c_str());
+ remove((tempTaxonomyFile + toString(processIDS[i]) + ".temp").c_str());
+ }
+
+ }
+#else
ifstream inFASTA;
- openInputFile(fastaFileName, inFASTA);
+ openInputFile(fastaFileNames[s], inFASTA);
numFastaSeqs=count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
inFASTA.close();
lines.push_back(new linePair(0, numFastaSeqs));
-
- driver(lines[0], newTaxonomyFile, tempTaxonomyFile);
- }
- else{
- vector<int> positions;
- processIDS.resize(0);
- ifstream inFASTA;
- openInputFile(fastaFileName, inFASTA);
+ driver(lines[0], newTaxonomyFile, tempTaxonomyFile, fastaFileNames[s]);
+#endif
+ //make taxonomy tree from new taxonomy file
+ PhyloTree taxaBrowser;
- string input;
- while(!inFASTA.eof()){
- input = getline(inFASTA);
- if (input.length() != 0) {
- if(input[0] == '>'){ int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); }
- }
+ ifstream in;
+ openInputFile(tempTaxonomyFile, in);
+
+ //read in users taxonomy file and add sequences to tree
+ string name, taxon;
+ while(!in.eof()){
+ in >> name >> taxon; gobble(in);
+
+ if (namefile != "") {
+ itNames = nameMap.find(name);
+
+ if (itNames == nameMap.end()) {
+ mothurOut(name + " is not in your name file please correct."); mothurOutEndLine(); exit(1);
+ }else{
+ for (int i = 0; i < itNames->second; i++) {
+ taxaBrowser.addSeqToTree(name+toString(i), taxon); //add it as many times as there are identical seqs
+ }
+ }
+ }else { taxaBrowser.addSeqToTree(name, taxon); } //add it once
}
- inFASTA.close();
+ in.close();
+
+ taxaBrowser.assignHeirarchyIDs(0);
+
+ taxaBrowser.binUnclassified();
- numFastaSeqs = positions.size();
+ remove(tempTaxonomyFile.c_str());
- int numSeqsPerProcessor = numFastaSeqs / processors;
+ //print summary file
+ ofstream outTaxTree;
+ openOutputFile(taxSummary, outTaxTree);
+ taxaBrowser.print(outTaxTree);
+ outTaxTree.close();
- for (int i = 0; i < processors; i++) {
- int startPos = positions[ i * numSeqsPerProcessor ];
- if(i == processors - 1){
- numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor;
- }
- lines.push_back(new linePair(startPos, numSeqsPerProcessor));
- }
- createProcesses(newTaxonomyFile, tempTaxonomyFile);
+ //output taxonomy with the unclassified bins added
+ ifstream inTax;
+ openInputFile(newTaxonomyFile, inTax);
- rename((newTaxonomyFile + toString(processIDS[0]) + ".temp").c_str(), newTaxonomyFile.c_str());
- rename((tempTaxonomyFile + toString(processIDS[0]) + ".temp").c_str(), tempTaxonomyFile.c_str());
+ ofstream outTax;
+ string unclass = newTaxonomyFile + ".unclass.temp";
+ openOutputFile(unclass, outTax);
- for(int i=1;i<processors;i++){
- appendTaxFiles((newTaxonomyFile + toString(processIDS[i]) + ".temp"), newTaxonomyFile);
- appendTaxFiles((tempTaxonomyFile + toString(processIDS[i]) + ".temp"), tempTaxonomyFile);
- remove((newTaxonomyFile + toString(processIDS[i]) + ".temp").c_str());
- remove((tempTaxonomyFile + toString(processIDS[i]) + ".temp").c_str());
+ //get maxLevel from phylotree so you know how many 'unclassified's to add
+ int maxLevel = taxaBrowser.getMaxLevel();
+
+ //read taxfile - this reading and rewriting is done to preserve the confidence sscores.
+ while (!inTax.eof()) {
+ inTax >> name >> taxon; gobble(inTax);
+
+ string newTax = addUnclassifieds(taxon, maxLevel);
+
+ outTax << name << '\t' << newTax << endl;
}
+ inTax.close();
+ outTax.close();
+ remove(newTaxonomyFile.c_str());
+ rename(unclass.c_str(), newTaxonomyFile.c_str());
+
+ mothurOutEndLine();
+ mothurOut("It took " + toString(time(NULL) - start) + " secs to classify " + toString(numFastaSeqs) + " sequences."); mothurOutEndLine(); mothurOutEndLine();
}
-#else
- ifstream inFASTA;
- openInputFile(fastaFileName, inFASTA);
- numFastaSeqs=count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
- inFASTA.close();
- lines.push_back(new linePair(0, numFastaSeqs));
-
- driver(lines[0], newTaxonomyFile, tempTaxonomyFile);
-#endif
delete classify;
+ return 0;
+ }
+ catch(exception& e) {
+ errorOut(e, "ClassifySeqsCommand", "execute");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+string ClassifySeqsCommand::addUnclassifieds(string tax, int maxlevel) {
+ try{
+ string newTax, taxon;
+ int level = 0;
- //make taxonomy tree from new taxonomy file
- ifstream inTaxonomy;
- openInputFile(tempTaxonomyFile, inTaxonomy);
-
- string accession, taxaList;
- PhyloTree taxaBrowser;
-
- //read in users taxonomy file and add sequences to tree
- while(!inTaxonomy.eof()){
- inTaxonomy >> accession >> taxaList;
-
- taxaBrowser.addSeqToTree(accession, taxaList);
-
- gobble(inTaxonomy);
+ //keep what you have counting the levels
+ while (tax.find_first_of(';') != -1) {
+ //get taxon
+ taxon = tax.substr(0,tax.find_first_of(';'));
+ tax = tax.substr(tax.find_first_of(';')+1, tax.length());
+			newTax += taxon + ";";
+ level++;
}
- inTaxonomy.close();
- remove(tempTaxonomyFile.c_str());
-
- taxaBrowser.assignHeirarchyIDs(0);
- taxaBrowser.binUnclassified();
-
- ofstream outTaxTree;
- openOutputFile(taxSummary, outTaxTree);
- taxaBrowser.print(outTaxTree);
-
- mothurOutEndLine();
- mothurOut("It took " + toString(time(NULL) - start) + " secs to classify " + toString(numFastaSeqs) + " sequences.");
- mothurOutEndLine();
- mothurOutEndLine();
+ //add "unclassified" until you reach maxLevel
+ while (level < maxlevel) {
+ newTax += "unclassified;";
+ level++;
+ }
- return 0;
+ return newTax;
}
catch(exception& e) {
- errorOut(e, "ClassifySeqsCommand", "execute");
+ errorOut(e, "ClassifySeqsCommand", "addUnclassifieds");
exit(1);
}
}
+
/**************************************************************************************************/
-void ClassifySeqsCommand::createProcesses(string taxFileName, string tempTaxFile) {
+void ClassifySeqsCommand::createProcesses(string taxFileName, string tempTaxFile, string filename) {
try {
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
int process = 0;
processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
process++;
}else if (pid == 0){
- driver(lines[process], taxFileName + toString(getpid()) + ".temp", tempTaxFile + toString(getpid()) + ".temp");
+ driver(lines[process], taxFileName + toString(getpid()) + ".temp", tempTaxFile + toString(getpid()) + ".temp", filename);
exit(0);
}else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
}
//**********************************************************************************************************************
-int ClassifySeqsCommand::driver(linePair* line, string taxFName, string tempTFName){
+int ClassifySeqsCommand::driver(linePair* line, string taxFName, string tempTFName, string filename){
try {
ofstream outTax;
openOutputFile(taxFName, outTax);
openOutputFile(tempTFName, outTaxSimple);
ifstream inFASTA;
- openInputFile(fastaFileName, inFASTA);
+ openInputFile(filename, inFASTA);
inFASTA.seekg(line->start);
};
vector<int> processIDS; //processid
vector<linePair*> lines;
+ vector<string> fastaFileNames;
+ vector<string> namefileNames;
+ map<string, int> nameMap;
+ map<string, int>::iterator itNames;
Classify* classify;
- string fastaFileName, templateFileName, distanceFileName, search, method, taxonomyFileName;
+ string fastaFileName, templateFileName, distanceFileName, namefile, search, method, taxonomyFileName;
int processors, kmerSize, numWanted, cutoff, iters;
float match, misMatch, gapOpen, gapExtend;
bool abort, probs;
- int driver(linePair*, string, string);
+ int driver(linePair*, string, string, string);
void appendTaxFiles(string, string);
- void createProcesses(string, string);
+ void createProcesses(string, string, string);
+ string addUnclassifieds(string, int);
};
#endif
Estimators.clear();
//allow user to run help
- if(option == "help") { validCalculator = new ValidCalculators(); help(); abort = true; }
+ if(option == "help") { validCalculator = new ValidCalculators(); help(); delete validCalculator; abort = true; }
else {
//valid paramters for this command
}
//make sure the user has already run the read.otu command
- if ((globaldata->getListFile() == "") && (globaldata->getRabundFile() == "") && (globaldata->getSabundFile() == "")) { mothurOut("You must read a list, sabund or rabund before you can use the collect.single command."); mothurOutEndLine(); abort = true; }
+ if ((globaldata->getSharedFile() == "") && (globaldata->getListFile() == "") && (globaldata->getRabundFile() == "") && (globaldata->getSabundFile() == "")) { mothurOut("You must read a list, sabund, rabund or shared file before you can use the collect.single command."); mothurOutEndLine(); abort = true; }
//check for optional parameter and set defaults
// ...at some point should added some additional type checking...
temp = validParameter.validFile(parameters, "size", false); if (temp == "not found") { temp = "0"; }
convert(temp, size);
-
- if (abort == false) {
- string fileNameRoot = getRootName(globaldata->inputFileName);
- int i;
- validCalculator = new ValidCalculators();
-
- for (i=0; i<Estimators.size(); i++) {
- if (validCalculator->isValidCalculator("single", Estimators[i]) == true) {
- if (Estimators[i] == "sobs") {
- cDisplays.push_back(new CollectDisplay(new Sobs(), new OneColumnFile(fileNameRoot+"sobs")));
- }else if (Estimators[i] == "chao") {
- cDisplays.push_back(new CollectDisplay(new Chao1(), new ThreeColumnFile(fileNameRoot+"chao")));
- }else if (Estimators[i] == "nseqs") {
- cDisplays.push_back(new CollectDisplay(new NSeqs(), new OneColumnFile(fileNameRoot+"nseqs")));
- }else if (Estimators[i] == "coverage") {
- cDisplays.push_back(new CollectDisplay(new Coverage(), new OneColumnFile(fileNameRoot+"coverage")));
- }else if (Estimators[i] == "ace") {
- cDisplays.push_back(new CollectDisplay(new Ace(abund), new ThreeColumnFile(fileNameRoot+"ace")));
- }else if (Estimators[i] == "jack") {
- cDisplays.push_back(new CollectDisplay(new Jackknife(), new ThreeColumnFile(fileNameRoot+"jack")));
- }else if (Estimators[i] == "shannon") {
- cDisplays.push_back(new CollectDisplay(new Shannon(), new ThreeColumnFile(fileNameRoot+"shannon")));
- }else if (Estimators[i] == "npshannon") {
- cDisplays.push_back(new CollectDisplay(new NPShannon(), new OneColumnFile(fileNameRoot+"np_shannon")));
- }else if (Estimators[i] == "simpson") {
- cDisplays.push_back(new CollectDisplay(new Simpson(), new ThreeColumnFile(fileNameRoot+"simpson")));
- }else if (Estimators[i] == "bootstrap") {
- cDisplays.push_back(new CollectDisplay(new Bootstrap(), new OneColumnFile(fileNameRoot+"bootstrap")));
- }else if (Estimators[i] == "geometric") {
- cDisplays.push_back(new CollectDisplay(new Geom(), new OneColumnFile(fileNameRoot+"geometric")));
- }else if (Estimators[i] == "qstat") {
- cDisplays.push_back(new CollectDisplay(new QStat(), new OneColumnFile(fileNameRoot+"qstat")));
- }else if (Estimators[i] == "logseries") {
- cDisplays.push_back(new CollectDisplay(new LogSD(), new OneColumnFile(fileNameRoot+"logseries")));
- }else if (Estimators[i] == "bergerparker") {
- cDisplays.push_back(new CollectDisplay(new BergerParker(), new OneColumnFile(fileNameRoot+"bergerparker")));
- }else if (Estimators[i] == "bstick") {
- cDisplays.push_back(new CollectDisplay(new BStick(), new ThreeColumnFile(fileNameRoot+"bstick")));
- }else if (Estimators[i] == "goodscoverage") {
- cDisplays.push_back(new CollectDisplay(new GoodsCoverage(), new OneColumnFile(fileNameRoot+"goodscoverage")));
- }else if (Estimators[i] == "efron") {
- cDisplays.push_back(new CollectDisplay(new Efron(size), new OneColumnFile(fileNameRoot+"efron")));
- }else if (Estimators[i] == "boneh") {
- cDisplays.push_back(new CollectDisplay(new Boneh(size), new OneColumnFile(fileNameRoot+"boneh")));
- }else if (Estimators[i] == "solow") {
- cDisplays.push_back(new CollectDisplay(new Solow(size), new OneColumnFile(fileNameRoot+"solow")));
- }else if (Estimators[i] == "shen") {
- cDisplays.push_back(new CollectDisplay(new Shen(size, abund), new OneColumnFile(fileNameRoot+"shen")));
- }
- }
- }
- }
}
}
//**********************************************************************************************************************
-CollectCommand::~CollectCommand(){
- if (abort == false) {
- //delete order;
- delete input; globaldata->ginput = NULL;
- delete read;
- delete validCalculator;
- globaldata->gorder = NULL;
- }
-}
+//Destructor is intentionally empty: per-run objects (read, input, order,
+//validCalculator, cDisplays) are now released at the end of execute() so the
+//command can be run once per group when a shared file is parsed.
+CollectCommand::~CollectCommand(){}
//**********************************************************************************************************************
if (abort == true) { return 0; }
- //if the users entered no valid calculators don't execute command
- if (cDisplays.size() == 0) { return 0; }
-
- read = new ReadOTUFile(globaldata->inputFileName);
- read->read(&*globaldata);
-
- order = globaldata->gorder;
- string lastLabel = order->getLabel();
- input = globaldata->ginput;
-
- //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
- set<string> processedLabels;
- set<string> userLabels = labels;
+ if ((globaldata->getFormat() != "sharedfile")) { inputFileNames.push_back(globaldata->inputFileName); }
+ else { inputFileNames = parseSharedFile(globaldata->getSharedFile()); globaldata->setFormat("rabund"); }
- while((order != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
-
- if(allLines == 1 || labels.count(order->getLabel()) == 1){
-
- cCurve = new Collect(order, cDisplays);
- cCurve->getCurve(freq);
- delete cCurve;
+ for (int p = 0; p < inputFileNames.size(); p++) {
- mothurOut(order->getLabel()); mothurOutEndLine();
- processedLabels.insert(order->getLabel());
- userLabels.erase(order->getLabel());
+ string fileNameRoot = getRootName(inputFileNames[p]);
+ globaldata->inputFileName = inputFileNames[p];
+
+ if (inputFileNames.size() > 1) {
+ mothurOutEndLine(); mothurOut("Processing group " + groups[p]); mothurOutEndLine(); mothurOutEndLine();
+ }
+ validCalculator = new ValidCalculators();
+ for (int i=0; i<Estimators.size(); i++) {
+ if (validCalculator->isValidCalculator("single", Estimators[i]) == true) {
+ if (Estimators[i] == "sobs") {
+ cDisplays.push_back(new CollectDisplay(new Sobs(), new OneColumnFile(fileNameRoot+"sobs")));
+ }else if (Estimators[i] == "chao") {
+ cDisplays.push_back(new CollectDisplay(new Chao1(), new ThreeColumnFile(fileNameRoot+"chao")));
+ }else if (Estimators[i] == "nseqs") {
+ cDisplays.push_back(new CollectDisplay(new NSeqs(), new OneColumnFile(fileNameRoot+"nseqs")));
+ }else if (Estimators[i] == "coverage") {
+ cDisplays.push_back(new CollectDisplay(new Coverage(), new OneColumnFile(fileNameRoot+"coverage")));
+ }else if (Estimators[i] == "ace") {
+ cDisplays.push_back(new CollectDisplay(new Ace(abund), new ThreeColumnFile(fileNameRoot+"ace")));
+ }else if (Estimators[i] == "jack") {
+ cDisplays.push_back(new CollectDisplay(new Jackknife(), new ThreeColumnFile(fileNameRoot+"jack")));
+ }else if (Estimators[i] == "shannon") {
+ cDisplays.push_back(new CollectDisplay(new Shannon(), new ThreeColumnFile(fileNameRoot+"shannon")));
+ }else if (Estimators[i] == "npshannon") {
+ cDisplays.push_back(new CollectDisplay(new NPShannon(), new OneColumnFile(fileNameRoot+"np_shannon")));
+ }else if (Estimators[i] == "simpson") {
+ cDisplays.push_back(new CollectDisplay(new Simpson(), new ThreeColumnFile(fileNameRoot+"simpson")));
+ }else if (Estimators[i] == "bootstrap") {
+ cDisplays.push_back(new CollectDisplay(new Bootstrap(), new OneColumnFile(fileNameRoot+"bootstrap")));
+ }else if (Estimators[i] == "geometric") {
+ cDisplays.push_back(new CollectDisplay(new Geom(), new OneColumnFile(fileNameRoot+"geometric")));
+ }else if (Estimators[i] == "qstat") {
+ cDisplays.push_back(new CollectDisplay(new QStat(), new OneColumnFile(fileNameRoot+"qstat")));
+ }else if (Estimators[i] == "logseries") {
+ cDisplays.push_back(new CollectDisplay(new LogSD(), new OneColumnFile(fileNameRoot+"logseries")));
+ }else if (Estimators[i] == "bergerparker") {
+ cDisplays.push_back(new CollectDisplay(new BergerParker(), new OneColumnFile(fileNameRoot+"bergerparker")));
+ }else if (Estimators[i] == "bstick") {
+ cDisplays.push_back(new CollectDisplay(new BStick(), new ThreeColumnFile(fileNameRoot+"bstick")));
+ }else if (Estimators[i] == "goodscoverage") {
+ cDisplays.push_back(new CollectDisplay(new GoodsCoverage(), new OneColumnFile(fileNameRoot+"goodscoverage")));
+ }else if (Estimators[i] == "efron") {
+ cDisplays.push_back(new CollectDisplay(new Efron(size), new OneColumnFile(fileNameRoot+"efron")));
+ }else if (Estimators[i] == "boneh") {
+ cDisplays.push_back(new CollectDisplay(new Boneh(size), new OneColumnFile(fileNameRoot+"boneh")));
+ }else if (Estimators[i] == "solow") {
+ cDisplays.push_back(new CollectDisplay(new Solow(size), new OneColumnFile(fileNameRoot+"solow")));
+ }else if (Estimators[i] == "shen") {
+ cDisplays.push_back(new CollectDisplay(new Shen(size, abund), new OneColumnFile(fileNameRoot+"shen")));
+ }
+ }
}
- //you have a label the user want that is smaller than this label and the last label has not already been processed
- if ((anyLabelsToProcess(order->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
- string saveLabel = order->getLabel();
+
+
+ //if the users entered no valid calculators don't execute command
+ if (cDisplays.size() == 0) { return 0; }
+
+ read = new ReadOTUFile(inputFileNames[p]);
+ read->read(&*globaldata);
- delete order;
+ order = globaldata->gorder;
+ string lastLabel = order->getLabel();
+ input = globaldata->ginput;
+
+ //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
+ set<string> processedLabels;
+ set<string> userLabels = labels;
+
+ while((order != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
+
+ if(allLines == 1 || labels.count(order->getLabel()) == 1){
+
+ cCurve = new Collect(order, cDisplays);
+ cCurve->getCurve(freq);
+ delete cCurve;
+
+ mothurOut(order->getLabel()); mothurOutEndLine();
+ processedLabels.insert(order->getLabel());
+ userLabels.erase(order->getLabel());
+
+
+ }
+ //you have a label the user want that is smaller than this label and the last label has not already been processed
+ if ((anyLabelsToProcess(order->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
+ string saveLabel = order->getLabel();
+
+ delete order;
+ order = (input->getOrderVector(lastLabel));
+
+ cCurve = new Collect(order, cDisplays);
+ cCurve->getCurve(freq);
+ delete cCurve;
+
+ mothurOut(order->getLabel()); mothurOutEndLine();
+ processedLabels.insert(order->getLabel());
+ userLabels.erase(order->getLabel());
+
+ //restore real lastlabel to save below
+ order->setLabel(saveLabel);
+ }
+
+ lastLabel = order->getLabel();
+
+ delete order;
+ order = (input->getOrderVector());
+ }
+
+ //output error messages about any remaining user labels
+ set<string>::iterator it;
+ bool needToRun = false;
+ for (it = userLabels.begin(); it != userLabels.end(); it++) {
+ mothurOut("Your file does not include the label " + *it);
+ if (processedLabels.count(lastLabel) != 1) {
+ mothurOut(". I will use " + lastLabel + "."); mothurOutEndLine();
+ needToRun = true;
+ }else {
+ mothurOut(". Please refer to " + lastLabel + "."); mothurOutEndLine();
+ }
+ }
+
+ //run last label if you need to
+ if (needToRun == true) {
+ if (order != NULL) { delete order; }
order = (input->getOrderVector(lastLabel));
+ mothurOut(order->getLabel()); mothurOutEndLine();
+
cCurve = new Collect(order, cDisplays);
cCurve->getCurve(freq);
delete cCurve;
-
- mothurOut(order->getLabel()); mothurOutEndLine();
- processedLabels.insert(order->getLabel());
- userLabels.erase(order->getLabel());
-
- //restore real lastlabel to save below
- order->setLabel(saveLabel);
+ delete order;
}
- lastLabel = order->getLabel();
-
- delete order;
- order = (input->getOrderVector());
+ for(int i=0;i<cDisplays.size();i++){ delete cDisplays[i]; }
+ cDisplays.clear();
+ delete input; globaldata->ginput = NULL;
+ delete read;
+ globaldata->gorder = NULL;
+ delete validCalculator;
}
- //output error messages about any remaining user labels
- set<string>::iterator it;
- bool needToRun = false;
- for (it = userLabels.begin(); it != userLabels.end(); it++) {
- mothurOut("Your file does not include the label " + *it);
- if (processedLabels.count(lastLabel) != 1) {
- mothurOut(". I will use " + lastLabel + "."); mothurOutEndLine();
- needToRun = true;
- }else {
- mothurOut(". Please refer to " + lastLabel + "."); mothurOutEndLine();
- }
- }
-
- //run last label if you need to
- if (needToRun == true) {
- if (order != NULL) { delete order; }
- order = (input->getOrderVector(lastLabel));
-
- mothurOut(order->getLabel()); mothurOutEndLine();
-
- cCurve = new Collect(order, cDisplays);
- cCurve->getCurve(freq);
- delete cCurve;
- delete order;
- }
- for(int i=0;i<cDisplays.size();i++){ delete cDisplays[i]; }
return 0;
}
catch(exception& e) {
}
//**********************************************************************************************************************
+//Splits a shared file into one .rabund file per group (sharedFileRoot + group
+//+ ".rabund") so execute() can run the collect.single calculators once per
+//group.  Returns the list of rabund filenames; also fills the "groups" member
+//in the same order as a side effect.
+vector<string> CollectCommand::parseSharedFile(string filename) {
+	try {
+		vector<string> filenames;
+		
+		map<string, ofstream*> filehandles;
+		map<string, ofstream*>::iterator it3;
+		
+		
+		//read first line
+		read = new ReadOTUFile(filename);
+		read->read(&*globaldata);
+		
+		input = globaldata->ginput;
+		vector<SharedRAbundVector*> lookup = input->getSharedRAbundVectors();
+		
+		string sharedFileRoot = getRootName(filename);
+		
+		//clears file before we start to write to it below
+		for (int i=0; i<lookup.size(); i++) {
+			remove((sharedFileRoot + lookup[i]->getGroup() + ".rabund").c_str());
+			filenames.push_back((sharedFileRoot + lookup[i]->getGroup() + ".rabund"));
+		}
+		
+		//one ofstream per group; freed in the cleanup loop below
+		ofstream* temp;
+		for (int i=0; i<lookup.size(); i++) {
+			temp = new ofstream;
+			filehandles[lookup[i]->getGroup()] = temp;
+			groups.push_back(lookup[i]->getGroup());
+		}
+		
+		//NOTE(review): assumes getSharedRAbundVectors() returns a vector whose
+		//first entry is NULL at end of file - confirm against InputData.
+		while(lookup[0] != NULL) {
+		
+			for (int i = 0; i < lookup.size(); i++) {
+				RAbundVector rav = lookup[i]->getRAbundVector();
+				//reopen in append mode each label so each group file accumulates one rabund line per label
+				openOutputFileAppend(sharedFileRoot + lookup[i]->getGroup() + ".rabund", *(filehandles[lookup[i]->getGroup()]));
+				rav.print(*(filehandles[lookup[i]->getGroup()]));
+				(*(filehandles[lookup[i]->getGroup()])).close();
+			}
+		
+			for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
+			lookup = input->getSharedRAbundVectors();
+		}
+		
+		//free memory
+		for (it3 = filehandles.begin(); it3 != filehandles.end(); it3++) {
+			delete it3->second;
+		}
+		delete read;
+		delete input;
+		globaldata->ginput = NULL;
+		
+		return filenames;
+	}
+	catch(exception& e) {
+		errorOut(e, "CollectCommand", "parseSharedFile");
+		exit(1);
+	}
+}
+//**********************************************************************************************************************
+
set<string> labels; //holds labels to be used
string label, calc;
vector<string> Estimators;
+ vector<string> inputFileNames;
+ vector<string> groups;
+
+ vector<string> parseSharedFile(string);
};
outFile << setprecision(4);
if(isTrue(phylip) && startLine == 0){ outFile << alignDB.getNumSeqs() << endl; }
+
for(int i=startLine;i<endLine;i++){
- if(isTrue(phylip)) { outFile << alignDB.get(i).getName() << '\t'; }
+ if(isTrue(phylip)) {
+ string name = alignDB.get(i).getName();
+ if (name.length() < 10) { //pad with spaces to make compatible
+ while (name.length() < 10) { name += " "; }
+ }
+ outFile << name << '\t';
+ }
for(int j=0;j<i;j++){
distCalculator->calcDist(alignDB.get(i), alignDB.get(j));
double dist = distCalculator->getDist();
#include "listvector.hpp"
#include "sparsematrix.hpp"
-
/***********************************************************************/
-
-HCluster::HCluster(RAbundVector* rav, ListVector* lv, string m) : rabund(rav), list(lv), method(m){
+HCluster::HCluster(RAbundVector* rav, ListVector* lv, string m, string d, NameAssignment* n, float c) : rabund(rav), list(lv), method(m), distfile(d), nameMap(n), cutoff(c) {
try {
mapWanted = false;
exitedBreak = false;
clusterArray.push_back(temp);
}
+ if (method != "average") {
+ openInputFile(distfile, filehandle);
+ }else{ firstRead = true; }
}
catch(exception& e) {
errorOut(e, "HCluster", "HCluster");
/***********************************************************************/
int HCluster::getUpmostParent(int node){
try {
-
while (clusterArray[node].parent != -1) {
node = clusterArray[node].parent;
}
try {
cout << "link table" << endl;
- for (it = activeLinks.begin(); it!= activeLinks.end(); it++) {
- cout << it->first << " = " << it->second << endl;
+ for (itActive = activeLinks.begin(); itActive!= activeLinks.end(); itActive++) {
+ cout << itActive->first << " = " << itActive->second << endl;
}
cout << endl;
for (int i = 0; i < linkTable.size(); i++) {
cout << i << '\t';
for (it = linkTable[i].begin(); it != linkTable[i].end(); it++) {
- cout << it->first << '-' << it->second << '\t';
+ cout << it->first << '-' << it->second << '\t' ;
}
cout << endl;
}
/***********************************************************************/
int HCluster::makeActive() {
try {
-
int linkValue = 1;
-//cout << "active - here" << endl;
- it = activeLinks.find(smallRow);
- it2 = activeLinks.find(smallCol);
+
+ itActive = activeLinks.find(smallRow);
+ it2Active = activeLinks.find(smallCol);
- if ((it == activeLinks.end()) && (it2 == activeLinks.end())) { //both are not active so add them
+ if ((itActive == activeLinks.end()) && (it2Active == activeLinks.end())) { //both are not active so add them
int size = linkTable.size();
map<int, int> temp; map<int, int> temp2;
//add link to eachother
- temp[smallRow] = 1; // 1 2
- temp2[smallCol] = 1; // 1 0 1
- // 2 1 0
+ temp[smallRow] = 1; // 1 2
+ temp2[smallCol] = 1; // 1 0 1
+ // 2 1 0
linkTable.push_back(temp);
linkTable.push_back(temp2);
//add to activeLinks
activeLinks[smallRow] = size;
activeLinks[smallCol] = size+1;
-//cout << "active - here1" << endl;
- }else if ((it != activeLinks.end()) && (it2 == activeLinks.end())) { //smallRow is active, smallCol is not
+
+ }else if ((itActive != activeLinks.end()) && (it2Active == activeLinks.end())) { //smallRow is active, smallCol is not
int size = linkTable.size();
- int alreadyActiveRow = it->second;
+ int alreadyActiveRow = itActive->second;
map<int, int> temp;
//add link to eachother
- temp[smallRow] = 1; // 6 2 3 5
- linkTable.push_back(temp); // 6 0 1 2 0
- linkTable[alreadyActiveRow][smallCol] = 1; // 2 1 0 1 1
- // 3 2 1 0 0
- // 5 0 1 0 0
+ temp[smallRow] = 1; // 6 2 3 5
+ linkTable.push_back(temp); // 6 0 1 2 0
+ linkTable[alreadyActiveRow][smallCol] = 1; // 2 1 0 1 1
+ // 3 2 1 0 0
+ // 5 0 1 0 0
//add to activeLinks
activeLinks[smallCol] = size;
-//cout << "active - here2" << endl;
- }else if ((it == activeLinks.end()) && (it2 != activeLinks.end())) { //smallCol is active, smallRow is not
+
+ }else if ((itActive == activeLinks.end()) && (it2Active != activeLinks.end())) { //smallCol is active, smallRow is not
int size = linkTable.size();
- int alreadyActiveCol = it2->second;
+ int alreadyActiveCol = it2Active->second;
map<int, int> temp;
//add link to eachother
- temp[smallCol] = 1; // 6 2 3 5
- linkTable.push_back(temp); // 6 0 1 2 0
- linkTable[alreadyActiveCol][smallRow] = 1; // 2 1 0 1 1
- // 3 2 1 0 0
- // 5 0 1 0 0
+ temp[smallCol] = 1; // 6 2 3 5
+ linkTable.push_back(temp); // 6 0 1 2 0
+ linkTable[alreadyActiveCol][smallRow] = 1; // 2 1 0 1 1
+ // 3 2 1 0 0
+ // 5 0 1 0 0
//add to activeLinks
activeLinks[smallRow] = size;
-//cout << "active - here3" << endl;
+
}else { //both are active so add one
- int row = it->second;
- int col = it2->second;
-//cout << row << '\t' << col << endl;
+ int row = itActive->second;
+ int col = it2Active->second;
+
linkTable[row][smallCol]++;
linkTable[col][smallRow]++;
linkValue = linkTable[row][smallCol];
-//cout << "active - here4" << endl;
}
return linkValue;
/***********************************************************************/
void HCluster::updateArrayandLinkTable() {
try {
- //if cluster was made update clusterArray and linkTable
- int size = clusterArray.size();
-
- //add new node
- clusterNode temp(clusterArray[smallRow].numSeq + clusterArray[smallCol].numSeq, -1, clusterArray[smallCol].smallChild);
- clusterArray.push_back(temp);
-
- //update child nodes
- clusterArray[smallRow].parent = size;
- clusterArray[smallCol].parent = size;
+ //if cluster was made update clusterArray and linkTable
+ int size = clusterArray.size();
+
+ //add new node
+ clusterNode temp(clusterArray[smallRow].numSeq + clusterArray[smallCol].numSeq, -1, clusterArray[smallCol].smallChild);
+ clusterArray.push_back(temp);
+
+ //update child nodes
+ clusterArray[smallRow].parent = size;
+ clusterArray[smallCol].parent = size;
+
+ if (method == "furthest") {
//update linkTable by merging clustered rows and columns
int rowSpot = activeLinks[smallRow];
int colSpot = activeLinks[smallCol];
- //cout << "here" << endl;
+
//fix old rows
for (int i = 0; i < linkTable.size(); i++) {
//check if they are in map
linkTable[i].erase(smallRow); //delete col
}
}
- //printInfo();
-//cout << "here2" << endl;
+
//merge their values
for (it = linkTable[rowSpot].begin(); it != linkTable[rowSpot].end(); it++) {
it2 = linkTable[colSpot].find(it->first); //does the col also have this
if (it2 == linkTable[colSpot].end()) { //not there so add it
linkTable[colSpot][it->first] = it->second;
}else { //merge them
- linkTable[colSpot][it->first] = it->second+it2->second;
+ linkTable[colSpot][it->first] = it->second + it2->second;
}
}
-//cout << "here3" << endl;
+
linkTable[colSpot].erase(size);
linkTable.erase(linkTable.begin()+rowSpot); //delete row
- //printInfo();
+
//update activerows
activeLinks.erase(smallRow);
activeLinks.erase(smallCol);
activeLinks[size] = colSpot;
//adjust everybody elses spot since you deleted - time vs. space
- for (it = activeLinks.begin(); it != activeLinks.end(); it++) {
- if (it->second > rowSpot) { activeLinks[it->first]--; }
+ for (itActive = activeLinks.begin(); itActive != activeLinks.end(); itActive++) {
+ if (itActive->second > rowSpot) { activeLinks[itActive->first]--; }
}
-
-//cout << "here4" << endl;
-
+ }
}
catch(exception& e) {
errorOut(e, "HCluster", "updateArrayandLinkTable");
}
}
/***********************************************************************/
+//Attempts to merge the clusters containing row and col at this distance.
+//Returns true when a merge was made: nearest always merges; furthest merges
+//only once every pairwise link between the two clusters has been seen
+//(tracked via the link table); average always merges and then rewrites the
+//distance file with weighted-average distances via combineFile().
-void HCluster::update(int row, int col, float distance){
+bool HCluster::update(int row, int col, float distance){
	try {
-		
+		bool cluster = false;
		smallRow = row;
		smallCol = col;
		smallDist = distance;
		//find upmost parent of row and col
		smallRow = getUpmostParent(smallRow);
		smallCol = getUpmostParent(smallCol);
-		//cout << "row = " << row << " smallRow = " << smallRow << " col = " << col << " smallCol = " << smallCol << " dist = " << distance << endl;
+		
		//you don't want to cluster with yourself
		if (smallRow != smallCol) {
-			//are they active in the link table
-			int linkValue = makeActive(); //after this point this nodes info is active in linkTable
-			//printInfo();
-			//cout << "linkValue = " << linkValue << " times = " << (clusterArray[smallRow].numSeq * clusterArray[smallCol].numSeq) << endl;
-			//can we cluster???
-			bool cluster = false;
-			
-			if (method == "nearest") { cluster = true; }
-			else if (method == "average") {
-				if (linkValue == (ceil(((clusterArray[smallRow].numSeq * clusterArray[smallCol].numSeq)) / (float) 2.0))) { cluster = true; }
-			}else{ //assume furthest
-				if (linkValue == (clusterArray[smallRow].numSeq * clusterArray[smallCol].numSeq)) { cluster = true; }
-			}
-			if (cluster) {
+			if (method != "average") {
+				//can we cluster???
+				if (method == "nearest") { cluster = true; }
+				else{ //assume furthest
+					//are they active in the link table
+					int linkValue = makeActive(); //after this point this nodes info is active in linkTable
+					//furthest neighbor: cluster only when every member of one cluster links to every member of the other
+					if (linkValue == (clusterArray[smallRow].numSeq * clusterArray[smallCol].numSeq)) { cluster = true; }
+				}
+				
+				if (cluster) {
+					updateArrayandLinkTable();
+					clusterBins();
+					clusterNames();
+				}
+			}else {
+				//average neighbor merges unconditionally, then rewrites the distance file
+				cluster = true;
				updateArrayandLinkTable();
				clusterBins();
				clusterNames();
+				combineFile();
			}
		}
+		
+		return cluster;
		//printInfo();
	}
	catch(exception& e) {
	}
}
//**********************************************************************************************************************
+//Returns the next batch of equal, smallest distances to cluster.
+//Dispatches on the linkage method: furthest/nearest read straight from the
+//distance file (getSeqsFNNN); average first preprocesses the file on the
+//initial call (processFile) and then merges file and in-memory minima (getSeqsAN).
-vector<seqDist> HCluster::getSeqs(ifstream& filehandle, NameAssignment* nameMap, float cutoff){
+vector<seqDist> HCluster::getSeqs(){
+	try {
+		vector<seqDist> sameSeqs;
+		
+		if(method != "average") {
+			sameSeqs = getSeqsFNNN();
+		}else{
+			if (firstRead) { processFile(); }
+			sameSeqs = getSeqsAN();
+		}
+		
+		return sameSeqs;
+	}
+	catch(exception& e) {
+		errorOut(e, "HCluster", "getSeqs");
+		exit(1);
+	}
+}
+//**********************************************************************************************************************
+vector<seqDist> HCluster::getSeqsFNNN(){
try {
string firstName, secondName;
float distance, prevDistance;
return sameSeqs;
}
catch(exception& e) {
- errorOut(e, "HCluster", "getSeqs");
+ errorOut(e, "HCluster", "getSeqsFNNN");
exit(1);
}
}
+//**********************************************************************************************************************
+//don't need cutoff since processFile removes all distance above cutoff and changes names to indexes
+//**********************************************************************************************************************
+//don't need cutoff since processFile removes all distance above cutoff and changes names to indexes
+//Average neighbor: the true minimum distance may live either in the (sorted)
+//distance file or in mergedMin, the sorted averaged distances left over from
+//the last merge in combineFile() - compare the heads of both and take the smaller.
+vector<seqDist> HCluster::getSeqsAN(){
+	try {
+		int firstName, secondName;
+		float prevDistance;
+		vector<seqDist> sameSeqs;
+		prevDistance = -1;
+		
+		openInputFile(distfile, filehandle, "no error");
+		
+		//is the smallest value in mergedMin or the distfile?
+		float mergedMinDist = 10000;
+		float distance = 10000;
+		if (mergedMin.size() > 0) { mergedMinDist = mergedMin[0].dist; }
+		
+		if (!filehandle.eof()) {
+			filehandle >> firstName >> secondName >> distance; gobble(filehandle);
+			//save first one
+			if (prevDistance == -1) { prevDistance = distance; }
+			if (distance != -1) { //-1 means skip me
+				seqDist temp(firstName, secondName, distance);
+				sameSeqs.push_back(temp);
+			}
+		}
+		
+		if (mergedMinDist < distance) { //get minimum distance from mergedMin
+			//remove distance we saved from file
+			sameSeqs.clear();
+			prevDistance = mergedMinDist;
+			
+			//mergedMin is sorted, so collect the run of equal minima from the front
+			for (int i = 0; i < mergedMin.size(); i++) {
+				if (mergedMin[i].dist == prevDistance) {
+					sameSeqs.push_back(mergedMin[i]);
+				}else { break; }
+			}
+		}else{ //get minimum from file
+			//get entry
+			while (!filehandle.eof()) {
+				
+				filehandle >> firstName >> secondName >> distance; gobble(filehandle);
+				
+				if (prevDistance == -1) { prevDistance = distance; }
+				
+				if (distance != -1) { //-1 means skip me
+					//are the distances the same
+					if (distance == prevDistance) { //save in vector
+						seqDist temp(firstName, secondName, distance);
+						sameSeqs.push_back(temp);
+					}else{
+						break;
+					}
+				}
+			}
+		}
+		filehandle.close();
+		
+		//randomize matching dists
+		//NOTE(review): random_shuffle is deprecated in modern C++ (removed in C++17) - fine for this codebase's vintage
+		random_shuffle(sameSeqs.begin(), sameSeqs.end());
+		
+		//can only return one value since once these are merged the other distances in sameSeqs may have changed
+		vector<seqDist> temp;
+		if (sameSeqs.size() > 0) { temp.push_back(sameSeqs[0]); }
+		
+		return temp;
+	}
+	catch(exception& e) {
+		errorOut(e, "HCluster", "getSeqsAN");
+		exit(1);
+	}
+}
+
/***********************************************************************/
+//Average neighbor merge step: rewrites the sorted distance file after the
+//clusters in smallRow/smallCol merge.  Distances touching either merged
+//cluster are pulled out of the file (and out of mergedMin, the leftovers of
+//the previous merge), weight-averaged when both clusters had a distance to
+//the same sequence, and kept sorted in mergedMin for the next pass.  All
+//other distances stream through to a temp file that replaces the original,
+//so the file stays sorted.
+void HCluster::combineFile() {
+	try {
+		int bufferSize = 64000; //64k read buffer - this should be a variable that the user can set to optimize code to their hardware
+		char* inputBuffer;
+		inputBuffer = new char[bufferSize];
+		size_t numRead;
+		
+		string tempDistFile = distfile + ".temp";
+		ofstream out;
+		openOutputFile(tempDistFile, out);
+		
+		//NOTE(review): fopen result is unchecked; a missing distfile would crash in fread - confirm callers guarantee it exists
+		FILE* in;
+		in = fopen(distfile.c_str(), "rb");
+		
+		int first, second;
+		float dist;
+		
+		vector< map<int, float> > smallRowColValues;
+		smallRowColValues.resize(2); //0 = row, 1 = col
+		int count = 0;
+		
+		//go through file pulling out distances related to rows merging
+		//if mergedMin contains distances add those back into file
+		bool done = false;
+		partialDist = "";
+		while ((numRead = fread(inputBuffer, 1, bufferSize, in)) != 0) {
+//cout << "number of char read = " << numRead << endl;
+//cout << inputBuffer << endl;
+			if (numRead < bufferSize) { done = true; }
+			
+			//parse input into individual distances
+			int spot = 0;
+			string outputString = "";
+			while(spot < numRead) {
+				//cout << "spot = " << spot << endl;
+				seqDist nextDist = getNextDist(inputBuffer, spot, bufferSize);
+				
+				//you read a partial distance
+				if (nextDist.seq1 == -1) { break; }
+				
+				first = nextDist.seq1; second = nextDist.seq2; dist = nextDist.dist;
+				//cout << "next distance = " << first << '\t' << second << '\t' << dist << endl;
+				//since file is sorted and mergedMin is sorted
+				//you can put the smallest distance from each through the code below and keep the file sorted
+				
+				//while there are still values in mergedMin that are smaller than the distance read from file
+				while (count < mergedMin.size()) {
+					
+					//is the distance in mergedMin smaller than from the file
+					if (mergedMin[count].dist < dist) {
+						//is this a distance related to the columns merging?
+						//if yes, save in memory
+						if ((mergedMin[count].seq1 == smallRow) && (mergedMin[count].seq2 == smallCol)) { //do nothing this is the smallest distance from last time
+						}else if (mergedMin[count].seq1 == smallCol) {
+							smallRowColValues[1][mergedMin[count].seq2] = mergedMin[count].dist;
+						}else if (mergedMin[count].seq2 == smallCol) {
+							smallRowColValues[1][mergedMin[count].seq1] = mergedMin[count].dist;
+						}else if (mergedMin[count].seq1 == smallRow) {
+							smallRowColValues[0][mergedMin[count].seq2] = mergedMin[count].dist;
+						}else if (mergedMin[count].seq2 == smallRow) {
+							smallRowColValues[0][mergedMin[count].seq1] = mergedMin[count].dist;
+						}else { //if no, write to temp file
+							outputString += toString(mergedMin[count].seq1) + '\t' + toString(mergedMin[count].seq2) + '\t' + toString(mergedMin[count].dist) + '\n';
+						}
+						count++;
+					}else{ break; }
+				}
+				
+				//is this a distance related to the columns merging?
+				//if yes, save in memory
+				if ((first == smallRow) && (second == smallCol)) { //do nothing this is the smallest distance from last time
+				}else if (first == smallCol) {
+					smallRowColValues[1][second] = dist;
+				}else if (second == smallCol) {
+					smallRowColValues[1][first] = dist;
+				}else if (first == smallRow) {
+					smallRowColValues[0][second] = dist;
+				}else if (second == smallRow) {
+					smallRowColValues[0][first] = dist;
+					
+				}else { //if no, write to temp file
+					outputString += toString(first) + '\t' + toString(second) + '\t' + toString(dist) + '\n';
+				}
+			}
+			
+			out << outputString;
+			if(done) { break; }
+		}
+		fclose(in);
+		delete[] inputBuffer; //fix: buffer from new[] above was never freed - leaked bufferSize bytes on every merge
+		
+		//if values in mergedMin are larger than the the largest in file then
+		while (count < mergedMin.size()) {
+			//is this a distance related to the columns merging?
+			//if yes, save in memory
+			if ((mergedMin[count].seq1 == smallRow) && (mergedMin[count].seq2 == smallCol)) { //do nothing this is the smallest distance from last time
+			}else if (mergedMin[count].seq1 == smallCol) {
+				smallRowColValues[1][mergedMin[count].seq2] = mergedMin[count].dist;
+			}else if (mergedMin[count].seq2 == smallCol) {
+				smallRowColValues[1][mergedMin[count].seq1] = mergedMin[count].dist;
+			}else if (mergedMin[count].seq1 == smallRow) {
+				smallRowColValues[0][mergedMin[count].seq2] = mergedMin[count].dist;
+			}else if (mergedMin[count].seq2 == smallRow) {
+				smallRowColValues[0][mergedMin[count].seq1] = mergedMin[count].dist;
+				
+			}else { //if no, write to temp file
+				out << mergedMin[count].seq1 << '\t' << mergedMin[count].seq2 << '\t' << mergedMin[count].dist << endl;
+			}
+			count++;
+		}
+		out.close();
+		mergedMin.clear();
+		
+		//rename tempfile to distfile
+		remove(distfile.c_str());
+		rename(tempDistFile.c_str(), distfile.c_str());
+		
+		//merge clustered rows averaging the distances
+		//NOTE(review): distances present for only one of the merged clusters are
+		//dropped here - confirm this matches the intended cutoff handling
+		map<int, float>::iterator itMerge;
+		map<int, float>::iterator it2Merge;
+		for(itMerge = smallRowColValues[0].begin(); itMerge != smallRowColValues[0].end(); itMerge++) {
+			//does smallRowColValues[1] have a distance to this seq too?
+			it2Merge = smallRowColValues[1].find(itMerge->first);
+			
+			float average;
+			if (it2Merge != smallRowColValues[1].end()) { //if yes, then average
+				//weighted average
+				int total = clusterArray[smallRow].numSeq + clusterArray[smallCol].numSeq;
+				average = ((clusterArray[smallRow].numSeq * itMerge->second) + (clusterArray[smallCol].numSeq * it2Merge->second)) / (float) total;
+				smallRowColValues[1].erase(it2Merge);
+				
+				seqDist temp(clusterArray[smallRow].parent, itMerge->first, average);
+				mergedMin.push_back(temp);
+			}
+		}
+		
+		//sort merged values
+		sort(mergedMin.begin(), mergedMin.end(), compareSequenceDistance);
+	}
+	catch(exception& e) {
+		errorOut(e, "HCluster", "combineFile");
+		exit(1);
+	}
+}
+/***********************************************************************/
+//Parses the next "first<TAB>second<TAB>distance" record from the raw read
+//buffer, advancing index past the record and any trailing newline/whitespace.
+//If the previous buffer ended mid-record, partialDist holds the carried-over
+//text and is consumed first.  If THIS buffer ends mid-record, the partial text
+//is stashed in partialDist and a sentinel seqDist with seq1 == -1 is returned
+//so the caller knows to refill the buffer.
+seqDist HCluster::getNextDist(char* buffer, int& index, int size){
+	try {
+		seqDist next;
+		int indexBefore = index;
+		string first, second, distance;
+		first = ""; second = ""; distance = "";
+		int tabCount = 0;
+//cout << "partial = " << partialDist << endl;
+		if (partialDist != "") { //read what you can, you know it is less than a whole distance.
+			for (int i = 0; i < partialDist.size(); i++) {
+				if (tabCount == 0) {
+					if (partialDist[i] == '\t') { tabCount++; }
+					else { first += partialDist[i]; }
+				}else if (tabCount == 1) {
+					if (partialDist[i] == '\t') { tabCount++; }
+					else { second += partialDist[i]; }
+				}else if (tabCount == 2) {
+					distance += partialDist[i];
+				}
+			}
+			partialDist = "";
+		}
+		
+		//try to get another distance
+		bool gotDist = false;
+		while (index < size) {
+			if ((buffer[index] == 10) || (buffer[index] == 13)) { //newline in unix or windows
+				gotDist = true;
+				
+				//gobble space
+				while (index < size) {
+					if (isspace(buffer[index])) { index++; }
+					else { break; }
+				}
+				break;
+			}else{
+				if (tabCount == 0) {
+					if (buffer[index] == '\t') { tabCount++; }
+					else { first += buffer[index]; }
+				}else if (tabCount == 1) {
+					if (buffer[index] == '\t') { tabCount++; }
+					else { second += buffer[index]; }
+				}else if (tabCount == 2) {
+					distance += buffer[index];
+				}
+				index++;
+			}
+		}
+		
+		//there was not a whole distance in the buffer, ie. buffer = "1 2 0.01 2 3 0."
+		//then you want to save the partial distance.
+		if (!gotDist) {
+			for (int i = indexBefore; i < size; i++) {
+				partialDist += buffer[i];
+			}
+			//index past size forces the caller's parse loop to exit and refill
+			index = size + 1;
+			next.seq1 = -1; next.seq2 = -1; next.dist = 0.0;
+		}else{
+			int firstname, secondname;
+			float dist;
+			
+			convert(first, firstname);
+			convert(second, secondname);
+			convert(distance, dist);
+			
+			next.seq1 = firstname; next.seq2 = secondname; next.dist = dist;
+		}
+		
+		return next;
+	}
+	catch(exception& e) {
+		errorOut(e, "HCluster", "getNextDist");
+		exit(1);
+	}
+}
+/***********************************************************************/
+//One-time preprocessing for the average neighbor method (run on first read):
+//rewrites the distance file with sequence names replaced by their nameMap
+//indexes and all distances above the cutoff removed.
+//NOTE(review): the break on (distance > cutoff) assumes the distance file is
+//sorted by ascending distance - confirm the upstream sort guarantees this.
+void HCluster::processFile() {
+	try {
+		string firstName, secondName;
+		float distance;
+		
+		ifstream in;
+		openInputFile(distfile, in);
+		
+		ofstream out;
+		string outTemp = distfile + ".temp";
+		openOutputFile(outTemp, out);
+		
+		//get entry
+		while (!in.eof()) {
+			
+			in >> firstName >> secondName >> distance; gobble(in);
+			
+			map<string,int>::iterator itA = nameMap->find(firstName);
+			map<string,int>::iterator itB = nameMap->find(secondName);
+			if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); }
+			if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); }
+		
+			//using cutoff
+			if (distance > cutoff) { break; }
+		
+			if (distance != -1) { //-1 means skip me
+				out << itA->second << '\t' << itB->second << '\t' << distance << endl;
+			}
+		}
+		
+		in.close();
+		out.close();
+		
+		//replace the original distance file with the filtered, indexed version
+		remove(distfile.c_str());
+		rename(outTemp.c_str(), distfile.c_str());
+		
+		firstRead = false;
+	}
+	catch(exception& e) {
+		errorOut(e, "HCluster", "processFile");
+		exit(1);
+	}
+}
+/***********************************************************************/
+
+
+
+
+
class RAbundVector;
class ListVector;
+/***********************************************************************/
+//Simple aggregate pairing a link count with its associated distance.
+//NOTE(review): looks related to HCluster's linkTable bookkeeping, but no
+//use site is visible in this chunk -- confirm before relying on it.
+struct linkNode {
+	int links;	//number of pairwise links
+	float dist;	//distance associated with those links
+
+	linkNode() {};
+	linkNode(int l, float a) : links(l), dist(a) {};
+	~linkNode() {};
+};
+
/***********************************************************************/
+//Hierarchical clustering engine driven from a sorted distance file rather
+//than an in-memory matrix: getSeqs() streams the next batch of pairwise
+//distances from distfile, update() merges two bins and reports whether a
+//merge actually happened.
class HCluster {
public:
-	HCluster(RAbundVector*, ListVector*, string);
+	HCluster(RAbundVector*, ListVector*, string, string, NameAssignment*, float);
	~HCluster(){};
-	void update(int, int, float);
+	bool update(int, int, float);
	void setMapWanted(bool m);
	map<string, int> getSeqtoBin() { return seq2Bin; }
-	vector<seqDist> getSeqs(ifstream&, NameAssignment*, float);
+	vector<seqDist> getSeqs();
protected:
	void clusterBins();
	void printInfo();
	void updateArrayandLinkTable();
	void updateMap();
+	//method-specific helpers for streaming/merging distances from distfile
+	vector<seqDist> getSeqsFNNN();
+	vector<seqDist> getSeqsAN();
+	void combineFile();
+	void processFile();
+	seqDist getNextDist(char*, int&, int);
	RAbundVector* rabund;
	ListVector* list;
+	NameAssignment* nameMap;
	vector<clusterNode> clusterArray;
+
+	//note: the nearest and average neighbor method do not use the link table or active links
	vector< map<int, int> > linkTable; // vector of maps - linkTable[1][6] = 2 would mean sequence in spot 1 has 2 links with sequence in 6
	map<int, int> activeLinks; //maps sequence to index in linkTable
	map<int, int>::iterator it;
+	map<int, int>::iterator itActive;
+	map<int, int>::iterator it2Active;
	map<int, int>::iterator it2;
	int numSeqs;
	int smallRow;
	int smallCol;
-	float smallDist;
+	float smallDist, cutoff;
	map<string, int> seq2Bin;
-	bool mapWanted, exitedBreak;
+	bool mapWanted, exitedBreak, firstRead;
	seqDist next;
-	string method;
+	string method, distfile;
+	ifstream filehandle;
+
+	//partial state carried between getSeqs() calls while streaming the file
+	vector<seqDist> mergedMin;
+	string partialDist;
+
};
print_start = true;
start = time(NULL);
- ifstream in;
- openInputFile(distfile, in);
- string firstName, secondName;
- float distance;
-
- cluster = new HCluster(rabund, list, method);
+ //ifstream in;
+ //openInputFile(distfile, in);
+
+ cluster = new HCluster(rabund, list, method, distfile, globaldata->nameMap, cutoff);
vector<seqDist> seqs; seqs.resize(1); // to start loop
while (seqs.size() != 0){
- seqs = cluster->getSeqs(in, globaldata->nameMap, cutoff);
+ seqs = cluster->getSeqs();
for (int i = 0; i < seqs.size(); i++) { //-1 means skip me
- if (print_start && isTrue(timing)) {
- mothurOut("Clustering (" + tag + ") dist " + toString(distance) + "/"
- + toString(roundDist(distance, precision))
- + "\t(precision: " + toString(precision) + ")");
- cout.flush();
- print_start = false;
- }
-
-
if (seqs[i].seq1 != seqs[i].seq2) {
- cluster->update(seqs[i].seq1, seqs[i].seq2, seqs[i].dist);
+ bool clustered = cluster->update(seqs[i].seq1, seqs[i].seq2, seqs[i].dist);
float rndDist = roundDist(seqs[i].dist, precision);
-
- if((previousDist <= 0.0000) && (seqs[i].dist != previousDist)){
- printData("unique");
- }
- else if((rndDist != rndPreviousDist)){
- printData(toString(rndPreviousDist, length-1));
+
+ if (clustered) {
+ if((previousDist <= 0.0000) && (seqs[i].dist != previousDist)){
+ printData("unique");
+ }
+ else if((rndDist != rndPreviousDist)){
+ printData(toString(rndPreviousDist, length-1));
+ }
}
previousDist = seqs[i].dist;
}
}
- in.close();
+ //in.close();
- if (print_start && isTrue(timing)) {
- //mothurOut("Clustering (" + tag + ") for distance " + toString(previousDist) + "/" + toString(rndPreviousDist)
- //+ "\t(precision: " + toString(precision) + ", Nodes: " + toString(matrix->getNNodes()) + ")");
- cout.flush();
- print_start = false;
- }
-
if(previousDist <= 0.0000){
printData("unique");
}
ofstream outputFile;
openOutputFile(outputFileName, outputFile);
- ifstream inputFile;
char c;
for(int i=0;i<numInputFiles;i++){
+		ifstream inputFile; //declaration must be inside the for loop or Windows throws an error
+
openInputFile(fileNames[i], inputFile);
while(!inputFile.eof()){ c = inputFile.get(); outputFile << c; }
sortHclusterFiles(distFile, overlapFile);
//create cluster
- hcluster = new HCluster(rabund, list, method);
+ hcluster = new HCluster(rabund, list, method, distFile, nameMap, cutoff);
hcluster->setMapWanted(true);
vector<seqDist> seqs; seqs.resize(1); // to start loop
- ifstream inHcluster;
- openInputFile(distFile, inHcluster);
+ //ifstream inHcluster;
+ //openInputFile(distFile, inHcluster);
while (seqs.size() != 0){
- seqs = hcluster->getSeqs(inHcluster, nameMap, cutoff);
+ seqs = hcluster->getSeqs();
for (int i = 0; i < seqs.size(); i++) { //-1 means skip me
}
}
}
- inHcluster.close();
+ //inHcluster.close();
if(previousDist <= 0.0000){
oldList.setLabel("unique");
seqDist(int s1, int s2, float d) : seq1(s1), seq2(s2), dist(d) {}
~seqDist() {}
};
+//********************************************************************************************************************
+//sorts lowest to highest
+//Comparator for std::sort over seqDist records.  Takes const references
+//instead of by-value parameters so each comparison no longer copies both
+//structs (clang-tidy: performance-unnecessary-value-param); callers that
+//pass this comparator to sort are unaffected.
+inline bool compareSequenceDistance(const seqDist& left, const seqDist& right){
+	return (left.dist < right.dist);	
+}
/***********************************************************************/
// snagged from http://www.parashift.com/c++-faq-lite/misc-technical-issues.html#faq-39.2
}
/***********************************************************************/
+//Overload of openInputFile taking an extra string argument.
+//Opens fileName for reading; returns 0 on success (after gobbling leading
+//whitespace, so a blank file reads as immediately at eof) or 1 on failure.
+//NOTE(review): parameter m is never used -- presumably intended to alter
+//the error reporting (e.g. suppress the message); confirm intended behavior.
+inline int openInputFile(string fileName, ifstream& fileHandle, string m){
+
+	fileHandle.open(fileName.c_str());
+	if(!fileHandle) {
+		mothurOut("Error: Could not open " + fileName); mothurOutEndLine();
+		return 1;
+	}
+	else {
+		//check for blank file
+		gobble(fileHandle);
+		return 0;
+	}
+
+}
+/***********************************************************************/
+
inline int openInputFile(string fileName, ifstream& fileHandle){
fileHandle.open(fileName.c_str());
numSeqs = 0;
tree.push_back(TaxNode("Root"));
tree[0].heirarchyID = "0";
+ maxLevel = 0;
}
catch(exception& e) {
errorOut(e, "PhyloTree", "PhyloTree");
}
}
/**************************************************************************************************/
+//Walks from seqName's taxon node up to the root, concatenating node names
+//into a semicolon-terminated taxonomy string (root excluded, since the
+//walk stops when parent == -1).
+//NOTE(review): name2Taxonomy[seqName] default-inserts index 0 for unknown
+//names, returning "" and polluting the map -- consider find() + an error.
+string PhyloTree::getFullTaxonomy(string seqName) {
+	try {
+		string tax = "";
+
+		int currentNode = name2Taxonomy[seqName];
+
+		while (tree[currentNode].parent != -1) {
+			tax = tree[currentNode].name + ";" + tax;
+			currentNode = tree[currentNode].parent;
+		}
+
+		return tax;
+	}
+	catch(exception& e) {
+		errorOut(e, "PhyloTree", "getFullTaxonomy");
+		exit(1);
+	}
+}
+/**************************************************************************************************/
void PhyloTree::print(ofstream& out){
try {
vector<int> getGenusNodes();
void binUnclassified();
- TaxNode get(int i) { return tree[i]; }
+ TaxNode get(int i) { return tree[i]; }
TaxNode get(string seqName) { return tree[name2Taxonomy[seqName]]; }
- int getIndex(string seqName) { return name2Taxonomy[seqName]; }
- string getName(int i) { return tree[i].name; }
+ int getIndex(string seqName) { return name2Taxonomy[seqName]; }
+ string getName(int i) { return tree[i].name; }
+ string getFullTaxonomy(string); //pass a sequence name return taxonomy
+ int getMaxLevel() { return maxLevel; }
+
private:
string getNextTaxon(string&);
vector<TaxNode> tree;
Estimators.clear();
//allow user to run help
- if(option == "help") { validCalculator = new ValidCalculators(); help(); abort = true; }
+ if(option == "help") { validCalculator = new ValidCalculators(); help(); delete validCalculator; abort = true; }
else {
//valid paramters for this command
}
//make sure the user has already run the read.otu command
- if ((globaldata->getListFile() == "") && (globaldata->getRabundFile() == "") && (globaldata->getSabundFile() == "")) { mothurOut("You must read a list, sabund or rabund before you can use the rarefaction.single command."); mothurOutEndLine(); abort = true; }
+ if ((globaldata->getSharedFile() == "") && (globaldata->getListFile() == "") && (globaldata->getRabundFile() == "") && (globaldata->getSabundFile() == "")) { mothurOut("You must read a list, sabund, rabund or shared file before you can use the rarefact.single command."); mothurOutEndLine(); abort = true; }
//check for optional parameter and set defaults
// ...at some point should added some additional type checking...
temp = validParameter.validFile(parameters, "iters", false); if (temp == "not found") { temp = "1000"; }
convert(temp, nIters);
-
- if (abort == false) {
-
- string fileNameRoot = getRootName(globaldata->inputFileName);
- int i;
- validCalculator = new ValidCalculators();
-
-
- for (i=0; i<Estimators.size(); i++) {
- if (validCalculator->isValidCalculator("rarefaction", Estimators[i]) == true) {
- if (Estimators[i] == "sobs") {
- rDisplays.push_back(new RareDisplay(new Sobs(), new ThreeColumnFile(fileNameRoot+"rarefaction")));
- }else if (Estimators[i] == "chao") {
- rDisplays.push_back(new RareDisplay(new Chao1(), new ThreeColumnFile(fileNameRoot+"r_chao")));
- }else if (Estimators[i] == "ace") {
- if(abund < 5)
- abund = 10;
- rDisplays.push_back(new RareDisplay(new Ace(abund), new ThreeColumnFile(fileNameRoot+"r_ace")));
- }else if (Estimators[i] == "jack") {
- rDisplays.push_back(new RareDisplay(new Jackknife(), new ThreeColumnFile(fileNameRoot+"r_jack")));
- }else if (Estimators[i] == "shannon") {
- rDisplays.push_back(new RareDisplay(new Shannon(), new ThreeColumnFile(fileNameRoot+"r_shannon")));
- }else if (Estimators[i] == "npshannon") {
- rDisplays.push_back(new RareDisplay(new NPShannon(), new ThreeColumnFile(fileNameRoot+"r_npshannon")));
- }else if (Estimators[i] == "simpson") {
- rDisplays.push_back(new RareDisplay(new Simpson(), new ThreeColumnFile(fileNameRoot+"r_simpson")));
- }else if (Estimators[i] == "bootstrap") {
- rDisplays.push_back(new RareDisplay(new Bootstrap(), new ThreeColumnFile(fileNameRoot+"r_bootstrap")));
- }else if (Estimators[i] == "coverage") {
- rDisplays.push_back(new RareDisplay(new Coverage(), new ThreeColumnFile(fileNameRoot+"r_coverage")));
- }else if (Estimators[i] == "nseqs") {
- rDisplays.push_back(new RareDisplay(new NSeqs(), new ThreeColumnFile(fileNameRoot+"r_nseqs")));
- }
- }
- }
- }
-
}
}
//**********************************************************************************************************************
-RareFactCommand::~RareFactCommand(){
-	if (abort == false) {
-		globaldata->gorder = NULL;
-		delete input; globaldata->ginput = NULL;
-		delete read;
-		delete validCalculator;
-	}
-}
+//cleanup of input/read/validCalculator now happens at the end of execute(),
+//once per input file, so the destructor has nothing left to release
+RareFactCommand::~RareFactCommand(){}
//**********************************************************************************************************************
if (abort == true) { return 0; }
- //if the users entered no valid calculators don't execute command
- if (rDisplays.size() == 0) { return 0; }
-
- read = new ReadOTUFile(globaldata->inputFileName);
- read->read(&*globaldata);
-
- order = globaldata->gorder;
- string lastLabel = order->getLabel();
- input = globaldata->ginput;
+ if ((globaldata->getFormat() != "sharedfile")) { inputFileNames.push_back(globaldata->inputFileName); }
+ else { inputFileNames = parseSharedFile(globaldata->getSharedFile()); globaldata->setFormat("rabund"); }
- //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
- set<string> processedLabels;
- set<string> userLabels = labels;
-
- //as long as you are not at the end of the file or done wih the lines you want
- while((order != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
+ for (int p = 0; p < inputFileNames.size(); p++) {
- if(allLines == 1 || labels.count(order->getLabel()) == 1){
+ string fileNameRoot = getRootName(inputFileNames[p]);
+ globaldata->inputFileName = inputFileNames[p];
- rCurve = new Rarefact(order, rDisplays);
- rCurve->getCurve(freq, nIters);
- delete rCurve;
+ if (inputFileNames.size() > 1) {
+ mothurOutEndLine(); mothurOut("Processing group " + groups[p]); mothurOutEndLine(); mothurOutEndLine();
+ }
+ int i;
+ validCalculator = new ValidCalculators();
- mothurOut(order->getLabel()); mothurOutEndLine();
- processedLabels.insert(order->getLabel());
- userLabels.erase(order->getLabel());
+
+ for (i=0; i<Estimators.size(); i++) {
+ if (validCalculator->isValidCalculator("rarefaction", Estimators[i]) == true) {
+ if (Estimators[i] == "sobs") {
+ rDisplays.push_back(new RareDisplay(new Sobs(), new ThreeColumnFile(fileNameRoot+"rarefaction")));
+ }else if (Estimators[i] == "chao") {
+ rDisplays.push_back(new RareDisplay(new Chao1(), new ThreeColumnFile(fileNameRoot+"r_chao")));
+ }else if (Estimators[i] == "ace") {
+ if(abund < 5)
+ abund = 10;
+ rDisplays.push_back(new RareDisplay(new Ace(abund), new ThreeColumnFile(fileNameRoot+"r_ace")));
+ }else if (Estimators[i] == "jack") {
+ rDisplays.push_back(new RareDisplay(new Jackknife(), new ThreeColumnFile(fileNameRoot+"r_jack")));
+ }else if (Estimators[i] == "shannon") {
+ rDisplays.push_back(new RareDisplay(new Shannon(), new ThreeColumnFile(fileNameRoot+"r_shannon")));
+ }else if (Estimators[i] == "npshannon") {
+ rDisplays.push_back(new RareDisplay(new NPShannon(), new ThreeColumnFile(fileNameRoot+"r_npshannon")));
+ }else if (Estimators[i] == "simpson") {
+ rDisplays.push_back(new RareDisplay(new Simpson(), new ThreeColumnFile(fileNameRoot+"r_simpson")));
+ }else if (Estimators[i] == "bootstrap") {
+ rDisplays.push_back(new RareDisplay(new Bootstrap(), new ThreeColumnFile(fileNameRoot+"r_bootstrap")));
+ }else if (Estimators[i] == "coverage") {
+ rDisplays.push_back(new RareDisplay(new Coverage(), new ThreeColumnFile(fileNameRoot+"r_coverage")));
+ }else if (Estimators[i] == "nseqs") {
+ rDisplays.push_back(new RareDisplay(new NSeqs(), new ThreeColumnFile(fileNameRoot+"r_nseqs")));
+ }
+ }
}
- if ((anyLabelsToProcess(order->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
- string saveLabel = order->getLabel();
+
+ //if the users entered no valid calculators don't execute command
+ if (rDisplays.size() == 0) { return 0; }
+
+ read = new ReadOTUFile(globaldata->inputFileName);
+ read->read(&*globaldata);
+
+ order = globaldata->gorder;
+ string lastLabel = order->getLabel();
+ input = globaldata->ginput;
+
+ //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
+ set<string> processedLabels;
+ set<string> userLabels = labels;
+
+		//as long as you are not at the end of the file or done with the lines you want
+ while((order != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
+
+ if(allLines == 1 || labels.count(order->getLabel()) == 1){
+
+ rCurve = new Rarefact(order, rDisplays);
+ rCurve->getCurve(freq, nIters);
+ delete rCurve;
+
+ mothurOut(order->getLabel()); mothurOutEndLine();
+ processedLabels.insert(order->getLabel());
+ userLabels.erase(order->getLabel());
+ }
+
+ if ((anyLabelsToProcess(order->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
+ string saveLabel = order->getLabel();
+
+ delete order;
+ order = (input->getOrderVector(lastLabel));
+
+ rCurve = new Rarefact(order, rDisplays);
+ rCurve->getCurve(freq, nIters);
+ delete rCurve;
+
+ mothurOut(order->getLabel()); mothurOutEndLine();
+ processedLabels.insert(order->getLabel());
+ userLabels.erase(order->getLabel());
+
+ //restore real lastlabel to save below
+ order->setLabel(saveLabel);
+ }
+
+ lastLabel = order->getLabel();
delete order;
+ order = (input->getOrderVector());
+ }
+
+ //output error messages about any remaining user labels
+ set<string>::iterator it;
+ bool needToRun = false;
+ for (it = userLabels.begin(); it != userLabels.end(); it++) {
+ mothurOut("Your file does not include the label " + *it);
+ if (processedLabels.count(lastLabel) != 1) {
+ mothurOut(". I will use " + lastLabel + "."); mothurOutEndLine();
+ needToRun = true;
+ }else {
+ mothurOut(". Please refer to " + lastLabel + "."); mothurOutEndLine();
+ }
+ }
+
+ //run last label if you need to
+ if (needToRun == true) {
+ if (order != NULL) { delete order; }
order = (input->getOrderVector(lastLabel));
rCurve = new Rarefact(order, rDisplays);
rCurve->getCurve(freq, nIters);
delete rCurve;
-
- mothurOut(order->getLabel()); mothurOutEndLine();
- processedLabels.insert(order->getLabel());
- userLabels.erase(order->getLabel());
- //restore real lastlabel to save below
- order->setLabel(saveLabel);
+ mothurOut(order->getLabel()); mothurOutEndLine();
+ delete order;
}
- lastLabel = order->getLabel();
- delete order;
- order = (input->getOrderVector());
+ for(int i=0;i<rDisplays.size();i++){ delete rDisplays[i]; }
+ rDisplays.clear();
+ globaldata->gorder = NULL;
+ delete input; globaldata->ginput = NULL;
+ delete read;
+ delete validCalculator;
+
}
- //output error messages about any remaining user labels
- set<string>::iterator it;
- bool needToRun = false;
- for (it = userLabels.begin(); it != userLabels.end(); it++) {
- mothurOut("Your file does not include the label " + *it);
- if (processedLabels.count(lastLabel) != 1) {
- mothurOut(". I will use " + lastLabel + "."); mothurOutEndLine();
- needToRun = true;
- }else {
- mothurOut(". Please refer to " + lastLabel + "."); mothurOutEndLine();
- }
- }
+ return 0;
+ }
+ catch(exception& e) {
+ errorOut(e, "RareFactCommand", "execute");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
+//Splits a shared file into one rabund file per group (sharedFileRoot +
+//group + ".rabund"), records each group name in this->groups, and returns
+//the list of files written.  Pre-existing per-group files are removed first.
+//NOTE(review): lookup[0] is dereferenced without an empty check -- assumes
+//the shared file contains at least one group; confirm upstream guarantees.
+vector<string> RareFactCommand::parseSharedFile(string filename) {
+	try {
+		vector<string> filenames;
+
+		map<string, ofstream*> filehandles;
+		map<string, ofstream*>::iterator it3;
-		//run last label if you need to
-		if (needToRun == true) {
-			if (order != NULL) { delete order; }
-			order = (input->getOrderVector(lastLabel));
-			rCurve = new Rarefact(order, rDisplays);
-			rCurve->getCurve(freq, nIters);
-			delete rCurve;
+		//read first line
+		read = new ReadOTUFile(filename);
+		read->read(&*globaldata);
-			mothurOut(order->getLabel()); mothurOutEndLine();
-			delete order;
+		input = globaldata->ginput;
+		vector<SharedRAbundVector*> lookup = input->getSharedRAbundVectors();
+
+		string sharedFileRoot = getRootName(filename);
+
+		//clears file before we start to write to it below
+		for (int i=0; i<lookup.size(); i++) {
+			remove((sharedFileRoot + lookup[i]->getGroup() + ".rabund").c_str());
+			filenames.push_back((sharedFileRoot + lookup[i]->getGroup() + ".rabund"));
		}
+		ofstream* temp;
+		for (int i=0; i<lookup.size(); i++) {
+			temp = new ofstream;
+			filehandles[lookup[i]->getGroup()] = temp;
+			groups.push_back(lookup[i]->getGroup());
+		}
-		for(int i=0;i<rDisplays.size();i++){ delete rDisplays[i]; }
-		return 0;
+		//append each label's rabund vector to its group's file
+		while(lookup[0] != NULL) {
+
+			for (int i = 0; i < lookup.size(); i++) {
+				RAbundVector rav = lookup[i]->getRAbundVector();
+				openOutputFileAppend(sharedFileRoot + lookup[i]->getGroup() + ".rabund", *(filehandles[lookup[i]->getGroup()]));
+				rav.print(*(filehandles[lookup[i]->getGroup()]));
+				(*(filehandles[lookup[i]->getGroup()])).close();
+			}
+
+			for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
+			lookup = input->getSharedRAbundVectors();
+		}
+
+		//free memory
+		for (it3 = filehandles.begin(); it3 != filehandles.end(); it3++) {
+			delete it3->second;
+		}
+		delete read;
+		delete input;
+		globaldata->ginput = NULL;
+
+		return filenames;
	}
	catch(exception& e) {
-		errorOut(e, "RareFactCommand", "execute");
+		errorOut(e, "RareFactCommand", "parseSharedFile");
		exit(1);
	}
}
-
//**********************************************************************************************************************
+
+
+
set<string> labels; //holds labels to be used
string label, calc;
vector<string> Estimators;
+ vector<string> inputFileNames;
+ vector<string> groups;
+
+ vector<string> parseSharedFile(string);
+
};
Estimators.clear();
//allow user to run help
- if(option == "help") { validCalculator = new ValidCalculators(); help(); abort = true; }
+ if(option == "help") { validCalculator = new ValidCalculators(); help(); delete validCalculator; abort = true; }
else {
//valid paramters for this command
}
//make sure the user has already run the read.otu command
- if ((globaldata->getListFile() == "") && (globaldata->getRabundFile() == "") && (globaldata->getSabundFile() == "")) { mothurOut("You must read a list, sabund or rabund before you can use the summary.single command."); mothurOutEndLine(); abort = true; }
-
+ if ((globaldata->getSharedFile() == "") && (globaldata->getListFile() == "") && (globaldata->getRabundFile() == "") && (globaldata->getSabundFile() == "")) { mothurOut("You must read a list, sabund, rabund or shared file before you can use the summary.single command."); mothurOutEndLine(); abort = true; }
+
//check for optional parameter and set defaults
// ...at some point should added some additional type checking...
label = validParameter.validFile(parameters, "label", false);
temp = validParameter.validFile(parameters, "size", false); if (temp == "not found") { temp = "0"; }
convert(temp, size);
- if (abort == false) {
-
- validCalculator = new ValidCalculators();
- int i;
-
- for (i=0; i<Estimators.size(); i++) {
- if (validCalculator->isValidCalculator("summary", Estimators[i]) == true) {
- if(Estimators[i] == "sobs"){
- sumCalculators.push_back(new Sobs());
- }else if(Estimators[i] == "chao"){
- sumCalculators.push_back(new Chao1());
- }else if(Estimators[i] == "coverage"){
- sumCalculators.push_back(new Coverage());
- }else if(Estimators[i] == "geometric"){
- sumCalculators.push_back(new Geom());
- }else if(Estimators[i] == "logseries"){
- sumCalculators.push_back(new LogSD());
- }else if(Estimators[i] == "qstat"){
- sumCalculators.push_back(new QStat());
- }else if(Estimators[i] == "bergerparker"){
- sumCalculators.push_back(new BergerParker());
- }else if(Estimators[i] == "bstick"){
- sumCalculators.push_back(new BStick());
- }else if(Estimators[i] == "ace"){
- if(abund < 5)
- abund = 10;
- sumCalculators.push_back(new Ace(abund));
- }else if(Estimators[i] == "jack"){
- sumCalculators.push_back(new Jackknife());
- }else if(Estimators[i] == "shannon"){
- sumCalculators.push_back(new Shannon());
- }else if(Estimators[i] == "npshannon"){
- sumCalculators.push_back(new NPShannon());
- }else if(Estimators[i] == "simpson"){
- sumCalculators.push_back(new Simpson());
- }else if(Estimators[i] == "bootstrap"){
- sumCalculators.push_back(new Bootstrap());
- }else if (Estimators[i] == "nseqs") {
- sumCalculators.push_back(new NSeqs());
- }else if (Estimators[i] == "goodscoverage") {
- sumCalculators.push_back(new GoodsCoverage());
- }else if (Estimators[i] == "efron") {
- sumCalculators.push_back(new Efron(size));
- }else if (Estimators[i] == "boneh") {
- sumCalculators.push_back(new Boneh(size));
- }else if (Estimators[i] == "solow") {
- sumCalculators.push_back(new Solow(size));
- }else if (Estimators[i] == "shen") {
- sumCalculators.push_back(new Shen(size, abund));
- }
- }
- }
- }
}
-
-
}
catch(exception& e) {
errorOut(e, "SummaryCommand", "SummaryCommand");
//**********************************************************************************************************************
-SummaryCommand::~SummaryCommand(){
-	if (abort == false) {
-		delete input; globaldata->ginput = NULL;
-		delete read;
-		delete validCalculator;
-		globaldata->sabund = NULL;
-	}
-}
+//cleanup of input/read/validCalculator now happens at the end of execute(),
+//once per input file, so the destructor has nothing left to release
+SummaryCommand::~SummaryCommand(){}
//**********************************************************************************************************************
if (abort == true) { return 0; }
- //if the users entered no valid calculators don't execute command
- if (sumCalculators.size() == 0) { return 0; }
-
- outputFileName = ((getRootName(globaldata->inputFileName)) + "summary");
- openOutputFile(outputFileName, outputFileHandle);
- outputFileHandle << "label";
-
- read = new ReadOTUFile(globaldata->inputFileName);
- read->read(&*globaldata);
-
- sabund = globaldata->sabund;
- string lastLabel = sabund->getLabel();
- input = globaldata->ginput;
+ if ((globaldata->getFormat() != "sharedfile")) { inputFileNames.push_back(globaldata->inputFileName); }
+ else { inputFileNames = parseSharedFile(globaldata->getSharedFile()); globaldata->setFormat("rabund"); }
- for(int i=0;i<sumCalculators.size();i++){
- if(sumCalculators[i]->getCols() == 1){
- outputFileHandle << '\t' << sumCalculators[i]->getName();
+ for (int p = 0; p < inputFileNames.size(); p++) {
+
+ string fileNameRoot = getRootName(inputFileNames[p]) + "summary";
+ globaldata->inputFileName = inputFileNames[p];
+
+ if (inputFileNames.size() > 1) {
+ mothurOutEndLine(); mothurOut("Processing group " + groups[p]); mothurOutEndLine(); mothurOutEndLine();
}
- else{
- outputFileHandle << '\t' << sumCalculators[i]->getName() << "\t" << sumCalculators[i]->getName() << "_lci\t" << sumCalculators[i]->getName() << "_hci";
+
+
+ validCalculator = new ValidCalculators();
+
+ for (int i=0; i<Estimators.size(); i++) {
+ if (validCalculator->isValidCalculator("summary", Estimators[i]) == true) {
+ if(Estimators[i] == "sobs"){
+ sumCalculators.push_back(new Sobs());
+ }else if(Estimators[i] == "chao"){
+ sumCalculators.push_back(new Chao1());
+ }else if(Estimators[i] == "coverage"){
+ sumCalculators.push_back(new Coverage());
+ }else if(Estimators[i] == "geometric"){
+ sumCalculators.push_back(new Geom());
+ }else if(Estimators[i] == "logseries"){
+ sumCalculators.push_back(new LogSD());
+ }else if(Estimators[i] == "qstat"){
+ sumCalculators.push_back(new QStat());
+ }else if(Estimators[i] == "bergerparker"){
+ sumCalculators.push_back(new BergerParker());
+ }else if(Estimators[i] == "bstick"){
+ sumCalculators.push_back(new BStick());
+ }else if(Estimators[i] == "ace"){
+ if(abund < 5)
+ abund = 10;
+ sumCalculators.push_back(new Ace(abund));
+ }else if(Estimators[i] == "jack"){
+ sumCalculators.push_back(new Jackknife());
+ }else if(Estimators[i] == "shannon"){
+ sumCalculators.push_back(new Shannon());
+ }else if(Estimators[i] == "npshannon"){
+ sumCalculators.push_back(new NPShannon());
+ }else if(Estimators[i] == "simpson"){
+ sumCalculators.push_back(new Simpson());
+ }else if(Estimators[i] == "bootstrap"){
+ sumCalculators.push_back(new Bootstrap());
+ }else if (Estimators[i] == "nseqs") {
+ sumCalculators.push_back(new NSeqs());
+ }else if (Estimators[i] == "goodscoverage") {
+ sumCalculators.push_back(new GoodsCoverage());
+ }else if (Estimators[i] == "efron") {
+ sumCalculators.push_back(new Efron(size));
+ }else if (Estimators[i] == "boneh") {
+ sumCalculators.push_back(new Boneh(size));
+ }else if (Estimators[i] == "solow") {
+ sumCalculators.push_back(new Solow(size));
+ }else if (Estimators[i] == "shen") {
+ sumCalculators.push_back(new Shen(size, abund));
+ }
+ }
}
- }
- outputFileHandle << endl;
-
- //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
- set<string> processedLabels;
- set<string> userLabels = labels;
- while((sabund != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
+ //if the users entered no valid calculators don't execute command
+ if (sumCalculators.size() == 0) { return 0; }
- if(allLines == 1 || labels.count(sabund->getLabel()) == 1){
-
- mothurOut(sabund->getLabel()); mothurOutEndLine();
- processedLabels.insert(sabund->getLabel());
- userLabels.erase(sabund->getLabel());
-
- outputFileHandle << sabund->getLabel();
- for(int i=0;i<sumCalculators.size();i++){
- vector<double> data = sumCalculators[i]->getValues(sabund);
- outputFileHandle << '\t';
- sumCalculators[i]->print(outputFileHandle);
+ ofstream outputFileHandle;
+ openOutputFile(fileNameRoot, outputFileHandle);
+ outputFileHandle << "label";
+
+ read = new ReadOTUFile(globaldata->inputFileName);
+ read->read(&*globaldata);
+
+ sabund = globaldata->sabund;
+ string lastLabel = sabund->getLabel();
+ input = globaldata->ginput;
+
+ for(int i=0;i<sumCalculators.size();i++){
+ if(sumCalculators[i]->getCols() == 1){
+ outputFileHandle << '\t' << sumCalculators[i]->getName();
+ }
+ else{
+ outputFileHandle << '\t' << sumCalculators[i]->getName() << "\t" << sumCalculators[i]->getName() << "_lci\t" << sumCalculators[i]->getName() << "_hci";
}
- outputFileHandle << endl;
}
+ outputFileHandle << endl;
+
+ //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
+ set<string> processedLabels;
+ set<string> userLabels = labels;
- if ((anyLabelsToProcess(sabund->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
- string saveLabel = sabund->getLabel();
+ while((sabund != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
+
+ if(allLines == 1 || labels.count(sabund->getLabel()) == 1){
+
+ mothurOut(sabund->getLabel()); mothurOutEndLine();
+ processedLabels.insert(sabund->getLabel());
+ userLabels.erase(sabund->getLabel());
+
+ outputFileHandle << sabund->getLabel();
+ for(int i=0;i<sumCalculators.size();i++){
+ vector<double> data = sumCalculators[i]->getValues(sabund);
+ outputFileHandle << '\t';
+ sumCalculators[i]->print(outputFileHandle);
+ }
+ outputFileHandle << endl;
+ }
+
+ if ((anyLabelsToProcess(sabund->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
+ string saveLabel = sabund->getLabel();
+
+ delete sabund;
+ sabund = input->getSAbundVector(lastLabel);
+
+ mothurOut(sabund->getLabel()); mothurOutEndLine();
+ processedLabels.insert(sabund->getLabel());
+ userLabels.erase(sabund->getLabel());
+
+ outputFileHandle << sabund->getLabel();
+ for(int i=0;i<sumCalculators.size();i++){
+ vector<double> data = sumCalculators[i]->getValues(sabund);
+ outputFileHandle << '\t';
+ sumCalculators[i]->print(outputFileHandle);
+ }
+ outputFileHandle << endl;
+
+ //restore real lastlabel to save below
+ sabund->setLabel(saveLabel);
+ }
+
+ lastLabel = sabund->getLabel();
delete sabund;
+ sabund = input->getSAbundVector();
+ }
+
+ //output error messages about any remaining user labels
+ set<string>::iterator it;
+ bool needToRun = false;
+ for (it = userLabels.begin(); it != userLabels.end(); it++) {
+ mothurOut("Your file does not include the label " + *it);
+ if (processedLabels.count(lastLabel) != 1) {
+ mothurOut(". I will use " + lastLabel + "."); mothurOutEndLine();
+ needToRun = true;
+ }else {
+ mothurOut(". Please refer to " + lastLabel + "."); mothurOutEndLine();
+ }
+ }
+
+ //run last label if you need to
+ if (needToRun == true) {
+ if (sabund != NULL) { delete sabund; }
sabund = input->getSAbundVector(lastLabel);
mothurOut(sabund->getLabel()); mothurOutEndLine();
- processedLabels.insert(sabund->getLabel());
- userLabels.erase(sabund->getLabel());
-
outputFileHandle << sabund->getLabel();
for(int i=0;i<sumCalculators.size();i++){
vector<double> data = sumCalculators[i]->getValues(sabund);
sumCalculators[i]->print(outputFileHandle);
}
outputFileHandle << endl;
+ delete sabund;
+ }
+
+ outputFileHandle.close();
+
+ delete input; globaldata->ginput = NULL;
+ delete read;
+ delete validCalculator;
+ globaldata->sabund = NULL;
+ }
+
+ return 0;
+ }
+ catch(exception& e) {
+ errorOut(e, "SummaryCommand", "execute");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
+//Splits a shared file into one rabund file per group (sharedFileRoot +
+//group + ".rabund"), records each group name in this->groups, and returns
+//the list of files written.  Pre-existing per-group files are removed first.
+//NOTE(review): lookup[0] is dereferenced without an empty check -- assumes
+//the shared file contains at least one group; confirm upstream guarantees.
+vector<string> SummaryCommand::parseSharedFile(string filename) {
+	try {
+		vector<string> filenames;
+
+		map<string, ofstream*> filehandles;
+		map<string, ofstream*>::iterator it3;
+
-			//restore real lastlabel to save below
-			sabund->setLabel(saveLabel);
-		}
-
-		lastLabel = sabund->getLabel();
+		//read first line
+		read = new ReadOTUFile(filename);
+		read->read(&*globaldata);
-		delete sabund;
-		sabund = input->getSAbundVector();
+		input = globaldata->ginput;
+		vector<SharedRAbundVector*> lookup = input->getSharedRAbundVectors();
+
+		string sharedFileRoot = getRootName(filename);
+
+		//clears file before we start to write to it below
+		for (int i=0; i<lookup.size(); i++) {
+			remove((sharedFileRoot + lookup[i]->getGroup() + ".rabund").c_str());
+			filenames.push_back((sharedFileRoot + lookup[i]->getGroup() + ".rabund"));
		}
-		//output error messages about any remaining user labels
-		set<string>::iterator it;
-		bool needToRun = false;
-		for (it = userLabels.begin(); it != userLabels.end(); it++) {
-			mothurOut("Your file does not include the label " + *it);
-			if (processedLabels.count(lastLabel) != 1) {
-				mothurOut(". I will use " + lastLabel + "."); mothurOutEndLine();
-				needToRun = true;
-			}else {
-				mothurOut(". Please refer to " + lastLabel + "."); mothurOutEndLine();
-			}
+		ofstream* temp;
+		for (int i=0; i<lookup.size(); i++) {
+			temp = new ofstream;
+			filehandles[lookup[i]->getGroup()] = temp;
+			groups.push_back(lookup[i]->getGroup());
		}
+
+		//append each label's rabund vector to its group's file
+		while(lookup[0] != NULL) {
-		//run last label if you need to
-		if (needToRun == true) {
-			if (sabund != NULL) { delete sabund; }
-			sabund = input->getSAbundVector(lastLabel);
-
-			mothurOut(sabund->getLabel()); mothurOutEndLine();
-			outputFileHandle << sabund->getLabel();
-			for(int i=0;i<sumCalculators.size();i++){
-				vector<double> data = sumCalculators[i]->getValues(sabund);
-				outputFileHandle << '\t';
-				sumCalculators[i]->print(outputFileHandle);
+			for (int i = 0; i < lookup.size(); i++) {
+				RAbundVector rav = lookup[i]->getRAbundVector();
+				openOutputFileAppend(sharedFileRoot + lookup[i]->getGroup() + ".rabund", *(filehandles[lookup[i]->getGroup()]));
+				rav.print(*(filehandles[lookup[i]->getGroup()]));
+				(*(filehandles[lookup[i]->getGroup()])).close();
			}
-			outputFileHandle << endl;
-			delete sabund;
-		}
-		outputFileHandle.close();
+			for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
+			lookup = input->getSharedRAbundVectors();
+		}
-		return 0;
+		//free memory
+		for (it3 = filehandles.begin(); it3 != filehandles.end(); it3++) {
+			delete it3->second;
+		}
+		delete read;
+		delete input;
+		globaldata->ginput = NULL;
+
+		return filenames;
	}
	catch(exception& e) {
-		errorOut(e, "SummaryCommand", "execute");
+		errorOut(e, "SummaryCommand", "parseSharedFile");
		exit(1);
	}
}
-
//**********************************************************************************************************************
InputData* input;
ValidCalculators* validCalculator;
SAbundVector* sabund;
- string outputFileName;
- ofstream outputFileHandle;
int abund, size;
bool abort, allLines;
set<string> labels; //holds labels to be used
string label, calc;
vector<string> Estimators;
+ vector<string> inputFileNames;
+ vector<string> groups;
+
+ vector<string> parseSharedFile(string);
+
};
#endif
//output to file
for (int r=0; r<globaldata->Groups.size(); r++) {
//output name
- out << globaldata->Groups[r] << '\t';
+ string name = globaldata->Groups[r];
+ if (name.length() < 10) { //pad with spaces to make compatible
+ while (name.length() < 10) { name += " "; }
+ }
+ out << name << '\t';
//output distances
for (int l = 0; l < r; l++) { out << dists[r][l] << '\t'; }
//output to file
for (int r=0; r<globaldata->Groups.size(); r++) {
//output name
- out << globaldata->Groups[r] << '\t';
+ string name = globaldata->Groups[r];
+ if (name.length() < 10) { //pad with spaces to make compatible
+ while (name.length() < 10) { name += " "; }
+ }
+ out << name << '\t';
//output distances
for (int l = 0; l < r; l++) { out << dists[r][l] << '\t'; }