#include "getoturepcommand.h"
//**********************************************************************************************************************
-GetOTURepCommand::GetOTURepCommand(){
+GetOTURepCommand::GetOTURepCommand(string option){
try{
globaldata = GlobalData::getInstance();
-
- if(globaldata->gSparseMatrix != NULL) { matrix = new SparseMatrix(*globaldata->gSparseMatrix); }
+ abort = false;
+ allLines = 1;
+ labels.clear();
- //listOfNames bin 0 = first name read in distance matrix, listOfNames bin 1 = second name read in distance matrix
- if(globaldata->gListVector != NULL) {
- listOfNames = new ListVector(*globaldata->gListVector);
+ //allow user to run help
+ if (option == "help") {
+ help(); abort = true;
+ } else {
+ //valid paramters for this command
+ string Array[] = {"fasta","list","label","name", "group"};
+ vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
- vector<string> names;
- string binnames;
- //map names to rows in sparsematrix
- for (int i = 0; i < listOfNames->size(); i++) {
- names.clear();
- binnames = listOfNames->get(i);
- splitAtComma(binnames, names);
-
- for (int j = 0; j < names.size(); j++) {
- nameToIndex[names[j]] = i;
- }
- }
- }else { cout << "error, no listvector." << endl; }
-
+ OptionParser parser(option);
+ map<string, string> parameters = parser.getParameters();
+
+ ValidParameters validParameter;
- fastafile = globaldata->getFastaFile();
- namesfile = globaldata->getNameFile();
- groupfile = globaldata->getGroupFile();
+ //check to make sure all parameters are valid for command
+ for (map<string, string>::iterator it = parameters.begin(); it != parameters.end(); it++) {
+ if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
+ }
+
+ //make sure the user has already run the read.otu command
+ if ((globaldata->gSparseMatrix == NULL) || (globaldata->gListVector == NULL)) {
+ mothurOut("Before you use the get.oturep command, you first need to read in a distance matrix."); mothurOutEndLine();
+ abort = true;
+ }
+
+ //check for required parameters
+ fastafile = validParameter.validFile(parameters, "fasta", true);
+ if (fastafile == "not found") { mothurOut("fasta is a required parameter for the get.oturep command."); mothurOutEndLine(); abort = true; }
+ else if (fastafile == "not open") { abort = true; }
- if (groupfile != "") {
- //read in group map info.
- groupMap = new GroupMap(groupfile);
- groupMap->readMap();
- }
+ listfile = validParameter.validFile(parameters, "list", true);
+ if (listfile == "not found") { mothurOut("list is a required parameter for the get.oturep command."); mothurOutEndLine(); abort = true; }
+ else if (listfile == "not open") { abort = true; }
+
+ //check for optional parameter and set defaults
+ // ...at some point should added some additional type checking...
+ label = validParameter.validFile(parameters, "label", false);
+ if (label == "not found") { label = ""; }
+ else {
+ if(label != "all") { splitAtDash(label, labels); allLines = 0; }
+ else { allLines = 1; }
+ }
+
+ //if the user has not specified any labels use the ones from read.otu
+ if (label == "") {
+ allLines = globaldata->allLines;
+ labels = globaldata->labels;
+ }
+
+ namesfile = validParameter.validFile(parameters, "name", true);
+ if (namesfile == "not open") { abort = true; }
+ else if (namesfile == "not found") { namesfile = ""; }
- openInputFile(fastafile, in);
+ groupfile = validParameter.validFile(parameters, "group", true);
+ if (groupfile == "not open") { abort = true; }
+ else if (groupfile == "not found") { groupfile = ""; }
+ else {
+ //read in group map info.
+ groupMap = new GroupMap(groupfile);
+ groupMap->readMap();
+ }
- fasta = new FastaMap();
+ if (abort == false) {
+
+ if(globaldata->gSparseMatrix != NULL) {
+ matrix = globaldata->gSparseMatrix;
+ }
+
+ //globaldata->gListVector bin 0 = first name read in distance matrix, globaldata->gListVector bin 1 = second name read in distance matrix
+ if (globaldata->gListVector != NULL) {
+ vector<string> names;
+ string binnames;
+ //map names to rows in sparsematrix
+ for (int i = 0; i < globaldata->gListVector->size(); i++) {
+ names.clear();
+ binnames = globaldata->gListVector->get(i);
+
+ splitAtComma(binnames, names);
+
+ for (int j = 0; j < names.size(); j++) {
+ nameToIndex[names[j]] = i;
+ }
+ }
+ } else { mothurOut("error, no listvector."); mothurOutEndLine(); }
+ // Create a data structure to quickly access the distance information.
+ // It consists of a vector of distance maps, where each map contains
+ // all distances of a certain sequence. Vector and maps are accessed
+ // via the index of a sequence in the distance matrix
+ seqVec = vector<SeqMap>(globaldata->gListVector->size());
+ for (MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++) {
+ seqVec[currentCell->row][currentCell->column] = currentCell->dist;
+ }
+
+ fasta = new FastaMap();
+ }
+ }
}
catch(exception& e) {
- cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function GetOTURepCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
+ errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
exit(1);
}
- catch(...) {
- cout << "An unknown error has occurred in the GetOTURepCommand class function GetOTURepCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
+}
+
+//**********************************************************************************************************************
+
+void GetOTURepCommand::help(){
+ try {
+ mothurOut("The get.oturep command can only be executed after a successful read.dist command.\n");
+ mothurOut("The get.oturep command parameters are list, fasta, name, group and label. The fasta and list parameters are required.\n");
+ mothurOut("The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n");
+ mothurOut("The get.oturep command should be in the following format: get.oturep(fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n");
+ mothurOut("Example get.oturep(fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups, name=amazon.names).\n");
+ mothurOut("The default value for label is all labels in your inputfile.\n");
+ mothurOut("The get.oturep command outputs a .fastarep file for each distance you specify, selecting one OTU representative for each bin.\n");
+ mothurOut("If you provide a groupfile, then it also appends the names of the groups present in that bin.\n");
+ mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
+ }
+ catch(exception& e) {
+ errorOut(e, "GetOTURepCommand", "help");
exit(1);
}
}
+
//**********************************************************************************************************************
GetOTURepCommand::~GetOTURepCommand(){
- delete input;
- delete read;
- delete fasta;
- if (groupfile != "") {
- delete groupMap;
+ if (abort == false) {
+ delete input; globaldata->ginput = NULL;
+ delete read;
+ delete fasta;
+ if (groupfile != "") {
+ delete groupMap; globaldata->gGroupmap = NULL;
+ }
}
}
int GetOTURepCommand::execute(){
try {
- int count = 1;
+
+ if (abort == true) { return 0; }
+
int error;
//read fastafile
- fasta->readFastaFile(in);
+ fasta->readFastaFile(fastafile);
+
+ in.close();
//set format to list so input can get listvector
globaldata->setFormat("list");
}
//read list file
- read = new ReadOTUFile(globaldata->getListFile());
+ read = new ReadOTUFile(listfile);
read->read(&*globaldata);
input = globaldata->ginput;
list = globaldata->gListVector;
- ListVector* lastList = list;
+ string lastLabel = list->getLabel();
//if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
set<string> processedLabels;
- set<string> userLabels = globaldata->labels;
- set<int> userLines = globaldata->lines;
-
+ set<string> userLabels = labels;
- while((list != NULL) && ((globaldata->allLines == 1) || (userLabels.size() != 0) || (userLines.size() != 0))) {
+ while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
- if(globaldata->allLines == 1 || globaldata->lines.count(count) == 1 || globaldata->labels.count(list->getLabel()) == 1){
- cout << list->getLabel() << '\t' << count << endl;
+ if (allLines == 1 || labels.count(list->getLabel()) == 1){
+ mothurOut(list->getLabel() + "\t" + toString(list->size())); mothurOutEndLine();
error = process(list);
if (error == 1) { return 0; } //there is an error in hte input files, abort command
processedLabels.insert(list->getLabel());
userLabels.erase(list->getLabel());
- userLines.erase(count);
}
- if ((anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastList->getLabel()) != 1)) {
- cout << lastList->getLabel() << '\t' << count << endl;
- error = process(lastList);
+ if ((anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
+ delete list;
+ list = input->getListVector(lastLabel);
+ mothurOut(list->getLabel() + "\t" + toString(list->size())); mothurOutEndLine();
+ error = process(list);
if (error == 1) { return 0; } //there is an error in hte input files, abort command
- processedLabels.insert(lastList->getLabel());
- userLabels.erase(lastList->getLabel());
+ processedLabels.insert(list->getLabel());
+ userLabels.erase(list->getLabel());
}
- if (count != 1) { delete lastList; }
- lastList = list;
+ lastLabel = list->getLabel();
+ delete list;
list = input->getListVector();
- count++;
}
//output error messages about any remaining user labels
- set<string>::iterator it;
bool needToRun = false;
- for (it = userLabels.begin(); it != userLabels.end(); it++) {
- cout << "Your file does not include the label "<< *it;
- if (processedLabels.count(lastList->getLabel()) != 1) {
- cout << ". I will use " << lastList->getLabel() << "." << endl;
+ for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {
+ mothurOut("Your file does not include the label " + *it);
+ if (processedLabels.count(list->getLabel()) != 1) {
+ mothurOut(". I will use " + lastLabel + "."); mothurOutEndLine();
needToRun = true;
}else {
- cout << ". Please refer to " << lastList->getLabel() << "." << endl;
+ mothurOut(". Please refer to " + lastLabel + "."); mothurOutEndLine();
}
}
- //run last line if you need to
+ //run last label if you need to
if (needToRun == true) {
- cout << lastList->getLabel() << '\t' << count << endl;
- error = process(lastList);
+ if (list != NULL) { delete list; }
+ list = input->getListVector(lastLabel);
+ mothurOut(list->getLabel() + "\t" + toString(list->size())); mothurOutEndLine();
+ error = process(list);
+ delete list;
if (error == 1) { return 0; } //there is an error in hte input files, abort command
}
- delete lastList;
delete matrix;
globaldata->gSparseMatrix = NULL;
return 0;
}
catch(exception& e) {
- cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
- exit(1);
- }
- catch(...) {
- cout << "An unknown error has occurred in the GetOTURepCommand class function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
+ errorOut(e, "GetOTURepCommand", "execute");
exit(1);
}
-
}
//**********************************************************************************************************************
}
catch(exception& e) {
- cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function readNamesFile. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
+ errorOut(e, "GetOTURepCommand", "readNamesFile");
exit(1);
}
- catch(...) {
- cout << "An unknown error has occurred in the GetOTURepCommand class function readNamesFile. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
- exit(1);
- }
}
//**********************************************************************************************************************
-string GetOTURepCommand::FindRep(int bin, string& group, ListVector* thisList) {
+string GetOTURepCommand::findRep(int bin, string& group, ListVector* thisList, int& binsize) {
try{
vector<string> names;
- map<string, float> sums;
- map<string, float>::iterator it4;
- map<int, string> binMap; //subset of namesToIndex - just member of this bin
- string binnames;
- float min = 10000;
- string minName;
map<string, string> groups;
map<string, string>::iterator groupIt;
-
- binnames = thisList->get(bin);
-
+
//parse names into vector
+ string binnames = thisList->get(bin);
splitAtComma(binnames, names);
-
+ binsize = names.size();
+
//if you have a groupfile
- if(groupfile != "") {
+ if (groupfile != "") {
//find the groups that are in this bin
- for (int i = 0; i < names.size(); i++) {
+ for (size_t i = 0; i < names.size(); i++) {
string groupName = groupMap->getGroup(names[i]);
if (groupName == "not found") {
- cout << names[i] << " is missing from your group file. Please correct. " << endl;
+ mothurOut(names[i] + " is missing from your group file. Please correct. "); mothurOutEndLine();
groupError = true;
- }else{
+ } else {
groups[groupName] = groupName;
}
}
//turn the groups into a string
- for(groupIt = groups.begin(); groupIt != groups.end(); groupIt++) { group += groupIt->first + "-"; }
-
+ for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) {
+ group += groupIt->first + "-";
+ }
//rip off last dash
group = group.substr(0, group.length()-1);
}
-
- //if only 1 sequence in bin then that's the rep
- if (names.size() == 1) { return names[0]; }
- else {
- //fill binMap
- for (int i = 0; i < names.size(); i++) {
- for (it3 = nameToIndex.begin(); it3 != nameToIndex.end(); it3++) {
- if (it3->first == names[i]) {
- binMap[it3->second] = it3->first;
+ // if only 1 sequence in bin or processing the "unique" label, then
+ // the first sequence of the OTU is the representative one
+ if ((names.size() == 1) || (list->getLabel() == "unique")) {
+ return names[0];
+ } else {
+ vector<int> seqIndex(names.size());
+ vector<float> max_dist(names.size());
- //initialize sums map
- sums[it3->first] = 0.0;
- break;
- }
- }
+ //fill seqIndex and initialize sums
+ for (size_t i = 0; i < names.size(); i++) {
+ seqIndex[i] = nameToIndex[names[i]];
+ max_dist[i] = 0.0;
}
- //go through each cell in the sparsematrix
- for(MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++){
- //is this a distance between 2 members of this bin
- it = binMap.find(currentCell->row);
- it2 = binMap.find(currentCell->column);
-
- //sum the distance of the sequences in the bin to eachother
- if ((it != binMap.end()) && (it2 != binMap.end())) {
- //this is a cell that repesents the distance between to of this bins members
- sums[it->second] += currentCell->dist;
- sums[it2->second] += currentCell->dist;
+ // loop through all entries in seqIndex
+ SeqMap::iterator it;
+ SeqMap currMap;
+ for (size_t i=0; i < seqIndex.size(); i++) {
+ currMap = seqVec[seqIndex[i]];
+ for (size_t j=0; j < seqIndex.size(); j++) {
+ it = currMap.find(seqIndex[j]);
+ if (it != currMap.end()) {
+ max_dist[i] = max(max_dist[i], it->second);
+ max_dist[j] = max(max_dist[j], it->second);
+ }
}
}
- //smallest sum is the representative
- for (it4 = sums.begin(); it4 != sums.end(); it4++) {
- if (it4->second < min) {
- min = it4->second;
- minName = it4->first;
+ // sequence with the smallest maximum distance is the representative
+ float min = 10000;
+ int minIndex;
+ for (size_t i=0; i < max_dist.size(); i++) {
+ if (max_dist[i] < min) {
+ min = max_dist[i];
+ minIndex = i;
}
-
}
-
- return minName;
+
+ return(names[minIndex]);
}
-
}
catch(exception& e) {
- cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function FindRep. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
+ errorOut(e, "GetOTURepCommand", "FindRep");
exit(1);
}
- catch(...) {
- cout << "An unknown error has occurred in the GetOTURepCommand class function FindRep. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
- exit(1);
- }
}
//**********************************************************************************************************************
int GetOTURepCommand::process(ListVector* processList) {
try{
- string nameRep, name, sequence;
+ string nameRep, name, sequence;
- //create output file
- string outputFileName = getRootName(globaldata->getListFile()) + processList->getLabel() + ".rep.fasta";
- openOutputFile(outputFileName, out);
-
- //for each bin in the list vector
- for (int i = 0; i < processList->size(); i++) {
- string groups;
- nameRep = FindRep(i, groups, processList);
-
- //print out name and sequence for that bin
- sequence = fasta->getSequence(nameRep);
+ //create output file
+ string outputFileName = getRootName(listfile) + processList->getLabel() + ".rep.fasta";
+ openOutputFile(outputFileName, out);
- if (sequence != "not found") {
- if (groupfile == "") {
- nameRep = nameRep + "|" + toString(i+1);
- out << ">" << nameRep << endl;
- out << sequence << endl;
- }else {
- nameRep = nameRep + "|" + groups + "|" + toString(i+1);
- out << ">" << nameRep << endl;
- out << sequence << endl;
- }
- }else {
- cout << nameRep << " is missing from your fasta or name file. Please correct. " << endl;
- remove(outputFileName.c_str());
- return 1;
- }
+ //for each bin in the list vector
+ for (int i = 0; i < processList->size(); i++) {
+ string groups;
+ int binsize;
+ nameRep = findRep(i, groups, processList, binsize);
+
+ //print out name and sequence for that bin
+ sequence = fasta->getSequence(nameRep);
+
+ if (sequence != "not found") {
+ nameRep = nameRep + "|" + toString(i+1);
+ nameRep = nameRep + "|" + toString(binsize);
+ if (groupfile != "") {
+ nameRep = nameRep + "|" + groups;
}
-
- out.close();
- return 0;
-
+ out << ">" << nameRep << endl;
+ out << sequence << endl;
+ } else {
+ mothurOut(nameRep + " is missing from your fasta or name file. Please correct. "); mothurOutEndLine();
+ remove(outputFileName.c_str());
+ return 1;
+ }
+ }
+
+ out.close();
+ return 0;
+
}
catch(exception& e) {
- cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function process. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
+ errorOut(e, "GetOTURepCommand", "process");
exit(1);
}
- catch(...) {
- cout << "An unknown error has occurred in the GetOTURepCommand class function process. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
- exit(1);
- }
}
//**********************************************************************************************************************
-
-
-
-
-