]> git.donarmstrong.com Git - mothur.git/blobdiff - getoturepcommand.cpp
removed line option
[mothur.git] / getoturepcommand.cpp
index acb9372ff173b211685ea555531643966f8addc1..07486191e886c72d5d2aafd1db781bad96411874 100644 (file)
 #include "getoturepcommand.h"
 
 //**********************************************************************************************************************
-GetOTURepCommand::GetOTURepCommand(){
+GetOTURepCommand::GetOTURepCommand(string option){
        try{
                globaldata = GlobalData::getInstance();
-       
-               if(globaldata->gSparseMatrix != NULL)   {       matrix = new SparseMatrix(*globaldata->gSparseMatrix);          }
+               abort = false;
+               allLines = 1;
+               labels.clear();
                
-               //listOfNames bin 0 = first name read in distance matrix, listOfNames bin 1 = second name read in distance matrix
-               if(globaldata->gListVector != NULL)             {       
-                       listOfNames = new ListVector(*globaldata->gListVector); 
+               //allow user to run help
+               if (option == "help") { 
+                       help(); abort = true;
+               } else {
+                       //valid paramters for this command
+                       string Array[] =  {"fasta","list","label","name", "group"};
+                       vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
                        
-                       vector<string> names;
-                       string binnames;
-                       //map names to rows in sparsematrix
-                       for (int i = 0; i < listOfNames->size(); i++) {
-                               names.clear();
-                               binnames = listOfNames->get(i);
-                               splitAtComma(binnames, names);
-                               
-                               for (int j = 0; j < names.size(); j++) {
-                                       nameToIndex[names[j]] = i;
-                               }
-                       }
-               }else { cout << "error, no listvector." << endl; }
-
+                       OptionParser parser(option);
+                       map<string, string> parameters = parser.getParameters();
+                       
+                       ValidParameters validParameter;
                
-               fastafile = globaldata->getFastaFile();
-               namesfile = globaldata->getNameFile();
-               groupfile = globaldata->getGroupFile();
+                       //check to make sure all parameters are valid for command
+                       for (map<string, string>::iterator it = parameters.begin(); it != parameters.end(); it++) { 
+                               if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
+                       }
+                       
+                       //make sure the user has already run the read.otu command
+                       if ((globaldata->gSparseMatrix == NULL) || (globaldata->gListVector == NULL)) {
+                               mothurOut("Before you use the get.oturep command, you first need to read in a distance matrix."); mothurOutEndLine(); 
+                               abort = true;
+                       }
+                       
+                       //check for required parameters
+                       fastafile = validParameter.validFile(parameters, "fasta", true);
+                       if (fastafile == "not found") { mothurOut("fasta is a required parameter for the get.oturep command."); mothurOutEndLine(); abort = true; }
+                       else if (fastafile == "not open") { abort = true; }     
                
-               if (groupfile != "") {
-                       //read in group map info.
-                       groupMap = new GroupMap(groupfile);
-                       groupMap->readMap();
-               }
+                       listfile = validParameter.validFile(parameters, "list", true);
+                       if (listfile == "not found") { mothurOut("list is a required parameter for the get.oturep command."); mothurOutEndLine(); abort = true; }
+                       else if (listfile == "not open") { abort = true; }      
+
+                       //check for optional parameter and set defaults
+                       // ...at some point should added some additional type checking...
+                       label = validParameter.validFile(parameters, "label", false);                   
+                       if (label == "not found") { label = ""; }
+                       else { 
+                               if(label != "all") {  splitAtDash(label, labels);  allLines = 0;  }
+                               else { allLines = 1;  }
+                       }
+                       
+                       //if the user has not specified any labels use the ones from read.otu
+                       if (label == "") {  
+                               allLines = globaldata->allLines; 
+                               labels = globaldata->labels; 
+                       }
+                       
+                       namesfile = validParameter.validFile(parameters, "name", true);
+                       if (namesfile == "not open") { abort = true; }  
+                       else if (namesfile == "not found") { namesfile = ""; }
 
-               openInputFile(fastafile, in);
+                       groupfile = validParameter.validFile(parameters, "group", true);
+                       if (groupfile == "not open") { abort = true; }
+                       else if (groupfile == "not found") { groupfile = ""; }
+                       else {
+                               //read in group map info.
+                               groupMap = new GroupMap(groupfile);
+                               groupMap->readMap();
+                       }
                
-               fasta = new FastaMap();
+                       if (abort == false) {
+                       
+                               if(globaldata->gSparseMatrix != NULL)   {
+                                       matrix = globaldata->gSparseMatrix;
+                               }
+
+                               //globaldata->gListVector bin 0 = first name read in distance matrix, globaldata->gListVector bin 1 = second name read in distance matrix
+                               if (globaldata->gListVector != NULL) {
+                                       vector<string> names;
+                                       string binnames;
+                                       //map names to rows in sparsematrix
+                                       for (int i = 0; i < globaldata->gListVector->size(); i++) {
+                                               names.clear();
+                                               binnames = globaldata->gListVector->get(i);
+       
+                                               splitAtComma(binnames, names);
+                               
+                                               for (int j = 0; j < names.size(); j++) {
+                                                       nameToIndex[names[j]] = i;
+                                               }
+                                       }
+                               } else { mothurOut("error, no listvector."); mothurOutEndLine(); }
 
+                               // Create a data structure to quickly access the distance information.
+                               // It consists of a vector of distance maps, where each map contains
+                               // all distances of a certain sequence. Vector and maps are accessed
+                               // via the index of a sequence in the distance matrix
+                               seqVec = vector<SeqMap>(globaldata->gListVector->size()); 
+                               for (MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++) {
+                                       seqVec[currentCell->row][currentCell->column] = currentCell->dist;
+                               }
+
+                               fasta = new FastaMap();
+                       }
+               }
        }
        catch(exception& e) {
-               cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function GetOTURepCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
+               errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
                exit(1);
        }
-       catch(...) {
-               cout << "An unknown error has occurred in the GetOTURepCommand class function GetOTURepCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
+}
+
+//**********************************************************************************************************************
+
+void GetOTURepCommand::help(){
+       try {
+               mothurOut("The get.oturep command can only be executed after a successful read.dist command.\n");
+               mothurOut("The get.oturep command parameters are list, fasta, name, group and label.  The fasta and list parameters are required.\n");
+               mothurOut("The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n");
+               mothurOut("The get.oturep command should be in the following format: get.oturep(fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n");
+               mothurOut("Example get.oturep(fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups, name=amazon.names).\n");
+               mothurOut("The default value for label is all labels in your inputfile.\n");
+               mothurOut("The get.oturep command outputs a .fastarep file for each distance you specify, selecting one OTU representative for each bin.\n");
+               mothurOut("If you provide a groupfile, then it also appends the names of the groups present in that bin.\n");
+               mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
+       }
+       catch(exception& e) {
+               errorOut(e, "GetOTURepCommand", "help");
                exit(1);
        }
 }
+
 //**********************************************************************************************************************
 
 GetOTURepCommand::~GetOTURepCommand(){
-       delete input;
-       delete read;
-       delete fasta;
-       if (groupfile != "") {
-               delete groupMap;
+       if (abort == false) {
+               delete input;  globaldata->ginput = NULL;
+               delete read;
+               delete fasta;
+               if (groupfile != "") {
+                       delete groupMap;  globaldata->gGroupmap = NULL;
+               }
        }
 }
 
@@ -74,11 +157,15 @@ GetOTURepCommand::~GetOTURepCommand(){
 
 int GetOTURepCommand::execute(){
        try {
-               int count = 1;
+       
+               if (abort == true) { return 0; }
+
                int error;
                
                //read fastafile
-               fasta->readFastaFile(in);
+               fasta->readFastaFile(fastafile);
+               
+               in.close();
                
                //set format to list so input can get listvector
                globaldata->setFormat("list");
@@ -89,22 +176,21 @@ int GetOTURepCommand::execute(){
                }
                
                //read list file
-               read = new ReadOTUFile(globaldata->getListFile());      
+               read = new ReadOTUFile(listfile);
                read->read(&*globaldata); 
                
                input = globaldata->ginput;
                list = globaldata->gListVector;
-               ListVector* lastList = list;
+               string lastLabel = list->getLabel();
                
                //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
                set<string> processedLabels;
-               set<string> userLabels = globaldata->labels;
-
+               set<string> userLabels = labels;
                
-               while((list != NULL) && ((globaldata->allLines == 1) || (userLabels.size() != 0))) {
+               while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
                        
-                       if(globaldata->allLines == 1 || globaldata->lines.count(count) == 1 || globaldata->labels.count(list->getLabel()) == 1){
-                                       cout << list->getLabel() << '\t' << count << endl;
+                       if (allLines == 1 || labels.count(list->getLabel()) == 1){
+                                       mothurOut(list->getLabel() + "\t" + toString(list->size())); mothurOutEndLine();
                                        error = process(list);
                                        if (error == 1) { return 0; } //there is an error in hte input files, abort command
                                        
@@ -112,42 +198,44 @@ int GetOTURepCommand::execute(){
                                        userLabels.erase(list->getLabel());
                        }
                        
-                       if ((anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastList->getLabel()) != 1)) {
-                                       cout << lastList->getLabel() << '\t' << count << endl;
-                                       error = process(lastList);
+                       if ((anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
+                                       delete list;
+                                       list = input->getListVector(lastLabel);
+                                       mothurOut(list->getLabel() + "\t" + toString(list->size())); mothurOutEndLine();
+                                       error = process(list);
                                        if (error == 1) { return 0; } //there is an error in hte input files, abort command
                                        
-                                       processedLabels.insert(lastList->getLabel());
-                                       userLabels.erase(lastList->getLabel());
+                                       processedLabels.insert(list->getLabel());
+                                       userLabels.erase(list->getLabel());
                        }
                        
-                       if (count != 1) { delete lastList; }
-                       lastList = list;                        
+                       lastLabel = list->getLabel();
                        
+                       delete list;
                        list = input->getListVector();
-                       count++;
                }
                
                //output error messages about any remaining user labels
-               set<string>::iterator it;
                bool needToRun = false;
-               for (it = userLabels.begin(); it != userLabels.end(); it++) {  
-                       cout << "Your file does not include the label "<< *it
-                       if (processedLabels.count(lastList->getLabel()) != 1) {
-                               cout << ". I will use " << lastList->getLabel() << "." << endl;
+               for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {  
+                       mothurOut("Your file does not include the label " + *it)
+                       if (processedLabels.count(list->getLabel()) != 1) {
+                               mothurOut(". I will use " + lastLabel + "."); mothurOutEndLine();
                                needToRun = true;
                        }else {
-                               cout << ". Please refer to " << lastList->getLabel() << "." << endl;
+                               mothurOut(". Please refer to " + lastLabel + "."); mothurOutEndLine();
                        }
                }
                
-               //run last line if you need to
+               //run last label if you need to
                if (needToRun == true)  {
-                       cout << lastList->getLabel() << '\t' << count << endl;
-                       error = process(lastList);
+                       if (list != NULL) {     delete list;    }
+                       list = input->getListVector(lastLabel);
+                       mothurOut(list->getLabel() + "\t" + toString(list->size())); mothurOutEndLine();
+                       error = process(list);
+                       delete list;
                        if (error == 1) { return 0; } //there is an error in hte input files, abort command
                }
-               delete lastList;
                
                delete matrix;
                globaldata->gSparseMatrix = NULL;
@@ -157,14 +245,9 @@ int GetOTURepCommand::execute(){
                return 0;
        }
        catch(exception& e) {
-               cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
-               exit(1);
-       }
-       catch(...) {
-               cout << "An unknown error has occurred in the GetOTURepCommand class function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
+               errorOut(e, "GetOTURepCommand", "execute");
                exit(1);
        }
-
 }
 
 //**********************************************************************************************************************
@@ -196,157 +279,131 @@ void GetOTURepCommand::readNamesFile() {
 
        }
        catch(exception& e) {
-               cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function readNamesFile. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
+               errorOut(e, "GetOTURepCommand", "readNamesFile");
                exit(1);
        }
-       catch(...) {
-               cout << "An unknown error has occurred in the GetOTURepCommand class function readNamesFile. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
-               exit(1);
-       }       
 }
 //**********************************************************************************************************************
-string GetOTURepCommand::FindRep(int bin, string& group, ListVector* thisList) {
+string GetOTURepCommand::findRep(int bin, string& group, ListVector* thisList, int& binsize) {
        try{
                vector<string> names;
-               map<string, float> sums;
-               map<string, float>::iterator it4;
-               map<int, string> binMap; //subset of namesToIndex - just member of this bin
-               string binnames;
-               float min = 10000;
-               string minName;
                map<string, string> groups;
                map<string, string>::iterator groupIt;
-               
-               binnames = thisList->get(bin);
-       
+
                //parse names into vector
+               string binnames = thisList->get(bin);
                splitAtComma(binnames, names);
-               
+               binsize = names.size();
+
                //if you have a groupfile
-               if(groupfile != "") {
+               if (groupfile != "") {
                        //find the groups that are in this bin
-                       for (int i = 0; i < names.size(); i++) {
+                       for (size_t i = 0; i < names.size(); i++) {
                                string groupName = groupMap->getGroup(names[i]);
                                if (groupName == "not found") {  
-                                       cout << names[i] << " is missing from your group file. Please correct. " << endl;
+                                       mothurOut(names[i] + " is missing from your group file. Please correct. "); mothurOutEndLine();
                                        groupError = true;
-                               }else{
+                               } else {
                                        groups[groupName] = groupName;
                                }
                        }
                        
                        //turn the groups into a string
-                       for(groupIt = groups.begin(); groupIt != groups.end(); groupIt++) { group += groupIt->first + "-"; }
-                       
+                       for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) {
+                               group += groupIt->first + "-";
+                       }
                        //rip off last dash
                        group = group.substr(0, group.length()-1);
                }
-               
-               //if only 1 sequence in bin then that's the rep
-               if (names.size() == 1) { return names[0]; }
-               else {
-                       //fill binMap
-                       for (int i = 0; i < names.size(); i++) {
-                               for (it3 = nameToIndex.begin(); it3 != nameToIndex.end(); it3++) {
 
-                                       if (it3->first == names[i]) {  
-                                               binMap[it3->second] = it3->first;
+               // if only 1 sequence in bin or processing the "unique" label, then 
+               // the first sequence of the OTU is the representative one
+               if ((names.size() == 1) || (list->getLabel() == "unique")) {
+                       return names[0];
+               } else {
+                       vector<int> seqIndex(names.size());
+                       vector<float> max_dist(names.size());
 
-                                               //initialize sums map
-                                               sums[it3->first] = 0.0;
-                                               break;
-                                       }
-                               }
+                       //fill seqIndex and initialize sums
+                       for (size_t i = 0; i < names.size(); i++) {
+                               seqIndex[i] = nameToIndex[names[i]];
+                               max_dist[i] = 0.0;
                        }
                        
-                       //go through each cell in the sparsematrix
-                       for(MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++){
-                               //is this a distance between 2 members of this bin
-                               it = binMap.find(currentCell->row);
-                               it2 = binMap.find(currentCell->column);
-                               
-                               //sum the distance of the sequences in the bin to eachother
-                               if ((it != binMap.end()) && (it2 != binMap.end())) {
-                                       //this is a cell that repesents the distance between to of this bins members
-                                       sums[it->second] += currentCell->dist;
-                                       sums[it2->second] += currentCell->dist;
+                       // loop through all entries in seqIndex
+                       SeqMap::iterator it;
+                       SeqMap currMap;
+                       for (size_t i=0; i < seqIndex.size(); i++) {
+                               currMap = seqVec[seqIndex[i]];
+                               for (size_t j=0; j < seqIndex.size(); j++) {
+                                       it = currMap.find(seqIndex[j]);         
+                                       if (it != currMap.end()) {
+                                               max_dist[i] = max(max_dist[i], it->second);
+                                               max_dist[j] = max(max_dist[j], it->second);
+                                       }
                                }
                        }
                        
-                       //smallest sum is the representative
-                       for (it4 = sums.begin(); it4 != sums.end(); it4++) {
-                               if (it4->second < min) {
-                                       min = it4->second;
-                                       minName = it4->first;
+                       // sequence with the smallest maximum distance is the representative
+                       float min = 10000;
+                       int minIndex;
+                       for (size_t i=0; i < max_dist.size(); i++) {
+                               if (max_dist[i] < min) {
+                                       min = max_dist[i];
+                                       minIndex = i;
                                }
-
                        }
-                       
-                       return minName;
+
+                       return(names[minIndex]);
                }
-       
        }
        catch(exception& e) {
-               cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function FindRep. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
+               errorOut(e, "GetOTURepCommand", "FindRep");
                exit(1);
        }
-       catch(...) {
-               cout << "An unknown error has occurred in the GetOTURepCommand class function FindRep. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
-               exit(1);
-       }       
 }
 
 //**********************************************************************************************************************
 int GetOTURepCommand::process(ListVector* processList) {
        try{
-                               string nameRep, name, sequence;
+               string nameRep, name, sequence;
 
-                               //create output file
-                               string outputFileName = getRootName(globaldata->getListFile()) + processList->getLabel() + ".rep.fasta";
-                               openOutputFile(outputFileName, out);
-                               
-                               //for each bin in the list vector
-                               for (int i = 0; i < processList->size(); i++) {
-                                       string groups;
-                                       nameRep = FindRep(i, groups, processList);
-                                       
-                                       //print out name and sequence for that bin
-                                       sequence = fasta->getSequence(nameRep);
+               //create output file
+               string outputFileName = getRootName(listfile) + processList->getLabel() + ".rep.fasta";
+               openOutputFile(outputFileName, out);
 
-                                       if (sequence != "not found") {
-                                               if (groupfile == "") {
-                                                       nameRep = nameRep + "|" + toString(i+1);
-                                                       out << ">" << nameRep << endl;
-                                                       out << sequence << endl;
-                                               }else {
-                                                       nameRep = nameRep + "|" + groups + "|" + toString(i+1);
-                                                       out << ">" << nameRep << endl;
-                                                       out << sequence << endl;
-                                               }
-                                       }else { 
-                                               cout << nameRep << " is missing from your fasta or name file. Please correct. " << endl; 
-                                               remove(outputFileName.c_str());
-                                               return 1;
-                                       }
+               //for each bin in the list vector
+               for (int i = 0; i < processList->size(); i++) {
+                       string groups;
+                       int binsize;
+                       nameRep = findRep(i, groups, processList, binsize);
+
+                       //print out name and sequence for that bin
+                       sequence = fasta->getSequence(nameRep);
+
+                       if (sequence != "not found") {
+                               nameRep = nameRep + "|" + toString(i+1);
+                               nameRep = nameRep + "|" + toString(binsize);
+                               if (groupfile != "") {
+                                       nameRep = nameRep + "|" + groups;
                                }
-                               
-                               out.close();
-                               return 0;
-       
+                               out << ">" << nameRep << endl;
+                               out << sequence << endl;
+                       } else { 
+                               mothurOut(nameRep + " is missing from your fasta or name file. Please correct. "); mothurOutEndLine(); 
+                               remove(outputFileName.c_str());
+                               return 1;
+                       }
+               }
+
+               out.close();
+               return 0;
+
        }
        catch(exception& e) {
-               cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function process. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
+               errorOut(e, "GetOTURepCommand", "process");
                exit(1);
        }
-       catch(...) {
-               cout << "An unknown error has occurred in the GetOTURepCommand class function process. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
-               exit(1);
-       }       
 }
 
 //**********************************************************************************************************************
-
-
-
-
-