]> git.donarmstrong.com Git - mothur.git/blobdiff - getoturepcommand.cpp
added sorted parameter to get.oturep, added error checking to chimera classes in...
[mothur.git] / getoturepcommand.cpp
index acb9372ff173b211685ea555531643966f8addc1..ed26cbecb3f0b4aa67da540ba26ddfaf500602ce 100644 (file)
 
 #include "getoturepcommand.h"
 
+//********************************************************************************************************************
+//sorts lowest to highest
+inline bool compareName(repStruct left, repStruct right){
+       return (left.name < right.name);        
+}
+//********************************************************************************************************************
+//sorts lowest to highest
+inline bool compareBin(repStruct left, repStruct right){
+       return (left.bin < right.bin);  
+}
+//********************************************************************************************************************
+//sorts lowest to highest
+inline bool compareSize(repStruct left, repStruct right){
+       return (left.size < right.size);        
+}
+//********************************************************************************************************************
+//sorts lowest to highest
+inline bool compareGroup(repStruct left, repStruct right){
+       return (left.group < right.group);      
+}
 //**********************************************************************************************************************
-GetOTURepCommand::GetOTURepCommand(){
+GetOTURepCommand::GetOTURepCommand(string option){
        try{
                globaldata = GlobalData::getInstance();
-       
-               if(globaldata->gSparseMatrix != NULL)   {       matrix = new SparseMatrix(*globaldata->gSparseMatrix);          }
+               abort = false;
+               allLines = 1;
+               labels.clear();
                
-               //listOfNames bin 0 = first name read in distance matrix, listOfNames bin 1 = second name read in distance matrix
-               if(globaldata->gListVector != NULL)             {       
-                       listOfNames = new ListVector(*globaldata->gListVector); 
+               //allow user to run help
+               if (option == "help") { 
+                       help(); abort = true;
+               } else {
+                       //valid paramters for this command
+                       string Array[] =  {"fasta","list","label","name", "group", "sorted"};
+                       vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
                        
-                       vector<string> names;
-                       string binnames;
-                       //map names to rows in sparsematrix
-                       for (int i = 0; i < listOfNames->size(); i++) {
-                               names.clear();
-                               binnames = listOfNames->get(i);
-                               splitAtComma(binnames, names);
-                               
-                               for (int j = 0; j < names.size(); j++) {
-                                       nameToIndex[names[j]] = i;
-                               }
-                       }
-               }else { cout << "error, no listvector." << endl; }
-
+                       OptionParser parser(option);
+                       map<string, string> parameters = parser.getParameters();
+                       
+                       ValidParameters validParameter;
                
-               fastafile = globaldata->getFastaFile();
-               namesfile = globaldata->getNameFile();
-               groupfile = globaldata->getGroupFile();
+                       //check to make sure all parameters are valid for command
+                       for (map<string, string>::iterator it = parameters.begin(); it != parameters.end(); it++) { 
+                               if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
+                       }
+                       
+                       //make sure the user has already run the read.otu command
+                       if ((globaldata->gSparseMatrix == NULL) || (globaldata->gListVector == NULL)) {
+                               mothurOut("Before you use the get.oturep command, you first need to read in a distance matrix."); mothurOutEndLine(); 
+                               abort = true;
+                       }
+                       
+                       //check for required parameters
+                       fastafile = validParameter.validFile(parameters, "fasta", true);
+                       if (fastafile == "not found") { mothurOut("fasta is a required parameter for the get.oturep command."); mothurOutEndLine(); abort = true; }
+                       else if (fastafile == "not open") { abort = true; }     
                
-               if (groupfile != "") {
-                       //read in group map info.
-                       groupMap = new GroupMap(groupfile);
-                       groupMap->readMap();
-               }
+                       listfile = validParameter.validFile(parameters, "list", true);
+                       if (listfile == "not found") { mothurOut("list is a required parameter for the get.oturep command."); mothurOutEndLine(); abort = true; }
+                       else if (listfile == "not open") { abort = true; }      
 
-               openInputFile(fastafile, in);
-               
-               fasta = new FastaMap();
+                       //check for optional parameter and set defaults
+                       // ...at some point should added some additional type checking...
+                       label = validParameter.validFile(parameters, "label", false);                   
+                       if (label == "not found") { label = ""; }
+                       else { 
+                               if(label != "all") {  splitAtDash(label, labels);  allLines = 0;  }
+                               else { allLines = 1;  }
+                       }
+                       
+                       //if the user has not specified any labels use the ones from read.otu
+                       if (label == "") {  
+                               allLines = globaldata->allLines; 
+                               labels = globaldata->labels; 
+                       }
+                       
+                       namesfile = validParameter.validFile(parameters, "name", true);
+                       if (namesfile == "not open") { abort = true; }  
+                       else if (namesfile == "not found") { namesfile = ""; }
+
+                       groupfile = validParameter.validFile(parameters, "group", true);
+                       if (groupfile == "not open") { groupfile = ""; abort = true; }
+                       else if (groupfile == "not found") { groupfile = ""; }
+                       else {
+                               //read in group map info.
+                               groupMap = new GroupMap(groupfile);
+                               groupMap->readMap();
+                       }
+                       
+                       sorted = validParameter.validFile(parameters, "sorted", false);         if (sorted == "not found"){     sorted = "";    }
+                       if ((sorted != "") && (sorted != "name") && (sorted != "bin") && (sorted != "size") && (sorted != "group")) {
+                               mothurOut(sorted + " is not a valid option for the sorted parameter. The only options are: name, bin, size and group. I will not sort."); mothurOutEndLine();
+                               sorted = "";
+                       }
+                       
+                       if ((sorted == "group") && (groupfile == "")) {
+                               mothurOut("You must provide a groupfile to sort by group. I will not sort."); mothurOutEndLine();
+                               sorted = "";
+                       }
+                       
+                       if (abort == false) {
+                       
+                               if(globaldata->gSparseMatrix != NULL)   {
+                                       matrix = globaldata->gSparseMatrix;
+                               }
 
+                               //globaldata->gListVector bin 0 = first name read in distance matrix, globaldata->gListVector bin 1 = second name read in distance matrix
+                               if (globaldata->gListVector != NULL) {
+                                       vector<string> names;
+                                       string binnames;
+                                       //map names to rows in sparsematrix
+                                       for (int i = 0; i < globaldata->gListVector->size(); i++) {
+                                               names.clear();
+                                               binnames = globaldata->gListVector->get(i);
+       
+                                               splitAtComma(binnames, names);
+                               
+                                               for (int j = 0; j < names.size(); j++) {
+                                                       nameToIndex[names[j]] = i;
+                                               }
+                                       }
+                               } else { mothurOut("error, no listvector."); mothurOutEndLine(); }
+
+                               // Create a data structure to quickly access the distance information.
+                               // It consists of a vector of distance maps, where each map contains
+                               // all distances of a certain sequence. Vector and maps are accessed
+                               // via the index of a sequence in the distance matrix
+                               seqVec = vector<SeqMap>(globaldata->gListVector->size()); 
+                               for (MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++) {
+                                       seqVec[currentCell->row][currentCell->column] = currentCell->dist;
+                               }
+
+                               fasta = new FastaMap();
+                       }
+               }
        }
        catch(exception& e) {
-               cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function GetOTURepCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
+               errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
                exit(1);
        }
-       catch(...) {
-               cout << "An unknown error has occurred in the GetOTURepCommand class function GetOTURepCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
+}
+
+//**********************************************************************************************************************
+
+void GetOTURepCommand::help(){
+       try {
+               mothurOut("The get.oturep command can only be executed after a successful read.dist command.\n");
+               mothurOut("The get.oturep command parameters are list, fasta, name, group, sorted and label.  The fasta and list parameters are required.\n");
+               mothurOut("The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n");
+               mothurOut("The get.oturep command should be in the following format: get.oturep(fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n");
+               mothurOut("Example get.oturep(fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups, name=amazon.names).\n");
+               mothurOut("The default value for label is all labels in your inputfile.\n");
+               mothurOut("The sorted parameter allows you to indicate you want the output sorted. You can sort by sequence name, bin number, bin size or group. The default is no sorting, but your options are name, number, size, or group.\n");
+               mothurOut("The get.oturep command outputs a .fastarep file for each distance you specify, selecting one OTU representative for each bin.\n");
+               mothurOut("If you provide a groupfile, then it also appends the names of the groups present in that bin.\n");
+               mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
+       }
+       catch(exception& e) {
+               errorOut(e, "GetOTURepCommand", "help");
                exit(1);
        }
 }
+
 //**********************************************************************************************************************
 
 GetOTURepCommand::~GetOTURepCommand(){
-       delete input;
-       delete read;
-       delete fasta;
-       if (groupfile != "") {
-               delete groupMap;
+       if (abort == false) {
+               delete input;  globaldata->ginput = NULL;
+               delete read;
+               delete fasta;
+               if (groupfile != "") {
+                       delete groupMap;  globaldata->gGroupmap = NULL;
+               }
        }
 }
 
@@ -74,11 +189,15 @@ GetOTURepCommand::~GetOTURepCommand(){
 
 int GetOTURepCommand::execute(){
        try {
-               int count = 1;
+       
+               if (abort == true) { return 0; }
+
                int error;
                
                //read fastafile
-               fasta->readFastaFile(in);
+               fasta->readFastaFile(fastafile);
+               
+               in.close();
                
                //set format to list so input can get listvector
                globaldata->setFormat("list");
@@ -89,22 +208,21 @@ int GetOTURepCommand::execute(){
                }
                
                //read list file
-               read = new ReadOTUFile(globaldata->getListFile());      
+               read = new ReadOTUFile(listfile);
                read->read(&*globaldata); 
                
                input = globaldata->ginput;
                list = globaldata->gListVector;
-               ListVector* lastList = list;
+               string lastLabel = list->getLabel();
                
                //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
                set<string> processedLabels;
-               set<string> userLabels = globaldata->labels;
-
+               set<string> userLabels = labels;
                
-               while((list != NULL) && ((globaldata->allLines == 1) || (userLabels.size() != 0))) {
+               while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
                        
-                       if(globaldata->allLines == 1 || globaldata->lines.count(count) == 1 || globaldata->labels.count(list->getLabel()) == 1){
-                                       cout << list->getLabel() << '\t' << count << endl;
+                       if (allLines == 1 || labels.count(list->getLabel()) == 1){
+                                       mothurOut(list->getLabel() + "\t" + toString(list->size())); mothurOutEndLine();
                                        error = process(list);
                                        if (error == 1) { return 0; } //there is an error in hte input files, abort command
                                        
@@ -112,42 +230,49 @@ int GetOTURepCommand::execute(){
                                        userLabels.erase(list->getLabel());
                        }
                        
-                       if ((anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastList->getLabel()) != 1)) {
-                                       cout << lastList->getLabel() << '\t' << count << endl;
-                                       error = process(lastList);
+                       if ((anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
+                                       string saveLabel = list->getLabel();
+                                       
+                                       delete list;
+                                       list = input->getListVector(lastLabel);
+                                       mothurOut(list->getLabel() + "\t" + toString(list->size())); mothurOutEndLine();
+                                       error = process(list);
                                        if (error == 1) { return 0; } //there is an error in hte input files, abort command
                                        
-                                       processedLabels.insert(lastList->getLabel());
-                                       userLabels.erase(lastList->getLabel());
+                                       processedLabels.insert(list->getLabel());
+                                       userLabels.erase(list->getLabel());
+                                       
+                                       //restore real lastlabel to save below
+                                       list->setLabel(saveLabel);
                        }
                        
-                       if (count != 1) { delete lastList; }
-                       lastList = list;                        
+                       lastLabel = list->getLabel();
                        
+                       delete list;
                        list = input->getListVector();
-                       count++;
                }
                
                //output error messages about any remaining user labels
-               set<string>::iterator it;
                bool needToRun = false;
-               for (it = userLabels.begin(); it != userLabels.end(); it++) {  
-                       cout << "Your file does not include the label "<< *it
-                       if (processedLabels.count(lastList->getLabel()) != 1) {
-                               cout << ". I will use " << lastList->getLabel() << "." << endl;
+               for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {  
+                       mothurOut("Your file does not include the label " + *it)
+                       if (processedLabels.count(list->getLabel()) != 1) {
+                               mothurOut(". I will use " + lastLabel + "."); mothurOutEndLine();
                                needToRun = true;
                        }else {
-                               cout << ". Please refer to " << lastList->getLabel() << "." << endl;
+                               mothurOut(". Please refer to " + lastLabel + "."); mothurOutEndLine();
                        }
                }
                
-               //run last line if you need to
+               //run last label if you need to
                if (needToRun == true)  {
-                       cout << lastList->getLabel() << '\t' << count << endl;
-                       error = process(lastList);
+                       if (list != NULL) {     delete list;    }
+                       list = input->getListVector(lastLabel);
+                       mothurOut(list->getLabel() + "\t" + toString(list->size())); mothurOutEndLine();
+                       error = process(list);
+                       delete list;
                        if (error == 1) { return 0; } //there is an error in hte input files, abort command
                }
-               delete lastList;
                
                delete matrix;
                globaldata->gSparseMatrix = NULL;
@@ -157,14 +282,9 @@ int GetOTURepCommand::execute(){
                return 0;
        }
        catch(exception& e) {
-               cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
+               errorOut(e, "GetOTURepCommand", "execute");
                exit(1);
        }
-       catch(...) {
-               cout << "An unknown error has occurred in the GetOTURepCommand class function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
-               exit(1);
-       }
-
 }
 
 //**********************************************************************************************************************
@@ -196,157 +316,156 @@ void GetOTURepCommand::readNamesFile() {
 
        }
        catch(exception& e) {
-               cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function readNamesFile. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
+               errorOut(e, "GetOTURepCommand", "readNamesFile");
                exit(1);
        }
-       catch(...) {
-               cout << "An unknown error has occurred in the GetOTURepCommand class function readNamesFile. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
-               exit(1);
-       }       
 }
 //**********************************************************************************************************************
-string GetOTURepCommand::FindRep(int bin, string& group, ListVector* thisList) {
+string GetOTURepCommand::findRep(int bin, string& group, ListVector* thisList, int& binsize) {
        try{
                vector<string> names;
-               map<string, float> sums;
-               map<string, float>::iterator it4;
-               map<int, string> binMap; //subset of namesToIndex - just member of this bin
-               string binnames;
-               float min = 10000;
-               string minName;
                map<string, string> groups;
                map<string, string>::iterator groupIt;
-               
-               binnames = thisList->get(bin);
-       
+
                //parse names into vector
+               string binnames = thisList->get(bin);
                splitAtComma(binnames, names);
-               
+               binsize = names.size();
+
                //if you have a groupfile
-               if(groupfile != "") {
+               if (groupfile != "") {
                        //find the groups that are in this bin
-                       for (int i = 0; i < names.size(); i++) {
+                       for (size_t i = 0; i < names.size(); i++) {
                                string groupName = groupMap->getGroup(names[i]);
                                if (groupName == "not found") {  
-                                       cout << names[i] << " is missing from your group file. Please correct. " << endl;
+                                       mothurOut(names[i] + " is missing from your group file. Please correct. "); mothurOutEndLine();
                                        groupError = true;
-                               }else{
+                               } else {
                                        groups[groupName] = groupName;
                                }
                        }
                        
                        //turn the groups into a string
-                       for(groupIt = groups.begin(); groupIt != groups.end(); groupIt++) { group += groupIt->first + "-"; }
-                       
+                       for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) {
+                               group += groupIt->first + "-";
+                       }
                        //rip off last dash
                        group = group.substr(0, group.length()-1);
-               }
-               
-               //if only 1 sequence in bin then that's the rep
-               if (names.size() == 1) { return names[0]; }
-               else {
-                       //fill binMap
-                       for (int i = 0; i < names.size(); i++) {
-                               for (it3 = nameToIndex.begin(); it3 != nameToIndex.end(); it3++) {
+               }else{ group = ""; }
 
-                                       if (it3->first == names[i]) {  
-                                               binMap[it3->second] = it3->first;
+               // if only 1 sequence in bin or processing the "unique" label, then 
+               // the first sequence of the OTU is the representative one
+               if ((names.size() == 1) || (list->getLabel() == "unique")) {
+                       return names[0];
+               } else {
+                       vector<int> seqIndex(names.size());
+                       vector<float> max_dist(names.size());
 
-                                               //initialize sums map
-                                               sums[it3->first] = 0.0;
-                                               break;
-                                       }
-                               }
+                       //fill seqIndex and initialize sums
+                       for (size_t i = 0; i < names.size(); i++) {
+                               seqIndex[i] = nameToIndex[names[i]];
+                               max_dist[i] = 0.0;
                        }
                        
-                       //go through each cell in the sparsematrix
-                       for(MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++){
-                               //is this a distance between 2 members of this bin
-                               it = binMap.find(currentCell->row);
-                               it2 = binMap.find(currentCell->column);
-                               
-                               //sum the distance of the sequences in the bin to eachother
-                               if ((it != binMap.end()) && (it2 != binMap.end())) {
-                                       //this is a cell that repesents the distance between to of this bins members
-                                       sums[it->second] += currentCell->dist;
-                                       sums[it2->second] += currentCell->dist;
+                       // loop through all entries in seqIndex
+                       SeqMap::iterator it;
+                       SeqMap currMap;
+                       for (size_t i=0; i < seqIndex.size(); i++) {
+                               currMap = seqVec[seqIndex[i]];
+                               for (size_t j=0; j < seqIndex.size(); j++) {
+                                       it = currMap.find(seqIndex[j]);         
+                                       if (it != currMap.end()) {
+                                               max_dist[i] = max(max_dist[i], it->second);
+                                               max_dist[j] = max(max_dist[j], it->second);
+                                       }
                                }
                        }
                        
-                       //smallest sum is the representative
-                       for (it4 = sums.begin(); it4 != sums.end(); it4++) {
-                               if (it4->second < min) {
-                                       min = it4->second;
-                                       minName = it4->first;
+                       // sequence with the smallest maximum distance is the representative
+                       float min = 10000;
+                       int minIndex;
+                       for (size_t i=0; i < max_dist.size(); i++) {
+                               if (max_dist[i] < min) {
+                                       min = max_dist[i];
+                                       minIndex = i;
                                }
-
                        }
-                       
-                       return minName;
+
+                       return(names[minIndex]);
                }
-       
        }
        catch(exception& e) {
-               cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function FindRep. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
+               errorOut(e, "GetOTURepCommand", "FindRep");
                exit(1);
        }
-       catch(...) {
-               cout << "An unknown error has occurred in the GetOTURepCommand class function FindRep. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
-               exit(1);
-       }       
 }
 
 //**********************************************************************************************************************
 int GetOTURepCommand::process(ListVector* processList) {
        try{
-                               string nameRep, name, sequence;
+               string nameRep, name, sequence;
 
-                               //create output file
-                               string outputFileName = getRootName(globaldata->getListFile()) + processList->getLabel() + ".rep.fasta";
-                               openOutputFile(outputFileName, out);
-                               
-                               //for each bin in the list vector
-                               for (int i = 0; i < processList->size(); i++) {
-                                       string groups;
-                                       nameRep = FindRep(i, groups, processList);
-                                       
-                                       //print out name and sequence for that bin
-                                       sequence = fasta->getSequence(nameRep);
+               //create output file
+               string outputFileName = getRootName(listfile) + processList->getLabel() + ".rep.fasta";
+               openOutputFile(outputFileName, out);
+               vector<repStruct> reps;
 
-                                       if (sequence != "not found") {
-                                               if (groupfile == "") {
-                                                       nameRep = nameRep + "|" + toString(i+1);
-                                                       out << ">" << nameRep << endl;
-                                                       out << sequence << endl;
-                                               }else {
-                                                       nameRep = nameRep + "|" + groups + "|" + toString(i+1);
-                                                       out << ">" << nameRep << endl;
-                                                       out << sequence << endl;
-                                               }
-                                       }else { 
-                                               cout << nameRep << " is missing from your fasta or name file. Please correct. " << endl; 
-                                               remove(outputFileName.c_str());
-                                               return 1;
+               //for each bin in the list vector
+               for (int i = 0; i < processList->size(); i++) {
+                       string groups;
+                       int binsize;
+                       nameRep = findRep(i, groups, processList, binsize);
+
+                       //print out name and sequence for that bin
+                       sequence = fasta->getSequence(nameRep);
+
+                       if (sequence != "not found") {
+                               if (sorted == "") { //print them out
+                                       nameRep = nameRep + "|" + toString(i+1);
+                                       nameRep = nameRep + "|" + toString(binsize);
+                                       if (groupfile != "") {
+                                               nameRep = nameRep + "|" + groups;
                                        }
+                                       out << ">" << nameRep << endl;
+                                       out << sequence << endl;
+                               }else { //save them
+                                       repStruct newRep(nameRep, i+1, binsize, groups);
+                                       reps.push_back(newRep);
                                }
-                               
-                               out.close();
-                               return 0;
-       
+                       }else { 
+                               mothurOut(nameRep + " is missing from your fasta or name file. Please correct. "); mothurOutEndLine(); 
+                               remove(outputFileName.c_str());
+                               return 1;
+                       }
+               }
+               
+               if (sorted != "") { //then sort them and print them
+                       if (sorted == "name")           {  sort(reps.begin(), reps.end(), compareName);         }
+                       else if (sorted == "bin")       {  sort(reps.begin(), reps.end(), compareBin);          }
+                       else if (sorted == "size")      {  sort(reps.begin(), reps.end(), compareSize);         }
+                       else if (sorted == "group")     {  sort(reps.begin(), reps.end(), compareGroup);        }
+                       
+                       //print them
+                       for (int i = 0; i < reps.size(); i++) {
+                               string sequence = fasta->getSequence(reps[i].name);
+                               string outputName = reps[i].name + "|" + toString(reps[i].bin);
+                               outputName = outputName + "|" + toString(reps[i].size);
+                               if (groupfile != "") {
+                                       outputName = outputName + "|" + reps[i].group;
+                               }
+                               out << ">" << outputName << endl;
+                               out << sequence << endl;
+                       }
+               }
+
+               out.close();
+               return 0;
+
        }
        catch(exception& e) {
-               cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function process. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
+               errorOut(e, "GetOTURepCommand", "process");
                exit(1);
        }
-       catch(...) {
-               cout << "An unknown error has occurred in the GetOTURepCommand class function process. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
-               exit(1);
-       }       
 }
 
 //**********************************************************************************************************************
-
-
-
-
-