]> git.donarmstrong.com Git - mothur.git/commitdiff
added groups option to get.oturep command
authorwestcott <westcott>
Mon, 28 Jun 2010 18:32:11 +0000 (18:32 +0000)
committerwestcott <westcott>
Mon, 28 Jun 2010 18:32:11 +0000 (18:32 +0000)
getoturepcommand.cpp
getoturepcommand.h
groupmap.cpp
mgclustercommand.cpp
mothur.h

index a73fdf67bd24baf00d8cda5d3bb1f54ff708bc08..9dfc8bdeadc83c9784a42c10d66b26efcd52e8d4 100644 (file)
@@ -12,7 +12,7 @@
 #include "readcolumn.h"
 #include "formatphylip.h"
 #include "formatcolumn.h"
-
+#include "sharedutilities.h"
 
 
 //********************************************************************************************************************
@@ -48,7 +48,7 @@ GetOTURepCommand::GetOTURepCommand(string option)  {
                        help(); abort = true;
                } else {
                        //valid paramters for this command
-                       string Array[] =  {"fasta","list","label","name", "group", "sorted", "phylip","column","large","cutoff","precision","outputdir","inputdir"};
+                       string Array[] =  {"fasta","list","label","name", "group", "sorted", "phylip","column","large","cutoff","precision","groups","outputdir","inputdir"};
                        vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
                        
                        OptionParser parser(option);
@@ -172,6 +172,18 @@ GetOTURepCommand::GetOTURepCommand(string option)  {
                                sorted = "";
                        }
                        
+                       groups = validParameter.validFile(parameters, "groups", false);                 
+                       if (groups == "not found") { groups = ""; }
+                       else { 
+                               if (groupfile == "") {
+                                       m->mothurOut("You must provide a groupfile to use groups."); m->mothurOutEndLine();
+                                       abort = true;
+                               }else { 
+                                       splitAtDash(groups, Groups);
+                               }
+                       }
+                       globaldata->Groups = Groups;
+                       
                        string temp = validParameter.validFile(parameters, "large", false);             if (temp == "not found") {      temp = "F";     }
                        large = isTrue(temp);
                        
@@ -193,7 +205,7 @@ GetOTURepCommand::GetOTURepCommand(string option)  {
 
 void GetOTURepCommand::help(){
        try {
-               m->mothurOut("The get.oturep command parameters are phylip, column, list, fasta, name, group, large, cutoff, precision, sorted and label.  The fasta and list parameters are required, as well as phylip or column and name.\n");
+               m->mothurOut("The get.oturep command parameters are phylip, column, list, fasta, name, group, large, cutoff, precision, groups, sorted and label.  The fasta and list parameters are required, as well as phylip or column and name.\n");
                m->mothurOut("The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n");
                m->mothurOut("The phylip or column parameter is required, but only one may be used.  If you use a column file the name filename is required. \n");
                m->mothurOut("If you do not provide a cutoff value 10.00 is assumed. If you do not provide a precision value then 100 is assumed.\n");
@@ -202,6 +214,7 @@ void GetOTURepCommand::help(){
                m->mothurOut("The default value for label is all labels in your inputfile.\n");
                m->mothurOut("The sorted parameter allows you to indicate you want the output sorted. You can sort by sequence name, bin number, bin size or group. The default is no sorting, but your options are name, number, size, or group.\n");
                m->mothurOut("The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n");
+               m->mothurOut("The groups parameter allows you to indicate that you want representative sequences for each group specified for each OTU, group name should be separated by dashes. ex. groups=A-B-C.\n");
                m->mothurOut("The get.oturep command outputs a .fastarep and .rep.names file for each distance you specify, selecting one OTU representative for each bin.\n");
                m->mothurOut("If you provide a groupfile, then it also appends the names of the groups present in that bin.\n");
                m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
@@ -321,7 +334,20 @@ int GetOTURepCommand::execute(){
                        if (large) {  inRow.close(); remove(distFile.c_str());  }
                        return 0; 
                }
-                               
+               
+               if (groupfile != "") {
+                       //read in group map info.
+                       groupMap = new GroupMap(groupfile);
+                       int error = groupMap->readMap();
+                       if (error == 1) { delete groupMap; m->mothurOut("Error reading your groupfile. Proceeding without groupfile."); m->mothurOutEndLine(); groupfile = "";  }
+                       
+                       if (Groups.size() != 0) {
+                               SharedUtil* util = new SharedUtil();
+                               util->setGroups(Groups, groupMap->namesOfGroups, "getoturep");
+                               delete util;
+                       }
+               }
+                                                               
                //set format to list so input can get listvector
                globaldata->setFormat("list");
        
@@ -425,13 +451,6 @@ int GetOTURepCommand::execute(){
                delete input;  globaldata->ginput = NULL;
                delete read;
                
-               if (groupfile != "") {
-                       //read in group map info.
-                       groupMap = new GroupMap(groupfile);
-                       int error = groupMap->readMap();
-                       if (error == 1) { delete groupMap; m->mothurOut("Error reading your groupfile. Proceeding without groupfile."); m->mothurOutEndLine(); groupfile = "";  }
-               }
-
                //read fastafile
                fasta = new FastaMap();
                fasta->readFastaFile(fastafile);
@@ -499,16 +518,8 @@ void GetOTURepCommand::readNamesFile() {
        }
 }
 //**********************************************************************************************************************
-string GetOTURepCommand::findRep(int bin, ListVector* thisList) {
+string GetOTURepCommand::findRep(vector<string> names) {
        try{
-               vector<string> names;
-               map<string, string> groups;
-               map<string, string>::iterator groupIt;
-
-               //parse names into vector
-               string binnames = thisList->get(bin);
-               splitAtComma(binnames, names);
-
                // if only 1 sequence in bin or processing the "unique" label, then 
                // the first sequence of the OTU is the representative one
                if ((names.size() == 1) || (list->getLabel() == "unique")) {
@@ -569,7 +580,7 @@ string GetOTURepCommand::findRep(int bin, ListVector* thisList) {
                                        }
                                }
                        }
-
+                       
                        return(names[minIndex]);
                }
        }
@@ -582,28 +593,100 @@ string GetOTURepCommand::findRep(int bin, ListVector* thisList) {
 //**********************************************************************************************************************
 int GetOTURepCommand::process(ListVector* processList) {
        try{
-               string nameRep, name, sequence;
+               string name, sequence;
+               string nameRep;
 
                //create output file
                if (outputDir == "") { outputDir += hasPath(listfile); }
                                
                ofstream newNamesOutput;
-               string outputNamesFile = outputDir + getRootName(getSimpleName(listfile)) + processList->getLabel() + ".rep.names";
-               openOutputFile(outputNamesFile, newNamesOutput);
-               outputNames.push_back(outputNamesFile);
-               outputNameFiles[outputNamesFile] = processList->getLabel();
+               string outputNamesFile;
+               map<string, ofstream*> filehandles;
+               
+               if (Groups.size() == 0) { //you don't want to use groups
+                       outputNamesFile  = outputDir + getRootName(getSimpleName(listfile)) + processList->getLabel() + ".rep.names";
+                       openOutputFile(outputNamesFile, newNamesOutput);
+                       outputNames.push_back(outputNamesFile);
+                       outputNameFiles[outputNamesFile] = processList->getLabel();
+               }else{ //you want to use groups
+                       ofstream* temp;
+                       for (int i=0; i<Groups.size(); i++) {
+                               temp = new ofstream;
+                               filehandles[Groups[i]] = temp;
+                               outputNamesFile = outputDir + getRootName(getSimpleName(listfile)) + processList->getLabel() + "." + Groups[i] + ".rep.names";
+                               
+                               openOutputFile(outputNamesFile, *(temp));
+                               outputNames.push_back(outputNamesFile);
+                               outputNameFiles[outputNamesFile] = processList->getLabel() + "." + Groups[i];
+                       }
+               }
                
                //for each bin in the list vector
                for (int i = 0; i < processList->size(); i++) {
-                       nameRep = findRep(i, processList);
-                       
-                       if (m->control_pressed) { out.close();  newNamesOutput.close(); return 0; }
+                       if (m->control_pressed) { 
+                               out.close();  
+                               if (Groups.size() == 0) { //you don't want to use groups
+                                       newNamesOutput.close();
+                               }else{
+                                       for (int j=0; j<Groups.size(); j++) {
+                                               (*(filehandles[Groups[j]])).close();
+                                               delete filehandles[Groups[j]];
+                                       }
+                               }
+                               return 0; 
+                       }
                        
-                       //output to new names file
-                       newNamesOutput << nameRep << '\t' << processList->get(i) << endl;
+                       string temp = processList->get(i);
+                       vector<string> namesInBin;
+                       splitAtComma(temp, namesInBin);
+                       
+                       if (Groups.size() == 0) {
+                               nameRep = findRep(namesInBin);
+                               newNamesOutput << i << '\t' << nameRep << '\t' << processList->get(i) << endl;
+                       }else{
+                               map<string, vector<string> > NamesInGroup;
+                               for (int j=0; j<Groups.size(); j++) { //initialize groups
+                                       NamesInGroup[Groups[j]].resize(0);
+                               }
+                               
+                               for (int j=0; j<namesInBin.size(); j++) {
+                                       string thisgroup = groupMap->getGroup(namesInBin[j]);
+                                       
+                                       if (thisgroup == "not found") { m->mothurOut(namesInBin[j] + " is not in your groupfile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
+                                       
+                                       if (inUsersGroups(thisgroup, Groups)) { //add this name to correct group
+                                               NamesInGroup[thisgroup].push_back(namesInBin[j]);
+                                       }
+                               }
+                               
+                               //get rep for each group in otu
+                               for (int j=0; j<Groups.size(); j++) {
+                                       if (NamesInGroup[Groups[j]].size() != 0) { //are there members from this group in this otu?
+                                               //get rep for each group
+                                               nameRep = findRep(NamesInGroup[Groups[j]]);
+                                               
+                                               //output group rep and other members of this group
+                                               (*(filehandles[Groups[j]])) << i << '\t' << nameRep << '\t';
+                                               
+                                               for (int k=0; k<NamesInGroup[Groups[j]].size()-1; k++) {//output list of names in this otu from this group
+                                                       (*(filehandles[Groups[j]])) << NamesInGroup[Groups[j]][k] << ",";
+                                               }
+                                               //output last name
+                                               (*(filehandles[Groups[j]])) << NamesInGroup[Groups[j]][NamesInGroup[Groups[j]].size()-1] << endl;
+                                       }
+                               }
+                       }
                }
-
-               newNamesOutput.close();
+               
+               if (Groups.size() == 0) { //you don't want to use groups
+                       newNamesOutput.close();
+               }else{
+                       for (int i=0; i<Groups.size(); i++) {
+                               (*(filehandles[Groups[i]])).close();
+                               delete filehandles[Groups[i]];
+                       }
+               }
+               
                return 0;
 
        }
@@ -623,13 +706,18 @@ int GetOTURepCommand::processNames(string filename, string label) {
                vector<repStruct> reps;
                outputNames.push_back(outputFileName);
                
+               ofstream out2;
+               string tempNameFile = filename + ".temp";
+               openOutputFile(tempNameFile, out2);
+               
                ifstream in;
                openInputFile(filename, in);
                
                int i = 0;
                while (!in.eof()) {
                        string rep, binnames;
-                       in >> rep >> binnames; gobble(in);
+                       in >> i >> rep >> binnames; gobble(in);
+                       out2 << rep << '\t' << binnames << endl;
                        
                        vector<string> names;
                        splitAtComma(binnames, names);
@@ -680,7 +768,6 @@ int GetOTURepCommand::processNames(string filename, string label) {
                        }else { 
                                m->mothurOut(rep + " is missing from your fasta or name file, ignoring. Please correct."); m->mothurOutEndLine(); 
                        }
-                       i++;
                }
                
                        
@@ -704,6 +791,9 @@ int GetOTURepCommand::processNames(string filename, string label) {
                }
                        
                out.close();
+               out2.close();
+               
+               rename(tempNameFile.c_str(), filename.c_str());
                
                return 0;
 
index 86c5434034aae9c7900428bb8ff323385f89b837..8099a3998c1a63de92d9c72b2c5419c08e2f96ff 100644 (file)
@@ -53,13 +53,13 @@ private:
        ReadMatrix* readMatrix;
        FormatMatrix* formatMatrix;
        NameAssignment* nameMap;
-       string filename, fastafile, listfile, namefile, groupfile, label, sorted, phylipfile, columnfile, distFile, format, outputDir;
+       string filename, fastafile, listfile, namefile, groupfile, label, sorted, phylipfile, columnfile, distFile, format, outputDir, groups;
        ofstream out;
        ifstream in, inNames, inRow;
        bool abort, allLines, groupError, large;
        set<string> labels; //holds labels to be used
        map<string, int> nameToIndex;  //maps sequence name to index in sparsematrix
-       vector<string> outputNames;
+       vector<string> outputNames, Groups;
        map<string, string> outputNameFiles;
        float cutoff;
        int precision;
@@ -70,7 +70,7 @@ private:
        void readNamesFile();
        int process(ListVector*);
        SeqMap getMap(int);
-       string findRep(int, ListVector*);       // returns the name of the "representative" sequence of given bin
+       string findRep(vector<string>);         // returns the name of the "representative" sequence of given bin or subset of a bin, for groups
        int processNames(string, string);
                                                                                                
 
index bc871c8e68e27ae1a90ccea96b3121aaea5611e3..d57646e3e3dad937d3ca0fa81b26f402cfa1ad4d 100644 (file)
@@ -27,7 +27,7 @@ int GroupMap::readMap() {
                int error = 0;
 
                while(fileHandle){
-                       fileHandle >> seqName;                  //read from first column
+                       fileHandle >> seqName;  gobble(fileHandle);             //read from first column
                        fileHandle >> seqGroup;                 //read from second column
                        
                        if (m->control_pressed) {  fileHandle.close();  return 1; }
index 182e78dc31502fb6c9a75e1703c8f17ffe4ba0fb..b92927b1d3d51c0505fbcadd14231eade34b78e5 100644 (file)
@@ -166,13 +166,6 @@ int MGClusterCommand::execute(){
                
                list = new ListVector(nameMap->getListVector());
                RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
-for (int i = 0; i < list->getNumBins(); i++) {
-string bin = list->get(i);
-if(bin == "") {
-cout << "bin " << i << " is blank."<< endl;
-}
-}
-cout << "after outputting blank bins." << endl;
                
                if (m->control_pressed) { delete nameMap; delete read; delete list; delete rabund; return 0; }
                
@@ -215,7 +208,7 @@ cout << "after outputting blank bins." << endl;
                                listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
                                return 0; 
                        }
-       int count = 0;          
+               
                        //cluster using cluster classes
                        while (distMatrix->getSmallDist() < cutoff && distMatrix->getNNodes() > 0){
                                
@@ -234,16 +227,13 @@ cout << "after outputting blank bins." << endl;
                                }else{
                                        rndDist = roundDist(dist, precision); 
                                }
-cout << "here " << count << '\t' << dist << endl;
                                
                                if(previousDist <= 0.0000 && dist != previousDist){
                                        oldList.setLabel("unique");
                                        printData(&oldList);
-                                       Seq2Bin = cluster->getSeqtoBin();
                                }
                                else if(rndDist != rndPreviousDist){
                                        if (merge) {
-                                               Seq2Bin = cluster->getSeqtoBin();
                                                ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
                                                
                                                if (m->control_pressed) { 
@@ -260,22 +250,20 @@ cout << "here " << count << '\t' << dist << endl;
                                                printData(&oldList);
                                        }
                                }
-       //cout << "after merge " << count << '\t' << dist << endl;      
-       count++;                
+       
                                previousDist = dist;
                                rndPreviousDist = rndDist;
                                oldList = *list;
+                               Seq2Bin = cluster->getSeqtoBin();
                                oldSeq2Bin = Seq2Bin;
                        }
                        
                        if(previousDist <= 0.0000){
                                oldList.setLabel("unique");
                                printData(&oldList);
-                               Seq2Bin = cluster->getSeqtoBin();
                        }
                        else if(rndPreviousDist<cutoff){
                                if (merge) {
-                                       Seq2Bin = cluster->getSeqtoBin();
                                        ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
                                        
                                        if (m->control_pressed) { 
@@ -365,11 +353,9 @@ cout << "here " << count << '\t' << dist << endl;
                                                if((previousDist <= 0.0000) && (seqs[i].dist != previousDist)){
                                                        oldList.setLabel("unique");
                                                        printData(&oldList);
-                                                       Seq2Bin = hcluster->getSeqtoBin();
                                                }
                                                else if((rndDist != rndPreviousDist)){
                                                        if (merge) {
-                                                               Seq2Bin = hcluster->getSeqtoBin();
                                                                ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
                                                                
                                                                if (m->control_pressed) { 
@@ -392,6 +378,7 @@ cout << "here " << count << '\t' << dist << endl;
                                                previousDist = seqs[i].dist;
                                                rndPreviousDist = rndDist;
                                                oldList = *list;
+                                               Seq2Bin = cluster->getSeqtoBin();
                                                oldSeq2Bin = Seq2Bin;
                                        }
                                }
@@ -404,7 +391,6 @@ cout << "here " << count << '\t' << dist << endl;
                        }
                        else if(rndPreviousDist<cutoff){
                                if (merge) {
-                                       Seq2Bin = hcluster->getSeqtoBin();
                                        ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
                                        
                                        if (m->control_pressed) { 
@@ -485,16 +471,6 @@ ListVector* MGClusterCommand::mergeOPFs(map<string, int> binInfo, float dist){
        try {
                //create new listvector so you don't overwrite the clustering
                ListVector* newList = new ListVector(oldList);
-for (int i = 0; i < newList->getNumBins(); i++) {
-string bin = newList->get(i);
-if(bin == "") {
-cout << "bin " << i << " is blank."<< endl;
-for (map<string, int>::iterator itBin = binInfo.begin(); itBin != binInfo.end(); itBin++) {
-       if (itBin->second == i) { cout << itBin->first << " maps to an empty bin." << endl; }
-}
-}
-}
-cout << "after outputting blank bins." << endl;                
 
                bool done = false;
                ifstream inOverlap;
@@ -546,17 +522,16 @@ cout << "after outputting blank bins." << endl;
                                
                                if(itBin1 == binInfo.end()){  cerr << "AAError: Sequence '" << name1 << "' does not have any bin info.\n"; exit(1);  }
                                if(itBin2 == binInfo.end()){  cerr << "ABError: Sequence '" << name2 << "' does not have any bin info.\n"; exit(1);  }
-cout << overlapNode.dist << '\t' << dist << endl;
+
                                int binKeep = itBin1->second;
                                int binRemove = itBin2->second;
                        
                                //if not merge bins and update binInfo
                                if(binKeep != binRemove) {
-       cout << "bin keep = " << binKeep << " bin remove = " << binRemove << endl;              
+               
                                        //save names in old bin
                                        string names = newList->get(binRemove);
-                       cout << names << endl << endl << endl;  
-                       cout << newList->get(binKeep) << endl << endl << endl;  
+               
                                        //merge bins into name1s bin
                                        newList->set(binKeep, newList->get(binRemove)+','+newList->get(binKeep));
                                        newList->set(binRemove, "");    
index c3f61b4268b613da53a567f0fa6239983a19311e..2404a84f08672c2178bd3e0a33946124348a72a7 100644 (file)
--- a/mothur.h
+++ b/mothur.h
@@ -828,7 +828,13 @@ inline bool anyLabelsToProcess(string label, set<string>& userLabels, string err
                
                //unique is the smallest line
                if (label == "unique") {  return false;  }
-               else { convert(label, labelFloat); }
+               else { 
+                       if (convertTestFloat(label, labelFloat)) {
+                               convert(label, labelFloat); 
+                       }else { //cant convert 
+                               return false;
+                       }
+               }
                
                //go through users set and make them floats
                for(it = userLabels.begin(); it != userLabels.end(); ++it) {