]> git.donarmstrong.com Git - mothur.git/blob - getoturepcommand.cpp
added hcluster command and fixed some bugs, namely one with smart distancing.
[mothur.git] / getoturepcommand.cpp
1 /*
2  *  getoturepcommand.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 4/6/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "getoturepcommand.h"
11
12 //**********************************************************************************************************************
13 GetOTURepCommand::GetOTURepCommand(string option){
14         try{
15                 globaldata = GlobalData::getInstance();
16                 abort = false;
17                 allLines = 1;
18                 labels.clear();
19                 
20                 //allow user to run help
21                 if (option == "help") { 
22                         help(); abort = true;
23                 } else {
24                         //valid paramters for this command
25                         string Array[] =  {"fasta","list","label","name", "group"};
26                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
27                         
28                         OptionParser parser(option);
29                         map<string, string> parameters = parser.getParameters();
30                         
31                         ValidParameters validParameter;
32                 
33                         //check to make sure all parameters are valid for command
34                         for (map<string, string>::iterator it = parameters.begin(); it != parameters.end(); it++) { 
35                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
36                         }
37                         
38                         //make sure the user has already run the read.otu command
39                         if ((globaldata->gSparseMatrix == NULL) || (globaldata->gListVector == NULL)) {
40                                 mothurOut("Before you use the get.oturep command, you first need to read in a distance matrix."); mothurOutEndLine(); 
41                                 abort = true;
42                         }
43                         
44                         //check for required parameters
45                         fastafile = validParameter.validFile(parameters, "fasta", true);
46                         if (fastafile == "not found") { mothurOut("fasta is a required parameter for the get.oturep command."); mothurOutEndLine(); abort = true; }
47                         else if (fastafile == "not open") { abort = true; }     
48                 
49                         listfile = validParameter.validFile(parameters, "list", true);
50                         if (listfile == "not found") { mothurOut("list is a required parameter for the get.oturep command."); mothurOutEndLine(); abort = true; }
51                         else if (listfile == "not open") { abort = true; }      
52
53                         //check for optional parameter and set defaults
54                         // ...at some point should added some additional type checking...
55                         label = validParameter.validFile(parameters, "label", false);                   
56                         if (label == "not found") { label = ""; }
57                         else { 
58                                 if(label != "all") {  splitAtDash(label, labels);  allLines = 0;  }
59                                 else { allLines = 1;  }
60                         }
61                         
62                         //if the user has not specified any labels use the ones from read.otu
63                         if (label == "") {  
64                                 allLines = globaldata->allLines; 
65                                 labels = globaldata->labels; 
66                         }
67                         
68                         namesfile = validParameter.validFile(parameters, "name", true);
69                         if (namesfile == "not open") { abort = true; }  
70                         else if (namesfile == "not found") { namesfile = ""; }
71
72                         groupfile = validParameter.validFile(parameters, "group", true);
73                         if (groupfile == "not open") { abort = true; }
74                         else if (groupfile == "not found") { groupfile = ""; }
75                         else {
76                                 //read in group map info.
77                                 groupMap = new GroupMap(groupfile);
78                                 groupMap->readMap();
79                         }
80                 
81                         if (abort == false) {
82                         
83                                 if(globaldata->gSparseMatrix != NULL)   {
84                                         matrix = globaldata->gSparseMatrix;
85                                 }
86
87                                 //globaldata->gListVector bin 0 = first name read in distance matrix, globaldata->gListVector bin 1 = second name read in distance matrix
88                                 if (globaldata->gListVector != NULL) {
89                                         vector<string> names;
90                                         string binnames;
91                                         //map names to rows in sparsematrix
92                                         for (int i = 0; i < globaldata->gListVector->size(); i++) {
93                                                 names.clear();
94                                                 binnames = globaldata->gListVector->get(i);
95         
96                                                 splitAtComma(binnames, names);
97                                 
98                                                 for (int j = 0; j < names.size(); j++) {
99                                                         nameToIndex[names[j]] = i;
100                                                 }
101                                         }
102                                 } else { mothurOut("error, no listvector."); mothurOutEndLine(); }
103
104                                 // Create a data structure to quickly access the distance information.
105                                 // It consists of a vector of distance maps, where each map contains
106                                 // all distances of a certain sequence. Vector and maps are accessed
107                                 // via the index of a sequence in the distance matrix
108                                 seqVec = vector<SeqMap>(globaldata->gListVector->size()); 
109                                 for (MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++) {
110                                         seqVec[currentCell->row][currentCell->column] = currentCell->dist;
111                                 }
112
113                                 fasta = new FastaMap();
114                         }
115                 }
116         }
117         catch(exception& e) {
118                 errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
119                 exit(1);
120         }
121 }
122
123 //**********************************************************************************************************************
124
125 void GetOTURepCommand::help(){
126         try {
127                 mothurOut("The get.oturep command can only be executed after a successful read.dist command.\n");
128                 mothurOut("The get.oturep command parameters are list, fasta, name, group and label.  The fasta and list parameters are required.\n");
129                 mothurOut("The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n");
130                 mothurOut("The get.oturep command should be in the following format: get.oturep(fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n");
131                 mothurOut("Example get.oturep(fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups, name=amazon.names).\n");
132                 mothurOut("The default value for label is all labels in your inputfile.\n");
133                 mothurOut("The get.oturep command outputs a .fastarep file for each distance you specify, selecting one OTU representative for each bin.\n");
134                 mothurOut("If you provide a groupfile, then it also appends the names of the groups present in that bin.\n");
135                 mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
136         }
137         catch(exception& e) {
138                 errorOut(e, "GetOTURepCommand", "help");
139                 exit(1);
140         }
141 }
142
143 //**********************************************************************************************************************
144
145 GetOTURepCommand::~GetOTURepCommand(){
146         if (abort == false) {
147                 delete input;  globaldata->ginput = NULL;
148                 delete read;
149                 delete fasta;
150                 if (groupfile != "") {
151                         delete groupMap;  globaldata->gGroupmap = NULL;
152                 }
153         }
154 }
155
156 //**********************************************************************************************************************
157
158 int GetOTURepCommand::execute(){
159         try {
160         
161                 if (abort == true) { return 0; }
162
163                 int error;
164                 
165                 //read fastafile
166                 fasta->readFastaFile(fastafile);
167                 
168                 in.close();
169                 
170                 //set format to list so input can get listvector
171                 globaldata->setFormat("list");
172                 
173                 //if user gave a namesfile then use it
174                 if (namesfile != "") {
175                         readNamesFile();
176                 }
177                 
178                 //read list file
179                 read = new ReadOTUFile(listfile);
180                 read->read(&*globaldata); 
181                 
182                 input = globaldata->ginput;
183                 list = globaldata->gListVector;
184                 string lastLabel = list->getLabel();
185                 
186                 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
187                 set<string> processedLabels;
188                 set<string> userLabels = labels;
189                 
190                 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
191                         
192                         if (allLines == 1 || labels.count(list->getLabel()) == 1){
193                                         mothurOut(list->getLabel() + "\t" + toString(list->size())); mothurOutEndLine();
194                                         error = process(list);
195                                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
196                                         
197                                         processedLabels.insert(list->getLabel());
198                                         userLabels.erase(list->getLabel());
199                         }
200                         
201                         if ((anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
202                                         string saveLabel = list->getLabel();
203                                         
204                                         delete list;
205                                         list = input->getListVector(lastLabel);
206                                         mothurOut(list->getLabel() + "\t" + toString(list->size())); mothurOutEndLine();
207                                         error = process(list);
208                                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
209                                         
210                                         processedLabels.insert(list->getLabel());
211                                         userLabels.erase(list->getLabel());
212                                         
213                                         //restore real lastlabel to save below
214                                         list->setLabel(saveLabel);
215                         }
216                         
217                         lastLabel = list->getLabel();
218                         
219                         delete list;
220                         list = input->getListVector();
221                 }
222                 
223                 //output error messages about any remaining user labels
224                 bool needToRun = false;
225                 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {  
226                         mothurOut("Your file does not include the label " + *it); 
227                         if (processedLabels.count(list->getLabel()) != 1) {
228                                 mothurOut(". I will use " + lastLabel + "."); mothurOutEndLine();
229                                 needToRun = true;
230                         }else {
231                                 mothurOut(". Please refer to " + lastLabel + "."); mothurOutEndLine();
232                         }
233                 }
234                 
235                 //run last label if you need to
236                 if (needToRun == true)  {
237                         if (list != NULL) {     delete list;    }
238                         list = input->getListVector(lastLabel);
239                         mothurOut(list->getLabel() + "\t" + toString(list->size())); mothurOutEndLine();
240                         error = process(list);
241                         delete list;
242                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
243                 }
244                 
245                 delete matrix;
246                 globaldata->gSparseMatrix = NULL;
247                 delete list;
248                 globaldata->gListVector = NULL;
249
250                 return 0;
251         }
252         catch(exception& e) {
253                 errorOut(e, "GetOTURepCommand", "execute");
254                 exit(1);
255         }
256 }
257
258 //**********************************************************************************************************************
259 void GetOTURepCommand::readNamesFile() {
260         try {
261                 vector<string> dupNames;
262                 openInputFile(namesfile, inNames);
263                 
264                 string name, names, sequence;
265         
266                 while(inNames){
267                         inNames >> name;                        //read from first column  A
268                         inNames >> names;               //read from second column  A,B,C,D
269                         
270                         dupNames.clear();
271                         
272                         //parse names into vector
273                         splitAtComma(names, dupNames);
274                         
275                         //store names in fasta map
276                         sequence = fasta->getSequence(name);
277                         for (int i = 0; i < dupNames.size(); i++) {
278                                 fasta->push_back(dupNames[i], sequence);
279                         }
280                 
281                         gobble(inNames);
282                 }
283                 inNames.close();
284
285         }
286         catch(exception& e) {
287                 errorOut(e, "GetOTURepCommand", "readNamesFile");
288                 exit(1);
289         }
290 }
291 //**********************************************************************************************************************
292 string GetOTURepCommand::findRep(int bin, string& group, ListVector* thisList, int& binsize) {
293         try{
294                 vector<string> names;
295                 map<string, string> groups;
296                 map<string, string>::iterator groupIt;
297
298                 //parse names into vector
299                 string binnames = thisList->get(bin);
300                 splitAtComma(binnames, names);
301                 binsize = names.size();
302
303                 //if you have a groupfile
304                 if (groupfile != "") {
305                         //find the groups that are in this bin
306                         for (size_t i = 0; i < names.size(); i++) {
307                                 string groupName = groupMap->getGroup(names[i]);
308                                 if (groupName == "not found") {  
309                                         mothurOut(names[i] + " is missing from your group file. Please correct. "); mothurOutEndLine();
310                                         groupError = true;
311                                 } else {
312                                         groups[groupName] = groupName;
313                                 }
314                         }
315                         
316                         //turn the groups into a string
317                         for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) {
318                                 group += groupIt->first + "-";
319                         }
320                         //rip off last dash
321                         group = group.substr(0, group.length()-1);
322                 }
323
324                 // if only 1 sequence in bin or processing the "unique" label, then 
325                 // the first sequence of the OTU is the representative one
326                 if ((names.size() == 1) || (list->getLabel() == "unique")) {
327                         return names[0];
328                 } else {
329                         vector<int> seqIndex(names.size());
330                         vector<float> max_dist(names.size());
331
332                         //fill seqIndex and initialize sums
333                         for (size_t i = 0; i < names.size(); i++) {
334                                 seqIndex[i] = nameToIndex[names[i]];
335                                 max_dist[i] = 0.0;
336                         }
337                         
338                         // loop through all entries in seqIndex
339                         SeqMap::iterator it;
340                         SeqMap currMap;
341                         for (size_t i=0; i < seqIndex.size(); i++) {
342                                 currMap = seqVec[seqIndex[i]];
343                                 for (size_t j=0; j < seqIndex.size(); j++) {
344                                         it = currMap.find(seqIndex[j]);         
345                                         if (it != currMap.end()) {
346                                                 max_dist[i] = max(max_dist[i], it->second);
347                                                 max_dist[j] = max(max_dist[j], it->second);
348                                         }
349                                 }
350                         }
351                         
352                         // sequence with the smallest maximum distance is the representative
353                         float min = 10000;
354                         int minIndex;
355                         for (size_t i=0; i < max_dist.size(); i++) {
356                                 if (max_dist[i] < min) {
357                                         min = max_dist[i];
358                                         minIndex = i;
359                                 }
360                         }
361
362                         return(names[minIndex]);
363                 }
364         }
365         catch(exception& e) {
366                 errorOut(e, "GetOTURepCommand", "FindRep");
367                 exit(1);
368         }
369 }
370
371 //**********************************************************************************************************************
372 int GetOTURepCommand::process(ListVector* processList) {
373         try{
374                 string nameRep, name, sequence;
375
376                 //create output file
377                 string outputFileName = getRootName(listfile) + processList->getLabel() + ".rep.fasta";
378                 openOutputFile(outputFileName, out);
379
380                 //for each bin in the list vector
381                 for (int i = 0; i < processList->size(); i++) {
382                         string groups;
383                         int binsize;
384                         nameRep = findRep(i, groups, processList, binsize);
385
386                         //print out name and sequence for that bin
387                         sequence = fasta->getSequence(nameRep);
388
389                         if (sequence != "not found") {
390                                 nameRep = nameRep + "|" + toString(i+1);
391                                 nameRep = nameRep + "|" + toString(binsize);
392                                 if (groupfile != "") {
393                                         nameRep = nameRep + "|" + groups;
394                                 }
395                                 out << ">" << nameRep << endl;
396                                 out << sequence << endl;
397                         } else { 
398                                 mothurOut(nameRep + " is missing from your fasta or name file. Please correct. "); mothurOutEndLine(); 
399                                 remove(outputFileName.c_str());
400                                 return 1;
401                         }
402                 }
403
404                 out.close();
405                 return 0;
406
407         }
408         catch(exception& e) {
409                 errorOut(e, "GetOTURepCommand", "process");
410                 exit(1);
411         }
412 }
413
414 //**********************************************************************************************************************