]> git.donarmstrong.com Git - mothur.git/blob - getoturepcommand.cpp
added logfile feature
[mothur.git] / getoturepcommand.cpp
1 /*
2  *  getoturepcommand.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 4/6/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "getoturepcommand.h"
11
12 //**********************************************************************************************************************
13 GetOTURepCommand::GetOTURepCommand(string option){
14         try{
15                 globaldata = GlobalData::getInstance();
16                 abort = false;
17                 allLines = 1;
18                 lines.clear();
19                 labels.clear();
20                 
21                 //allow user to run help
22                 if (option == "help") { 
23                         help(); abort = true;
24                 } else {
25                         //valid paramters for this command
26                         string Array[] =  {"fasta","list","line","label","name", "group"};
27                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
28                         
29                         OptionParser parser(option);
30                         map<string, string> parameters = parser.getParameters();
31                         
32                         ValidParameters validParameter;
33                 
34                         //check to make sure all parameters are valid for command
35                         for (map<string, string>::iterator it = parameters.begin(); it != parameters.end(); it++) { 
36                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
37                         }
38                         
39                         //make sure the user has already run the read.otu command
40                         if ((globaldata->gSparseMatrix == NULL) || (globaldata->gListVector == NULL)) {
41                                 mothurOut("Before you use the get.oturep command, you first need to read in a distance matrix."); mothurOutEndLine(); 
42                                 abort = true;
43                         }
44                         
45                         //check for required parameters
46                         fastafile = validParameter.validFile(parameters, "fasta", true);
47                         if (fastafile == "not found") { mothurOut("fasta is a required parameter for the get.oturep command."); mothurOutEndLine(); abort = true; }
48                         else if (fastafile == "not open") { abort = true; }     
49                 
50                         listfile = validParameter.validFile(parameters, "list", true);
51                         if (listfile == "not found") { mothurOut("list is a required parameter for the get.oturep command."); mothurOutEndLine(); abort = true; }
52                         else if (listfile == "not open") { abort = true; }      
53
54                         //check for optional parameter and set defaults
55                         // ...at some point should added some additional type checking...
56                         line = validParameter.validFile(parameters, "line", false);
57                         if (line == "not found") { line = "";  }
58                         else { 
59                                 if(line != "all") {  splitAtDash(line, lines);  allLines = 0;  }
60                                 else { allLines = 1;  }
61                         }
62                         
63                         label = validParameter.validFile(parameters, "label", false);                   
64                         if (label == "not found") { label = ""; }
65                         else { 
66                                 if(label != "all") {  splitAtDash(label, labels);  allLines = 0;  }
67                                 else { allLines = 1;  }
68                         }
69                         
70                         //make sure user did not use both the line and label parameters
71                         if ((line != "") && (label != "")) { mothurOut("You cannot use both the line and label parameters at the same time. "); mothurOutEndLine(); abort = true; }
72                         //if the user has not specified any line or labels use the ones from read.otu
73                         else if ((line == "") && (label == "")) {  
74                                 allLines = globaldata->allLines; 
75                                 labels = globaldata->labels; 
76                                 lines = globaldata->lines;
77                         }
78                         
79                         namesfile = validParameter.validFile(parameters, "name", true);
80                         if (namesfile == "not open") { abort = true; }  
81                         else if (namesfile == "not found") { namesfile = ""; }
82
83                         groupfile = validParameter.validFile(parameters, "group", true);
84                         if (groupfile == "not open") { abort = true; }
85                         else if (groupfile == "not found") { groupfile = ""; }
86                         else {
87                                 //read in group map info.
88                                 groupMap = new GroupMap(groupfile);
89                                 groupMap->readMap();
90                         }
91                 
92                         if (abort == false) {
93                         
94                                 if(globaldata->gSparseMatrix != NULL)   {
95                                         matrix = globaldata->gSparseMatrix;
96                                 }
97
98                                 //globaldata->gListVector bin 0 = first name read in distance matrix, globaldata->gListVector bin 1 = second name read in distance matrix
99                                 if (globaldata->gListVector != NULL) {
100                                         vector<string> names;
101                                         string binnames;
102                                         //map names to rows in sparsematrix
103                                         for (int i = 0; i < globaldata->gListVector->size(); i++) {
104                                                 names.clear();
105                                                 binnames = globaldata->gListVector->get(i);
106         
107                                                 splitAtComma(binnames, names);
108                                 
109                                                 for (int j = 0; j < names.size(); j++) {
110                                                         nameToIndex[names[j]] = i;
111                                                 }
112                                         }
113                                 } else { mothurOut("error, no listvector."); mothurOutEndLine(); }
114
115                                 // Create a data structure to quickly access the distance information.
116                                 // It consists of a vector of distance maps, where each map contains
117                                 // all distances of a certain sequence. Vector and maps are accessed
118                                 // via the index of a sequence in the distance matrix
119                                 seqVec = vector<SeqMap>(globaldata->gListVector->size()); 
120                                 for (MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++) {
121                                         seqVec[currentCell->row][currentCell->column] = currentCell->dist;
122                                 }
123
124                                 fasta = new FastaMap();
125                         }
126                 }
127         }
128         catch(exception& e) {
129                 errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
130                 exit(1);
131         }
132 }
133
134 //**********************************************************************************************************************
135
136 void GetOTURepCommand::help(){
137         try {
138                 mothurOut("The get.oturep command can only be executed after a successful read.dist command.\n");
139                 mothurOut("The get.oturep command parameters are list, fasta, name, group, line and label.  The fasta and list parameters are required, and you may not use line and label at the same time.\n");
140                 mothurOut("The line and label allow you to select what distance levels you would like a output files created for, and are separated by dashes.\n");
141                 mothurOut("The get.oturep command should be in the following format: get.oturep(fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, line=yourLines, label=yourLabels).\n");
142                 mothurOut("Example get.oturep(fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups, line=1-3-5, name=amazon.names).\n");
143                 mothurOut("The default value for line and label are all lines in your inputfile.\n");
144                 mothurOut("The get.oturep command outputs a .fastarep file for each distance you specify, selecting one OTU representative for each bin.\n");
145                 mothurOut("If you provide a groupfile, then it also appends the names of the groups present in that bin.\n");
146                 mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
147         }
148         catch(exception& e) {
149                 errorOut(e, "GetOTURepCommand", "help");
150                 exit(1);
151         }
152 }
153
154 //**********************************************************************************************************************
155
156 GetOTURepCommand::~GetOTURepCommand(){
157         if (abort == false) {
158                 delete input;  globaldata->ginput = NULL;
159                 delete read;
160                 delete fasta;
161                 if (groupfile != "") {
162                         delete groupMap;  globaldata->gGroupmap = NULL;
163                 }
164         }
165 }
166
167 //**********************************************************************************************************************
168
169 int GetOTURepCommand::execute(){
170         try {
171         
172                 if (abort == true) { return 0; }
173
174                 int count = 1;
175                 int error;
176                 
177                 //read fastafile
178                 fasta->readFastaFile(fastafile);
179                 
180                 in.close();
181                 
182                 //set format to list so input can get listvector
183                 globaldata->setFormat("list");
184                 
185                 //if user gave a namesfile then use it
186                 if (namesfile != "") {
187                         readNamesFile();
188                 }
189                 
190                 //read list file
191                 read = new ReadOTUFile(listfile);
192                 read->read(&*globaldata); 
193                 
194                 input = globaldata->ginput;
195                 list = globaldata->gListVector;
196                 string lastLabel = list->getLabel();
197                 
198                 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
199                 set<string> processedLabels;
200                 set<string> userLabels = labels;
201                 set<int> userLines = lines;
202
203                 
204                 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0) || (userLines.size() != 0))) {
205                         
206                         if (allLines == 1 || lines.count(count) == 1 || labels.count(list->getLabel()) == 1){
207                                         mothurOut(list->getLabel() + "\t" + toString(list->size()) + "\t" + toString(count)); mothurOutEndLine();
208                                         error = process(list);
209                                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
210                                         
211                                         processedLabels.insert(list->getLabel());
212                                         userLabels.erase(list->getLabel());
213                                         userLines.erase(count);
214                         }
215                         
216                         if ((anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
217                                         delete list;
218                                         list = input->getListVector(lastLabel);
219                                         mothurOut(list->getLabel() + "\t" + toString(list->size()) + "\t" + toString(count)); mothurOutEndLine();
220                                         error = process(list);
221                                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
222                                         
223                                         processedLabels.insert(list->getLabel());
224                                         userLabels.erase(list->getLabel());
225                         }
226                         
227                         lastLabel = list->getLabel();
228                         
229                         delete list;
230                         list = input->getListVector();
231                         count++;
232                 }
233                 
234                 //output error messages about any remaining user labels
235                 bool needToRun = false;
236                 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {  
237                         mothurOut("Your file does not include the label " + *it); 
238                         if (processedLabels.count(list->getLabel()) != 1) {
239                                 mothurOut(". I will use " + lastLabel + "."); mothurOutEndLine();
240                                 needToRun = true;
241                         }else {
242                                 mothurOut(". Please refer to " + lastLabel + "."); mothurOutEndLine();
243                         }
244                 }
245                 
246                 //run last line if you need to
247                 if (needToRun == true)  {
248                         delete list;
249                         list = input->getListVector(lastLabel);
250                         mothurOut(list->getLabel() + "\t" + toString(list->size()) + "\t" + toString(count)); mothurOutEndLine();
251                         error = process(list);
252                         delete list;
253                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
254                 }
255                 
256                 delete matrix;
257                 globaldata->gSparseMatrix = NULL;
258                 delete list;
259                 globaldata->gListVector = NULL;
260
261                 return 0;
262         }
263         catch(exception& e) {
264                 errorOut(e, "GetOTURepCommand", "execute");
265                 exit(1);
266         }
267 }
268
269 //**********************************************************************************************************************
270 void GetOTURepCommand::readNamesFile() {
271         try {
272                 vector<string> dupNames;
273                 openInputFile(namesfile, inNames);
274                 
275                 string name, names, sequence;
276         
277                 while(inNames){
278                         inNames >> name;                        //read from first column  A
279                         inNames >> names;               //read from second column  A,B,C,D
280                         
281                         dupNames.clear();
282                         
283                         //parse names into vector
284                         splitAtComma(names, dupNames);
285                         
286                         //store names in fasta map
287                         sequence = fasta->getSequence(name);
288                         for (int i = 0; i < dupNames.size(); i++) {
289                                 fasta->push_back(dupNames[i], sequence);
290                         }
291                 
292                         gobble(inNames);
293                 }
294                 inNames.close();
295
296         }
297         catch(exception& e) {
298                 errorOut(e, "GetOTURepCommand", "readNamesFile");
299                 exit(1);
300         }
301 }
302 //**********************************************************************************************************************
303 string GetOTURepCommand::findRep(int bin, string& group, ListVector* thisList, int& binsize) {
304         try{
305                 vector<string> names;
306                 map<string, string> groups;
307                 map<string, string>::iterator groupIt;
308
309                 //parse names into vector
310                 string binnames = thisList->get(bin);
311                 splitAtComma(binnames, names);
312                 binsize = names.size();
313
314                 //if you have a groupfile
315                 if (groupfile != "") {
316                         //find the groups that are in this bin
317                         for (size_t i = 0; i < names.size(); i++) {
318                                 string groupName = groupMap->getGroup(names[i]);
319                                 if (groupName == "not found") {  
320                                         mothurOut(names[i] + " is missing from your group file. Please correct. "); mothurOutEndLine();
321                                         groupError = true;
322                                 } else {
323                                         groups[groupName] = groupName;
324                                 }
325                         }
326                         
327                         //turn the groups into a string
328                         for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) {
329                                 group += groupIt->first + "-";
330                         }
331                         //rip off last dash
332                         group = group.substr(0, group.length()-1);
333                 }
334
335                 // if only 1 sequence in bin or processing the "unique" line, then 
336                 // the first sequence of the OTU is the representative one
337                 if ((names.size() == 1) || (list->getLabel() == "unique")) {
338                         return names[0];
339                 } else {
340                         vector<int> seqIndex(names.size());
341                         vector<float> max_dist(names.size());
342
343                         //fill seqIndex and initialize sums
344                         for (size_t i = 0; i < names.size(); i++) {
345                                 seqIndex[i] = nameToIndex[names[i]];
346                                 max_dist[i] = 0.0;
347                         }
348                         
349                         // loop through all entries in seqIndex
350                         SeqMap::iterator it;
351                         SeqMap currMap;
352                         for (size_t i=0; i < seqIndex.size(); i++) {
353                                 currMap = seqVec[seqIndex[i]];
354                                 for (size_t j=0; j < seqIndex.size(); j++) {
355                                         it = currMap.find(seqIndex[j]);         
356                                         if (it != currMap.end()) {
357                                                 max_dist[i] = max(max_dist[i], it->second);
358                                                 max_dist[j] = max(max_dist[j], it->second);
359                                         }
360                                 }
361                         }
362                         
363                         // sequence with the smallest maximum distance is the representative
364                         float min = 10000;
365                         int minIndex;
366                         for (size_t i=0; i < max_dist.size(); i++) {
367                                 if (max_dist[i] < min) {
368                                         min = max_dist[i];
369                                         minIndex = i;
370                                 }
371                         }
372
373                         return(names[minIndex]);
374                 }
375         }
376         catch(exception& e) {
377                 errorOut(e, "GetOTURepCommand", "FindRep");
378                 exit(1);
379         }
380 }
381
382 //**********************************************************************************************************************
383 int GetOTURepCommand::process(ListVector* processList) {
384         try{
385                 string nameRep, name, sequence;
386
387                 //create output file
388                 string outputFileName = getRootName(listfile) + processList->getLabel() + ".rep.fasta";
389                 openOutputFile(outputFileName, out);
390
391                 //for each bin in the list vector
392                 for (int i = 0; i < processList->size(); i++) {
393                         string groups;
394                         int binsize;
395                         nameRep = findRep(i, groups, processList, binsize);
396
397                         //print out name and sequence for that bin
398                         sequence = fasta->getSequence(nameRep);
399
400                         if (sequence != "not found") {
401                                 nameRep = nameRep + "|" + toString(i+1);
402                                 nameRep = nameRep + "|" + toString(binsize);
403                                 if (groupfile != "") {
404                                         nameRep = nameRep + "|" + groups;
405                                 }
406                                 out << ">" << nameRep << endl;
407                                 out << sequence << endl;
408                         } else { 
409                                 mothurOut(nameRep + " is missing from your fasta or name file. Please correct. "); mothurOutEndLine(); 
410                                 remove(outputFileName.c_str());
411                                 return 1;
412                         }
413                 }
414
415                 out.close();
416                 return 0;
417
418         }
419         catch(exception& e) {
420                 errorOut(e, "GetOTURepCommand", "process");
421                 exit(1);
422         }
423 }
424
425 //**********************************************************************************************************************