]> git.donarmstrong.com Git - mothur.git/blob - getoturepcommand.cpp
broke up globaldata and moved error checking and help into commands
[mothur.git] / getoturepcommand.cpp
1 /*
2  *  getoturepcommand.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 4/6/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "getoturepcommand.h"
11
12 //**********************************************************************************************************************
13 GetOTURepCommand::GetOTURepCommand(string option){
14         try{
15                 globaldata = GlobalData::getInstance();
16                 abort = false;
17                 allLines = 1;
18                 lines.clear();
19                 labels.clear();
20                 
21                 //allow user to run help
22                 if(option == "help") { help(); abort = true; }
23                 
24                 else {
25                         //valid paramters for this command
26                         string Array[] =  {"fasta","list","line","label","name", "group"};
27                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
28                         
29                         parser = new OptionParser();
30                         parser->parse(option, parameters);  delete parser;
31                         
32                         ValidParameters* validParameter = new ValidParameters();
33                 
34                         //check to make sure all parameters are valid for command
35                         for (it4 = parameters.begin(); it4 != parameters.end(); it4++) { 
36                                 if (validParameter->isValidParameter(it4->first, myArray, it4->second) != true) {  abort = true;  }
37                         }
38                         
39                         //make sure the user has already run the read.otu command
40                         if ((globaldata->gSparseMatrix == NULL) || (globaldata->gListVector == NULL)) {
41                                 cout << "Before you use the get.oturep command, you first need to read in a distance matrix." << endl; 
42                                 abort = true;
43                         }
44                         
45                         //check for required parameters
46                         fastafile = validParameter->validFile(parameters, "fasta", true);
47                         if (fastafile == "not found") { cout << "fasta is a required parameter for the get.oturep command." << endl; abort = true; }
48                         else if (fastafile == "not open") { abort = true; }     
49                         else { 
50                                 globaldata->setFastaFile(fastafile);
51                         }
52                 
53                         listfile = validParameter->validFile(parameters, "list", true);
54                         if (listfile == "not found") { cout << "list is a required parameter for the get.oturep command." << endl; abort = true; }
55                         else if (listfile == "not open") { abort = true; }      
56                         else { 
57                                 globaldata->setListFile(listfile);
58                         }
59
60                         //check for optional parameter and set defaults
61                         // ...at some point should added some additional type checking...
62                         line = validParameter->validFile(parameters, "line", false);                            
63                         if (line == "not found") { line = "";  }
64                         else { 
65                                 if(line != "all") {  splitAtDash(line, lines);  allLines = 0;  }
66                                 else { allLines = 1;  }
67                         }
68                         
69                         label = validParameter->validFile(parameters, "label", false);                  
70                         if (label == "not found") { label = ""; }
71                         else { 
72                                 if(label != "all") {  splitAtDash(label, labels);  allLines = 0;  }
73                                 else { allLines = 1;  }
74                         }
75                         
76                         //make sure user did not use both the line and label parameters
77                         if ((line != "") && (label != "")) { cout << "You cannot use both the line and label parameters at the same time. " << endl; abort = true; }
78                         //if the user has not specified any line or labels use the ones from read.otu
79                         else if ((line == "") && (label == "")) {  
80                                 allLines = globaldata->allLines; 
81                                 labels = globaldata->labels; 
82                                 lines = globaldata->lines;
83                         }
84                         
85                         namesfile = validParameter->validFile(parameters, "name", true);
86                         if (namesfile == "not open") { abort = true; }  
87                         else if (namesfile == "not found") { namesfile = ""; }
88
89                         groupfile = validParameter->validFile(parameters, "group", true);
90                         if (groupfile == "not open") { abort = true; }
91                         else if (groupfile == "not found") { groupfile = ""; }
92                         else {
93                                 //read in group map info.
94                                 groupMap = new GroupMap(groupfile);
95                                 groupMap->readMap();
96                         }
97         
98                         delete validParameter;
99         
100                         if (abort == false) {
101                         
102                                 if(globaldata->gSparseMatrix != NULL)   {       matrix = new SparseMatrix(*globaldata->gSparseMatrix);          }       
103                                         
104                                 //globaldata->gListVector bin 0 = first name read in distance matrix, globaldata->gListVector bin 1 = second name read in distance matrix
105                                 if(globaldata->gListVector != NULL)             {
106                 
107                                         vector<string> names;
108                                         string binnames;
109                                         //map names to rows in sparsematrix
110                                         for (int i = 0; i < globaldata->gListVector->size(); i++) {
111                                                 names.clear();
112                                                 binnames = globaldata->gListVector->get(i);
113         
114                                                 splitAtComma(binnames, names);
115                                 
116                                                 for (int j = 0; j < names.size(); j++) {
117                                                         nameToIndex[names[j]] = i;
118                                                 }
119         
120                                         }
121                                 }else { cout << "error, no listvector." << endl; }
122                                 
123                                 openInputFile(fastafile, in);
124                                 fasta = new FastaMap();
125                         }
126                 
127                 }
128         
129         }
130         catch(exception& e) {
131                 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function GetOTURepCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
132                 exit(1);
133         }
134         catch(...) {
135                 cout << "An unknown error has occurred in the GetOTURepCommand class function GetOTURepCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
136                 exit(1);
137         }
138 }
139
140 //**********************************************************************************************************************
141
142 void GetOTURepCommand::help(){
143         try {
144                 cout << "The get.oturep command can only be executed after a successful read.dist command." << "\n";
145                 cout << "The get.oturep command parameters are list, fasta, name, group, line and label.  The fasta and list parameters are required, and you may not use line and label at the same time." << "\n";
146                 cout << "The line and label allow you to select what distance levels you would like a output files created for, and are separated by dashes." << "\n";
147                 cout << "The get.oturep command should be in the following format: get.oturep(fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, line=yourLines, label=yourLabels)." << "\n";
148                 cout << "Example get.oturep(fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups, line=1-3-5, name=amazon.names)." << "\n";
149                 cout << "The default value for line and label are all lines in your inputfile." << "\n";
150                 cout << "The get.oturep command outputs a .fastarep file for each distance you specify, selecting one OTU representative for each bin." << "\n";
151                 cout << "If you provide a groupfile, then it also appends the names of the groups present in that bin." << "\n";
152                 cout << "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile)." << "\n" << "\n";
153         }
154         catch(exception& e) {
155                 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function help. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
156                 exit(1);
157         }
158         catch(...) {
159                 cout << "An unknown error has occurred in the GetOTURepCommand class function help. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
160                 exit(1);
161         }       
162 }
163
164 //**********************************************************************************************************************
165
166 GetOTURepCommand::~GetOTURepCommand(){
167         delete input;
168         delete read;
169         delete fasta;
170         if (groupfile != "") {
171                 delete groupMap;
172         }
173 }
174
175 //**********************************************************************************************************************
176
177 int GetOTURepCommand::execute(){
178         try {
179         
180                 if (abort == true) { return 0; }
181                 
182                 int count = 1;
183                 int error;
184                 
185                 //read fastafile
186                 fasta->readFastaFile(in);
187                 
188                 //set format to list so input can get listvector
189                 globaldata->setFormat("list");
190                 
191                 //if user gave a namesfile then use it
192                 if (namesfile != "") {
193                         readNamesFile();
194                 }
195                 
196                 //read list file
197                 read = new ReadOTUFile(listfile);       
198                 read->read(&*globaldata); 
199                 
200                 input = globaldata->ginput;
201                 list = globaldata->gListVector;
202                 ListVector* lastList = list;
203                 
204                 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
205                 set<string> processedLabels;
206                 set<string> userLabels = labels;
207                 set<int> userLines = lines;
208
209                 
210                 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0) || (userLines.size() != 0))) {
211                         
212                         if(allLines == 1 || lines.count(count) == 1 || labels.count(list->getLabel()) == 1){
213                                         cout << list->getLabel() << '\t' << count << endl;
214                                         error = process(list);
215                                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
216                                         
217                                         processedLabels.insert(list->getLabel());
218                                         userLabels.erase(list->getLabel());
219                                         userLines.erase(count);
220                         }
221                         
222                         if ((anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastList->getLabel()) != 1)) {
223                                         cout << lastList->getLabel() << '\t' << count << endl;
224                                         error = process(lastList);
225                                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
226                                         
227                                         processedLabels.insert(lastList->getLabel());
228                                         userLabels.erase(lastList->getLabel());
229                         }
230                         
231                         if (count != 1) { delete lastList; }
232                         lastList = list;                        
233                         
234                         list = input->getListVector();
235                         count++;
236                 }
237                 
238                 //output error messages about any remaining user labels
239                 set<string>::iterator it;
240                 bool needToRun = false;
241                 for (it = userLabels.begin(); it != userLabels.end(); it++) {  
242                         cout << "Your file does not include the label "<< *it; 
243                         if (processedLabels.count(lastList->getLabel()) != 1) {
244                                 cout << ". I will use " << lastList->getLabel() << "." << endl;
245                                 needToRun = true;
246                         }else {
247                                 cout << ". Please refer to " << lastList->getLabel() << "." << endl;
248                         }
249                 }
250                 
251                 //run last line if you need to
252                 if (needToRun == true)  {
253                         cout << lastList->getLabel() << '\t' << count << endl;
254                         error = process(lastList);
255                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
256                 }
257                 delete lastList;
258                 
259                 delete matrix;
260                 globaldata->gSparseMatrix = NULL;
261                 delete list;
262                 globaldata->gListVector = NULL;
263
264                 return 0;
265         }
266         catch(exception& e) {
267                 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
268                 exit(1);
269         }
270         catch(...) {
271                 cout << "An unknown error has occurred in the GetOTURepCommand class function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
272                 exit(1);
273         }
274
275 }
276
277 //**********************************************************************************************************************
278 void GetOTURepCommand::readNamesFile() {
279         try {
280                 vector<string> dupNames;
281                 openInputFile(namesfile, inNames);
282                 
283                 string name, names, sequence;
284         
285                 while(inNames){
286                         inNames >> name;                        //read from first column  A
287                         inNames >> names;               //read from second column  A,B,C,D
288                         
289                         dupNames.clear();
290                         
291                         //parse names into vector
292                         splitAtComma(names, dupNames);
293                         
294                         //store names in fasta map
295                         sequence = fasta->getSequence(name);
296                         for (int i = 0; i < dupNames.size(); i++) {
297                                 fasta->push_back(dupNames[i], sequence);
298                         }
299                 
300                         gobble(inNames);
301                 }
302                 inNames.close();
303
304         }
305         catch(exception& e) {
306                 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function readNamesFile. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
307                 exit(1);
308         }
309         catch(...) {
310                 cout << "An unknown error has occurred in the GetOTURepCommand class function readNamesFile. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
311                 exit(1);
312         }       
313 }
314 //**********************************************************************************************************************
315 string GetOTURepCommand::FindRep(int bin, string& group, ListVector* thisList) {
316         try{
317                 vector<string> names;
318                 map<string, float> sums;
319                 map<string, float>::iterator it4;
320                 map<int, string> binMap; //subset of namesToIndex - just member of this bin
321                 string binnames;
322                 float min = 10000;
323                 string minName;
324                 map<string, string> groups;
325                 map<string, string>::iterator groupIt;
326                 
327                 binnames = thisList->get(bin);
328         
329                 //parse names into vector
330                 splitAtComma(binnames, names);
331                 
332                 //if you have a groupfile
333                 if(groupfile != "") {
334                         //find the groups that are in this bin
335                         for (int i = 0; i < names.size(); i++) {
336                                 string groupName = groupMap->getGroup(names[i]);
337                                 if (groupName == "not found") {  
338                                         cout << names[i] << " is missing from your group file. Please correct. " << endl;
339                                         groupError = true;
340                                 }else{
341                                         groups[groupName] = groupName;
342                                 }
343                         }
344                         
345                         //turn the groups into a string
346                         for(groupIt = groups.begin(); groupIt != groups.end(); groupIt++) { group += groupIt->first + "-"; }
347                         
348                         //rip off last dash
349                         group = group.substr(0, group.length()-1);
350                 }
351                 
352                 //if only 1 sequence in bin then that's the rep
353                 if (names.size() == 1) { return names[0]; }
354                 else {
355                         //fill binMap
356                         for (int i = 0; i < names.size(); i++) {
357                                 for (it3 = nameToIndex.begin(); it3 != nameToIndex.end(); it3++) {
358
359                                         if (it3->first == names[i]) {  
360                                                 binMap[it3->second] = it3->first;
361
362                                                 //initialize sums map
363                                                 sums[it3->first] = 0.0;
364                                                 break;
365                                         }
366                                 }
367                         }
368                         
369                         //go through each cell in the sparsematrix
370                         for(MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++){
371                                 //is this a distance between 2 members of this bin
372                                 it = binMap.find(currentCell->row);
373                                 it2 = binMap.find(currentCell->column);
374                                 
375                                 //sum the distance of the sequences in the bin to eachother
376                                 if ((it != binMap.end()) && (it2 != binMap.end())) {
377                                         //this is a cell that repesents the distance between to of this bins members
378                                         sums[it->second] += currentCell->dist;
379                                         sums[it2->second] += currentCell->dist;
380                                 }
381                         }
382                         
383                         //smallest sum is the representative
384                         for (it4 = sums.begin(); it4 != sums.end(); it4++) {
385                                 if (it4->second < min) {
386                                         min = it4->second;
387                                         minName = it4->first;
388                                 }
389
390                         }
391                         
392                         return minName;
393                 }
394         
395         }
396         catch(exception& e) {
397                 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function FindRep. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
398                 exit(1);
399         }
400         catch(...) {
401                 cout << "An unknown error has occurred in the GetOTURepCommand class function FindRep. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
402                 exit(1);
403         }       
404 }
405
406 //**********************************************************************************************************************
407 int GetOTURepCommand::process(ListVector* processList) {
408         try{
409                                 string nameRep, name, sequence;
410
411                                 //create output file
412                                 string outputFileName = getRootName(listfile) + processList->getLabel() + ".rep.fasta";
413                                 openOutputFile(outputFileName, out);
414                                 
415                                 //for each bin in the list vector
416                                 for (int i = 0; i < processList->size(); i++) {
417                                         string groups;
418                                         nameRep = FindRep(i, groups, processList);
419                                         
420                                         //print out name and sequence for that bin
421                                         sequence = fasta->getSequence(nameRep);
422
423                                         if (sequence != "not found") {
424                                                 if (groupfile == "") {
425                                                         nameRep = nameRep + "|" + toString(i+1);
426                                                         out << ">" << nameRep << endl;
427                                                         out << sequence << endl;
428                                                 }else {
429                                                         nameRep = nameRep + "|" + groups + "|" + toString(i+1);
430                                                         out << ">" << nameRep << endl;
431                                                         out << sequence << endl;
432                                                 }
433                                         }else { 
434                                                 cout << nameRep << " is missing from your fasta or name file. Please correct. " << endl; 
435                                                 remove(outputFileName.c_str());
436                                                 return 1;
437                                         }
438                                 }
439                                 
440                                 out.close();
441                                 return 0;
442         
443         }
444         catch(exception& e) {
445                 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function process. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
446                 exit(1);
447         }
448         catch(...) {
449                 cout << "An unknown error has occurred in the GetOTURepCommand class function process. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
450                 exit(1);
451         }       
452 }
453
454 //**********************************************************************************************************************
455
456
457
458
459