]> git.donarmstrong.com Git - mothur.git/blob - getoturepcommand.cpp
fixed some bugs
[mothur.git] / getoturepcommand.cpp
1 /*
2  *  getoturepcommand.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 4/6/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "getoturepcommand.h"
11
12 //**********************************************************************************************************************
13 GetOTURepCommand::GetOTURepCommand(string option){
14         try{
15                 globaldata = GlobalData::getInstance();
16                 abort = false;
17                 allLines = 1;
18                 lines.clear();
19                 labels.clear();
20                 
21                 //allow user to run help
22                 if(option == "help") { help(); abort = true; }
23                 
24                 else {
25                         //valid paramters for this command
26                         string Array[] =  {"fasta","list","line","label","name", "group"};
27                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
28                         
29                         OptionParser parser(option);
30                         map<string, string> parameters = parser.getParameters();
31                         
32                         ValidParameters validParameter;
33                 
34                         //check to make sure all parameters are valid for command
35                         for (map<string, string>::iterator it = parameters.begin(); it != parameters.end(); it++) { 
36                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
37                         }
38                         
39                         //make sure the user has already run the read.otu command
40                         if ((globaldata->gSparseMatrix == NULL) || (globaldata->gListVector == NULL)) {
41                                 cout << "Before you use the get.oturep command, you first need to read in a distance matrix." << endl; 
42                                 abort = true;
43                         }
44                         
45                         //check for required parameters
46                         fastafile = validParameter.validFile(parameters, "fasta", true);
47                         if (fastafile == "not found") { cout << "fasta is a required parameter for the get.oturep command." << endl; abort = true; }
48                         else if (fastafile == "not open") { abort = true; }     
49                 
50                         listfile = validParameter.validFile(parameters, "list", true);
51                         if (listfile == "not found") { cout << "list is a required parameter for the get.oturep command." << endl; abort = true; }
52                         else if (listfile == "not open") { abort = true; }      
53
54                         //check for optional parameter and set defaults
55                         // ...at some point should added some additional type checking...
56                         line = validParameter.validFile(parameters, "line", false);                             
57                         if (line == "not found") { line = "";  }
58                         else { 
59                                 if(line != "all") {  splitAtDash(line, lines);  allLines = 0;  }
60                                 else { allLines = 1;  }
61                         }
62                         
63                         label = validParameter.validFile(parameters, "label", false);                   
64                         if (label == "not found") { label = ""; }
65                         else { 
66                                 if(label != "all") {  splitAtDash(label, labels);  allLines = 0;  }
67                                 else { allLines = 1;  }
68                         }
69                         
70                         //make sure user did not use both the line and label parameters
71                         if ((line != "") && (label != "")) { cout << "You cannot use both the line and label parameters at the same time. " << endl; abort = true; }
72                         //if the user has not specified any line or labels use the ones from read.otu
73                         else if ((line == "") && (label == "")) {  
74                                 allLines = globaldata->allLines; 
75                                 labels = globaldata->labels; 
76                                 lines = globaldata->lines;
77                         }
78                         
79                         namesfile = validParameter.validFile(parameters, "name", true);
80                         if (namesfile == "not open") { abort = true; }  
81                         else if (namesfile == "not found") { namesfile = ""; }
82
83                         groupfile = validParameter.validFile(parameters, "group", true);
84                         if (groupfile == "not open") { abort = true; }
85                         else if (groupfile == "not found") { groupfile = ""; }
86                         else {
87                                 //read in group map info.
88                                 groupMap = new GroupMap(groupfile);
89                                 groupMap->readMap();
90                         }
91                 
92                         if (abort == false) {
93                         
94                                 if(globaldata->gSparseMatrix != NULL)   {       matrix = globaldata->gSparseMatrix;             }       
95                                         
96                                 //globaldata->gListVector bin 0 = first name read in distance matrix, globaldata->gListVector bin 1 = second name read in distance matrix
97                                 if(globaldata->gListVector != NULL)             {
98                 
99                                         vector<string> names;
100                                         string binnames;
101                                         //map names to rows in sparsematrix
102                                         for (int i = 0; i < globaldata->gListVector->size(); i++) {
103                                                 names.clear();
104                                                 binnames = globaldata->gListVector->get(i);
105         
106                                                 splitAtComma(binnames, names);
107                                 
108                                                 for (int j = 0; j < names.size(); j++) {
109                                                         nameToIndex[names[j]] = i;
110                                                 }
111         
112                                         }
113                                 }else { cout << "error, no listvector." << endl; }
114                                 
115                                 openInputFile(fastafile, in);
116                                 fasta = new FastaMap();
117                         }
118                 
119                 }
120         
121         }
122         catch(exception& e) {
123                 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function GetOTURepCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
124                 exit(1);
125         }
126         catch(...) {
127                 cout << "An unknown error has occurred in the GetOTURepCommand class function GetOTURepCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
128                 exit(1);
129         }
130 }
131
132 //**********************************************************************************************************************
133
134 void GetOTURepCommand::help(){
135         try {
136                 cout << "The get.oturep command can only be executed after a successful read.dist command." << "\n";
137                 cout << "The get.oturep command parameters are list, fasta, name, group, line and label.  The fasta and list parameters are required, and you may not use line and label at the same time." << "\n";
138                 cout << "The line and label allow you to select what distance levels you would like a output files created for, and are separated by dashes." << "\n";
139                 cout << "The get.oturep command should be in the following format: get.oturep(fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, line=yourLines, label=yourLabels)." << "\n";
140                 cout << "Example get.oturep(fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups, line=1-3-5, name=amazon.names)." << "\n";
141                 cout << "The default value for line and label are all lines in your inputfile." << "\n";
142                 cout << "The get.oturep command outputs a .fastarep file for each distance you specify, selecting one OTU representative for each bin." << "\n";
143                 cout << "If you provide a groupfile, then it also appends the names of the groups present in that bin." << "\n";
144                 cout << "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile)." << "\n" << "\n";
145         }
146         catch(exception& e) {
147                 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function help. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
148                 exit(1);
149         }
150         catch(...) {
151                 cout << "An unknown error has occurred in the GetOTURepCommand class function help. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
152                 exit(1);
153         }       
154 }
155
156 //**********************************************************************************************************************
157
158 GetOTURepCommand::~GetOTURepCommand(){
159         if (abort == false) {
160                 delete input;  globaldata->ginput = NULL;
161                 delete read;
162                 delete fasta;
163                 if (groupfile != "") {
164                         delete groupMap;  globaldata->gGroupmap = NULL;
165                 }
166         }
167 }
168
169 //**********************************************************************************************************************
170
171 int GetOTURepCommand::execute(){
172         try {
173         
174                 if (abort == true) { return 0; }
175                 
176                 int count = 1;
177                 int error;
178                 
179                 //read fastafile
180                 fasta->readFastaFile(in);
181                 
182                 in.close();
183                 
184                 //set format to list so input can get listvector
185                 globaldata->setFormat("list");
186                 
187                 //if user gave a namesfile then use it
188                 if (namesfile != "") {
189                         readNamesFile();
190                 }
191                 
192                 //read list file
193                 read = new ReadOTUFile(listfile);       
194                 read->read(&*globaldata); 
195                 
196                 input = globaldata->ginput;
197                 list = globaldata->gListVector;
198                 ListVector* lastList = list;
199                 
200                 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
201                 set<string> processedLabels;
202                 set<string> userLabels = labels;
203                 set<int> userLines = lines;
204
205                 
206                 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0) || (userLines.size() != 0))) {
207                         
208                         if(allLines == 1 || lines.count(count) == 1 || labels.count(list->getLabel()) == 1){
209                                         cout << list->getLabel() << '\t' << count << endl;
210                                         error = process(list);
211                                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
212                                         
213                                         processedLabels.insert(list->getLabel());
214                                         userLabels.erase(list->getLabel());
215                                         userLines.erase(count);
216                         }
217                         
218                         if ((anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastList->getLabel()) != 1)) {
219                                         cout << lastList->getLabel() << '\t' << count << endl;
220                                         error = process(lastList);
221                                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
222                                         
223                                         processedLabels.insert(lastList->getLabel());
224                                         userLabels.erase(lastList->getLabel());
225                         }
226                         
227                         if (count != 1) { delete lastList; }
228                         lastList = list;                        
229                         
230                         list = input->getListVector();
231                         count++;
232                 }
233                 
234                 //output error messages about any remaining user labels
235                 bool needToRun = false;
236                 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {  
237                         cout << "Your file does not include the label "<< *it; 
238                         if (processedLabels.count(lastList->getLabel()) != 1) {
239                                 cout << ". I will use " << lastList->getLabel() << "." << endl;
240                                 needToRun = true;
241                         }else {
242                                 cout << ". Please refer to " << lastList->getLabel() << "." << endl;
243                         }
244                 }
245                 
246                 //run last line if you need to
247                 if (needToRun == true)  {
248                         cout << lastList->getLabel() << '\t' << count << endl;
249                         error = process(lastList);
250                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
251                 }
252                 delete lastList;
253                 
254                 delete matrix;
255                 globaldata->gSparseMatrix = NULL;
256                 delete list;
257                 globaldata->gListVector = NULL;
258
259                 return 0;
260         }
261         catch(exception& e) {
262                 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
263                 exit(1);
264         }
265         catch(...) {
266                 cout << "An unknown error has occurred in the GetOTURepCommand class function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
267                 exit(1);
268         }
269
270 }
271
272 //**********************************************************************************************************************
273 void GetOTURepCommand::readNamesFile() {
274         try {
275                 vector<string> dupNames;
276                 openInputFile(namesfile, inNames);
277                 
278                 string name, names, sequence;
279         
280                 while(inNames){
281                         inNames >> name;                        //read from first column  A
282                         inNames >> names;               //read from second column  A,B,C,D
283                         
284                         dupNames.clear();
285                         
286                         //parse names into vector
287                         splitAtComma(names, dupNames);
288                         
289                         //store names in fasta map
290                         sequence = fasta->getSequence(name);
291                         for (int i = 0; i < dupNames.size(); i++) {
292                                 fasta->push_back(dupNames[i], sequence);
293                         }
294                 
295                         gobble(inNames);
296                 }
297                 inNames.close();
298
299         }
300         catch(exception& e) {
301                 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function readNamesFile. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
302                 exit(1);
303         }
304         catch(...) {
305                 cout << "An unknown error has occurred in the GetOTURepCommand class function readNamesFile. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
306                 exit(1);
307         }       
308 }
309 //**********************************************************************************************************************
310 string GetOTURepCommand::FindRep(int bin, string& group, ListVector* thisList) {
311         try{
312                 vector<string> names;
313                 map<string, float> sums;
314                 map<int, string> binMap; //subset of namesToIndex - just member of this bin
315                 string binnames;
316                 float min = 10000;
317                 string minName;
318                 map<string, string> groups;
319                 map<string, string>::iterator groupIt;
320                 
321                 binnames = thisList->get(bin);
322         
323                 //parse names into vector
324                 splitAtComma(binnames, names);
325                 
326                 //if you have a groupfile
327                 if(groupfile != "") {
328                         //find the groups that are in this bin
329                         for (int i = 0; i < names.size(); i++) {
330                                 string groupName = groupMap->getGroup(names[i]);
331                                 if (groupName == "not found") {  
332                                         cout << names[i] << " is missing from your group file. Please correct. " << endl;
333                                         groupError = true;
334                                 }else{
335                                         groups[groupName] = groupName;
336                                 }
337                         }
338                         
339                         //turn the groups into a string
340                         for(groupIt = groups.begin(); groupIt != groups.end(); groupIt++) { group += groupIt->first + "-"; }
341                         
342                         //rip off last dash
343                         group = group.substr(0, group.length()-1);
344                 }
345                 
346                 //if only 1 sequence in bin then that's the rep
347                 if (names.size() == 1) { return names[0]; }
348                 else {
349                         //fill binMap
350                         for (int i = 0; i < names.size(); i++) {
351                                 for (map<string, int>::iterator it = nameToIndex.begin(); it != nameToIndex.end(); it++) {
352
353                                         if (it->first == names[i]) {  
354                                                 binMap[it->second] = it->first;
355
356                                                 //initialize sums map
357                                                 sums[it->first] = 0.0;
358                                                 break;
359                                         }
360                                 }
361                         }
362                         
363                         //go through each cell in the sparsematrix
364                         for(MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++){
365                                 //is this a distance between 2 members of this bin
366                                 map<int, string>::iterator it = binMap.find(currentCell->row);
367                                 map<int, string>::iterator it2 = binMap.find(currentCell->column);
368                                 
369                                 //sum the distance of the sequences in the bin to eachother
370                                 if ((it != binMap.end()) && (it2 != binMap.end())) {
371                                         //this is a cell that repesents the distance between to of this bins members
372                                         sums[it->second] += currentCell->dist;
373                                         sums[it2->second] += currentCell->dist;
374                                 }
375                         }
376                         
377                         //smallest sum is the representative
378                         for (map<string, float>::iterator it4 = sums.begin(); it4 != sums.end(); it4++) {
379                                 if (it4->second < min) {
380                                         min = it4->second;
381                                         minName = it4->first;
382                                 }
383
384                         }
385                         
386                         return minName;
387                 }
388         
389         }
390         catch(exception& e) {
391                 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function FindRep. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
392                 exit(1);
393         }
394         catch(...) {
395                 cout << "An unknown error has occurred in the GetOTURepCommand class function FindRep. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
396                 exit(1);
397         }       
398 }
399
400 //**********************************************************************************************************************
401 int GetOTURepCommand::process(ListVector* processList) {
402         try{
403                                 string nameRep, name, sequence;
404
405                                 //create output file
406                                 string outputFileName = getRootName(listfile) + processList->getLabel() + ".rep.fasta";
407                                 openOutputFile(outputFileName, out);
408                                 
409                                 //for each bin in the list vector
410                                 for (int i = 0; i < processList->size(); i++) {
411                                         string groups;
412                                         nameRep = FindRep(i, groups, processList);
413                                         
414                                         //print out name and sequence for that bin
415                                         sequence = fasta->getSequence(nameRep);
416
417                                         if (sequence != "not found") {
418                                                 if (groupfile == "") {
419                                                         nameRep = nameRep + "|" + toString(i+1);
420                                                         out << ">" << nameRep << endl;
421                                                         out << sequence << endl;
422                                                 }else {
423                                                         nameRep = nameRep + "|" + groups + "|" + toString(i+1);
424                                                         out << ">" << nameRep << endl;
425                                                         out << sequence << endl;
426                                                 }
427                                         }else { 
428                                                 cout << nameRep << " is missing from your fasta or name file. Please correct. " << endl; 
429                                                 remove(outputFileName.c_str());
430                                                 return 1;
431                                         }
432                                 }
433                                 
434                                 out.close();
435                                 return 0;
436         
437         }
438         catch(exception& e) {
439                 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function process. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
440                 exit(1);
441         }
442         catch(...) {
443                 cout << "An unknown error has occurred in the GetOTURepCommand class function process. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
444                 exit(1);
445         }       
446 }
447
448 //**********************************************************************************************************************
449
450
451
452
453