]> git.donarmstrong.com Git - mothur.git/blob - getoturepcommand.cpp
changed how we do "smart" distancing
[mothur.git] / getoturepcommand.cpp
1 /*
2  *  getoturepcommand.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 4/6/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "getoturepcommand.h"
11
12 //**********************************************************************************************************************
13 GetOTURepCommand::GetOTURepCommand(string option){
14         try{
15                 globaldata = GlobalData::getInstance();
16                 abort = false;
17                 allLines = 1;
18                 lines.clear();
19                 labels.clear();
20                 
21                 //allow user to run help
22                 if(option == "help") { help(); abort = true; }
23                 
24                 else {
25                         //valid paramters for this command
26                         string Array[] =  {"fasta","list","line","label","name", "group"};
27                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
28                         
29                         OptionParser parser(option);
30                         map<string, string> parameters = parser.getParameters();
31                         
32                         ValidParameters validParameter;
33                 
34                         //check to make sure all parameters are valid for command
35                         for (map<string, string>::iterator it = parameters.begin(); it != parameters.end(); it++) { 
36                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
37                         }
38                         
39                         //make sure the user has already run the read.otu command
40                         if ((globaldata->gSparseMatrix == NULL) || (globaldata->gListVector == NULL)) {
41                                 cout << "Before you use the get.oturep command, you first need to read in a distance matrix." << endl; 
42                                 abort = true;
43                         }
44                         
45                         //check for required parameters
46                         fastafile = validParameter.validFile(parameters, "fasta", true);
47                         if (fastafile == "not found") { cout << "fasta is a required parameter for the get.oturep command." << endl; abort = true; }
48                         else if (fastafile == "not open") { abort = true; }     
49                 
50                         listfile = validParameter.validFile(parameters, "list", true);
51                         if (listfile == "not found") { cout << "list is a required parameter for the get.oturep command." << endl; abort = true; }
52                         else if (listfile == "not open") { abort = true; }      
53
54                         //check for optional parameter and set defaults
55                         // ...at some point should added some additional type checking...
56                         line = validParameter.validFile(parameters, "line", false);                             
57                         if (line == "not found") { line = "";  }
58                         else { 
59                                 if(line != "all") {  splitAtDash(line, lines);  allLines = 0;  }
60                                 else { allLines = 1;  }
61                         }
62                         
63                         label = validParameter.validFile(parameters, "label", false);                   
64                         if (label == "not found") { label = ""; }
65                         else { 
66                                 if(label != "all") {  splitAtDash(label, labels);  allLines = 0;  }
67                                 else { allLines = 1;  }
68                         }
69                         
70                         //make sure user did not use both the line and label parameters
71                         if ((line != "") && (label != "")) { cout << "You cannot use both the line and label parameters at the same time. " << endl; abort = true; }
72                         //if the user has not specified any line or labels use the ones from read.otu
73                         else if ((line == "") && (label == "")) {  
74                                 allLines = globaldata->allLines; 
75                                 labels = globaldata->labels; 
76                                 lines = globaldata->lines;
77                         }
78                         
79                         namesfile = validParameter.validFile(parameters, "name", true);
80                         if (namesfile == "not open") { abort = true; }  
81                         else if (namesfile == "not found") { namesfile = ""; }
82
83                         groupfile = validParameter.validFile(parameters, "group", true);
84                         if (groupfile == "not open") { abort = true; }
85                         else if (groupfile == "not found") { groupfile = ""; }
86                         else {
87                                 //read in group map info.
88                                 groupMap = new GroupMap(groupfile);
89                                 groupMap->readMap();
90                         }
91                 
92                         if (abort == false) {
93                         
94                                 if(globaldata->gSparseMatrix != NULL)   {       matrix = globaldata->gSparseMatrix;             }       
95                                         
96                                 //globaldata->gListVector bin 0 = first name read in distance matrix, globaldata->gListVector bin 1 = second name read in distance matrix
97                                 if(globaldata->gListVector != NULL)             {
98                 
99                                         vector<string> names;
100                                         string binnames;
101                                         //map names to rows in sparsematrix
102                                         for (int i = 0; i < globaldata->gListVector->size(); i++) {
103                                                 names.clear();
104                                                 binnames = globaldata->gListVector->get(i);
105         
106                                                 splitAtComma(binnames, names);
107                                 
108                                                 for (int j = 0; j < names.size(); j++) {
109                                                         nameToIndex[names[j]] = i;
110                                                 }
111         
112                                         }
113                                 }else { cout << "error, no listvector." << endl; }
114                                 
115 //                              openInputFile(fastafile, in);
116                                 fasta = new FastaMap();
117                         }
118                 
119                 }
120         
121         }
122         catch(exception& e) {
123                 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function GetOTURepCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
124                 exit(1);
125         }
126         catch(...) {
127                 cout << "An unknown error has occurred in the GetOTURepCommand class function GetOTURepCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
128                 exit(1);
129         }
130 }
131
132 //**********************************************************************************************************************
133
134 void GetOTURepCommand::help(){
135         try {
136                 cout << "The get.oturep command can only be executed after a successful read.dist command." << "\n";
137                 cout << "The get.oturep command parameters are list, fasta, name, group, line and label.  The fasta and list parameters are required, and you may not use line and label at the same time." << "\n";
138                 cout << "The line and label allow you to select what distance levels you would like a output files created for, and are separated by dashes." << "\n";
139                 cout << "The get.oturep command should be in the following format: get.oturep(fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, line=yourLines, label=yourLabels)." << "\n";
140                 cout << "Example get.oturep(fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups, line=1-3-5, name=amazon.names)." << "\n";
141                 cout << "The default value for line and label are all lines in your inputfile." << "\n";
142                 cout << "The get.oturep command outputs a .fastarep file for each distance you specify, selecting one OTU representative for each bin." << "\n";
143                 cout << "If you provide a groupfile, then it also appends the names of the groups present in that bin." << "\n";
144                 cout << "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile)." << "\n" << "\n";
145         }
146         catch(exception& e) {
147                 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function help. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
148                 exit(1);
149         }
150         catch(...) {
151                 cout << "An unknown error has occurred in the GetOTURepCommand class function help. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
152                 exit(1);
153         }       
154 }
155
156 //**********************************************************************************************************************
157
158 GetOTURepCommand::~GetOTURepCommand(){
159         if (abort == false) {
160                 delete input;  globaldata->ginput = NULL;
161                 delete read;
162                 delete fasta;
163                 if (groupfile != "") {
164                         delete groupMap;  globaldata->gGroupmap = NULL;
165                 }
166         }
167 }
168
169 //**********************************************************************************************************************
170
171 int GetOTURepCommand::execute(){
172         try {
173         
174                 if (abort == true) { return 0; }
175                 
176                 int count = 1;
177                 int error;
178                 
179                 //read fastafile
180                 fasta->readFastaFile(fastafile);
181                 
182                 in.close();
183                 
184                 //set format to list so input can get listvector
185                 globaldata->setFormat("list");
186                 
187                 //if user gave a namesfile then use it
188                 if (namesfile != "") {
189                         readNamesFile();
190                 }
191                 
192                 //read list file
193                 read = new ReadOTUFile(listfile);       
194                 read->read(&*globaldata); 
195                 
196                 input = globaldata->ginput;
197                 list = globaldata->gListVector;
198                 string lastLabel = list->getLabel();
199                 
200                 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
201                 set<string> processedLabels;
202                 set<string> userLabels = labels;
203                 set<int> userLines = lines;
204
205                 
206                 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0) || (userLines.size() != 0))) {
207                         
208                         if(allLines == 1 || lines.count(count) == 1 || labels.count(list->getLabel()) == 1){
209                                         cout << list->getLabel() << '\t' << count << endl;
210                                         error = process(list);
211                                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
212                                         
213                                         processedLabels.insert(list->getLabel());
214                                         userLabels.erase(list->getLabel());
215                                         userLines.erase(count);
216                         }
217                         
218                         if ((anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
219                                         delete list;
220                                         list = input->getListVector(lastLabel);
221                                         
222                                         cout << list->getLabel() << '\t' << count << endl;
223                                         error = process(list);
224                                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
225                                         
226                                         processedLabels.insert(list->getLabel());
227                                         userLabels.erase(list->getLabel());
228                         }
229                         
230                         
231                         lastLabel = list->getLabel();                   
232                         
233                         delete list;
234                         list = input->getListVector();
235                         count++;
236                 }
237                 
238                 //output error messages about any remaining user labels
239                 bool needToRun = false;
240                 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {  
241                         cout << "Your file does not include the label "<< *it; 
242                         if (processedLabels.count(lastLabel) != 1) {
243                                 cout << ". I will use " << lastLabel << "." << endl;
244                                 needToRun = true;
245                         }else {
246                                 cout << ". Please refer to " << lastLabel << "." << endl;
247                         }
248                 }
249                 
250                 //run last line if you need to
251                 if (needToRun == true)  {
252                         delete list;
253                         list = input->getListVector(lastLabel);
254
255                         cout << list->getLabel() << '\t' << count << endl;
256                         error = process(list);
257                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
258                         delete list;
259                 }
260                 
261                 delete matrix;
262                 globaldata->gSparseMatrix = NULL;
263                 globaldata->gListVector = NULL;
264
265                 return 0;
266         }
267         catch(exception& e) {
268                 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
269                 exit(1);
270         }
271         catch(...) {
272                 cout << "An unknown error has occurred in the GetOTURepCommand class function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
273                 exit(1);
274         }
275
276 }
277
278 //**********************************************************************************************************************
279 void GetOTURepCommand::readNamesFile() {
280         try {
281                 vector<string> dupNames;
282                 openInputFile(namesfile, inNames);
283                 
284                 string name, names, sequence;
285         
286                 while(inNames){
287                         inNames >> name;                        //read from first column  A
288                         inNames >> names;               //read from second column  A,B,C,D
289                         
290                         dupNames.clear();
291                         
292                         //parse names into vector
293                         splitAtComma(names, dupNames);
294                         
295                         //store names in fasta map
296                         sequence = fasta->getSequence(name);
297                         for (int i = 0; i < dupNames.size(); i++) {
298                                 fasta->push_back(dupNames[i], sequence);
299                         }
300                 
301                         gobble(inNames);
302                 }
303                 inNames.close();
304
305         }
306         catch(exception& e) {
307                 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function readNamesFile. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
308                 exit(1);
309         }
310         catch(...) {
311                 cout << "An unknown error has occurred in the GetOTURepCommand class function readNamesFile. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
312                 exit(1);
313         }       
314 }
315 //**********************************************************************************************************************
316 string GetOTURepCommand::FindRep(int bin, string& group, ListVector* thisList) {
317         try{
318                 vector<string> names;
319                 map<string, float> sums;
320                 map<int, string> binMap; //subset of namesToIndex - just member of this bin
321                 string binnames;
322                 float min = 10000;
323                 string minName;
324                 map<string, string> groups;
325                 map<string, string>::iterator groupIt;
326                 
327                 binnames = thisList->get(bin);
328         
329                 //parse names into vector
330                 splitAtComma(binnames, names);
331                 
332                 //if you have a groupfile
333                 if(groupfile != "") {
334                         //find the groups that are in this bin
335                         for (int i = 0; i < names.size(); i++) {
336                                 string groupName = groupMap->getGroup(names[i]);
337                                 if (groupName == "not found") {  
338                                         cout << names[i] << " is missing from your group file. Please correct. " << endl;
339                                         groupError = true;
340                                 }else{
341                                         groups[groupName] = groupName;
342                                 }
343                         }
344                         
345                         //turn the groups into a string
346                         for(groupIt = groups.begin(); groupIt != groups.end(); groupIt++) { group += groupIt->first + "-"; }
347                         
348                         //rip off last dash
349                         group = group.substr(0, group.length()-1);
350                 }
351                 
352                 //if only 1 sequence in bin then that's the rep
353                 if (names.size() == 1) { return names[0]; }
354                 else {
355                         //fill binMap
356                         for (int i = 0; i < names.size(); i++) {
357                                 for (map<string, int>::iterator it = nameToIndex.begin(); it != nameToIndex.end(); it++) {
358
359                                         if (it->first == names[i]) {  
360                                                 binMap[it->second] = it->first;
361
362                                                 //initialize sums map
363                                                 sums[it->first] = 0.0;
364                                                 break;
365                                         }
366                                 }
367                         }
368                         
369                         //go through each cell in the sparsematrix
370                         for(MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++){
371                                 //is this a distance between 2 members of this bin
372                                 map<int, string>::iterator it = binMap.find(currentCell->row);
373                                 map<int, string>::iterator it2 = binMap.find(currentCell->column);
374                                 
375                                 //sum the distance of the sequences in the bin to eachother
376                                 if ((it != binMap.end()) && (it2 != binMap.end())) {
377                                         //this is a cell that repesents the distance between to of this bins members
378                                         sums[it->second] += currentCell->dist;
379                                         sums[it2->second] += currentCell->dist;
380                                 }
381                         }
382                         
383                         //smallest sum is the representative
384                         for (map<string, float>::iterator it4 = sums.begin(); it4 != sums.end(); it4++) {
385                                 if (it4->second < min) {
386                                         min = it4->second;
387                                         minName = it4->first;
388                                 }
389
390                         }
391                         
392                         return minName;
393                 }
394         
395         }
396         catch(exception& e) {
397                 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function FindRep. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
398                 exit(1);
399         }
400         catch(...) {
401                 cout << "An unknown error has occurred in the GetOTURepCommand class function FindRep. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
402                 exit(1);
403         }       
404 }
405
406 //**********************************************************************************************************************
407 int GetOTURepCommand::process(ListVector* processList) {
408         try{
409                                 string nameRep, name, sequence;
410
411                                 //create output file
412                                 string outputFileName = getRootName(listfile) + processList->getLabel() + ".rep.fasta";
413                                 openOutputFile(outputFileName, out);
414                                 
415                                 //for each bin in the list vector
416                                 for (int i = 0; i < processList->size(); i++) {
417                                         string groups;
418                                         nameRep = FindRep(i, groups, processList);
419                                         
420                                         //print out name and sequence for that bin
421                                         sequence = fasta->getSequence(nameRep);
422
423                                         if (sequence != "not found") {
424                                                 if (groupfile == "") {
425                                                         nameRep = nameRep + "|" + toString(i+1);
426                                                         out << ">" << nameRep << endl;
427                                                         out << sequence << endl;
428                                                 }else {
429                                                         nameRep = nameRep + "|" + groups + "|" + toString(i+1);
430                                                         out << ">" << nameRep << endl;
431                                                         out << sequence << endl;
432                                                 }
433                                         }else { 
434                                                 cout << nameRep << " is missing from your fasta or name file. Please correct. " << endl; 
435                                                 remove(outputFileName.c_str());
436                                                 return 1;
437                                         }
438                                 }
439                                 
440                                 out.close();
441                                 return 0;
442         
443         }
444         catch(exception& e) {
445                 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function process. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
446                 exit(1);
447         }
448         catch(...) {
449                 cout << "An unknown error has occurred in the GetOTURepCommand class function process. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
450                 exit(1);
451         }       
452 }
453
454 //**********************************************************************************************************************
455
456
457
458
459