]> git.donarmstrong.com Git - mothur.git/blob - getoturepcommand.cpp
merged pat's trim seqs edits with sarah's major overhaul of global data; also added...
[mothur.git] / getoturepcommand.cpp
1 /*
2  *  getoturepcommand.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 4/6/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "getoturepcommand.h"
11
12 //**********************************************************************************************************************
13 GetOTURepCommand::GetOTURepCommand(string option){
14         try{
15                 globaldata = GlobalData::getInstance();
16                 abort = false;
17                 allLines = 1;
18                 lines.clear();
19                 labels.clear();
20                 
21                 //allow user to run help
22                 if(option == "help") { help(); abort = true; }
23                 
24                 else {
25                         //valid paramters for this command
26                         string Array[] =  {"fasta","list","line","label","name", "group"};
27                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
28                         
29                         OptionParser parser(option);
30                         map<string, string> parameters = parser.getParameters();
31                         
32                         ValidParameters validParameter;
33                 
34                         //check to make sure all parameters are valid for command
35                         for (map<string, string>::iterator it = parameters.begin(); it != parameters.end(); it++) { 
36                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
37                         }
38                         
39                         //make sure the user has already run the read.otu command
40                         if ((globaldata->gSparseMatrix == NULL) || (globaldata->gListVector == NULL)) {
41                                 cout << "Before you use the get.oturep command, you first need to read in a distance matrix." << endl; 
42                                 abort = true;
43                         }
44                         
45                         //check for required parameters
46                         fastafile = validParameter.validFile(parameters, "fasta", true);
47                         if (fastafile == "not found") { cout << "fasta is a required parameter for the get.oturep command." << endl; abort = true; }
48                         else if (fastafile == "not open") { abort = true; }     
49                 
50                         listfile = validParameter.validFile(parameters, "list", true);
51                         if (listfile == "not found") { cout << "list is a required parameter for the get.oturep command." << endl; abort = true; }
52                         else if (listfile == "not open") { abort = true; }      
53
54                         //check for optional parameter and set defaults
55                         // ...at some point should added some additional type checking...
56                         line = validParameter.validFile(parameters, "line", false);                             
57                         if (line == "not found") { line = "";  }
58                         else { 
59                                 if(line != "all") {  splitAtDash(line, lines);  allLines = 0;  }
60                                 else { allLines = 1;  }
61                         }
62                         
63                         label = validParameter.validFile(parameters, "label", false);                   
64                         if (label == "not found") { label = ""; }
65                         else { 
66                                 if(label != "all") {  splitAtDash(label, labels);  allLines = 0;  }
67                                 else { allLines = 1;  }
68                         }
69                         
70                         //make sure user did not use both the line and label parameters
71                         if ((line != "") && (label != "")) { cout << "You cannot use both the line and label parameters at the same time. " << endl; abort = true; }
72                         //if the user has not specified any line or labels use the ones from read.otu
73                         else if ((line == "") && (label == "")) {  
74                                 allLines = globaldata->allLines; 
75                                 labels = globaldata->labels; 
76                                 lines = globaldata->lines;
77                         }
78                         
79                         namesfile = validParameter.validFile(parameters, "name", true);
80                         if (namesfile == "not open") { abort = true; }  
81                         else if (namesfile == "not found") { namesfile = ""; }
82
83                         groupfile = validParameter.validFile(parameters, "group", true);
84                         if (groupfile == "not open") { abort = true; }
85                         else if (groupfile == "not found") { groupfile = ""; }
86                         else {
87                                 //read in group map info.
88                                 groupMap = new GroupMap(groupfile);
89                                 groupMap->readMap();
90                         }
91                 
92                         if (abort == false) {
93                         
94                                 if(globaldata->gSparseMatrix != NULL)   {       matrix = new SparseMatrix(*globaldata->gSparseMatrix);          }       
95                                         
96                                 //globaldata->gListVector bin 0 = first name read in distance matrix, globaldata->gListVector bin 1 = second name read in distance matrix
97                                 if(globaldata->gListVector != NULL)             {
98                 
99                                         vector<string> names;
100                                         string binnames;
101                                         //map names to rows in sparsematrix
102                                         for (int i = 0; i < globaldata->gListVector->size(); i++) {
103                                                 names.clear();
104                                                 binnames = globaldata->gListVector->get(i);
105         
106                                                 splitAtComma(binnames, names);
107                                 
108                                                 for (int j = 0; j < names.size(); j++) {
109                                                         nameToIndex[names[j]] = i;
110                                                 }
111         
112                                         }
113                                 }else { cout << "error, no listvector." << endl; }
114                                 
115                                 openInputFile(fastafile, in);
116                                 fasta = new FastaMap();
117                         }
118                 
119                 }
120         
121         }
122         catch(exception& e) {
123                 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function GetOTURepCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
124                 exit(1);
125         }
126         catch(...) {
127                 cout << "An unknown error has occurred in the GetOTURepCommand class function GetOTURepCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
128                 exit(1);
129         }
130 }
131
132 //**********************************************************************************************************************
133
134 void GetOTURepCommand::help(){
135         try {
136                 cout << "The get.oturep command can only be executed after a successful read.dist command." << "\n";
137                 cout << "The get.oturep command parameters are list, fasta, name, group, line and label.  The fasta and list parameters are required, and you may not use line and label at the same time." << "\n";
138                 cout << "The line and label allow you to select what distance levels you would like a output files created for, and are separated by dashes." << "\n";
139                 cout << "The get.oturep command should be in the following format: get.oturep(fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, line=yourLines, label=yourLabels)." << "\n";
140                 cout << "Example get.oturep(fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups, line=1-3-5, name=amazon.names)." << "\n";
141                 cout << "The default value for line and label are all lines in your inputfile." << "\n";
142                 cout << "The get.oturep command outputs a .fastarep file for each distance you specify, selecting one OTU representative for each bin." << "\n";
143                 cout << "If you provide a groupfile, then it also appends the names of the groups present in that bin." << "\n";
144                 cout << "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile)." << "\n" << "\n";
145         }
146         catch(exception& e) {
147                 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function help. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
148                 exit(1);
149         }
150         catch(...) {
151                 cout << "An unknown error has occurred in the GetOTURepCommand class function help. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
152                 exit(1);
153         }       
154 }
155
156 //**********************************************************************************************************************
157
158 GetOTURepCommand::~GetOTURepCommand(){
159         delete input;
160         delete read;
161         delete fasta;
162         if (groupfile != "") {
163                 delete groupMap;
164         }
165 }
166
167 //**********************************************************************************************************************
168
169 int GetOTURepCommand::execute(){
170         try {
171         
172                 if (abort == true) { return 0; }
173                 
174                 int count = 1;
175                 int error;
176                 
177                 //read fastafile
178                 fasta->readFastaFile(in);
179                 
180                 //set format to list so input can get listvector
181 //              globaldata->setFormat("list");
182                 
183                 //if user gave a namesfile then use it
184                 if (namesfile != "") {
185                         readNamesFile();
186                 }
187                 
188                 //read list file
189                 read = new ReadOTUFile(listfile);       
190                 read->read(&*globaldata); 
191                 
192                 input = globaldata->ginput;
193                 list = globaldata->gListVector;
194                 ListVector* lastList = list;
195                 
196                 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
197                 set<string> processedLabels;
198                 set<string> userLabels = labels;
199                 set<int> userLines = lines;
200
201                 
202                 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0) || (userLines.size() != 0))) {
203                         
204                         if(allLines == 1 || lines.count(count) == 1 || labels.count(list->getLabel()) == 1){
205                                         cout << list->getLabel() << '\t' << count << endl;
206                                         error = process(list);
207                                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
208                                         
209                                         processedLabels.insert(list->getLabel());
210                                         userLabels.erase(list->getLabel());
211                                         userLines.erase(count);
212                         }
213                         
214                         if ((anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastList->getLabel()) != 1)) {
215                                         cout << lastList->getLabel() << '\t' << count << endl;
216                                         error = process(lastList);
217                                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
218                                         
219                                         processedLabels.insert(lastList->getLabel());
220                                         userLabels.erase(lastList->getLabel());
221                         }
222                         
223                         if (count != 1) { delete lastList; }
224                         lastList = list;                        
225                         
226                         list = input->getListVector();
227                         count++;
228                 }
229                 
230                 //output error messages about any remaining user labels
231                 bool needToRun = false;
232                 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {  
233                         cout << "Your file does not include the label "<< *it; 
234                         if (processedLabels.count(lastList->getLabel()) != 1) {
235                                 cout << ". I will use " << lastList->getLabel() << "." << endl;
236                                 needToRun = true;
237                         }else {
238                                 cout << ". Please refer to " << lastList->getLabel() << "." << endl;
239                         }
240                 }
241                 
242                 //run last line if you need to
243                 if (needToRun == true)  {
244                         cout << lastList->getLabel() << '\t' << count << endl;
245                         error = process(lastList);
246                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
247                 }
248                 delete lastList;
249                 
250                 delete matrix;
251                 globaldata->gSparseMatrix = NULL;
252                 delete list;
253                 globaldata->gListVector = NULL;
254
255                 return 0;
256         }
257         catch(exception& e) {
258                 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
259                 exit(1);
260         }
261         catch(...) {
262                 cout << "An unknown error has occurred in the GetOTURepCommand class function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
263                 exit(1);
264         }
265
266 }
267
268 //**********************************************************************************************************************
269 void GetOTURepCommand::readNamesFile() {
270         try {
271                 vector<string> dupNames;
272                 openInputFile(namesfile, inNames);
273                 
274                 string name, names, sequence;
275         
276                 while(inNames){
277                         inNames >> name;                        //read from first column  A
278                         inNames >> names;               //read from second column  A,B,C,D
279                         
280                         dupNames.clear();
281                         
282                         //parse names into vector
283                         splitAtComma(names, dupNames);
284                         
285                         //store names in fasta map
286                         sequence = fasta->getSequence(name);
287                         for (int i = 0; i < dupNames.size(); i++) {
288                                 fasta->push_back(dupNames[i], sequence);
289                         }
290                 
291                         gobble(inNames);
292                 }
293                 inNames.close();
294
295         }
296         catch(exception& e) {
297                 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function readNamesFile. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
298                 exit(1);
299         }
300         catch(...) {
301                 cout << "An unknown error has occurred in the GetOTURepCommand class function readNamesFile. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
302                 exit(1);
303         }       
304 }
305 //**********************************************************************************************************************
306 string GetOTURepCommand::FindRep(int bin, string& group, ListVector* thisList) {
307         try{
308                 vector<string> names;
309                 map<string, float> sums;
310                 map<int, string> binMap; //subset of namesToIndex - just member of this bin
311                 string binnames;
312                 float min = 10000;
313                 string minName;
314                 map<string, string> groups;
315                 map<string, string>::iterator groupIt;
316                 
317                 binnames = thisList->get(bin);
318         
319                 //parse names into vector
320                 splitAtComma(binnames, names);
321                 
322                 //if you have a groupfile
323                 if(groupfile != "") {
324                         //find the groups that are in this bin
325                         for (int i = 0; i < names.size(); i++) {
326                                 string groupName = groupMap->getGroup(names[i]);
327                                 if (groupName == "not found") {  
328                                         cout << names[i] << " is missing from your group file. Please correct. " << endl;
329                                         groupError = true;
330                                 }else{
331                                         groups[groupName] = groupName;
332                                 }
333                         }
334                         
335                         //turn the groups into a string
336                         for(groupIt = groups.begin(); groupIt != groups.end(); groupIt++) { group += groupIt->first + "-"; }
337                         
338                         //rip off last dash
339                         group = group.substr(0, group.length()-1);
340                 }
341                 
342                 //if only 1 sequence in bin then that's the rep
343                 if (names.size() == 1) { return names[0]; }
344                 else {
345                         //fill binMap
346                         for (int i = 0; i < names.size(); i++) {
347                                 for (map<string, int>::iterator it = nameToIndex.begin(); it != nameToIndex.end(); it++) {
348
349                                         if (it->first == names[i]) {  
350                                                 binMap[it->second] = it->first;
351
352                                                 //initialize sums map
353                                                 sums[it->first] = 0.0;
354                                                 break;
355                                         }
356                                 }
357                         }
358                         
359                         //go through each cell in the sparsematrix
360                         for(MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++){
361                                 //is this a distance between 2 members of this bin
362                                 map<int, string>::iterator it = binMap.find(currentCell->row);
363                                 map<int, string>::iterator it2 = binMap.find(currentCell->column);
364                                 
365                                 //sum the distance of the sequences in the bin to eachother
366                                 if ((it != binMap.end()) && (it2 != binMap.end())) {
367                                         //this is a cell that repesents the distance between to of this bins members
368                                         sums[it->second] += currentCell->dist;
369                                         sums[it2->second] += currentCell->dist;
370                                 }
371                         }
372                         
373                         //smallest sum is the representative
374                         for (map<string, float>::iterator it4 = sums.begin(); it4 != sums.end(); it4++) {
375                                 if (it4->second < min) {
376                                         min = it4->second;
377                                         minName = it4->first;
378                                 }
379
380                         }
381                         
382                         return minName;
383                 }
384         
385         }
386         catch(exception& e) {
387                 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function FindRep. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
388                 exit(1);
389         }
390         catch(...) {
391                 cout << "An unknown error has occurred in the GetOTURepCommand class function FindRep. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
392                 exit(1);
393         }       
394 }
395
396 //**********************************************************************************************************************
397 int GetOTURepCommand::process(ListVector* processList) {
398         try{
399                                 string nameRep, name, sequence;
400
401                                 //create output file
402                                 string outputFileName = getRootName(listfile) + processList->getLabel() + ".rep.fasta";
403                                 openOutputFile(outputFileName, out);
404                                 
405                                 //for each bin in the list vector
406                                 for (int i = 0; i < processList->size(); i++) {
407                                         string groups;
408                                         nameRep = FindRep(i, groups, processList);
409                                         
410                                         //print out name and sequence for that bin
411                                         sequence = fasta->getSequence(nameRep);
412
413                                         if (sequence != "not found") {
414                                                 if (groupfile == "") {
415                                                         nameRep = nameRep + "|" + toString(i+1);
416                                                         out << ">" << nameRep << endl;
417                                                         out << sequence << endl;
418                                                 }else {
419                                                         nameRep = nameRep + "|" + groups + "|" + toString(i+1);
420                                                         out << ">" << nameRep << endl;
421                                                         out << sequence << endl;
422                                                 }
423                                         }else { 
424                                                 cout << nameRep << " is missing from your fasta or name file. Please correct. " << endl; 
425                                                 remove(outputFileName.c_str());
426                                                 return 1;
427                                         }
428                                 }
429                                 
430                                 out.close();
431                                 return 0;
432         
433         }
434         catch(exception& e) {
435                 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function process. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
436                 exit(1);
437         }
438         catch(...) {
439                 cout << "An unknown error has occurred in the GetOTURepCommand class function process. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
440                 exit(1);
441         }       
442 }
443
444 //**********************************************************************************************************************
445
446
447
448
449