]> git.donarmstrong.com Git - mothur.git/blob - getoturepcommand.cpp
added sorted parameter to get.oturep, added error checking to chimera classes in...
[mothur.git] / getoturepcommand.cpp
1 /*
2  *  getoturepcommand.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 4/6/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "getoturepcommand.h"
11
12 //********************************************************************************************************************
13 //sorts lowest to highest
14 inline bool compareName(repStruct left, repStruct right){
15         return (left.name < right.name);        
16 }
17 //********************************************************************************************************************
18 //sorts lowest to highest
19 inline bool compareBin(repStruct left, repStruct right){
20         return (left.bin < right.bin);  
21 }
22 //********************************************************************************************************************
23 //sorts lowest to highest
24 inline bool compareSize(repStruct left, repStruct right){
25         return (left.size < right.size);        
26 }
27 //********************************************************************************************************************
28 //sorts lowest to highest
29 inline bool compareGroup(repStruct left, repStruct right){
30         return (left.group < right.group);      
31 }
32 //**********************************************************************************************************************
33 GetOTURepCommand::GetOTURepCommand(string option){
34         try{
35                 globaldata = GlobalData::getInstance();
36                 abort = false;
37                 allLines = 1;
38                 labels.clear();
39                 
40                 //allow user to run help
41                 if (option == "help") { 
42                         help(); abort = true;
43                 } else {
44                         //valid paramters for this command
45                         string Array[] =  {"fasta","list","label","name", "group", "sorted"};
46                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
47                         
48                         OptionParser parser(option);
49                         map<string, string> parameters = parser.getParameters();
50                         
51                         ValidParameters validParameter;
52                 
53                         //check to make sure all parameters are valid for command
54                         for (map<string, string>::iterator it = parameters.begin(); it != parameters.end(); it++) { 
55                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
56                         }
57                         
58                         //make sure the user has already run the read.otu command
59                         if ((globaldata->gSparseMatrix == NULL) || (globaldata->gListVector == NULL)) {
60                                 mothurOut("Before you use the get.oturep command, you first need to read in a distance matrix."); mothurOutEndLine(); 
61                                 abort = true;
62                         }
63                         
64                         //check for required parameters
65                         fastafile = validParameter.validFile(parameters, "fasta", true);
66                         if (fastafile == "not found") { mothurOut("fasta is a required parameter for the get.oturep command."); mothurOutEndLine(); abort = true; }
67                         else if (fastafile == "not open") { abort = true; }     
68                 
69                         listfile = validParameter.validFile(parameters, "list", true);
70                         if (listfile == "not found") { mothurOut("list is a required parameter for the get.oturep command."); mothurOutEndLine(); abort = true; }
71                         else if (listfile == "not open") { abort = true; }      
72
73                         //check for optional parameter and set defaults
74                         // ...at some point should added some additional type checking...
75                         label = validParameter.validFile(parameters, "label", false);                   
76                         if (label == "not found") { label = ""; }
77                         else { 
78                                 if(label != "all") {  splitAtDash(label, labels);  allLines = 0;  }
79                                 else { allLines = 1;  }
80                         }
81                         
82                         //if the user has not specified any labels use the ones from read.otu
83                         if (label == "") {  
84                                 allLines = globaldata->allLines; 
85                                 labels = globaldata->labels; 
86                         }
87                         
88                         namesfile = validParameter.validFile(parameters, "name", true);
89                         if (namesfile == "not open") { abort = true; }  
90                         else if (namesfile == "not found") { namesfile = ""; }
91
92                         groupfile = validParameter.validFile(parameters, "group", true);
93                         if (groupfile == "not open") { groupfile = ""; abort = true; }
94                         else if (groupfile == "not found") { groupfile = ""; }
95                         else {
96                                 //read in group map info.
97                                 groupMap = new GroupMap(groupfile);
98                                 groupMap->readMap();
99                         }
100                         
101                         sorted = validParameter.validFile(parameters, "sorted", false);         if (sorted == "not found"){     sorted = "";    }
102                         if ((sorted != "") && (sorted != "name") && (sorted != "bin") && (sorted != "size") && (sorted != "group")) {
103                                 mothurOut(sorted + " is not a valid option for the sorted parameter. The only options are: name, bin, size and group. I will not sort."); mothurOutEndLine();
104                                 sorted = "";
105                         }
106                         
107                         if ((sorted == "group") && (groupfile == "")) {
108                                 mothurOut("You must provide a groupfile to sort by group. I will not sort."); mothurOutEndLine();
109                                 sorted = "";
110                         }
111                         
112                         if (abort == false) {
113                         
114                                 if(globaldata->gSparseMatrix != NULL)   {
115                                         matrix = globaldata->gSparseMatrix;
116                                 }
117
118                                 //globaldata->gListVector bin 0 = first name read in distance matrix, globaldata->gListVector bin 1 = second name read in distance matrix
119                                 if (globaldata->gListVector != NULL) {
120                                         vector<string> names;
121                                         string binnames;
122                                         //map names to rows in sparsematrix
123                                         for (int i = 0; i < globaldata->gListVector->size(); i++) {
124                                                 names.clear();
125                                                 binnames = globaldata->gListVector->get(i);
126         
127                                                 splitAtComma(binnames, names);
128                                 
129                                                 for (int j = 0; j < names.size(); j++) {
130                                                         nameToIndex[names[j]] = i;
131                                                 }
132                                         }
133                                 } else { mothurOut("error, no listvector."); mothurOutEndLine(); }
134
135                                 // Create a data structure to quickly access the distance information.
136                                 // It consists of a vector of distance maps, where each map contains
137                                 // all distances of a certain sequence. Vector and maps are accessed
138                                 // via the index of a sequence in the distance matrix
139                                 seqVec = vector<SeqMap>(globaldata->gListVector->size()); 
140                                 for (MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++) {
141                                         seqVec[currentCell->row][currentCell->column] = currentCell->dist;
142                                 }
143
144                                 fasta = new FastaMap();
145                         }
146                 }
147         }
148         catch(exception& e) {
149                 errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
150                 exit(1);
151         }
152 }
153
154 //**********************************************************************************************************************
155
156 void GetOTURepCommand::help(){
157         try {
158                 mothurOut("The get.oturep command can only be executed after a successful read.dist command.\n");
159                 mothurOut("The get.oturep command parameters are list, fasta, name, group, sorted and label.  The fasta and list parameters are required.\n");
160                 mothurOut("The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n");
161                 mothurOut("The get.oturep command should be in the following format: get.oturep(fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n");
162                 mothurOut("Example get.oturep(fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups, name=amazon.names).\n");
163                 mothurOut("The default value for label is all labels in your inputfile.\n");
164                 mothurOut("The sorted parameter allows you to indicate you want the output sorted. You can sort by sequence name, bin number, bin size or group. The default is no sorting, but your options are name, number, size, or group.\n");
165                 mothurOut("The get.oturep command outputs a .fastarep file for each distance you specify, selecting one OTU representative for each bin.\n");
166                 mothurOut("If you provide a groupfile, then it also appends the names of the groups present in that bin.\n");
167                 mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
168         }
169         catch(exception& e) {
170                 errorOut(e, "GetOTURepCommand", "help");
171                 exit(1);
172         }
173 }
174
175 //**********************************************************************************************************************
176
177 GetOTURepCommand::~GetOTURepCommand(){
178         if (abort == false) {
179                 delete input;  globaldata->ginput = NULL;
180                 delete read;
181                 delete fasta;
182                 if (groupfile != "") {
183                         delete groupMap;  globaldata->gGroupmap = NULL;
184                 }
185         }
186 }
187
188 //**********************************************************************************************************************
189
190 int GetOTURepCommand::execute(){
191         try {
192         
193                 if (abort == true) { return 0; }
194
195                 int error;
196                 
197                 //read fastafile
198                 fasta->readFastaFile(fastafile);
199                 
200                 in.close();
201                 
202                 //set format to list so input can get listvector
203                 globaldata->setFormat("list");
204                 
205                 //if user gave a namesfile then use it
206                 if (namesfile != "") {
207                         readNamesFile();
208                 }
209                 
210                 //read list file
211                 read = new ReadOTUFile(listfile);
212                 read->read(&*globaldata); 
213                 
214                 input = globaldata->ginput;
215                 list = globaldata->gListVector;
216                 string lastLabel = list->getLabel();
217                 
218                 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
219                 set<string> processedLabels;
220                 set<string> userLabels = labels;
221                 
222                 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
223                         
224                         if (allLines == 1 || labels.count(list->getLabel()) == 1){
225                                         mothurOut(list->getLabel() + "\t" + toString(list->size())); mothurOutEndLine();
226                                         error = process(list);
227                                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
228                                         
229                                         processedLabels.insert(list->getLabel());
230                                         userLabels.erase(list->getLabel());
231                         }
232                         
233                         if ((anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
234                                         string saveLabel = list->getLabel();
235                                         
236                                         delete list;
237                                         list = input->getListVector(lastLabel);
238                                         mothurOut(list->getLabel() + "\t" + toString(list->size())); mothurOutEndLine();
239                                         error = process(list);
240                                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
241                                         
242                                         processedLabels.insert(list->getLabel());
243                                         userLabels.erase(list->getLabel());
244                                         
245                                         //restore real lastlabel to save below
246                                         list->setLabel(saveLabel);
247                         }
248                         
249                         lastLabel = list->getLabel();
250                         
251                         delete list;
252                         list = input->getListVector();
253                 }
254                 
255                 //output error messages about any remaining user labels
256                 bool needToRun = false;
257                 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {  
258                         mothurOut("Your file does not include the label " + *it); 
259                         if (processedLabels.count(list->getLabel()) != 1) {
260                                 mothurOut(". I will use " + lastLabel + "."); mothurOutEndLine();
261                                 needToRun = true;
262                         }else {
263                                 mothurOut(". Please refer to " + lastLabel + "."); mothurOutEndLine();
264                         }
265                 }
266                 
267                 //run last label if you need to
268                 if (needToRun == true)  {
269                         if (list != NULL) {     delete list;    }
270                         list = input->getListVector(lastLabel);
271                         mothurOut(list->getLabel() + "\t" + toString(list->size())); mothurOutEndLine();
272                         error = process(list);
273                         delete list;
274                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
275                 }
276                 
277                 delete matrix;
278                 globaldata->gSparseMatrix = NULL;
279                 delete list;
280                 globaldata->gListVector = NULL;
281
282                 return 0;
283         }
284         catch(exception& e) {
285                 errorOut(e, "GetOTURepCommand", "execute");
286                 exit(1);
287         }
288 }
289
290 //**********************************************************************************************************************
291 void GetOTURepCommand::readNamesFile() {
292         try {
293                 vector<string> dupNames;
294                 openInputFile(namesfile, inNames);
295                 
296                 string name, names, sequence;
297         
298                 while(inNames){
299                         inNames >> name;                        //read from first column  A
300                         inNames >> names;               //read from second column  A,B,C,D
301                         
302                         dupNames.clear();
303                         
304                         //parse names into vector
305                         splitAtComma(names, dupNames);
306                         
307                         //store names in fasta map
308                         sequence = fasta->getSequence(name);
309                         for (int i = 0; i < dupNames.size(); i++) {
310                                 fasta->push_back(dupNames[i], sequence);
311                         }
312                 
313                         gobble(inNames);
314                 }
315                 inNames.close();
316
317         }
318         catch(exception& e) {
319                 errorOut(e, "GetOTURepCommand", "readNamesFile");
320                 exit(1);
321         }
322 }
323 //**********************************************************************************************************************
324 string GetOTURepCommand::findRep(int bin, string& group, ListVector* thisList, int& binsize) {
325         try{
326                 vector<string> names;
327                 map<string, string> groups;
328                 map<string, string>::iterator groupIt;
329
330                 //parse names into vector
331                 string binnames = thisList->get(bin);
332                 splitAtComma(binnames, names);
333                 binsize = names.size();
334
335                 //if you have a groupfile
336                 if (groupfile != "") {
337                         //find the groups that are in this bin
338                         for (size_t i = 0; i < names.size(); i++) {
339                                 string groupName = groupMap->getGroup(names[i]);
340                                 if (groupName == "not found") {  
341                                         mothurOut(names[i] + " is missing from your group file. Please correct. "); mothurOutEndLine();
342                                         groupError = true;
343                                 } else {
344                                         groups[groupName] = groupName;
345                                 }
346                         }
347                         
348                         //turn the groups into a string
349                         for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) {
350                                 group += groupIt->first + "-";
351                         }
352                         //rip off last dash
353                         group = group.substr(0, group.length()-1);
354                 }else{ group = ""; }
355
356                 // if only 1 sequence in bin or processing the "unique" label, then 
357                 // the first sequence of the OTU is the representative one
358                 if ((names.size() == 1) || (list->getLabel() == "unique")) {
359                         return names[0];
360                 } else {
361                         vector<int> seqIndex(names.size());
362                         vector<float> max_dist(names.size());
363
364                         //fill seqIndex and initialize sums
365                         for (size_t i = 0; i < names.size(); i++) {
366                                 seqIndex[i] = nameToIndex[names[i]];
367                                 max_dist[i] = 0.0;
368                         }
369                         
370                         // loop through all entries in seqIndex
371                         SeqMap::iterator it;
372                         SeqMap currMap;
373                         for (size_t i=0; i < seqIndex.size(); i++) {
374                                 currMap = seqVec[seqIndex[i]];
375                                 for (size_t j=0; j < seqIndex.size(); j++) {
376                                         it = currMap.find(seqIndex[j]);         
377                                         if (it != currMap.end()) {
378                                                 max_dist[i] = max(max_dist[i], it->second);
379                                                 max_dist[j] = max(max_dist[j], it->second);
380                                         }
381                                 }
382                         }
383                         
384                         // sequence with the smallest maximum distance is the representative
385                         float min = 10000;
386                         int minIndex;
387                         for (size_t i=0; i < max_dist.size(); i++) {
388                                 if (max_dist[i] < min) {
389                                         min = max_dist[i];
390                                         minIndex = i;
391                                 }
392                         }
393
394                         return(names[minIndex]);
395                 }
396         }
397         catch(exception& e) {
398                 errorOut(e, "GetOTURepCommand", "FindRep");
399                 exit(1);
400         }
401 }
402
403 //**********************************************************************************************************************
404 int GetOTURepCommand::process(ListVector* processList) {
405         try{
406                 string nameRep, name, sequence;
407
408                 //create output file
409                 string outputFileName = getRootName(listfile) + processList->getLabel() + ".rep.fasta";
410                 openOutputFile(outputFileName, out);
411                 vector<repStruct> reps;
412
413                 //for each bin in the list vector
414                 for (int i = 0; i < processList->size(); i++) {
415                         string groups;
416                         int binsize;
417                         nameRep = findRep(i, groups, processList, binsize);
418
419                         //print out name and sequence for that bin
420                         sequence = fasta->getSequence(nameRep);
421
422                         if (sequence != "not found") {
423                                 if (sorted == "") { //print them out
424                                         nameRep = nameRep + "|" + toString(i+1);
425                                         nameRep = nameRep + "|" + toString(binsize);
426                                         if (groupfile != "") {
427                                                 nameRep = nameRep + "|" + groups;
428                                         }
429                                         out << ">" << nameRep << endl;
430                                         out << sequence << endl;
431                                 }else { //save them
432                                         repStruct newRep(nameRep, i+1, binsize, groups);
433                                         reps.push_back(newRep);
434                                 }
435                         }else { 
436                                 mothurOut(nameRep + " is missing from your fasta or name file. Please correct. "); mothurOutEndLine(); 
437                                 remove(outputFileName.c_str());
438                                 return 1;
439                         }
440                 }
441                 
442                 if (sorted != "") { //then sort them and print them
443                         if (sorted == "name")           {  sort(reps.begin(), reps.end(), compareName);         }
444                         else if (sorted == "bin")       {  sort(reps.begin(), reps.end(), compareBin);          }
445                         else if (sorted == "size")      {  sort(reps.begin(), reps.end(), compareSize);         }
446                         else if (sorted == "group")     {  sort(reps.begin(), reps.end(), compareGroup);        }
447                         
448                         //print them
449                         for (int i = 0; i < reps.size(); i++) {
450                                 string sequence = fasta->getSequence(reps[i].name);
451                                 string outputName = reps[i].name + "|" + toString(reps[i].bin);
452                                 outputName = outputName + "|" + toString(reps[i].size);
453                                 if (groupfile != "") {
454                                         outputName = outputName + "|" + reps[i].group;
455                                 }
456                                 out << ">" << outputName << endl;
457                                 out << sequence << endl;
458                         }
459                 }
460
461                 out.close();
462                 return 0;
463
464         }
465         catch(exception& e) {
466                 errorOut(e, "GetOTURepCommand", "process");
467                 exit(1);
468         }
469 }
470
471 //**********************************************************************************************************************