]> git.donarmstrong.com Git - mothur.git/blob - getoturepcommand.cpp
removed line option
[mothur.git] / getoturepcommand.cpp
1 /*
2  *  getoturepcommand.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 4/6/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "getoturepcommand.h"
11
12 //**********************************************************************************************************************
13 GetOTURepCommand::GetOTURepCommand(string option){
14         try{
15                 globaldata = GlobalData::getInstance();
16                 abort = false;
17                 allLines = 1;
18                 labels.clear();
19                 
20                 //allow user to run help
21                 if (option == "help") { 
22                         help(); abort = true;
23                 } else {
24                         //valid paramters for this command
25                         string Array[] =  {"fasta","list","label","name", "group"};
26                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
27                         
28                         OptionParser parser(option);
29                         map<string, string> parameters = parser.getParameters();
30                         
31                         ValidParameters validParameter;
32                 
33                         //check to make sure all parameters are valid for command
34                         for (map<string, string>::iterator it = parameters.begin(); it != parameters.end(); it++) { 
35                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
36                         }
37                         
38                         //make sure the user has already run the read.otu command
39                         if ((globaldata->gSparseMatrix == NULL) || (globaldata->gListVector == NULL)) {
40                                 mothurOut("Before you use the get.oturep command, you first need to read in a distance matrix."); mothurOutEndLine(); 
41                                 abort = true;
42                         }
43                         
44                         //check for required parameters
45                         fastafile = validParameter.validFile(parameters, "fasta", true);
46                         if (fastafile == "not found") { mothurOut("fasta is a required parameter for the get.oturep command."); mothurOutEndLine(); abort = true; }
47                         else if (fastafile == "not open") { abort = true; }     
48                 
49                         listfile = validParameter.validFile(parameters, "list", true);
50                         if (listfile == "not found") { mothurOut("list is a required parameter for the get.oturep command."); mothurOutEndLine(); abort = true; }
51                         else if (listfile == "not open") { abort = true; }      
52
53                         //check for optional parameter and set defaults
54                         // ...at some point should added some additional type checking...
55                         label = validParameter.validFile(parameters, "label", false);                   
56                         if (label == "not found") { label = ""; }
57                         else { 
58                                 if(label != "all") {  splitAtDash(label, labels);  allLines = 0;  }
59                                 else { allLines = 1;  }
60                         }
61                         
62                         //if the user has not specified any labels use the ones from read.otu
63                         if (label == "") {  
64                                 allLines = globaldata->allLines; 
65                                 labels = globaldata->labels; 
66                         }
67                         
68                         namesfile = validParameter.validFile(parameters, "name", true);
69                         if (namesfile == "not open") { abort = true; }  
70                         else if (namesfile == "not found") { namesfile = ""; }
71
72                         groupfile = validParameter.validFile(parameters, "group", true);
73                         if (groupfile == "not open") { abort = true; }
74                         else if (groupfile == "not found") { groupfile = ""; }
75                         else {
76                                 //read in group map info.
77                                 groupMap = new GroupMap(groupfile);
78                                 groupMap->readMap();
79                         }
80                 
81                         if (abort == false) {
82                         
83                                 if(globaldata->gSparseMatrix != NULL)   {
84                                         matrix = globaldata->gSparseMatrix;
85                                 }
86
87                                 //globaldata->gListVector bin 0 = first name read in distance matrix, globaldata->gListVector bin 1 = second name read in distance matrix
88                                 if (globaldata->gListVector != NULL) {
89                                         vector<string> names;
90                                         string binnames;
91                                         //map names to rows in sparsematrix
92                                         for (int i = 0; i < globaldata->gListVector->size(); i++) {
93                                                 names.clear();
94                                                 binnames = globaldata->gListVector->get(i);
95         
96                                                 splitAtComma(binnames, names);
97                                 
98                                                 for (int j = 0; j < names.size(); j++) {
99                                                         nameToIndex[names[j]] = i;
100                                                 }
101                                         }
102                                 } else { mothurOut("error, no listvector."); mothurOutEndLine(); }
103
104                                 // Create a data structure to quickly access the distance information.
105                                 // It consists of a vector of distance maps, where each map contains
106                                 // all distances of a certain sequence. Vector and maps are accessed
107                                 // via the index of a sequence in the distance matrix
108                                 seqVec = vector<SeqMap>(globaldata->gListVector->size()); 
109                                 for (MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++) {
110                                         seqVec[currentCell->row][currentCell->column] = currentCell->dist;
111                                 }
112
113                                 fasta = new FastaMap();
114                         }
115                 }
116         }
117         catch(exception& e) {
118                 errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
119                 exit(1);
120         }
121 }
122
123 //**********************************************************************************************************************
124
125 void GetOTURepCommand::help(){
126         try {
127                 mothurOut("The get.oturep command can only be executed after a successful read.dist command.\n");
128                 mothurOut("The get.oturep command parameters are list, fasta, name, group and label.  The fasta and list parameters are required.\n");
129                 mothurOut("The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n");
130                 mothurOut("The get.oturep command should be in the following format: get.oturep(fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n");
131                 mothurOut("Example get.oturep(fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups, name=amazon.names).\n");
132                 mothurOut("The default value for label is all labels in your inputfile.\n");
133                 mothurOut("The get.oturep command outputs a .fastarep file for each distance you specify, selecting one OTU representative for each bin.\n");
134                 mothurOut("If you provide a groupfile, then it also appends the names of the groups present in that bin.\n");
135                 mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
136         }
137         catch(exception& e) {
138                 errorOut(e, "GetOTURepCommand", "help");
139                 exit(1);
140         }
141 }
142
143 //**********************************************************************************************************************
144
145 GetOTURepCommand::~GetOTURepCommand(){
146         if (abort == false) {
147                 delete input;  globaldata->ginput = NULL;
148                 delete read;
149                 delete fasta;
150                 if (groupfile != "") {
151                         delete groupMap;  globaldata->gGroupmap = NULL;
152                 }
153         }
154 }
155
156 //**********************************************************************************************************************
157
158 int GetOTURepCommand::execute(){
159         try {
160         
161                 if (abort == true) { return 0; }
162
163                 int error;
164                 
165                 //read fastafile
166                 fasta->readFastaFile(fastafile);
167                 
168                 in.close();
169                 
170                 //set format to list so input can get listvector
171                 globaldata->setFormat("list");
172                 
173                 //if user gave a namesfile then use it
174                 if (namesfile != "") {
175                         readNamesFile();
176                 }
177                 
178                 //read list file
179                 read = new ReadOTUFile(listfile);
180                 read->read(&*globaldata); 
181                 
182                 input = globaldata->ginput;
183                 list = globaldata->gListVector;
184                 string lastLabel = list->getLabel();
185                 
186                 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
187                 set<string> processedLabels;
188                 set<string> userLabels = labels;
189                 
190                 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
191                         
192                         if (allLines == 1 || labels.count(list->getLabel()) == 1){
193                                         mothurOut(list->getLabel() + "\t" + toString(list->size())); mothurOutEndLine();
194                                         error = process(list);
195                                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
196                                         
197                                         processedLabels.insert(list->getLabel());
198                                         userLabels.erase(list->getLabel());
199                         }
200                         
201                         if ((anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
202                                         delete list;
203                                         list = input->getListVector(lastLabel);
204                                         mothurOut(list->getLabel() + "\t" + toString(list->size())); mothurOutEndLine();
205                                         error = process(list);
206                                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
207                                         
208                                         processedLabels.insert(list->getLabel());
209                                         userLabels.erase(list->getLabel());
210                         }
211                         
212                         lastLabel = list->getLabel();
213                         
214                         delete list;
215                         list = input->getListVector();
216                 }
217                 
218                 //output error messages about any remaining user labels
219                 bool needToRun = false;
220                 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {  
221                         mothurOut("Your file does not include the label " + *it); 
222                         if (processedLabels.count(list->getLabel()) != 1) {
223                                 mothurOut(". I will use " + lastLabel + "."); mothurOutEndLine();
224                                 needToRun = true;
225                         }else {
226                                 mothurOut(". Please refer to " + lastLabel + "."); mothurOutEndLine();
227                         }
228                 }
229                 
230                 //run last label if you need to
231                 if (needToRun == true)  {
232                         if (list != NULL) {     delete list;    }
233                         list = input->getListVector(lastLabel);
234                         mothurOut(list->getLabel() + "\t" + toString(list->size())); mothurOutEndLine();
235                         error = process(list);
236                         delete list;
237                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
238                 }
239                 
240                 delete matrix;
241                 globaldata->gSparseMatrix = NULL;
242                 delete list;
243                 globaldata->gListVector = NULL;
244
245                 return 0;
246         }
247         catch(exception& e) {
248                 errorOut(e, "GetOTURepCommand", "execute");
249                 exit(1);
250         }
251 }
252
253 //**********************************************************************************************************************
254 void GetOTURepCommand::readNamesFile() {
255         try {
256                 vector<string> dupNames;
257                 openInputFile(namesfile, inNames);
258                 
259                 string name, names, sequence;
260         
261                 while(inNames){
262                         inNames >> name;                        //read from first column  A
263                         inNames >> names;               //read from second column  A,B,C,D
264                         
265                         dupNames.clear();
266                         
267                         //parse names into vector
268                         splitAtComma(names, dupNames);
269                         
270                         //store names in fasta map
271                         sequence = fasta->getSequence(name);
272                         for (int i = 0; i < dupNames.size(); i++) {
273                                 fasta->push_back(dupNames[i], sequence);
274                         }
275                 
276                         gobble(inNames);
277                 }
278                 inNames.close();
279
280         }
281         catch(exception& e) {
282                 errorOut(e, "GetOTURepCommand", "readNamesFile");
283                 exit(1);
284         }
285 }
286 //**********************************************************************************************************************
287 string GetOTURepCommand::findRep(int bin, string& group, ListVector* thisList, int& binsize) {
288         try{
289                 vector<string> names;
290                 map<string, string> groups;
291                 map<string, string>::iterator groupIt;
292
293                 //parse names into vector
294                 string binnames = thisList->get(bin);
295                 splitAtComma(binnames, names);
296                 binsize = names.size();
297
298                 //if you have a groupfile
299                 if (groupfile != "") {
300                         //find the groups that are in this bin
301                         for (size_t i = 0; i < names.size(); i++) {
302                                 string groupName = groupMap->getGroup(names[i]);
303                                 if (groupName == "not found") {  
304                                         mothurOut(names[i] + " is missing from your group file. Please correct. "); mothurOutEndLine();
305                                         groupError = true;
306                                 } else {
307                                         groups[groupName] = groupName;
308                                 }
309                         }
310                         
311                         //turn the groups into a string
312                         for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) {
313                                 group += groupIt->first + "-";
314                         }
315                         //rip off last dash
316                         group = group.substr(0, group.length()-1);
317                 }
318
319                 // if only 1 sequence in bin or processing the "unique" label, then 
320                 // the first sequence of the OTU is the representative one
321                 if ((names.size() == 1) || (list->getLabel() == "unique")) {
322                         return names[0];
323                 } else {
324                         vector<int> seqIndex(names.size());
325                         vector<float> max_dist(names.size());
326
327                         //fill seqIndex and initialize sums
328                         for (size_t i = 0; i < names.size(); i++) {
329                                 seqIndex[i] = nameToIndex[names[i]];
330                                 max_dist[i] = 0.0;
331                         }
332                         
333                         // loop through all entries in seqIndex
334                         SeqMap::iterator it;
335                         SeqMap currMap;
336                         for (size_t i=0; i < seqIndex.size(); i++) {
337                                 currMap = seqVec[seqIndex[i]];
338                                 for (size_t j=0; j < seqIndex.size(); j++) {
339                                         it = currMap.find(seqIndex[j]);         
340                                         if (it != currMap.end()) {
341                                                 max_dist[i] = max(max_dist[i], it->second);
342                                                 max_dist[j] = max(max_dist[j], it->second);
343                                         }
344                                 }
345                         }
346                         
347                         // sequence with the smallest maximum distance is the representative
348                         float min = 10000;
349                         int minIndex;
350                         for (size_t i=0; i < max_dist.size(); i++) {
351                                 if (max_dist[i] < min) {
352                                         min = max_dist[i];
353                                         minIndex = i;
354                                 }
355                         }
356
357                         return(names[minIndex]);
358                 }
359         }
360         catch(exception& e) {
361                 errorOut(e, "GetOTURepCommand", "FindRep");
362                 exit(1);
363         }
364 }
365
366 //**********************************************************************************************************************
367 int GetOTURepCommand::process(ListVector* processList) {
368         try{
369                 string nameRep, name, sequence;
370
371                 //create output file
372                 string outputFileName = getRootName(listfile) + processList->getLabel() + ".rep.fasta";
373                 openOutputFile(outputFileName, out);
374
375                 //for each bin in the list vector
376                 for (int i = 0; i < processList->size(); i++) {
377                         string groups;
378                         int binsize;
379                         nameRep = findRep(i, groups, processList, binsize);
380
381                         //print out name and sequence for that bin
382                         sequence = fasta->getSequence(nameRep);
383
384                         if (sequence != "not found") {
385                                 nameRep = nameRep + "|" + toString(i+1);
386                                 nameRep = nameRep + "|" + toString(binsize);
387                                 if (groupfile != "") {
388                                         nameRep = nameRep + "|" + groups;
389                                 }
390                                 out << ">" << nameRep << endl;
391                                 out << sequence << endl;
392                         } else { 
393                                 mothurOut(nameRep + " is missing from your fasta or name file. Please correct. "); mothurOutEndLine(); 
394                                 remove(outputFileName.c_str());
395                                 return 1;
396                         }
397                 }
398
399                 out.close();
400                 return 0;
401
402         }
403         catch(exception& e) {
404                 errorOut(e, "GetOTURepCommand", "process");
405                 exit(1);
406         }
407 }
408
409 //**********************************************************************************************************************