]> git.donarmstrong.com Git - mothur.git/blob - getoturepcommand.cpp
added pca command
[mothur.git] / getoturepcommand.cpp
1 /*
2  *  getoturepcommand.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 4/6/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "getoturepcommand.h"
11
12 //********************************************************************************************************************
13 //sorts lowest to highest
14 inline bool compareName(repStruct left, repStruct right){
15         return (left.name < right.name);        
16 }
17 //********************************************************************************************************************
18 //sorts lowest to highest
19 inline bool compareBin(repStruct left, repStruct right){
20         return (left.bin < right.bin);  
21 }
22 //********************************************************************************************************************
23 //sorts lowest to highest
24 inline bool compareSize(repStruct left, repStruct right){
25         return (left.size < right.size);        
26 }
27 //********************************************************************************************************************
28 //sorts lowest to highest
29 inline bool compareGroup(repStruct left, repStruct right){
30         return (left.group < right.group);      
31 }
32 //**********************************************************************************************************************
33 GetOTURepCommand::GetOTURepCommand(string option){
34         try{
35                 globaldata = GlobalData::getInstance();
36                 abort = false;
37                 allLines = 1;
38                 labels.clear();
39                 
40                 //allow user to run help
41                 if (option == "help") { 
42                         help(); abort = true;
43                 } else {
44                         //valid paramters for this command
45                         string Array[] =  {"fasta","list","label","name", "group", "sorted", "phylip","column"};
46                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
47                         
48                         OptionParser parser(option);
49                         map<string, string> parameters = parser.getParameters();
50                         
51                         ValidParameters validParameter;
52                 
53                         //check to make sure all parameters are valid for command
54                         for (map<string, string>::iterator it = parameters.begin(); it != parameters.end(); it++) { 
55                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
56                         }
57                         
58                         //make sure the user has already run the read.otu command
59                         if ((globaldata->gSparseMatrix == NULL) || (globaldata->gListVector == NULL)) {
60                                 mothurOut("Before you use the get.oturep command, you first need to read in a distance matrix."); mothurOutEndLine(); 
61                                 abort = true;
62                         }
63                         
64                         //check for required parameters
65                         fastafile = validParameter.validFile(parameters, "fasta", true);
66                         if (fastafile == "not found") { mothurOut("fasta is a required parameter for the get.oturep command."); mothurOutEndLine(); abort = true; }
67                         else if (fastafile == "not open") { abort = true; }     
68                 
69                         listfile = validParameter.validFile(parameters, "list", true);
70                         if (listfile == "not found") { mothurOut("list is a required parameter for the get.oturep command."); mothurOutEndLine(); abort = true; }
71                         else if (listfile == "not open") { abort = true; }      
72                         
73                         phylipfile = validParameter.validFile(parameters, "phylip", true);
74                         if (phylipfile == "not found") { phylipfile = "";  }
75                         else if (phylipfile == "not open") { abort = true; }    
76                         
77                         columnfile = validParameter.validFile(parameters, "column", true);
78                         if (columnfile == "not found") { columnfile = ""; }
79                         else if (columnfile == "not open") { abort = true; }    
80                         
81                         if ((phylipfile == "") && (columnfile == "")) { mothurOut("When executing a get.oturep command you must enter a phylip or a column."); mothurOutEndLine(); abort = true; }
82                         else if ((phylipfile != "") && (columnfile != "")) { mothurOut("When executing a get.oturep command you must enter ONLY ONE of the following: phylip or column."); mothurOutEndLine(); abort = true; }
83                 
84                         if (columnfile != "") {  if (namefile == "") {  cout << "You need to provide a namefile if you are going to use the column format." << endl; abort = true; }  }
85
86                         //check for optional parameter and set defaults
87                         // ...at some point should added some additional type checking...
88                         label = validParameter.validFile(parameters, "label", false);                   
89                         if (label == "not found") { label = ""; }
90                         else { 
91                                 if(label != "all") {  splitAtDash(label, labels);  allLines = 0;  }
92                                 else { allLines = 1;  }
93                         }
94                         
95                         //if the user has not specified any labels use the ones from read.otu
96                         if (label == "") {  
97                                 allLines = globaldata->allLines; 
98                                 labels = globaldata->labels; 
99                         }
100                         
101                         namesfile = validParameter.validFile(parameters, "name", true);
102                         if (namesfile == "not open") { abort = true; }  
103                         else if (namesfile == "not found") { namesfile = ""; }
104
105                         groupfile = validParameter.validFile(parameters, "group", true);
106                         if (groupfile == "not open") { groupfile = ""; abort = true; }
107                         else if (groupfile == "not found") { groupfile = ""; }
108                         else {
109                                 //read in group map info.
110                                 groupMap = new GroupMap(groupfile);
111                                 groupMap->readMap();
112                         }
113                         
114                         sorted = validParameter.validFile(parameters, "sorted", false);         if (sorted == "not found"){     sorted = "";    }
115                         if ((sorted != "") && (sorted != "name") && (sorted != "bin") && (sorted != "size") && (sorted != "group")) {
116                                 mothurOut(sorted + " is not a valid option for the sorted parameter. The only options are: name, bin, size and group. I will not sort."); mothurOutEndLine();
117                                 sorted = "";
118                         }
119                         
120                         if ((sorted == "group") && (groupfile == "")) {
121                                 mothurOut("You must provide a groupfile to sort by group. I will not sort."); mothurOutEndLine();
122                                 sorted = "";
123                         }
124                         
125                         if (abort == false) {
126
127                                 //globaldata->gListVector bin 0 = first name read in distance matrix, globaldata->gListVector bin 1 = second name read in distance matrix
128                                 if (globaldata->gListVector != NULL) {
129                                         vector<string> names;
130                                         string binnames;
131                                         //map names to rows in sparsematrix
132                                         for (int i = 0; i < globaldata->gListVector->size(); i++) {
133                                                 names.clear();
134                                                 binnames = globaldata->gListVector->get(i);
135         
136                                                 splitAtComma(binnames, names);
137                                 
138                                                 for (int j = 0; j < names.size(); j++) {
139                                                         nameToIndex[names[j]] = i;
140                                                 }
141                                         }
142                                 } else { mothurOut("error, no listvector."); mothurOutEndLine(); }
143
144                                 fasta = new FastaMap();
145                         }
146                 }
147         }
148         catch(exception& e) {
149                 errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
150                 exit(1);
151         }
152 }
153
154 //**********************************************************************************************************************
155
156 void GetOTURepCommand::help(){
157         try {
158                 mothurOut("The get.oturep command can only be executed after a successful read.dist command.\n");
159                 mothurOut("The get.oturep command parameters are list, fasta, name, group, sorted and label.  The fasta and list parameters are required.\n");
160                 mothurOut("The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n");
161                 mothurOut("The get.oturep command should be in the following format: get.oturep(fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n");
162                 mothurOut("Example get.oturep(fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups, name=amazon.names).\n");
163                 mothurOut("The default value for label is all labels in your inputfile.\n");
164                 mothurOut("The sorted parameter allows you to indicate you want the output sorted. You can sort by sequence name, bin number, bin size or group. The default is no sorting, but your options are name, number, size, or group.\n");
165                 mothurOut("The get.oturep command outputs a .fastarep file for each distance you specify, selecting one OTU representative for each bin.\n");
166                 mothurOut("If you provide a groupfile, then it also appends the names of the groups present in that bin.\n");
167                 mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
168         }
169         catch(exception& e) {
170                 errorOut(e, "GetOTURepCommand", "help");
171                 exit(1);
172         }
173 }
174
175 //**********************************************************************************************************************
176
177 GetOTURepCommand::~GetOTURepCommand(){
178         if (abort == false) {
179                 delete input;  globaldata->ginput = NULL;
180                 delete read;
181                 delete fasta;
182                 if (groupfile != "") {
183                         delete groupMap;  globaldata->gGroupmap = NULL;
184                 }
185         }
186 }
187
188 //**********************************************************************************************************************
189
190 int GetOTURepCommand::execute(){
191         try {
192         
193                 if (abort == true) { return 0; }
194                 int error;
195                 
196                 //read fastafile
197                 fasta->readFastaFile(fastafile);
198                 
199                 //in.close();
200                 //read distance file and generate indexed distance file that can be used by findrep function
201 //....new reading class....//
202                 
203                 //if user gave a namesfile then use it
204                 if (namesfile != "") {  readNamesFile();        }
205                 
206                 //set format to list so input can get listvector
207                 globaldata->setFormat("list");
208
209                 //read list file
210                 read = new ReadOTUFile(listfile);
211                 read->read(&*globaldata); 
212                 
213                 input = globaldata->ginput;
214                 list = globaldata->gListVector;
215                 string lastLabel = list->getLabel();
216                 
217                 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
218                 set<string> processedLabels;
219                 set<string> userLabels = labels;
220                 
221                 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
222                         
223                         if (allLines == 1 || labels.count(list->getLabel()) == 1){
224                                         mothurOut(list->getLabel() + "\t" + toString(list->size())); mothurOutEndLine();
225                                         error = process(list);
226                                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
227                                         
228                                         processedLabels.insert(list->getLabel());
229                                         userLabels.erase(list->getLabel());
230                         }
231                         
232                         if ((anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
233                                         string saveLabel = list->getLabel();
234                                         
235                                         delete list;
236                                         list = input->getListVector(lastLabel);
237                                         mothurOut(list->getLabel() + "\t" + toString(list->size())); mothurOutEndLine();
238                                         error = process(list);
239                                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
240                                         
241                                         processedLabels.insert(list->getLabel());
242                                         userLabels.erase(list->getLabel());
243                                         
244                                         //restore real lastlabel to save below
245                                         list->setLabel(saveLabel);
246                         }
247                         
248                         lastLabel = list->getLabel();
249                         
250                         delete list;
251                         list = input->getListVector();
252                 }
253                 
254                 //output error messages about any remaining user labels
255                 bool needToRun = false;
256                 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {  
257                         mothurOut("Your file does not include the label " + *it); 
258                         if (processedLabels.count(list->getLabel()) != 1) {
259                                 mothurOut(". I will use " + lastLabel + "."); mothurOutEndLine();
260                                 needToRun = true;
261                         }else {
262                                 mothurOut(". Please refer to " + lastLabel + "."); mothurOutEndLine();
263                         }
264                 }
265                 
266                 //run last label if you need to
267                 if (needToRun == true)  {
268                         if (list != NULL) {     delete list;    }
269                         list = input->getListVector(lastLabel);
270                         mothurOut(list->getLabel() + "\t" + toString(list->size())); mothurOutEndLine();
271                         error = process(list);
272                         delete list;
273                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
274                 }
275                 
276                 delete list;
277                 globaldata->gListVector = NULL;
278
279                 return 0;
280         }
281         catch(exception& e) {
282                 errorOut(e, "GetOTURepCommand", "execute");
283                 exit(1);
284         }
285 }
286
287 //**********************************************************************************************************************
288 void GetOTURepCommand::readNamesFile() {
289         try {
290                 vector<string> dupNames;
291                 openInputFile(namesfile, inNames);
292                 
293                 string name, names, sequence;
294         
295                 while(inNames){
296                         inNames >> name;                        //read from first column  A
297                         inNames >> names;               //read from second column  A,B,C,D
298                         
299                         dupNames.clear();
300                         
301                         //parse names into vector
302                         splitAtComma(names, dupNames);
303                         
304                         //store names in fasta map
305                         sequence = fasta->getSequence(name);
306                         for (int i = 0; i < dupNames.size(); i++) {
307                                 fasta->push_back(dupNames[i], sequence);
308                         }
309                 
310                         gobble(inNames);
311                 }
312                 inNames.close();
313
314         }
315         catch(exception& e) {
316                 errorOut(e, "GetOTURepCommand", "readNamesFile");
317                 exit(1);
318         }
319 }
320 //**********************************************************************************************************************
321 string GetOTURepCommand::findRep(int bin, string& group, ListVector* thisList, int& binsize) {
322         try{
323                 vector<string> names;
324                 map<string, string> groups;
325                 map<string, string>::iterator groupIt;
326
327                 //parse names into vector
328                 string binnames = thisList->get(bin);
329                 splitAtComma(binnames, names);
330                 binsize = names.size();
331
332                 //if you have a groupfile
333                 if (groupfile != "") {
334                         //find the groups that are in this bin
335                         for (size_t i = 0; i < names.size(); i++) {
336                                 string groupName = groupMap->getGroup(names[i]);
337                                 if (groupName == "not found") {  
338                                         mothurOut(names[i] + " is missing from your group file. Please correct. "); mothurOutEndLine();
339                                         groupError = true;
340                                 } else {
341                                         groups[groupName] = groupName;
342                                 }
343                         }
344                         
345                         //turn the groups into a string
346                         for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) {
347                                 group += groupIt->first + "-";
348                         }
349                         //rip off last dash
350                         group = group.substr(0, group.length()-1);
351                 }else{ group = ""; }
352
353                 // if only 1 sequence in bin or processing the "unique" label, then 
354                 // the first sequence of the OTU is the representative one
355                 if ((names.size() == 1) || (list->getLabel() == "unique")) {
356                         return names[0];
357                 } else {
358                         vector<int> seqIndex(names.size());
359                         vector<float> max_dist(names.size());
360
361                         //fill seqIndex and initialize sums
362                         for (size_t i = 0; i < names.size(); i++) {
363                                 seqIndex[i] = nameToIndex[names[i]];
364                                 max_dist[i] = 0.0;
365                         }
366                         
367                         // loop through all entries in seqIndex
368                         SeqMap::iterator it;
369                         SeqMap currMap;
370                         for (size_t i=0; i < seqIndex.size(); i++) {
371         //currMap = seqVec[seqIndex[i]];
372                                 for (size_t j=0; j < seqIndex.size(); j++) {
373                                         it = currMap.find(seqIndex[j]);         
374                                         if (it != currMap.end()) {
375                                                 max_dist[i] = max(max_dist[i], it->second);
376                                                 max_dist[j] = max(max_dist[j], it->second);
377                                         }
378                                 }
379                         }
380                         
381                         // sequence with the smallest maximum distance is the representative
382                         float min = 10000;
383                         int minIndex;
384                         for (size_t i=0; i < max_dist.size(); i++) {
385                                 if (max_dist[i] < min) {
386                                         min = max_dist[i];
387                                         minIndex = i;
388                                 }
389                         }
390
391                         return(names[minIndex]);
392                 }
393         }
394         catch(exception& e) {
395                 errorOut(e, "GetOTURepCommand", "FindRep");
396                 exit(1);
397         }
398 }
399
400 //**********************************************************************************************************************
401 int GetOTURepCommand::process(ListVector* processList) {
402         try{
403                 string nameRep, name, sequence;
404
405                 //create output file
406                 string outputFileName = getRootName(listfile) + processList->getLabel() + ".rep.fasta";
407                 openOutputFile(outputFileName, out);
408                 vector<repStruct> reps;
409
410                 //for each bin in the list vector
411                 for (int i = 0; i < processList->size(); i++) {
412                         string groups;
413                         int binsize;
414                         nameRep = findRep(i, groups, processList, binsize);
415
416                         //print out name and sequence for that bin
417                         sequence = fasta->getSequence(nameRep);
418
419                         if (sequence != "not found") {
420                                 if (sorted == "") { //print them out
421                                         nameRep = nameRep + "|" + toString(i+1);
422                                         nameRep = nameRep + "|" + toString(binsize);
423                                         if (groupfile != "") {
424                                                 nameRep = nameRep + "|" + groups;
425                                         }
426                                         out << ">" << nameRep << endl;
427                                         out << sequence << endl;
428                                 }else { //save them
429                                         repStruct newRep(nameRep, i+1, binsize, groups);
430                                         reps.push_back(newRep);
431                                 }
432                         }else { 
433                                 mothurOut(nameRep + " is missing from your fasta or name file. Please correct. "); mothurOutEndLine(); 
434                                 remove(outputFileName.c_str());
435                                 return 1;
436                         }
437                 }
438                 
439                 if (sorted != "") { //then sort them and print them
440                         if (sorted == "name")           {  sort(reps.begin(), reps.end(), compareName);         }
441                         else if (sorted == "bin")       {  sort(reps.begin(), reps.end(), compareBin);          }
442                         else if (sorted == "size")      {  sort(reps.begin(), reps.end(), compareSize);         }
443                         else if (sorted == "group")     {  sort(reps.begin(), reps.end(), compareGroup);        }
444                         
445                         //print them
446                         for (int i = 0; i < reps.size(); i++) {
447                                 string sequence = fasta->getSequence(reps[i].name);
448                                 string outputName = reps[i].name + "|" + toString(reps[i].bin);
449                                 outputName = outputName + "|" + toString(reps[i].size);
450                                 if (groupfile != "") {
451                                         outputName = outputName + "|" + reps[i].group;
452                                 }
453                                 out << ">" << outputName << endl;
454                                 out << sequence << endl;
455                         }
456                 }
457
458                 out.close();
459                 return 0;
460
461         }
462         catch(exception& e) {
463                 errorOut(e, "GetOTURepCommand", "process");
464                 exit(1);
465         }
466 }
467
468 //**********************************************************************************************************************