]> git.donarmstrong.com Git - mothur.git/blob - getoturepcommand.cpp
added warning about average neighbor merges around cutoff. fixed little bugs.
[mothur.git] / getoturepcommand.cpp
1 /*
2  *  getoturepcommand.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 4/6/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "getoturepcommand.h"
11 #include "readphylip.h"
12 #include "readcolumn.h"
13 #include "formatphylip.h"
14 #include "formatcolumn.h"
15
16
17
18 //********************************************************************************************************************
19 //sorts lowest to highest
20 inline bool compareName(repStruct left, repStruct right){
21         return (left.name < right.name);        
22 }
23 //********************************************************************************************************************
24 //sorts lowest to highest
25 inline bool compareBin(repStruct left, repStruct right){
26         return (left.bin < right.bin);  
27 }
28 //********************************************************************************************************************
29 //sorts lowest to highest
30 inline bool compareSize(repStruct left, repStruct right){
31         return (left.size < right.size);        
32 }
33 //********************************************************************************************************************
34 //sorts lowest to highest
35 inline bool compareGroup(repStruct left, repStruct right){
36         return (left.group < right.group);      
37 }
38 //**********************************************************************************************************************
39 GetOTURepCommand::GetOTURepCommand(string option){
40         try{
41                 globaldata = GlobalData::getInstance();
42                 abort = false;
43                 allLines = 1;
44                 labels.clear();
45                 
46                 //allow user to run help
47                 if (option == "help") { 
48                         help(); abort = true;
49                 } else {
50                         //valid paramters for this command
51                         string Array[] =  {"fasta","list","label","name", "group", "sorted", "phylip","column","large","cutoff","precision"};
52                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
53                         
54                         OptionParser parser(option);
55                         map<string, string> parameters = parser.getParameters();
56                         
57                         ValidParameters validParameter;
58                 
59                         //check to make sure all parameters are valid for command
60                         for (map<string, string>::iterator it = parameters.begin(); it != parameters.end(); it++) { 
61                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
62                         }
63                         
64                         //check for required parameters
65                         fastafile = validParameter.validFile(parameters, "fasta", true);
66                         if (fastafile == "not found") { mothurOut("fasta is a required parameter for the get.oturep command."); mothurOutEndLine(); abort = true; }
67                         else if (fastafile == "not open") { abort = true; }     
68                 
69                         listfile = validParameter.validFile(parameters, "list", true);
70                         if (listfile == "not found") { mothurOut("list is a required parameter for the get.oturep command."); mothurOutEndLine(); abort = true; }
71                         else if (listfile == "not open") { abort = true; }      
72                         
73                         phylipfile = validParameter.validFile(parameters, "phylip", true);
74                         if (phylipfile == "not found") { phylipfile = "";  }
75                         else if (phylipfile == "not open") { abort = true; }    
76                         else { distFile = phylipfile; format = "phylip";   }
77                         
78                         columnfile = validParameter.validFile(parameters, "column", true);
79                         if (columnfile == "not found") { columnfile = ""; }
80                         else if (columnfile == "not open") { abort = true; }    
81                         else { distFile = columnfile; format = "column";   }
82                         
83                         namefile = validParameter.validFile(parameters, "name", true);
84                         if (namefile == "not open") { abort = true; }   
85                         else if (namefile == "not found") { namefile = ""; }
86                         
87                         if ((phylipfile == "") && (columnfile == "")) { mothurOut("When executing a get.oturep command you must enter a phylip or a column."); mothurOutEndLine(); abort = true; }
88                         else if ((phylipfile != "") && (columnfile != "")) { mothurOut("When executing a get.oturep command you must enter ONLY ONE of the following: phylip or column."); mothurOutEndLine(); abort = true; }
89                 
90                         if (columnfile != "") {  if (namefile == "") {  cout << "You need to provide a namefile if you are going to use the column format." << endl; abort = true; }  }
91
92                         //check for optional parameter and set defaults
93                         // ...at some point should added some additional type checking...
94                         label = validParameter.validFile(parameters, "label", false);                   
95                         if (label == "not found") { label = ""; allLines = 1;  }
96                         else { 
97                                 if(label != "all") {  splitAtDash(label, labels);  allLines = 0;  }
98                                 else { allLines = 1;  }
99                         }
100                         
101                         groupfile = validParameter.validFile(parameters, "group", true);
102                         if (groupfile == "not open") { groupfile = ""; abort = true; }
103                         else if (groupfile == "not found") { groupfile = ""; }
104                         else {
105                                 //read in group map info.
106                                 groupMap = new GroupMap(groupfile);
107                                 groupMap->readMap();
108                         }
109                         
110                         sorted = validParameter.validFile(parameters, "sorted", false);         if (sorted == "not found"){     sorted = "";    }
111                         if ((sorted != "") && (sorted != "name") && (sorted != "bin") && (sorted != "size") && (sorted != "group")) {
112                                 mothurOut(sorted + " is not a valid option for the sorted parameter. The only options are: name, bin, size and group. I will not sort."); mothurOutEndLine();
113                                 sorted = "";
114                         }
115                         
116                         if ((sorted == "group") && (groupfile == "")) {
117                                 mothurOut("You must provide a groupfile to sort by group. I will not sort."); mothurOutEndLine();
118                                 sorted = "";
119                         }
120                         
121                         string temp = validParameter.validFile(parameters, "large", false);             if (temp == "not found") {      temp = "F";     }
122                         large = isTrue(temp);
123                         
124                         temp = validParameter.validFile(parameters, "precision", false);                        if (temp == "not found") { temp = "100"; }
125                         convert(temp, precision); 
126                         
127                         temp = validParameter.validFile(parameters, "cutoff", false);                   if (temp == "not found") { temp = "10.0"; }
128                         convert(temp, cutoff); 
129                         cutoff += (5 / (precision * 10.0));
130                 }
131         }
132         catch(exception& e) {
133                 errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
134                 exit(1);
135         }
136 }
137
138 //**********************************************************************************************************************
139
140 void GetOTURepCommand::help(){
141         try {
142                 mothurOut("The get.oturep command parameters are phylip, column, list, fasta, name, group, large, cutoff, precision, sorted and label.  The fasta and list parameters are required, as well as phylip or column and name.\n");
143                 mothurOut("The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n");
144                 mothurOut("The phylip or column parameter is required, but only one may be used.  If you use a column file the name filename is required. \n");
145                 mothurOut("If you do not provide a cutoff value 10.00 is assumed. If you do not provide a precision value then 100 is assumed.\n");
146                 mothurOut("The get.oturep command should be in the following format: get.oturep(phylip=yourDistanceMatrix, fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n");
147                 mothurOut("Example get.oturep(phylip=amazon.dist, fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups).\n");
148                 mothurOut("The default value for label is all labels in your inputfile.\n");
149                 mothurOut("The sorted parameter allows you to indicate you want the output sorted. You can sort by sequence name, bin number, bin size or group. The default is no sorting, but your options are name, number, size, or group.\n");
150                 mothurOut("The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n");
151                 mothurOut("The get.oturep command outputs a .fastarep and .rep.names file for each distance you specify, selecting one OTU representative for each bin.\n");
152                 mothurOut("If you provide a groupfile, then it also appends the names of the groups present in that bin.\n");
153                 mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
154         }
155         catch(exception& e) {
156                 errorOut(e, "GetOTURepCommand", "help");
157                 exit(1);
158         }
159 }
160
161 //**********************************************************************************************************************
162
163 GetOTURepCommand::~GetOTURepCommand(){}
164
165 //**********************************************************************************************************************
166
167 int GetOTURepCommand::execute(){
168         try {
169         
170                 if (abort == true) { return 0; }
171                 int error;
172                 
173                 if (!large) {
174                         //read distance files
175                         if (format == "column") { readMatrix = new ReadColumnMatrix(distFile); }        
176                         else if (format == "phylip") { readMatrix = new ReadPhylipMatrix(distFile); }
177                         else { mothurOut("File format error."); mothurOutEndLine(); return 0;  }
178                         
179                         readMatrix->setCutoff(cutoff);
180         
181                         if(namefile != ""){     
182                                 nameMap = new NameAssignment(namefile);
183                                 nameMap->readMap();
184                         }else{  nameMap = NULL;         }
185                         
186                         readMatrix->read(nameMap);
187
188                         //get matrix
189                         if (globaldata->gListVector != NULL) {  delete globaldata->gListVector;  }
190                         globaldata->gListVector = readMatrix->getListVector();
191
192                         SparseMatrix* matrix = readMatrix->getMatrix();
193                         
194                         // Create a data structure to quickly access the distance information.
195                         // It consists of a vector of distance maps, where each map contains
196                         // all distances of a certain sequence. Vector and maps are accessed
197                         // via the index of a sequence in the distance matrix
198                         seqVec = vector<SeqMap>(globaldata->gListVector->size()); 
199                         for (MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++) {
200                                 seqVec[currentCell->row][currentCell->column] = currentCell->dist;
201                         }
202                         
203                         delete matrix;
204                         delete readMatrix;
205                 }else {
206                         //process file and set up indexes
207                         if (format == "column") { formatMatrix = new FormatColumnMatrix(distFile); }    
208                         else if (format == "phylip") { formatMatrix = new FormatPhylipMatrix(distFile); }
209                         else { mothurOut("File format error."); mothurOutEndLine(); return 0;  }
210                         
211                         formatMatrix->setCutoff(cutoff);
212         
213                         if(namefile != ""){     
214                                 nameMap = new NameAssignment(namefile);
215                                 nameMap->readMap();
216                         }else{  nameMap = NULL;         }
217                         
218                         formatMatrix->read(nameMap);
219
220                         //get matrix
221                         if (globaldata->gListVector != NULL) {  delete globaldata->gListVector;  }
222                         globaldata->gListVector = formatMatrix->getListVector();
223                         
224                         distFile = formatMatrix->getFormattedFileName();
225                         
226                         //positions in file where the distances for each sequence begin
227                         //rowPositions[1] = position in file where distance related to sequence 1 start.
228                         rowPositions = formatMatrix->getRowPositions();
229                         
230                         delete formatMatrix;
231                         
232                         //openfile for getMap to use
233                         openInputFile(distFile, inRow);
234                 }
235                 
236                 //globaldata->gListVector bin 0 = first name read in distance matrix, globaldata->gListVector bin 1 = second name read in distance matrix
237                 if (globaldata->gListVector != NULL) {
238                         vector<string> names;
239                         string binnames;
240                         //map names to rows in sparsematrix
241                         for (int i = 0; i < globaldata->gListVector->size(); i++) {
242                                 names.clear();
243                                 binnames = globaldata->gListVector->get(i);
244                                 
245                                 splitAtComma(binnames, names);
246                                 
247                                 for (int j = 0; j < names.size(); j++) {
248                                         nameToIndex[names[j]] = i;
249                                 }
250                         }
251                 } else { mothurOut("error, no listvector."); mothurOutEndLine(); }
252                 
253                 fasta = new FastaMap();
254                 
255                 //read fastafile
256                 fasta->readFastaFile(fastafile);
257                                 
258                 //if user gave a namesfile then use it
259                 if (namefile != "") {   readNamesFile();        }
260                 
261                 //set format to list so input can get listvector
262                 globaldata->setFormat("list");
263
264                 //read list file
265                 read = new ReadOTUFile(listfile);
266                 read->read(&*globaldata); 
267                 
268                 input = globaldata->ginput;
269                 list = globaldata->gListVector;
270                 string lastLabel = list->getLabel();
271                 
272                 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
273                 set<string> processedLabels;
274                 set<string> userLabels = labels;
275         
276                 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
277                         
278                         if (allLines == 1 || labels.count(list->getLabel()) == 1){
279                                         mothurOut(list->getLabel() + "\t" + toString(list->size())); mothurOutEndLine();
280                                         error = process(list);
281                                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
282                                         
283                                         processedLabels.insert(list->getLabel());
284                                         userLabels.erase(list->getLabel());
285                         }
286                         
287                         if ((anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
288                                         string saveLabel = list->getLabel();
289                                         
290                                         delete list;
291                                         list = input->getListVector(lastLabel);
292                                         mothurOut(list->getLabel() + "\t" + toString(list->size())); mothurOutEndLine();
293                                         error = process(list);
294                                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
295                                         
296                                         processedLabels.insert(list->getLabel());
297                                         userLabels.erase(list->getLabel());
298                                         
299                                         //restore real lastlabel to save below
300                                         list->setLabel(saveLabel);
301                         }
302                         
303                         lastLabel = list->getLabel();
304                         
305                         delete list;
306                         list = input->getListVector();
307                 }
308                 
309                 //output error messages about any remaining user labels
310                 bool needToRun = false;
311                 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {  
312                         mothurOut("Your file does not include the label " + *it); 
313                         if (processedLabels.count(list->getLabel()) != 1) {
314                                 mothurOut(". I will use " + lastLabel + "."); mothurOutEndLine();
315                                 needToRun = true;
316                         }else {
317                                 mothurOut(". Please refer to " + lastLabel + "."); mothurOutEndLine();
318                         }
319                 }
320                 
321                 //run last label if you need to
322                 if (needToRun == true)  {
323                         if (list != NULL) {     delete list;    }
324                         list = input->getListVector(lastLabel);
325                         mothurOut(list->getLabel() + "\t" + toString(list->size())); mothurOutEndLine();
326                         error = process(list);
327                         delete list;
328                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
329                 }
330                 
331                 //close and remove formatted matrix file
332                 if (large) {
333                         inRow.close();
334                         //remove(distFile.c_str());
335                 }
336                 
337                 globaldata->gListVector = NULL;
338                 delete input;  globaldata->ginput = NULL;
339                 delete read;
340                 delete fasta;
341                 if (groupfile != "") {
342                         delete groupMap;  globaldata->gGroupmap = NULL;
343                 }
344                 
345                 return 0;
346         }
347         catch(exception& e) {
348                 errorOut(e, "GetOTURepCommand", "execute");
349                 exit(1);
350         }
351 }
352
353 //**********************************************************************************************************************
354 void GetOTURepCommand::readNamesFile() {
355         try {
356                 vector<string> dupNames;
357                 openInputFile(namefile, inNames);
358                 
359                 string name, names, sequence;
360         
361                 while(inNames){
362                         inNames >> name;                        //read from first column  A
363                         inNames >> names;               //read from second column  A,B,C,D
364                         
365                         dupNames.clear();
366                         
367                         //parse names into vector
368                         splitAtComma(names, dupNames);
369                         
370                         //store names in fasta map
371                         sequence = fasta->getSequence(name);
372                         for (int i = 0; i < dupNames.size(); i++) {
373                                 fasta->push_back(dupNames[i], sequence);
374                         }
375                 
376                         gobble(inNames);
377                 }
378                 inNames.close();
379
380         }
381         catch(exception& e) {
382                 errorOut(e, "GetOTURepCommand", "readNamesFile");
383                 exit(1);
384         }
385 }
386 //**********************************************************************************************************************
387 string GetOTURepCommand::findRep(int bin, string& group, ListVector* thisList, int& binsize) {
388         try{
389                 vector<string> names;
390                 map<string, string> groups;
391                 map<string, string>::iterator groupIt;
392
393                 //parse names into vector
394                 string binnames = thisList->get(bin);
395                 splitAtComma(binnames, names);
396                 binsize = names.size();
397
398                 //if you have a groupfile
399                 if (groupfile != "") {
400                         //find the groups that are in this bin
401                         for (size_t i = 0; i < names.size(); i++) {
402                                 string groupName = groupMap->getGroup(names[i]);
403                                 if (groupName == "not found") {  
404                                         mothurOut(names[i] + " is missing from your group file. Please correct. "); mothurOutEndLine();
405                                         groupError = true;
406                                 } else {
407                                         groups[groupName] = groupName;
408                                 }
409                         }
410                         
411                         //turn the groups into a string
412                         for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) {
413                                 group += groupIt->first + "-";
414                         }
415                         //rip off last dash
416                         group = group.substr(0, group.length()-1);
417                 }else{ group = ""; }
418
419                 // if only 1 sequence in bin or processing the "unique" label, then 
420                 // the first sequence of the OTU is the representative one
421                 if ((names.size() == 1) || (list->getLabel() == "unique")) {
422                         return names[0];
423                 }else{
424                         vector<int> seqIndex(names.size());
425                         vector<float> max_dist(names.size());
426                         vector<float> total_dist(names.size());
427
428                         //fill seqIndex and initialize sums
429                         for (size_t i = 0; i < names.size(); i++) {
430                                 seqIndex[i] = nameToIndex[names[i]];
431                                 max_dist[i] = 0.0;
432                                 total_dist[i] = 0.0;
433                         }
434                         
435                         // loop through all entries in seqIndex
436                         SeqMap::iterator it;
437                         SeqMap currMap;
438                         for (size_t i=0; i < seqIndex.size(); i++) {
439                         
440                                 if (!large) {   currMap = seqVec[seqIndex[i]];  }
441                                 else            {       currMap = getMap(seqIndex[i]);  }
442                                 
443                                 for (size_t j=0; j < seqIndex.size(); j++) {
444                                         it = currMap.find(seqIndex[j]);         
445                                         if (it != currMap.end()) {
446                                                 max_dist[i] = max(max_dist[i], it->second);
447                                                 max_dist[j] = max(max_dist[j], it->second);
448                                                 total_dist[i] += it->second;
449                                                 total_dist[j] += it->second;
450                                         }else{ //if you can't find the distance make it the cutoff
451                                                 max_dist[i] = max(max_dist[i], cutoff);
452                                                 max_dist[j] = max(max_dist[j], cutoff);
453                                                 total_dist[i] += cutoff;
454                                                 total_dist[j] += cutoff;
455                                         }
456                                 }
457                         }
458                         
459                         // sequence with the smallest maximum distance is the representative
460                         //if tie occurs pick sequence with smallest average distance
461                         float min = 10000;
462                         int minIndex;
463                         for (size_t i=0; i < max_dist.size(); i++) {
464                                 if (max_dist[i] < min) {
465                                         min = max_dist[i];
466                                         minIndex = i;
467                                 }else if (max_dist[i] == min) {
468                                         float currentAverage = total_dist[minIndex] / (float) total_dist.size();
469                                         float newAverage = total_dist[i] / (float) total_dist.size();
470                                         
471                                         if (newAverage < currentAverage) {
472                                                 min = max_dist[i];
473                                                 minIndex = i;
474                                         }
475                                 }
476                         }
477
478                         return(names[minIndex]);
479                 }
480         }
481         catch(exception& e) {
482                 errorOut(e, "GetOTURepCommand", "FindRep");
483                 exit(1);
484         }
485 }
486
487 //**********************************************************************************************************************
488 int GetOTURepCommand::process(ListVector* processList) {
489         try{
490                 string nameRep, name, sequence;
491
492                 //create output file
493                 string outputFileName = getRootName(listfile) + processList->getLabel() + ".rep.fasta";
494                 openOutputFile(outputFileName, out);
495                 vector<repStruct> reps;
496                 
497                 ofstream newNamesOutput;
498                 string outputNamesFile = getRootName(listfile) + processList->getLabel() + ".rep.names";
499                 openOutputFile(outputNamesFile, newNamesOutput);
500                 
501                 //for each bin in the list vector
502                 for (int i = 0; i < processList->size(); i++) {
503                         string groups;
504                         int binsize;
505                         nameRep = findRep(i, groups, processList, binsize);
506                         
507                         //output to new names file
508                         newNamesOutput << nameRep << '\t' << processList->get(i) << endl;
509
510                         //print out name and sequence for that bin
511                         sequence = fasta->getSequence(nameRep);
512
513                         if (sequence != "not found") {
514                                 if (sorted == "") { //print them out
515                                         nameRep = nameRep + "|" + toString(i+1);
516                                         nameRep = nameRep + "|" + toString(binsize);
517                                         if (groupfile != "") {
518                                                 nameRep = nameRep + "|" + groups;
519                                         }
520                                         out << ">" << nameRep << endl;
521                                         out << sequence << endl;
522                                 }else { //save them
523                                         repStruct newRep(nameRep, i+1, binsize, groups);
524                                         reps.push_back(newRep);
525                                 }
526                         }else { 
527                                 mothurOut(nameRep + " is missing from your fasta or name file. Please correct. "); mothurOutEndLine(); 
528                                 remove(outputFileName.c_str());
529                                 remove(outputNamesFile.c_str());
530                                 return 1;
531                         }
532                 }
533                 
534                 if (sorted != "") { //then sort them and print them
535                         if (sorted == "name")           {  sort(reps.begin(), reps.end(), compareName);         }
536                         else if (sorted == "bin")       {  sort(reps.begin(), reps.end(), compareBin);          }
537                         else if (sorted == "size")      {  sort(reps.begin(), reps.end(), compareSize);         }
538                         else if (sorted == "group")     {  sort(reps.begin(), reps.end(), compareGroup);        }
539                         
540                         //print them
541                         for (int i = 0; i < reps.size(); i++) {
542                                 string sequence = fasta->getSequence(reps[i].name);
543                                 string outputName = reps[i].name + "|" + toString(reps[i].bin);
544                                 outputName = outputName + "|" + toString(reps[i].size);
545                                 if (groupfile != "") {
546                                         outputName = outputName + "|" + reps[i].group;
547                                 }
548                                 out << ">" << outputName << endl;
549                                 out << sequence << endl;
550                         }
551                 }
552
553                 out.close();
554                 newNamesOutput.close();
555                 return 0;
556
557         }
558         catch(exception& e) {
559                 errorOut(e, "GetOTURepCommand", "process");
560                 exit(1);
561         }
562 }
563
564 //**********************************************************************************************************************
565 SeqMap GetOTURepCommand::getMap(int row) {
566         try {
567                 SeqMap rowMap;
568                 
569                 //make sure this row exists in the file, it may not if the seq did not have any distances below the cutoff
570                 if (rowPositions[row] != -1){
571                         //go to row in file
572                         inRow.seekg(rowPositions[row]);
573                         
574                         int rowNum, numDists, colNum;
575                         float dist;
576                         
577                         inRow >> rowNum >> numDists;
578                         
579                         for(int i = 0; i < numDists; i++) {
580                                 inRow >> colNum >> dist;
581                                 rowMap[colNum] = dist;
582                                 
583                         }
584                 }
585                 
586                 return rowMap;
587         }
588         catch(exception& e) {
589                 errorOut(e, "GetOTURepCommand", "getMap");
590                 exit(1);
591         }
592 }
593 //**********************************************************************************************************************
594