]> git.donarmstrong.com Git - mothur.git/blob - getoturepcommand.cpp
added formatmatrix, formatcolumn, and formatphylip classes. Used these classes in...
[mothur.git] / getoturepcommand.cpp
1 /*
2  *  getoturepcommand.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 4/6/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "getoturepcommand.h"
11 #include "readphylip.h"
12 #include "readcolumn.h"
13 #include "formatphylip.h"
14 #include "formatcolumn.h"
15
16
17
18 //********************************************************************************************************************
19 //sorts lowest to highest
20 inline bool compareName(repStruct left, repStruct right){
21         return (left.name < right.name);        
22 }
23 //********************************************************************************************************************
24 //sorts lowest to highest
25 inline bool compareBin(repStruct left, repStruct right){
26         return (left.bin < right.bin);  
27 }
28 //********************************************************************************************************************
29 //sorts lowest to highest
30 inline bool compareSize(repStruct left, repStruct right){
31         return (left.size < right.size);        
32 }
33 //********************************************************************************************************************
34 //sorts lowest to highest
35 inline bool compareGroup(repStruct left, repStruct right){
36         return (left.group < right.group);      
37 }
38 //**********************************************************************************************************************
39 GetOTURepCommand::GetOTURepCommand(string option){
40         try{
41                 globaldata = GlobalData::getInstance();
42                 abort = false;
43                 allLines = 1;
44                 labels.clear();
45                 
46                 //allow user to run help
47                 if (option == "help") { 
48                         help(); abort = true;
49                 } else {
50                         //valid paramters for this command
51                         string Array[] =  {"fasta","list","label","name", "group", "sorted", "phylip","column","large","cutoff"};
52                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
53                         
54                         OptionParser parser(option);
55                         map<string, string> parameters = parser.getParameters();
56                         
57                         ValidParameters validParameter;
58                 
59                         //check to make sure all parameters are valid for command
60                         for (map<string, string>::iterator it = parameters.begin(); it != parameters.end(); it++) { 
61                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
62                         }
63                         
64                         //check for required parameters
65                         fastafile = validParameter.validFile(parameters, "fasta", true);
66                         if (fastafile == "not found") { mothurOut("fasta is a required parameter for the get.oturep command."); mothurOutEndLine(); abort = true; }
67                         else if (fastafile == "not open") { abort = true; }     
68                 
69                         listfile = validParameter.validFile(parameters, "list", true);
70                         if (listfile == "not found") { mothurOut("list is a required parameter for the get.oturep command."); mothurOutEndLine(); abort = true; }
71                         else if (listfile == "not open") { abort = true; }      
72                         
73                         phylipfile = validParameter.validFile(parameters, "phylip", true);
74                         if (phylipfile == "not found") { phylipfile = "";  }
75                         else if (phylipfile == "not open") { abort = true; }    
76                         else { distFile = phylipfile; format = "phylip";   }
77                         
78                         columnfile = validParameter.validFile(parameters, "column", true);
79                         if (columnfile == "not found") { columnfile = ""; }
80                         else if (columnfile == "not open") { abort = true; }    
81                         else { distFile = columnfile; format = "column";   }
82                         
83                         namefile = validParameter.validFile(parameters, "name", true);
84                         if (namefile == "not open") { abort = true; }   
85                         else if (namefile == "not found") { namefile = ""; }
86                         
87                         if ((phylipfile == "") && (columnfile == "")) { mothurOut("When executing a get.oturep command you must enter a phylip or a column."); mothurOutEndLine(); abort = true; }
88                         else if ((phylipfile != "") && (columnfile != "")) { mothurOut("When executing a get.oturep command you must enter ONLY ONE of the following: phylip or column."); mothurOutEndLine(); abort = true; }
89                 
90                         if (columnfile != "") {  if (namefile == "") {  cout << "You need to provide a namefile if you are going to use the column format." << endl; abort = true; }  }
91
92                         //check for optional parameter and set defaults
93                         // ...at some point should added some additional type checking...
94                         label = validParameter.validFile(parameters, "label", false);                   
95                         if (label == "not found") { label = ""; allLines = 1;  }
96                         else { 
97                                 if(label != "all") {  splitAtDash(label, labels);  allLines = 0;  }
98                                 else { allLines = 1;  }
99                         }
100                         
101                         groupfile = validParameter.validFile(parameters, "group", true);
102                         if (groupfile == "not open") { groupfile = ""; abort = true; }
103                         else if (groupfile == "not found") { groupfile = ""; }
104                         else {
105                                 //read in group map info.
106                                 groupMap = new GroupMap(groupfile);
107                                 groupMap->readMap();
108                         }
109                         
110                         sorted = validParameter.validFile(parameters, "sorted", false);         if (sorted == "not found"){     sorted = "";    }
111                         if ((sorted != "") && (sorted != "name") && (sorted != "bin") && (sorted != "size") && (sorted != "group")) {
112                                 mothurOut(sorted + " is not a valid option for the sorted parameter. The only options are: name, bin, size and group. I will not sort."); mothurOutEndLine();
113                                 sorted = "";
114                         }
115                         
116                         if ((sorted == "group") && (groupfile == "")) {
117                                 mothurOut("You must provide a groupfile to sort by group. I will not sort."); mothurOutEndLine();
118                                 sorted = "";
119                         }
120                         
121                         string temp = validParameter.validFile(parameters, "large", false);             if (temp == "not found") {      temp = "F";     }
122                         large = isTrue(temp);
123                         
124                         temp = validParameter.validFile(parameters, "precision", false);                        if (temp == "not found") { temp = "100"; }
125                         convert(temp, precision); 
126                         
127                         temp = validParameter.validFile(parameters, "cutoff", false);                   if (temp == "not found") { temp = "10.0"; }
128                         convert(temp, cutoff); 
129                         cutoff += (5 / (precision * 10.0));
130                 }
131         }
132         catch(exception& e) {
133                 errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
134                 exit(1);
135         }
136 }
137
138 //**********************************************************************************************************************
139
140 void GetOTURepCommand::help(){
141         try {
142                 mothurOut("The get.oturep command can only be executed after a successful read.dist command.\n");
143                 mothurOut("The get.oturep command parameters are list, fasta, name, group, large, sorted and label.  The fasta and list parameters are required.\n");
144                 mothurOut("The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n");
145                 mothurOut("The get.oturep command should be in the following format: get.oturep(fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n");
146                 mothurOut("Example get.oturep(fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups, name=amazon.names).\n");
147                 mothurOut("The default value for label is all labels in your inputfile.\n");
148                 mothurOut("The sorted parameter allows you to indicate you want the output sorted. You can sort by sequence name, bin number, bin size or group. The default is no sorting, but your options are name, number, size, or group.\n");
149                 mothurOut("The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n");
150                 mothurOut("The get.oturep command outputs a .fastarep and .rep.names file for each distance you specify, selecting one OTU representative for each bin.\n");
151                 mothurOut("If you provide a groupfile, then it also appends the names of the groups present in that bin.\n");
152                 mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
153         }
154         catch(exception& e) {
155                 errorOut(e, "GetOTURepCommand", "help");
156                 exit(1);
157         }
158 }
159
160 //**********************************************************************************************************************
161
162 GetOTURepCommand::~GetOTURepCommand(){}
163
164 //**********************************************************************************************************************
165
166 int GetOTURepCommand::execute(){
167         try {
168         
169                 if (abort == true) { return 0; }
170                 int error;
171                 
172                 if (!large) {
173                         //read distance files
174                         if (format == "column") { readMatrix = new ReadColumnMatrix(distFile); }        
175                         else if (format == "phylip") { readMatrix = new ReadPhylipMatrix(distFile); }
176                         else { mothurOut("File format error."); mothurOutEndLine(); return 0;  }
177                         
178                         readMatrix->setCutoff(cutoff);
179         
180                         if(namefile != ""){     
181                                 nameMap = new NameAssignment(namefile);
182                                 nameMap->readMap();
183                         }else{  nameMap = NULL;         }
184                         
185                         readMatrix->read(nameMap);
186
187                         //get matrix
188                         if (globaldata->gListVector != NULL) {  delete globaldata->gListVector;  }
189                         globaldata->gListVector = readMatrix->getListVector();
190
191                         SparseMatrix* matrix = readMatrix->getMatrix();
192                         
193                         // Create a data structure to quickly access the distance information.
194                         // It consists of a vector of distance maps, where each map contains
195                         // all distances of a certain sequence. Vector and maps are accessed
196                         // via the index of a sequence in the distance matrix
197                         seqVec = vector<SeqMap>(globaldata->gListVector->size()); 
198                         for (MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++) {
199                                 seqVec[currentCell->row][currentCell->column] = currentCell->dist;
200                         }
201                         
202                         delete matrix;
203                         delete readMatrix;
204                 }else {
205                         //process file and set up indexes
206                         if (format == "column") { formatMatrix = new FormatColumnMatrix(distFile); }    
207                         else if (format == "phylip") { formatMatrix = new FormatPhylipMatrix(distFile); }
208                         else { mothurOut("File format error."); mothurOutEndLine(); return 0;  }
209                         
210                         formatMatrix->setCutoff(cutoff);
211         
212                         if(namefile != ""){     
213                                 nameMap = new NameAssignment(namefile);
214                                 nameMap->readMap();
215                         }else{  nameMap = NULL;         }
216                         
217                         formatMatrix->read(nameMap);
218
219                         //get matrix
220                         if (globaldata->gListVector != NULL) {  delete globaldata->gListVector;  }
221                         globaldata->gListVector = formatMatrix->getListVector();
222                         
223                         distFile = formatMatrix->getFormattedFileName();
224                         
225                         //positions in file where the distances for each sequence begin
226                         //rowPositions[1] = position in file where distance related to sequence 1 start.
227                         rowPositions = formatMatrix->getRowPositions();
228                         
229                         delete formatMatrix;
230                         
231                         //openfile for getMap to use
232                         openInputFile(distFile, inRow);
233                 }
234                 
235                 //globaldata->gListVector bin 0 = first name read in distance matrix, globaldata->gListVector bin 1 = second name read in distance matrix
236                 if (globaldata->gListVector != NULL) {
237                         vector<string> names;
238                         string binnames;
239                         //map names to rows in sparsematrix
240                         for (int i = 0; i < globaldata->gListVector->size(); i++) {
241                                 names.clear();
242                                 binnames = globaldata->gListVector->get(i);
243                                 
244                                 splitAtComma(binnames, names);
245                                 
246                                 for (int j = 0; j < names.size(); j++) {
247                                         nameToIndex[names[j]] = i;
248                                 }
249                         }
250                 } else { mothurOut("error, no listvector."); mothurOutEndLine(); }
251                 
252                 fasta = new FastaMap();
253                 
254                 //read fastafile
255                 fasta->readFastaFile(fastafile);
256                                 
257                 //if user gave a namesfile then use it
258                 if (namefile != "") {   readNamesFile();        }
259                 
260                 //set format to list so input can get listvector
261                 globaldata->setFormat("list");
262
263                 //read list file
264                 read = new ReadOTUFile(listfile);
265                 read->read(&*globaldata); 
266                 
267                 input = globaldata->ginput;
268                 list = globaldata->gListVector;
269                 string lastLabel = list->getLabel();
270                 
271                 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
272                 set<string> processedLabels;
273                 set<string> userLabels = labels;
274         
275                 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
276                         
277                         if (allLines == 1 || labels.count(list->getLabel()) == 1){
278                                         mothurOut(list->getLabel() + "\t" + toString(list->size())); mothurOutEndLine();
279                                         error = process(list);
280                                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
281                                         
282                                         processedLabels.insert(list->getLabel());
283                                         userLabels.erase(list->getLabel());
284                         }
285                         
286                         if ((anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
287                                         string saveLabel = list->getLabel();
288                                         
289                                         delete list;
290                                         list = input->getListVector(lastLabel);
291                                         mothurOut(list->getLabel() + "\t" + toString(list->size())); mothurOutEndLine();
292                                         error = process(list);
293                                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
294                                         
295                                         processedLabels.insert(list->getLabel());
296                                         userLabels.erase(list->getLabel());
297                                         
298                                         //restore real lastlabel to save below
299                                         list->setLabel(saveLabel);
300                         }
301                         
302                         lastLabel = list->getLabel();
303                         
304                         delete list;
305                         list = input->getListVector();
306                 }
307                 
308                 //output error messages about any remaining user labels
309                 bool needToRun = false;
310                 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {  
311                         mothurOut("Your file does not include the label " + *it); 
312                         if (processedLabels.count(list->getLabel()) != 1) {
313                                 mothurOut(". I will use " + lastLabel + "."); mothurOutEndLine();
314                                 needToRun = true;
315                         }else {
316                                 mothurOut(". Please refer to " + lastLabel + "."); mothurOutEndLine();
317                         }
318                 }
319                 
320                 //run last label if you need to
321                 if (needToRun == true)  {
322                         if (list != NULL) {     delete list;    }
323                         list = input->getListVector(lastLabel);
324                         mothurOut(list->getLabel() + "\t" + toString(list->size())); mothurOutEndLine();
325                         error = process(list);
326                         delete list;
327                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
328                 }
329                 
330                 //close and remove formatted matrix file
331                 inRow.close();
332                 remove(distFile.c_str());
333                 
334                 globaldata->gListVector = NULL;
335                 delete input;  globaldata->ginput = NULL;
336                 delete read;
337                 delete fasta;
338                 if (groupfile != "") {
339                         delete groupMap;  globaldata->gGroupmap = NULL;
340                 }
341                 
342                 return 0;
343         }
344         catch(exception& e) {
345                 errorOut(e, "GetOTURepCommand", "execute");
346                 exit(1);
347         }
348 }
349
350 //**********************************************************************************************************************
351 void GetOTURepCommand::readNamesFile() {
352         try {
353                 vector<string> dupNames;
354                 openInputFile(namefile, inNames);
355                 
356                 string name, names, sequence;
357         
358                 while(inNames){
359                         inNames >> name;                        //read from first column  A
360                         inNames >> names;               //read from second column  A,B,C,D
361                         
362                         dupNames.clear();
363                         
364                         //parse names into vector
365                         splitAtComma(names, dupNames);
366                         
367                         //store names in fasta map
368                         sequence = fasta->getSequence(name);
369                         for (int i = 0; i < dupNames.size(); i++) {
370                                 fasta->push_back(dupNames[i], sequence);
371                         }
372                 
373                         gobble(inNames);
374                 }
375                 inNames.close();
376
377         }
378         catch(exception& e) {
379                 errorOut(e, "GetOTURepCommand", "readNamesFile");
380                 exit(1);
381         }
382 }
383 //**********************************************************************************************************************
384 string GetOTURepCommand::findRep(int bin, string& group, ListVector* thisList, int& binsize) {
385         try{
386                 vector<string> names;
387                 map<string, string> groups;
388                 map<string, string>::iterator groupIt;
389
390                 //parse names into vector
391                 string binnames = thisList->get(bin);
392                 splitAtComma(binnames, names);
393                 binsize = names.size();
394
395                 //if you have a groupfile
396                 if (groupfile != "") {
397                         //find the groups that are in this bin
398                         for (size_t i = 0; i < names.size(); i++) {
399                                 string groupName = groupMap->getGroup(names[i]);
400                                 if (groupName == "not found") {  
401                                         mothurOut(names[i] + " is missing from your group file. Please correct. "); mothurOutEndLine();
402                                         groupError = true;
403                                 } else {
404                                         groups[groupName] = groupName;
405                                 }
406                         }
407                         
408                         //turn the groups into a string
409                         for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) {
410                                 group += groupIt->first + "-";
411                         }
412                         //rip off last dash
413                         group = group.substr(0, group.length()-1);
414                 }else{ group = ""; }
415
416                 // if only 1 sequence in bin or processing the "unique" label, then 
417                 // the first sequence of the OTU is the representative one
418                 if ((names.size() == 1) || (list->getLabel() == "unique")) {
419                         return names[0];
420                 }else{
421                         vector<int> seqIndex(names.size());
422                         vector<float> max_dist(names.size());
423                         vector<float> total_dist(names.size());
424
425                         //fill seqIndex and initialize sums
426                         for (size_t i = 0; i < names.size(); i++) {
427                                 seqIndex[i] = nameToIndex[names[i]];
428                                 max_dist[i] = 0.0;
429                                 total_dist[i] = 0.0;
430                         }
431                         
432                         // loop through all entries in seqIndex
433                         SeqMap::iterator it;
434                         SeqMap currMap;
435                         for (size_t i=0; i < seqIndex.size(); i++) {
436                         
437                                 if (!large) {   currMap = seqVec[seqIndex[i]];  }
438                                 else            {       currMap = getMap(seqIndex[i]);  }
439                                 
440                                 for (size_t j=0; j < seqIndex.size(); j++) {
441                                         it = currMap.find(seqIndex[j]);         
442                                         if (it != currMap.end()) {
443                                                 max_dist[i] = max(max_dist[i], it->second);
444                                                 max_dist[j] = max(max_dist[j], it->second);
445                                                 total_dist[i] += it->second;
446                                                 total_dist[j] += it->second;
447                                         }else{ //if you can't find the distance make it the cutoff
448                                                 max_dist[i] = max(max_dist[i], cutoff);
449                                                 max_dist[j] = max(max_dist[j], cutoff);
450                                                 total_dist[i] += cutoff;
451                                                 total_dist[j] += cutoff;
452                                         }
453                                 }
454                         }
455                         
456                         // sequence with the smallest maximum distance is the representative
457                         //if tie occurs pick sequence with smallest average distance
458                         float min = 10000;
459                         int minIndex;
460                         for (size_t i=0; i < max_dist.size(); i++) {
461                                 if (max_dist[i] < min) {
462                                         min = max_dist[i];
463                                         minIndex = i;
464                                 }else if (max_dist[i] == min) {
465                                         float currentAverage = total_dist[minIndex] / (float) total_dist.size();
466                                         float newAverage = total_dist[i] / (float) total_dist.size();
467                                         
468                                         if (newAverage < currentAverage) {
469                                                 min = max_dist[i];
470                                                 minIndex = i;
471                                         }
472                                 }
473                         }
474
475                         return(names[minIndex]);
476                 }
477         }
478         catch(exception& e) {
479                 errorOut(e, "GetOTURepCommand", "FindRep");
480                 exit(1);
481         }
482 }
483
484 //**********************************************************************************************************************
485 int GetOTURepCommand::process(ListVector* processList) {
486         try{
487                 string nameRep, name, sequence;
488
489                 //create output file
490                 string outputFileName = getRootName(listfile) + processList->getLabel() + ".rep.fasta";
491                 openOutputFile(outputFileName, out);
492                 vector<repStruct> reps;
493                 
494                 ofstream newNamesOutput;
495                 string outputNamesFile = getRootName(listfile) + processList->getLabel() + ".rep.names";
496                 openOutputFile(outputNamesFile, newNamesOutput);
497                 
498                 //for each bin in the list vector
499                 for (int i = 0; i < processList->size(); i++) {
500                         string groups;
501                         int binsize;
502                         nameRep = findRep(i, groups, processList, binsize);
503                         
504                         //output to new names file
505                         newNamesOutput << nameRep << '\t' << processList->get(i) << endl;
506
507                         //print out name and sequence for that bin
508                         sequence = fasta->getSequence(nameRep);
509
510                         if (sequence != "not found") {
511                                 if (sorted == "") { //print them out
512                                         nameRep = nameRep + "|" + toString(i+1);
513                                         nameRep = nameRep + "|" + toString(binsize);
514                                         if (groupfile != "") {
515                                                 nameRep = nameRep + "|" + groups;
516                                         }
517                                         out << ">" << nameRep << endl;
518                                         out << sequence << endl;
519                                 }else { //save them
520                                         repStruct newRep(nameRep, i+1, binsize, groups);
521                                         reps.push_back(newRep);
522                                 }
523                         }else { 
524                                 mothurOut(nameRep + " is missing from your fasta or name file. Please correct. "); mothurOutEndLine(); 
525                                 remove(outputFileName.c_str());
526                                 remove(outputNamesFile.c_str());
527                                 return 1;
528                         }
529                 }
530                 
531                 if (sorted != "") { //then sort them and print them
532                         if (sorted == "name")           {  sort(reps.begin(), reps.end(), compareName);         }
533                         else if (sorted == "bin")       {  sort(reps.begin(), reps.end(), compareBin);          }
534                         else if (sorted == "size")      {  sort(reps.begin(), reps.end(), compareSize);         }
535                         else if (sorted == "group")     {  sort(reps.begin(), reps.end(), compareGroup);        }
536                         
537                         //print them
538                         for (int i = 0; i < reps.size(); i++) {
539                                 string sequence = fasta->getSequence(reps[i].name);
540                                 string outputName = reps[i].name + "|" + toString(reps[i].bin);
541                                 outputName = outputName + "|" + toString(reps[i].size);
542                                 if (groupfile != "") {
543                                         outputName = outputName + "|" + reps[i].group;
544                                 }
545                                 out << ">" << outputName << endl;
546                                 out << sequence << endl;
547                         }
548                 }
549
550                 out.close();
551                 newNamesOutput.close();
552                 return 0;
553
554         }
555         catch(exception& e) {
556                 errorOut(e, "GetOTURepCommand", "process");
557                 exit(1);
558         }
559 }
560
561 //**********************************************************************************************************************
562 SeqMap GetOTURepCommand::getMap(int row) {
563         try {
564                 SeqMap rowMap;
565                 
566                 //make sure this row exists in the file, it may not if the seq did not have any distances below the cutoff
567                 if (rowPositions[row] != -1){
568                         //go to row in file
569                         inRow.seekg(rowPositions[row]);
570                         
571                         int rowNum, numDists, colNum;
572                         float dist;
573                         
574                         inRow >> rowNum >> numDists;
575                         
576                         for(int i = 0; i < numDists; i++) {
577                                 inRow >> colNum >> dist;
578                                 rowMap[colNum] = dist;
579                                 
580                         }
581                 }
582                 
583                 return rowMap;
584         }
585         catch(exception& e) {
586                 errorOut(e, "GetOTURepCommand", "getMap");
587                 exit(1);
588         }
589 }
590 //**********************************************************************************************************************
591