5 * Created by Sarah Westcott on 4/6/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
10 #include "getoturepcommand.h"
11 #include "readphylip.h"
12 #include "readcolumn.h"
13 #include "formatphylip.h"
14 #include "formatcolumn.h"
18 //********************************************************************************************************************
19 //sorts lowest to highest
20 inline bool compareName(repStruct left, repStruct right){
21 return (left.name < right.name);
23 //********************************************************************************************************************
24 //sorts lowest to highest
25 inline bool compareBin(repStruct left, repStruct right){
26 return (left.bin < right.bin);
28 //********************************************************************************************************************
29 //sorts lowest to highest
30 inline bool compareSize(repStruct left, repStruct right){
31 return (left.size < right.size);
33 //********************************************************************************************************************
34 //sorts lowest to highest
35 inline bool compareGroup(repStruct left, repStruct right){
36 return (left.group < right.group);
38 //**********************************************************************************************************************
39 GetOTURepCommand::GetOTURepCommand(string option){
41 globaldata = GlobalData::getInstance();
46 //allow user to run help
47 if (option == "help") {
50 //valid paramters for this command
51 string Array[] = {"fasta","list","label","name", "group", "sorted", "phylip","column","large","cutoff","precision"};
52 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
54 OptionParser parser(option);
55 map<string, string> parameters = parser.getParameters();
57 ValidParameters validParameter;
59 //check to make sure all parameters are valid for command
60 for (map<string, string>::iterator it = parameters.begin(); it != parameters.end(); it++) {
61 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
64 //check for required parameters
65 fastafile = validParameter.validFile(parameters, "fasta", true);
66 if (fastafile == "not found") { mothurOut("fasta is a required parameter for the get.oturep command."); mothurOutEndLine(); abort = true; }
67 else if (fastafile == "not open") { abort = true; }
69 listfile = validParameter.validFile(parameters, "list", true);
70 if (listfile == "not found") { mothurOut("list is a required parameter for the get.oturep command."); mothurOutEndLine(); abort = true; }
71 else if (listfile == "not open") { abort = true; }
73 phylipfile = validParameter.validFile(parameters, "phylip", true);
74 if (phylipfile == "not found") { phylipfile = ""; }
75 else if (phylipfile == "not open") { abort = true; }
76 else { distFile = phylipfile; format = "phylip"; }
78 columnfile = validParameter.validFile(parameters, "column", true);
79 if (columnfile == "not found") { columnfile = ""; }
80 else if (columnfile == "not open") { abort = true; }
81 else { distFile = columnfile; format = "column"; }
83 namefile = validParameter.validFile(parameters, "name", true);
84 if (namefile == "not open") { abort = true; }
85 else if (namefile == "not found") { namefile = ""; }
87 if ((phylipfile == "") && (columnfile == "")) { mothurOut("When executing a get.oturep command you must enter a phylip or a column."); mothurOutEndLine(); abort = true; }
88 else if ((phylipfile != "") && (columnfile != "")) { mothurOut("When executing a get.oturep command you must enter ONLY ONE of the following: phylip or column."); mothurOutEndLine(); abort = true; }
90 if (columnfile != "") { if (namefile == "") { cout << "You need to provide a namefile if you are going to use the column format." << endl; abort = true; } }
92 //check for optional parameter and set defaults
93 // ...at some point should added some additional type checking...
94 label = validParameter.validFile(parameters, "label", false);
95 if (label == "not found") { label = ""; allLines = 1; }
97 if(label != "all") { splitAtDash(label, labels); allLines = 0; }
98 else { allLines = 1; }
101 groupfile = validParameter.validFile(parameters, "group", true);
102 if (groupfile == "not open") { groupfile = ""; abort = true; }
103 else if (groupfile == "not found") { groupfile = ""; }
105 //read in group map info.
106 groupMap = new GroupMap(groupfile);
110 sorted = validParameter.validFile(parameters, "sorted", false); if (sorted == "not found"){ sorted = ""; }
111 if ((sorted != "") && (sorted != "name") && (sorted != "bin") && (sorted != "size") && (sorted != "group")) {
112 mothurOut(sorted + " is not a valid option for the sorted parameter. The only options are: name, bin, size and group. I will not sort."); mothurOutEndLine();
116 if ((sorted == "group") && (groupfile == "")) {
117 mothurOut("You must provide a groupfile to sort by group. I will not sort."); mothurOutEndLine();
121 string temp = validParameter.validFile(parameters, "large", false); if (temp == "not found") { temp = "F"; }
122 large = isTrue(temp);
124 temp = validParameter.validFile(parameters, "precision", false); if (temp == "not found") { temp = "100"; }
125 convert(temp, precision);
127 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "10.0"; }
128 convert(temp, cutoff);
129 cutoff += (5 / (precision * 10.0));
132 catch(exception& e) {
133 errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
138 //**********************************************************************************************************************
140 void GetOTURepCommand::help(){
142 mothurOut("The get.oturep command parameters are phylip, column, list, fasta, name, group, large, cutoff, precision, sorted and label. The fasta and list parameters are required, as well as phylip or column and name.\n");
143 mothurOut("The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n");
144 mothurOut("The phylip or column parameter is required, but only one may be used. If you use a column file the name filename is required. \n");
145 mothurOut("If you do not provide a cutoff value 10.00 is assumed. If you do not provide a precision value then 100 is assumed.\n");
146 mothurOut("The get.oturep command should be in the following format: get.oturep(phylip=yourDistanceMatrix, fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n");
147 mothurOut("Example get.oturep(phylip=amazon.dist, fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups).\n");
148 mothurOut("The default value for label is all labels in your inputfile.\n");
149 mothurOut("The sorted parameter allows you to indicate you want the output sorted. You can sort by sequence name, bin number, bin size or group. The default is no sorting, but your options are name, number, size, or group.\n");
150 mothurOut("The large parameter allows you to indicate that your distance matrix is too large to fit in RAM. The default value is false.\n");
151 mothurOut("The get.oturep command outputs a .fastarep and .rep.names file for each distance you specify, selecting one OTU representative for each bin.\n");
152 mothurOut("If you provide a groupfile, then it also appends the names of the groups present in that bin.\n");
153 mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
155 catch(exception& e) {
156 errorOut(e, "GetOTURepCommand", "help");
161 //**********************************************************************************************************************
163 GetOTURepCommand::~GetOTURepCommand(){}
165 //**********************************************************************************************************************
167 int GetOTURepCommand::execute(){
170 if (abort == true) { return 0; }
174 //read distance files
175 if (format == "column") { readMatrix = new ReadColumnMatrix(distFile); }
176 else if (format == "phylip") { readMatrix = new ReadPhylipMatrix(distFile); }
177 else { mothurOut("File format error."); mothurOutEndLine(); return 0; }
179 readMatrix->setCutoff(cutoff);
182 nameMap = new NameAssignment(namefile);
184 }else{ nameMap = NULL; }
186 readMatrix->read(nameMap);
189 if (globaldata->gListVector != NULL) { delete globaldata->gListVector; }
190 globaldata->gListVector = readMatrix->getListVector();
192 SparseMatrix* matrix = readMatrix->getMatrix();
194 // Create a data structure to quickly access the distance information.
195 // It consists of a vector of distance maps, where each map contains
196 // all distances of a certain sequence. Vector and maps are accessed
197 // via the index of a sequence in the distance matrix
198 seqVec = vector<SeqMap>(globaldata->gListVector->size());
199 for (MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++) {
200 seqVec[currentCell->row][currentCell->column] = currentCell->dist;
206 //process file and set up indexes
207 if (format == "column") { formatMatrix = new FormatColumnMatrix(distFile); }
208 else if (format == "phylip") { formatMatrix = new FormatPhylipMatrix(distFile); }
209 else { mothurOut("File format error."); mothurOutEndLine(); return 0; }
211 formatMatrix->setCutoff(cutoff);
214 nameMap = new NameAssignment(namefile);
216 }else{ nameMap = NULL; }
218 formatMatrix->read(nameMap);
221 if (globaldata->gListVector != NULL) { delete globaldata->gListVector; }
222 globaldata->gListVector = formatMatrix->getListVector();
224 distFile = formatMatrix->getFormattedFileName();
226 //positions in file where the distances for each sequence begin
227 //rowPositions[1] = position in file where distance related to sequence 1 start.
228 rowPositions = formatMatrix->getRowPositions();
232 //openfile for getMap to use
233 openInputFile(distFile, inRow);
236 //globaldata->gListVector bin 0 = first name read in distance matrix, globaldata->gListVector bin 1 = second name read in distance matrix
237 if (globaldata->gListVector != NULL) {
238 vector<string> names;
240 //map names to rows in sparsematrix
241 for (int i = 0; i < globaldata->gListVector->size(); i++) {
243 binnames = globaldata->gListVector->get(i);
245 splitAtComma(binnames, names);
247 for (int j = 0; j < names.size(); j++) {
248 nameToIndex[names[j]] = i;
251 } else { mothurOut("error, no listvector."); mothurOutEndLine(); }
253 fasta = new FastaMap();
256 fasta->readFastaFile(fastafile);
258 //if user gave a namesfile then use it
259 if (namefile != "") { readNamesFile(); }
261 //set format to list so input can get listvector
262 globaldata->setFormat("list");
265 read = new ReadOTUFile(listfile);
266 read->read(&*globaldata);
268 input = globaldata->ginput;
269 list = globaldata->gListVector;
270 string lastLabel = list->getLabel();
272 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
273 set<string> processedLabels;
274 set<string> userLabels = labels;
276 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
278 if (allLines == 1 || labels.count(list->getLabel()) == 1){
279 mothurOut(list->getLabel() + "\t" + toString(list->size())); mothurOutEndLine();
280 error = process(list);
281 if (error == 1) { return 0; } //there is an error in hte input files, abort command
283 processedLabels.insert(list->getLabel());
284 userLabels.erase(list->getLabel());
287 if ((anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
288 string saveLabel = list->getLabel();
291 list = input->getListVector(lastLabel);
292 mothurOut(list->getLabel() + "\t" + toString(list->size())); mothurOutEndLine();
293 error = process(list);
294 if (error == 1) { return 0; } //there is an error in hte input files, abort command
296 processedLabels.insert(list->getLabel());
297 userLabels.erase(list->getLabel());
299 //restore real lastlabel to save below
300 list->setLabel(saveLabel);
303 lastLabel = list->getLabel();
306 list = input->getListVector();
309 //output error messages about any remaining user labels
310 bool needToRun = false;
311 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {
312 mothurOut("Your file does not include the label " + *it);
313 if (processedLabels.count(list->getLabel()) != 1) {
314 mothurOut(". I will use " + lastLabel + "."); mothurOutEndLine();
317 mothurOut(". Please refer to " + lastLabel + "."); mothurOutEndLine();
321 //run last label if you need to
322 if (needToRun == true) {
323 if (list != NULL) { delete list; }
324 list = input->getListVector(lastLabel);
325 mothurOut(list->getLabel() + "\t" + toString(list->size())); mothurOutEndLine();
326 error = process(list);
328 if (error == 1) { return 0; } //there is an error in hte input files, abort command
331 //close and remove formatted matrix file
334 //remove(distFile.c_str());
337 globaldata->gListVector = NULL;
338 delete input; globaldata->ginput = NULL;
341 if (groupfile != "") {
342 delete groupMap; globaldata->gGroupmap = NULL;
347 catch(exception& e) {
348 errorOut(e, "GetOTURepCommand", "execute");
353 //**********************************************************************************************************************
354 void GetOTURepCommand::readNamesFile() {
356 vector<string> dupNames;
357 openInputFile(namefile, inNames);
359 string name, names, sequence;
362 inNames >> name; //read from first column A
363 inNames >> names; //read from second column A,B,C,D
367 //parse names into vector
368 splitAtComma(names, dupNames);
370 //store names in fasta map
371 sequence = fasta->getSequence(name);
372 for (int i = 0; i < dupNames.size(); i++) {
373 fasta->push_back(dupNames[i], sequence);
381 catch(exception& e) {
382 errorOut(e, "GetOTURepCommand", "readNamesFile");
386 //**********************************************************************************************************************
387 string GetOTURepCommand::findRep(int bin, string& group, ListVector* thisList, int& binsize) {
389 vector<string> names;
390 map<string, string> groups;
391 map<string, string>::iterator groupIt;
393 //parse names into vector
394 string binnames = thisList->get(bin);
395 splitAtComma(binnames, names);
396 binsize = names.size();
398 //if you have a groupfile
399 if (groupfile != "") {
400 //find the groups that are in this bin
401 for (size_t i = 0; i < names.size(); i++) {
402 string groupName = groupMap->getGroup(names[i]);
403 if (groupName == "not found") {
404 mothurOut(names[i] + " is missing from your group file. Please correct. "); mothurOutEndLine();
407 groups[groupName] = groupName;
411 //turn the groups into a string
412 for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) {
413 group += groupIt->first + "-";
416 group = group.substr(0, group.length()-1);
419 // if only 1 sequence in bin or processing the "unique" label, then
420 // the first sequence of the OTU is the representative one
421 if ((names.size() == 1) || (list->getLabel() == "unique")) {
424 vector<int> seqIndex(names.size());
425 vector<float> max_dist(names.size());
426 vector<float> total_dist(names.size());
428 //fill seqIndex and initialize sums
429 for (size_t i = 0; i < names.size(); i++) {
430 seqIndex[i] = nameToIndex[names[i]];
435 // loop through all entries in seqIndex
438 for (size_t i=0; i < seqIndex.size(); i++) {
440 if (!large) { currMap = seqVec[seqIndex[i]]; }
441 else { currMap = getMap(seqIndex[i]); }
443 for (size_t j=0; j < seqIndex.size(); j++) {
444 it = currMap.find(seqIndex[j]);
445 if (it != currMap.end()) {
446 max_dist[i] = max(max_dist[i], it->second);
447 max_dist[j] = max(max_dist[j], it->second);
448 total_dist[i] += it->second;
449 total_dist[j] += it->second;
450 }else{ //if you can't find the distance make it the cutoff
451 max_dist[i] = max(max_dist[i], cutoff);
452 max_dist[j] = max(max_dist[j], cutoff);
453 total_dist[i] += cutoff;
454 total_dist[j] += cutoff;
459 // sequence with the smallest maximum distance is the representative
460 //if tie occurs pick sequence with smallest average distance
463 for (size_t i=0; i < max_dist.size(); i++) {
464 if (max_dist[i] < min) {
467 }else if (max_dist[i] == min) {
468 float currentAverage = total_dist[minIndex] / (float) total_dist.size();
469 float newAverage = total_dist[i] / (float) total_dist.size();
471 if (newAverage < currentAverage) {
478 return(names[minIndex]);
481 catch(exception& e) {
482 errorOut(e, "GetOTURepCommand", "FindRep");
487 //**********************************************************************************************************************
488 int GetOTURepCommand::process(ListVector* processList) {
490 string nameRep, name, sequence;
493 string outputFileName = getRootName(listfile) + processList->getLabel() + ".rep.fasta";
494 openOutputFile(outputFileName, out);
495 vector<repStruct> reps;
497 ofstream newNamesOutput;
498 string outputNamesFile = getRootName(listfile) + processList->getLabel() + ".rep.names";
499 openOutputFile(outputNamesFile, newNamesOutput);
501 //for each bin in the list vector
502 for (int i = 0; i < processList->size(); i++) {
505 nameRep = findRep(i, groups, processList, binsize);
507 //output to new names file
508 newNamesOutput << nameRep << '\t' << processList->get(i) << endl;
510 //print out name and sequence for that bin
511 sequence = fasta->getSequence(nameRep);
513 if (sequence != "not found") {
514 if (sorted == "") { //print them out
515 nameRep = nameRep + "|" + toString(i+1);
516 nameRep = nameRep + "|" + toString(binsize);
517 if (groupfile != "") {
518 nameRep = nameRep + "|" + groups;
520 out << ">" << nameRep << endl;
521 out << sequence << endl;
523 repStruct newRep(nameRep, i+1, binsize, groups);
524 reps.push_back(newRep);
527 mothurOut(nameRep + " is missing from your fasta or name file. Please correct. "); mothurOutEndLine();
528 remove(outputFileName.c_str());
529 remove(outputNamesFile.c_str());
534 if (sorted != "") { //then sort them and print them
535 if (sorted == "name") { sort(reps.begin(), reps.end(), compareName); }
536 else if (sorted == "bin") { sort(reps.begin(), reps.end(), compareBin); }
537 else if (sorted == "size") { sort(reps.begin(), reps.end(), compareSize); }
538 else if (sorted == "group") { sort(reps.begin(), reps.end(), compareGroup); }
541 for (int i = 0; i < reps.size(); i++) {
542 string sequence = fasta->getSequence(reps[i].name);
543 string outputName = reps[i].name + "|" + toString(reps[i].bin);
544 outputName = outputName + "|" + toString(reps[i].size);
545 if (groupfile != "") {
546 outputName = outputName + "|" + reps[i].group;
548 out << ">" << outputName << endl;
549 out << sequence << endl;
554 newNamesOutput.close();
558 catch(exception& e) {
559 errorOut(e, "GetOTURepCommand", "process");
564 //**********************************************************************************************************************
565 SeqMap GetOTURepCommand::getMap(int row) {
569 //make sure this row exists in the file, it may not if the seq did not have any distances below the cutoff
570 if (rowPositions[row] != -1){
572 inRow.seekg(rowPositions[row]);
574 int rowNum, numDists, colNum;
577 inRow >> rowNum >> numDists;
579 for(int i = 0; i < numDists; i++) {
580 inRow >> colNum >> dist;
581 rowMap[colNum] = dist;
588 catch(exception& e) {
589 errorOut(e, "GetOTURepCommand", "getMap");
593 //**********************************************************************************************************************