5 * Created by Sarah Westcott on 4/6/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
10 #include "getoturepcommand.h"
11 #include "readphylip.h"
12 #include "readcolumn.h"
13 #include "formatphylip.h"
14 #include "formatcolumn.h"
18 //********************************************************************************************************************
19 //sorts lowest to highest
20 inline bool compareName(repStruct left, repStruct right){
21 return (left.name < right.name);
23 //********************************************************************************************************************
24 //sorts lowest to highest
25 inline bool compareBin(repStruct left, repStruct right){
26 return (left.bin < right.bin);
28 //********************************************************************************************************************
29 //sorts lowest to highest
30 inline bool compareSize(repStruct left, repStruct right){
31 return (left.size < right.size);
33 //********************************************************************************************************************
34 //sorts lowest to highest
35 inline bool compareGroup(repStruct left, repStruct right){
36 return (left.group < right.group);
38 //**********************************************************************************************************************
39 GetOTURepCommand::GetOTURepCommand(string option) {
41 globaldata = GlobalData::getInstance();
46 //allow user to run help
47 if (option == "help") {
50 //valid paramters for this command
51 string Array[] = {"fasta","list","label","name", "group", "sorted", "phylip","column","large","cutoff","precision","outputdir","inputdir"};
52 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
54 OptionParser parser(option);
55 map<string, string> parameters = parser.getParameters();
57 ValidParameters validParameter;
58 map<string, string>::iterator it;
60 //check to make sure all parameters are valid for command
61 for (it = parameters.begin(); it != parameters.end(); it++) {
62 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
65 //if the user changes the input directory command factory will send this info to us in the output parameter
66 string inputDir = validParameter.validFile(parameters, "inputdir", false);
67 if (inputDir == "not found"){ inputDir = ""; }
70 it = parameters.find("list");
71 //user has given a template file
72 if(it != parameters.end()){
73 path = hasPath(it->second);
74 //if the user has not given a path then, add inputdir. else leave path alone.
75 if (path == "") { parameters["list"] = inputDir + it->second; }
78 it = parameters.find("fasta");
79 //user has given a template file
80 if(it != parameters.end()){
81 path = hasPath(it->second);
82 //if the user has not given a path then, add inputdir. else leave path alone.
83 if (path == "") { parameters["fasta"] = inputDir + it->second; }
86 it = parameters.find("phylip");
87 //user has given a template file
88 if(it != parameters.end()){
89 path = hasPath(it->second);
90 //if the user has not given a path then, add inputdir. else leave path alone.
91 if (path == "") { parameters["phylip"] = inputDir + it->second; }
94 it = parameters.find("column");
95 //user has given a template file
96 if(it != parameters.end()){
97 path = hasPath(it->second);
98 //if the user has not given a path then, add inputdir. else leave path alone.
99 if (path == "") { parameters["column"] = inputDir + it->second; }
102 it = parameters.find("name");
103 //user has given a template file
104 if(it != parameters.end()){
105 path = hasPath(it->second);
106 //if the user has not given a path then, add inputdir. else leave path alone.
107 if (path == "") { parameters["name"] = inputDir + it->second; }
110 it = parameters.find("group");
111 //user has given a template file
112 if(it != parameters.end()){
113 path = hasPath(it->second);
114 //if the user has not given a path then, add inputdir. else leave path alone.
115 if (path == "") { parameters["group"] = inputDir + it->second; }
120 //if the user changes the output directory command factory will send this info to us in the output parameter
121 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
123 //check for required parameters
124 fastafile = validParameter.validFile(parameters, "fasta", true);
125 if (fastafile == "not found") { m->mothurOut("fasta is a required parameter for the get.oturep command."); m->mothurOutEndLine(); abort = true; }
126 else if (fastafile == "not open") { abort = true; }
128 listfile = validParameter.validFile(parameters, "list", true);
129 if (listfile == "not found") { m->mothurOut("list is a required parameter for the get.oturep command."); m->mothurOutEndLine(); abort = true; }
130 else if (listfile == "not open") { abort = true; }
132 phylipfile = validParameter.validFile(parameters, "phylip", true);
133 if (phylipfile == "not found") { phylipfile = ""; }
134 else if (phylipfile == "not open") { abort = true; }
135 else { distFile = phylipfile; format = "phylip"; }
137 columnfile = validParameter.validFile(parameters, "column", true);
138 if (columnfile == "not found") { columnfile = ""; }
139 else if (columnfile == "not open") { abort = true; }
140 else { distFile = columnfile; format = "column"; }
142 namefile = validParameter.validFile(parameters, "name", true);
143 if (namefile == "not open") { abort = true; }
144 else if (namefile == "not found") { namefile = ""; }
146 if ((phylipfile == "") && (columnfile == "")) { m->mothurOut("When executing a get.oturep command you must enter a phylip or a column."); m->mothurOutEndLine(); abort = true; }
147 else if ((phylipfile != "") && (columnfile != "")) { m->mothurOut("When executing a get.oturep command you must enter ONLY ONE of the following: phylip or column."); m->mothurOutEndLine(); abort = true; }
149 if (columnfile != "") { if (namefile == "") { cout << "You need to provide a namefile if you are going to use the column format." << endl; abort = true; } }
151 //check for optional parameter and set defaults
152 // ...at some point should added some additional type checking...
153 label = validParameter.validFile(parameters, "label", false);
154 if (label == "not found") { label = ""; allLines = 1; }
156 if(label != "all") { splitAtDash(label, labels); allLines = 0; }
157 else { allLines = 1; }
160 groupfile = validParameter.validFile(parameters, "group", true);
161 if (groupfile == "not open") { groupfile = ""; abort = true; }
162 else if (groupfile == "not found") { groupfile = ""; }
164 //read in group map info.
165 groupMap = new GroupMap(groupfile);
166 int error = groupMap->readMap();
167 if (error == 1) { delete groupMap; abort = true; }
170 sorted = validParameter.validFile(parameters, "sorted", false); if (sorted == "not found"){ sorted = ""; }
171 if ((sorted != "") && (sorted != "name") && (sorted != "bin") && (sorted != "size") && (sorted != "group")) {
172 m->mothurOut(sorted + " is not a valid option for the sorted parameter. The only options are: name, bin, size and group. I will not sort."); m->mothurOutEndLine();
176 if ((sorted == "group") && (groupfile == "")) {
177 m->mothurOut("You must provide a groupfile to sort by group. I will not sort."); m->mothurOutEndLine();
181 string temp = validParameter.validFile(parameters, "large", false); if (temp == "not found") { temp = "F"; }
182 large = isTrue(temp);
184 temp = validParameter.validFile(parameters, "precision", false); if (temp == "not found") { temp = "100"; }
185 convert(temp, precision);
187 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "10.0"; }
188 convert(temp, cutoff);
189 cutoff += (5 / (precision * 10.0));
192 catch(exception& e) {
193 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
198 //**********************************************************************************************************************
200 void GetOTURepCommand::help(){
202 m->mothurOut("The get.oturep command parameters are phylip, column, list, fasta, name, group, large, cutoff, precision, sorted and label. The fasta and list parameters are required, as well as phylip or column and name.\n");
203 m->mothurOut("The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n");
204 m->mothurOut("The phylip or column parameter is required, but only one may be used. If you use a column file the name filename is required. \n");
205 m->mothurOut("If you do not provide a cutoff value 10.00 is assumed. If you do not provide a precision value then 100 is assumed.\n");
206 m->mothurOut("The get.oturep command should be in the following format: get.oturep(phylip=yourDistanceMatrix, fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n");
207 m->mothurOut("Example get.oturep(phylip=amazon.dist, fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups).\n");
208 m->mothurOut("The default value for label is all labels in your inputfile.\n");
209 m->mothurOut("The sorted parameter allows you to indicate you want the output sorted. You can sort by sequence name, bin number, bin size or group. The default is no sorting, but your options are name, number, size, or group.\n");
210 m->mothurOut("The large parameter allows you to indicate that your distance matrix is too large to fit in RAM. The default value is false.\n");
211 m->mothurOut("The get.oturep command outputs a .fastarep and .rep.names file for each distance you specify, selecting one OTU representative for each bin.\n");
212 m->mothurOut("If you provide a groupfile, then it also appends the names of the groups present in that bin.\n");
213 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
215 catch(exception& e) {
216 m->errorOut(e, "GetOTURepCommand", "help");
221 //**********************************************************************************************************************
223 GetOTURepCommand::~GetOTURepCommand(){}
225 //**********************************************************************************************************************
227 int GetOTURepCommand::execute(){
230 if (abort == true) { return 0; }
234 //read distance files
235 if (format == "column") { readMatrix = new ReadColumnMatrix(distFile); }
236 else if (format == "phylip") { readMatrix = new ReadPhylipMatrix(distFile); }
237 else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0; }
239 readMatrix->setCutoff(cutoff);
242 nameMap = new NameAssignment(namefile);
244 }else{ nameMap = NULL; }
246 readMatrix->read(nameMap);
249 if (globaldata->gListVector != NULL) { delete globaldata->gListVector; }
250 globaldata->gListVector = readMatrix->getListVector();
252 SparseMatrix* matrix = readMatrix->getMatrix();
254 // Create a data structure to quickly access the distance information.
255 // It consists of a vector of distance maps, where each map contains
256 // all distances of a certain sequence. Vector and maps are accessed
257 // via the index of a sequence in the distance matrix
258 seqVec = vector<SeqMap>(globaldata->gListVector->size());
259 for (MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++) {
260 seqVec[currentCell->row][currentCell->column] = currentCell->dist;
266 //process file and set up indexes
267 if (format == "column") { formatMatrix = new FormatColumnMatrix(distFile); }
268 else if (format == "phylip") { formatMatrix = new FormatPhylipMatrix(distFile); }
269 else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0; }
271 formatMatrix->setCutoff(cutoff);
274 nameMap = new NameAssignment(namefile);
276 }else{ nameMap = NULL; }
278 formatMatrix->read(nameMap);
281 if (globaldata->gListVector != NULL) { delete globaldata->gListVector; }
282 globaldata->gListVector = formatMatrix->getListVector();
284 distFile = formatMatrix->getFormattedFileName();
286 //positions in file where the distances for each sequence begin
287 //rowPositions[1] = position in file where distance related to sequence 1 start.
288 rowPositions = formatMatrix->getRowPositions();
292 //openfile for getMap to use
293 openInputFile(distFile, inRow);
296 //globaldata->gListVector bin 0 = first name read in distance matrix, globaldata->gListVector bin 1 = second name read in distance matrix
297 if (globaldata->gListVector != NULL) {
298 vector<string> names;
300 //map names to rows in sparsematrix
301 for (int i = 0; i < globaldata->gListVector->size(); i++) {
303 binnames = globaldata->gListVector->get(i);
305 splitAtComma(binnames, names);
307 for (int j = 0; j < names.size(); j++) {
308 nameToIndex[names[j]] = i;
311 } else { m->mothurOut("error, no listvector."); m->mothurOutEndLine(); }
313 fasta = new FastaMap();
316 fasta->readFastaFile(fastafile);
318 //if user gave a namesfile then use it
319 if (namefile != "") { readNamesFile(); }
321 //set format to list so input can get listvector
322 globaldata->setFormat("list");
325 read = new ReadOTUFile(listfile);
326 read->read(&*globaldata);
328 input = globaldata->ginput;
329 list = globaldata->gListVector;
330 string lastLabel = list->getLabel();
332 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
333 set<string> processedLabels;
334 set<string> userLabels = labels;
336 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
338 if (allLines == 1 || labels.count(list->getLabel()) == 1){
339 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
340 error = process(list);
341 if (error == 1) { return 0; } //there is an error in hte input files, abort command
343 processedLabels.insert(list->getLabel());
344 userLabels.erase(list->getLabel());
347 if ((anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
348 string saveLabel = list->getLabel();
351 list = input->getListVector(lastLabel);
352 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
353 error = process(list);
354 if (error == 1) { return 0; } //there is an error in hte input files, abort command
356 processedLabels.insert(list->getLabel());
357 userLabels.erase(list->getLabel());
359 //restore real lastlabel to save below
360 list->setLabel(saveLabel);
363 lastLabel = list->getLabel();
366 list = input->getListVector();
369 //output error messages about any remaining user labels
370 bool needToRun = false;
371 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {
372 m->mothurOut("Your file does not include the label " + *it);
373 if (processedLabels.count(list->getLabel()) != 1) {
374 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
377 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
381 //run last label if you need to
382 if (needToRun == true) {
383 if (list != NULL) { delete list; }
384 list = input->getListVector(lastLabel);
385 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
386 error = process(list);
388 if (error == 1) { return 0; } //there is an error in hte input files, abort command
391 //close and remove formatted matrix file
394 remove(distFile.c_str());
397 globaldata->gListVector = NULL;
398 delete input; globaldata->ginput = NULL;
401 if (groupfile != "") {
402 delete groupMap; globaldata->gGroupmap = NULL;
405 m->mothurOutEndLine();
406 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
407 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
408 m->mothurOutEndLine();
412 catch(exception& e) {
413 m->errorOut(e, "GetOTURepCommand", "execute");
418 //**********************************************************************************************************************
419 void GetOTURepCommand::readNamesFile() {
421 vector<string> dupNames;
422 openInputFile(namefile, inNames);
424 string name, names, sequence;
427 inNames >> name; //read from first column A
428 inNames >> names; //read from second column A,B,C,D
432 //parse names into vector
433 splitAtComma(names, dupNames);
435 //store names in fasta map
436 sequence = fasta->getSequence(name);
437 for (int i = 0; i < dupNames.size(); i++) {
438 fasta->push_back(dupNames[i], sequence);
446 catch(exception& e) {
447 m->errorOut(e, "GetOTURepCommand", "readNamesFile");
451 //**********************************************************************************************************************
452 string GetOTURepCommand::findRep(int bin, string& group, ListVector* thisList, int& binsize) {
454 vector<string> names;
455 map<string, string> groups;
456 map<string, string>::iterator groupIt;
458 //parse names into vector
459 string binnames = thisList->get(bin);
460 splitAtComma(binnames, names);
461 binsize = names.size();
463 //if you have a groupfile
464 if (groupfile != "") {
465 //find the groups that are in this bin
466 for (size_t i = 0; i < names.size(); i++) {
467 string groupName = groupMap->getGroup(names[i]);
468 if (groupName == "not found") {
469 m->mothurOut(names[i] + " is missing from your group file. Please correct. "); m->mothurOutEndLine();
472 groups[groupName] = groupName;
476 //turn the groups into a string
477 for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) {
478 group += groupIt->first + "-";
481 group = group.substr(0, group.length()-1);
484 // if only 1 sequence in bin or processing the "unique" label, then
485 // the first sequence of the OTU is the representative one
486 if ((names.size() == 1) || (list->getLabel() == "unique")) {
489 vector<int> seqIndex(names.size());
490 vector<float> max_dist(names.size());
491 vector<float> total_dist(names.size());
493 //fill seqIndex and initialize sums
494 for (size_t i = 0; i < names.size(); i++) {
495 seqIndex[i] = nameToIndex[names[i]];
500 // loop through all entries in seqIndex
503 for (size_t i=0; i < seqIndex.size(); i++) {
505 if (!large) { currMap = seqVec[seqIndex[i]]; }
506 else { currMap = getMap(seqIndex[i]); }
508 for (size_t j=0; j < seqIndex.size(); j++) {
509 it = currMap.find(seqIndex[j]);
510 if (it != currMap.end()) {
511 max_dist[i] = max(max_dist[i], it->second);
512 max_dist[j] = max(max_dist[j], it->second);
513 total_dist[i] += it->second;
514 total_dist[j] += it->second;
515 }else{ //if you can't find the distance make it the cutoff
516 max_dist[i] = max(max_dist[i], cutoff);
517 max_dist[j] = max(max_dist[j], cutoff);
518 total_dist[i] += cutoff;
519 total_dist[j] += cutoff;
524 // sequence with the smallest maximum distance is the representative
525 //if tie occurs pick sequence with smallest average distance
528 for (size_t i=0; i < max_dist.size(); i++) {
529 if (max_dist[i] < min) {
532 }else if (max_dist[i] == min) {
533 float currentAverage = total_dist[minIndex] / (float) total_dist.size();
534 float newAverage = total_dist[i] / (float) total_dist.size();
536 if (newAverage < currentAverage) {
543 return(names[minIndex]);
546 catch(exception& e) {
547 m->errorOut(e, "GetOTURepCommand", "FindRep");
552 //**********************************************************************************************************************
553 int GetOTURepCommand::process(ListVector* processList) {
555 string nameRep, name, sequence;
558 if (outputDir == "") { outputDir += hasPath(listfile); }
559 string outputFileName = outputDir + getRootName(getSimpleName(listfile)) + processList->getLabel() + ".rep.fasta";
560 openOutputFile(outputFileName, out);
561 vector<repStruct> reps;
562 outputNames.push_back(outputFileName);
564 ofstream newNamesOutput;
565 string outputNamesFile = outputDir + getRootName(getSimpleName(listfile)) + processList->getLabel() + ".rep.names";
566 openOutputFile(outputNamesFile, newNamesOutput);
567 outputNames.push_back(outputNamesFile);
569 //for each bin in the list vector
570 for (int i = 0; i < processList->size(); i++) {
573 nameRep = findRep(i, groups, processList, binsize);
575 //output to new names file
576 newNamesOutput << nameRep << '\t' << processList->get(i) << endl;
578 //print out name and sequence for that bin
579 sequence = fasta->getSequence(nameRep);
581 if (sequence != "not found") {
582 if (sorted == "") { //print them out
583 nameRep = nameRep + "|" + toString(i+1);
584 nameRep = nameRep + "|" + toString(binsize);
585 if (groupfile != "") {
586 nameRep = nameRep + "|" + groups;
588 out << ">" << nameRep << endl;
589 out << sequence << endl;
591 repStruct newRep(nameRep, i+1, binsize, groups);
592 reps.push_back(newRep);
595 m->mothurOut(nameRep + " is missing from your fasta or name file. Please correct. "); m->mothurOutEndLine();
596 remove(outputFileName.c_str());
597 remove(outputNamesFile.c_str());
602 if (sorted != "") { //then sort them and print them
603 if (sorted == "name") { sort(reps.begin(), reps.end(), compareName); }
604 else if (sorted == "bin") { sort(reps.begin(), reps.end(), compareBin); }
605 else if (sorted == "size") { sort(reps.begin(), reps.end(), compareSize); }
606 else if (sorted == "group") { sort(reps.begin(), reps.end(), compareGroup); }
609 for (int i = 0; i < reps.size(); i++) {
610 string sequence = fasta->getSequence(reps[i].name);
611 string outputName = reps[i].name + "|" + toString(reps[i].bin);
612 outputName = outputName + "|" + toString(reps[i].size);
613 if (groupfile != "") {
614 outputName = outputName + "|" + reps[i].group;
616 out << ">" << outputName << endl;
617 out << sequence << endl;
622 newNamesOutput.close();
626 catch(exception& e) {
627 m->errorOut(e, "GetOTURepCommand", "process");
632 //**********************************************************************************************************************
633 SeqMap GetOTURepCommand::getMap(int row) {
637 //make sure this row exists in the file, it may not if the seq did not have any distances below the cutoff
638 if (rowPositions[row] != -1){
640 inRow.seekg(rowPositions[row]);
642 int rowNum, numDists, colNum;
645 inRow >> rowNum >> numDists;
647 for(int i = 0; i < numDists; i++) {
648 inRow >> colNum >> dist;
649 rowMap[colNum] = dist;
656 catch(exception& e) {
657 m->errorOut(e, "GetOTURepCommand", "getMap");
661 //**********************************************************************************************************************