5 * Created by Sarah Westcott on 4/6/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
10 #include "getoturepcommand.h"
11 #include "readphylip.h"
12 #include "readcolumn.h"
13 #include "formatphylip.h"
14 #include "formatcolumn.h"
18 //********************************************************************************************************************
19 //sorts lowest to highest
20 inline bool compareName(repStruct left, repStruct right){
21 return (left.name < right.name);
23 //********************************************************************************************************************
24 //sorts lowest to highest
25 inline bool compareBin(repStruct left, repStruct right){
26 return (left.bin < right.bin);
28 //********************************************************************************************************************
29 //sorts lowest to highest
30 inline bool compareSize(repStruct left, repStruct right){
31 return (left.size < right.size);
33 //********************************************************************************************************************
34 //sorts lowest to highest
35 inline bool compareGroup(repStruct left, repStruct right){
36 return (left.group < right.group);
38 //**********************************************************************************************************************
39 GetOTURepCommand::GetOTURepCommand(string option) {
41 globaldata = GlobalData::getInstance();
46 //allow user to run help
47 if (option == "help") {
50 //valid paramters for this command
51 string Array[] = {"fasta","list","label","name", "group", "sorted", "phylip","column","large","cutoff","precision","outputdir","inputdir"};
52 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
54 OptionParser parser(option);
55 map<string, string> parameters = parser.getParameters();
57 ValidParameters validParameter;
58 map<string, string>::iterator it;
60 //check to make sure all parameters are valid for command
61 for (it = parameters.begin(); it != parameters.end(); it++) {
62 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
65 //if the user changes the input directory command factory will send this info to us in the output parameter
66 string inputDir = validParameter.validFile(parameters, "inputdir", false);
67 if (inputDir == "not found"){ inputDir = ""; }
70 it = parameters.find("list");
71 //user has given a template file
72 if(it != parameters.end()){
73 path = hasPath(it->second);
74 //if the user has not given a path then, add inputdir. else leave path alone.
75 if (path == "") { parameters["list"] = inputDir + it->second; }
78 it = parameters.find("fasta");
79 //user has given a template file
80 if(it != parameters.end()){
81 path = hasPath(it->second);
82 //if the user has not given a path then, add inputdir. else leave path alone.
83 if (path == "") { parameters["fasta"] = inputDir + it->second; }
86 it = parameters.find("phylip");
87 //user has given a template file
88 if(it != parameters.end()){
89 path = hasPath(it->second);
90 //if the user has not given a path then, add inputdir. else leave path alone.
91 if (path == "") { parameters["phylip"] = inputDir + it->second; }
94 it = parameters.find("column");
95 //user has given a template file
96 if(it != parameters.end()){
97 path = hasPath(it->second);
98 //if the user has not given a path then, add inputdir. else leave path alone.
99 if (path == "") { parameters["column"] = inputDir + it->second; }
102 it = parameters.find("name");
103 //user has given a template file
104 if(it != parameters.end()){
105 path = hasPath(it->second);
106 //if the user has not given a path then, add inputdir. else leave path alone.
107 if (path == "") { parameters["name"] = inputDir + it->second; }
110 it = parameters.find("group");
111 //user has given a template file
112 if(it != parameters.end()){
113 path = hasPath(it->second);
114 //if the user has not given a path then, add inputdir. else leave path alone.
115 if (path == "") { parameters["group"] = inputDir + it->second; }
120 //if the user changes the output directory command factory will send this info to us in the output parameter
121 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
123 //check for required parameters
124 fastafile = validParameter.validFile(parameters, "fasta", true);
125 if (fastafile == "not found") { m->mothurOut("fasta is a required parameter for the get.oturep command."); m->mothurOutEndLine(); abort = true; }
126 else if (fastafile == "not open") { abort = true; }
128 listfile = validParameter.validFile(parameters, "list", true);
129 if (listfile == "not found") { m->mothurOut("list is a required parameter for the get.oturep command."); m->mothurOutEndLine(); abort = true; }
130 else if (listfile == "not open") { abort = true; }
132 phylipfile = validParameter.validFile(parameters, "phylip", true);
133 if (phylipfile == "not found") { phylipfile = ""; }
134 else if (phylipfile == "not open") { abort = true; }
135 else { distFile = phylipfile; format = "phylip"; }
137 columnfile = validParameter.validFile(parameters, "column", true);
138 if (columnfile == "not found") { columnfile = ""; }
139 else if (columnfile == "not open") { abort = true; }
140 else { distFile = columnfile; format = "column"; }
142 namefile = validParameter.validFile(parameters, "name", true);
143 if (namefile == "not open") { abort = true; }
144 else if (namefile == "not found") { namefile = ""; }
146 if ((phylipfile == "") && (columnfile == "")) { m->mothurOut("When executing a get.oturep command you must enter a phylip or a column."); m->mothurOutEndLine(); abort = true; }
147 else if ((phylipfile != "") && (columnfile != "")) { m->mothurOut("When executing a get.oturep command you must enter ONLY ONE of the following: phylip or column."); m->mothurOutEndLine(); abort = true; }
149 if (columnfile != "") { if (namefile == "") { cout << "You need to provide a namefile if you are going to use the column format." << endl; abort = true; } }
151 //check for optional parameter and set defaults
152 // ...at some point should added some additional type checking...
153 label = validParameter.validFile(parameters, "label", false);
154 if (label == "not found") { label = ""; allLines = 1; }
156 if(label != "all") { splitAtDash(label, labels); allLines = 0; }
157 else { allLines = 1; }
160 groupfile = validParameter.validFile(parameters, "group", true);
161 if (groupfile == "not open") { groupfile = ""; abort = true; }
162 else if (groupfile == "not found") { groupfile = ""; }
164 sorted = validParameter.validFile(parameters, "sorted", false); if (sorted == "not found"){ sorted = ""; }
165 if ((sorted != "") && (sorted != "name") && (sorted != "bin") && (sorted != "size") && (sorted != "group")) {
166 m->mothurOut(sorted + " is not a valid option for the sorted parameter. The only options are: name, bin, size and group. I will not sort."); m->mothurOutEndLine();
170 if ((sorted == "group") && (groupfile == "")) {
171 m->mothurOut("You must provide a groupfile to sort by group. I will not sort."); m->mothurOutEndLine();
175 string temp = validParameter.validFile(parameters, "large", false); if (temp == "not found") { temp = "F"; }
176 large = isTrue(temp);
178 temp = validParameter.validFile(parameters, "precision", false); if (temp == "not found") { temp = "100"; }
179 convert(temp, precision);
181 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "10.0"; }
182 convert(temp, cutoff);
183 cutoff += (5 / (precision * 10.0));
186 catch(exception& e) {
187 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
192 //**********************************************************************************************************************
194 void GetOTURepCommand::help(){
196 m->mothurOut("The get.oturep command parameters are phylip, column, list, fasta, name, group, large, cutoff, precision, sorted and label. The fasta and list parameters are required, as well as phylip or column and name.\n");
197 m->mothurOut("The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n");
198 m->mothurOut("The phylip or column parameter is required, but only one may be used. If you use a column file the name filename is required. \n");
199 m->mothurOut("If you do not provide a cutoff value 10.00 is assumed. If you do not provide a precision value then 100 is assumed.\n");
200 m->mothurOut("The get.oturep command should be in the following format: get.oturep(phylip=yourDistanceMatrix, fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n");
201 m->mothurOut("Example get.oturep(phylip=amazon.dist, fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups).\n");
202 m->mothurOut("The default value for label is all labels in your inputfile.\n");
203 m->mothurOut("The sorted parameter allows you to indicate you want the output sorted. You can sort by sequence name, bin number, bin size or group. The default is no sorting, but your options are name, number, size, or group.\n");
204 m->mothurOut("The large parameter allows you to indicate that your distance matrix is too large to fit in RAM. The default value is false.\n");
205 m->mothurOut("The get.oturep command outputs a .fastarep and .rep.names file for each distance you specify, selecting one OTU representative for each bin.\n");
206 m->mothurOut("If you provide a groupfile, then it also appends the names of the groups present in that bin.\n");
207 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
209 catch(exception& e) {
210 m->errorOut(e, "GetOTURepCommand", "help");
215 //**********************************************************************************************************************
217 GetOTURepCommand::~GetOTURepCommand(){}
219 //**********************************************************************************************************************
221 int GetOTURepCommand::execute(){
224 if (abort == true) { return 0; }
228 //read distance files
229 if (format == "column") { readMatrix = new ReadColumnMatrix(distFile); }
230 else if (format == "phylip") { readMatrix = new ReadPhylipMatrix(distFile); }
231 else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0; }
233 readMatrix->setCutoff(cutoff);
236 nameMap = new NameAssignment(namefile);
238 }else{ nameMap = NULL; }
240 readMatrix->read(nameMap);
242 if (m->control_pressed) { delete readMatrix; return 0; }
245 if (globaldata->gListVector != NULL) { delete globaldata->gListVector; }
246 globaldata->gListVector = readMatrix->getListVector();
248 SparseMatrix* matrix = readMatrix->getMatrix();
250 // Create a data structure to quickly access the distance information.
251 // It consists of a vector of distance maps, where each map contains
252 // all distances of a certain sequence. Vector and maps are accessed
253 // via the index of a sequence in the distance matrix
254 seqVec = vector<SeqMap>(globaldata->gListVector->size());
255 for (MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++) {
256 if (m->control_pressed) { delete readMatrix; return 0; }
257 seqVec[currentCell->row][currentCell->column] = currentCell->dist;
264 if (m->control_pressed) { return 0; }
266 //process file and set up indexes
267 if (format == "column") { formatMatrix = new FormatColumnMatrix(distFile); }
268 else if (format == "phylip") { formatMatrix = new FormatPhylipMatrix(distFile); }
269 else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0; }
271 formatMatrix->setCutoff(cutoff);
274 nameMap = new NameAssignment(namefile);
276 }else{ nameMap = NULL; }
278 formatMatrix->read(nameMap);
280 if (m->control_pressed) { delete formatMatrix; return 0; }
283 if (globaldata->gListVector != NULL) { delete globaldata->gListVector; }
284 globaldata->gListVector = formatMatrix->getListVector();
286 distFile = formatMatrix->getFormattedFileName();
288 //positions in file where the distances for each sequence begin
289 //rowPositions[1] = position in file where distance related to sequence 1 start.
290 rowPositions = formatMatrix->getRowPositions();
295 //openfile for getMap to use
296 openInputFile(distFile, inRow);
298 if (m->control_pressed) { inRow.close(); remove(distFile.c_str()); return 0; }
302 //globaldata->gListVector bin 0 = first name read in distance matrix, globaldata->gListVector bin 1 = second name read in distance matrix
303 if (globaldata->gListVector != NULL) {
304 vector<string> names;
306 //map names to rows in sparsematrix
307 for (int i = 0; i < globaldata->gListVector->size(); i++) {
309 binnames = globaldata->gListVector->get(i);
311 splitAtComma(binnames, names);
313 for (int j = 0; j < names.size(); j++) {
314 nameToIndex[names[j]] = i;
317 } else { m->mothurOut("error, no listvector."); m->mothurOutEndLine(); }
320 if (m->control_pressed) {
321 if (large) { inRow.close(); remove(distFile.c_str()); }
325 //set format to list so input can get listvector
326 globaldata->setFormat("list");
329 read = new ReadOTUFile(listfile);
330 read->read(&*globaldata);
332 input = globaldata->ginput;
333 list = globaldata->gListVector;
334 string lastLabel = list->getLabel();
336 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
337 set<string> processedLabels;
338 set<string> userLabels = labels;
340 if (m->control_pressed) {
341 if (large) { inRow.close(); remove(distFile.c_str()); }
342 delete read; delete input; delete list; globaldata->gListVector = NULL; return 0;
345 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
347 if (allLines == 1 || labels.count(list->getLabel()) == 1){
348 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
349 error = process(list);
350 if (error == 1) { return 0; } //there is an error in hte input files, abort command
352 if (m->control_pressed) {
353 if (large) { inRow.close(); remove(distFile.c_str()); }
354 for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); }
355 delete read; delete input; delete list; globaldata->gListVector = NULL; return 0;
358 processedLabels.insert(list->getLabel());
359 userLabels.erase(list->getLabel());
362 if ((anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
363 string saveLabel = list->getLabel();
366 list = input->getListVector(lastLabel);
367 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
368 error = process(list);
369 if (error == 1) { return 0; } //there is an error in hte input files, abort command
371 if (m->control_pressed) {
372 if (large) { inRow.close(); remove(distFile.c_str()); }
373 for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); }
374 delete read; delete input; delete list; globaldata->gListVector = NULL; return 0;
377 processedLabels.insert(list->getLabel());
378 userLabels.erase(list->getLabel());
380 //restore real lastlabel to save below
381 list->setLabel(saveLabel);
384 lastLabel = list->getLabel();
387 list = input->getListVector();
390 //output error messages about any remaining user labels
391 bool needToRun = false;
392 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {
393 m->mothurOut("Your file does not include the label " + *it);
394 if (processedLabels.count(list->getLabel()) != 1) {
395 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
398 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
402 //run last label if you need to
403 if (needToRun == true) {
404 if (list != NULL) { delete list; }
405 list = input->getListVector(lastLabel);
406 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
407 error = process(list);
409 if (error == 1) { return 0; } //there is an error in hte input files, abort command
411 if (m->control_pressed) {
412 if (large) { inRow.close(); remove(distFile.c_str()); }
413 for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); }
414 delete read; delete input; delete list; globaldata->gListVector = NULL; return 0;
418 //close and remove formatted matrix file
421 remove(distFile.c_str());
424 globaldata->gListVector = NULL;
425 delete input; globaldata->ginput = NULL;
428 if (groupfile != "") {
429 //read in group map info.
430 groupMap = new GroupMap(groupfile);
431 int error = groupMap->readMap();
432 if (error == 1) { delete groupMap; m->mothurOut("Error reading your groupfile. Proceeding without groupfile."); m->mothurOutEndLine(); groupfile = ""; }
436 fasta = new FastaMap();
437 fasta->readFastaFile(fastafile);
439 //if user gave a namesfile then use it
440 if (namefile != "") { readNamesFile(); }
442 //output create and output the .rep.fasta files
443 map<string, string>::iterator itNameFile;
444 for (itNameFile = outputNameFiles.begin(); itNameFile != outputNameFiles.end(); itNameFile++) {
445 processNames(itNameFile->first, itNameFile->second);
449 if (groupfile != "") {
450 delete groupMap; globaldata->gGroupmap = NULL;
453 if (m->control_pressed) { return 0; }
455 m->mothurOutEndLine();
456 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
457 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
458 m->mothurOutEndLine();
462 catch(exception& e) {
463 m->errorOut(e, "GetOTURepCommand", "execute");
468 //**********************************************************************************************************************
469 void GetOTURepCommand::readNamesFile() {
471 vector<string> dupNames;
472 openInputFile(namefile, inNames);
474 string name, names, sequence;
477 inNames >> name; //read from first column A
478 inNames >> names; //read from second column A,B,C,D
482 //parse names into vector
483 splitAtComma(names, dupNames);
485 //store names in fasta map
486 sequence = fasta->getSequence(name);
487 for (int i = 0; i < dupNames.size(); i++) {
488 fasta->push_back(dupNames[i], sequence);
496 catch(exception& e) {
497 m->errorOut(e, "GetOTURepCommand", "readNamesFile");
501 //**********************************************************************************************************************
502 string GetOTURepCommand::findRep(int bin, ListVector* thisList) {
504 vector<string> names;
505 map<string, string> groups;
506 map<string, string>::iterator groupIt;
508 //parse names into vector
509 string binnames = thisList->get(bin);
510 splitAtComma(binnames, names);
512 // if only 1 sequence in bin or processing the "unique" label, then
513 // the first sequence of the OTU is the representative one
514 if ((names.size() == 1) || (list->getLabel() == "unique")) {
517 vector<int> seqIndex(names.size());
518 vector<float> max_dist(names.size());
519 vector<float> total_dist(names.size());
521 //fill seqIndex and initialize sums
522 for (size_t i = 0; i < names.size(); i++) {
523 seqIndex[i] = nameToIndex[names[i]];
528 // loop through all entries in seqIndex
531 for (size_t i=0; i < seqIndex.size(); i++) {
532 if (m->control_pressed) { return "control"; }
534 if (!large) { currMap = seqVec[seqIndex[i]]; }
535 else { currMap = getMap(seqIndex[i]); }
537 for (size_t j=0; j < seqIndex.size(); j++) {
538 it = currMap.find(seqIndex[j]);
539 if (it != currMap.end()) {
540 max_dist[i] = max(max_dist[i], it->second);
541 max_dist[j] = max(max_dist[j], it->second);
542 total_dist[i] += it->second;
543 total_dist[j] += it->second;
544 }else{ //if you can't find the distance make it the cutoff
545 max_dist[i] = max(max_dist[i], cutoff);
546 max_dist[j] = max(max_dist[j], cutoff);
547 total_dist[i] += cutoff;
548 total_dist[j] += cutoff;
553 // sequence with the smallest maximum distance is the representative
554 //if tie occurs pick sequence with smallest average distance
557 for (size_t i=0; i < max_dist.size(); i++) {
558 if (m->control_pressed) { return "control"; }
559 if (max_dist[i] < min) {
562 }else if (max_dist[i] == min) {
563 float currentAverage = total_dist[minIndex] / (float) total_dist.size();
564 float newAverage = total_dist[i] / (float) total_dist.size();
566 if (newAverage < currentAverage) {
573 return(names[minIndex]);
576 catch(exception& e) {
577 m->errorOut(e, "GetOTURepCommand", "FindRep");
582 //**********************************************************************************************************************
583 int GetOTURepCommand::process(ListVector* processList) {
585 string nameRep, name, sequence;
588 if (outputDir == "") { outputDir += hasPath(listfile); }
590 ofstream newNamesOutput;
591 string outputNamesFile = outputDir + getRootName(getSimpleName(listfile)) + processList->getLabel() + ".rep.names";
592 openOutputFile(outputNamesFile, newNamesOutput);
593 outputNames.push_back(outputNamesFile);
594 outputNameFiles[outputNamesFile] = processList->getLabel();
596 //for each bin in the list vector
597 for (int i = 0; i < processList->size(); i++) {
598 nameRep = findRep(i, processList);
600 if (m->control_pressed) { out.close(); newNamesOutput.close(); return 0; }
602 //output to new names file
603 newNamesOutput << nameRep << '\t' << processList->get(i) << endl;
606 newNamesOutput.close();
610 catch(exception& e) {
611 m->errorOut(e, "GetOTURepCommand", "process");
615 //**********************************************************************************************************************
616 int GetOTURepCommand::processNames(string filename, string label) {
620 if (outputDir == "") { outputDir += hasPath(listfile); }
621 string outputFileName = outputDir + getRootName(getSimpleName(listfile)) + label + ".rep.fasta";
622 openOutputFile(outputFileName, out);
623 vector<repStruct> reps;
624 outputNames.push_back(outputFileName);
627 openInputFile(filename, in);
631 string rep, binnames;
632 in >> rep >> binnames; gobble(in);
634 vector<string> names;
635 splitAtComma(binnames, names);
636 int binsize = names.size();
638 //if you have a groupfile
640 if (groupfile != "") {
641 map<string, string> groups;
642 map<string, string>::iterator groupIt;
644 //find the groups that are in this bin
645 for (size_t i = 0; i < names.size(); i++) {
646 string groupName = groupMap->getGroup(names[i]);
647 if (groupName == "not found") {
648 m->mothurOut(names[i] + " is missing from your group file. Please correct. "); m->mothurOutEndLine();
651 groups[groupName] = groupName;
655 //turn the groups into a string
656 for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) {
657 group += groupIt->first + "-";
660 group = group.substr(0, group.length()-1);
664 //print out name and sequence for that bin
665 string sequence = fasta->getSequence(rep);
667 if (sequence != "not found") {
668 if (sorted == "") { //print them out
669 rep = rep + "|" + toString(i+1);
670 rep = rep + "|" + toString(binsize);
671 if (groupfile != "") {
672 rep = rep + "|" + group;
674 out << ">" << rep << endl;
675 out << sequence << endl;
677 repStruct newRep(rep, i+1, binsize, group);
678 reps.push_back(newRep);
681 m->mothurOut(rep + " is missing from your fasta or name file, ignoring. Please correct."); m->mothurOutEndLine();
687 if (sorted != "") { //then sort them and print them
688 if (sorted == "name") { sort(reps.begin(), reps.end(), compareName); }
689 else if (sorted == "bin") { sort(reps.begin(), reps.end(), compareBin); }
690 else if (sorted == "size") { sort(reps.begin(), reps.end(), compareSize); }
691 else if (sorted == "group") { sort(reps.begin(), reps.end(), compareGroup); }
694 for (int i = 0; i < reps.size(); i++) {
695 string sequence = fasta->getSequence(reps[i].name);
696 string outputName = reps[i].name + "|" + toString(reps[i].bin);
697 outputName = outputName + "|" + toString(reps[i].size);
698 if (groupfile != "") {
699 outputName = outputName + "|" + reps[i].group;
701 out << ">" << outputName << endl;
702 out << sequence << endl;
711 catch(exception& e) {
712 m->errorOut(e, "GetOTURepCommand", "processNames");
716 //**********************************************************************************************************************
717 SeqMap GetOTURepCommand::getMap(int row) {
721 //make sure this row exists in the file, it may not if the seq did not have any distances below the cutoff
722 if (rowPositions[row] != -1){
724 inRow.seekg(rowPositions[row]);
726 int rowNum, numDists, colNum;
729 inRow >> rowNum >> numDists;
731 for(int i = 0; i < numDists; i++) {
732 inRow >> colNum >> dist;
733 rowMap[colNum] = dist;
740 catch(exception& e) {
741 m->errorOut(e, "GetOTURepCommand", "getMap");
745 //**********************************************************************************************************************