5 * Created by Sarah Westcott on 4/6/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
10 #include "getoturepcommand.h"
11 #include "readphylip.h"
12 #include "readcolumn.h"
13 #include "formatphylip.h"
14 #include "formatcolumn.h"
18 //********************************************************************************************************************
19 //sorts lowest to highest
20 inline bool compareName(repStruct left, repStruct right){
21 return (left.name < right.name);
23 //********************************************************************************************************************
24 //sorts lowest to highest
25 inline bool compareBin(repStruct left, repStruct right){
26 return (left.bin < right.bin);
28 //********************************************************************************************************************
29 //sorts lowest to highest
30 inline bool compareSize(repStruct left, repStruct right){
31 return (left.size < right.size);
33 //********************************************************************************************************************
34 //sorts lowest to highest
35 inline bool compareGroup(repStruct left, repStruct right){
36 return (left.group < right.group);
38 //**********************************************************************************************************************
39 GetOTURepCommand::GetOTURepCommand(string option) {
41 globaldata = GlobalData::getInstance();
46 //allow user to run help
47 if (option == "help") {
50 //valid paramters for this command
51 string Array[] = {"fasta","list","label","name", "group", "sorted", "phylip","column","large","cutoff","precision","outputdir","inputdir"};
52 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
54 OptionParser parser(option);
55 map<string, string> parameters = parser.getParameters();
57 ValidParameters validParameter;
58 map<string, string>::iterator it;
60 //check to make sure all parameters are valid for command
61 for (it = parameters.begin(); it != parameters.end(); it++) {
62 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
65 //if the user changes the input directory command factory will send this info to us in the output parameter
66 string inputDir = validParameter.validFile(parameters, "inputdir", false);
67 if (inputDir == "not found"){ inputDir = ""; }
70 it = parameters.find("list");
71 //user has given a template file
72 if(it != parameters.end()){
73 path = hasPath(it->second);
74 //if the user has not given a path then, add inputdir. else leave path alone.
75 if (path == "") { parameters["list"] = inputDir + it->second; }
78 it = parameters.find("fasta");
79 //user has given a template file
80 if(it != parameters.end()){
81 path = hasPath(it->second);
82 //if the user has not given a path then, add inputdir. else leave path alone.
83 if (path == "") { parameters["fasta"] = inputDir + it->second; }
86 it = parameters.find("phylip");
87 //user has given a template file
88 if(it != parameters.end()){
89 path = hasPath(it->second);
90 //if the user has not given a path then, add inputdir. else leave path alone.
91 if (path == "") { parameters["phylip"] = inputDir + it->second; }
94 it = parameters.find("column");
95 //user has given a template file
96 if(it != parameters.end()){
97 path = hasPath(it->second);
98 //if the user has not given a path then, add inputdir. else leave path alone.
99 if (path == "") { parameters["column"] = inputDir + it->second; }
102 it = parameters.find("name");
103 //user has given a template file
104 if(it != parameters.end()){
105 path = hasPath(it->second);
106 //if the user has not given a path then, add inputdir. else leave path alone.
107 if (path == "") { parameters["name"] = inputDir + it->second; }
110 it = parameters.find("group");
111 //user has given a template file
112 if(it != parameters.end()){
113 path = hasPath(it->second);
114 //if the user has not given a path then, add inputdir. else leave path alone.
115 if (path == "") { parameters["group"] = inputDir + it->second; }
120 //if the user changes the output directory command factory will send this info to us in the output parameter
121 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
123 //check for required parameters
124 fastafile = validParameter.validFile(parameters, "fasta", true);
125 if (fastafile == "not found") { m->mothurOut("fasta is a required parameter for the get.oturep command."); m->mothurOutEndLine(); abort = true; }
126 else if (fastafile == "not open") { abort = true; }
128 listfile = validParameter.validFile(parameters, "list", true);
129 if (listfile == "not found") { m->mothurOut("list is a required parameter for the get.oturep command."); m->mothurOutEndLine(); abort = true; }
130 else if (listfile == "not open") { abort = true; }
132 phylipfile = validParameter.validFile(parameters, "phylip", true);
133 if (phylipfile == "not found") { phylipfile = ""; }
134 else if (phylipfile == "not open") { abort = true; }
135 else { distFile = phylipfile; format = "phylip"; }
137 columnfile = validParameter.validFile(parameters, "column", true);
138 if (columnfile == "not found") { columnfile = ""; }
139 else if (columnfile == "not open") { abort = true; }
140 else { distFile = columnfile; format = "column"; }
142 namefile = validParameter.validFile(parameters, "name", true);
143 if (namefile == "not open") { abort = true; }
144 else if (namefile == "not found") { namefile = ""; }
146 if ((phylipfile == "") && (columnfile == "")) { m->mothurOut("When executing a get.oturep command you must enter a phylip or a column."); m->mothurOutEndLine(); abort = true; }
147 else if ((phylipfile != "") && (columnfile != "")) { m->mothurOut("When executing a get.oturep command you must enter ONLY ONE of the following: phylip or column."); m->mothurOutEndLine(); abort = true; }
149 if (columnfile != "") { if (namefile == "") { cout << "You need to provide a namefile if you are going to use the column format." << endl; abort = true; } }
151 //check for optional parameter and set defaults
152 // ...at some point should added some additional type checking...
153 label = validParameter.validFile(parameters, "label", false);
154 if (label == "not found") { label = ""; allLines = 1; }
156 if(label != "all") { splitAtDash(label, labels); allLines = 0; }
157 else { allLines = 1; }
160 groupfile = validParameter.validFile(parameters, "group", true);
161 if (groupfile == "not open") { groupfile = ""; abort = true; }
162 else if (groupfile == "not found") { groupfile = ""; }
164 //read in group map info.
165 groupMap = new GroupMap(groupfile);
166 int error = groupMap->readMap();
167 if (error == 1) { delete groupMap; abort = true; }
170 sorted = validParameter.validFile(parameters, "sorted", false); if (sorted == "not found"){ sorted = ""; }
171 if ((sorted != "") && (sorted != "name") && (sorted != "bin") && (sorted != "size") && (sorted != "group")) {
172 m->mothurOut(sorted + " is not a valid option for the sorted parameter. The only options are: name, bin, size and group. I will not sort."); m->mothurOutEndLine();
176 if ((sorted == "group") && (groupfile == "")) {
177 m->mothurOut("You must provide a groupfile to sort by group. I will not sort."); m->mothurOutEndLine();
181 string temp = validParameter.validFile(parameters, "large", false); if (temp == "not found") { temp = "F"; }
182 large = isTrue(temp);
184 temp = validParameter.validFile(parameters, "precision", false); if (temp == "not found") { temp = "100"; }
185 convert(temp, precision);
187 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "10.0"; }
188 convert(temp, cutoff);
189 cutoff += (5 / (precision * 10.0));
192 catch(exception& e) {
193 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
198 //**********************************************************************************************************************
200 void GetOTURepCommand::help(){
202 m->mothurOut("The get.oturep command parameters are phylip, column, list, fasta, name, group, large, cutoff, precision, sorted and label. The fasta and list parameters are required, as well as phylip or column and name.\n");
203 m->mothurOut("The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n");
204 m->mothurOut("The phylip or column parameter is required, but only one may be used. If you use a column file the name filename is required. \n");
205 m->mothurOut("If you do not provide a cutoff value 10.00 is assumed. If you do not provide a precision value then 100 is assumed.\n");
206 m->mothurOut("The get.oturep command should be in the following format: get.oturep(phylip=yourDistanceMatrix, fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n");
207 m->mothurOut("Example get.oturep(phylip=amazon.dist, fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups).\n");
208 m->mothurOut("The default value for label is all labels in your inputfile.\n");
209 m->mothurOut("The sorted parameter allows you to indicate you want the output sorted. You can sort by sequence name, bin number, bin size or group. The default is no sorting, but your options are name, number, size, or group.\n");
210 m->mothurOut("The large parameter allows you to indicate that your distance matrix is too large to fit in RAM. The default value is false.\n");
211 m->mothurOut("The get.oturep command outputs a .fastarep and .rep.names file for each distance you specify, selecting one OTU representative for each bin.\n");
212 m->mothurOut("If you provide a groupfile, then it also appends the names of the groups present in that bin.\n");
213 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
215 catch(exception& e) {
216 m->errorOut(e, "GetOTURepCommand", "help");
221 //**********************************************************************************************************************
223 GetOTURepCommand::~GetOTURepCommand(){}
225 //**********************************************************************************************************************
227 int GetOTURepCommand::execute(){
230 if (abort == true) { return 0; }
234 //read distance files
235 if (format == "column") { readMatrix = new ReadColumnMatrix(distFile); }
236 else if (format == "phylip") { readMatrix = new ReadPhylipMatrix(distFile); }
237 else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0; }
239 readMatrix->setCutoff(cutoff);
242 nameMap = new NameAssignment(namefile);
244 }else{ nameMap = NULL; }
246 readMatrix->read(nameMap);
248 if (m->control_pressed) { delete readMatrix; delete groupMap; return 0; }
251 if (globaldata->gListVector != NULL) { delete globaldata->gListVector; }
252 globaldata->gListVector = readMatrix->getListVector();
254 SparseMatrix* matrix = readMatrix->getMatrix();
256 // Create a data structure to quickly access the distance information.
257 // It consists of a vector of distance maps, where each map contains
258 // all distances of a certain sequence. Vector and maps are accessed
259 // via the index of a sequence in the distance matrix
260 seqVec = vector<SeqMap>(globaldata->gListVector->size());
261 for (MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++) {
262 if (m->control_pressed) { delete readMatrix; delete groupMap; return 0; }
263 seqVec[currentCell->row][currentCell->column] = currentCell->dist;
269 if (m->control_pressed) { delete groupMap; return 0; }
271 //process file and set up indexes
272 if (format == "column") { formatMatrix = new FormatColumnMatrix(distFile); }
273 else if (format == "phylip") { formatMatrix = new FormatPhylipMatrix(distFile); }
274 else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0; }
276 formatMatrix->setCutoff(cutoff);
279 nameMap = new NameAssignment(namefile);
281 }else{ nameMap = NULL; }
283 formatMatrix->read(nameMap);
285 if (m->control_pressed) { delete formatMatrix; delete groupMap; return 0; }
288 if (globaldata->gListVector != NULL) { delete globaldata->gListVector; }
289 globaldata->gListVector = formatMatrix->getListVector();
291 distFile = formatMatrix->getFormattedFileName();
293 //positions in file where the distances for each sequence begin
294 //rowPositions[1] = position in file where distance related to sequence 1 start.
295 rowPositions = formatMatrix->getRowPositions();
299 //openfile for getMap to use
300 openInputFile(distFile, inRow);
302 if (m->control_pressed) { inRow.close(); remove(distFile.c_str()); delete groupMap; return 0; }
306 //globaldata->gListVector bin 0 = first name read in distance matrix, globaldata->gListVector bin 1 = second name read in distance matrix
307 if (globaldata->gListVector != NULL) {
308 vector<string> names;
310 //map names to rows in sparsematrix
311 for (int i = 0; i < globaldata->gListVector->size(); i++) {
313 binnames = globaldata->gListVector->get(i);
315 splitAtComma(binnames, names);
317 for (int j = 0; j < names.size(); j++) {
318 nameToIndex[names[j]] = i;
321 } else { m->mothurOut("error, no listvector."); m->mothurOutEndLine(); }
323 fasta = new FastaMap();
326 fasta->readFastaFile(fastafile);
328 if (m->control_pressed) {
329 if (large) { inRow.close(); remove(distFile.c_str()); }
330 delete groupMap; delete fasta; return 0;
333 //if user gave a namesfile then use it
334 if (namefile != "") { readNamesFile(); }
336 //set format to list so input can get listvector
337 globaldata->setFormat("list");
340 read = new ReadOTUFile(listfile);
341 read->read(&*globaldata);
343 input = globaldata->ginput;
344 list = globaldata->gListVector;
345 string lastLabel = list->getLabel();
347 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
348 set<string> processedLabels;
349 set<string> userLabels = labels;
351 if (m->control_pressed) {
352 if (large) { inRow.close(); remove(distFile.c_str()); }
353 delete groupMap; delete fasta; delete read; delete input; delete list; globaldata->gListVector = NULL; return 0;
356 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
358 if (allLines == 1 || labels.count(list->getLabel()) == 1){
359 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
360 error = process(list);
361 if (error == 1) { return 0; } //there is an error in hte input files, abort command
363 if (m->control_pressed) {
364 if (large) { inRow.close(); remove(distFile.c_str()); }
365 if (groupfile != "") { delete groupMap; globaldata->gGroupmap = NULL; }
366 for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); }
367 delete fasta; delete read; delete input; delete list; globaldata->gListVector = NULL; return 0;
370 processedLabels.insert(list->getLabel());
371 userLabels.erase(list->getLabel());
374 if ((anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
375 string saveLabel = list->getLabel();
378 list = input->getListVector(lastLabel);
379 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
380 error = process(list);
381 if (error == 1) { return 0; } //there is an error in hte input files, abort command
383 if (m->control_pressed) {
384 if (large) { inRow.close(); remove(distFile.c_str()); }
385 if (groupfile != "") { delete groupMap; globaldata->gGroupmap = NULL; }
386 for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); }
387 delete fasta; delete read; delete input; delete list; globaldata->gListVector = NULL; return 0;
390 processedLabels.insert(list->getLabel());
391 userLabels.erase(list->getLabel());
393 //restore real lastlabel to save below
394 list->setLabel(saveLabel);
397 lastLabel = list->getLabel();
400 list = input->getListVector();
403 //output error messages about any remaining user labels
404 bool needToRun = false;
405 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {
406 m->mothurOut("Your file does not include the label " + *it);
407 if (processedLabels.count(list->getLabel()) != 1) {
408 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
411 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
415 //run last label if you need to
416 if (needToRun == true) {
417 if (list != NULL) { delete list; }
418 list = input->getListVector(lastLabel);
419 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
420 error = process(list);
422 if (error == 1) { return 0; } //there is an error in hte input files, abort command
424 if (m->control_pressed) {
425 if (large) { inRow.close(); remove(distFile.c_str()); }
426 if (groupfile != "") { delete groupMap; globaldata->gGroupmap = NULL; }
427 for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); }
428 delete fasta; delete read; delete input; delete list; globaldata->gListVector = NULL; return 0;
432 //close and remove formatted matrix file
435 remove(distFile.c_str());
438 globaldata->gListVector = NULL;
439 delete input; globaldata->ginput = NULL;
442 if (groupfile != "") {
443 delete groupMap; globaldata->gGroupmap = NULL;
446 if (m->control_pressed) { return 0; }
448 m->mothurOutEndLine();
449 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
450 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
451 m->mothurOutEndLine();
455 catch(exception& e) {
456 m->errorOut(e, "GetOTURepCommand", "execute");
461 //**********************************************************************************************************************
462 void GetOTURepCommand::readNamesFile() {
464 vector<string> dupNames;
465 openInputFile(namefile, inNames);
467 string name, names, sequence;
470 inNames >> name; //read from first column A
471 inNames >> names; //read from second column A,B,C,D
475 //parse names into vector
476 splitAtComma(names, dupNames);
478 //store names in fasta map
479 sequence = fasta->getSequence(name);
480 for (int i = 0; i < dupNames.size(); i++) {
481 fasta->push_back(dupNames[i], sequence);
489 catch(exception& e) {
490 m->errorOut(e, "GetOTURepCommand", "readNamesFile");
494 //**********************************************************************************************************************
495 string GetOTURepCommand::findRep(int bin, string& group, ListVector* thisList, int& binsize) {
497 vector<string> names;
498 map<string, string> groups;
499 map<string, string>::iterator groupIt;
501 //parse names into vector
502 string binnames = thisList->get(bin);
503 splitAtComma(binnames, names);
504 binsize = names.size();
506 //if you have a groupfile
507 if (groupfile != "") {
508 //find the groups that are in this bin
509 for (size_t i = 0; i < names.size(); i++) {
510 string groupName = groupMap->getGroup(names[i]);
511 if (groupName == "not found") {
512 m->mothurOut(names[i] + " is missing from your group file. Please correct. "); m->mothurOutEndLine();
515 groups[groupName] = groupName;
519 //turn the groups into a string
520 for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) {
521 group += groupIt->first + "-";
524 group = group.substr(0, group.length()-1);
527 // if only 1 sequence in bin or processing the "unique" label, then
528 // the first sequence of the OTU is the representative one
529 if ((names.size() == 1) || (list->getLabel() == "unique")) {
532 vector<int> seqIndex(names.size());
533 vector<float> max_dist(names.size());
534 vector<float> total_dist(names.size());
536 //fill seqIndex and initialize sums
537 for (size_t i = 0; i < names.size(); i++) {
538 seqIndex[i] = nameToIndex[names[i]];
543 // loop through all entries in seqIndex
546 for (size_t i=0; i < seqIndex.size(); i++) {
547 if (m->control_pressed) { return "control"; }
549 if (!large) { currMap = seqVec[seqIndex[i]]; }
550 else { currMap = getMap(seqIndex[i]); }
552 for (size_t j=0; j < seqIndex.size(); j++) {
553 it = currMap.find(seqIndex[j]);
554 if (it != currMap.end()) {
555 max_dist[i] = max(max_dist[i], it->second);
556 max_dist[j] = max(max_dist[j], it->second);
557 total_dist[i] += it->second;
558 total_dist[j] += it->second;
559 }else{ //if you can't find the distance make it the cutoff
560 max_dist[i] = max(max_dist[i], cutoff);
561 max_dist[j] = max(max_dist[j], cutoff);
562 total_dist[i] += cutoff;
563 total_dist[j] += cutoff;
568 // sequence with the smallest maximum distance is the representative
569 //if tie occurs pick sequence with smallest average distance
572 for (size_t i=0; i < max_dist.size(); i++) {
573 if (m->control_pressed) { return "control"; }
574 if (max_dist[i] < min) {
577 }else if (max_dist[i] == min) {
578 float currentAverage = total_dist[minIndex] / (float) total_dist.size();
579 float newAverage = total_dist[i] / (float) total_dist.size();
581 if (newAverage < currentAverage) {
588 return(names[minIndex]);
591 catch(exception& e) {
592 m->errorOut(e, "GetOTURepCommand", "FindRep");
597 //**********************************************************************************************************************
598 int GetOTURepCommand::process(ListVector* processList) {
600 string nameRep, name, sequence;
603 if (outputDir == "") { outputDir += hasPath(listfile); }
604 string outputFileName = outputDir + getRootName(getSimpleName(listfile)) + processList->getLabel() + ".rep.fasta";
605 openOutputFile(outputFileName, out);
606 vector<repStruct> reps;
607 outputNames.push_back(outputFileName);
609 ofstream newNamesOutput;
610 string outputNamesFile = outputDir + getRootName(getSimpleName(listfile)) + processList->getLabel() + ".rep.names";
611 openOutputFile(outputNamesFile, newNamesOutput);
612 outputNames.push_back(outputNamesFile);
614 //for each bin in the list vector
615 for (int i = 0; i < processList->size(); i++) {
619 if (m->control_pressed) { out.close(); newNamesOutput.close(); return 0; }
621 nameRep = findRep(i, groups, processList, binsize);
623 if (m->control_pressed) { out.close(); newNamesOutput.close(); return 0; }
625 //output to new names file
626 newNamesOutput << nameRep << '\t' << processList->get(i) << endl;
628 //print out name and sequence for that bin
629 sequence = fasta->getSequence(nameRep);
631 if (sequence != "not found") {
632 if (sorted == "") { //print them out
633 nameRep = nameRep + "|" + toString(i+1);
634 nameRep = nameRep + "|" + toString(binsize);
635 if (groupfile != "") {
636 nameRep = nameRep + "|" + groups;
638 out << ">" << nameRep << endl;
639 out << sequence << endl;
641 repStruct newRep(nameRep, i+1, binsize, groups);
642 reps.push_back(newRep);
645 m->mothurOut(nameRep + " is missing from your fasta or name file. Please correct. "); m->mothurOutEndLine();
646 remove(outputFileName.c_str());
647 remove(outputNamesFile.c_str());
652 if (sorted != "") { //then sort them and print them
653 if (sorted == "name") { sort(reps.begin(), reps.end(), compareName); }
654 else if (sorted == "bin") { sort(reps.begin(), reps.end(), compareBin); }
655 else if (sorted == "size") { sort(reps.begin(), reps.end(), compareSize); }
656 else if (sorted == "group") { sort(reps.begin(), reps.end(), compareGroup); }
659 for (int i = 0; i < reps.size(); i++) {
660 string sequence = fasta->getSequence(reps[i].name);
661 string outputName = reps[i].name + "|" + toString(reps[i].bin);
662 outputName = outputName + "|" + toString(reps[i].size);
663 if (groupfile != "") {
664 outputName = outputName + "|" + reps[i].group;
666 out << ">" << outputName << endl;
667 out << sequence << endl;
672 newNamesOutput.close();
676 catch(exception& e) {
677 m->errorOut(e, "GetOTURepCommand", "process");
682 //**********************************************************************************************************************
683 SeqMap GetOTURepCommand::getMap(int row) {
687 //make sure this row exists in the file, it may not if the seq did not have any distances below the cutoff
688 if (rowPositions[row] != -1){
690 inRow.seekg(rowPositions[row]);
692 int rowNum, numDists, colNum;
695 inRow >> rowNum >> numDists;
697 for(int i = 0; i < numDists; i++) {
698 inRow >> colNum >> dist;
699 rowMap[colNum] = dist;
706 catch(exception& e) {
707 m->errorOut(e, "GetOTURepCommand", "getMap");
711 //**********************************************************************************************************************