5 * Created by Sarah Westcott on 4/6/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
10 #include "getoturepcommand.h"
11 #include "readphylip.h"
12 #include "readcolumn.h"
13 #include "formatphylip.h"
14 #include "formatcolumn.h"
18 //********************************************************************************************************************
19 //sorts lowest to highest
20 inline bool compareName(repStruct left, repStruct right){
21 return (left.name < right.name);
23 //********************************************************************************************************************
24 //sorts lowest to highest
25 inline bool compareBin(repStruct left, repStruct right){
26 return (left.bin < right.bin);
28 //********************************************************************************************************************
29 //sorts lowest to highest
30 inline bool compareSize(repStruct left, repStruct right){
31 return (left.size < right.size);
33 //********************************************************************************************************************
34 //sorts lowest to highest
35 inline bool compareGroup(repStruct left, repStruct right){
36 return (left.group < right.group);
38 //**********************************************************************************************************************
39 GetOTURepCommand::GetOTURepCommand(string option){
41 globaldata = GlobalData::getInstance();
46 //allow user to run help
47 if (option == "help") {
50 //valid paramters for this command
51 string Array[] = {"fasta","list","label","name", "group", "sorted", "phylip","column","large","cutoff","precision","outputdir","inputdir"};
52 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
54 OptionParser parser(option);
55 map<string, string> parameters = parser.getParameters();
57 ValidParameters validParameter;
58 map<string, string>::iterator it;
60 //check to make sure all parameters are valid for command
61 for (it = parameters.begin(); it != parameters.end(); it++) {
62 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
65 //if the user changes the input directory command factory will send this info to us in the output parameter
66 string inputDir = validParameter.validFile(parameters, "inputdir", false);
67 if (inputDir == "not found"){ inputDir = ""; }
70 it = parameters.find("list");
71 //user has given a template file
72 if(it != parameters.end()){
73 path = hasPath(it->second);
74 //if the user has not given a path then, add inputdir. else leave path alone.
75 if (path == "") { parameters["list"] = inputDir + it->second; }
78 it = parameters.find("fasta");
79 //user has given a template file
80 if(it != parameters.end()){
81 path = hasPath(it->second);
82 //if the user has not given a path then, add inputdir. else leave path alone.
83 if (path == "") { parameters["fasta"] = inputDir + it->second; }
86 it = parameters.find("phylip");
87 //user has given a template file
88 if(it != parameters.end()){
89 path = hasPath(it->second);
90 //if the user has not given a path then, add inputdir. else leave path alone.
91 if (path == "") { parameters["phylip"] = inputDir + it->second; }
94 it = parameters.find("column");
95 //user has given a template file
96 if(it != parameters.end()){
97 path = hasPath(it->second);
98 //if the user has not given a path then, add inputdir. else leave path alone.
99 if (path == "") { parameters["column"] = inputDir + it->second; }
102 it = parameters.find("name");
103 //user has given a template file
104 if(it != parameters.end()){
105 path = hasPath(it->second);
106 //if the user has not given a path then, add inputdir. else leave path alone.
107 if (path == "") { parameters["name"] = inputDir + it->second; }
110 it = parameters.find("group");
111 //user has given a template file
112 if(it != parameters.end()){
113 path = hasPath(it->second);
114 //if the user has not given a path then, add inputdir. else leave path alone.
115 if (path == "") { parameters["group"] = inputDir + it->second; }
120 //if the user changes the output directory command factory will send this info to us in the output parameter
121 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
123 //check for required parameters
124 fastafile = validParameter.validFile(parameters, "fasta", true);
125 if (fastafile == "not found") { mothurOut("fasta is a required parameter for the get.oturep command."); mothurOutEndLine(); abort = true; }
126 else if (fastafile == "not open") { abort = true; }
128 listfile = validParameter.validFile(parameters, "list", true);
129 if (listfile == "not found") { mothurOut("list is a required parameter for the get.oturep command."); mothurOutEndLine(); abort = true; }
130 else if (listfile == "not open") { abort = true; }
132 phylipfile = validParameter.validFile(parameters, "phylip", true);
133 if (phylipfile == "not found") { phylipfile = ""; }
134 else if (phylipfile == "not open") { abort = true; }
135 else { distFile = phylipfile; format = "phylip"; }
137 columnfile = validParameter.validFile(parameters, "column", true);
138 if (columnfile == "not found") { columnfile = ""; }
139 else if (columnfile == "not open") { abort = true; }
140 else { distFile = columnfile; format = "column"; }
142 namefile = validParameter.validFile(parameters, "name", true);
143 if (namefile == "not open") { abort = true; }
144 else if (namefile == "not found") { namefile = ""; }
146 if ((phylipfile == "") && (columnfile == "")) { mothurOut("When executing a get.oturep command you must enter a phylip or a column."); mothurOutEndLine(); abort = true; }
147 else if ((phylipfile != "") && (columnfile != "")) { mothurOut("When executing a get.oturep command you must enter ONLY ONE of the following: phylip or column."); mothurOutEndLine(); abort = true; }
149 if (columnfile != "") { if (namefile == "") { cout << "You need to provide a namefile if you are going to use the column format." << endl; abort = true; } }
151 //check for optional parameter and set defaults
152 // ...at some point should added some additional type checking...
153 label = validParameter.validFile(parameters, "label", false);
154 if (label == "not found") { label = ""; allLines = 1; }
156 if(label != "all") { splitAtDash(label, labels); allLines = 0; }
157 else { allLines = 1; }
160 groupfile = validParameter.validFile(parameters, "group", true);
161 if (groupfile == "not open") { groupfile = ""; abort = true; }
162 else if (groupfile == "not found") { groupfile = ""; }
164 //read in group map info.
165 groupMap = new GroupMap(groupfile);
166 int error = groupMap->readMap();
167 if (error == 1) { delete groupMap; abort = true; }
170 sorted = validParameter.validFile(parameters, "sorted", false); if (sorted == "not found"){ sorted = ""; }
171 if ((sorted != "") && (sorted != "name") && (sorted != "bin") && (sorted != "size") && (sorted != "group")) {
172 mothurOut(sorted + " is not a valid option for the sorted parameter. The only options are: name, bin, size and group. I will not sort."); mothurOutEndLine();
176 if ((sorted == "group") && (groupfile == "")) {
177 mothurOut("You must provide a groupfile to sort by group. I will not sort."); mothurOutEndLine();
181 string temp = validParameter.validFile(parameters, "large", false); if (temp == "not found") { temp = "F"; }
182 large = isTrue(temp);
184 temp = validParameter.validFile(parameters, "precision", false); if (temp == "not found") { temp = "100"; }
185 convert(temp, precision);
187 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "10.0"; }
188 convert(temp, cutoff);
189 cutoff += (5 / (precision * 10.0));
192 catch(exception& e) {
193 errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
198 //**********************************************************************************************************************
200 void GetOTURepCommand::help(){
202 mothurOut("The get.oturep command parameters are phylip, column, list, fasta, name, group, large, cutoff, precision, sorted and label. The fasta and list parameters are required, as well as phylip or column and name.\n");
203 mothurOut("The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n");
204 mothurOut("The phylip or column parameter is required, but only one may be used. If you use a column file the name filename is required. \n");
205 mothurOut("If you do not provide a cutoff value 10.00 is assumed. If you do not provide a precision value then 100 is assumed.\n");
206 mothurOut("The get.oturep command should be in the following format: get.oturep(phylip=yourDistanceMatrix, fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n");
207 mothurOut("Example get.oturep(phylip=amazon.dist, fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups).\n");
208 mothurOut("The default value for label is all labels in your inputfile.\n");
209 mothurOut("The sorted parameter allows you to indicate you want the output sorted. You can sort by sequence name, bin number, bin size or group. The default is no sorting, but your options are name, number, size, or group.\n");
210 mothurOut("The large parameter allows you to indicate that your distance matrix is too large to fit in RAM. The default value is false.\n");
211 mothurOut("The get.oturep command outputs a .fastarep and .rep.names file for each distance you specify, selecting one OTU representative for each bin.\n");
212 mothurOut("If you provide a groupfile, then it also appends the names of the groups present in that bin.\n");
213 mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
215 catch(exception& e) {
216 errorOut(e, "GetOTURepCommand", "help");
221 //**********************************************************************************************************************
223 GetOTURepCommand::~GetOTURepCommand(){}
225 //**********************************************************************************************************************
227 int GetOTURepCommand::execute(){
230 if (abort == true) { return 0; }
234 //read distance files
235 if (format == "column") { readMatrix = new ReadColumnMatrix(distFile); }
236 else if (format == "phylip") { readMatrix = new ReadPhylipMatrix(distFile); }
237 else { mothurOut("File format error."); mothurOutEndLine(); return 0; }
239 readMatrix->setCutoff(cutoff);
242 nameMap = new NameAssignment(namefile);
244 }else{ nameMap = NULL; }
246 readMatrix->read(nameMap);
249 if (globaldata->gListVector != NULL) { delete globaldata->gListVector; }
250 globaldata->gListVector = readMatrix->getListVector();
252 SparseMatrix* matrix = readMatrix->getMatrix();
254 // Create a data structure to quickly access the distance information.
255 // It consists of a vector of distance maps, where each map contains
256 // all distances of a certain sequence. Vector and maps are accessed
257 // via the index of a sequence in the distance matrix
258 seqVec = vector<SeqMap>(globaldata->gListVector->size());
259 for (MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++) {
260 seqVec[currentCell->row][currentCell->column] = currentCell->dist;
266 //process file and set up indexes
267 if (format == "column") { formatMatrix = new FormatColumnMatrix(distFile); }
268 else if (format == "phylip") { formatMatrix = new FormatPhylipMatrix(distFile); }
269 else { mothurOut("File format error."); mothurOutEndLine(); return 0; }
271 formatMatrix->setCutoff(cutoff);
274 nameMap = new NameAssignment(namefile);
276 }else{ nameMap = NULL; }
278 formatMatrix->read(nameMap);
281 if (globaldata->gListVector != NULL) { delete globaldata->gListVector; }
282 globaldata->gListVector = formatMatrix->getListVector();
284 distFile = formatMatrix->getFormattedFileName();
286 //positions in file where the distances for each sequence begin
287 //rowPositions[1] = position in file where distance related to sequence 1 start.
288 rowPositions = formatMatrix->getRowPositions();
292 //openfile for getMap to use
293 openInputFile(distFile, inRow);
296 //globaldata->gListVector bin 0 = first name read in distance matrix, globaldata->gListVector bin 1 = second name read in distance matrix
297 if (globaldata->gListVector != NULL) {
298 vector<string> names;
300 //map names to rows in sparsematrix
301 for (int i = 0; i < globaldata->gListVector->size(); i++) {
303 binnames = globaldata->gListVector->get(i);
305 splitAtComma(binnames, names);
307 for (int j = 0; j < names.size(); j++) {
308 nameToIndex[names[j]] = i;
311 } else { mothurOut("error, no listvector."); mothurOutEndLine(); }
313 fasta = new FastaMap();
316 fasta->readFastaFile(fastafile);
318 //if user gave a namesfile then use it
319 if (namefile != "") { readNamesFile(); }
321 //set format to list so input can get listvector
322 globaldata->setFormat("list");
325 read = new ReadOTUFile(listfile);
326 read->read(&*globaldata);
328 input = globaldata->ginput;
329 list = globaldata->gListVector;
330 string lastLabel = list->getLabel();
332 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
333 set<string> processedLabels;
334 set<string> userLabels = labels;
336 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
338 if (allLines == 1 || labels.count(list->getLabel()) == 1){
339 mothurOut(list->getLabel() + "\t" + toString(list->size())); mothurOutEndLine();
340 error = process(list);
341 if (error == 1) { return 0; } //there is an error in hte input files, abort command
343 processedLabels.insert(list->getLabel());
344 userLabels.erase(list->getLabel());
347 if ((anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
348 string saveLabel = list->getLabel();
351 list = input->getListVector(lastLabel);
352 mothurOut(list->getLabel() + "\t" + toString(list->size())); mothurOutEndLine();
353 error = process(list);
354 if (error == 1) { return 0; } //there is an error in hte input files, abort command
356 processedLabels.insert(list->getLabel());
357 userLabels.erase(list->getLabel());
359 //restore real lastlabel to save below
360 list->setLabel(saveLabel);
363 lastLabel = list->getLabel();
366 list = input->getListVector();
369 //output error messages about any remaining user labels
370 bool needToRun = false;
371 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {
372 mothurOut("Your file does not include the label " + *it);
373 if (processedLabels.count(list->getLabel()) != 1) {
374 mothurOut(". I will use " + lastLabel + "."); mothurOutEndLine();
377 mothurOut(". Please refer to " + lastLabel + "."); mothurOutEndLine();
381 //run last label if you need to
382 if (needToRun == true) {
383 if (list != NULL) { delete list; }
384 list = input->getListVector(lastLabel);
385 mothurOut(list->getLabel() + "\t" + toString(list->size())); mothurOutEndLine();
386 error = process(list);
388 if (error == 1) { return 0; } //there is an error in hte input files, abort command
391 //close and remove formatted matrix file
394 remove(distFile.c_str());
397 globaldata->gListVector = NULL;
398 delete input; globaldata->ginput = NULL;
401 if (groupfile != "") {
402 delete groupMap; globaldata->gGroupmap = NULL;
407 catch(exception& e) {
408 errorOut(e, "GetOTURepCommand", "execute");
413 //**********************************************************************************************************************
414 void GetOTURepCommand::readNamesFile() {
416 vector<string> dupNames;
417 openInputFile(namefile, inNames);
419 string name, names, sequence;
422 inNames >> name; //read from first column A
423 inNames >> names; //read from second column A,B,C,D
427 //parse names into vector
428 splitAtComma(names, dupNames);
430 //store names in fasta map
431 sequence = fasta->getSequence(name);
432 for (int i = 0; i < dupNames.size(); i++) {
433 fasta->push_back(dupNames[i], sequence);
441 catch(exception& e) {
442 errorOut(e, "GetOTURepCommand", "readNamesFile");
446 //**********************************************************************************************************************
447 string GetOTURepCommand::findRep(int bin, string& group, ListVector* thisList, int& binsize) {
449 vector<string> names;
450 map<string, string> groups;
451 map<string, string>::iterator groupIt;
453 //parse names into vector
454 string binnames = thisList->get(bin);
455 splitAtComma(binnames, names);
456 binsize = names.size();
458 //if you have a groupfile
459 if (groupfile != "") {
460 //find the groups that are in this bin
461 for (size_t i = 0; i < names.size(); i++) {
462 string groupName = groupMap->getGroup(names[i]);
463 if (groupName == "not found") {
464 mothurOut(names[i] + " is missing from your group file. Please correct. "); mothurOutEndLine();
467 groups[groupName] = groupName;
471 //turn the groups into a string
472 for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) {
473 group += groupIt->first + "-";
476 group = group.substr(0, group.length()-1);
479 // if only 1 sequence in bin or processing the "unique" label, then
480 // the first sequence of the OTU is the representative one
481 if ((names.size() == 1) || (list->getLabel() == "unique")) {
484 vector<int> seqIndex(names.size());
485 vector<float> max_dist(names.size());
486 vector<float> total_dist(names.size());
488 //fill seqIndex and initialize sums
489 for (size_t i = 0; i < names.size(); i++) {
490 seqIndex[i] = nameToIndex[names[i]];
495 // loop through all entries in seqIndex
498 for (size_t i=0; i < seqIndex.size(); i++) {
500 if (!large) { currMap = seqVec[seqIndex[i]]; }
501 else { currMap = getMap(seqIndex[i]); }
503 for (size_t j=0; j < seqIndex.size(); j++) {
504 it = currMap.find(seqIndex[j]);
505 if (it != currMap.end()) {
506 max_dist[i] = max(max_dist[i], it->second);
507 max_dist[j] = max(max_dist[j], it->second);
508 total_dist[i] += it->second;
509 total_dist[j] += it->second;
510 }else{ //if you can't find the distance make it the cutoff
511 max_dist[i] = max(max_dist[i], cutoff);
512 max_dist[j] = max(max_dist[j], cutoff);
513 total_dist[i] += cutoff;
514 total_dist[j] += cutoff;
519 // sequence with the smallest maximum distance is the representative
520 //if tie occurs pick sequence with smallest average distance
523 for (size_t i=0; i < max_dist.size(); i++) {
524 if (max_dist[i] < min) {
527 }else if (max_dist[i] == min) {
528 float currentAverage = total_dist[minIndex] / (float) total_dist.size();
529 float newAverage = total_dist[i] / (float) total_dist.size();
531 if (newAverage < currentAverage) {
538 return(names[minIndex]);
541 catch(exception& e) {
542 errorOut(e, "GetOTURepCommand", "FindRep");
547 //**********************************************************************************************************************
548 int GetOTURepCommand::process(ListVector* processList) {
550 string nameRep, name, sequence;
553 if (outputDir == "") { outputDir += hasPath(listfile); }
554 string outputFileName = outputDir + getRootName(getSimpleName(listfile)) + processList->getLabel() + ".rep.fasta";
555 openOutputFile(outputFileName, out);
556 vector<repStruct> reps;
558 ofstream newNamesOutput;
559 string outputNamesFile = outputDir + getRootName(getSimpleName(listfile)) + processList->getLabel() + ".rep.names";
560 openOutputFile(outputNamesFile, newNamesOutput);
562 //for each bin in the list vector
563 for (int i = 0; i < processList->size(); i++) {
566 nameRep = findRep(i, groups, processList, binsize);
568 //output to new names file
569 newNamesOutput << nameRep << '\t' << processList->get(i) << endl;
571 //print out name and sequence for that bin
572 sequence = fasta->getSequence(nameRep);
574 if (sequence != "not found") {
575 if (sorted == "") { //print them out
576 nameRep = nameRep + "|" + toString(i+1);
577 nameRep = nameRep + "|" + toString(binsize);
578 if (groupfile != "") {
579 nameRep = nameRep + "|" + groups;
581 out << ">" << nameRep << endl;
582 out << sequence << endl;
584 repStruct newRep(nameRep, i+1, binsize, groups);
585 reps.push_back(newRep);
588 mothurOut(nameRep + " is missing from your fasta or name file. Please correct. "); mothurOutEndLine();
589 remove(outputFileName.c_str());
590 remove(outputNamesFile.c_str());
595 if (sorted != "") { //then sort them and print them
596 if (sorted == "name") { sort(reps.begin(), reps.end(), compareName); }
597 else if (sorted == "bin") { sort(reps.begin(), reps.end(), compareBin); }
598 else if (sorted == "size") { sort(reps.begin(), reps.end(), compareSize); }
599 else if (sorted == "group") { sort(reps.begin(), reps.end(), compareGroup); }
602 for (int i = 0; i < reps.size(); i++) {
603 string sequence = fasta->getSequence(reps[i].name);
604 string outputName = reps[i].name + "|" + toString(reps[i].bin);
605 outputName = outputName + "|" + toString(reps[i].size);
606 if (groupfile != "") {
607 outputName = outputName + "|" + reps[i].group;
609 out << ">" << outputName << endl;
610 out << sequence << endl;
615 newNamesOutput.close();
619 catch(exception& e) {
620 errorOut(e, "GetOTURepCommand", "process");
625 //**********************************************************************************************************************
626 SeqMap GetOTURepCommand::getMap(int row) {
630 //make sure this row exists in the file, it may not if the seq did not have any distances below the cutoff
631 if (rowPositions[row] != -1){
633 inRow.seekg(rowPositions[row]);
635 int rowNum, numDists, colNum;
638 inRow >> rowNum >> numDists;
640 for(int i = 0; i < numDists; i++) {
641 inRow >> colNum >> dist;
642 rowMap[colNum] = dist;
649 catch(exception& e) {
650 errorOut(e, "GetOTURepCommand", "getMap");
654 //**********************************************************************************************************************