5 * Created by Sarah Westcott on 4/6/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
10 #include "getoturepcommand.h"
11 #include "readphylip.h"
12 #include "readcolumn.h"
13 #include "formatphylip.h"
14 #include "formatcolumn.h"
18 //********************************************************************************************************************
19 //sorts lowest to highest
20 inline bool compareName(repStruct left, repStruct right){
21 return (left.name < right.name);
23 //********************************************************************************************************************
24 //sorts lowest to highest
25 inline bool compareBin(repStruct left, repStruct right){
26 return (left.bin < right.bin);
28 //********************************************************************************************************************
29 //sorts lowest to highest
30 inline bool compareSize(repStruct left, repStruct right){
31 return (left.size < right.size);
33 //********************************************************************************************************************
34 //sorts lowest to highest
35 inline bool compareGroup(repStruct left, repStruct right){
36 return (left.group < right.group);
38 //**********************************************************************************************************************
39 GetOTURepCommand::GetOTURepCommand(string option){
41 globaldata = GlobalData::getInstance();
46 //allow user to run help
47 if (option == "help") {
50 //valid paramters for this command
51 string Array[] = {"fasta","list","label","name", "group", "sorted", "phylip","column","large","cutoff"};
52 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
54 OptionParser parser(option);
55 map<string, string> parameters = parser.getParameters();
57 ValidParameters validParameter;
59 //check to make sure all parameters are valid for command
60 for (map<string, string>::iterator it = parameters.begin(); it != parameters.end(); it++) {
61 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
64 //check for required parameters
65 fastafile = validParameter.validFile(parameters, "fasta", true);
66 if (fastafile == "not found") { mothurOut("fasta is a required parameter for the get.oturep command."); mothurOutEndLine(); abort = true; }
67 else if (fastafile == "not open") { abort = true; }
69 listfile = validParameter.validFile(parameters, "list", true);
70 if (listfile == "not found") { mothurOut("list is a required parameter for the get.oturep command."); mothurOutEndLine(); abort = true; }
71 else if (listfile == "not open") { abort = true; }
73 phylipfile = validParameter.validFile(parameters, "phylip", true);
74 if (phylipfile == "not found") { phylipfile = ""; }
75 else if (phylipfile == "not open") { abort = true; }
76 else { distFile = phylipfile; format = "phylip"; }
78 columnfile = validParameter.validFile(parameters, "column", true);
79 if (columnfile == "not found") { columnfile = ""; }
80 else if (columnfile == "not open") { abort = true; }
81 else { distFile = columnfile; format = "column"; }
83 namefile = validParameter.validFile(parameters, "name", true);
84 if (namefile == "not open") { abort = true; }
85 else if (namefile == "not found") { namefile = ""; }
87 if ((phylipfile == "") && (columnfile == "")) { mothurOut("When executing a get.oturep command you must enter a phylip or a column."); mothurOutEndLine(); abort = true; }
88 else if ((phylipfile != "") && (columnfile != "")) { mothurOut("When executing a get.oturep command you must enter ONLY ONE of the following: phylip or column."); mothurOutEndLine(); abort = true; }
90 if (columnfile != "") { if (namefile == "") { cout << "You need to provide a namefile if you are going to use the column format." << endl; abort = true; } }
92 //check for optional parameter and set defaults
93 // ...at some point should added some additional type checking...
94 label = validParameter.validFile(parameters, "label", false);
95 if (label == "not found") { label = ""; allLines = 1; }
97 if(label != "all") { splitAtDash(label, labels); allLines = 0; }
98 else { allLines = 1; }
101 groupfile = validParameter.validFile(parameters, "group", true);
102 if (groupfile == "not open") { groupfile = ""; abort = true; }
103 else if (groupfile == "not found") { groupfile = ""; }
105 //read in group map info.
106 groupMap = new GroupMap(groupfile);
110 sorted = validParameter.validFile(parameters, "sorted", false); if (sorted == "not found"){ sorted = ""; }
111 if ((sorted != "") && (sorted != "name") && (sorted != "bin") && (sorted != "size") && (sorted != "group")) {
112 mothurOut(sorted + " is not a valid option for the sorted parameter. The only options are: name, bin, size and group. I will not sort."); mothurOutEndLine();
116 if ((sorted == "group") && (groupfile == "")) {
117 mothurOut("You must provide a groupfile to sort by group. I will not sort."); mothurOutEndLine();
121 string temp = validParameter.validFile(parameters, "large", false); if (temp == "not found") { temp = "F"; }
122 large = isTrue(temp);
124 temp = validParameter.validFile(parameters, "precision", false); if (temp == "not found") { temp = "100"; }
125 convert(temp, precision);
127 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "10.0"; }
128 convert(temp, cutoff);
129 cutoff += (5 / (precision * 10.0));
132 catch(exception& e) {
133 errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
138 //**********************************************************************************************************************
140 void GetOTURepCommand::help(){
142 mothurOut("The get.oturep command can only be executed after a successful read.dist command.\n");
143 mothurOut("The get.oturep command parameters are list, fasta, name, group, large, sorted and label. The fasta and list parameters are required.\n");
144 mothurOut("The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n");
145 mothurOut("The get.oturep command should be in the following format: get.oturep(fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n");
146 mothurOut("Example get.oturep(fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups, name=amazon.names).\n");
147 mothurOut("The default value for label is all labels in your inputfile.\n");
148 mothurOut("The sorted parameter allows you to indicate you want the output sorted. You can sort by sequence name, bin number, bin size or group. The default is no sorting, but your options are name, number, size, or group.\n");
149 mothurOut("The large parameter allows you to indicate that your distance matrix is too large to fit in RAM. The default value is false.\n");
150 mothurOut("The get.oturep command outputs a .fastarep and .rep.names file for each distance you specify, selecting one OTU representative for each bin.\n");
151 mothurOut("If you provide a groupfile, then it also appends the names of the groups present in that bin.\n");
152 mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
154 catch(exception& e) {
155 errorOut(e, "GetOTURepCommand", "help");
160 //**********************************************************************************************************************
162 GetOTURepCommand::~GetOTURepCommand(){}
164 //**********************************************************************************************************************
166 int GetOTURepCommand::execute(){
169 if (abort == true) { return 0; }
173 //read distance files
174 if (format == "column") { readMatrix = new ReadColumnMatrix(distFile); }
175 else if (format == "phylip") { readMatrix = new ReadPhylipMatrix(distFile); }
176 else { mothurOut("File format error."); mothurOutEndLine(); return 0; }
178 readMatrix->setCutoff(cutoff);
181 nameMap = new NameAssignment(namefile);
183 }else{ nameMap = NULL; }
185 readMatrix->read(nameMap);
188 if (globaldata->gListVector != NULL) { delete globaldata->gListVector; }
189 globaldata->gListVector = readMatrix->getListVector();
191 SparseMatrix* matrix = readMatrix->getMatrix();
193 // Create a data structure to quickly access the distance information.
194 // It consists of a vector of distance maps, where each map contains
195 // all distances of a certain sequence. Vector and maps are accessed
196 // via the index of a sequence in the distance matrix
197 seqVec = vector<SeqMap>(globaldata->gListVector->size());
198 for (MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++) {
199 seqVec[currentCell->row][currentCell->column] = currentCell->dist;
205 //process file and set up indexes
206 if (format == "column") { formatMatrix = new FormatColumnMatrix(distFile); }
207 else if (format == "phylip") { formatMatrix = new FormatPhylipMatrix(distFile); }
208 else { mothurOut("File format error."); mothurOutEndLine(); return 0; }
210 formatMatrix->setCutoff(cutoff);
213 nameMap = new NameAssignment(namefile);
215 }else{ nameMap = NULL; }
217 formatMatrix->read(nameMap);
220 if (globaldata->gListVector != NULL) { delete globaldata->gListVector; }
221 globaldata->gListVector = formatMatrix->getListVector();
223 distFile = formatMatrix->getFormattedFileName();
225 //positions in file where the distances for each sequence begin
226 //rowPositions[1] = position in file where distance related to sequence 1 start.
227 rowPositions = formatMatrix->getRowPositions();
231 //openfile for getMap to use
232 openInputFile(distFile, inRow);
235 //globaldata->gListVector bin 0 = first name read in distance matrix, globaldata->gListVector bin 1 = second name read in distance matrix
236 if (globaldata->gListVector != NULL) {
237 vector<string> names;
239 //map names to rows in sparsematrix
240 for (int i = 0; i < globaldata->gListVector->size(); i++) {
242 binnames = globaldata->gListVector->get(i);
244 splitAtComma(binnames, names);
246 for (int j = 0; j < names.size(); j++) {
247 nameToIndex[names[j]] = i;
250 } else { mothurOut("error, no listvector."); mothurOutEndLine(); }
252 fasta = new FastaMap();
255 fasta->readFastaFile(fastafile);
257 //if user gave a namesfile then use it
258 if (namefile != "") { readNamesFile(); }
260 //set format to list so input can get listvector
261 globaldata->setFormat("list");
264 read = new ReadOTUFile(listfile);
265 read->read(&*globaldata);
267 input = globaldata->ginput;
268 list = globaldata->gListVector;
269 string lastLabel = list->getLabel();
271 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
272 set<string> processedLabels;
273 set<string> userLabels = labels;
275 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
277 if (allLines == 1 || labels.count(list->getLabel()) == 1){
278 mothurOut(list->getLabel() + "\t" + toString(list->size())); mothurOutEndLine();
279 error = process(list);
280 if (error == 1) { return 0; } //there is an error in hte input files, abort command
282 processedLabels.insert(list->getLabel());
283 userLabels.erase(list->getLabel());
286 if ((anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
287 string saveLabel = list->getLabel();
290 list = input->getListVector(lastLabel);
291 mothurOut(list->getLabel() + "\t" + toString(list->size())); mothurOutEndLine();
292 error = process(list);
293 if (error == 1) { return 0; } //there is an error in hte input files, abort command
295 processedLabels.insert(list->getLabel());
296 userLabels.erase(list->getLabel());
298 //restore real lastlabel to save below
299 list->setLabel(saveLabel);
302 lastLabel = list->getLabel();
305 list = input->getListVector();
308 //output error messages about any remaining user labels
309 bool needToRun = false;
310 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {
311 mothurOut("Your file does not include the label " + *it);
312 if (processedLabels.count(list->getLabel()) != 1) {
313 mothurOut(". I will use " + lastLabel + "."); mothurOutEndLine();
316 mothurOut(". Please refer to " + lastLabel + "."); mothurOutEndLine();
320 //run last label if you need to
321 if (needToRun == true) {
322 if (list != NULL) { delete list; }
323 list = input->getListVector(lastLabel);
324 mothurOut(list->getLabel() + "\t" + toString(list->size())); mothurOutEndLine();
325 error = process(list);
327 if (error == 1) { return 0; } //there is an error in hte input files, abort command
330 //close and remove formatted matrix file
332 remove(distFile.c_str());
334 globaldata->gListVector = NULL;
335 delete input; globaldata->ginput = NULL;
338 if (groupfile != "") {
339 delete groupMap; globaldata->gGroupmap = NULL;
344 catch(exception& e) {
345 errorOut(e, "GetOTURepCommand", "execute");
350 //**********************************************************************************************************************
351 void GetOTURepCommand::readNamesFile() {
353 vector<string> dupNames;
354 openInputFile(namefile, inNames);
356 string name, names, sequence;
359 inNames >> name; //read from first column A
360 inNames >> names; //read from second column A,B,C,D
364 //parse names into vector
365 splitAtComma(names, dupNames);
367 //store names in fasta map
368 sequence = fasta->getSequence(name);
369 for (int i = 0; i < dupNames.size(); i++) {
370 fasta->push_back(dupNames[i], sequence);
378 catch(exception& e) {
379 errorOut(e, "GetOTURepCommand", "readNamesFile");
383 //**********************************************************************************************************************
384 string GetOTURepCommand::findRep(int bin, string& group, ListVector* thisList, int& binsize) {
386 vector<string> names;
387 map<string, string> groups;
388 map<string, string>::iterator groupIt;
390 //parse names into vector
391 string binnames = thisList->get(bin);
392 splitAtComma(binnames, names);
393 binsize = names.size();
395 //if you have a groupfile
396 if (groupfile != "") {
397 //find the groups that are in this bin
398 for (size_t i = 0; i < names.size(); i++) {
399 string groupName = groupMap->getGroup(names[i]);
400 if (groupName == "not found") {
401 mothurOut(names[i] + " is missing from your group file. Please correct. "); mothurOutEndLine();
404 groups[groupName] = groupName;
408 //turn the groups into a string
409 for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) {
410 group += groupIt->first + "-";
413 group = group.substr(0, group.length()-1);
416 // if only 1 sequence in bin or processing the "unique" label, then
417 // the first sequence of the OTU is the representative one
418 if ((names.size() == 1) || (list->getLabel() == "unique")) {
421 vector<int> seqIndex(names.size());
422 vector<float> max_dist(names.size());
423 vector<float> total_dist(names.size());
425 //fill seqIndex and initialize sums
426 for (size_t i = 0; i < names.size(); i++) {
427 seqIndex[i] = nameToIndex[names[i]];
432 // loop through all entries in seqIndex
435 for (size_t i=0; i < seqIndex.size(); i++) {
437 if (!large) { currMap = seqVec[seqIndex[i]]; }
438 else { currMap = getMap(seqIndex[i]); }
440 for (size_t j=0; j < seqIndex.size(); j++) {
441 it = currMap.find(seqIndex[j]);
442 if (it != currMap.end()) {
443 max_dist[i] = max(max_dist[i], it->second);
444 max_dist[j] = max(max_dist[j], it->second);
445 total_dist[i] += it->second;
446 total_dist[j] += it->second;
447 }else{ //if you can't find the distance make it the cutoff
448 max_dist[i] = max(max_dist[i], cutoff);
449 max_dist[j] = max(max_dist[j], cutoff);
450 total_dist[i] += cutoff;
451 total_dist[j] += cutoff;
456 // sequence with the smallest maximum distance is the representative
457 //if tie occurs pick sequence with smallest average distance
460 for (size_t i=0; i < max_dist.size(); i++) {
461 if (max_dist[i] < min) {
464 }else if (max_dist[i] == min) {
465 float currentAverage = total_dist[minIndex] / (float) total_dist.size();
466 float newAverage = total_dist[i] / (float) total_dist.size();
468 if (newAverage < currentAverage) {
475 return(names[minIndex]);
478 catch(exception& e) {
479 errorOut(e, "GetOTURepCommand", "FindRep");
484 //**********************************************************************************************************************
485 int GetOTURepCommand::process(ListVector* processList) {
487 string nameRep, name, sequence;
490 string outputFileName = getRootName(listfile) + processList->getLabel() + ".rep.fasta";
491 openOutputFile(outputFileName, out);
492 vector<repStruct> reps;
494 ofstream newNamesOutput;
495 string outputNamesFile = getRootName(listfile) + processList->getLabel() + ".rep.names";
496 openOutputFile(outputNamesFile, newNamesOutput);
498 //for each bin in the list vector
499 for (int i = 0; i < processList->size(); i++) {
502 nameRep = findRep(i, groups, processList, binsize);
504 //output to new names file
505 newNamesOutput << nameRep << '\t' << processList->get(i) << endl;
507 //print out name and sequence for that bin
508 sequence = fasta->getSequence(nameRep);
510 if (sequence != "not found") {
511 if (sorted == "") { //print them out
512 nameRep = nameRep + "|" + toString(i+1);
513 nameRep = nameRep + "|" + toString(binsize);
514 if (groupfile != "") {
515 nameRep = nameRep + "|" + groups;
517 out << ">" << nameRep << endl;
518 out << sequence << endl;
520 repStruct newRep(nameRep, i+1, binsize, groups);
521 reps.push_back(newRep);
524 mothurOut(nameRep + " is missing from your fasta or name file. Please correct. "); mothurOutEndLine();
525 remove(outputFileName.c_str());
526 remove(outputNamesFile.c_str());
531 if (sorted != "") { //then sort them and print them
532 if (sorted == "name") { sort(reps.begin(), reps.end(), compareName); }
533 else if (sorted == "bin") { sort(reps.begin(), reps.end(), compareBin); }
534 else if (sorted == "size") { sort(reps.begin(), reps.end(), compareSize); }
535 else if (sorted == "group") { sort(reps.begin(), reps.end(), compareGroup); }
538 for (int i = 0; i < reps.size(); i++) {
539 string sequence = fasta->getSequence(reps[i].name);
540 string outputName = reps[i].name + "|" + toString(reps[i].bin);
541 outputName = outputName + "|" + toString(reps[i].size);
542 if (groupfile != "") {
543 outputName = outputName + "|" + reps[i].group;
545 out << ">" << outputName << endl;
546 out << sequence << endl;
551 newNamesOutput.close();
555 catch(exception& e) {
556 errorOut(e, "GetOTURepCommand", "process");
561 //**********************************************************************************************************************
562 SeqMap GetOTURepCommand::getMap(int row) {
566 //make sure this row exists in the file, it may not if the seq did not have any distances below the cutoff
567 if (rowPositions[row] != -1){
569 inRow.seekg(rowPositions[row]);
571 int rowNum, numDists, colNum;
574 inRow >> rowNum >> numDists;
576 for(int i = 0; i < numDists; i++) {
577 inRow >> colNum >> dist;
578 rowMap[colNum] = dist;
585 catch(exception& e) {
586 errorOut(e, "GetOTURepCommand", "getMap");
590 //**********************************************************************************************************************