5 * Created by Sarah Westcott on 4/6/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
10 #include "getoturepcommand.h"
12 //**********************************************************************************************************************
13 GetOTURepCommand::GetOTURepCommand(string option){
15 globaldata = GlobalData::getInstance();
21 //allow user to run help
22 if (option == "help") {
25 //valid paramters for this command
26 string Array[] = {"fasta","list","line","label","name", "group"};
27 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
29 OptionParser parser(option);
30 map<string, string> parameters = parser.getParameters();
32 ValidParameters validParameter;
34 //check to make sure all parameters are valid for command
35 for (map<string, string>::iterator it = parameters.begin(); it != parameters.end(); it++) {
36 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
39 //make sure the user has already run the read.otu command
40 if ((globaldata->gSparseMatrix == NULL) || (globaldata->gListVector == NULL)) {
41 mothurOut("Before you use the get.oturep command, you first need to read in a distance matrix."); mothurOutEndLine();
45 //check for required parameters
46 fastafile = validParameter.validFile(parameters, "fasta", true);
47 if (fastafile == "not found") { mothurOut("fasta is a required parameter for the get.oturep command."); mothurOutEndLine(); abort = true; }
48 else if (fastafile == "not open") { abort = true; }
50 listfile = validParameter.validFile(parameters, "list", true);
51 if (listfile == "not found") { mothurOut("list is a required parameter for the get.oturep command."); mothurOutEndLine(); abort = true; }
52 else if (listfile == "not open") { abort = true; }
54 //check for optional parameter and set defaults
55 // ...at some point should added some additional type checking...
56 line = validParameter.validFile(parameters, "line", false);
57 if (line == "not found") { line = ""; }
59 if(line != "all") { splitAtDash(line, lines); allLines = 0; }
60 else { allLines = 1; }
63 label = validParameter.validFile(parameters, "label", false);
64 if (label == "not found") { label = ""; }
66 if(label != "all") { splitAtDash(label, labels); allLines = 0; }
67 else { allLines = 1; }
70 //make sure user did not use both the line and label parameters
71 if ((line != "") && (label != "")) { mothurOut("You cannot use both the line and label parameters at the same time. "); mothurOutEndLine(); abort = true; }
72 //if the user has not specified any line or labels use the ones from read.otu
73 else if ((line == "") && (label == "")) {
74 allLines = globaldata->allLines;
75 labels = globaldata->labels;
76 lines = globaldata->lines;
79 namesfile = validParameter.validFile(parameters, "name", true);
80 if (namesfile == "not open") { abort = true; }
81 else if (namesfile == "not found") { namesfile = ""; }
83 groupfile = validParameter.validFile(parameters, "group", true);
84 if (groupfile == "not open") { abort = true; }
85 else if (groupfile == "not found") { groupfile = ""; }
87 //read in group map info.
88 groupMap = new GroupMap(groupfile);
94 if(globaldata->gSparseMatrix != NULL) {
95 matrix = globaldata->gSparseMatrix;
98 //globaldata->gListVector bin 0 = first name read in distance matrix, globaldata->gListVector bin 1 = second name read in distance matrix
99 if (globaldata->gListVector != NULL) {
100 vector<string> names;
102 //map names to rows in sparsematrix
103 for (int i = 0; i < globaldata->gListVector->size(); i++) {
105 binnames = globaldata->gListVector->get(i);
107 splitAtComma(binnames, names);
109 for (int j = 0; j < names.size(); j++) {
110 nameToIndex[names[j]] = i;
113 } else { mothurOut("error, no listvector."); mothurOutEndLine(); }
115 // Create a data structure to quickly access the distance information.
116 // It consists of a vector of distance maps, where each map contains
117 // all distances of a certain sequence. Vector and maps are accessed
118 // via the index of a sequence in the distance matrix
119 seqVec = vector<SeqMap>(globaldata->gListVector->size());
120 for (MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++) {
121 seqVec[currentCell->row][currentCell->column] = currentCell->dist;
124 fasta = new FastaMap();
128 catch(exception& e) {
129 errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
134 //**********************************************************************************************************************
136 void GetOTURepCommand::help(){
138 mothurOut("The get.oturep command can only be executed after a successful read.dist command.\n");
139 mothurOut("The get.oturep command parameters are list, fasta, name, group, line and label. The fasta and list parameters are required, and you may not use line and label at the same time.\n");
140 mothurOut("The line and label allow you to select what distance levels you would like a output files created for, and are separated by dashes.\n");
141 mothurOut("The get.oturep command should be in the following format: get.oturep(fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, line=yourLines, label=yourLabels).\n");
142 mothurOut("Example get.oturep(fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups, line=1-3-5, name=amazon.names).\n");
143 mothurOut("The default value for line and label are all lines in your inputfile.\n");
144 mothurOut("The get.oturep command outputs a .fastarep file for each distance you specify, selecting one OTU representative for each bin.\n");
145 mothurOut("If you provide a groupfile, then it also appends the names of the groups present in that bin.\n");
146 mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
148 catch(exception& e) {
149 errorOut(e, "GetOTURepCommand", "help");
154 //**********************************************************************************************************************
156 GetOTURepCommand::~GetOTURepCommand(){
157 if (abort == false) {
158 delete input; globaldata->ginput = NULL;
161 if (groupfile != "") {
162 delete groupMap; globaldata->gGroupmap = NULL;
167 //**********************************************************************************************************************
169 int GetOTURepCommand::execute(){
172 if (abort == true) { return 0; }
178 fasta->readFastaFile(fastafile);
182 //set format to list so input can get listvector
183 globaldata->setFormat("list");
185 //if user gave a namesfile then use it
186 if (namesfile != "") {
191 read = new ReadOTUFile(listfile);
192 read->read(&*globaldata);
194 input = globaldata->ginput;
195 list = globaldata->gListVector;
196 string lastLabel = list->getLabel();
198 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
199 set<string> processedLabels;
200 set<string> userLabels = labels;
201 set<int> userLines = lines;
204 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0) || (userLines.size() != 0))) {
206 if (allLines == 1 || lines.count(count) == 1 || labels.count(list->getLabel()) == 1){
207 mothurOut(list->getLabel() + "\t" + toString(list->size())); mothurOutEndLine();
208 error = process(list);
209 if (error == 1) { return 0; } //there is an error in hte input files, abort command
211 processedLabels.insert(list->getLabel());
212 userLabels.erase(list->getLabel());
213 userLines.erase(count);
216 if ((anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
218 list = input->getListVector(lastLabel);
219 mothurOut(list->getLabel() + "\t" + toString(list->size())); mothurOutEndLine();
220 error = process(list);
221 if (error == 1) { return 0; } //there is an error in hte input files, abort command
223 processedLabels.insert(list->getLabel());
224 userLabels.erase(list->getLabel());
227 lastLabel = list->getLabel();
230 list = input->getListVector();
234 //output error messages about any remaining user labels
235 bool needToRun = false;
236 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {
237 mothurOut("Your file does not include the label " + *it);
238 if (processedLabels.count(list->getLabel()) != 1) {
239 mothurOut(". I will use " + lastLabel + "."); mothurOutEndLine();
242 mothurOut(". Please refer to " + lastLabel + "."); mothurOutEndLine();
246 //run last line if you need to
247 if (needToRun == true) {
249 list = input->getListVector(lastLabel);
250 mothurOut(list->getLabel() + "\t" + toString(list->size())); mothurOutEndLine();
251 error = process(list);
253 if (error == 1) { return 0; } //there is an error in hte input files, abort command
257 globaldata->gSparseMatrix = NULL;
259 globaldata->gListVector = NULL;
263 catch(exception& e) {
264 errorOut(e, "GetOTURepCommand", "execute");
269 //**********************************************************************************************************************
270 void GetOTURepCommand::readNamesFile() {
272 vector<string> dupNames;
273 openInputFile(namesfile, inNames);
275 string name, names, sequence;
278 inNames >> name; //read from first column A
279 inNames >> names; //read from second column A,B,C,D
283 //parse names into vector
284 splitAtComma(names, dupNames);
286 //store names in fasta map
287 sequence = fasta->getSequence(name);
288 for (int i = 0; i < dupNames.size(); i++) {
289 fasta->push_back(dupNames[i], sequence);
297 catch(exception& e) {
298 errorOut(e, "GetOTURepCommand", "readNamesFile");
302 //**********************************************************************************************************************
303 string GetOTURepCommand::findRep(int bin, string& group, ListVector* thisList, int& binsize) {
305 vector<string> names;
306 map<string, string> groups;
307 map<string, string>::iterator groupIt;
309 //parse names into vector
310 string binnames = thisList->get(bin);
311 splitAtComma(binnames, names);
312 binsize = names.size();
314 //if you have a groupfile
315 if (groupfile != "") {
316 //find the groups that are in this bin
317 for (size_t i = 0; i < names.size(); i++) {
318 string groupName = groupMap->getGroup(names[i]);
319 if (groupName == "not found") {
320 mothurOut(names[i] + " is missing from your group file. Please correct. "); mothurOutEndLine();
323 groups[groupName] = groupName;
327 //turn the groups into a string
328 for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) {
329 group += groupIt->first + "-";
332 group = group.substr(0, group.length()-1);
335 // if only 1 sequence in bin or processing the "unique" line, then
336 // the first sequence of the OTU is the representative one
337 if ((names.size() == 1) || (list->getLabel() == "unique")) {
340 vector<int> seqIndex(names.size());
341 vector<float> max_dist(names.size());
343 //fill seqIndex and initialize sums
344 for (size_t i = 0; i < names.size(); i++) {
345 seqIndex[i] = nameToIndex[names[i]];
349 // loop through all entries in seqIndex
352 for (size_t i=0; i < seqIndex.size(); i++) {
353 currMap = seqVec[seqIndex[i]];
354 for (size_t j=0; j < seqIndex.size(); j++) {
355 it = currMap.find(seqIndex[j]);
356 if (it != currMap.end()) {
357 max_dist[i] = max(max_dist[i], it->second);
358 max_dist[j] = max(max_dist[j], it->second);
363 // sequence with the smallest maximum distance is the representative
366 for (size_t i=0; i < max_dist.size(); i++) {
367 if (max_dist[i] < min) {
373 return(names[minIndex]);
376 catch(exception& e) {
377 errorOut(e, "GetOTURepCommand", "FindRep");
382 //**********************************************************************************************************************
383 int GetOTURepCommand::process(ListVector* processList) {
385 string nameRep, name, sequence;
388 string outputFileName = getRootName(listfile) + processList->getLabel() + ".rep.fasta";
389 openOutputFile(outputFileName, out);
391 //for each bin in the list vector
392 for (int i = 0; i < processList->size(); i++) {
395 nameRep = findRep(i, groups, processList, binsize);
397 //print out name and sequence for that bin
398 sequence = fasta->getSequence(nameRep);
400 if (sequence != "not found") {
401 nameRep = nameRep + "|" + toString(i+1);
402 nameRep = nameRep + "|" + toString(binsize);
403 if (groupfile != "") {
404 nameRep = nameRep + "|" + groups;
406 out << ">" << nameRep << endl;
407 out << sequence << endl;
409 mothurOut(nameRep + " is missing from your fasta or name file. Please correct. "); mothurOutEndLine();
410 remove(outputFileName.c_str());
419 catch(exception& e) {
420 errorOut(e, "GetOTURepCommand", "process");
425 //**********************************************************************************************************************