5 * Created by Sarah Westcott on 4/6/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
10 #include "getoturepcommand.h"
12 //**********************************************************************************************************************
13 GetOTURepCommand::GetOTURepCommand(string option){
15 globaldata = GlobalData::getInstance();
20 //allow user to run help
21 if (option == "help") {
24 //valid paramters for this command
25 string Array[] = {"fasta","list","label","name", "group"};
26 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
28 OptionParser parser(option);
29 map<string, string> parameters = parser.getParameters();
31 ValidParameters validParameter;
33 //check to make sure all parameters are valid for command
34 for (map<string, string>::iterator it = parameters.begin(); it != parameters.end(); it++) {
35 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
38 //make sure the user has already run the read.otu command
39 if ((globaldata->gSparseMatrix == NULL) || (globaldata->gListVector == NULL)) {
40 mothurOut("Before you use the get.oturep command, you first need to read in a distance matrix."); mothurOutEndLine();
44 //check for required parameters
45 fastafile = validParameter.validFile(parameters, "fasta", true);
46 if (fastafile == "not found") { mothurOut("fasta is a required parameter for the get.oturep command."); mothurOutEndLine(); abort = true; }
47 else if (fastafile == "not open") { abort = true; }
49 listfile = validParameter.validFile(parameters, "list", true);
50 if (listfile == "not found") { mothurOut("list is a required parameter for the get.oturep command."); mothurOutEndLine(); abort = true; }
51 else if (listfile == "not open") { abort = true; }
53 //check for optional parameter and set defaults
54 // ...at some point should added some additional type checking...
55 label = validParameter.validFile(parameters, "label", false);
56 if (label == "not found") { label = ""; }
58 if(label != "all") { splitAtDash(label, labels); allLines = 0; }
59 else { allLines = 1; }
62 //if the user has not specified any labels use the ones from read.otu
64 allLines = globaldata->allLines;
65 labels = globaldata->labels;
68 namesfile = validParameter.validFile(parameters, "name", true);
69 if (namesfile == "not open") { abort = true; }
70 else if (namesfile == "not found") { namesfile = ""; }
72 groupfile = validParameter.validFile(parameters, "group", true);
73 if (groupfile == "not open") { abort = true; }
74 else if (groupfile == "not found") { groupfile = ""; }
76 //read in group map info.
77 groupMap = new GroupMap(groupfile);
83 if(globaldata->gSparseMatrix != NULL) {
84 matrix = globaldata->gSparseMatrix;
87 //globaldata->gListVector bin 0 = first name read in distance matrix, globaldata->gListVector bin 1 = second name read in distance matrix
88 if (globaldata->gListVector != NULL) {
91 //map names to rows in sparsematrix
92 for (int i = 0; i < globaldata->gListVector->size(); i++) {
94 binnames = globaldata->gListVector->get(i);
96 splitAtComma(binnames, names);
98 for (int j = 0; j < names.size(); j++) {
99 nameToIndex[names[j]] = i;
102 } else { mothurOut("error, no listvector."); mothurOutEndLine(); }
104 // Create a data structure to quickly access the distance information.
105 // It consists of a vector of distance maps, where each map contains
106 // all distances of a certain sequence. Vector and maps are accessed
107 // via the index of a sequence in the distance matrix
108 seqVec = vector<SeqMap>(globaldata->gListVector->size());
109 for (MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++) {
110 seqVec[currentCell->row][currentCell->column] = currentCell->dist;
113 fasta = new FastaMap();
117 catch(exception& e) {
118 errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
123 //**********************************************************************************************************************
125 void GetOTURepCommand::help(){
127 mothurOut("The get.oturep command can only be executed after a successful read.dist command.\n");
128 mothurOut("The get.oturep command parameters are list, fasta, name, group and label. The fasta and list parameters are required.\n");
129 mothurOut("The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n");
130 mothurOut("The get.oturep command should be in the following format: get.oturep(fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n");
131 mothurOut("Example get.oturep(fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups, name=amazon.names).\n");
132 mothurOut("The default value for label is all labels in your inputfile.\n");
133 mothurOut("The get.oturep command outputs a .fastarep file for each distance you specify, selecting one OTU representative for each bin.\n");
134 mothurOut("If you provide a groupfile, then it also appends the names of the groups present in that bin.\n");
135 mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
137 catch(exception& e) {
138 errorOut(e, "GetOTURepCommand", "help");
143 //**********************************************************************************************************************
145 GetOTURepCommand::~GetOTURepCommand(){
146 if (abort == false) {
147 delete input; globaldata->ginput = NULL;
150 if (groupfile != "") {
151 delete groupMap; globaldata->gGroupmap = NULL;
156 //**********************************************************************************************************************
158 int GetOTURepCommand::execute(){
161 if (abort == true) { return 0; }
166 fasta->readFastaFile(fastafile);
170 //set format to list so input can get listvector
171 globaldata->setFormat("list");
173 //if user gave a namesfile then use it
174 if (namesfile != "") {
179 read = new ReadOTUFile(listfile);
180 read->read(&*globaldata);
182 input = globaldata->ginput;
183 list = globaldata->gListVector;
184 string lastLabel = list->getLabel();
186 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
187 set<string> processedLabels;
188 set<string> userLabels = labels;
190 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
192 if (allLines == 1 || labels.count(list->getLabel()) == 1){
193 mothurOut(list->getLabel() + "\t" + toString(list->size())); mothurOutEndLine();
194 error = process(list);
195 if (error == 1) { return 0; } //there is an error in hte input files, abort command
197 processedLabels.insert(list->getLabel());
198 userLabels.erase(list->getLabel());
201 if ((anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
202 string saveLabel = list->getLabel();
205 list = input->getListVector(lastLabel);
206 mothurOut(list->getLabel() + "\t" + toString(list->size())); mothurOutEndLine();
207 error = process(list);
208 if (error == 1) { return 0; } //there is an error in hte input files, abort command
210 processedLabels.insert(list->getLabel());
211 userLabels.erase(list->getLabel());
213 //restore real lastlabel to save below
214 list->setLabel(saveLabel);
217 lastLabel = list->getLabel();
220 list = input->getListVector();
223 //output error messages about any remaining user labels
224 bool needToRun = false;
225 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {
226 mothurOut("Your file does not include the label " + *it);
227 if (processedLabels.count(list->getLabel()) != 1) {
228 mothurOut(". I will use " + lastLabel + "."); mothurOutEndLine();
231 mothurOut(". Please refer to " + lastLabel + "."); mothurOutEndLine();
235 //run last label if you need to
236 if (needToRun == true) {
237 if (list != NULL) { delete list; }
238 list = input->getListVector(lastLabel);
239 mothurOut(list->getLabel() + "\t" + toString(list->size())); mothurOutEndLine();
240 error = process(list);
242 if (error == 1) { return 0; } //there is an error in hte input files, abort command
246 globaldata->gSparseMatrix = NULL;
248 globaldata->gListVector = NULL;
252 catch(exception& e) {
253 errorOut(e, "GetOTURepCommand", "execute");
258 //**********************************************************************************************************************
259 void GetOTURepCommand::readNamesFile() {
261 vector<string> dupNames;
262 openInputFile(namesfile, inNames);
264 string name, names, sequence;
267 inNames >> name; //read from first column A
268 inNames >> names; //read from second column A,B,C,D
272 //parse names into vector
273 splitAtComma(names, dupNames);
275 //store names in fasta map
276 sequence = fasta->getSequence(name);
277 for (int i = 0; i < dupNames.size(); i++) {
278 fasta->push_back(dupNames[i], sequence);
286 catch(exception& e) {
287 errorOut(e, "GetOTURepCommand", "readNamesFile");
291 //**********************************************************************************************************************
292 string GetOTURepCommand::findRep(int bin, string& group, ListVector* thisList, int& binsize) {
294 vector<string> names;
295 map<string, string> groups;
296 map<string, string>::iterator groupIt;
298 //parse names into vector
299 string binnames = thisList->get(bin);
300 splitAtComma(binnames, names);
301 binsize = names.size();
303 //if you have a groupfile
304 if (groupfile != "") {
305 //find the groups that are in this bin
306 for (size_t i = 0; i < names.size(); i++) {
307 string groupName = groupMap->getGroup(names[i]);
308 if (groupName == "not found") {
309 mothurOut(names[i] + " is missing from your group file. Please correct. "); mothurOutEndLine();
312 groups[groupName] = groupName;
316 //turn the groups into a string
317 for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) {
318 group += groupIt->first + "-";
321 group = group.substr(0, group.length()-1);
324 // if only 1 sequence in bin or processing the "unique" label, then
325 // the first sequence of the OTU is the representative one
326 if ((names.size() == 1) || (list->getLabel() == "unique")) {
329 vector<int> seqIndex(names.size());
330 vector<float> max_dist(names.size());
332 //fill seqIndex and initialize sums
333 for (size_t i = 0; i < names.size(); i++) {
334 seqIndex[i] = nameToIndex[names[i]];
338 // loop through all entries in seqIndex
341 for (size_t i=0; i < seqIndex.size(); i++) {
342 currMap = seqVec[seqIndex[i]];
343 for (size_t j=0; j < seqIndex.size(); j++) {
344 it = currMap.find(seqIndex[j]);
345 if (it != currMap.end()) {
346 max_dist[i] = max(max_dist[i], it->second);
347 max_dist[j] = max(max_dist[j], it->second);
352 // sequence with the smallest maximum distance is the representative
355 for (size_t i=0; i < max_dist.size(); i++) {
356 if (max_dist[i] < min) {
362 return(names[minIndex]);
365 catch(exception& e) {
366 errorOut(e, "GetOTURepCommand", "FindRep");
371 //**********************************************************************************************************************
372 int GetOTURepCommand::process(ListVector* processList) {
374 string nameRep, name, sequence;
377 string outputFileName = getRootName(listfile) + processList->getLabel() + ".rep.fasta";
378 openOutputFile(outputFileName, out);
380 //for each bin in the list vector
381 for (int i = 0; i < processList->size(); i++) {
384 nameRep = findRep(i, groups, processList, binsize);
386 //print out name and sequence for that bin
387 sequence = fasta->getSequence(nameRep);
389 if (sequence != "not found") {
390 nameRep = nameRep + "|" + toString(i+1);
391 nameRep = nameRep + "|" + toString(binsize);
392 if (groupfile != "") {
393 nameRep = nameRep + "|" + groups;
395 out << ">" << nameRep << endl;
396 out << sequence << endl;
398 mothurOut(nameRep + " is missing from your fasta or name file. Please correct. "); mothurOutEndLine();
399 remove(outputFileName.c_str());
408 catch(exception& e) {
409 errorOut(e, "GetOTURepCommand", "process");
414 //**********************************************************************************************************************