5 * Created by Sarah Westcott on 4/6/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
10 #include "getoturepcommand.h"
12 //**********************************************************************************************************************
13 GetOTURepCommand::GetOTURepCommand(string option){
15 globaldata = GlobalData::getInstance();
20 //allow user to run help
21 if (option == "help") {
24 //valid paramters for this command
25 string Array[] = {"fasta","list","label","name", "group"};
26 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
28 OptionParser parser(option);
29 map<string, string> parameters = parser.getParameters();
31 ValidParameters validParameter;
33 //check to make sure all parameters are valid for command
34 for (map<string, string>::iterator it = parameters.begin(); it != parameters.end(); it++) {
35 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
38 //make sure the user has already run the read.otu command
39 if ((globaldata->gSparseMatrix == NULL) || (globaldata->gListVector == NULL)) {
40 mothurOut("Before you use the get.oturep command, you first need to read in a distance matrix."); mothurOutEndLine();
44 //check for required parameters
45 fastafile = validParameter.validFile(parameters, "fasta", true);
46 if (fastafile == "not found") { mothurOut("fasta is a required parameter for the get.oturep command."); mothurOutEndLine(); abort = true; }
47 else if (fastafile == "not open") { abort = true; }
49 listfile = validParameter.validFile(parameters, "list", true);
50 if (listfile == "not found") { mothurOut("list is a required parameter for the get.oturep command."); mothurOutEndLine(); abort = true; }
51 else if (listfile == "not open") { abort = true; }
53 //check for optional parameter and set defaults
54 // ...at some point should added some additional type checking...
55 label = validParameter.validFile(parameters, "label", false);
56 if (label == "not found") { label = ""; }
58 if(label != "all") { splitAtDash(label, labels); allLines = 0; }
59 else { allLines = 1; }
62 //if the user has not specified any labels use the ones from read.otu
64 allLines = globaldata->allLines;
65 labels = globaldata->labels;
68 namesfile = validParameter.validFile(parameters, "name", true);
69 if (namesfile == "not open") { abort = true; }
70 else if (namesfile == "not found") { namesfile = ""; }
72 groupfile = validParameter.validFile(parameters, "group", true);
73 if (groupfile == "not open") { abort = true; }
74 else if (groupfile == "not found") { groupfile = ""; }
76 //read in group map info.
77 groupMap = new GroupMap(groupfile);
83 if(globaldata->gSparseMatrix != NULL) {
84 matrix = globaldata->gSparseMatrix;
87 //globaldata->gListVector bin 0 = first name read in distance matrix, globaldata->gListVector bin 1 = second name read in distance matrix
88 if (globaldata->gListVector != NULL) {
91 //map names to rows in sparsematrix
92 for (int i = 0; i < globaldata->gListVector->size(); i++) {
94 binnames = globaldata->gListVector->get(i);
96 splitAtComma(binnames, names);
98 for (int j = 0; j < names.size(); j++) {
99 nameToIndex[names[j]] = i;
102 } else { mothurOut("error, no listvector."); mothurOutEndLine(); }
104 // Create a data structure to quickly access the distance information.
105 // It consists of a vector of distance maps, where each map contains
106 // all distances of a certain sequence. Vector and maps are accessed
107 // via the index of a sequence in the distance matrix
108 seqVec = vector<SeqMap>(globaldata->gListVector->size());
109 for (MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++) {
110 seqVec[currentCell->row][currentCell->column] = currentCell->dist;
113 fasta = new FastaMap();
117 catch(exception& e) {
118 errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
123 //**********************************************************************************************************************
125 void GetOTURepCommand::help(){
127 mothurOut("The get.oturep command can only be executed after a successful read.dist command.\n");
128 mothurOut("The get.oturep command parameters are list, fasta, name, group and label. The fasta and list parameters are required.\n");
129 mothurOut("The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n");
130 mothurOut("The get.oturep command should be in the following format: get.oturep(fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n");
131 mothurOut("Example get.oturep(fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups, name=amazon.names).\n");
132 mothurOut("The default value for label is all labels in your inputfile.\n");
133 mothurOut("The get.oturep command outputs a .fastarep file for each distance you specify, selecting one OTU representative for each bin.\n");
134 mothurOut("If you provide a groupfile, then it also appends the names of the groups present in that bin.\n");
135 mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
137 catch(exception& e) {
138 errorOut(e, "GetOTURepCommand", "help");
143 //**********************************************************************************************************************
145 GetOTURepCommand::~GetOTURepCommand(){
146 if (abort == false) {
147 delete input; globaldata->ginput = NULL;
150 if (groupfile != "") {
151 delete groupMap; globaldata->gGroupmap = NULL;
156 //**********************************************************************************************************************
158 int GetOTURepCommand::execute(){
161 if (abort == true) { return 0; }
166 fasta->readFastaFile(fastafile);
170 //set format to list so input can get listvector
171 globaldata->setFormat("list");
173 //if user gave a namesfile then use it
174 if (namesfile != "") {
179 read = new ReadOTUFile(listfile);
180 read->read(&*globaldata);
182 input = globaldata->ginput;
183 list = globaldata->gListVector;
184 string lastLabel = list->getLabel();
186 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
187 set<string> processedLabels;
188 set<string> userLabels = labels;
190 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
192 if (allLines == 1 || labels.count(list->getLabel()) == 1){
193 mothurOut(list->getLabel() + "\t" + toString(list->size())); mothurOutEndLine();
194 error = process(list);
195 if (error == 1) { return 0; } //there is an error in hte input files, abort command
197 processedLabels.insert(list->getLabel());
198 userLabels.erase(list->getLabel());
201 if ((anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
203 list = input->getListVector(lastLabel);
204 mothurOut(list->getLabel() + "\t" + toString(list->size())); mothurOutEndLine();
205 error = process(list);
206 if (error == 1) { return 0; } //there is an error in hte input files, abort command
208 processedLabels.insert(list->getLabel());
209 userLabels.erase(list->getLabel());
212 lastLabel = list->getLabel();
215 list = input->getListVector();
218 //output error messages about any remaining user labels
219 bool needToRun = false;
220 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {
221 mothurOut("Your file does not include the label " + *it);
222 if (processedLabels.count(list->getLabel()) != 1) {
223 mothurOut(". I will use " + lastLabel + "."); mothurOutEndLine();
226 mothurOut(". Please refer to " + lastLabel + "."); mothurOutEndLine();
230 //run last label if you need to
231 if (needToRun == true) {
232 if (list != NULL) { delete list; }
233 list = input->getListVector(lastLabel);
234 mothurOut(list->getLabel() + "\t" + toString(list->size())); mothurOutEndLine();
235 error = process(list);
237 if (error == 1) { return 0; } //there is an error in hte input files, abort command
241 globaldata->gSparseMatrix = NULL;
243 globaldata->gListVector = NULL;
247 catch(exception& e) {
248 errorOut(e, "GetOTURepCommand", "execute");
253 //**********************************************************************************************************************
254 void GetOTURepCommand::readNamesFile() {
256 vector<string> dupNames;
257 openInputFile(namesfile, inNames);
259 string name, names, sequence;
262 inNames >> name; //read from first column A
263 inNames >> names; //read from second column A,B,C,D
267 //parse names into vector
268 splitAtComma(names, dupNames);
270 //store names in fasta map
271 sequence = fasta->getSequence(name);
272 for (int i = 0; i < dupNames.size(); i++) {
273 fasta->push_back(dupNames[i], sequence);
281 catch(exception& e) {
282 errorOut(e, "GetOTURepCommand", "readNamesFile");
286 //**********************************************************************************************************************
287 string GetOTURepCommand::findRep(int bin, string& group, ListVector* thisList, int& binsize) {
289 vector<string> names;
290 map<string, string> groups;
291 map<string, string>::iterator groupIt;
293 //parse names into vector
294 string binnames = thisList->get(bin);
295 splitAtComma(binnames, names);
296 binsize = names.size();
298 //if you have a groupfile
299 if (groupfile != "") {
300 //find the groups that are in this bin
301 for (size_t i = 0; i < names.size(); i++) {
302 string groupName = groupMap->getGroup(names[i]);
303 if (groupName == "not found") {
304 mothurOut(names[i] + " is missing from your group file. Please correct. "); mothurOutEndLine();
307 groups[groupName] = groupName;
311 //turn the groups into a string
312 for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) {
313 group += groupIt->first + "-";
316 group = group.substr(0, group.length()-1);
319 // if only 1 sequence in bin or processing the "unique" label, then
320 // the first sequence of the OTU is the representative one
321 if ((names.size() == 1) || (list->getLabel() == "unique")) {
324 vector<int> seqIndex(names.size());
325 vector<float> max_dist(names.size());
327 //fill seqIndex and initialize sums
328 for (size_t i = 0; i < names.size(); i++) {
329 seqIndex[i] = nameToIndex[names[i]];
333 // loop through all entries in seqIndex
336 for (size_t i=0; i < seqIndex.size(); i++) {
337 currMap = seqVec[seqIndex[i]];
338 for (size_t j=0; j < seqIndex.size(); j++) {
339 it = currMap.find(seqIndex[j]);
340 if (it != currMap.end()) {
341 max_dist[i] = max(max_dist[i], it->second);
342 max_dist[j] = max(max_dist[j], it->second);
347 // sequence with the smallest maximum distance is the representative
350 for (size_t i=0; i < max_dist.size(); i++) {
351 if (max_dist[i] < min) {
357 return(names[minIndex]);
360 catch(exception& e) {
361 errorOut(e, "GetOTURepCommand", "FindRep");
366 //**********************************************************************************************************************
367 int GetOTURepCommand::process(ListVector* processList) {
369 string nameRep, name, sequence;
372 string outputFileName = getRootName(listfile) + processList->getLabel() + ".rep.fasta";
373 openOutputFile(outputFileName, out);
375 //for each bin in the list vector
376 for (int i = 0; i < processList->size(); i++) {
379 nameRep = findRep(i, groups, processList, binsize);
381 //print out name and sequence for that bin
382 sequence = fasta->getSequence(nameRep);
384 if (sequence != "not found") {
385 nameRep = nameRep + "|" + toString(i+1);
386 nameRep = nameRep + "|" + toString(binsize);
387 if (groupfile != "") {
388 nameRep = nameRep + "|" + groups;
390 out << ">" << nameRep << endl;
391 out << sequence << endl;
393 mothurOut(nameRep + " is missing from your fasta or name file. Please correct. "); mothurOutEndLine();
394 remove(outputFileName.c_str());
403 catch(exception& e) {
404 errorOut(e, "GetOTURepCommand", "process");
409 //**********************************************************************************************************************