5 * Created by Sarah Westcott on 4/6/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
10 #include "getoturepcommand.h"
12 //********************************************************************************************************************
13 //sorts lowest to highest
14 inline bool compareName(repStruct left, repStruct right){
15 return (left.name < right.name);
17 //********************************************************************************************************************
18 //sorts lowest to highest
19 inline bool compareBin(repStruct left, repStruct right){
20 return (left.bin < right.bin);
22 //********************************************************************************************************************
23 //sorts lowest to highest
24 inline bool compareSize(repStruct left, repStruct right){
25 return (left.size < right.size);
27 //********************************************************************************************************************
28 //sorts lowest to highest
29 inline bool compareGroup(repStruct left, repStruct right){
30 return (left.group < right.group);
32 //**********************************************************************************************************************
33 GetOTURepCommand::GetOTURepCommand(string option){
35 globaldata = GlobalData::getInstance();
40 //allow user to run help
41 if (option == "help") {
44 //valid paramters for this command
45 string Array[] = {"fasta","list","label","name", "group", "sorted"};
46 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
48 OptionParser parser(option);
49 map<string, string> parameters = parser.getParameters();
51 ValidParameters validParameter;
53 //check to make sure all parameters are valid for command
54 for (map<string, string>::iterator it = parameters.begin(); it != parameters.end(); it++) {
55 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
58 //make sure the user has already run the read.otu command
59 if ((globaldata->gSparseMatrix == NULL) || (globaldata->gListVector == NULL)) {
60 mothurOut("Before you use the get.oturep command, you first need to read in a distance matrix."); mothurOutEndLine();
64 //check for required parameters
65 fastafile = validParameter.validFile(parameters, "fasta", true);
66 if (fastafile == "not found") { mothurOut("fasta is a required parameter for the get.oturep command."); mothurOutEndLine(); abort = true; }
67 else if (fastafile == "not open") { abort = true; }
69 listfile = validParameter.validFile(parameters, "list", true);
70 if (listfile == "not found") { mothurOut("list is a required parameter for the get.oturep command."); mothurOutEndLine(); abort = true; }
71 else if (listfile == "not open") { abort = true; }
73 //check for optional parameter and set defaults
74 // ...at some point should added some additional type checking...
75 label = validParameter.validFile(parameters, "label", false);
76 if (label == "not found") { label = ""; }
78 if(label != "all") { splitAtDash(label, labels); allLines = 0; }
79 else { allLines = 1; }
82 //if the user has not specified any labels use the ones from read.otu
84 allLines = globaldata->allLines;
85 labels = globaldata->labels;
88 namesfile = validParameter.validFile(parameters, "name", true);
89 if (namesfile == "not open") { abort = true; }
90 else if (namesfile == "not found") { namesfile = ""; }
92 groupfile = validParameter.validFile(parameters, "group", true);
93 if (groupfile == "not open") { groupfile = ""; abort = true; }
94 else if (groupfile == "not found") { groupfile = ""; }
96 //read in group map info.
97 groupMap = new GroupMap(groupfile);
101 sorted = validParameter.validFile(parameters, "sorted", false); if (sorted == "not found"){ sorted = ""; }
102 if ((sorted != "") && (sorted != "name") && (sorted != "bin") && (sorted != "size") && (sorted != "group")) {
103 mothurOut(sorted + " is not a valid option for the sorted parameter. The only options are: name, bin, size and group. I will not sort."); mothurOutEndLine();
107 if ((sorted == "group") && (groupfile == "")) {
108 mothurOut("You must provide a groupfile to sort by group. I will not sort."); mothurOutEndLine();
112 if (abort == false) {
114 if(globaldata->gSparseMatrix != NULL) {
115 matrix = globaldata->gSparseMatrix;
118 //globaldata->gListVector bin 0 = first name read in distance matrix, globaldata->gListVector bin 1 = second name read in distance matrix
119 if (globaldata->gListVector != NULL) {
120 vector<string> names;
122 //map names to rows in sparsematrix
123 for (int i = 0; i < globaldata->gListVector->size(); i++) {
125 binnames = globaldata->gListVector->get(i);
127 splitAtComma(binnames, names);
129 for (int j = 0; j < names.size(); j++) {
130 nameToIndex[names[j]] = i;
133 } else { mothurOut("error, no listvector."); mothurOutEndLine(); }
135 // Create a data structure to quickly access the distance information.
136 // It consists of a vector of distance maps, where each map contains
137 // all distances of a certain sequence. Vector and maps are accessed
138 // via the index of a sequence in the distance matrix
139 seqVec = vector<SeqMap>(globaldata->gListVector->size());
140 for (MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++) {
141 seqVec[currentCell->row][currentCell->column] = currentCell->dist;
144 fasta = new FastaMap();
148 catch(exception& e) {
149 errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
154 //**********************************************************************************************************************
156 void GetOTURepCommand::help(){
158 mothurOut("The get.oturep command can only be executed after a successful read.dist command.\n");
159 mothurOut("The get.oturep command parameters are list, fasta, name, group, sorted and label. The fasta and list parameters are required.\n");
160 mothurOut("The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n");
161 mothurOut("The get.oturep command should be in the following format: get.oturep(fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n");
162 mothurOut("Example get.oturep(fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups, name=amazon.names).\n");
163 mothurOut("The default value for label is all labels in your inputfile.\n");
164 mothurOut("The sorted parameter allows you to indicate you want the output sorted. You can sort by sequence name, bin number, bin size or group. The default is no sorting, but your options are name, number, size, or group.\n");
165 mothurOut("The get.oturep command outputs a .fastarep file for each distance you specify, selecting one OTU representative for each bin.\n");
166 mothurOut("If you provide a groupfile, then it also appends the names of the groups present in that bin.\n");
167 mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
169 catch(exception& e) {
170 errorOut(e, "GetOTURepCommand", "help");
175 //**********************************************************************************************************************
177 GetOTURepCommand::~GetOTURepCommand(){
178 if (abort == false) {
179 delete input; globaldata->ginput = NULL;
182 if (groupfile != "") {
183 delete groupMap; globaldata->gGroupmap = NULL;
188 //**********************************************************************************************************************
190 int GetOTURepCommand::execute(){
193 if (abort == true) { return 0; }
198 fasta->readFastaFile(fastafile);
202 //set format to list so input can get listvector
203 globaldata->setFormat("list");
205 //if user gave a namesfile then use it
206 if (namesfile != "") {
211 read = new ReadOTUFile(listfile);
212 read->read(&*globaldata);
214 input = globaldata->ginput;
215 list = globaldata->gListVector;
216 string lastLabel = list->getLabel();
218 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
219 set<string> processedLabels;
220 set<string> userLabels = labels;
222 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
224 if (allLines == 1 || labels.count(list->getLabel()) == 1){
225 mothurOut(list->getLabel() + "\t" + toString(list->size())); mothurOutEndLine();
226 error = process(list);
227 if (error == 1) { return 0; } //there is an error in hte input files, abort command
229 processedLabels.insert(list->getLabel());
230 userLabels.erase(list->getLabel());
233 if ((anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
234 string saveLabel = list->getLabel();
237 list = input->getListVector(lastLabel);
238 mothurOut(list->getLabel() + "\t" + toString(list->size())); mothurOutEndLine();
239 error = process(list);
240 if (error == 1) { return 0; } //there is an error in hte input files, abort command
242 processedLabels.insert(list->getLabel());
243 userLabels.erase(list->getLabel());
245 //restore real lastlabel to save below
246 list->setLabel(saveLabel);
249 lastLabel = list->getLabel();
252 list = input->getListVector();
255 //output error messages about any remaining user labels
256 bool needToRun = false;
257 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {
258 mothurOut("Your file does not include the label " + *it);
259 if (processedLabels.count(list->getLabel()) != 1) {
260 mothurOut(". I will use " + lastLabel + "."); mothurOutEndLine();
263 mothurOut(". Please refer to " + lastLabel + "."); mothurOutEndLine();
267 //run last label if you need to
268 if (needToRun == true) {
269 if (list != NULL) { delete list; }
270 list = input->getListVector(lastLabel);
271 mothurOut(list->getLabel() + "\t" + toString(list->size())); mothurOutEndLine();
272 error = process(list);
274 if (error == 1) { return 0; } //there is an error in hte input files, abort command
278 globaldata->gSparseMatrix = NULL;
280 globaldata->gListVector = NULL;
284 catch(exception& e) {
285 errorOut(e, "GetOTURepCommand", "execute");
290 //**********************************************************************************************************************
291 void GetOTURepCommand::readNamesFile() {
293 vector<string> dupNames;
294 openInputFile(namesfile, inNames);
296 string name, names, sequence;
299 inNames >> name; //read from first column A
300 inNames >> names; //read from second column A,B,C,D
304 //parse names into vector
305 splitAtComma(names, dupNames);
307 //store names in fasta map
308 sequence = fasta->getSequence(name);
309 for (int i = 0; i < dupNames.size(); i++) {
310 fasta->push_back(dupNames[i], sequence);
318 catch(exception& e) {
319 errorOut(e, "GetOTURepCommand", "readNamesFile");
323 //**********************************************************************************************************************
324 string GetOTURepCommand::findRep(int bin, string& group, ListVector* thisList, int& binsize) {
326 vector<string> names;
327 map<string, string> groups;
328 map<string, string>::iterator groupIt;
330 //parse names into vector
331 string binnames = thisList->get(bin);
332 splitAtComma(binnames, names);
333 binsize = names.size();
335 //if you have a groupfile
336 if (groupfile != "") {
337 //find the groups that are in this bin
338 for (size_t i = 0; i < names.size(); i++) {
339 string groupName = groupMap->getGroup(names[i]);
340 if (groupName == "not found") {
341 mothurOut(names[i] + " is missing from your group file. Please correct. "); mothurOutEndLine();
344 groups[groupName] = groupName;
348 //turn the groups into a string
349 for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) {
350 group += groupIt->first + "-";
353 group = group.substr(0, group.length()-1);
356 // if only 1 sequence in bin or processing the "unique" label, then
357 // the first sequence of the OTU is the representative one
358 if ((names.size() == 1) || (list->getLabel() == "unique")) {
361 vector<int> seqIndex(names.size());
362 vector<float> max_dist(names.size());
364 //fill seqIndex and initialize sums
365 for (size_t i = 0; i < names.size(); i++) {
366 seqIndex[i] = nameToIndex[names[i]];
370 // loop through all entries in seqIndex
373 for (size_t i=0; i < seqIndex.size(); i++) {
374 currMap = seqVec[seqIndex[i]];
375 for (size_t j=0; j < seqIndex.size(); j++) {
376 it = currMap.find(seqIndex[j]);
377 if (it != currMap.end()) {
378 max_dist[i] = max(max_dist[i], it->second);
379 max_dist[j] = max(max_dist[j], it->second);
384 // sequence with the smallest maximum distance is the representative
387 for (size_t i=0; i < max_dist.size(); i++) {
388 if (max_dist[i] < min) {
394 return(names[minIndex]);
397 catch(exception& e) {
398 errorOut(e, "GetOTURepCommand", "FindRep");
403 //**********************************************************************************************************************
404 int GetOTURepCommand::process(ListVector* processList) {
406 string nameRep, name, sequence;
409 string outputFileName = getRootName(listfile) + processList->getLabel() + ".rep.fasta";
410 openOutputFile(outputFileName, out);
411 vector<repStruct> reps;
413 //for each bin in the list vector
414 for (int i = 0; i < processList->size(); i++) {
417 nameRep = findRep(i, groups, processList, binsize);
419 //print out name and sequence for that bin
420 sequence = fasta->getSequence(nameRep);
422 if (sequence != "not found") {
423 if (sorted == "") { //print them out
424 nameRep = nameRep + "|" + toString(i+1);
425 nameRep = nameRep + "|" + toString(binsize);
426 if (groupfile != "") {
427 nameRep = nameRep + "|" + groups;
429 out << ">" << nameRep << endl;
430 out << sequence << endl;
432 repStruct newRep(nameRep, i+1, binsize, groups);
433 reps.push_back(newRep);
436 mothurOut(nameRep + " is missing from your fasta or name file. Please correct. "); mothurOutEndLine();
437 remove(outputFileName.c_str());
442 if (sorted != "") { //then sort them and print them
443 if (sorted == "name") { sort(reps.begin(), reps.end(), compareName); }
444 else if (sorted == "bin") { sort(reps.begin(), reps.end(), compareBin); }
445 else if (sorted == "size") { sort(reps.begin(), reps.end(), compareSize); }
446 else if (sorted == "group") { sort(reps.begin(), reps.end(), compareGroup); }
449 for (int i = 0; i < reps.size(); i++) {
450 string sequence = fasta->getSequence(reps[i].name);
451 string outputName = reps[i].name + "|" + toString(reps[i].bin);
452 outputName = outputName + "|" + toString(reps[i].size);
453 if (groupfile != "") {
454 outputName = outputName + "|" + reps[i].group;
456 out << ">" << outputName << endl;
457 out << sequence << endl;
465 catch(exception& e) {
466 errorOut(e, "GetOTURepCommand", "process");
471 //**********************************************************************************************************************