5 * Created by Sarah Westcott on 4/6/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
10 #include "getoturepcommand.h"
12 //**********************************************************************************************************************
13 GetOTURepCommand::GetOTURepCommand(string option){
15 globaldata = GlobalData::getInstance();
21 //allow user to run help
22 if(option == "help") { help(); abort = true; }
25 //valid paramters for this command
26 string Array[] = {"fasta","list","line","label","name", "group"};
27 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
29 parser = new OptionParser();
30 parser->parse(option, parameters); delete parser;
32 ValidParameters* validParameter = new ValidParameters();
34 //check to make sure all parameters are valid for command
35 for (it4 = parameters.begin(); it4 != parameters.end(); it4++) {
36 if (validParameter->isValidParameter(it4->first, myArray, it4->second) != true) { abort = true; }
39 //make sure the user has already run the read.otu command
40 if ((globaldata->gSparseMatrix == NULL) || (globaldata->gListVector == NULL)) {
41 cout << "Before you use the get.oturep command, you first need to read in a distance matrix." << endl;
45 //check for required parameters
46 fastafile = validParameter->validFile(parameters, "fasta", true);
47 if (fastafile == "not found") { cout << "fasta is a required parameter for the get.oturep command." << endl; abort = true; }
48 else if (fastafile == "not open") { abort = true; }
50 globaldata->setFastaFile(fastafile);
53 listfile = validParameter->validFile(parameters, "list", true);
54 if (listfile == "not found") { cout << "list is a required parameter for the get.oturep command." << endl; abort = true; }
55 else if (listfile == "not open") { abort = true; }
57 globaldata->setListFile(listfile);
60 //check for optional parameter and set defaults
61 // ...at some point should added some additional type checking...
62 line = validParameter->validFile(parameters, "line", false);
63 if (line == "not found") { line = ""; }
65 if(line != "all") { splitAtDash(line, lines); allLines = 0; }
66 else { allLines = 1; }
69 label = validParameter->validFile(parameters, "label", false);
70 if (label == "not found") { label = ""; }
72 if(label != "all") { splitAtDash(label, labels); allLines = 0; }
73 else { allLines = 1; }
76 //make sure user did not use both the line and label parameters
77 if ((line != "") && (label != "")) { cout << "You cannot use both the line and label parameters at the same time. " << endl; abort = true; }
78 //if the user has not specified any line or labels use the ones from read.otu
79 else if ((line == "") && (label == "")) {
80 allLines = globaldata->allLines;
81 labels = globaldata->labels;
82 lines = globaldata->lines;
85 namesfile = validParameter->validFile(parameters, "name", true);
86 if (namesfile == "not open") { abort = true; }
87 else if (namesfile == "not found") { namesfile = ""; }
89 groupfile = validParameter->validFile(parameters, "group", true);
90 if (groupfile == "not open") { abort = true; }
91 else if (groupfile == "not found") { groupfile = ""; }
93 //read in group map info.
94 groupMap = new GroupMap(groupfile);
98 delete validParameter;
100 if (abort == false) {
102 if(globaldata->gSparseMatrix != NULL) { matrix = new SparseMatrix(*globaldata->gSparseMatrix); }
104 //globaldata->gListVector bin 0 = first name read in distance matrix, globaldata->gListVector bin 1 = second name read in distance matrix
105 if(globaldata->gListVector != NULL) {
107 vector<string> names;
109 //map names to rows in sparsematrix
110 for (int i = 0; i < globaldata->gListVector->size(); i++) {
112 binnames = globaldata->gListVector->get(i);
114 splitAtComma(binnames, names);
116 for (int j = 0; j < names.size(); j++) {
117 nameToIndex[names[j]] = i;
121 }else { cout << "error, no listvector." << endl; }
123 openInputFile(fastafile, in);
124 fasta = new FastaMap();
130 catch(exception& e) {
131 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function GetOTURepCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
135 cout << "An unknown error has occurred in the GetOTURepCommand class function GetOTURepCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
140 //**********************************************************************************************************************
142 void GetOTURepCommand::help(){
144 cout << "The get.oturep command can only be executed after a successful read.dist command." << "\n";
145 cout << "The get.oturep command parameters are list, fasta, name, group, line and label. The fasta and list parameters are required, and you may not use line and label at the same time." << "\n";
146 cout << "The line and label allow you to select what distance levels you would like a output files created for, and are separated by dashes." << "\n";
147 cout << "The get.oturep command should be in the following format: get.oturep(fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, line=yourLines, label=yourLabels)." << "\n";
148 cout << "Example get.oturep(fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups, line=1-3-5, name=amazon.names)." << "\n";
149 cout << "The default value for line and label are all lines in your inputfile." << "\n";
150 cout << "The get.oturep command outputs a .fastarep file for each distance you specify, selecting one OTU representative for each bin." << "\n";
151 cout << "If you provide a groupfile, then it also appends the names of the groups present in that bin." << "\n";
152 cout << "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile)." << "\n" << "\n";
154 catch(exception& e) {
155 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function help. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
159 cout << "An unknown error has occurred in the GetOTURepCommand class function help. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
164 //**********************************************************************************************************************
166 GetOTURepCommand::~GetOTURepCommand(){
170 if (groupfile != "") {
175 //**********************************************************************************************************************
177 int GetOTURepCommand::execute(){
180 if (abort == true) { return 0; }
186 fasta->readFastaFile(in);
188 //set format to list so input can get listvector
189 globaldata->setFormat("list");
191 //if user gave a namesfile then use it
192 if (namesfile != "") {
197 read = new ReadOTUFile(listfile);
198 read->read(&*globaldata);
200 input = globaldata->ginput;
201 list = globaldata->gListVector;
202 ListVector* lastList = list;
204 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
205 set<string> processedLabels;
206 set<string> userLabels = labels;
207 set<int> userLines = lines;
210 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0) || (userLines.size() != 0))) {
212 if(allLines == 1 || lines.count(count) == 1 || labels.count(list->getLabel()) == 1){
213 cout << list->getLabel() << '\t' << count << endl;
214 error = process(list);
215 if (error == 1) { return 0; } //there is an error in hte input files, abort command
217 processedLabels.insert(list->getLabel());
218 userLabels.erase(list->getLabel());
219 userLines.erase(count);
222 if ((anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastList->getLabel()) != 1)) {
223 cout << lastList->getLabel() << '\t' << count << endl;
224 error = process(lastList);
225 if (error == 1) { return 0; } //there is an error in hte input files, abort command
227 processedLabels.insert(lastList->getLabel());
228 userLabels.erase(lastList->getLabel());
231 if (count != 1) { delete lastList; }
234 list = input->getListVector();
238 //output error messages about any remaining user labels
239 set<string>::iterator it;
240 bool needToRun = false;
241 for (it = userLabels.begin(); it != userLabels.end(); it++) {
242 cout << "Your file does not include the label "<< *it;
243 if (processedLabels.count(lastList->getLabel()) != 1) {
244 cout << ". I will use " << lastList->getLabel() << "." << endl;
247 cout << ". Please refer to " << lastList->getLabel() << "." << endl;
251 //run last line if you need to
252 if (needToRun == true) {
253 cout << lastList->getLabel() << '\t' << count << endl;
254 error = process(lastList);
255 if (error == 1) { return 0; } //there is an error in hte input files, abort command
260 globaldata->gSparseMatrix = NULL;
262 globaldata->gListVector = NULL;
266 catch(exception& e) {
267 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
271 cout << "An unknown error has occurred in the GetOTURepCommand class function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
277 //**********************************************************************************************************************
278 void GetOTURepCommand::readNamesFile() {
280 vector<string> dupNames;
281 openInputFile(namesfile, inNames);
283 string name, names, sequence;
286 inNames >> name; //read from first column A
287 inNames >> names; //read from second column A,B,C,D
291 //parse names into vector
292 splitAtComma(names, dupNames);
294 //store names in fasta map
295 sequence = fasta->getSequence(name);
296 for (int i = 0; i < dupNames.size(); i++) {
297 fasta->push_back(dupNames[i], sequence);
305 catch(exception& e) {
306 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function readNamesFile. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
310 cout << "An unknown error has occurred in the GetOTURepCommand class function readNamesFile. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
314 //**********************************************************************************************************************
315 string GetOTURepCommand::FindRep(int bin, string& group, ListVector* thisList) {
317 vector<string> names;
318 map<string, float> sums;
319 map<string, float>::iterator it4;
320 map<int, string> binMap; //subset of namesToIndex - just member of this bin
324 map<string, string> groups;
325 map<string, string>::iterator groupIt;
327 binnames = thisList->get(bin);
329 //parse names into vector
330 splitAtComma(binnames, names);
332 //if you have a groupfile
333 if(groupfile != "") {
334 //find the groups that are in this bin
335 for (int i = 0; i < names.size(); i++) {
336 string groupName = groupMap->getGroup(names[i]);
337 if (groupName == "not found") {
338 cout << names[i] << " is missing from your group file. Please correct. " << endl;
341 groups[groupName] = groupName;
345 //turn the groups into a string
346 for(groupIt = groups.begin(); groupIt != groups.end(); groupIt++) { group += groupIt->first + "-"; }
349 group = group.substr(0, group.length()-1);
352 //if only 1 sequence in bin then that's the rep
353 if (names.size() == 1) { return names[0]; }
356 for (int i = 0; i < names.size(); i++) {
357 for (it3 = nameToIndex.begin(); it3 != nameToIndex.end(); it3++) {
359 if (it3->first == names[i]) {
360 binMap[it3->second] = it3->first;
362 //initialize sums map
363 sums[it3->first] = 0.0;
369 //go through each cell in the sparsematrix
370 for(MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++){
371 //is this a distance between 2 members of this bin
372 it = binMap.find(currentCell->row);
373 it2 = binMap.find(currentCell->column);
375 //sum the distance of the sequences in the bin to eachother
376 if ((it != binMap.end()) && (it2 != binMap.end())) {
377 //this is a cell that repesents the distance between to of this bins members
378 sums[it->second] += currentCell->dist;
379 sums[it2->second] += currentCell->dist;
383 //smallest sum is the representative
384 for (it4 = sums.begin(); it4 != sums.end(); it4++) {
385 if (it4->second < min) {
387 minName = it4->first;
396 catch(exception& e) {
397 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function FindRep. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
401 cout << "An unknown error has occurred in the GetOTURepCommand class function FindRep. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
406 //**********************************************************************************************************************
407 int GetOTURepCommand::process(ListVector* processList) {
409 string nameRep, name, sequence;
412 string outputFileName = getRootName(listfile) + processList->getLabel() + ".rep.fasta";
413 openOutputFile(outputFileName, out);
415 //for each bin in the list vector
416 for (int i = 0; i < processList->size(); i++) {
418 nameRep = FindRep(i, groups, processList);
420 //print out name and sequence for that bin
421 sequence = fasta->getSequence(nameRep);
423 if (sequence != "not found") {
424 if (groupfile == "") {
425 nameRep = nameRep + "|" + toString(i+1);
426 out << ">" << nameRep << endl;
427 out << sequence << endl;
429 nameRep = nameRep + "|" + groups + "|" + toString(i+1);
430 out << ">" << nameRep << endl;
431 out << sequence << endl;
434 cout << nameRep << " is missing from your fasta or name file. Please correct. " << endl;
435 remove(outputFileName.c_str());
444 catch(exception& e) {
445 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function process. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
449 cout << "An unknown error has occurred in the GetOTURepCommand class function process. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
454 //**********************************************************************************************************************