5 * Created by Sarah Westcott on 4/6/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
10 #include "getoturepcommand.h"
12 //**********************************************************************************************************************
13 GetOTURepCommand::GetOTURepCommand(string option){
15 globaldata = GlobalData::getInstance();
21 //allow user to run help
22 if(option == "help") { help(); abort = true; }
25 //valid paramters for this command
26 string Array[] = {"fasta","list","line","label","name", "group"};
27 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
29 OptionParser parser(option);
30 map<string, string> parameters = parser.getParameters();
32 ValidParameters validParameter;
34 //check to make sure all parameters are valid for command
35 for (map<string, string>::iterator it = parameters.begin(); it != parameters.end(); it++) {
36 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
39 //make sure the user has already run the read.otu command
40 if ((globaldata->gSparseMatrix == NULL) || (globaldata->gListVector == NULL)) {
41 cout << "Before you use the get.oturep command, you first need to read in a distance matrix." << endl;
45 //check for required parameters
46 fastafile = validParameter.validFile(parameters, "fasta", true);
47 if (fastafile == "not found") { cout << "fasta is a required parameter for the get.oturep command." << endl; abort = true; }
48 else if (fastafile == "not open") { abort = true; }
50 listfile = validParameter.validFile(parameters, "list", true);
51 if (listfile == "not found") { cout << "list is a required parameter for the get.oturep command." << endl; abort = true; }
52 else if (listfile == "not open") { abort = true; }
54 //check for optional parameter and set defaults
55 // ...at some point should added some additional type checking...
56 line = validParameter.validFile(parameters, "line", false);
57 if (line == "not found") { line = ""; }
59 if(line != "all") { splitAtDash(line, lines); allLines = 0; }
60 else { allLines = 1; }
63 label = validParameter.validFile(parameters, "label", false);
64 if (label == "not found") { label = ""; }
66 if(label != "all") { splitAtDash(label, labels); allLines = 0; }
67 else { allLines = 1; }
70 //make sure user did not use both the line and label parameters
71 if ((line != "") && (label != "")) { cout << "You cannot use both the line and label parameters at the same time. " << endl; abort = true; }
72 //if the user has not specified any line or labels use the ones from read.otu
73 else if ((line == "") && (label == "")) {
74 allLines = globaldata->allLines;
75 labels = globaldata->labels;
76 lines = globaldata->lines;
79 namesfile = validParameter.validFile(parameters, "name", true);
80 if (namesfile == "not open") { abort = true; }
81 else if (namesfile == "not found") { namesfile = ""; }
83 groupfile = validParameter.validFile(parameters, "group", true);
84 if (groupfile == "not open") { abort = true; }
85 else if (groupfile == "not found") { groupfile = ""; }
87 //read in group map info.
88 groupMap = new GroupMap(groupfile);
94 if(globaldata->gSparseMatrix != NULL) { matrix = globaldata->gSparseMatrix; }
96 //globaldata->gListVector bin 0 = first name read in distance matrix, globaldata->gListVector bin 1 = second name read in distance matrix
97 if(globaldata->gListVector != NULL) {
101 //map names to rows in sparsematrix
102 for (int i = 0; i < globaldata->gListVector->size(); i++) {
104 binnames = globaldata->gListVector->get(i);
106 splitAtComma(binnames, names);
108 for (int j = 0; j < names.size(); j++) {
109 nameToIndex[names[j]] = i;
113 }else { cout << "error, no listvector." << endl; }
115 // openInputFile(fastafile, in);
116 fasta = new FastaMap();
122 catch(exception& e) {
123 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function GetOTURepCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
127 cout << "An unknown error has occurred in the GetOTURepCommand class function GetOTURepCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
132 //**********************************************************************************************************************
134 void GetOTURepCommand::help(){
136 cout << "The get.oturep command can only be executed after a successful read.dist command." << "\n";
137 cout << "The get.oturep command parameters are list, fasta, name, group, line and label. The fasta and list parameters are required, and you may not use line and label at the same time." << "\n";
138 cout << "The line and label allow you to select what distance levels you would like a output files created for, and are separated by dashes." << "\n";
139 cout << "The get.oturep command should be in the following format: get.oturep(fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, line=yourLines, label=yourLabels)." << "\n";
140 cout << "Example get.oturep(fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups, line=1-3-5, name=amazon.names)." << "\n";
141 cout << "The default value for line and label are all lines in your inputfile." << "\n";
142 cout << "The get.oturep command outputs a .fastarep file for each distance you specify, selecting one OTU representative for each bin." << "\n";
143 cout << "If you provide a groupfile, then it also appends the names of the groups present in that bin." << "\n";
144 cout << "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile)." << "\n" << "\n";
146 catch(exception& e) {
147 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function help. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
151 cout << "An unknown error has occurred in the GetOTURepCommand class function help. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
156 //**********************************************************************************************************************
158 GetOTURepCommand::~GetOTURepCommand(){
159 if (abort == false) {
160 delete input; globaldata->ginput = NULL;
163 if (groupfile != "") {
164 delete groupMap; globaldata->gGroupmap = NULL;
169 //**********************************************************************************************************************
171 int GetOTURepCommand::execute(){
174 if (abort == true) { return 0; }
180 fasta->readFastaFile(fastafile);
184 //set format to list so input can get listvector
185 globaldata->setFormat("list");
187 //if user gave a namesfile then use it
188 if (namesfile != "") {
193 read = new ReadOTUFile(listfile);
194 read->read(&*globaldata);
196 input = globaldata->ginput;
197 list = globaldata->gListVector;
198 string lastLabel = list->getLabel();
200 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
201 set<string> processedLabels;
202 set<string> userLabels = labels;
203 set<int> userLines = lines;
206 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0) || (userLines.size() != 0))) {
208 if(allLines == 1 || lines.count(count) == 1 || labels.count(list->getLabel()) == 1){
209 cout << list->getLabel() << '\t' << count << endl;
210 error = process(list);
211 if (error == 1) { return 0; } //there is an error in hte input files, abort command
213 processedLabels.insert(list->getLabel());
214 userLabels.erase(list->getLabel());
215 userLines.erase(count);
218 if ((anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
220 list = input->getListVector(lastLabel);
222 cout << list->getLabel() << '\t' << count << endl;
223 error = process(list);
224 if (error == 1) { return 0; } //there is an error in hte input files, abort command
226 processedLabels.insert(list->getLabel());
227 userLabels.erase(list->getLabel());
231 lastLabel = list->getLabel();
234 list = input->getListVector();
238 //output error messages about any remaining user labels
239 bool needToRun = false;
240 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {
241 cout << "Your file does not include the label "<< *it;
242 if (processedLabels.count(lastLabel) != 1) {
243 cout << ". I will use " << lastLabel << "." << endl;
246 cout << ". Please refer to " << lastLabel << "." << endl;
250 //run last line if you need to
251 if (needToRun == true) {
253 list = input->getListVector(lastLabel);
255 cout << list->getLabel() << '\t' << count << endl;
256 error = process(list);
257 if (error == 1) { return 0; } //there is an error in hte input files, abort command
262 globaldata->gSparseMatrix = NULL;
263 globaldata->gListVector = NULL;
267 catch(exception& e) {
268 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
272 cout << "An unknown error has occurred in the GetOTURepCommand class function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
278 //**********************************************************************************************************************
279 void GetOTURepCommand::readNamesFile() {
281 vector<string> dupNames;
282 openInputFile(namesfile, inNames);
284 string name, names, sequence;
287 inNames >> name; //read from first column A
288 inNames >> names; //read from second column A,B,C,D
292 //parse names into vector
293 splitAtComma(names, dupNames);
295 //store names in fasta map
296 sequence = fasta->getSequence(name);
297 for (int i = 0; i < dupNames.size(); i++) {
298 fasta->push_back(dupNames[i], sequence);
306 catch(exception& e) {
307 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function readNamesFile. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
311 cout << "An unknown error has occurred in the GetOTURepCommand class function readNamesFile. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
315 //**********************************************************************************************************************
316 string GetOTURepCommand::FindRep(int bin, string& group, ListVector* thisList) {
318 vector<string> names;
319 map<string, float> sums;
320 map<int, string> binMap; //subset of namesToIndex - just member of this bin
324 map<string, string> groups;
325 map<string, string>::iterator groupIt;
327 binnames = thisList->get(bin);
329 //parse names into vector
330 splitAtComma(binnames, names);
332 //if you have a groupfile
333 if(groupfile != "") {
334 //find the groups that are in this bin
335 for (int i = 0; i < names.size(); i++) {
336 string groupName = groupMap->getGroup(names[i]);
337 if (groupName == "not found") {
338 cout << names[i] << " is missing from your group file. Please correct. " << endl;
341 groups[groupName] = groupName;
345 //turn the groups into a string
346 for(groupIt = groups.begin(); groupIt != groups.end(); groupIt++) { group += groupIt->first + "-"; }
349 group = group.substr(0, group.length()-1);
352 //if only 1 sequence in bin then that's the rep
353 if (names.size() == 1) { return names[0]; }
356 for (int i = 0; i < names.size(); i++) {
357 for (map<string, int>::iterator it = nameToIndex.begin(); it != nameToIndex.end(); it++) {
359 if (it->first == names[i]) {
360 binMap[it->second] = it->first;
362 //initialize sums map
363 sums[it->first] = 0.0;
369 //go through each cell in the sparsematrix
370 for(MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++){
371 //is this a distance between 2 members of this bin
372 map<int, string>::iterator it = binMap.find(currentCell->row);
373 map<int, string>::iterator it2 = binMap.find(currentCell->column);
375 //sum the distance of the sequences in the bin to eachother
376 if ((it != binMap.end()) && (it2 != binMap.end())) {
377 //this is a cell that repesents the distance between to of this bins members
378 sums[it->second] += currentCell->dist;
379 sums[it2->second] += currentCell->dist;
383 //smallest sum is the representative
384 for (map<string, float>::iterator it4 = sums.begin(); it4 != sums.end(); it4++) {
385 if (it4->second < min) {
387 minName = it4->first;
396 catch(exception& e) {
397 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function FindRep. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
401 cout << "An unknown error has occurred in the GetOTURepCommand class function FindRep. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
406 //**********************************************************************************************************************
407 int GetOTURepCommand::process(ListVector* processList) {
409 string nameRep, name, sequence;
412 string outputFileName = getRootName(listfile) + processList->getLabel() + ".rep.fasta";
413 openOutputFile(outputFileName, out);
415 //for each bin in the list vector
416 for (int i = 0; i < processList->size(); i++) {
418 nameRep = FindRep(i, groups, processList);
420 //print out name and sequence for that bin
421 sequence = fasta->getSequence(nameRep);
423 if (sequence != "not found") {
424 if (groupfile == "") {
425 nameRep = nameRep + "|" + toString(i+1);
426 out << ">" << nameRep << endl;
427 out << sequence << endl;
429 nameRep = nameRep + "|" + groups + "|" + toString(i+1);
430 out << ">" << nameRep << endl;
431 out << sequence << endl;
434 cout << nameRep << " is missing from your fasta or name file. Please correct. " << endl;
435 remove(outputFileName.c_str());
444 catch(exception& e) {
445 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function process. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
449 cout << "An unknown error has occurred in the GetOTURepCommand class function process. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
454 //**********************************************************************************************************************