5 * Created by Sarah Westcott on 4/6/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
10 #include "getoturepcommand.h"
12 //**********************************************************************************************************************
13 GetOTURepCommand::GetOTURepCommand(string option){
15 globaldata = GlobalData::getInstance();
21 //allow user to run help
22 if(option == "help") { help(); abort = true; }
25 //valid paramters for this command
26 string Array[] = {"fasta","list","line","label","name", "group"};
27 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
29 OptionParser parser(option);
30 map<string, string> parameters = parser.getParameters();
32 ValidParameters validParameter;
34 //check to make sure all parameters are valid for command
35 for (map<string, string>::iterator it = parameters.begin(); it != parameters.end(); it++) {
36 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
39 //make sure the user has already run the read.otu command
40 if ((globaldata->gSparseMatrix == NULL) || (globaldata->gListVector == NULL)) {
41 cout << "Before you use the get.oturep command, you first need to read in a distance matrix." << endl;
45 //check for required parameters
46 fastafile = validParameter.validFile(parameters, "fasta", true);
47 if (fastafile == "not found") { cout << "fasta is a required parameter for the get.oturep command." << endl; abort = true; }
48 else if (fastafile == "not open") { abort = true; }
50 listfile = validParameter.validFile(parameters, "list", true);
51 if (listfile == "not found") { cout << "list is a required parameter for the get.oturep command." << endl; abort = true; }
52 else if (listfile == "not open") { abort = true; }
54 //check for optional parameter and set defaults
55 // ...at some point should added some additional type checking...
56 line = validParameter.validFile(parameters, "line", false);
57 if (line == "not found") { line = ""; }
59 if(line != "all") { splitAtDash(line, lines); allLines = 0; }
60 else { allLines = 1; }
63 label = validParameter.validFile(parameters, "label", false);
64 if (label == "not found") { label = ""; }
66 if(label != "all") { splitAtDash(label, labels); allLines = 0; }
67 else { allLines = 1; }
70 //make sure user did not use both the line and label parameters
71 if ((line != "") && (label != "")) { cout << "You cannot use both the line and label parameters at the same time. " << endl; abort = true; }
72 //if the user has not specified any line or labels use the ones from read.otu
73 else if ((line == "") && (label == "")) {
74 allLines = globaldata->allLines;
75 labels = globaldata->labels;
76 lines = globaldata->lines;
79 namesfile = validParameter.validFile(parameters, "name", true);
80 if (namesfile == "not open") { abort = true; }
81 else if (namesfile == "not found") { namesfile = ""; }
83 groupfile = validParameter.validFile(parameters, "group", true);
84 if (groupfile == "not open") { abort = true; }
85 else if (groupfile == "not found") { groupfile = ""; }
87 //read in group map info.
88 groupMap = new GroupMap(groupfile);
94 if(globaldata->gSparseMatrix != NULL) { matrix = globaldata->gSparseMatrix; }
96 //globaldata->gListVector bin 0 = first name read in distance matrix, globaldata->gListVector bin 1 = second name read in distance matrix
97 if(globaldata->gListVector != NULL) {
101 //map names to rows in sparsematrix
102 for (int i = 0; i < globaldata->gListVector->size(); i++) {
104 binnames = globaldata->gListVector->get(i);
106 splitAtComma(binnames, names);
108 for (int j = 0; j < names.size(); j++) {
109 nameToIndex[names[j]] = i;
113 }else { cout << "error, no listvector." << endl; }
115 openInputFile(fastafile, in);
116 fasta = new FastaMap();
122 catch(exception& e) {
123 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function GetOTURepCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
127 cout << "An unknown error has occurred in the GetOTURepCommand class function GetOTURepCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
132 //**********************************************************************************************************************
134 void GetOTURepCommand::help(){
136 cout << "The get.oturep command can only be executed after a successful read.dist command." << "\n";
137 cout << "The get.oturep command parameters are list, fasta, name, group, line and label. The fasta and list parameters are required, and you may not use line and label at the same time." << "\n";
138 cout << "The line and label allow you to select what distance levels you would like a output files created for, and are separated by dashes." << "\n";
139 cout << "The get.oturep command should be in the following format: get.oturep(fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, line=yourLines, label=yourLabels)." << "\n";
140 cout << "Example get.oturep(fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups, line=1-3-5, name=amazon.names)." << "\n";
141 cout << "The default value for line and label are all lines in your inputfile." << "\n";
142 cout << "The get.oturep command outputs a .fastarep file for each distance you specify, selecting one OTU representative for each bin." << "\n";
143 cout << "If you provide a groupfile, then it also appends the names of the groups present in that bin." << "\n";
144 cout << "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile)." << "\n" << "\n";
146 catch(exception& e) {
147 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function help. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
151 cout << "An unknown error has occurred in the GetOTURepCommand class function help. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
156 //**********************************************************************************************************************
158 GetOTURepCommand::~GetOTURepCommand(){
162 if (groupfile != "") {
167 //**********************************************************************************************************************
169 int GetOTURepCommand::execute(){
172 if (abort == true) { return 0; }
178 fasta->readFastaFile(in);
180 //set format to list so input can get listvector
181 // globaldata->setFormat("list");
183 //if user gave a namesfile then use it
184 if (namesfile != "") {
189 read = new ReadOTUFile(listfile);
190 read->read(&*globaldata);
192 input = globaldata->ginput;
193 list = globaldata->gListVector;
194 ListVector* lastList = list;
196 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
197 set<string> processedLabels;
198 set<string> userLabels = labels;
199 set<int> userLines = lines;
202 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0) || (userLines.size() != 0))) {
204 if(allLines == 1 || lines.count(count) == 1 || labels.count(list->getLabel()) == 1){
205 cout << list->getLabel() << '\t' << count << endl;
206 error = process(list);
207 if (error == 1) { return 0; } //there is an error in hte input files, abort command
209 processedLabels.insert(list->getLabel());
210 userLabels.erase(list->getLabel());
211 userLines.erase(count);
214 if ((anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastList->getLabel()) != 1)) {
215 cout << lastList->getLabel() << '\t' << count << endl;
216 error = process(lastList);
217 if (error == 1) { return 0; } //there is an error in hte input files, abort command
219 processedLabels.insert(lastList->getLabel());
220 userLabels.erase(lastList->getLabel());
223 if (count != 1) { delete lastList; }
226 list = input->getListVector();
230 //output error messages about any remaining user labels
231 bool needToRun = false;
232 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {
233 cout << "Your file does not include the label "<< *it;
234 if (processedLabels.count(lastList->getLabel()) != 1) {
235 cout << ". I will use " << lastList->getLabel() << "." << endl;
238 cout << ". Please refer to " << lastList->getLabel() << "." << endl;
242 //run last line if you need to
243 if (needToRun == true) {
244 cout << lastList->getLabel() << '\t' << count << endl;
245 error = process(lastList);
246 if (error == 1) { return 0; } //there is an error in hte input files, abort command
251 globaldata->gSparseMatrix = NULL;
253 globaldata->gListVector = NULL;
257 catch(exception& e) {
258 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
262 cout << "An unknown error has occurred in the GetOTURepCommand class function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
268 //**********************************************************************************************************************
269 void GetOTURepCommand::readNamesFile() {
271 vector<string> dupNames;
272 openInputFile(namesfile, inNames);
274 string name, names, sequence;
277 inNames >> name; //read from first column A
278 inNames >> names; //read from second column A,B,C,D
282 //parse names into vector
283 splitAtComma(names, dupNames);
285 //store names in fasta map
286 sequence = fasta->getSequence(name);
287 for (int i = 0; i < dupNames.size(); i++) {
288 fasta->push_back(dupNames[i], sequence);
296 catch(exception& e) {
297 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function readNamesFile. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
301 cout << "An unknown error has occurred in the GetOTURepCommand class function readNamesFile. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
305 //**********************************************************************************************************************
306 string GetOTURepCommand::FindRep(int bin, string& group, ListVector* thisList) {
308 vector<string> names;
309 map<string, float> sums;
310 map<int, string> binMap; //subset of namesToIndex - just member of this bin
314 map<string, string> groups;
315 map<string, string>::iterator groupIt;
317 binnames = thisList->get(bin);
319 //parse names into vector
320 splitAtComma(binnames, names);
322 //if you have a groupfile
323 if(groupfile != "") {
324 //find the groups that are in this bin
325 for (int i = 0; i < names.size(); i++) {
326 string groupName = groupMap->getGroup(names[i]);
327 if (groupName == "not found") {
328 cout << names[i] << " is missing from your group file. Please correct. " << endl;
331 groups[groupName] = groupName;
335 //turn the groups into a string
336 for(groupIt = groups.begin(); groupIt != groups.end(); groupIt++) { group += groupIt->first + "-"; }
339 group = group.substr(0, group.length()-1);
342 //if only 1 sequence in bin then that's the rep
343 if (names.size() == 1) { return names[0]; }
346 for (int i = 0; i < names.size(); i++) {
347 for (map<string, int>::iterator it = nameToIndex.begin(); it != nameToIndex.end(); it++) {
349 if (it->first == names[i]) {
350 binMap[it->second] = it->first;
352 //initialize sums map
353 sums[it->first] = 0.0;
359 //go through each cell in the sparsematrix
360 for(MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++){
361 //is this a distance between 2 members of this bin
362 map<int, string>::iterator it = binMap.find(currentCell->row);
363 map<int, string>::iterator it2 = binMap.find(currentCell->column);
365 //sum the distance of the sequences in the bin to eachother
366 if ((it != binMap.end()) && (it2 != binMap.end())) {
367 //this is a cell that repesents the distance between to of this bins members
368 sums[it->second] += currentCell->dist;
369 sums[it2->second] += currentCell->dist;
373 //smallest sum is the representative
374 for (map<string, float>::iterator it4 = sums.begin(); it4 != sums.end(); it4++) {
375 if (it4->second < min) {
377 minName = it4->first;
386 catch(exception& e) {
387 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function FindRep. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
391 cout << "An unknown error has occurred in the GetOTURepCommand class function FindRep. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
396 //**********************************************************************************************************************
397 int GetOTURepCommand::process(ListVector* processList) {
399 string nameRep, name, sequence;
402 string outputFileName = getRootName(listfile) + processList->getLabel() + ".rep.fasta";
403 openOutputFile(outputFileName, out);
405 //for each bin in the list vector
406 for (int i = 0; i < processList->size(); i++) {
408 nameRep = FindRep(i, groups, processList);
410 //print out name and sequence for that bin
411 sequence = fasta->getSequence(nameRep);
413 if (sequence != "not found") {
414 if (groupfile == "") {
415 nameRep = nameRep + "|" + toString(i+1);
416 out << ">" << nameRep << endl;
417 out << sequence << endl;
419 nameRep = nameRep + "|" + groups + "|" + toString(i+1);
420 out << ">" << nameRep << endl;
421 out << sequence << endl;
424 cout << nameRep << " is missing from your fasta or name file. Please correct. " << endl;
425 remove(outputFileName.c_str());
434 catch(exception& e) {
435 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function process. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
439 cout << "An unknown error has occurred in the GetOTURepCommand class function process. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
444 //**********************************************************************************************************************