5 * Created by Sarah Westcott on 4/6/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
10 #include "getoturepcommand.h"
12 //**********************************************************************************************************************
13 GetOTURepCommand::GetOTURepCommand(string option){
15 globaldata = GlobalData::getInstance();
21 //allow user to run help
22 if(option == "help") { help(); abort = true; }
25 //valid paramters for this command
26 string Array[] = {"fasta","list","line","label","name", "group"};
27 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
29 OptionParser parser(option);
30 map<string, string> parameters = parser.getParameters();
32 ValidParameters validParameter;
34 //check to make sure all parameters are valid for command
35 for (map<string, string>::iterator it = parameters.begin(); it != parameters.end(); it++) {
36 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
39 //make sure the user has already run the read.otu command
40 if ((globaldata->gSparseMatrix == NULL) || (globaldata->gListVector == NULL)) {
41 cout << "Before you use the get.oturep command, you first need to read in a distance matrix." << endl;
45 //check for required parameters
46 fastafile = validParameter.validFile(parameters, "fasta", true);
47 if (fastafile == "not found") { cout << "fasta is a required parameter for the get.oturep command." << endl; abort = true; }
48 else if (fastafile == "not open") { abort = true; }
50 listfile = validParameter.validFile(parameters, "list", true);
51 if (listfile == "not found") { cout << "list is a required parameter for the get.oturep command." << endl; abort = true; }
52 else if (listfile == "not open") { abort = true; }
54 //check for optional parameter and set defaults
55 // ...at some point should added some additional type checking...
56 line = validParameter.validFile(parameters, "line", false);
57 if (line == "not found") { line = ""; }
59 if(line != "all") { splitAtDash(line, lines); allLines = 0; }
60 else { allLines = 1; }
63 label = validParameter.validFile(parameters, "label", false);
64 if (label == "not found") { label = ""; }
66 if(label != "all") { splitAtDash(label, labels); allLines = 0; }
67 else { allLines = 1; }
70 //make sure user did not use both the line and label parameters
71 if ((line != "") && (label != "")) { cout << "You cannot use both the line and label parameters at the same time. " << endl; abort = true; }
72 //if the user has not specified any line or labels use the ones from read.otu
73 else if ((line == "") && (label == "")) {
74 allLines = globaldata->allLines;
75 labels = globaldata->labels;
76 lines = globaldata->lines;
79 namesfile = validParameter.validFile(parameters, "name", true);
80 if (namesfile == "not open") { abort = true; }
81 else if (namesfile == "not found") { namesfile = ""; }
83 groupfile = validParameter.validFile(parameters, "group", true);
84 if (groupfile == "not open") { abort = true; }
85 else if (groupfile == "not found") { groupfile = ""; }
87 //read in group map info.
88 groupMap = new GroupMap(groupfile);
94 if(globaldata->gSparseMatrix != NULL) { matrix = globaldata->gSparseMatrix; }
96 //globaldata->gListVector bin 0 = first name read in distance matrix, globaldata->gListVector bin 1 = second name read in distance matrix
97 if(globaldata->gListVector != NULL) {
101 //map names to rows in sparsematrix
102 for (int i = 0; i < globaldata->gListVector->size(); i++) {
104 binnames = globaldata->gListVector->get(i);
106 splitAtComma(binnames, names);
108 for (int j = 0; j < names.size(); j++) {
109 nameToIndex[names[j]] = i;
113 }else { cout << "error, no listvector." << endl; }
115 openInputFile(fastafile, in);
116 fasta = new FastaMap();
122 catch(exception& e) {
123 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function GetOTURepCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
127 cout << "An unknown error has occurred in the GetOTURepCommand class function GetOTURepCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
132 //**********************************************************************************************************************
134 void GetOTURepCommand::help(){
136 cout << "The get.oturep command can only be executed after a successful read.dist command." << "\n";
137 cout << "The get.oturep command parameters are list, fasta, name, group, line and label. The fasta and list parameters are required, and you may not use line and label at the same time." << "\n";
138 cout << "The line and label allow you to select what distance levels you would like a output files created for, and are separated by dashes." << "\n";
139 cout << "The get.oturep command should be in the following format: get.oturep(fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, line=yourLines, label=yourLabels)." << "\n";
140 cout << "Example get.oturep(fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups, line=1-3-5, name=amazon.names)." << "\n";
141 cout << "The default value for line and label are all lines in your inputfile." << "\n";
142 cout << "The get.oturep command outputs a .fastarep file for each distance you specify, selecting one OTU representative for each bin." << "\n";
143 cout << "If you provide a groupfile, then it also appends the names of the groups present in that bin." << "\n";
144 cout << "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile)." << "\n" << "\n";
146 catch(exception& e) {
147 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function help. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
151 cout << "An unknown error has occurred in the GetOTURepCommand class function help. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
156 //**********************************************************************************************************************
158 GetOTURepCommand::~GetOTURepCommand(){
159 if (abort == false) {
160 delete input; globaldata->ginput = NULL;
163 if (groupfile != "") {
164 delete groupMap; globaldata->gGroupmap = NULL;
169 //**********************************************************************************************************************
171 int GetOTURepCommand::execute(){
174 if (abort == true) { return 0; }
180 fasta->readFastaFile(in);
184 //set format to list so input can get listvector
185 globaldata->setFormat("list");
187 //if user gave a namesfile then use it
188 if (namesfile != "") {
193 read = new ReadOTUFile(listfile);
194 read->read(&*globaldata);
196 input = globaldata->ginput;
197 list = globaldata->gListVector;
198 ListVector* lastList = list;
200 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
201 set<string> processedLabels;
202 set<string> userLabels = labels;
203 set<int> userLines = lines;
206 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0) || (userLines.size() != 0))) {
208 if(allLines == 1 || lines.count(count) == 1 || labels.count(list->getLabel()) == 1){
209 cout << list->getLabel() << '\t' << count << endl;
210 error = process(list);
211 if (error == 1) { return 0; } //there is an error in hte input files, abort command
213 processedLabels.insert(list->getLabel());
214 userLabels.erase(list->getLabel());
215 userLines.erase(count);
218 if ((anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastList->getLabel()) != 1)) {
219 cout << lastList->getLabel() << '\t' << count << endl;
220 error = process(lastList);
221 if (error == 1) { return 0; } //there is an error in hte input files, abort command
223 processedLabels.insert(lastList->getLabel());
224 userLabels.erase(lastList->getLabel());
227 if (count != 1) { delete lastList; }
230 list = input->getListVector();
234 //output error messages about any remaining user labels
235 bool needToRun = false;
236 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {
237 cout << "Your file does not include the label "<< *it;
238 if (processedLabels.count(lastList->getLabel()) != 1) {
239 cout << ". I will use " << lastList->getLabel() << "." << endl;
242 cout << ". Please refer to " << lastList->getLabel() << "." << endl;
246 //run last line if you need to
247 if (needToRun == true) {
248 cout << lastList->getLabel() << '\t' << count << endl;
249 error = process(lastList);
250 if (error == 1) { return 0; } //there is an error in hte input files, abort command
255 globaldata->gSparseMatrix = NULL;
257 globaldata->gListVector = NULL;
261 catch(exception& e) {
262 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
266 cout << "An unknown error has occurred in the GetOTURepCommand class function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
272 //**********************************************************************************************************************
273 void GetOTURepCommand::readNamesFile() {
275 vector<string> dupNames;
276 openInputFile(namesfile, inNames);
278 string name, names, sequence;
281 inNames >> name; //read from first column A
282 inNames >> names; //read from second column A,B,C,D
286 //parse names into vector
287 splitAtComma(names, dupNames);
289 //store names in fasta map
290 sequence = fasta->getSequence(name);
291 for (int i = 0; i < dupNames.size(); i++) {
292 fasta->push_back(dupNames[i], sequence);
300 catch(exception& e) {
301 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function readNamesFile. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
305 cout << "An unknown error has occurred in the GetOTURepCommand class function readNamesFile. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
309 //**********************************************************************************************************************
310 string GetOTURepCommand::FindRep(int bin, string& group, ListVector* thisList) {
312 vector<string> names;
313 map<string, float> sums;
314 map<int, string> binMap; //subset of namesToIndex - just member of this bin
318 map<string, string> groups;
319 map<string, string>::iterator groupIt;
321 binnames = thisList->get(bin);
323 //parse names into vector
324 splitAtComma(binnames, names);
326 //if you have a groupfile
327 if(groupfile != "") {
328 //find the groups that are in this bin
329 for (int i = 0; i < names.size(); i++) {
330 string groupName = groupMap->getGroup(names[i]);
331 if (groupName == "not found") {
332 cout << names[i] << " is missing from your group file. Please correct. " << endl;
335 groups[groupName] = groupName;
339 //turn the groups into a string
340 for(groupIt = groups.begin(); groupIt != groups.end(); groupIt++) { group += groupIt->first + "-"; }
343 group = group.substr(0, group.length()-1);
346 //if only 1 sequence in bin then that's the rep
347 if (names.size() == 1) { return names[0]; }
350 for (int i = 0; i < names.size(); i++) {
351 for (map<string, int>::iterator it = nameToIndex.begin(); it != nameToIndex.end(); it++) {
353 if (it->first == names[i]) {
354 binMap[it->second] = it->first;
356 //initialize sums map
357 sums[it->first] = 0.0;
363 //go through each cell in the sparsematrix
364 for(MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++){
365 //is this a distance between 2 members of this bin
366 map<int, string>::iterator it = binMap.find(currentCell->row);
367 map<int, string>::iterator it2 = binMap.find(currentCell->column);
369 //sum the distance of the sequences in the bin to eachother
370 if ((it != binMap.end()) && (it2 != binMap.end())) {
371 //this is a cell that repesents the distance between to of this bins members
372 sums[it->second] += currentCell->dist;
373 sums[it2->second] += currentCell->dist;
377 //smallest sum is the representative
378 for (map<string, float>::iterator it4 = sums.begin(); it4 != sums.end(); it4++) {
379 if (it4->second < min) {
381 minName = it4->first;
390 catch(exception& e) {
391 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function FindRep. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
395 cout << "An unknown error has occurred in the GetOTURepCommand class function FindRep. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
400 //**********************************************************************************************************************
401 int GetOTURepCommand::process(ListVector* processList) {
403 string nameRep, name, sequence;
406 string outputFileName = getRootName(listfile) + processList->getLabel() + ".rep.fasta";
407 openOutputFile(outputFileName, out);
409 //for each bin in the list vector
410 for (int i = 0; i < processList->size(); i++) {
412 nameRep = FindRep(i, groups, processList);
414 //print out name and sequence for that bin
415 sequence = fasta->getSequence(nameRep);
417 if (sequence != "not found") {
418 if (groupfile == "") {
419 nameRep = nameRep + "|" + toString(i+1);
420 out << ">" << nameRep << endl;
421 out << sequence << endl;
423 nameRep = nameRep + "|" + groups + "|" + toString(i+1);
424 out << ">" << nameRep << endl;
425 out << sequence << endl;
428 cout << nameRep << " is missing from your fasta or name file. Please correct. " << endl;
429 remove(outputFileName.c_str());
438 catch(exception& e) {
439 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function process. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
443 cout << "An unknown error has occurred in the GetOTURepCommand class function process. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
448 //**********************************************************************************************************************