5 * Created by Sarah Westcott on 4/6/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
10 #include "getoturepcommand.h"
12 //********************************************************************************************************************
13 //sorts lowest to highest
14 inline bool compareName(repStruct left, repStruct right){
15 return (left.name < right.name);
17 //********************************************************************************************************************
18 //sorts lowest to highest
19 inline bool compareBin(repStruct left, repStruct right){
20 return (left.bin < right.bin);
22 //********************************************************************************************************************
23 //sorts lowest to highest
24 inline bool compareSize(repStruct left, repStruct right){
25 return (left.size < right.size);
27 //********************************************************************************************************************
28 //sorts lowest to highest
29 inline bool compareGroup(repStruct left, repStruct right){
30 return (left.group < right.group);
32 //**********************************************************************************************************************
33 GetOTURepCommand::GetOTURepCommand(string option){
35 globaldata = GlobalData::getInstance();
40 //allow user to run help
41 if (option == "help") {
44 //valid paramters for this command
45 string Array[] = {"fasta","list","label","name", "group", "sorted", "phylip","column"};
46 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
48 OptionParser parser(option);
49 map<string, string> parameters = parser.getParameters();
51 ValidParameters validParameter;
53 //check to make sure all parameters are valid for command
54 for (map<string, string>::iterator it = parameters.begin(); it != parameters.end(); it++) {
55 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
58 //make sure the user has already run the read.otu command
59 if ((globaldata->gSparseMatrix == NULL) || (globaldata->gListVector == NULL)) {
60 mothurOut("Before you use the get.oturep command, you first need to read in a distance matrix."); mothurOutEndLine();
64 //check for required parameters
65 fastafile = validParameter.validFile(parameters, "fasta", true);
66 if (fastafile == "not found") { mothurOut("fasta is a required parameter for the get.oturep command."); mothurOutEndLine(); abort = true; }
67 else if (fastafile == "not open") { abort = true; }
69 listfile = validParameter.validFile(parameters, "list", true);
70 if (listfile == "not found") { mothurOut("list is a required parameter for the get.oturep command."); mothurOutEndLine(); abort = true; }
71 else if (listfile == "not open") { abort = true; }
73 phylipfile = validParameter.validFile(parameters, "phylip", true);
74 if (phylipfile == "not found") { phylipfile = ""; }
75 else if (phylipfile == "not open") { abort = true; }
77 columnfile = validParameter.validFile(parameters, "column", true);
78 if (columnfile == "not found") { columnfile = ""; }
79 else if (columnfile == "not open") { abort = true; }
81 if ((phylipfile == "") && (columnfile == "")) { mothurOut("When executing a get.oturep command you must enter a phylip or a column."); mothurOutEndLine(); abort = true; }
82 else if ((phylipfile != "") && (columnfile != "")) { mothurOut("When executing a get.oturep command you must enter ONLY ONE of the following: phylip or column."); mothurOutEndLine(); abort = true; }
84 if (columnfile != "") { if (namefile == "") { cout << "You need to provide a namefile if you are going to use the column format." << endl; abort = true; } }
86 //check for optional parameter and set defaults
87 // ...at some point should added some additional type checking...
88 label = validParameter.validFile(parameters, "label", false);
89 if (label == "not found") { label = ""; }
91 if(label != "all") { splitAtDash(label, labels); allLines = 0; }
92 else { allLines = 1; }
95 //if the user has not specified any labels use the ones from read.otu
97 allLines = globaldata->allLines;
98 labels = globaldata->labels;
101 namesfile = validParameter.validFile(parameters, "name", true);
102 if (namesfile == "not open") { abort = true; }
103 else if (namesfile == "not found") { namesfile = ""; }
105 groupfile = validParameter.validFile(parameters, "group", true);
106 if (groupfile == "not open") { groupfile = ""; abort = true; }
107 else if (groupfile == "not found") { groupfile = ""; }
109 //read in group map info.
110 groupMap = new GroupMap(groupfile);
114 sorted = validParameter.validFile(parameters, "sorted", false); if (sorted == "not found"){ sorted = ""; }
115 if ((sorted != "") && (sorted != "name") && (sorted != "bin") && (sorted != "size") && (sorted != "group")) {
116 mothurOut(sorted + " is not a valid option for the sorted parameter. The only options are: name, bin, size and group. I will not sort."); mothurOutEndLine();
120 if ((sorted == "group") && (groupfile == "")) {
121 mothurOut("You must provide a groupfile to sort by group. I will not sort."); mothurOutEndLine();
125 if (abort == false) {
127 //globaldata->gListVector bin 0 = first name read in distance matrix, globaldata->gListVector bin 1 = second name read in distance matrix
128 if (globaldata->gListVector != NULL) {
129 vector<string> names;
131 //map names to rows in sparsematrix
132 for (int i = 0; i < globaldata->gListVector->size(); i++) {
134 binnames = globaldata->gListVector->get(i);
136 splitAtComma(binnames, names);
138 for (int j = 0; j < names.size(); j++) {
139 nameToIndex[names[j]] = i;
142 } else { mothurOut("error, no listvector."); mothurOutEndLine(); }
144 fasta = new FastaMap();
148 catch(exception& e) {
149 errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
154 //**********************************************************************************************************************
156 void GetOTURepCommand::help(){
158 mothurOut("The get.oturep command can only be executed after a successful read.dist command.\n");
159 mothurOut("The get.oturep command parameters are list, fasta, name, group, sorted and label. The fasta and list parameters are required.\n");
160 mothurOut("The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n");
161 mothurOut("The get.oturep command should be in the following format: get.oturep(fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n");
162 mothurOut("Example get.oturep(fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups, name=amazon.names).\n");
163 mothurOut("The default value for label is all labels in your inputfile.\n");
164 mothurOut("The sorted parameter allows you to indicate you want the output sorted. You can sort by sequence name, bin number, bin size or group. The default is no sorting, but your options are name, number, size, or group.\n");
165 mothurOut("The get.oturep command outputs a .fastarep file for each distance you specify, selecting one OTU representative for each bin.\n");
166 mothurOut("If you provide a groupfile, then it also appends the names of the groups present in that bin.\n");
167 mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
169 catch(exception& e) {
170 errorOut(e, "GetOTURepCommand", "help");
175 //**********************************************************************************************************************
177 GetOTURepCommand::~GetOTURepCommand(){
178 if (abort == false) {
179 delete input; globaldata->ginput = NULL;
182 if (groupfile != "") {
183 delete groupMap; globaldata->gGroupmap = NULL;
188 //**********************************************************************************************************************
190 int GetOTURepCommand::execute(){
193 if (abort == true) { return 0; }
197 fasta->readFastaFile(fastafile);
200 //read distance file and generate indexed distance file that can be used by findrep function
201 //....new reading class....//
203 //if user gave a namesfile then use it
204 if (namesfile != "") { readNamesFile(); }
206 //set format to list so input can get listvector
207 globaldata->setFormat("list");
210 read = new ReadOTUFile(listfile);
211 read->read(&*globaldata);
213 input = globaldata->ginput;
214 list = globaldata->gListVector;
215 string lastLabel = list->getLabel();
217 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
218 set<string> processedLabels;
219 set<string> userLabels = labels;
221 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
223 if (allLines == 1 || labels.count(list->getLabel()) == 1){
224 mothurOut(list->getLabel() + "\t" + toString(list->size())); mothurOutEndLine();
225 error = process(list);
226 if (error == 1) { return 0; } //there is an error in hte input files, abort command
228 processedLabels.insert(list->getLabel());
229 userLabels.erase(list->getLabel());
232 if ((anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
233 string saveLabel = list->getLabel();
236 list = input->getListVector(lastLabel);
237 mothurOut(list->getLabel() + "\t" + toString(list->size())); mothurOutEndLine();
238 error = process(list);
239 if (error == 1) { return 0; } //there is an error in hte input files, abort command
241 processedLabels.insert(list->getLabel());
242 userLabels.erase(list->getLabel());
244 //restore real lastlabel to save below
245 list->setLabel(saveLabel);
248 lastLabel = list->getLabel();
251 list = input->getListVector();
254 //output error messages about any remaining user labels
255 bool needToRun = false;
256 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {
257 mothurOut("Your file does not include the label " + *it);
258 if (processedLabels.count(list->getLabel()) != 1) {
259 mothurOut(". I will use " + lastLabel + "."); mothurOutEndLine();
262 mothurOut(". Please refer to " + lastLabel + "."); mothurOutEndLine();
266 //run last label if you need to
267 if (needToRun == true) {
268 if (list != NULL) { delete list; }
269 list = input->getListVector(lastLabel);
270 mothurOut(list->getLabel() + "\t" + toString(list->size())); mothurOutEndLine();
271 error = process(list);
273 if (error == 1) { return 0; } //there is an error in hte input files, abort command
277 globaldata->gListVector = NULL;
281 catch(exception& e) {
282 errorOut(e, "GetOTURepCommand", "execute");
287 //**********************************************************************************************************************
288 void GetOTURepCommand::readNamesFile() {
290 vector<string> dupNames;
291 openInputFile(namesfile, inNames);
293 string name, names, sequence;
296 inNames >> name; //read from first column A
297 inNames >> names; //read from second column A,B,C,D
301 //parse names into vector
302 splitAtComma(names, dupNames);
304 //store names in fasta map
305 sequence = fasta->getSequence(name);
306 for (int i = 0; i < dupNames.size(); i++) {
307 fasta->push_back(dupNames[i], sequence);
315 catch(exception& e) {
316 errorOut(e, "GetOTURepCommand", "readNamesFile");
320 //**********************************************************************************************************************
321 string GetOTURepCommand::findRep(int bin, string& group, ListVector* thisList, int& binsize) {
323 vector<string> names;
324 map<string, string> groups;
325 map<string, string>::iterator groupIt;
327 //parse names into vector
328 string binnames = thisList->get(bin);
329 splitAtComma(binnames, names);
330 binsize = names.size();
332 //if you have a groupfile
333 if (groupfile != "") {
334 //find the groups that are in this bin
335 for (size_t i = 0; i < names.size(); i++) {
336 string groupName = groupMap->getGroup(names[i]);
337 if (groupName == "not found") {
338 mothurOut(names[i] + " is missing from your group file. Please correct. "); mothurOutEndLine();
341 groups[groupName] = groupName;
345 //turn the groups into a string
346 for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) {
347 group += groupIt->first + "-";
350 group = group.substr(0, group.length()-1);
353 // if only 1 sequence in bin or processing the "unique" label, then
354 // the first sequence of the OTU is the representative one
355 if ((names.size() == 1) || (list->getLabel() == "unique")) {
358 vector<int> seqIndex(names.size());
359 vector<float> max_dist(names.size());
361 //fill seqIndex and initialize sums
362 for (size_t i = 0; i < names.size(); i++) {
363 seqIndex[i] = nameToIndex[names[i]];
367 // loop through all entries in seqIndex
370 for (size_t i=0; i < seqIndex.size(); i++) {
371 //currMap = seqVec[seqIndex[i]];
372 for (size_t j=0; j < seqIndex.size(); j++) {
373 it = currMap.find(seqIndex[j]);
374 if (it != currMap.end()) {
375 max_dist[i] = max(max_dist[i], it->second);
376 max_dist[j] = max(max_dist[j], it->second);
381 // sequence with the smallest maximum distance is the representative
384 for (size_t i=0; i < max_dist.size(); i++) {
385 if (max_dist[i] < min) {
391 return(names[minIndex]);
394 catch(exception& e) {
395 errorOut(e, "GetOTURepCommand", "FindRep");
400 //**********************************************************************************************************************
401 int GetOTURepCommand::process(ListVector* processList) {
403 string nameRep, name, sequence;
406 string outputFileName = getRootName(listfile) + processList->getLabel() + ".rep.fasta";
407 openOutputFile(outputFileName, out);
408 vector<repStruct> reps;
410 //for each bin in the list vector
411 for (int i = 0; i < processList->size(); i++) {
414 nameRep = findRep(i, groups, processList, binsize);
416 //print out name and sequence for that bin
417 sequence = fasta->getSequence(nameRep);
419 if (sequence != "not found") {
420 if (sorted == "") { //print them out
421 nameRep = nameRep + "|" + toString(i+1);
422 nameRep = nameRep + "|" + toString(binsize);
423 if (groupfile != "") {
424 nameRep = nameRep + "|" + groups;
426 out << ">" << nameRep << endl;
427 out << sequence << endl;
429 repStruct newRep(nameRep, i+1, binsize, groups);
430 reps.push_back(newRep);
433 mothurOut(nameRep + " is missing from your fasta or name file. Please correct. "); mothurOutEndLine();
434 remove(outputFileName.c_str());
439 if (sorted != "") { //then sort them and print them
440 if (sorted == "name") { sort(reps.begin(), reps.end(), compareName); }
441 else if (sorted == "bin") { sort(reps.begin(), reps.end(), compareBin); }
442 else if (sorted == "size") { sort(reps.begin(), reps.end(), compareSize); }
443 else if (sorted == "group") { sort(reps.begin(), reps.end(), compareGroup); }
446 for (int i = 0; i < reps.size(); i++) {
447 string sequence = fasta->getSequence(reps[i].name);
448 string outputName = reps[i].name + "|" + toString(reps[i].bin);
449 outputName = outputName + "|" + toString(reps[i].size);
450 if (groupfile != "") {
451 outputName = outputName + "|" + reps[i].group;
453 out << ">" << outputName << endl;
454 out << sequence << endl;
462 catch(exception& e) {
463 errorOut(e, "GetOTURepCommand", "process");
468 //**********************************************************************************************************************