5 * Created by Sarah Westcott on 4/6/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
10 #include "getoturepcommand.h"
12 //**********************************************************************************************************************
13 GetOTURepCommand::GetOTURepCommand(){
15 globaldata = GlobalData::getInstance();
17 if(globaldata->gSparseMatrix != NULL) { matrix = new SparseMatrix(*globaldata->gSparseMatrix); }
19 //listOfNames bin 0 = first name read in distance matrix, listOfNames bin 1 = second name read in distance matrix
20 if(globaldata->gListVector != NULL) {
21 listOfNames = new ListVector(*globaldata->gListVector);
25 //map names to rows in sparsematrix
26 for (int i = 0; i < listOfNames->size(); i++) {
28 binnames = listOfNames->get(i);
29 splitAtComma(binnames, names);
31 for (int j = 0; j < names.size(); j++) {
32 nameToIndex[names[j]] = i;
35 }else { cout << "error, no listvector." << endl; }
38 fastafile = globaldata->getFastaFile();
39 namesfile = globaldata->getNameFile();
40 groupfile = globaldata->getGroupFile();
42 if (groupfile != "") {
43 //read in group map info.
44 groupMap = new GroupMap(groupfile);
48 openInputFile(fastafile, in);
50 fasta = new FastaMap();
54 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function GetOTURepCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
58 cout << "An unknown error has occurred in the GetOTURepCommand class function GetOTURepCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
62 //**********************************************************************************************************************
64 GetOTURepCommand::~GetOTURepCommand(){
68 if (groupfile != "") {
73 //**********************************************************************************************************************
75 int GetOTURepCommand::execute(){
81 fasta->readFastaFile(in);
83 //set format to list so input can get listvector
84 globaldata->setFormat("list");
86 //if user gave a namesfile then use it
87 if (namesfile != "") {
92 read = new ReadOTUFile(globaldata->getListFile());
93 read->read(&*globaldata);
95 input = globaldata->ginput;
96 list = globaldata->gListVector;
97 ListVector* lastList = list;
99 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
100 set<string> processedLabels;
101 set<string> userLabels = globaldata->labels;
102 set<int> userLines = globaldata->lines;
105 while((list != NULL) && ((globaldata->allLines == 1) || (userLabels.size() != 0) || (userLines.size() != 0))) {
107 if(globaldata->allLines == 1 || globaldata->lines.count(count) == 1 || globaldata->labels.count(list->getLabel()) == 1){
108 cout << list->getLabel() << '\t' << count << endl;
109 error = process(list);
110 if (error == 1) { return 0; } //there is an error in hte input files, abort command
112 processedLabels.insert(list->getLabel());
113 userLabels.erase(list->getLabel());
114 userLines.erase(count);
117 if ((anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastList->getLabel()) != 1)) {
118 cout << lastList->getLabel() << '\t' << count << endl;
119 error = process(lastList);
120 if (error == 1) { return 0; } //there is an error in hte input files, abort command
122 processedLabels.insert(lastList->getLabel());
123 userLabels.erase(lastList->getLabel());
126 if (count != 1) { delete lastList; }
129 list = input->getListVector();
133 //output error messages about any remaining user labels
134 set<string>::iterator it;
135 bool needToRun = false;
136 for (it = userLabels.begin(); it != userLabels.end(); it++) {
137 cout << "Your file does not include the label "<< *it;
138 if (processedLabels.count(lastList->getLabel()) != 1) {
139 cout << ". I will use " << lastList->getLabel() << "." << endl;
142 cout << ". Please refer to " << lastList->getLabel() << "." << endl;
146 //run last line if you need to
147 if (needToRun == true) {
148 cout << lastList->getLabel() << '\t' << count << endl;
149 error = process(lastList);
150 if (error == 1) { return 0; } //there is an error in hte input files, abort command
155 globaldata->gSparseMatrix = NULL;
157 globaldata->gListVector = NULL;
161 catch(exception& e) {
162 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
166 cout << "An unknown error has occurred in the GetOTURepCommand class function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
172 //**********************************************************************************************************************
173 void GetOTURepCommand::readNamesFile() {
175 vector<string> dupNames;
176 openInputFile(namesfile, inNames);
178 string name, names, sequence;
181 inNames >> name; //read from first column A
182 inNames >> names; //read from second column A,B,C,D
186 //parse names into vector
187 splitAtComma(names, dupNames);
189 //store names in fasta map
190 sequence = fasta->getSequence(name);
191 for (int i = 0; i < dupNames.size(); i++) {
192 fasta->push_back(dupNames[i], sequence);
200 catch(exception& e) {
201 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function readNamesFile. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
205 cout << "An unknown error has occurred in the GetOTURepCommand class function readNamesFile. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
209 //**********************************************************************************************************************
210 string GetOTURepCommand::FindRep(int bin, string& group, ListVector* thisList) {
212 vector<string> names;
213 map<string, float> sums;
214 map<string, float>::iterator it4;
215 map<int, string> binMap; //subset of namesToIndex - just member of this bin
219 map<string, string> groups;
220 map<string, string>::iterator groupIt;
222 binnames = thisList->get(bin);
224 //parse names into vector
225 splitAtComma(binnames, names);
227 //if you have a groupfile
228 if(groupfile != "") {
229 //find the groups that are in this bin
230 for (int i = 0; i < names.size(); i++) {
231 string groupName = groupMap->getGroup(names[i]);
232 if (groupName == "not found") {
233 cout << names[i] << " is missing from your group file. Please correct. " << endl;
236 groups[groupName] = groupName;
240 //turn the groups into a string
241 for(groupIt = groups.begin(); groupIt != groups.end(); groupIt++) { group += groupIt->first + "-"; }
244 group = group.substr(0, group.length()-1);
247 //if only 1 sequence in bin then that's the rep
248 if (names.size() == 1) { return names[0]; }
251 for (int i = 0; i < names.size(); i++) {
252 for (it3 = nameToIndex.begin(); it3 != nameToIndex.end(); it3++) {
254 if (it3->first == names[i]) {
255 binMap[it3->second] = it3->first;
257 //initialize sums map
258 sums[it3->first] = 0.0;
264 //go through each cell in the sparsematrix
265 for(MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++){
266 //is this a distance between 2 members of this bin
267 it = binMap.find(currentCell->row);
268 it2 = binMap.find(currentCell->column);
270 //sum the distance of the sequences in the bin to eachother
271 if ((it != binMap.end()) && (it2 != binMap.end())) {
272 //this is a cell that repesents the distance between to of this bins members
273 sums[it->second] += currentCell->dist;
274 sums[it2->second] += currentCell->dist;
278 //smallest sum is the representative
279 for (it4 = sums.begin(); it4 != sums.end(); it4++) {
280 if (it4->second < min) {
282 minName = it4->first;
291 catch(exception& e) {
292 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function FindRep. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
296 cout << "An unknown error has occurred in the GetOTURepCommand class function FindRep. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
301 //**********************************************************************************************************************
302 int GetOTURepCommand::process(ListVector* processList) {
304 string nameRep, name, sequence;
307 string outputFileName = getRootName(globaldata->getListFile()) + processList->getLabel() + ".rep.fasta";
308 openOutputFile(outputFileName, out);
310 //for each bin in the list vector
311 for (int i = 0; i < processList->size(); i++) {
313 nameRep = FindRep(i, groups, processList);
315 //print out name and sequence for that bin
316 sequence = fasta->getSequence(nameRep);
318 if (sequence != "not found") {
319 if (groupfile == "") {
320 nameRep = nameRep + "|" + toString(i+1);
321 out << ">" << nameRep << endl;
322 out << sequence << endl;
324 nameRep = nameRep + "|" + groups + "|" + toString(i+1);
325 out << ">" << nameRep << endl;
326 out << sequence << endl;
329 cout << nameRep << " is missing from your fasta or name file. Please correct. " << endl;
330 remove(outputFileName.c_str());
339 catch(exception& e) {
340 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function process. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
344 cout << "An unknown error has occurred in the GetOTURepCommand class function process. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
349 //**********************************************************************************************************************