]> git.donarmstrong.com Git - mothur.git/blob - getoturepcommand.cpp
added smart distance feature and optimized all commands using line by line processing
[mothur.git] / getoturepcommand.cpp
1 /*
2  *  getoturepcommand.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 4/6/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "getoturepcommand.h"
11
12 //**********************************************************************************************************************
13 GetOTURepCommand::GetOTURepCommand(){
14         try{
15                 globaldata = GlobalData::getInstance();
16         
17                 if(globaldata->gSparseMatrix != NULL)   {       matrix = new SparseMatrix(*globaldata->gSparseMatrix);          }
18                 
19                 //listOfNames bin 0 = first name read in distance matrix, listOfNames bin 1 = second name read in distance matrix
20                 if(globaldata->gListVector != NULL)             {       
21                         listOfNames = new ListVector(*globaldata->gListVector); 
22                         
23                         vector<string> names;
24                         string binnames;
25                         //map names to rows in sparsematrix
26                         for (int i = 0; i < listOfNames->size(); i++) {
27                                 names.clear();
28                                 binnames = listOfNames->get(i);
29                                 splitAtComma(binnames, names);
30                                 
31                                 for (int j = 0; j < names.size(); j++) {
32                                         nameToIndex[names[j]] = i;
33                                 }
34                         }
35                 }else { cout << "error, no listvector." << endl; }
36
37                 
38                 fastafile = globaldata->getFastaFile();
39                 namesfile = globaldata->getNameFile();
40                 groupfile = globaldata->getGroupFile();
41                 
42                 if (groupfile != "") {
43                         //read in group map info.
44                         groupMap = new GroupMap(groupfile);
45                         groupMap->readMap();
46                 }
47
48                 openInputFile(fastafile, in);
49                 
50                 fasta = new FastaMap();
51
52         }
53         catch(exception& e) {
54                 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function GetOTURepCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
55                 exit(1);
56         }
57         catch(...) {
58                 cout << "An unknown error has occurred in the GetOTURepCommand class function GetOTURepCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
59                 exit(1);
60         }
61 }
62 //**********************************************************************************************************************
63
64 GetOTURepCommand::~GetOTURepCommand(){
65         delete input;
66         delete read;
67         delete fasta;
68         if (groupfile != "") {
69                 delete groupMap;
70         }
71 }
72
73 //**********************************************************************************************************************
74
75 int GetOTURepCommand::execute(){
76         try {
77                 int count = 1;
78                 int error;
79                 
80                 //read fastafile
81                 fasta->readFastaFile(in);
82                 
83                 //set format to list so input can get listvector
84                 globaldata->setFormat("list");
85                 
86                 //if user gave a namesfile then use it
87                 if (namesfile != "") {
88                         readNamesFile();
89                 }
90                 
91                 //read list file
92                 read = new ReadOTUFile(globaldata->getListFile());      
93                 read->read(&*globaldata); 
94                 
95                 input = globaldata->ginput;
96                 list = globaldata->gListVector;
97                 ListVector* lastList = list;
98                 
99                 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
100                 set<string> processedLabels;
101                 set<string> userLabels = globaldata->labels;
102
103                 
104                 while((list != NULL) && ((globaldata->allLines == 1) || (userLabels.size() != 0))) {
105                         
106                         if(globaldata->allLines == 1 || globaldata->lines.count(count) == 1 || globaldata->labels.count(list->getLabel()) == 1){
107                                         cout << list->getLabel() << '\t' << count << endl;
108                                         error = process(list);
109                                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
110                                         
111                                         processedLabels.insert(list->getLabel());
112                                         userLabels.erase(list->getLabel());
113                         }
114                         
115                         if ((anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastList->getLabel()) != 1)) {
116                                         cout << lastList->getLabel() << '\t' << count << endl;
117                                         error = process(lastList);
118                                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
119                                         
120                                         processedLabels.insert(lastList->getLabel());
121                                         userLabels.erase(lastList->getLabel());
122                         }
123                         
124                         if (count != 1) { delete lastList; }
125                         lastList = list;                        
126                         
127                         list = input->getListVector();
128                         count++;
129                 }
130                 
131                 //output error messages about any remaining user labels
132                 set<string>::iterator it;
133                 bool needToRun = false;
134                 for (it = userLabels.begin(); it != userLabels.end(); it++) {  
135                         cout << "Your file does not include the label "<< *it; 
136                         if (processedLabels.count(lastList->getLabel()) != 1) {
137                                 cout << ". I will use " << lastList->getLabel() << "." << endl;
138                                 needToRun = true;
139                         }else {
140                                 cout << ". Please refer to " << lastList->getLabel() << "." << endl;
141                         }
142                 }
143                 
144                 //run last line if you need to
145                 if (needToRun == true)  {
146                         cout << lastList->getLabel() << '\t' << count << endl;
147                         error = process(lastList);
148                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
149                 }
150                 delete lastList;
151                 
152                 delete matrix;
153                 globaldata->gSparseMatrix = NULL;
154                 delete list;
155                 globaldata->gListVector = NULL;
156
157                 return 0;
158         }
159         catch(exception& e) {
160                 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
161                 exit(1);
162         }
163         catch(...) {
164                 cout << "An unknown error has occurred in the GetOTURepCommand class function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
165                 exit(1);
166         }
167
168 }
169
170 //**********************************************************************************************************************
171 void GetOTURepCommand::readNamesFile() {
172         try {
173                 vector<string> dupNames;
174                 openInputFile(namesfile, inNames);
175                 
176                 string name, names, sequence;
177         
178                 while(inNames){
179                         inNames >> name;                        //read from first column  A
180                         inNames >> names;               //read from second column  A,B,C,D
181                         
182                         dupNames.clear();
183                         
184                         //parse names into vector
185                         splitAtComma(names, dupNames);
186                         
187                         //store names in fasta map
188                         sequence = fasta->getSequence(name);
189                         for (int i = 0; i < dupNames.size(); i++) {
190                                 fasta->push_back(dupNames[i], sequence);
191                         }
192                 
193                         gobble(inNames);
194                 }
195                 inNames.close();
196
197         }
198         catch(exception& e) {
199                 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function readNamesFile. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
200                 exit(1);
201         }
202         catch(...) {
203                 cout << "An unknown error has occurred in the GetOTURepCommand class function readNamesFile. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
204                 exit(1);
205         }       
206 }
207 //**********************************************************************************************************************
208 string GetOTURepCommand::FindRep(int bin, string& group, ListVector* thisList) {
209         try{
210                 vector<string> names;
211                 map<string, float> sums;
212                 map<string, float>::iterator it4;
213                 map<int, string> binMap; //subset of namesToIndex - just member of this bin
214                 string binnames;
215                 float min = 10000;
216                 string minName;
217                 map<string, string> groups;
218                 map<string, string>::iterator groupIt;
219                 
220                 binnames = thisList->get(bin);
221         
222                 //parse names into vector
223                 splitAtComma(binnames, names);
224                 
225                 //if you have a groupfile
226                 if(groupfile != "") {
227                         //find the groups that are in this bin
228                         for (int i = 0; i < names.size(); i++) {
229                                 string groupName = groupMap->getGroup(names[i]);
230                                 if (groupName == "not found") {  
231                                         cout << names[i] << " is missing from your group file. Please correct. " << endl;
232                                         groupError = true;
233                                 }else{
234                                         groups[groupName] = groupName;
235                                 }
236                         }
237                         
238                         //turn the groups into a string
239                         for(groupIt = groups.begin(); groupIt != groups.end(); groupIt++) { group += groupIt->first + "-"; }
240                         
241                         //rip off last dash
242                         group = group.substr(0, group.length()-1);
243                 }
244                 
245                 //if only 1 sequence in bin then that's the rep
246                 if (names.size() == 1) { return names[0]; }
247                 else {
248                         //fill binMap
249                         for (int i = 0; i < names.size(); i++) {
250                                 for (it3 = nameToIndex.begin(); it3 != nameToIndex.end(); it3++) {
251
252                                         if (it3->first == names[i]) {  
253                                                 binMap[it3->second] = it3->first;
254
255                                                 //initialize sums map
256                                                 sums[it3->first] = 0.0;
257                                                 break;
258                                         }
259                                 }
260                         }
261                         
262                         //go through each cell in the sparsematrix
263                         for(MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++){
264                                 //is this a distance between 2 members of this bin
265                                 it = binMap.find(currentCell->row);
266                                 it2 = binMap.find(currentCell->column);
267                                 
268                                 //sum the distance of the sequences in the bin to eachother
269                                 if ((it != binMap.end()) && (it2 != binMap.end())) {
270                                         //this is a cell that repesents the distance between to of this bins members
271                                         sums[it->second] += currentCell->dist;
272                                         sums[it2->second] += currentCell->dist;
273                                 }
274                         }
275                         
276                         //smallest sum is the representative
277                         for (it4 = sums.begin(); it4 != sums.end(); it4++) {
278                                 if (it4->second < min) {
279                                         min = it4->second;
280                                         minName = it4->first;
281                                 }
282
283                         }
284                         
285                         return minName;
286                 }
287         
288         }
289         catch(exception& e) {
290                 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function FindRep. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
291                 exit(1);
292         }
293         catch(...) {
294                 cout << "An unknown error has occurred in the GetOTURepCommand class function FindRep. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
295                 exit(1);
296         }       
297 }
298
299 //**********************************************************************************************************************
300 int GetOTURepCommand::process(ListVector* processList) {
301         try{
302                                 string nameRep, name, sequence;
303
304                                 //create output file
305                                 string outputFileName = getRootName(globaldata->getListFile()) + processList->getLabel() + ".rep.fasta";
306                                 openOutputFile(outputFileName, out);
307                                 
308                                 //for each bin in the list vector
309                                 for (int i = 0; i < processList->size(); i++) {
310                                         string groups;
311                                         nameRep = FindRep(i, groups, processList);
312                                         
313                                         //print out name and sequence for that bin
314                                         sequence = fasta->getSequence(nameRep);
315
316                                         if (sequence != "not found") {
317                                                 if (groupfile == "") {
318                                                         nameRep = nameRep + "|" + toString(i+1);
319                                                         out << ">" << nameRep << endl;
320                                                         out << sequence << endl;
321                                                 }else {
322                                                         nameRep = nameRep + "|" + groups + "|" + toString(i+1);
323                                                         out << ">" << nameRep << endl;
324                                                         out << sequence << endl;
325                                                 }
326                                         }else { 
327                                                 cout << nameRep << " is missing from your fasta or name file. Please correct. " << endl; 
328                                                 remove(outputFileName.c_str());
329                                                 return 1;
330                                         }
331                                 }
332                                 
333                                 out.close();
334                                 return 0;
335         
336         }
337         catch(exception& e) {
338                 cout << "Standard Error: " << e.what() << " has occurred in the GetOTURepCommand class Function process. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
339                 exit(1);
340         }
341         catch(...) {
342                 cout << "An unknown error has occurred in the GetOTURepCommand class function process. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
343                 exit(1);
344         }       
345 }
346
347 //**********************************************************************************************************************
348
349
350
351
352