]> git.donarmstrong.com Git - mothur.git/blob - readtree.cpp
fixed bug in cluster.split with classify method
[mothur.git] / readtree.cpp
1 /*
2  *  readtree.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 1/22/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "readtree.h"
11
12 /***********************************************************************/
13 ReadTree::ReadTree() {
14         try {
15                 globaldata = GlobalData::getInstance();
16                 m = MothurOut::getInstance();
17                 globaldata->gTree.clear();
18         }
19         catch(exception& e) {
20                 m->errorOut(e, "ReadTree", "ReadTree");
21                 exit(1);
22         }
23 }
24 /***********************************************************************/
25 int ReadTree::readSpecialChar(istream& f, char c, string name) {
26     try {
27         
28                 gobble(f);
29                 char d = f.get();
30         
31                 if(d == EOF){
32                         m->mothurOut("Error: Input file ends prematurely, expecting a " + name + "\n");
33                         exit(1);
34                 }
35                 if(d != c){
36                         m->mothurOut("Error: Expected " + name + " in input file.  Found " + toString(d) + ".\n");
37                         exit(1);
38                 }
39                 if(d == ')' && f.peek() == '\n'){
40                         gobble(f);
41                 }       
42                 return d;
43         }
44         catch(exception& e) {
45                 m->errorOut(e, "ReadTree", "readSpecialChar");
46                 exit(1);
47         }
48 }
49 /**************************************************************************************************/
50
51 int ReadTree::readNodeChar(istream& f) {
52         try {
53 //              while(isspace(d=f.get()))               {;}
54                 gobble(f);
55                 char d = f.get();
56
57                 if(d == EOF){
58                         m->mothurOut("Error: Input file ends prematurely, expecting a left parenthesis\n");
59                         exit(1);
60                 }
61                 return d;
62         }
63         catch(exception& e) {
64                 m->errorOut(e, "ReadTree", "readNodeChar");
65                 exit(1);
66         }
67 }
68
69 /**************************************************************************************************/
70
71 float ReadTree::readBranchLength(istream& f) {
72     try {
73                 float b;
74         
75                 if(!(f >> b)){
76                         m->mothurOut("Error: Missing branch length in input tree.\n");
77                         exit(1);
78                 }
79                 gobble(f);
80                 return b;
81         }
82         catch(exception& e) {
83                 m->errorOut(e, "ReadTree", "readBranchLength");
84                 exit(1);
85         }
86 }
87
88 /***********************************************************************/
89 /***********************************************************************/
90
91 //Child Classes Below
92
93 /***********************************************************************/
94 /***********************************************************************/
95 //This class reads a file in Newick form and stores it in a tree.
96
97 int ReadNewickTree::read() {
98         try {
99                 holder = "";
100                 int c, error;
101                 int comment = 0;
102                 
103                 //if you are not a nexus file 
104                 if ((c = filehandle.peek()) != '#') {  
105                         while((c = filehandle.peek()) != EOF) { 
106                                 while ((c = filehandle.peek()) != EOF) {
107                                         // get past comments
108                                         if(c == '[') {
109                                                 comment = 1;
110                                         }
111                                         if(c == ']'){
112                                                 comment = 0;
113                                         }
114                                         if((c == '(') && (comment != 1)){ break; }
115                                         filehandle.get();
116                                 }
117
118                                 //make new tree
119                                 T = new Tree(); 
120
121                                 numNodes = T->getNumNodes();
122                                 numLeaves = T->getNumLeaves();
123                                 
124                                 error = readTreeString(); 
125                                 
126                                 //save trees for later commands
127                                 globaldata->gTree.push_back(T); 
128                                 gobble(filehandle);
129                         }
130                 //if you are a nexus file
131                 }else if ((c = filehandle.peek()) == '#') {
132                         nexusTranslation();  //reads file through the translation and updates treemap
133                         while((c = filehandle.peek()) != EOF) { 
134                                 // get past comments
135                                 while ((c = filehandle.peek()) != EOF) {        
136                                         if(holder == "[" || holder == "[!"){
137                                                 comment = 1;
138                                         }
139                                         if(holder == "]"){
140                                                 comment = 0;
141                                         }
142                                         if((holder == "tree" || holder == "end;") && comment != 1){ holder = ""; comment = 0; break;}
143                                         filehandle >> holder;
144                                 }
145                         
146                                 //pass over the "tree rep.6878900 = "
147                                 while (((c = filehandle.get()) != '(') && ((c = filehandle.peek()) != EOF) ) {;}
148                                         
149                                 if (c == EOF ) { break; }
150                                 filehandle.putback(c);  //put back first ( of tree.
151                                 
152                                 //make new tree
153                                 T = new Tree(); 
154                                 numNodes = T->getNumNodes();
155                                 numLeaves = T->getNumLeaves();
156                                 
157                                 //read tree info
158                                 error = readTreeString(); 
159                                  
160                                 //save trees for later commands
161                                 globaldata->gTree.push_back(T); 
162                         }
163                 }
164                 
165                 if (error != 0) { readOk = error; } 
166                 
167                 filehandle.close();
168                 return readOk;
169         }
170         catch(exception& e) {
171                 m->errorOut(e, "ReadNewickTree", "read");
172                 exit(1);
173         }
174 }
175 /**************************************************************************************************/
//This function reads the file through the translation of the sequence names and updates treemap.
177 void ReadNewickTree::nexusTranslation() {
178         try {
179                 
180                 holder = "";
181                 int numSeqs = globaldata->gTreemap->getNumSeqs(); //must save this some when we clear old names we can still know how many sequences there were
182                 int comment = 0;
183                 
184                 // get past comments
185                 while(holder != "translate" && holder != "Translate"){  
186                         if(holder == "[" || holder == "[!"){
187                                 comment = 1;
188                         }
189                         if(holder == "]"){
190                                 comment = 0;
191                         }
192                         filehandle >> holder; 
193                         if(holder == "tree" && comment != 1){return;}
194                 }
195                 
196                 //update treemap
197                 globaldata->gTreemap->namesOfSeqs.clear();
198                 for(int i=0;i<numSeqs;i++){
199                         string number, name;
200                         filehandle >> number;
201                         filehandle >> name;
202                         name.erase(name.end()-1);  //erase the comma
203                         //insert new one with new name
204                         globaldata->gTreemap->treemap[toString(number)].groupname = globaldata->gTreemap->treemap[name].groupname;
205                         globaldata->gTreemap->treemap[toString(number)].vectorIndex = globaldata->gTreemap->treemap[name].vectorIndex;
206                         //erase old one.  so treemap[sarah].groupnumber is now treemap[1].groupnumber. if number is 1 and name is sarah.
207                         globaldata->gTreemap->treemap.erase(name);
208                         globaldata->gTreemap->namesOfSeqs.push_back(number);
209                 }
210         }
211         catch(exception& e) {
212                 m->errorOut(e, "ReadNewickTree", "nexusTranslation");
213                 exit(1);
214         }
215 }
216
217 /**************************************************************************************************/
218 int ReadNewickTree::readTreeString() {
219         try {
220                 
221                 int n = 0;
222                 int lc, rc; 
223                 
224                 int rooted = 0;
225         
226                 int ch = filehandle.peek();     
227                 
228                 if(ch == '('){
229                         n = numLeaves;  //number of leaves / sequences, we want node 1 to start where the leaves left off
230
231                         lc = readNewickInt(filehandle, n, T);
232                         if (lc == -1) { m->mothurOut("error with lc"); m->mothurOutEndLine(); return -1; } //reports an error in reading
233         
234                         if(filehandle.peek()==','){                                                     
235                                 readSpecialChar(filehandle,',',"comma");
236                         }
237                         // ';' means end of tree.                                                                                               
238                         else if((ch=filehandle.peek())==';' || ch=='['){                
239                                 rooted = 1;                                                                     
240                         }                                                                                               
241                         if(rooted != 1){                                                                
242                                 rc = readNewickInt(filehandle, n, T);
243                                 if (rc == -1) { m->mothurOut("error with rc"); m->mothurOutEndLine(); return -1; } //reports an error in reading
244                                 if(filehandle.peek() == ')'){                                   
245                                         readSpecialChar(filehandle,')',"right parenthesis");
246                                 }                                                                                       
247                         }                                                                                               
248                 }
249                 //note: treeclimber had the code below added - not sure why?
250                 else{
251                         filehandle.putback(ch);
252                         char name[MAX_LINE];
253                         filehandle.get(name, MAX_LINE,'\n');
254                         SKIPLINE(filehandle, ch);
255                 
256                         n = T->getIndex(name);
257
258                         if(n!=0){
259                                 m->mothurOut("Internal error: The only taxon is not taxon 0.\n");
260                                 //exit(1);
261                                 readOk = -1; return -1;
262                         }
263                         lc = rc = -1;
264                 } 
265                 
266                 while(((ch=filehandle.get())!=';') && (filehandle.eof() != true)){;}                                            
267                 if(rooted != 1){                                                                        
268                         T->tree[n].setChildren(lc,rc);
269                         T->tree[n].setBranchLength(0);
270                         T->tree[n].setParent(-1);
271                         if(lc!=-1){             T->tree[lc].setParent(n);               }
272                         if(rc!=-1){             T->tree[rc].setParent(n);               }
273                 }
274                 return 0;
275         
276         }
277         catch(exception& e) {
278                 m->errorOut(e, "ReadNewickTree", "readTreeString");
279                 exit(1);
280         }
281 }
282 /**************************************************************************************************/
283
284 int ReadNewickTree::readNewickInt(istream& f, int& n, Tree* T) {
285         try {
286                 
287                 if (m->control_pressed) { return -1; } 
288                 
289                 int c = readNodeChar(f);
290   string k;
291 k = c;
292         cout << "at beginning = " << k <<endl;  
293                 if(c == '('){
294                 
295                         //to account for multifurcating trees generated by fasttree, we are forcing them to be bifurcating
296                         //read all children
297                         vector<int> childrenNodes;
298                         while(f.peek() != ')'){
299                                 int child = readNewickInt(f, n, T);
300                                 if (child == -1) { return -1; } //reports an error in reading
301                                 
302                                 childrenNodes.push_back(child);
303                                 
304                                 //after a child you either have , or ), check for both
305                                 if(f.peek()==')'){  break;  }
306                                 else if (f.peek()==',') {   readSpecialChar(f,',',"comma");  }
307                                 else { string k;
308                         k = f.peek();
309         cout << "in here k = " << k << '\t' << f.tellg() <<endl;
310  }
311                         }
312         cout << childrenNodes.size() << endl;           
313                         if (childrenNodes.size() < 2) {  m->mothurOut("Error in tree, please correct."); m->mothurOutEndLine(); return -1; }
314                         
315                         //then force into 2 node structure
316                         for (int i = 1; i < childrenNodes.size(); i++) {
317                         
318                                 int lc, rc;
319                                 if (i == 1) { lc = childrenNodes[i-1]; rc = childrenNodes[i]; }
320                                 else { lc = n; rc = childrenNodes[i]; }
321                         cout << i << '\t' << lc << '\t' << rc << endl;  
322                                 T->tree[n].setChildren(lc,rc);
323                                 T->tree[lc].setParent(n);
324                                 T->tree[rc].setParent(n);
325                                 
326                                 T->printTree(); cout << endl;
327                                 n++;
328                         }
329                         
330                         //to account for extra ++ in looping
331                         n--;
332                         //int lc = readNewickInt(f, n, T);
333                         //if (lc == -1) { return -1; } //reports an error in reading
334                         
335                         //readSpecialChar(f,',',"comma");
336
337                         //int rc = readNewickInt(f, n, T);
338                         //if (rc == -1) { return -1; }  //reports an error in reading   
339                         
340                         if(f.peek()==')'){      
341                                 readSpecialChar(f,')',"right parenthesis");     
342                                 //to pass over labels in trees
343                                 c=filehandle.get();
344                                 while((c!=',') && (c != -1) && (c!= ':') && (c!=';')){ c=filehandle.get(); }
345                                 filehandle.putback(c);
346                         }                       
347                 
348                         if(f.peek() == ':'){                                                                          
349                                 readSpecialChar(f,':',"colon"); 
350                                                                                 
351                                 if(n >= numNodes){      m->mothurOut("Error: Too many nodes in input tree\n");  readOk = -1; return -1; }
352                                 
353                                 T->tree[n].setBranchLength(readBranchLength(f));
354                         }else{
355                                 T->tree[n].setBranchLength(0.0); 
356                         }                                               
357                         
358                         //to account for multifurcating trees generated by fasttree, we are forcing them to be bifurcating
359                         /*while(f.peek() == ','){
360                         string k;
361                         k = f.peek();
362         cout << "in here k = " << k << '\t' << f.tellg() <<endl;
363                                 //force this node to be left child and read new rc
364                                 T->tree[n].setChildren(lc,rc);
365                                 T->tree[lc].setParent(n);
366                                 T->tree[rc].setParent(n);
367                                 
368                                 T->printTree(); cout << endl;
369                                 lc = n;
370                                 n++;
371                                 
372                                 readSpecialChar(f,',',"comma");
373
374                                 rc = readNewickInt(f, n, T);
375                 
376                                 if (rc == -1) { return -1; }  //reports an error in reading     
377                                 
378                                 if(f.peek()==')'){      
379                                         readSpecialChar(f,')',"right parenthesis");     
380                                         //to pass over labels in trees
381                                         c=filehandle.get();
382                                         while((c!=',') && (c != -1) && (c!= ':') && (c!=';')){ c=filehandle.get(); }
383                                         filehandle.putback(c);
384                                         
385                                         if(f.peek() == ':'){                                                                          
386                                                 readSpecialChar(f,':',"colon"); 
387                                         
388                                                 if(n >= numNodes){      m->mothurOut("Error: Too many nodes in input tree\n");  readOk = -1; return -1; }
389                                         
390                                                 T->tree[n].setBranchLength(readBranchLength(f));
391                                         }else{
392                                                 T->tree[n].setBranchLength(0.0); 
393                                         }                                               
394
395                                         break;
396                                 }                       
397                         }*/
398                 
399                         //T->tree[n].setChildren(lc,rc);
400                         //T->tree[lc].setParent(n);
401                         //T->tree[rc].setParent(n);
402                         //T->printTree();  cout << endl;
403                         
404                         return n++;
405                 
406                 }else{
407                         f.putback(c);
408                         string name = "";
409                         char d=f.get();
410                         while(d != ':' && d != ',' && d!=')' && d!='\n'){                                       
411                                 name += d;
412                                 d=f.get();
413                         }
414                 cout << name << endl;
415                         int blen = 0;
416                         if(d == ':')    {               blen = 1;       }               
417                 
418                         f.putback(d);
419                 
420                         //set group info
421                         string group = globaldata->gTreemap->getGroup(name);
422                         
423                         //find index in tree of name
424                         int n1 = T->getIndex(name);
425                         
426                         //adds sequence names that are not in group file to the "xxx" group
427                         if(group == "not found") {
428                                 m->mothurOut("Name: " + name + " is not in your groupfile, and will be disregarded. \n");  //readOk = -1; return n1;
429                                 
430                                 globaldata->gTreemap->namesOfSeqs.push_back(name);
431                                 globaldata->gTreemap->treemap[name].groupname = "xxx";
432                                 
433                                 map<string, int>::iterator it;
434                                 it = globaldata->gTreemap->seqsPerGroup.find("xxx");
435                                 if (it == globaldata->gTreemap->seqsPerGroup.end()) { //its a new group
436                                         globaldata->gTreemap->namesOfGroups.push_back("xxx");
437                                         globaldata->gTreemap->seqsPerGroup["xxx"] = 1;
438                                 }else {
439                                         globaldata->gTreemap->seqsPerGroup["xxx"]++;
440                                 }
441                                 
442                                 group = "xxx";
443                         }
444                         
445                         vector<string> tempGroup; tempGroup.push_back(group);
446                         
447                         T->tree[n1].setGroup(tempGroup);
448                         T->tree[n1].setChildren(-1,-1);
449                 
450                         if(blen == 1){  
451                                 f.get();
452                                 T->tree[n1].setBranchLength(readBranchLength(f));
453                         }else{
454                                 T->tree[n1].setBranchLength(0.0);
455                         }
456                 
457                         while((c=f.get())!=0 && (c != ':' && c != ',' && c!=')') )              {;}             
458                         f.putback(c);
459                 
460                         return n1;
461                 }
462         }
463         catch(exception& e) {
464                 m->errorOut(e, "ReadNewickTree", "readNewickInt");
465                 exit(1);
466         }
467 }
468 /**************************************************************************************************/
469 /**************************************************************************************************/
470