1 /* BIONJ.c 2008-01-14 */
3 /* Copyright 2007-2008 Olivier Gascuel, Hoa Sien Cuong,
4 R port by Vincent Lefort, bionj() below modified by Emmanuel Paradis */
6 /* This file is part of the R-package `ape'. */
7 /* See the file ../COPYING for licensing issues. */
9 /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
15 ; GERAD - Montreal- Canada ;
16 ; olivierg@crt.umontreal.ca ;
18 ; LIRMM - Montpellier- France ;
21 ; UNIX version, written in C ;
22 ; by Hoa Sien Cuong (Univ. Montreal) ;
24 \*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
28 void Initialize(float **delta, double *X, char **labels, int n, POINTERS *trees);
29 void Print_outputChar(int i, POINTERS *trees, char *output);
30 void bionj(double *X, int *N, char **labels, int *edge1, int *edge2, double *el, char **tl);
31 int Symmetrize(float **delta, int n);
32 void Concatenate(char chain1[MAX_LABEL_LENGTH], int ind, POINTERS *trees, int post);
33 float Distance(int i, int j, float **delta);
34 float Variance(int i, int j, float **delta);
35 int Emptied(int i, float **delta);
36 float Sum_S(int i, float **delta);
37 void Compute_sums_Sx(float **delta, int n);
38 void Best_pair(float **delta, int r, int *a, int *b, int n);
39 float Finish_branch_length(int i, int j, int k, float **delta);
40 void FinishStr (float **delta, int n, POINTERS *trees, char *StrTree);
41 float Agglomerative_criterion(int i, int j, float **delta, int r);
42 float Branch_length(int a, int b, float **delta, int r);
43 float Reduction4(int a, float la, int b, float lb, int i, float lamda, float **delta);
44 float Reduction10(int a, int b, int i, float lamda, float vab, float **delta);
45 float Lamda(int a, int b, float vab, float **delta, int n, int r);
47 /* void printMat(float **delta, int n); */
49 /*;;;;;;;;;;; INPUT, OUTPUT, INITIALIZATION ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
52 ; The delta matrix is read from the input-file. ;
53 ; It is recommended to put it and the executable in ;
54 ; a special directory. The input-file and output-file ;
55 ; can be given as arguments to the executable by ;
56 ; typing them after the executable (Bionj input-file ;
57 ; output-file) or by typing them when asked by the ;
58 ; program. The input-file has to be formated according ;
59 ; the PHYLIP standard. The output file is formated ;
60 ; according to the NEWWICK standard. ;
62 ; The lower-half of the delta matrix is occupied by ;
63 ; dissimilarities. The upper-half of the matrix is ;
64 ; occupied by variances. The first column ;
65 ; is initialized as 0; during the algorithm some ;
66 ; indices are no more used, and the corresponding ;
67 ; positions in the first column are set to 1. ;
69 ; This delta matix is made symmetrical using the rule: ;
70 ; Dij = Dji <- (Dij + Dji)/2. The diagonal is set to 0; ;
71 ; during the further steps of the algorithm, it is used ;
72 ; to store the sums Sx. ;
74 ; A second array, trees, is used to store taxon names. ;
75 ; During the further steps of the algoritm, some ;
76 ; positions in this array are emptied while the others ;
77 ; are used to store subtrees. ;
79 \*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
82 /*;;;;;;;;;;;;;;;;;;;;;;;;;; Initialize ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\
84 ; Description : This function reads an input file and return the ;
85 ; delta matrix and trees: the list of taxa. ;
88 ; float **delta : delta matrix ;
89 ; FILE *input : pointer to input file ;
90 ; int n : number of taxa ;
91 ; char **trees : list of taxa ;
94 ; float **delta : delta matrix ;
95 ; char *trees : list of taxa ;
97 \*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
99 //void Initialize(float **delta, FILE *input, int n, POINTERS *trees)
100 void Initialize(float **delta, double *X, char **labels, int n, POINTERS *trees)
102 int lig; /* matrix line */
103 int col; /* matrix column */
105 //char name_taxon[LEN]; /* taxon name */
106 char name_taxon[MAX_LABEL_LENGTH];
107 char format[MAX_DIGITS];
110 snprintf (format, MAX_DIGITS, "%%%ds", MAX_LABEL_LENGTH);
112 for(lig=1; lig <= n; lig++)
114 //fscanf(input,"%s",name_taxon); /* read taxon name */
115 //fscanf (input, format, name_taxon); /* read taxon name */
116 strncpy (name_taxon, labels[lig-1], MAX_LABEL_LENGTH);
117 name=(WORD *)calloc(1,sizeof(WORD)); /* taxon name is */
118 if(name == NULL) /* put in trees */
120 Rprintf("Out of memories !!");
125 strncpy (name->name, name_taxon, MAX_LABEL_LENGTH);
127 trees[lig].head=name;
128 trees[lig].tail=name;
129 for(col=lig; col <= n; col++)
131 //fscanf(input,"%f",&distance); /* read the distance */
132 // &distance = X[XINDEX(lig,col)];
133 delta[col][lig]=X[XINDEX(lig,col)];
134 delta[lig][col]=X[XINDEX(lig,col)];
143 /*;;;;;;;;;;;;;;;;;;;;;;;;;;; Print_output;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\
146 ; Description : This function prints out the tree in the output file. ;
149 ; POINTERS *trees : pointer to the subtrees. ;
150 ; int i : indicate the subtree i to be printed. ;
151 : FILE *output : pointer to the output file. ;
153 ; return value: The phylogenetic tree in the output file. ;
155 \*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
157 void Print_outputChar(int i, POINTERS *trees, char *output)
160 parcour=trees[i].head;
161 while (parcour != NULL && (strlen (output) + strlen (parcour->name) < MAX_INPUT_SIZE))
163 output = strncat (output, parcour->name, strlen (parcour->name));
164 parcour=parcour->suiv;
169 //tree *bionj (FILE *input, boolean isNJ)
170 void bionj(double *X, int *N, char **labels,
171 int *edge1, int *edge2, double *el, char **tl)
173 POINTERS *trees; /* list of subtrees */
174 tree *T; /* the returned tree */
175 char *chain1; /* stringized branch-length */
176 char *str; /* the string containing final tree */
177 int *a, *b; /* pair to be agglomerated */
178 float **delta; /* delta matrix */
179 float la; /* first taxon branch-length */
180 float lb; /* second taxon branch-length */
181 float vab; /* variance of Dab */
185 int r; /* number of subtrees */
186 int n; /* number of taxa */
189 a=(int*)calloc(1,sizeof(int));
190 b=(int*)calloc(1,sizeof(int));
191 chain1=(char *)R_alloc(MAX_LABEL_LENGTH, sizeof(char));
192 str = (char *)R_alloc(MAX_INPUT_SIZE, sizeof(char));
195 strncpy(chain1, "", strlen(chain1));
197 strncpy(str, "", strlen(str));
200 // fscanf(input,"%d",&n);
202 /* Create the delta matrix */
203 delta=(float **)calloc(n+1,sizeof(float*));
206 delta[i]=(float *)calloc(n+1, sizeof(float));
209 Rprintf("Out of memories!!");
213 trees=(POINTERS *)calloc(n+1,sizeof(POINTERS));
216 Rprintf("Out of memories!!");
219 /* initialise and symmetrize the running delta matrix */
223 Initialize(delta, X, labels, n, trees);
224 // ok=Symmetrize(delta, n);
227 // Rprintf("\n The matrix is not symmetric.\n ");
228 while (r > 3) /* until r=3 */
230 Compute_sums_Sx(delta, n); /* compute the sum Sx */
231 Best_pair(delta, r, a, b, n); /* find the best pair by */
232 vab=Variance(*a, *b, delta); /* minimizing (1) */
233 la=Branch_length(*a, *b, delta, r); /* compute branch-lengths */
234 lb=Branch_length(*b, *a, delta, r); /* using formula (2) */
236 lamda=Lamda(*a, *b, vab, delta, n, r); /* compute lambda* using (9)*/
237 for(i=1; i <= n; i++)
239 if(!Emptied(i,delta) && (i != *a) && (i != *b))
249 y=*a; /* apply reduction formulae */
250 } /* 4 and 10 to delta */
251 delta[x][y]=Reduction4(*a, la, *b, lb, i, lamda, delta);
252 delta[y][x]=Reduction10(*a, *b, i, lamda, vab, delta);
255 strncpy(chain1,"",1); /* agglomerate the subtrees */
256 strncat(chain1,"(",1); /* a and b together with the*/
257 Concatenate(chain1, *a, trees, 0); /* branch-lengths according */
258 strncpy(chain1,"",1); /* to the NEWWICK format */
259 strncat(chain1,":",1);
260 snprintf(chain1+strlen(chain1),MAX_LABEL_LENGTH,"%f",la);
262 strncat(chain1,",",1);
263 Concatenate(chain1,*a, trees, 1);
264 trees[*a].tail->suiv=trees[*b].head;
265 trees[*a].tail=trees[*b].tail;
266 strncpy(chain1,"",1);
267 strncat(chain1,":",1);
268 snprintf(chain1+strlen(chain1),MAX_LABEL_LENGTH,"%f",lb);
270 strncat(chain1,")",1);
271 Concatenate(chain1, *a, trees, 1);
272 delta[*b][0]=1.0; /* make the b line empty */
278 FinishStr (delta, n, trees, str); /* compute the branch-lengths*/
279 /* of the last three subtrees*/
280 /* and print the tree in the */
282 T = readNewickString (str, n);
284 // compareSets(T,species);
287 tree2phylo(T, edge1, edge2, el, tl, n); /* by EP */
305 /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\
309 \*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
312 /*;;;;;;;;;;;;;;;;;;;;;;;;;;; Symmetrize ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\
314 ; Description : This function verifies if the delta matrix is symmetric; ;
315 ; if not the matrix is made symmetric. ;
318 ; float **delta : delta matrix ;
319 ; int n : number of taxa ;
322 ; int symmetric : indicate if the matrix has been made ;
325 \*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
327 int Symmetrize(float **delta, int n)
329 int lig; /* matrix line */
330 int col; /* matrix column */
331 float value; /* symmetrized value */
335 for(lig=1; lig <= n; lig++)
337 for(col=1; col < lig; col++)
339 if(delta[lig][col] != delta[col][lig])
341 value= (delta[lig][col]+delta[col][lig])/2;
342 delta[lig][col]=value;
343 delta[col][lig]=value;
352 /*;;;;;;;;;;;;;;;;;;;;;;;;;;; Concatenate ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\
355 ; Description : This function concatenates a string to another. ;
358 ; char *chain1 : the string to be concatenated. ;
359 ; int ind : indicate the subtree to which concatenate the ;
361 ; POINTERS *trees : pointer to subtrees. ;
362 ; int post : position to which concatenate (front (0) or ;
365 \*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
367 //void Concatenate(char chain1[LEN], int ind, POINTERS *trees, int post)
368 void Concatenate(char chain1[MAX_LABEL_LENGTH], int ind, POINTERS *trees, int post)
372 bran=(WORD *)calloc(1,sizeof(WORD));
375 Rprintf("Out of memories");
380 strncpy(bran->name,chain1,MAX_LABEL_LENGTH);
385 bran->suiv=trees[ind].head;
386 trees[ind].head=bran;
390 trees[ind].tail->suiv=bran;
391 trees[ind].tail=trees[ind].tail->suiv;
396 /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Distance;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\
398 ; Description : This function retrieve ant return de distance between taxa ;
399 ; i and j from the delta matrix. ;
404 ; float **delta : the delta matrix ;
407 ; float distance : dissimilarity between the two taxa ;
409 \*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
411 float Distance(int i, int j, float **delta)
420 /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Variance;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\
422 ; Description : This function retrieve and return the variance of the ;
423 ; distance between i and j, from the delta matrix. ;
428 ; float **delta : the delta matrix ;
431 ; float distance : the variance of Dij ;
433 \*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
435 float Variance(int i, int j, float **delta)
444 /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Emptied ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\
446 ; Description : This function verifie if a line is emptied or not. ;
449 ; int i : subtree (or line) i ;
450 ; float **delta : the delta matrix ;
453 ; 0 : if not emptied. ;
456 \*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
458 int Emptied(int i, float **delta) /* test if the ith line is emptied */
460 return((int)delta[i][0]);
464 /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Sum_S;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\
466 ; Description : This function retrieves the sum Sx from the diagonal ;
467 ; of the delta matrix. ;
470 ; int i : subtree i ;
471 ; float **delta : the delta matrix ;
474 ; float delta[i][i] : sum Si ;
476 \*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
478 float Sum_S(int i, float **delta) /* get sum Si form the diagonal */
484 /*;;;;;;;;;;;;;;;;;;;;;;;Compute_sums_Sx;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\
486 ; Description : This function computes the sums Sx and store them in the ;
487 ; diagonal the delta matrix. ;
490 ; float **delta : the delta matrix. ;
491 ; int n : the number of taxa ;
493 \*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
495 void Compute_sums_Sx(float **delta, int n)
501 for(i= 1; i <= n ; i++)
503 if(!Emptied(i,delta))
508 if(i != j && !Emptied(j,delta)) /* compute the sum Si */
509 sum=sum + Distance(i,j,delta);
512 delta[i][i]=sum; /* store the sum Si in */
513 } /* delta
\92s diagonal */
517 /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Best_pair;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\
519 ; Description : This function finds the best pair to be agglomerated by ;
520 ; minimizing the agglomerative criterion (1). ;
523 ; float **delta : the delta matrix ;
524 ; int r : number of subtrees ;
525 ; int *a : contain the first taxon of the pair ;
526 ; int *b : contain the second taxon of the pair ;
527 ; int n : number of taxa ;
530 ; int *a : the first taxon of the pair ;
531 ; int *b : the second taxon of the pair ;
532 \*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
534 void Best_pair(float **delta, int r, int *a, int *b, int n)
536 float Qxy; /* value of the criterion calculated*/
537 int x,y; /* the pair which is tested */
538 float Qmin; /* current minimun of the criterion */
541 for(x=1; x <= n; x++)
543 if(!Emptied(x,delta))
547 if(!Emptied(y,delta))
549 Qxy=Agglomerative_criterion(x,y,delta,r);
550 if(Qxy < Qmin-0.000001)
563 /*;;;;;;;;;;;;;;;;;;;;;;Finish_branch_length;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\
565 ; Description : Compute the length of the branch attached ;
566 ; to the subtree i, during the final step ;
569 ; int i : position of subtree i ;
570 ; int j : position of subtree j ;
571 ; int k : position of subtree k ;
575 ; float length : The length of the branch ;
577 \*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
579 float Finish_branch_length(int i, int j, int k, float **delta)
582 length=0.5*(Distance(i,j,delta) + Distance(i,k,delta)
583 -Distance(j,k,delta));
587 void FinishStr (float **delta, int n, POINTERS *trees, char *StrTree)
595 int last[3]; /* the last three subtrees */
598 { /* find the last tree subtree */
599 if(!Emptied(l, delta))
606 tmp = (char*) calloc (12, sizeof(char));
609 length=Finish_branch_length(last[0],last[1],last[2],delta);
610 Print_outputChar (last[0], trees, StrTree);
611 snprintf (tmp, 12, "%f,", length);
612 if (strlen (StrTree) + strlen (tmp) < MAX_INPUT_SIZE-1) {
613 strncat (StrTree, ":", 1);
614 strncat (StrTree, tmp, strlen (tmp));
617 length=Finish_branch_length(last[1],last[0],last[2],delta);
618 Print_outputChar (last[1], trees, StrTree);
619 snprintf (tmp, 12, "%f,", length);
620 if (strlen (StrTree) + strlen (tmp) < MAX_INPUT_SIZE-1) {
621 strncat (StrTree, ":", 1);
622 strncat (StrTree, tmp, strlen (tmp));
625 length=Finish_branch_length(last[2],last[1],last[0],delta);
626 Print_outputChar (last[2], trees, StrTree);
627 snprintf (tmp, 12, "%f", length);
628 if (strlen (StrTree) + strlen (tmp) < MAX_INPUT_SIZE-1) {
629 strncat (StrTree, ":", 1);
630 strncat (StrTree, tmp, strlen (tmp));
633 if (strlen (StrTree) < MAX_INPUT_SIZE-3)
634 strncat (StrTree, ");", 3);
639 bidon=trees[last[i]].head;
651 /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\
655 \*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
658 float Agglomerative_criterion(int i, int j, float **delta, int r)
661 Qij=(r-2)*Distance(i,j,delta) /* Formula (1) */
669 float Branch_length(int a, int b, float **delta, int r)
672 length=0.5*(Distance(a,b,delta) /* Formula (2) */
674 -Sum_S(b,delta))/(r-2));
679 float Reduction4(int a, float la, int b, float lb, int i, float lamda, float **delta)
682 Dui=lamda*(Distance(a,i,delta)-la)
683 +(1-lamda)*(Distance(b,i,delta)-lb); /* Formula (4) */
688 float Reduction10(int a, int b, int i, float lamda, float vab, float **delta)
691 Vci=lamda*Variance(a,i,delta)+(1-lamda)*Variance(b,i,delta)
692 -lamda*(1-lamda)*vab; /*Formula (10) */
696 float Lamda(int a, int b, float vab, float **delta, int n, int r)
705 for(i=1; i <= n ; i++)
707 if(a != i && b != i && !Emptied(i,delta))
708 lamda=lamda + (Variance(b,i,delta) - Variance(a,i,delta));
710 lamda=0.5 + lamda/(2*(r-2)*vab);
711 } /* Formula (9) and the */
712 if(lamda > 1.0) /* constraint that lamda*/
713 lamda = 1.0; /* belongs to [0,1] */
720 void printMat(float **delta, int n)
723 for (i=1; i<=n; i++) {
725 Rprintf ("%f ", delta[i][j]);