1 /* BIONJ.c 2012-02-09 */
3 /* Copyright 2007-2008 Olivier Gascuel, Hoa Sien Cuong,
4 R port by Vincent Lefort, bionj() below modified by Emmanuel Paradis */
6 /* This file is part of the R-package `ape'. */
7 /* See the file ../COPYING for licensing issues. */
9 /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
15 ; GERAD - Montreal- Canada ;
16 ; olivierg@crt.umontreal.ca ;
18 ; LIRMM - Montpellier- France ;
21 ; UNIX version, written in C ;
22 ; by Hoa Sien Cuong (Univ. Montreal) ;
24 \*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
28 void Initialize(float **delta, double *X, char **labels, int n, POINTERS *trees);
29 void Print_outputChar(int i, POINTERS *trees, char *output);
30 void bionj(double *X, int *N, char **labels, int *edge1, int *edge2, double *el, char **tl);
31 int Symmetrize(float **delta, int n);
32 void Concatenate(char chain1[MAX_LABEL_LENGTH], int ind, POINTERS *trees, int post);
33 float Distance(int i, int j, float **delta);
34 float Variance(int i, int j, float **delta);
35 int Emptied(int i, float **delta);
36 float Sum_S(int i, float **delta);
37 void Compute_sums_Sx(float **delta, int n);
38 void Best_pair(float **delta, int r, int *a, int *b, int n);
39 float Finish_branch_length(int i, int j, int k, float **delta);
40 void FinishStr (float **delta, int n, POINTERS *trees, char *StrTree);
41 float Agglomerative_criterion(int i, int j, float **delta, int r);
42 float Branch_length(int a, int b, float **delta, int r);
43 float Reduction4(int a, float la, int b, float lb, int i, float lamda, float **delta);
44 float Reduction10(int a, int b, int i, float lamda, float vab, float **delta);
45 float Lamda(int a, int b, float vab, float **delta, int n, int r);
47 /* void printMat(float **delta, int n); */
49 /*;;;;;;;;;;; INPUT, OUTPUT, INITIALIZATION ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
52 ; The delta matrix is read from the input-file. ;
53 ; It is recommended to put it and the executable in ;
54 ; a special directory. The input-file and output-file ;
55 ; can be given as arguments to the executable by ;
56 ; typing them after the executable (Bionj input-file ;
57 ; output-file) or by typing them when asked by the ;
58 ; program. The input-file has to be formated according ;
59 ; the PHYLIP standard. The output file is formated ;
60 ; according to the NEWWICK standard. ;
62 ; The lower-half of the delta matrix is occupied by ;
63 ; dissimilarities. The upper-half of the matrix is ;
64 ; occupied by variances. The first column ;
65 ; is initialized as 0; during the algorithm some ;
66 ; indices are no more used, and the corresponding ;
67 ; positions in the first column are set to 1. ;
69 ; This delta matix is made symmetrical using the rule: ;
70 ; Dij = Dji <- (Dij + Dji)/2. The diagonal is set to 0; ;
71 ; during the further steps of the algorithm, it is used ;
72 ; to store the sums Sx. ;
74 ; A second array, trees, is used to store taxon names. ;
75 ; During the further steps of the algoritm, some ;
76 ; positions in this array are emptied while the others ;
77 ; are used to store subtrees. ;
79 \*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
82 /*;;;;;;;;;;;;;;;;;;;;;;;;;; Initialize ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\
84 ; Description : This function reads an input file and return the ;
85 ; delta matrix and trees: the list of taxa. ;
88 ; float **delta : delta matrix ;
89 ; FILE *input : pointer to input file ;
90 ; int n : number of taxa ;
91 ; char **trees : list of taxa ;
94 ; float **delta : delta matrix ;
95 ; char *trees : list of taxa ;
97 \*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
99 //void Initialize(float **delta, FILE *input, int n, POINTERS *trees)
100 void Initialize(float **delta, double *X, char **labels, int n, POINTERS *trees)
102 int lig; /* matrix line */
103 int col; /* matrix column */
105 //char name_taxon[LEN]; /* taxon name */
106 char name_taxon[MAX_LABEL_LENGTH];
107 char format[MAX_DIGITS];
110 snprintf (format, MAX_DIGITS, "%%%ds", MAX_LABEL_LENGTH);
112 for(lig=1; lig <= n; lig++)
114 //fscanf(input,"%s",name_taxon); /* read taxon name */
115 //fscanf (input, format, name_taxon); /* read taxon name */
116 strncpy (name_taxon, labels[lig-1], MAX_LABEL_LENGTH);
117 name=(WORD *)calloc(1,sizeof(WORD)); /* taxon name is */
118 if(name == NULL) /* put in trees */
120 error("out of memory");
124 strncpy (name->name, name_taxon, MAX_LABEL_LENGTH);
126 trees[lig].head=name;
127 trees[lig].tail=name;
128 for(col=lig; col <= n; col++)
130 //fscanf(input,"%f",&distance); /* read the distance */
131 // &distance = X[XINDEX(lig,col)];
132 delta[col][lig]=X[XINDEX(lig,col)];
133 delta[lig][col]=X[XINDEX(lig,col)];
142 /*;;;;;;;;;;;;;;;;;;;;;;;;;;; Print_output;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\
145 ; Description : This function prints out the tree in the output file. ;
148 ; POINTERS *trees : pointer to the subtrees. ;
149 ; int i : indicate the subtree i to be printed. ;
150 : FILE *output : pointer to the output file. ;
152 ; return value: The phylogenetic tree in the output file. ;
154 \*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
156 void Print_outputChar(int i, POINTERS *trees, char *output)
159 parcour=trees[i].head;
160 while (parcour != NULL && (strlen (output) + strlen (parcour->name) < MAX_INPUT_SIZE))
162 output = strncat (output, parcour->name, strlen (parcour->name));
163 parcour=parcour->suiv;
168 //tree *bionj (FILE *input, boolean isNJ)
169 void bionj(double *X, int *N, char **labels,
170 int *edge1, int *edge2, double *el, char **tl)
172 POINTERS *trees; /* list of subtrees */
173 tree *T; /* the returned tree */
174 char *chain1; /* stringized branch-length */
175 char *str; /* the string containing final tree */
176 int *a, *b; /* pair to be agglomerated */
177 float **delta; /* delta matrix */
178 float la; /* first taxon branch-length */
179 float lb; /* second taxon branch-length */
180 float vab; /* variance of Dab */
184 int r; /* number of subtrees */
185 int n; /* number of taxa */
188 a=(int*)calloc(1,sizeof(int));
189 b=(int*)calloc(1,sizeof(int));
190 chain1=(char *)R_alloc(MAX_LABEL_LENGTH, sizeof(char));
191 str = (char *)R_alloc(MAX_INPUT_SIZE, sizeof(char));
194 strncpy(chain1, "", strlen(chain1));
196 strncpy(str, "", strlen(str));
199 // fscanf(input,"%d",&n);
201 /* Create the delta matrix */
202 delta=(float **)calloc(n+1,sizeof(float*));
205 delta[i]=(float *)calloc(n+1, sizeof(float));
208 error("out of memory");
211 trees=(POINTERS *)calloc(n+1,sizeof(POINTERS));
214 error("out of memory");
216 /* initialise and symmetrize the running delta matrix */
220 Initialize(delta, X, labels, n, trees);
221 // ok=Symmetrize(delta, n);
224 // Rprintf("\n The matrix is not symmetric.\n ");
225 while (r > 3) /* until r=3 */
227 Compute_sums_Sx(delta, n); /* compute the sum Sx */
228 Best_pair(delta, r, a, b, n); /* find the best pair by */
229 vab=Variance(*a, *b, delta); /* minimizing (1) */
230 la=Branch_length(*a, *b, delta, r); /* compute branch-lengths */
231 lb=Branch_length(*b, *a, delta, r); /* using formula (2) */
233 lamda=Lamda(*a, *b, vab, delta, n, r); /* compute lambda* using (9)*/
234 for(i=1; i <= n; i++)
236 if(!Emptied(i,delta) && (i != *a) && (i != *b))
246 y=*a; /* apply reduction formulae */
247 } /* 4 and 10 to delta */
248 delta[x][y]=Reduction4(*a, la, *b, lb, i, lamda, delta);
249 delta[y][x]=Reduction10(*a, *b, i, lamda, vab, delta);
252 strncpy(chain1,"",1); /* agglomerate the subtrees */
253 strncat(chain1,"(",1); /* a and b together with the*/
254 Concatenate(chain1, *a, trees, 0); /* branch-lengths according */
255 strncpy(chain1,"",1); /* to the NEWWICK format */
256 strncat(chain1,":",1);
257 snprintf(chain1+strlen(chain1),MAX_LABEL_LENGTH,"%f",la);
259 strncat(chain1,",",1);
260 Concatenate(chain1,*a, trees, 1);
261 trees[*a].tail->suiv=trees[*b].head;
262 trees[*a].tail=trees[*b].tail;
263 strncpy(chain1,"",1);
264 strncat(chain1,":",1);
265 snprintf(chain1+strlen(chain1),MAX_LABEL_LENGTH,"%f",lb);
267 strncat(chain1,")",1);
268 Concatenate(chain1, *a, trees, 1);
269 delta[*b][0]=1.0; /* make the b line empty */
275 FinishStr (delta, n, trees, str); /* compute the branch-lengths*/
276 /* of the last three subtrees*/
277 /* and print the tree in the */
279 T = readNewickString (str, n);
281 // compareSets(T,species);
284 tree2phylo(T, edge1, edge2, el, tl, n); /* by EP */
302 /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\
306 \*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
309 /*;;;;;;;;;;;;;;;;;;;;;;;;;;; Symmetrize ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\
311 ; Description : This function verifies if the delta matrix is symmetric; ;
312 ; if not the matrix is made symmetric. ;
315 ; float **delta : delta matrix ;
316 ; int n : number of taxa ;
319 ; int symmetric : indicate if the matrix has been made ;
322 \*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
324 int Symmetrize(float **delta, int n)
326 int lig; /* matrix line */
327 int col; /* matrix column */
328 float value; /* symmetrized value */
332 for(lig=1; lig <= n; lig++)
334 for(col=1; col < lig; col++)
336 if(delta[lig][col] != delta[col][lig])
338 value= (delta[lig][col]+delta[col][lig])/2;
339 delta[lig][col]=value;
340 delta[col][lig]=value;
349 /*;;;;;;;;;;;;;;;;;;;;;;;;;;; Concatenate ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\
352 ; Description : This function concatenates a string to another. ;
355 ; char *chain1 : the string to be concatenated. ;
356 ; int ind : indicate the subtree to which concatenate the ;
358 ; POINTERS *trees : pointer to subtrees. ;
359 ; int post : position to which concatenate (front (0) or ;
362 \*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
364 //void Concatenate(char chain1[LEN], int ind, POINTERS *trees, int post)
365 void Concatenate(char chain1[MAX_LABEL_LENGTH], int ind, POINTERS *trees, int post)
369 bran=(WORD *)calloc(1,sizeof(WORD));
372 error("out of memory");
376 strncpy(bran->name,chain1,MAX_LABEL_LENGTH);
381 bran->suiv=trees[ind].head;
382 trees[ind].head=bran;
386 trees[ind].tail->suiv=bran;
387 trees[ind].tail=trees[ind].tail->suiv;
392 /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Distance;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\
394 ; Description : This function retrieve ant return de distance between taxa ;
395 ; i and j from the delta matrix. ;
400 ; float **delta : the delta matrix ;
403 ; float distance : dissimilarity between the two taxa ;
405 \*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
407 float Distance(int i, int j, float **delta)
416 /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Variance;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\
418 ; Description : This function retrieve and return the variance of the ;
419 ; distance between i and j, from the delta matrix. ;
424 ; float **delta : the delta matrix ;
427 ; float distance : the variance of Dij ;
429 \*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
431 float Variance(int i, int j, float **delta)
440 /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Emptied ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\
442 ; Description : This function verifie if a line is emptied or not. ;
445 ; int i : subtree (or line) i ;
446 ; float **delta : the delta matrix ;
449 ; 0 : if not emptied. ;
452 \*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
454 int Emptied(int i, float **delta) /* test if the ith line is emptied */
456 return((int)delta[i][0]);
460 /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Sum_S;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\
462 ; Description : This function retrieves the sum Sx from the diagonal ;
463 ; of the delta matrix. ;
466 ; int i : subtree i ;
467 ; float **delta : the delta matrix ;
470 ; float delta[i][i] : sum Si ;
472 \*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
474 float Sum_S(int i, float **delta) /* get sum Si form the diagonal */
480 /*;;;;;;;;;;;;;;;;;;;;;;;Compute_sums_Sx;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\
482 ; Description : This function computes the sums Sx and store them in the ;
483 ; diagonal the delta matrix. ;
486 ; float **delta : the delta matrix. ;
487 ; int n : the number of taxa ;
489 \*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
491 void Compute_sums_Sx(float **delta, int n)
497 for(i= 1; i <= n ; i++)
499 if(!Emptied(i,delta))
504 if(i != j && !Emptied(j,delta)) /* compute the sum Si */
505 sum=sum + Distance(i,j,delta);
508 delta[i][i]=sum; /* store the sum Si in */
509 } /* delta
\92s diagonal */
513 /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Best_pair;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\
515 ; Description : This function finds the best pair to be agglomerated by ;
516 ; minimizing the agglomerative criterion (1). ;
519 ; float **delta : the delta matrix ;
520 ; int r : number of subtrees ;
521 ; int *a : contain the first taxon of the pair ;
522 ; int *b : contain the second taxon of the pair ;
523 ; int n : number of taxa ;
526 ; int *a : the first taxon of the pair ;
527 ; int *b : the second taxon of the pair ;
528 \*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
530 void Best_pair(float **delta, int r, int *a, int *b, int n)
532 float Qxy; /* value of the criterion calculated*/
533 int x,y; /* the pair which is tested */
534 float Qmin; /* current minimun of the criterion */
537 for(x=1; x <= n; x++)
539 if(!Emptied(x,delta))
543 if(!Emptied(y,delta))
545 Qxy=Agglomerative_criterion(x,y,delta,r);
546 if(Qxy < Qmin-0.000001)
559 /*;;;;;;;;;;;;;;;;;;;;;;Finish_branch_length;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\
561 ; Description : Compute the length of the branch attached ;
562 ; to the subtree i, during the final step ;
565 ; int i : position of subtree i ;
566 ; int j : position of subtree j ;
567 ; int k : position of subtree k ;
571 ; float length : The length of the branch ;
573 \*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
575 float Finish_branch_length(int i, int j, int k, float **delta)
578 length=0.5*(Distance(i,j,delta) + Distance(i,k,delta)
579 -Distance(j,k,delta));
583 void FinishStr (float **delta, int n, POINTERS *trees, char *StrTree)
591 int last[3]; /* the last three subtrees */
594 { /* find the last tree subtree */
595 if(!Emptied(l, delta))
602 tmp = (char*) calloc (12, sizeof(char));
605 length=Finish_branch_length(last[0],last[1],last[2],delta);
606 Print_outputChar (last[0], trees, StrTree);
607 snprintf (tmp, 12, "%f,", length);
608 if (strlen (StrTree) + strlen (tmp) < MAX_INPUT_SIZE-1) {
609 strncat (StrTree, ":", 1);
610 strncat (StrTree, tmp, strlen (tmp));
613 length=Finish_branch_length(last[1],last[0],last[2],delta);
614 Print_outputChar (last[1], trees, StrTree);
615 snprintf (tmp, 12, "%f,", length);
616 if (strlen (StrTree) + strlen (tmp) < MAX_INPUT_SIZE-1) {
617 strncat (StrTree, ":", 1);
618 strncat (StrTree, tmp, strlen (tmp));
621 length=Finish_branch_length(last[2],last[1],last[0],delta);
622 Print_outputChar (last[2], trees, StrTree);
623 snprintf (tmp, 12, "%f", length);
624 if (strlen (StrTree) + strlen (tmp) < MAX_INPUT_SIZE-1) {
625 strncat (StrTree, ":", 1);
626 strncat (StrTree, tmp, strlen (tmp));
629 if (strlen (StrTree) < MAX_INPUT_SIZE-3)
630 strncat (StrTree, ");", 3);
635 bidon=trees[last[i]].head;
647 /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\
651 \*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
654 float Agglomerative_criterion(int i, int j, float **delta, int r)
657 Qij=(r-2)*Distance(i,j,delta) /* Formula (1) */
665 float Branch_length(int a, int b, float **delta, int r)
668 length=0.5*(Distance(a,b,delta) /* Formula (2) */
670 -Sum_S(b,delta))/(r-2));
675 float Reduction4(int a, float la, int b, float lb, int i, float lamda, float **delta)
678 Dui=lamda*(Distance(a,i,delta)-la)
679 +(1-lamda)*(Distance(b,i,delta)-lb); /* Formula (4) */
684 float Reduction10(int a, int b, int i, float lamda, float vab, float **delta)
687 Vci=lamda*Variance(a,i,delta)+(1-lamda)*Variance(b,i,delta)
688 -lamda*(1-lamda)*vab; /*Formula (10) */
692 float Lamda(int a, int b, float vab, float **delta, int n, int r)
701 for(i=1; i <= n ; i++)
703 if(a != i && b != i && !Emptied(i,delta))
704 lamda=lamda + (Variance(b,i,delta) - Variance(a,i,delta));
706 lamda=0.5 + lamda/(2*(r-2)*vab);
707 } /* Formula (9) and the */
708 if(lamda > 1.0) /* constraint that lamda*/
709 lamda = 1.0; /* belongs to [0,1] */
716 void printMat(float **delta, int n)
719 for (i=1; i<=n; i++) {
721 Rprintf ("%f ", delta[i][j]);