1 /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
7 ; GERAD - Montreal- Canada ;
8 ; olivierg@crt.umontreal.ca ;
10 ; LIRMM - Montpellier- France ;
13 ; UNIX version, written in C ;
14 ; by Hoa Sien Cuong (Univ. Montreal) ;
16 \*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
21 void Initialize(float **delta, double *X, char **labels, int n, POINTERS *trees);
22 void Print_outputChar(int i, POINTERS *trees, char *output);
23 void bionj (double *X, int *N, char **labels, char **treeStr);
24 int Symmetrize(float **delta, int n);
25 void Concatenate(char chain1[MAX_LABEL_LENGTH], int ind, POINTERS *trees, int post);
26 float Distance(int i, int j, float **delta);
27 float Variance(int i, int j, float **delta);
28 int Emptied(int i, float **delta);
29 float Sum_S(int i, float **delta);
30 void Compute_sums_Sx(float **delta, int n);
31 void Best_pair(float **delta, int r, int *a, int *b, int n);
32 float Finish_branch_length(int i, int j, int k, float **delta);
33 void FinishStr (float **delta, int n, POINTERS *trees, char *StrTree);
34 float Agglomerative_criterion(int i, int j, float **delta, int r);
35 float Branch_length(int a, int b, float **delta, int r);
36 float Reduction4(int a, float la, int b, float lb, int i, float lamda, float **delta);
37 float Reduction10(int a, int b, int i, float lamda, float vab, float **delta);
38 float Lamda(int a, int b, float vab, float **delta, int n, int r);
40 /* void printMat(float **delta, int n); */
42 /*;;;;;;;;;;; INPUT, OUTPUT, INITIALIZATION ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
45 ; The delta matrix is read from the input-file. ;
46 ; It is recommended to put it and the executable in ;
47 ; a special directory. The input-file and output-file ;
48 ; can be given as arguments to the executable by ;
49 ; typing them after the executable (Bionj input-file ;
50 ; output-file) or by typing them when asked by the ;
51 ; program. The input-file has to be formated according ;
52 ; the PHYLIP standard. The output file is formated ;
53 ; according to the NEWWICK standard. ;
55 ; The lower-half of the delta matrix is occupied by ;
56 ; dissimilarities. The upper-half of the matrix is ;
57 ; occupied by variances. The first column ;
58 ; is initialized as 0; during the algorithm some ;
59 ; indices are no more used, and the corresponding ;
60 ; positions in the first column are set to 1. ;
62 ; This delta matix is made symmetrical using the rule: ;
63 ; Dij = Dji <- (Dij + Dji)/2. The diagonal is set to 0; ;
64 ; during the further steps of the algorithm, it is used ;
65 ; to store the sums Sx. ;
67 ; A second array, trees, is used to store taxon names. ;
68 ; During the further steps of the algoritm, some ;
69 ; positions in this array are emptied while the others ;
70 ; are used to store subtrees. ;
72 \*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
75 /*;;;;;;;;;;;;;;;;;;;;;;;;;; Initialize ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\
77 ; Description : This function reads an input file and return the ;
78 ; delta matrix and trees: the list of taxa. ;
81 ; float **delta : delta matrix ;
82 ; FILE *input : pointer to input file ;
83 ; int n : number of taxa ;
84 ; char **trees : list of taxa ;
87 ; float **delta : delta matrix ;
88 ; char *trees : list of taxa ;
90 \*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
92 //void Initialize(float **delta, FILE *input, int n, POINTERS *trees)
93 void Initialize(float **delta, double *X, char **labels, int n, POINTERS *trees)
95 int lig; /* matrix line */
96 int col; /* matrix column */
98 //char name_taxon[LEN]; /* taxon name */
99 char name_taxon[MAX_LABEL_LENGTH];
100 char format[MAX_DIGITS];
103 snprintf (format, MAX_DIGITS, "%%%ds", MAX_LABEL_LENGTH);
105 for(lig=1; lig <= n; lig++)
107 //fscanf(input,"%s",name_taxon); /* read taxon name */
108 //fscanf (input, format, name_taxon); /* read taxon name */
109 strncpy (name_taxon, labels[lig-1], MAX_LABEL_LENGTH);
110 name=(WORD *)calloc(1,sizeof(WORD)); /* taxon name is */
111 if(name == NULL) /* put in trees */
113 Rprintf("Out of memories !!");
118 strncpy (name->name, name_taxon, MAX_LABEL_LENGTH);
120 trees[lig].head=name;
121 trees[lig].tail=name;
122 for(col=lig; col <= n; col++)
124 //fscanf(input,"%f",&distance); /* read the distance */
125 // &distance = X[XINDEX(lig,col)];
126 delta[col][lig]=X[XINDEX(lig,col)];
127 delta[lig][col]=X[XINDEX(lig,col)];
136 /*;;;;;;;;;;;;;;;;;;;;;;;;;;; Print_output;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\
139 ; Description : This function prints out the tree in the output file. ;
142 ; POINTERS *trees : pointer to the subtrees. ;
143 ; int i : indicate the subtree i to be printed. ;
144 : FILE *output : pointer to the output file. ;
146 ; return value: The phylogenetic tree in the output file. ;
148 \*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
150 void Print_outputChar(int i, POINTERS *trees, char *output)
153 parcour=trees[i].head;
154 while (parcour != NULL && (strlen (output) + strlen (parcour->name) < MAX_INPUT_SIZE))
156 output = strncat (output, parcour->name, strlen (parcour->name));
157 parcour=parcour->suiv;
162 /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
166 ; argc is the number of arguments ;
167 ; **argv contains the arguments: ;
168 ; the first argument has to be BIONJ; ;
169 ; the second is the inptu-file; ;
170 ; the third is the output-file. ;
171 ; When the input and output files are ;
172 ; not given, the user is asked for them. ;
174 \*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
177 //tree *bionj (FILE *input, boolean isNJ)
178 void bionj (double *X, int *N, char **labels, char **treeStr)
180 POINTERS *trees; /* list of subtrees */
181 tree *T; /* the returned tree */
182 char *chain1; /* stringized branch-length */
183 // char *chain2; /* idem */
184 char *str; /* the string containing final tree */
185 int *a, *b; /* pair to be agglomerated */
186 float **delta; /* delta matrix */
187 float la; /* first taxon branch-length */
188 float lb; /* second taxon branch-length */
189 float vab; /* variance of Dab */
193 int r; /* number of subtrees */
194 int n; /* number of taxa */
197 a=(int*)calloc(1,sizeof(int));
198 b=(int*)calloc(1,sizeof(int));
199 chain1=(char *)calloc(MAX_LABEL_LENGTH,sizeof(char));
200 str = (char *)R_alloc(MAX_INPUT_SIZE, sizeof(char));
203 strncpy(str, "", strlen(str));
206 // fscanf(input,"%d",&n);
208 /* Create the delta matrix */
209 delta=(float **)calloc(n+1,sizeof(float*));
212 delta[i]=(float *)calloc(n+1, sizeof(float));
215 Rprintf("Out of memories!!");
219 trees=(POINTERS *)calloc(n+1,sizeof(POINTERS));
222 Rprintf("Out of memories!!");
225 /* initialise and symmetrize the running delta matrix */
229 Initialize(delta, X, labels, n, trees);
230 // ok=Symmetrize(delta, n);
233 // Rprintf("\n The matrix is not symmetric.\n ");
234 while (r > 3) /* until r=3 */
236 Compute_sums_Sx(delta, n); /* compute the sum Sx */
237 Best_pair(delta, r, a, b, n); /* find the best pair by */
238 vab=Variance(*a, *b, delta); /* minimizing (1) */
239 la=Branch_length(*a, *b, delta, r); /* compute branch-lengths */
240 lb=Branch_length(*b, *a, delta, r); /* using formula (2) */
242 lamda=Lamda(*a, *b, vab, delta, n, r); /* compute lambda* using (9)*/
243 for(i=1; i <= n; i++)
245 if(!Emptied(i,delta) && (i != *a) && (i != *b))
255 y=*a; /* apply reduction formulae */
256 } /* 4 and 10 to delta */
257 delta[x][y]=Reduction4(*a, la, *b, lb, i, lamda, delta);
258 delta[y][x]=Reduction10(*a, *b, i, lamda, vab, delta);
261 strncpy(chain1,"",1); /* agglomerate the subtrees */
262 strncat(chain1,"(",1); /* a and b together with the*/
263 Concatenate(chain1, *a, trees, 0); /* branch-lengths according */
264 strncpy(chain1,"",1); /* to the NEWWICK format */
265 strncat(chain1,":",1);
266 snprintf(chain1+strlen(chain1),MAX_LABEL_LENGTH,"%f",la);
268 strncat(chain1,",",1);
269 Concatenate(chain1,*a, trees, 1);
270 trees[*a].tail->suiv=trees[*b].head;
271 trees[*a].tail=trees[*b].tail;
272 strncpy(chain1,"",1);
273 strncat(chain1,":",1);
274 snprintf(chain1+strlen(chain1),MAX_LABEL_LENGTH,"%f",lb);
276 strncat(chain1,")",1);
277 Concatenate(chain1, *a, trees, 1);
278 delta[*b][0]=1.0; /* make the b line empty */
284 FinishStr (delta, n, trees, str); /* compute the branch-lengths*/
285 /* of the last three subtrees*/
286 /* and print the tree in the */
288 T = readNewickString (str, n);
290 // compareSets(T,species);
292 *treeStr = (char *)R_alloc(MAX_INPUT_SIZE, sizeof(char));
294 if (strlen(*treeStr))
295 strncpy(*treeStr, "", strlen(*treeStr));
297 NewickPrintTreeStr (T, *treeStr);
317 /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\
321 \*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
325 /*;;;;;;;;;;;;;;;;;;;;;;;;;;; Symmetrize ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\
327 ; Description : This function verifies if the delta matrix is symmetric; ;
328 ; if not the matrix is made symmetric. ;
331 ; float **delta : delta matrix ;
332 ; int n : number of taxa ;
335 ; int symmetric : indicate if the matrix has been made ;
338 \*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
340 int Symmetrize(float **delta, int n)
342 int lig; /* matrix line */
343 int col; /* matrix column */
344 float value; /* symmetrized value */
348 for(lig=1; lig <= n; lig++)
350 for(col=1; col < lig; col++)
352 if(delta[lig][col] != delta[col][lig])
354 value= (delta[lig][col]+delta[col][lig])/2;
355 delta[lig][col]=value;
356 delta[col][lig]=value;
367 /*;;;;;;;;;;;;;;;;;;;;;;;;;;; Concatenate ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\
370 ; Description : This function concatenates a string to another. ;
373 ; char *chain1 : the string to be concatenated. ;
374 ; int ind : indicate the subtree to which concatenate the ;
376 ; POINTERS *trees : pointer to subtrees. ;
377 ; int post : position to which concatenate (front (0) or ;
380 \*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
382 //void Concatenate(char chain1[LEN], int ind, POINTERS *trees, int post)
383 void Concatenate(char chain1[MAX_LABEL_LENGTH], int ind, POINTERS *trees, int post)
387 bran=(WORD *)calloc(1,sizeof(WORD));
390 Rprintf("Out of memories");
395 strncpy(bran->name,chain1,MAX_LABEL_LENGTH);
400 bran->suiv=trees[ind].head;
401 trees[ind].head=bran;
405 trees[ind].tail->suiv=bran;
406 trees[ind].tail=trees[ind].tail->suiv;
411 /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Distance;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\
413 ; Description : This function retrieve ant return de distance between taxa ;
414 ; i and j from the delta matrix. ;
419 ; float **delta : the delta matrix ;
422 ; float distance : dissimilarity between the two taxa ;
424 \*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
426 float Distance(int i, int j, float **delta)
435 /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Variance;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\
437 ; Description : This function retrieve and return the variance of the ;
438 ; distance between i and j, from the delta matrix. ;
443 ; float **delta : the delta matrix ;
446 ; float distance : the variance of Dij ;
448 \*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
450 float Variance(int i, int j, float **delta)
459 /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Emptied ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\
461 ; Description : This function verifie if a line is emptied or not. ;
464 ; int i : subtree (or line) i ;
465 ; float **delta : the delta matrix ;
468 ; 0 : if not emptied. ;
471 \*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
473 int Emptied(int i, float **delta) /* test if the ith line is emptied */
475 return((int)delta[i][0]);
479 /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Sum_S;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\
481 ; Description : This function retrieves the sum Sx from the diagonal ;
482 ; of the delta matrix. ;
485 ; int i : subtree i ;
486 ; float **delta : the delta matrix ;
489 ; float delta[i][i] : sum Si ;
491 \*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
493 float Sum_S(int i, float **delta) /* get sum Si form the diagonal */
499 /*;;;;;;;;;;;;;;;;;;;;;;;Compute_sums_Sx;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\
501 ; Description : This function computes the sums Sx and store them in the ;
502 ; diagonal the delta matrix. ;
505 ; float **delta : the delta matrix. ;
506 ; int n : the number of taxa ;
508 \*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
510 void Compute_sums_Sx(float **delta, int n)
516 for(i= 1; i <= n ; i++)
518 if(!Emptied(i,delta))
523 if(i != j && !Emptied(j,delta)) /* compute the sum Si */
524 sum=sum + Distance(i,j,delta);
527 delta[i][i]=sum; /* store the sum Si in */
528 } /* delta
\92s diagonal */
532 /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Best_pair;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\
534 ; Description : This function finds the best pair to be agglomerated by ;
535 ; minimizing the agglomerative criterion (1). ;
538 ; float **delta : the delta matrix ;
539 ; int r : number of subtrees ;
540 ; int *a : contain the first taxon of the pair ;
541 ; int *b : contain the second taxon of the pair ;
542 ; int n : number of taxa ;
545 ; int *a : the first taxon of the pair ;
546 ; int *b : the second taxon of the pair ;
547 \*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
549 void Best_pair(float **delta, int r, int *a, int *b, int n)
551 float Qxy; /* value of the criterion calculated*/
552 int x,y; /* the pair which is tested */
553 float Qmin; /* current minimun of the criterion */
556 for(x=1; x <= n; x++)
558 if(!Emptied(x,delta))
562 if(!Emptied(y,delta))
564 Qxy=Agglomerative_criterion(x,y,delta,r);
565 if(Qxy < Qmin-0.000001)
578 /*;;;;;;;;;;;;;;;;;;;;;;Finish_branch_length;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\
580 ; Description : Compute the length of the branch attached ;
581 ; to the subtree i, during the final step ;
584 ; int i : position of subtree i ;
585 ; int j : position of subtree j ;
586 ; int k : position of subtree k ;
590 ; float length : The length of the branch ;
592 \*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
594 float Finish_branch_length(int i, int j, int k, float **delta)
597 length=0.5*(Distance(i,j,delta) + Distance(i,k,delta)
598 -Distance(j,k,delta));
603 /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Finish;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\
605 ; Description : This function compute the length of the lasts three ;
606 ; subtrees and write the tree in the output file. ;
609 ; float **delta : the delta matrix ;
610 ; int n : the number of taxa ;
611 ; WORD *trees : list of subtrees ;
614 \*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
616 void Finish(float **delta, int n, POINTERS *trees, FILE *output)
623 int last[3]; // the last three subtrees
626 { // find the last tree subtree
627 if(!Emptied(l, delta))
635 length=Finish_branch_length(last[0],last[1],last[2],delta);
637 Print_output(last[0],trees,output);
639 // gcvt(length,PREC, str);
640 // fprintf(output,"%s,",str);
641 fprintf(output,"%f,",length);
643 length=Finish_branch_length(last[1],last[0],last[2],delta);
644 Print_output(last[1],trees,output);
646 // gcvt(length,PREC, str);
647 // fprintf(output,"%s,",str);
648 fprintf(output,"%f,",length);
650 length=Finish_branch_length(last[2],last[1],last[0],delta);
651 Print_output(last[2],trees,output);
653 // gcvt(length,PREC,str);
654 // fprintf(output,"%s",str);
655 fprintf(output,"%f",length);
656 fprintf(output,");");
657 fprintf(output,"\n");
661 bidon=trees[last[i]].head;
673 void FinishStr (float **delta, int n, POINTERS *trees, char *StrTree)
681 int last[3]; /* the last three subtrees */
684 { /* find the last tree subtree */
685 if(!Emptied(l, delta))
692 tmp = (char*) calloc (12, sizeof(char));
695 length=Finish_branch_length(last[0],last[1],last[2],delta);
696 Print_outputChar (last[0], trees, StrTree);
697 snprintf (tmp, 12, "%f,", length);
698 if (strlen (StrTree) + strlen (tmp) < MAX_INPUT_SIZE-1) {
699 strncat (StrTree, ":", 1);
700 strncat (StrTree, tmp, strlen (tmp));
703 length=Finish_branch_length(last[1],last[0],last[2],delta);
704 Print_outputChar (last[1], trees, StrTree);
705 snprintf (tmp, 12, "%f,", length);
706 if (strlen (StrTree) + strlen (tmp) < MAX_INPUT_SIZE-1) {
707 strncat (StrTree, ":", 1);
708 strncat (StrTree, tmp, strlen (tmp));
711 length=Finish_branch_length(last[2],last[1],last[0],delta);
712 Print_outputChar (last[2], trees, StrTree);
713 snprintf (tmp, 12, "%f", length);
714 if (strlen (StrTree) + strlen (tmp) < MAX_INPUT_SIZE-1) {
715 strncat (StrTree, ":", 1);
716 strncat (StrTree, tmp, strlen (tmp));
719 if (strlen (StrTree) < MAX_INPUT_SIZE-3)
720 strncat (StrTree, ");\n", 3);
725 bidon=trees[last[i]].head;
737 /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\
741 \*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
744 float Agglomerative_criterion(int i, int j, float **delta, int r)
747 Qij=(r-2)*Distance(i,j,delta) /* Formula (1) */
755 float Branch_length(int a, int b, float **delta, int r)
758 length=0.5*(Distance(a,b,delta) /* Formula (2) */
760 -Sum_S(b,delta))/(r-2));
765 float Reduction4(int a, float la, int b, float lb, int i, float lamda, float **delta)
768 Dui=lamda*(Distance(a,i,delta)-la)
769 +(1-lamda)*(Distance(b,i,delta)-lb); /* Formula (4) */
774 float Reduction10(int a, int b, int i, float lamda, float vab, float **delta)
777 Vci=lamda*Variance(a,i,delta)+(1-lamda)*Variance(b,i,delta)
778 -lamda*(1-lamda)*vab; /*Formula (10) */
782 float Lamda(int a, int b, float vab, float **delta, int n, int r)
791 for(i=1; i <= n ; i++)
793 if(a != i && b != i && !Emptied(i,delta))
794 lamda=lamda + (Variance(b,i,delta) - Variance(a,i,delta));
796 lamda=0.5 + lamda/(2*(r-2)*vab);
797 } /* Formula (9) and the */
798 if(lamda > 1.0) /* constraint that lamda*/
799 lamda = 1.0; /* belongs to [0,1] */
806 /* void printMat(float **delta, int n) */
809 /* for (i=1; i<=n; i++) { */
810 /* for (j=1; j<=n; j++) */
811 /* Rprintf ("%f ", delta[i][j]); */