]> git.donarmstrong.com Git - ape.git/blob - src/bionjs.c
big update with files from Andrei
[ape.git] / src / bionjs.c
1 /* bionjs.c    2011-10-11 */
2
3 /* Copyright 2011 Andrei-Alin Popescu */
4
5 /* This file is part of the R-package `ape'. */
6 /* See the file ../COPYING for licensing issues. */
7
8 #include "ape.h"
9
10 void bionjs(double *D, int *N, int *edge1, int *edge2, double *edge_length,int* fsS)
11 {       //assume missing values are denoted by -1
12         double *S,*R , *v,*new_v, Sdist, Ndist, *new_dist, A, B, smallest_S, x, y;
13         int n, i, j, k, ij, smallest, OTU1, OTU2, cur_nod, o_l, *otu_label;
14         /*for(i=0;i<n*(n-1)/2;i++)
15           {if(isNA(D[i])){D[i]=-1;}
16           }*/
17         int *s;//s contains |Sxy|, which is all we need for agglomeration
18         double *newR;
19         int *newS;
20         int fS=*fsS;
21
22         R = &Sdist;
23         new_dist = &Ndist;
24         otu_label = &o_l;
25         n = *N;
26         cur_nod = 2*n - 2;
27
28         R = (double*)R_alloc(n*(n - 1)/2, sizeof(double));
29         v = (double*)R_alloc(n*(n - 1)/2, sizeof(double));
30         new_v = (double*)R_alloc(n*(n - 1)/2, sizeof(double));
31         S = (double*)R_alloc(n + 1, sizeof(double));
32         newR = (double*)R_alloc(n*(n - 1)/2, sizeof(double));
33         new_dist = (double*)R_alloc(n*(n - 1)/2, sizeof(double));
34         otu_label = (int*)R_alloc(n + 1, sizeof(int));
35         s = (int*)R_alloc(n*(n - 1)/2, sizeof(int));
36         newS = (int*)R_alloc(n*(n - 1)/2, sizeof(int));
37
38         for (i = 1; i <= n; i++) otu_label[i] = i; /* otu_label[0] is not used */
39
40         k = 0;
41         //populate the v matrix
42         for(i=1;i<n;i++)
43          for(j=i+1;j<=n;j++)
44           {
45            v[give_index(i,j,n)]=D[give_index(i,j,n)];
46           }
47         //compute Sxy and Rxy
48         for(i=0;i<n*(n-1)/2;i++)
49           {newR[i]=0;
50            newS[i]=0;
51            s[i]=0;
52            R[i]=0;
53           }
54
55         for(i=1;i<n;i++)
56          for(j=i+1;j<=n;j++)
57          {//algorithm assumes i,j /in Sij, so skip pair if it is not known
58           if(D[give_index(i,j,n)]==-1)
59             {
60               continue;
61             }
62           for(k=1;k<=n;k++)
63            {//ij is the pair for which we compute
64             //skip k if we do not know the distances between it and i AND j
65
66              if(k==i || k==j)
67                {
68                   s[give_index(i,j,n)]++;
69                   continue;
70                }
71               if(D[give_index(i,k,n)]==-1 || D[give_index(j,k,n)]==-1)continue;
72               //Rprintf("%i\n",k);
73               s[give_index(i,j,n)]++;
74               R[give_index(i,j,n)]+=D[give_index(i,k,n)];
75               R[give_index(i,j,n)]+=D[give_index(j,k,n)];
76            }
77          }
78
79         /*for(i=1;i<n;i++)
80                   {
81                     for(j=i+1;j<=n;j++)
82                       {
83                         Rprintf("R[%i,%i]=%f ",i,j,R[give_index(i,j,n)]);
84                       }
85                     Rprintf("\n");
86                   }
87
88                 for(i=1;i<n;i++)
89                   {
90                     for(j=i+1;j<=n;j++)
91                       {
92                         Rprintf("s[%i,%i]=%i ",i,j,s[give_index(i,j,n)]);
93                       }
94                     Rprintf("\n");
95                   }*/
96
97         k=0;
98         int sw=1;//if 1 then incomplete
99         while (n > 3) {
100
101                 ij = 0;
102                 for(i=1;i<n;i++)
103                  for(j=i+1;j<=n;j++)
104                   {newR[give_index(i,j,n)]=0;
105                    newS[give_index(i,j,n)]=0;
106                   }
107                 smallest_S = -1e50;
108                 if(sw==0)
109                     for(i=1;i<=n;i++)
110                        {S[i]=0;
111                        }
112
113                 B=n-2;
114                 if(sw==1)
115                      {
116                       choosePair(D,n,R,s,&sw,&OTU1,&OTU2,fS);
117                      }
118                  else{ //Rprintf("distance matrix is now complete\n");
119                         for (i=1;i<=n;i++)
120                          for(j=1;j<=n;j++)
121                            {if(i==j)continue;
122                              //Rprintf("give_index(%i,%i)=%i\n",i,j,give_index(i,j,n));
123                              //Rprintf("D[%i,%i]=%f\n",i,j,D[give_index(i,j,n)]);
124                              S[i]+=D[give_index(i,j,n)];
125                            }
126                         B=n-2;
127                         //Rprintf("n=%i,B=%f",n,B);
128                         for (i = 1; i < n; i++) {
129                          for (j = i + 1; j <= n; j++) {
130                              //Rprintf("S[%i]=%f, S[%i]=%f, D[%i,%i]=%f, B=%f",i,S[i],j,S[j],i,j,D[give_index(i,j,n)],B);
131                                 A=S[i]+S[j]-B*D[give_index(i,j,n)];
132                                 //Rprintf("Q[%i,%i]=%f\n",i,j,A);
133                                 if (A > smallest_S) {
134                                         OTU1 = i;
135                                         OTU2 = j;
136                                         smallest_S = A;
137                                         smallest = ij;
138                                 }
139                                 ij++;
140                         }
141                         }
142                      }
143
144                 /*Rprintf("agglomerating %i and %i, Q=%f \n",OTU1,OTU2,smallest_S);
145
146                 for(i=1;i<n;i++)
147                   {
148                     for(j=i+1;j<=n;j++)
149                       {
150                         Rprintf("R[%i,%i]=%f ",i,j,R[give_index(i,j,n)]);
151                       }
152                     Rprintf("\n");
153                   }
154
155                 for(i=1;i<n;i++)
156                   {
157                     for(j=i+1;j<=n;j++)
158                       {
159                         Rprintf("s[%i,%i]=%i ",i,j,s[give_index(i,j,n)]);
160                       }
161                     Rprintf("\n");
162                   }
163
164                 for(i=1;i<n;i++)
165                   {
166                     for(j=i+1;j<=n;j++)
167                       {
168                         Rprintf("d[%i,%i]=%f ",i,j,D[give_index(i,j,n)]);
169                       }
170                     Rprintf("\n");
171                   }*/
172                 //update R and S, only if matrix still incomplete
173                 if(sw==1)
174                 for(i=1;i<n;i++)
175                 {if(i==OTU1 || i==OTU2)continue;
176                  for(j=i+1;j<=n;j++)
177                   {if(j==OTU1 || j==OTU2)continue;
178                     if(D[give_index(i,j,n)]==-1)continue;
179                      if(D[give_index(i,OTU1,n)]!=-1 && D[give_index(j,OTU1,n)]!=-1)
180                       {//OTU1 was considered for Rij, so now subtract
181                        R[give_index(i,j,n)]-=(D[give_index(i,OTU1,n)]+D[give_index(j,OTU1,n)]);
182                        s[give_index(i,j,n)]--;
183                       }
184                      if(D[give_index(i,OTU2,n)]!=-1 && D[give_index(j,OTU2,n)]!=-1)
185                       {//OTU2 was considered for Rij, so now subtract
186                        R[give_index(i,j,n)]-=(D[give_index(i,OTU2,n)]+D[give_index(j,OTU2,n)]);
187                        s[give_index(i,j,n)]--;
188                       }
189                   }
190                 }
191
192                 edge2[k] = otu_label[OTU1];
193                 edge2[k + 1] = otu_label[OTU2];
194                 edge1[k] = edge1[k + 1] = cur_nod;
195
196                 double sum=0;
197                 double lamb=0;//the parameter used for matrix reduction
198                 double lambSum=0;
199                 for(i=1;i<=n;i++)
200                 {if(i==OTU1 || i==OTU2)continue;
201                  if(D[give_index(OTU1,i,n)]==-1 || D[give_index(OTU2,i,n)]==-1)continue;
202                  sum+=(D[give_index(OTU1,i,n)]-D[give_index(OTU2,i,n)]);
203                  lambSum+=(v[give_index(OTU2,i,n)]-v[give_index(OTU1,i,n)]);
204                 }
205                 //if we stil have incomplete distances
206                 if(sw==1)
207                 {
208                  lamb=0.5+(1/(2*(s[give_index(OTU1,OTU2,n)]-2)*v[give_index(OTU2,OTU1,n)]))*lambSum;
209                 }else{
210                  lamb=0.5+(1/(2*(n-2)*v[give_index(OTU2,OTU1,n)]))*lambSum;
211                      }
212
213                 //although s was updated above, s[otu1,otu2] has remained unchanged
214                 //so it is safe to use it here
215                 //if complete distanes, use N-2, else use S
216                 int down=B;
217                 if(sw==1){down=s[give_index(OTU1,OTU2,n)]-2;}
218                 if(down==0)
219                   {error("distance information insufficient to construct a tree, leaves %i and %i isolated from tree",OTU1,OTU2);
220                   }
221                 //Rprintf("down=%f\n",B);
222                 sum*=(1.0/(2*(down)));
223                 //Rprintf("sum=%f\n",sum);
224                 double dxy=D[give_index(OTU1,OTU2,n)]/2;
225
226                 //Rprintf("R[%i,%i]:%f \n",OTU1,OTU2,sum);
227                 edge_length[k] = dxy+sum;//OTU1
228                 //Rprintf("l1:%f \n",edge_length[k]);
229                 edge_length[k + 1] = dxy-sum;//OTU2
230                 //Rprintf("l2:%f \n",edge_length[k+1]);
231                //no need to change distance matrix update for complete distance
232                //case, as pairs will automatically fall in the right cathegory
233
234                 //OTU1=x, OTU2=y from formulas
235                 A = D[give_index(OTU1,OTU2,n)];
236                 ij = 0;
237                 for (i = 1; i <= n; i++) {
238                         if (i == OTU1 || i == OTU2) continue;
239                         if(D[give_index(OTU1,i,n)]!=-1 && D[give_index(OTU2,i,n)]!=-1)
240                          {
241                             new_dist[ij]= lamb*(D[give_index(OTU1,i,n)]-edge_length[k])+(1-lamb)*(D[give_index(OTU2,i,n)]-edge_length[k+1]);
242                             new_v[ij]=lamb*v[give_index(OTU1,i,n)]+(1-lamb)*v[give_index(OTU2,i,n)]-lamb*(1-lamb)*v[give_index(OTU1,OTU2,n)];
243                          }else{
244                          if(D[give_index(OTU1,i,n)]!=-1)
245                                 {
246                                  new_dist[ij]=D[give_index(OTU1,i,n)]-edge_length[k];
247                                  new_v[ij]=v[give_index(OTU1,i,n)];
248                                 }else{
249                                       if(D[give_index(OTU2,i,n)]!=-1)
250                                         {
251                                             new_dist[ij]=D[give_index(OTU2,i,n)]-edge_length[k+1];
252                                             new_v[ij]=v[give_index(OTU2,i,n)];
253                                         }else{new_dist[ij]=-1;new_v[ij]=-1;}
254                                      }
255                               }
256
257                         ij++;
258                 }
259
260                 for (i = 1; i < n; i++) {
261                         if (i == OTU1 || i == OTU2) continue;
262                         for (j = i + 1; j <= n; j++) {
263                                 if (j == OTU1 || j == OTU2) continue;
264                                 new_dist[ij] = D[DINDEX(i, j)];
265                                 new_v[ij]=v[give_index(i,j,n)];
266                                 ij++;
267                         }
268                 }
269
270                 /*for(i=1;i<n-1;i++)
271                 {
272                   for(j=i+1;j<=n-1;j++)
273                    {Rprintf("%f ",new_dist[give_index(i,j,n-1)]);
274                    }
275                   Rprintf("\n");
276                 }*/
277                 //compute Rui, only if distance matrix is still incomplete
278                 ij=0;
279                 if(sw==1)
280                 for(i=2;i<n;i++)
281                   {
282                    ij++;
283                    if(new_dist[give_index(i,1,n-1)]==-1)continue;
284
285                    for(j=1;j<n;j++)
286                      {
287                        if(j==1 || j==i)
288                        {
289                          newS[give_index(1,i,n-1)]++;
290                          continue;
291                        }
292                        if(new_dist[give_index(i,j,n-1)]!=-1 && new_dist[give_index(1,j,n-1)]!=-1)
293                         {
294                           newS[give_index(1,i,n-1)]++;
295                           newR[give_index(1,i,n-1)]+=new_dist[give_index(i,j,n-1)];
296                           newR[give_index(1,i,n-1)]+=new_dist[give_index(1,j,n-1)];
297                         }
298                      }
299                   }
300                 //fill in the rest of R and S, again only if distance matrix still
301                 //incomplete
302                 if(sw==1)
303                 for(i=1;i<n;i++)
304                 {if(i==OTU1 || i==OTU2)continue;
305                  for(j=i+1;j<=n;j++)
306                   {if(j==OTU1 || j==OTU2)continue;
307                    newR[ij]=R[give_index(i,j,n)];
308                    newS[ij]=s[give_index(i,j,n)];
309                    ij++;
310                   }
311                 }
312                 //update newR and newS with the new taxa, again only if distance
313                 //matrix is still incomplete
314                 if(sw==1)
315                 for(i=2;i<n-1;i++)
316                 {if(new_dist[give_index(1,i,n-1)]==-1)continue;
317                  for(j=i+1;j<=n-1;j++)
318                   {if(new_dist[give_index(1,j,n-1)]==-1)continue;
319                    newR[give_index(i,j,n-1)]+=(new_dist[give_index(1,i,n-1)]+new_dist[give_index(1,j,n-1)]);
320                    newS[give_index(i,j,n-1)]++;
321                   }
322                 }
323                 /* compute the branch lengths */
324
325
326
327                 /* update before the next loop
328                    (we are sure that OTU1 < OTU2) */
329                 if (OTU1 != 1)
330                         for (i = OTU1; i > 1; i--)
331                                 otu_label[i] = otu_label[i - 1];
332                 if (OTU2 != n)
333                         for (i = OTU2; i < n; i++)
334                                 otu_label[i] = otu_label[i + 1];
335                 otu_label[1] = cur_nod;
336
337
338
339                 n--;
340                 for (i = 0; i < n*(n - 1)/2; i++)
341                   {
342                     D[i] = new_dist[i];
343                     v[i] = new_v[i];
344                     if(sw==1)
345                        {
346                         R[i] = newR[i];
347                         s[i] = newS[i];
348                        }
349                   }
350                 cur_nod--;
351                 k = k + 2;
352         }
353         int dK=0;//number of known distances in final distance matrix
354         int iUK=-1;//index of unkown distance, if we have one missing distance
355         int iK=-1;//index of only known distance, only needed if dK==1
356         for (i = 0; i < 3; i++) {
357                 edge1[*N*2 - 4 - i] = cur_nod;
358                 edge2[*N*2 - 4 - i] = otu_label[i + 1];
359                 if(D[i]!=-1){dK++;iK=i;}else{iUK=i;}
360         }
361         if(dK==2)
362          {//if two distances are known: assume our leaves are x,y,z, d(x,z) unknown
363           //and edge weights of three edges are a,b,c, then any b,c>0 that
364           //satisfy c-b=d(y,z)-d(x,y) a+c=d(y,z) are good edge weights, but for
365           //simplicity we assume a=c if d(yz)<d(xy) a=b otherwise, and after some
366           //algebra we get that we can set the missing distance equal to the
367           //maximum of the already present distances
368             double max=-1e50;
369           for(i=0;i<3;i++)
370             {if(i==iUK)continue;
371              if(D[i]>max)max=D[i];
372             }
373           D[iUK]=max;
374          }
375         if(dK==1)
376          {//through similar motivation as above, if we have just one known distance
377           //we set the other two distances equal to it
378           for(i=0;i<3;i++)
379             {if(i==iK)continue;
380              D[i]=D[iK];
381             }
382          }
383         if(dK==0)
384          {//no distances are known, we just set them to 1
385           for(i=0;i<3;i++)
386            {D[i]=1;
387            }
388          }
389         edge_length[*N*2 - 4] = (D[0] + D[1] - D[2])/2;
390         edge_length[*N*2 - 5] = (D[0] + D[2] - D[1])/2;
391         edge_length[*N*2 - 6] = (D[2] + D[1] - D[0])/2;
392 }
393
394