3 /* Copyright 2011 Andrei-Alin Popescu */
5 /* This file is part of the R-package `ape'. */
6 /* See the file ../COPYING for licensing issues. */
16 void choosePair(double* D,int n,double* R,int* s, int* sw, int* x, int* y,int fS)
29 for (i = 1; i < n; i++)
31 for (j = i + 1; j <= n; j++)
32 {if(D[give_index(i,j,n)]==-1){sww=1;continue;}
33 //Rprintf("R[%i,%i]=%f\n",i,j,R[give_index(i,j,n)]);
34 //Rprintf("s[%i,%i]=%i\n",i,j,s[give_index(i,j,n)]);
35 //Rprintf("D[%i,%i]=%f\n",i,j,D[give_index(i,j,n)]);
37 double numb=((R[give_index(i,j,n)])/(s[give_index(i,j,n)]-2))-D[give_index(i,j,n)];
38 //Rprintf("numb(%i,%i)=%f\n",i,j,numb);
39 for(k=0;k<fS, cFS[k]>numb;k++);
41 for(tr=fS-1;tr>k;tr--)
54 //no missing distances, just return the one with maximal Q-criterion
55 if(sww==0){*x=iFS[0];*y=jFS[0];*sw=0;return;
58 {Rprintf("value[%i]=%f ",k,cFS[k]);
59 Rprintf("i=%i ",iFS[k]);
60 Rprintf("j=%i ",jFS[k]);
66 double nb=nxy(iFS[i],jFS[i],n,D);
70 /*Rprintf("all N*(x,y)\n");
72 {Rprintf("value[%i]=%f ",k,cFS[k]);
73 Rprintf("i=%i ",iFS[k]);
74 Rprintf("j=%i ",jFS[k]);
78 //shift the max N*xy to the front of the array
89 //if just one pair realises max N*xy, return it
90 if(dk==1){*x=iFS[0];*y=jFS[0];return;}
92 /*Rprintf("max n*xy values\n");
94 {Rprintf("value[%i]=%f ",k,cFS[k]);
95 Rprintf("i=%i ",iFS[k]);
96 Rprintf("j=%i ",jFS[k]);
100 //on the fron of the array containing max N*xy values compute cxy
103 double nb=cxy(iFS[i],jFS[i],n,D);
107 /*Rprintf("all c*xy values\n");
109 {Rprintf("value[%i]=%f ",k,cFS[k]);
110 Rprintf("i=%i ",iFS[k]);
111 Rprintf("j=%i ",jFS[k]);
114 //and again shift maximal C*xy values at the fron of the array
126 //if just one C*xy with maximal value, return pair realising it
127 if(dk==1){*x=iFS[0];*y=jFS[0];return;}
129 /*Rprintf("max c*xy values\n");
131 {Rprintf("value[%i]=%f ",k,cFS[k]);
132 Rprintf("i=%i ",iFS[k]);
133 Rprintf("j=%i ",jFS[k]);
137 //on the front of the array containing max C*xy compute m*xy
140 double nb=mxy(iFS[i],jFS[i],n,D);
144 /*Rprintf("all m*xy values\n");
146 {Rprintf("value[%i]=%f ",k,cFS[k]);
147 Rprintf("i=%i ",iFS[k]);
148 Rprintf("j=%i ",jFS[k]);
151 //again shift maximal m*xy values to the fron of the array
163 //if just one maximal value for m*xy return the pair realising it, found at 0
164 if(dk==1){*x=iFS[0];*y=jFS[0];return;}
166 /*Rprintf("max m*xy values\n");
168 {Rprintf("value[%i]=%f ",k,cFS[k]);
169 Rprintf("i=%i ",iFS[k]);
170 Rprintf("j=%i ",jFS[k]);
173 //and calculate cnxy on these values, but this time we do not shift, but simply
174 //return the pair realising the maximum, stored at iPos
179 double nb=cnxy(iFS[i],jFS[i],n,D);
180 if(nb>max){max=nb;iPos=i;}
183 /*Rprintf("cN*xy\n");
184 Rprintf("value[%i]=%f ",0,cFS[0]);
185 Rprintf("i=%i ",iFS[0]);
186 Rprintf("j=%i ",jFS[0]);
188 *x=iFS[iPos];*y=jFS[iPos];
191 double cnxy(int x, int y, int n,double* D)
196 //Rprintf("cN[%i,%i]\n",x,y);
198 {if(i==x || i==y)continue;
201 if(j==y || j==x)continue;
204 n1=D[give_index(i,x,n)];
205 n2=D[give_index(j,y,n)];
206 if(n1==-1 || n2==-1 || D[give_index(i,j,n)]==-1)continue;
207 nMeanXY+=(n1+n2-D[give_index(x,y,n)]-D[give_index(i,j,n)]);
208 //Rprintf("cnMeanXY after (%i,%i)=%f\n",i,j,nMeanXY);
214 int mxy(int x,int y,int n,double* D)
225 if(i!=x && D[give_index(x,i,n)]==-1)
229 if(i!=y && D[give_index(y,i,n)]==-1)
236 Rprintf("mx[%i]=%i ",i,mx[i]);
242 Rprintf("my[%i]=%i ",i,my[i]);
250 if(mx[i]==1 && my[i]==0)
254 if(my[i]==1 && mx[i]==0)
259 //Rprintf("xmy=%i, ymx=%i, xmy+ymx=%i\n",xmy,ymx,xmy+ymx);
262 double nxy(int x, int y, int n,double* D)
268 //Rprintf("N[%i,%i]\n",x,y);
270 {if(i==x || i==y)continue;
273 if(j==x || j==y)continue;
276 n1=D[give_index(i,x,n)];
277 n2=D[give_index(j,y,n)];
278 if(n1==-1 || n2==-1 || D[give_index(i,j,n)]==-1)continue;
280 //Rprintf("considered pair (%i,%i)\n",i,j);
281 nMeanXY+=H(n1+n2-D[give_index(x,y,n)]-D[give_index(i,j,n)]);
282 //Rprintf("nMeanXY after (%i,%i)=%f\n",i,j,nMeanXY);
285 //Rprintf("sCXY=%i",sCXY);
289 int cxy(int x, int y, int n,double* D)
296 {if(i==x || i==y)continue;
299 if(j==x || j==y)continue;
302 if(i!=x)n1=D[give_index(i,x,n)];
303 if(j!=y)n2=D[give_index(j,y,n)];
304 if(n1==-1 || n2==-1 || D[give_index(i,j,n)]==-1)continue;
311 void njs(double *D, int *N, int *edge1, int *edge2, double *edge_length,int *fsS)
312 { //assume missing values are denoted by -1
313 double *S,*R, Sdist, Ndist, *new_dist, A, B, smallest_S, x, y;
314 int n, i, j, k, ij, smallest, OTU1, OTU2, cur_nod, o_l, *otu_label;
315 /*for(i=0;i<n*(n-1)/2;i++)
316 {if(isNA(D[i])){D[i]=-1;}
318 int *s;//s contains |Sxy|, which is all we need for agglomeration
329 R = (double*)R_alloc(n*(n - 1)/2, sizeof(double));
330 S = (double*)R_alloc(n + 1, sizeof(double));
331 newR = (double*)R_alloc(n*(n - 1)/2, sizeof(double));
332 new_dist = (double*)R_alloc(n*(n - 1)/2, sizeof(double));
333 otu_label = (int*)R_alloc(n + 1, sizeof(int));
334 s = (int*)R_alloc(n*(n - 1)/2, sizeof(int));
335 newS = (int*)R_alloc(n*(n - 1)/2, sizeof(int));
337 for (i = 1; i <= n; i++) otu_label[i] = i; /* otu_label[0] is not used */
341 //compute Sxy and Rxy
342 for(i=0;i<n*(n-1)/2;i++)
351 {//algorithm assumes i,j /in Sij, so skip pair if it is not known
352 if(D[give_index(i,j,n)]==-1)
356 //Rprintf("for %i and %i :\n",i,j);
358 {//ij is the pair for which we compute
359 //skip k if we do not know the distances between it and i AND j
362 s[give_index(i,j,n)]++;
365 if(D[give_index(i,k,n)]==-1 || D[give_index(j,k,n)]==-1)continue;
367 s[give_index(i,j,n)]++;
368 R[give_index(i,j,n)]+=D[give_index(i,k,n)];
369 R[give_index(i,j,n)]+=D[give_index(j,k,n)];
377 Rprintf("R[%i,%i]=%f ",i,j,R[give_index(i,j,n)]);
386 Rprintf("s[%i,%i]=%i ",i,j,s[give_index(i,j,n)]);
392 int sw=1;//if 1 then incomplete
397 {newR[give_index(i,j,n)]=0;
398 newS[give_index(i,j,n)]=0;
408 choosePair(D,n,R,s,&sw,&OTU1,&OTU2,fS);
410 else{ //Rprintf("distance matrix is now complete\n");
414 //Rprintf("give_index(%i,%i)=%i\n",i,j,give_index(i,j,n));
415 //Rprintf("D[%i,%i]=%f\n",i,j,D[give_index(i,j,n)]);
416 S[i]+=D[give_index(i,j,n)];
419 //Rprintf("n=%i,B=%f",n,B);
420 for (i = 1; i < n; i++) {
421 for (j = i + 1; j <= n; j++) {
422 //Rprintf("S[%i]=%f, S[%i]=%f, D[%i,%i]=%f, B=%f",i,S[i],j,S[j],i,j,D[give_index(i,j,n)],B);
423 A=S[i]+S[j]-B*D[give_index(i,j,n)];
424 //Rprintf("Q[%i,%i]=%f\n",i,j,A);
425 if (A > smallest_S) {
436 /*Rprintf("agglomerating %i and %i, Q=%f \n",OTU1,OTU2,smallest_S);
442 Rprintf("R[%i,%i]=%f ",i,j,R[give_index(i,j,n)]);
451 Rprintf("s[%i,%i]=%i ",i,j,s[give_index(i,j,n)]);
460 Rprintf("d[%i,%i]=%f ",i,j,D[give_index(i,j,n)]);
464 //update Rxy and Sxy, only if matrix still incomplete
467 {if(i==OTU1 || i==OTU2)continue;
469 {if(j==OTU1 || j==OTU2)continue;
470 if(D[give_index(i,j,n)]==-1)continue;
471 if(D[give_index(i,OTU1,n)]!=-1 && D[give_index(j,OTU1,n)]!=-1)
472 {//OTU1 was considered for Rij, so now subtract
473 R[give_index(i,j,n)]-=(D[give_index(i,OTU1,n)]+D[give_index(j,OTU1,n)]);
474 s[give_index(i,j,n)]--;
476 if(D[give_index(i,OTU2,n)]!=-1 && D[give_index(j,OTU2,n)]!=-1)
477 {//OTU2 was considered for Rij, so now subtract
478 R[give_index(i,j,n)]-=(D[give_index(i,OTU2,n)]+D[give_index(j,OTU2,n)]);
479 s[give_index(i,j,n)]--;
484 edge2[k] = otu_label[OTU1];
485 edge2[k + 1] = otu_label[OTU2];
486 edge1[k] = edge1[k + 1] = cur_nod;
488 /* get the distances between all OTUs but the 2 selected ones
490 a) get the sum for both
491 b) compute the distances for the new OTU */
496 {if(i==OTU1 || i==OTU2)continue;
497 if(D[give_index(OTU1,i,n)]==-1 || D[give_index(OTU2,i,n)]==-1)continue;
498 sum+=(D[give_index(OTU1,i,n)]-D[give_index(OTU2,i,n)]);
500 //although s was updated above, s[otu1,otu2] has remained unchanged
501 //so it is safe to use it here
502 //if complete distanes, use N-2, else use S
504 if(sw==1){down=s[give_index(OTU1,OTU2,n)]-2;}
506 {error("distance information insufficient to construct a tree, leaves %i and %i isolated from tree",OTU1,OTU2);
508 //Rprintf("down=%i\n",down);
509 sum*=(1.0/(2*(down)));
510 //Rprintf("sum=%f\n",sum);
511 double dxy=D[give_index(OTU1,OTU2,n)]/2;
513 //Rprintf("R[%i,%i]:%f \n",OTU1,OTU2,sum);
514 edge_length[k] = dxy+sum;//OTU1
515 //Rprintf("l1:%f \n",edge_length[k]);
516 edge_length[k + 1] = dxy-sum;//OTU2
517 //Rprintf("l2:%f \n",edge_length[k+1]);
518 //no need to change distance matrix update for complete distance
519 //case, as pairs will automatically fall in the right cathegory
520 A = D[give_index(OTU1,OTU2,n)];
522 for (i = 1; i <= n; i++) {
523 if (i == OTU1 || i == OTU2) continue;
524 if(D[give_index(OTU1,i,n)]!=-1 && D[give_index(OTU2,i,n)]!=-1)
526 new_dist[ij]=0.5*(D[give_index(OTU1,i,n)]-edge_length[k]+D[give_index(OTU2,i,n)]-edge_length[k+1]);
528 if(D[give_index(OTU1,i,n)]!=-1)
530 new_dist[ij]=D[give_index(OTU1,i,n)]-edge_length[k];
532 if(D[give_index(OTU2,i,n)]!=-1)
534 new_dist[ij]=D[give_index(OTU2,i,n)]-edge_length[k+1];
535 }else{new_dist[ij]=-1;}
542 for (i = 1; i < n; i++) {
543 if (i == OTU1 || i == OTU2) continue;
544 for (j = i + 1; j <= n; j++) {
545 if (j == OTU1 || j == OTU2) continue;
546 new_dist[ij] = D[DINDEX(i, j)];
553 for(j=i+1;j<=n-1;j++)
554 {Rprintf("%f ",new_dist[give_index(i,j,n-1)]);
558 //compute Rui, only if distance matrix is still incomplete
564 if(new_dist[give_index(i,1,n-1)]==-1)continue;
569 newS[give_index(1,i,n-1)]++;
572 if(new_dist[give_index(i,j,n-1)]!=-1 && new_dist[give_index(1,j,n-1)]!=-1)
574 newS[give_index(1,i,n-1)]++;
575 newR[give_index(1,i,n-1)]+=new_dist[give_index(i,j,n-1)];
576 newR[give_index(1,i,n-1)]+=new_dist[give_index(1,j,n-1)];
580 //fill in the rest of R and S, again only if distance matrix still
583 {if(i==OTU1 || i==OTU2)continue;
585 {if(j==OTU1 || j==OTU2)continue;
586 newR[ij]=R[give_index(i,j,n)];
587 newS[ij]=s[give_index(i,j,n)];
591 //update newR and newS with the new taxa, again only if distance
592 //matrix is still incomplete
595 {if(new_dist[give_index(1,i,n-1)]==-1)continue;
596 for(j=i+1;j<=n-1;j++)
597 {if(new_dist[give_index(1,j,n-1)]==-1)continue;
598 if(new_dist[give_index(i,j,n-1)]==-1)continue;
599 newR[give_index(i,j,n-1)]+=(new_dist[give_index(1,i,n-1)]+new_dist[give_index(1,j,n-1)]);
600 newS[give_index(i,j,n-1)]++;
604 /* update before the next loop
605 (we are sure that OTU1 < OTU2) */
607 for (i = OTU1; i > 1; i--)
608 otu_label[i] = otu_label[i - 1];
610 for (i = OTU2; i < n; i++)
611 otu_label[i] = otu_label[i + 1];
612 otu_label[1] = cur_nod;
615 for (i = 0; i < n*(n - 1)/2; i++)
627 int dK=0;//number of known distances in final distance matrix
628 int iUK=-1;//index of unkown distance, if we have one missing distance
629 int iK=-1;//index of only known distance, only needed if dK==1
630 for (i = 0; i < 3; i++) {
631 edge1[*N*2 - 4 - i] = cur_nod;
632 edge2[*N*2 - 4 - i] = otu_label[i + 1];
633 if(D[i]!=-1){dK++;iK=i;}else{iUK=i;}
636 {//if two distances are known: assume our leaves are x,y,z, d(x,z) unknown
637 //and edge weights of three edges are a,b,c, then any b,c>0 that
638 //satisfy c-b=d(y,z)-d(x,y) a+c=d(y,z) are good edge weights, but for
639 //simplicity we assume a=c if d(yz)<d(xy) a=b otherwise, and after some
640 //algebra we get that we can set the missing distance equal to the
641 //maximum of the already present distances
645 if(D[i]>max)max=D[i];
650 {//through similar motivation as above, if we have just one known distance
651 //we set the other two distances equal to it
658 {//no distances are known, we just set them to 1
663 edge_length[*N*2 - 4] = (D[0] + D[1] - D[2])/2;
664 edge_length[*N*2 - 5] = (D[0] + D[2] - D[1])/2;
665 edge_length[*N*2 - 6] = (D[2] + D[1] - D[0])/2;