3 /* Copyright 2011-2012 Andrei-Alin Popescu */
5 /* This file is part of the R-package `ape'. */
6 /* See the file ../COPYING for licensing issues. */
12 if (t >= 0 - 1e-10) return 1;
16 void choosePair(double* D,int n,double* R,int* s, int* sw, int* x, int* y,int fS)
29 for (i = 1; i < n; i++)
31 for (j = i + 1; j <= n; j++)
32 {if(D[give_index(i,j,n)]==-1){sww=1;continue;}
33 if(s[give_index(i,j,n)]<=2)continue;
34 //Rprintf("R[%i,%i]=%f\n",i,j,R[give_index(i,j,n)]);
35 //Rprintf("s[%i,%i]=%i\n",i,j,s[give_index(i,j,n)]);
36 //Rprintf("D[%i,%i]=%f\n",i,j,D[give_index(i,j,n)]);
38 double numb=((R[give_index(i,j,n)])/(s[give_index(i,j,n)]-2))-D[give_index(i,j,n)];
39 //Rprintf("numb(%i,%i)=%f\n",i,j,numb);
40 for(k=0;k<fS, cFS[k]>numb;k++);
42 for(tr=fS-1;tr>k;tr--)
55 //no missing distances, just return the one with maximal Q-criterion
56 if(sww==0){*x=iFS[0];*y=jFS[0];*sw=0;return;
59 {Rprintf("value[%i]=%f ",k,cFS[k]);
60 Rprintf("i=%i ",iFS[k]);
61 Rprintf("j=%i ",jFS[k]);
67 if(iFS[i]==0 || jFS[i]==0)continue;
68 double nb=nxy(iFS[i],jFS[i],n,D);
72 /*Rprintf("all N*(x,y)\n");
74 {Rprintf("value[%i]=%f ",k,cFS[k]);
75 Rprintf("i=%i ",iFS[k]);
76 Rprintf("j=%i ",jFS[k]);
80 //shift the max N*xy to the front of the array
91 //if just one pair realises max N*xy, return it
92 if(dk==1){*x=iFS[0];*y=jFS[0];return;}
94 /*Rprintf("max n*xy values\n");
96 {Rprintf("value[%i]=%f ",k,cFS[k]);
97 Rprintf("i=%i ",iFS[k]);
98 Rprintf("j=%i ",jFS[k]);
102 //on the front of the array containing max N*xy values compute cxy
105 if(iFS[i]==0 || jFS[i]==0)continue;
106 double nb=cxy(iFS[i],jFS[i],n,D);
110 /*Rprintf("all c*xy values\n");
112 {Rprintf("value[%i]=%f ",k,cFS[k]);
113 Rprintf("i=%i ",iFS[k]);
114 Rprintf("j=%i ",jFS[k]);
117 //and again shift maximal C*xy values at the fron of the array
129 //if just one C*xy with maximal value, return pair realising it
130 if(dk==1){*x=iFS[0];*y=jFS[0];return;}
132 /*Rprintf("max c*xy values\n");
134 {Rprintf("value[%i]=%f ",k,cFS[k]);
135 Rprintf("i=%i ",iFS[k]);
136 Rprintf("j=%i ",jFS[k]);
140 //on the front of the array containing max C*xy compute m*xy
143 if(iFS[i]==0 || jFS[i]==0)continue;
144 double nb=mxy(iFS[i],jFS[i],n,D);
148 /*Rprintf("all m*xy values\n");
150 {Rprintf("value[%i]=%f ",k,cFS[k]);
151 Rprintf("i=%i ",iFS[k]);
152 Rprintf("j=%i ",jFS[k]);
155 //again shift maximal m*xy values to the fron of the array
167 //if just one maximal value for m*xy return the pair realising it, found at 0
168 if(dk==1){*x=iFS[0];*y=jFS[0];return;}
170 /*Rprintf("max m*xy values\n");
172 {Rprintf("value[%i]=%f ",k,cFS[k]);
173 Rprintf("i=%i ",iFS[k]);
174 Rprintf("j=%i ",jFS[k]);
177 //and calculate cnxy on these values, but this time we do not shift, but simply
178 //return the pair realising the maximum, stored at iPos
183 if(iFS[i]==0 || jFS[i]==0)continue;
184 double nb=cnxy(iFS[i],jFS[i],n,D);
185 if(nb>max){max=nb;iPos=i;}
188 /*Rprintf("cN*xy\n");
189 Rprintf("value[%i]=%f ",0,cFS[0]);
190 Rprintf("i=%i ",iFS[0]);
191 Rprintf("j=%i ",jFS[0]);
193 if(iFS[iPos]==0 || jFS[iPos]==0)
195 error("distance information insufficient to construct a tree, cannot calculate agglomeration criterion");
197 *x=iFS[iPos];*y=jFS[iPos];
200 double cnxy(int x, int y, int n,double* D)
205 //Rprintf("cN[%i,%i]\n",x,y);
210 if((i==x && j==y) || (j==x && i==y))continue;
213 if(i!=x)n1=D[give_index(i,x,n)];
214 if(j!=y)n2=D[give_index(j,y,n)];
215 if(n1==-1 || n2==-1 || D[give_index(i,j,n)]==-1)continue;
216 nMeanXY+=(n1+n2-D[give_index(x,y,n)]-D[give_index(i,j,n)]);
217 //Rprintf("cnMeanXY after (%i,%i)=%f\n",i,j,nMeanXY);
223 int mxy(int x,int y,int n,double* D)
234 if(i!=x && D[give_index(x,i,n)]==-1)
238 if(i!=y && D[give_index(y,i,n)]==-1)
245 Rprintf("mx[%i]=%i ",i,mx[i]);
251 Rprintf("my[%i]=%i ",i,my[i]);
259 if(i!=x && mx[i]==1 && my[i]==0)
263 if(i!=y && my[i]==1 && mx[i]==0)
268 //Rprintf("xmy=%i, ymx=%i, xmy+ymx=%i\n",xmy,ymx,xmy+ymx);
271 double nxy(int x, int y, int n,double* D)
277 //Rprintf("N[%i,%i]\n",x,y);
282 if((i==x && j==y) || (j==x && i==y))continue;
285 if(i!=x)n1=D[give_index(i,x,n)];
286 if(j!=y)n2=D[give_index(j,y,n)];
287 if(n1==-1 || n2==-1 || D[give_index(i,j,n)]==-1)continue;
289 //Rprintf("considered pair (%i,%i)\n",i,j);
290 nMeanXY+=H(n1+n2-D[give_index(x,y,n)]-D[give_index(i,j,n)]);
291 //Rprintf("nMeanXY after (%i,%i)=%f\n",i,j,nMeanXY);
294 //Rprintf("sCXY=%i",sCXY);
295 if(sCXY==0) return 0;
299 int cxy(int x, int y, int n,double* D)
309 if((i==x && j==y) || (j==x && i==y))continue;
312 if(i!=x)n1=D[give_index(i,x,n)];
313 if(j!=y)n2=D[give_index(j,y,n)];
314 if(n1==-1 || n2==-1 || D[give_index(i,j,n)]==-1)continue;
321 void njs(double *D, int *N, int *edge1, int *edge2, double *edge_length,int *fsS)
322 { //assume missing values are denoted by -1
323 double *S,*R, Sdist, Ndist, *new_dist, A, B, smallest_S, x, y;
324 int n, i, j, k, ij, smallest, OTU1, OTU2, cur_nod, o_l, *otu_label;
325 /*for(i=0;i<n*(n-1)/2;i++)
326 {if(isNA(D[i])){D[i]=-1;}
328 int *s;//s contains |Sxy|, which is all we need for agglomeration
339 R = (double*)R_alloc(n*(n - 1)/2, sizeof(double));
340 S = (double*)R_alloc(n + 1, sizeof(double));
341 newR = (double*)R_alloc(n*(n - 1)/2, sizeof(double));
342 new_dist = (double*)R_alloc(n*(n - 1)/2, sizeof(double));
343 otu_label = (int*)R_alloc(n + 1, sizeof(int));
344 s = (int*)R_alloc(n*(n - 1)/2, sizeof(int));
345 newS = (int*)R_alloc(n*(n - 1)/2, sizeof(int));
347 for (i = 1; i <= n; i++) otu_label[i] = i; /* otu_label[0] is not used */
351 //compute Sxy and Rxy
352 for(i=0;i<n*(n-1)/2;i++)
361 {//algorithm assumes i,j /in Sij, so skip pair if it is not known
362 if(D[give_index(i,j,n)]==-1)
366 //Rprintf("for %i and %i :\n",i,j);
368 {//ij is the pair for which we compute
369 //skip k if we do not know the distances between it and i AND j
372 s[give_index(i,j,n)]++;
373 if(i!=k)R[give_index(i,j,n)]+=D[give_index(i,k,n)];
374 if(j!=k)R[give_index(i,j,n)]+=D[give_index(j,k,n)];
377 if(D[give_index(i,k,n)]==-1 || D[give_index(j,k,n)]==-1)continue;
379 s[give_index(i,j,n)]++;
380 R[give_index(i,j,n)]+=D[give_index(i,k,n)];
381 R[give_index(i,j,n)]+=D[give_index(j,k,n)];
389 Rprintf("R[%i,%i]=%f ",i,j,R[give_index(i,j,n)]);
398 Rprintf("s[%i,%i]=%i ",i,j,s[give_index(i,j,n)]);
404 int sw=1;//if 1 then incomplete
409 {newR[give_index(i,j,n)]=0;
410 newS[give_index(i,j,n)]=0;
420 choosePair(D,n,R,s,&sw,&OTU1,&OTU2,fS);
422 else{ //Rprintf("distance matrix is now complete\n");
426 //Rprintf("give_index(%i,%i)=%i\n",i,j,give_index(i,j,n));
427 //Rprintf("D[%i,%i]=%f\n",i,j,D[give_index(i,j,n)]);
428 S[i]+=D[give_index(i,j,n)];
431 //Rprintf("n=%i,B=%f",n,B);
432 for (i = 1; i < n; i++) {
433 for (j = i + 1; j <= n; j++) {
434 //Rprintf("S[%i]=%f, S[%i]=%f, D[%i,%i]=%f, B=%f",i,S[i],j,S[j],i,j,D[give_index(i,j,n)],B);
435 A=S[i]+S[j]-B*D[give_index(i,j,n)];
436 //Rprintf("Q[%i,%i]=%f\n",i,j,A);
437 if (A > smallest_S) {
448 /*Rprintf("agglomerating %i and %i, Q=%f \n",OTU1,OTU2,smallest_S);
454 Rprintf("R[%i,%i]=%f ",i,j,R[give_index(i,j,n)]);
463 Rprintf("s[%i,%i]=%i ",i,j,s[give_index(i,j,n)]);
472 Rprintf("d[%i,%i]=%f ",i,j,D[give_index(i,j,n)]);
476 //update Rxy and Sxy, only if matrix still incomplete
479 {if(i==OTU1 || i==OTU2)continue;
481 {if(j==OTU1 || j==OTU2)continue;
482 if(D[give_index(i,j,n)]==-1)continue;
483 if(D[give_index(i,OTU1,n)]!=-1 && D[give_index(j,OTU1,n)]!=-1)
484 {//OTU1 was considered for Rij, so now subtract
485 R[give_index(i,j,n)]-=(D[give_index(i,OTU1,n)]+D[give_index(j,OTU1,n)]);
486 s[give_index(i,j,n)]--;
488 if(D[give_index(i,OTU2,n)]!=-1 && D[give_index(j,OTU2,n)]!=-1)
489 {//OTU2 was considered for Rij, so now subtract
490 R[give_index(i,j,n)]-=(D[give_index(i,OTU2,n)]+D[give_index(j,OTU2,n)]);
491 s[give_index(i,j,n)]--;
496 edge2[k] = otu_label[OTU1];
497 edge2[k + 1] = otu_label[OTU2];
498 edge1[k] = edge1[k + 1] = cur_nod;
500 /* get the distances between all OTUs but the 2 selected ones
502 a) get the sum for both
503 b) compute the distances for the new OTU */
508 {if(i==OTU1 || i==OTU2)continue;
509 if(D[give_index(OTU1,i,n)]==-1 || D[give_index(OTU2,i,n)]==-1)continue;
510 sum+=(D[give_index(OTU1,i,n)]-D[give_index(OTU2,i,n)]);
512 //although s was updated above, s[otu1,otu2] has remained unchanged
513 //so it is safe to use it here
514 //if complete distanes, use N-2, else use S
516 if(sw==1){down=s[give_index(OTU1,OTU2,n)]-2;}
518 {error("distance information insufficient to construct a tree, leaves %i and %i isolated from tree",OTU1,OTU2);
520 //Rprintf("down=%i\n",down);
521 sum*=(1.0/(2*(down)));
522 //Rprintf("sum=%f\n",sum);
523 double dxy=D[give_index(OTU1,OTU2,n)]/2;
525 //Rprintf("R[%i,%i]:%f \n",OTU1,OTU2,sum);
526 edge_length[k] = dxy+sum;//OTU1
527 //Rprintf("l1:%f \n",edge_length[k]);
528 edge_length[k + 1] = dxy-sum;//OTU2
529 //Rprintf("l2:%f \n",edge_length[k+1]);
530 //no need to change distance matrix update for complete distance
531 //case, as pairs will automatically fall in the right cathegory
532 A = D[give_index(OTU1,OTU2,n)];
534 for (i = 1; i <= n; i++) {
535 if (i == OTU1 || i == OTU2) continue;
536 if(D[give_index(OTU1,i,n)]!=-1 && D[give_index(OTU2,i,n)]!=-1)
538 new_dist[ij]=0.5*(D[give_index(OTU1,i,n)]-edge_length[k]+D[give_index(OTU2,i,n)]-edge_length[k+1]);
540 if(D[give_index(OTU1,i,n)]!=-1)
542 new_dist[ij]=D[give_index(OTU1,i,n)]-edge_length[k];
544 if(D[give_index(OTU2,i,n)]!=-1)
546 new_dist[ij]=D[give_index(OTU2,i,n)]-edge_length[k+1];
547 }else{new_dist[ij]=-1;}
554 for (i = 1; i < n; i++) {
555 if (i == OTU1 || i == OTU2) continue;
556 for (j = i + 1; j <= n; j++) {
557 if (j == OTU1 || j == OTU2) continue;
558 new_dist[ij] = D[DINDEX(i, j)];
565 for(j=i+1;j<=n-1;j++)
566 {Rprintf("%f ",new_dist[give_index(i,j,n-1)]);
570 //compute Rui, only if distance matrix is still incomplete
576 if(new_dist[give_index(i,1,n-1)]==-1)continue;
581 if(i!=j)newR[give_index(1,i,n-1)]+=new_dist[give_index(i,j,n-1)];
582 if(j!=1)newR[give_index(1,i,n-1)]+=new_dist[give_index(1,j,n-1)];
583 newS[give_index(1,i,n-1)]++;
586 if(new_dist[give_index(i,j,n-1)]!=-1 && new_dist[give_index(1,j,n-1)]!=-1)
588 newS[give_index(1,i,n-1)]++;
589 newR[give_index(1,i,n-1)]+=new_dist[give_index(i,j,n-1)];
590 newR[give_index(1,i,n-1)]+=new_dist[give_index(1,j,n-1)];
594 //fill in the rest of R and S, again only if distance matrix still
596 if(sw==1) /* added 2012-04-02 */
598 {if(i==OTU1 || i==OTU2)continue;
600 {if(j==OTU1 || j==OTU2)continue;
601 newR[ij]=R[give_index(i,j,n)];
602 newS[ij]=s[give_index(i,j,n)];
606 //update newR and newS with the new taxa, again only if distance
607 //matrix is still incomplete
610 {if(new_dist[give_index(1,i,n-1)]==-1)continue;
611 for(j=i+1;j<=n-1;j++)
612 {if(new_dist[give_index(1,j,n-1)]==-1)continue;
613 if(new_dist[give_index(i,j,n-1)]==-1)continue;
614 newR[give_index(i,j,n-1)]+=(new_dist[give_index(1,i,n-1)]+new_dist[give_index(1,j,n-1)]);
615 newS[give_index(i,j,n-1)]++;
619 /* update before the next loop
620 (we are sure that OTU1 < OTU2) */
622 for (i = OTU1; i > 1; i--)
623 otu_label[i] = otu_label[i - 1];
625 for (i = OTU2; i < n; i++)
626 otu_label[i] = otu_label[i + 1];
627 otu_label[1] = cur_nod;
630 for (i = 0; i < n*(n - 1)/2; i++)
642 int dK=0;//number of known distances in final distance matrix
643 int iUK=-1;//index of unkown distance, if we have one missing distance
644 int iK=-1;//index of only known distance, only needed if dK==1
645 for (i = 0; i < 3; i++) {
646 edge1[*N*2 - 4 - i] = cur_nod;
647 edge2[*N*2 - 4 - i] = otu_label[i + 1];
648 if(D[i]!=-1){dK++;iK=i;}else{iUK=i;}
651 {//if two distances are known: assume our leaves are x,y,z, d(x,z) unknown
652 //and edge weights of three edges are a,b,c, then any b,c>0 that
653 //satisfy c-b=d(y,z)-d(x,y) a+c=d(y,z) are good edge weights, but for
654 //simplicity we assume a=c if d(yz)<d(xy) a=b otherwise, and after some
655 //algebra we get that we can set the missing distance equal to the
656 //maximum of the already present distances
660 if(D[i]>max)max=D[i];
665 {//through similar motivation as above, if we have just one known distance
666 //we set the other two distances equal to it
673 {//no distances are known, we just set them to 1
678 edge_length[*N*2 - 4] = (D[0] + D[1] - D[2])/2;
679 edge_length[*N*2 - 5] = (D[0] + D[2] - D[1])/2;
680 edge_length[*N*2 - 6] = (D[2] + D[1] - D[0])/2;