2 ML parameter estimation for models with Gamma-distributed rates
\r
3 over sites, combined with tree topology estimation from DNA sequences
\r
5 Copyright, Ziheng YANG, July 1992 onwards
\r
6 cc -o basemlg -fast basemlg.c tools.o -lm
\r
7 basemlg <ControlFileName>
\r
15 #define NBRANCH (NS*2-2)
\r
16 #define NNODE (NS*2-1)
\r
21 #define NP (NBRANCH+NGENE+5)
\r
23 extern char BASEs[];
\r
24 extern int noisy, NFunCall, *ancestor;
\r
25 extern double *SeqDistance;
\r
27 int Forestry (FILE *fout, double space[]);
\r
28 int GetOptions (char *ctlf);
\r
29 int SetxBound (int np, double xb[][2]);
\r
30 int testx (double x[], int np);
\r
31 int GetInitials (double x[], int *fromfile);
\r
32 void TestFunction (FILE *fout, double x[], double space[]);
\r
33 int GetMem (int nbranch, int nR, int nitem);
\r
34 void GetSave (int nitem, int *nm, int M[], double alpha, double c);
\r
35 double GetBmx (int ninter, int nsum, int im, int M[], int nodeb[]);
\r
36 int RhoRate (double x[]);
\r
37 int lfunG_print (double x[], int np);
\r
38 double lfunG (double x[], int np);
\r
39 int lfunG_d (double x[], double *lnL, double dl[], int np);
\r
40 int lfunG_dd (double x[], double *lnL, double dl[], double ddl[], int np);
\r
43 unsigned char *z[NS], *spname[NS], seqf[96],outf[96],treef[96];
\r
44 int seqtype, ns, ls, ngene, posG[NGENE+1],lgene[NGENE],*pose,npatt, readpattern;
\r
45 int clock,fix_alpha,fix_kappa,fix_rgene,Malpha,print,verbose;
\r
46 int model, runmode, cleandata, ndata;
\r
47 int np, ntime, nrate, ncode, nrgene, icode, coding, fix_blength;
\r
48 double *fpatt, pi[4], lmax, alpha,kappa, rgene[NGENE],piG[NGENE][4],*conP;
\r
49 double *SSave, *ErSave, *EaSave;
\r
50 int nhomo, nparK, ncatG, fix_rho, getSE, npi0; /* unused */
\r
51 double pi_sqrt[4], rho; /* unused */
\r
55 int nbranch, nnode, root, branches[NBRANCH][2];
\r
60 int father, nson, sons[MAXNSONS], ibranch;
\r
61 double branch, age, label, *conP;
\r
62 char *nodeStr, fossil;
\r
65 static int nR=4, CijkIs0[64];
\r
66 static double Cijk[64], Root[4];
\r
68 FILE *frub, *flfh, *frate, *finitials=NULL;
\r
69 char *ratef="rates";
\r
70 char *models[]={"JC69","K80","F81","F84","HKY85","T92","TN93","REV","UNREST", "REVu","UNRESTu"};
\r
71 enum {JC69, K80, F81, F84, HKY85, T92, TN93, REV, UNREST, REVu, UNRESTu} MODELS;
\r
72 char *clockstr[]={"", "Global clock", "Local clock", "ClockCombined"};
\r
73 enum {GlobalClock=1, LocalClock, ClockCombined} ClockModels;
\r
79 int LASTROUND=0; /* no use for this */
\r
80 #include "treesub.c"
\r
82 int main (int argc, char *argv[])
\r
84 FILE *fout, *fseq=NULL, *fpair[6];
\r
85 char pairfs[1][32]={"2base.t"};
\r
86 char ctlf[96]="baseml.ctl";
\r
87 double space[50000];
\r
89 noisy=2; com.cleandata=1; /* works with clean data only */
\r
91 com.clock=0; com.fix_rgene=0;
\r
93 com.seqtype=0; com.model=F84;
\r
94 com.fix_kappa=0; com.kappa=5;
\r
95 com.fix_alpha=0; com.alpha=0.2;
\r
100 printf("BASEMLG in %s\n", pamlVerStr);
\r
101 frate=fopen(ratef,"w"); frub=fopen("rub","w"); flfh=fopen("lnf","w");
\r
103 if (argc>1) { strncpy(ctlf, argv[1], 95); printf ("\nctlfile is %s.\n", ctlf); }
\r
105 finitials=fopen("in.basemlg","r");
\r
107 if ((fout=fopen(com.outf, "w"))==NULL) error2("outfile creation err.");
\r
108 if((fseq=fopen(com.seqf,"r"))==NULL) error2("No sequence file!");
\r
109 ReadSeq (NULL, fseq, com.cleandata, 0);
\r
110 if((fpair[0]=(FILE*)fopen(pairfs[0],"w"))==NULL) error2("2base.t file open error");
\r
112 fprintf (fout,"BASEMLG %15s %8s + Gamma", com.seqf, models[com.model]);
\r
113 if (com.clock) fprintf (fout, ", Clock");
\r
114 if (com.model!=JC69 && com.model!=F81 && com.fix_kappa)
\r
115 fprintf (fout,"\nkappa =%7.3f given\n", com.kappa);
\r
116 if (com.ngene>1) fprintf (fout, " (%d genes) ", com.ngene);
\r
118 SeqDistance=(double*)malloc(com.ns*(com.ns-1)/2*sizeof(double));
\r
119 ancestor=(int*)malloc(com.ns*(com.ns-1)/2*sizeof(int));
\r
120 if (SeqDistance==NULL||ancestor==NULL) error2("oom");
\r
122 InitializeBaseAA(fout);
\r
123 if (com.model==JC69) PatternWeightJC69like(fout);
\r
125 DistanceMatNuc (fout, fpair[0], com.model, com.alpha);
\r
126 if (com.model<=HKY85)
\r
127 eigenTN93 (com.model, com.kappa, com.kappa, com.pi, &nR, Root, Cijk);
\r
128 Forestry (fout, space);
\r
129 fclose(fseq); fclose(fpair[0]);
\r
130 if(finitials) { fclose(finitials); finitials=NULL; }
\r
134 /* x[] is arranged in order of t[ntime], rgene[], kappa and alpha (if any) */
\r
136 int Forestry (FILE *fout, double space[])
\r
138 int status=0, i,j, itree, ntree, np;
\r
139 int pauptree=0, btree=0, haslength, iteration=1;
\r
140 double x[NP], xb[NP][2], lnL[NTREE]={0}, e=1e-6, *var=space+NP;
\r
143 if ((ftree=fopen(com.treef,"r"))==NULL) error2("treefile err.");
\r
144 GetTreeFileType(ftree,&ntree,&pauptree,0);
\r
146 fprintf (flfh,"%6d%6d%6d\n", ntree, com.ls, com.npatt);
\r
147 fprintf(frate,"Rates for sites, from BASEMLG, %d tree(s)\n\n", ntree);
\r
149 for (itree=0; itree<ntree; itree++) {
\r
150 printf("\nTREE # %2d\n", itree+1 );
\r
151 fprintf(fout,"\nTREE # %2d: ", itree+1);
\r
152 fprintf(flfh,"\n\n%2d\n", itree+1);
\r
153 fprintf(frub,"\n\nTREE # %2d", itree+1);
\r
154 fprintf(frate,"\nTREE # %2d\n", itree+1);
\r
156 if(ReadTreeN(ftree,&haslength,&i,0,1))
\r
157 { puts("err or end of tree file."); break; }
\r
159 com.ntime = com.clock ? tree.nnode-com.ns: tree.nbranch;
\r
161 OutTreeN (F0, 0, 0);
\r
162 OutTreeN (fout, 0, 0);
\r
163 fflush (fout); fflush (flfh);
\r
165 i=(com.clock||com.model>HKY85 ? 2 : 2+!com.fix_alpha);
\r
166 GetMem(tree.nbranch, nR, i);
\r
167 GetInitials(x, &i);
\r
168 if(i==-1) iteration=0;
\r
170 printf("\nnp =%6d\n", np);
\r
173 TestFunction (fout, x, space);
\r
175 if (itree==0 && i==0) {
\r
177 printf ("\n\nSuggest using BASEML to get initial values..");
\r
178 for(j=0; j<com.ntime; j++) fprintf (fout,"%9.5f", x[j]);
\r
181 for(i=0; i<np; i++) printf("%9.5f", x[i]); FPN(F0);
\r
182 if(!iteration) puts("parameters are fixed, calculating lnL . . .");
\r
183 lnL[itree]=lfunG(x, np);
\r
184 if(noisy) printf("\nlnL0 = %12.6f", -lnL[itree]);
\r
187 if (com.clock||com.model==REV) {
\r
188 SetxBound (np, xb);
\r
189 i=ming2(frub,&lnL[itree],lfunG,
\r
190 ((com.clock||com.model>HKY85)?NULL:lfunG_d),x,xb, space,e,np);
\r
191 Hessian (np,x,lnL[itree],space,var,lfunG,var+np*np);
\r
192 matinv (var, np, np, var+np*np);
\r
195 i=Newton(frub, &lnL[itree], lfunG, lfunG_dd,testx,x,var,e,np);
\r
197 if(noisy) printf("\nOut...\nlnL:%14.6f\n", -lnL[itree]);
\r
200 if (i || j<np) { status=-1; fprintf (fout, "\n\ncheck convergence.."); }
\r
201 fprintf(fout,"\nlnL(np:%3d):%14.6f%+12.6f\n", com.np,
\r
202 -lnL[itree], -lnL[itree]+lnL[0]);
\r
203 OutTreeB (fout); FPN (fout);
\r
205 for(i=0; i<np; i++) fprintf (fout,"%9.5f", x[i]); FPN (fout);
\r
207 for(i=0; i<np; i++)
\r
208 fprintf (fout,"%9.5f", var[i*np+i]>0.?sqrt(var[i*np+i]):0.);
\r
210 fprintf (fout, "\n\n# fun calls:%10d\n", NFunCall);
\r
211 OutTreeN(fout,1,1); fputs(";\n", fout);
\r
212 lfunG_print (x, np);
\r
216 } /* for (itree) */
\r
223 int SetBranch (double x[])
\r
229 FOR (i,com.ntime) nodes[i+com.ns].age=x[i];
\r
230 FOR (i,tree.nnode) {
\r
231 if (i==tree.root) continue;
\r
232 nodes[i].branch = nodes[nodes[i].father].age-nodes[i].age;
\r
233 if (nodes[i].branch<-small) status=-1;
\r
238 if (i!=tree.root && (nodes[i].branch=x[nodes[i].ibranch])<-small)
\r
243 int testx (double x[], int np)
\r
246 double tb[]={1e-5,4}, rgeneb[]={.01,20}, kappab[]={0,80}, alphab[]={.01,99};
\r
248 if (SetBranch(x)) return (-1);
\r
249 FOR (i,com.ntime) if (x[i]<tb[0] || x[i]>tb[1]) return (-1);
\r
250 if (np==com.ntime) return (0);
\r
251 for (i=0,k=com.ntime; i<com.nrgene; i++,k++)
\r
252 if (x[k]<rgeneb[0] || x[k]>rgeneb[1]) return (-1);
\r
253 for (i=0; i<com.nrate; i++,k++)
\r
254 if (x[k]<kappab[0] || x[k]>kappab[1]) return(-1);
\r
255 if (!com.fix_alpha && (x[np-1]<alphab[0] || x[np-1]>alphab[1])) return(-1);
\r
259 int SetxBound (int np, double xb[][2])
\r
261 /* sets lower and upper bounds for variables during iteration
\r
263 int i,j, k=com.ntime+com.nrgene+com.nrate;
\r
264 double tb[]={.4e-5, 20}, rgeneb[]={1e-4,99}, rateb[]={1e-5,999};
\r
265 double alphab[]={0.005, 99}, pb[2]={1e-5,1-1e-5};
\r
268 xb[0][0]=tb[0]; xb[0][1]=tb[1];
\r
269 for(i=1;i<com.ntime;i++) FOR(j,2) { xb[i][0]=pb[0]; xb[i][1]=pb[1]; }
\r
272 FOR (i,com.ntime) FOR (j,2) xb[i][j]=tb[j];
\r
273 FOR (i,com.nrgene) FOR (j,2) xb[com.ntime+i][j]=rgeneb[j];
\r
274 FOR (i,com.nrate) FOR (j,2) xb[com.ntime+com.nrgene+i][j]=rateb[j];
\r
275 if(!com.fix_alpha) FOR (j,2) xb[k][j]=alphab[j];
\r
277 puts("\nBounds:\n");
\r
278 FOR(i,np) printf(" %10.6f", xb[i][0]); FPN(F0);
\r
279 FOR(i,np) printf(" %10.6f", xb[i][1]); FPN(F0);
\r
284 int GetInitials (double x[], int *fromfile)
\r
289 com.nrgene = (!com.fix_rgene)*(com.ngene-1);
\r
291 if (com.model==REV) {
\r
293 x[com.ntime+com.nrgene] = 1;
\r
294 FOR (i,com.nrate-1) x[com.ntime+com.nrgene+i+1]=1/com.kappa;
\r
296 else if (!com.fix_kappa)
\r
297 { com.nrate=1; x[com.ntime+com.nrgene]=com.kappa; }
\r
298 if (com.model<=HKY85)
\r
299 eigenTN93 (com.model, com.kappa, com.kappa, com.pi, &nR, Root, Cijk);
\r
301 com.np = com.ntime+com.nrgene+com.nrate+(!com.fix_alpha);
\r
302 FOR (j,com.nrgene) x[com.ntime+j]=1;
\r
303 if (!com.fix_alpha) x[com.np-1]=com.alpha;
\r
305 if (com.clock) for(j=1,x[0]=0.1; j<com.ntime; j++) x[j]=0.5;
\r
306 else FOR (j,com.ntime) x[j]=0.1;
\r
308 LSDistance (&t, x, testx);
\r
309 for(j=0; j<com.ntime; j++)
\r
310 if (x[j]<1e-5) x[j]=1e-4;
\r
312 if(finitials) readx(x,fromfile);
\r
318 void TestFunction (FILE* fout, double x[], double space[])
\r
323 printf ("\ntest functions\n");
\r
325 FOR (i,np) x[i]=(double)i*rndu()+0.0001;
\r
326 matout (F0, x, 1, np); matout (fout, x, 1, np);
\r
329 printf ("\n\nnp:%6d\nlnL:%12.8f\n", np, lnL);
\r
330 fprintf (fout, "\n\nnp:%6d\nlnL:%12.8f\n", np, lnL);
\r
332 printf ("\ndl and gradient"); fprintf (fout, "\ndl and gradient");
\r
333 lfunG_d (x, &lnL, space, np);
\r
334 printf ("\nlnL:%12.8f", lnL); fprintf (fout, "\nlnL:%12.8f", lnL);
\r
335 matout (F0, space, 1, np); matout (fout, space, 1, np);
\r
336 gradient (np, x, lnL, space, lfunG, space+np, 0);
\r
337 printf ("\nlnL:%12.8f", lnL); fprintf (fout, "\nlnL:%12.8f", lnL);
\r
338 matout (F0, space, 1, np); matout (fout, space, 1, np);
\r
340 printf ("\nddl & Hessian"); fprintf (fout, "\nddl & Hessian");
\r
341 lfunG_dd (x, &lnL, space, space+np, np);
\r
342 printf ("\nlnL:%12.8f", lnL); fprintf (fout, "\nlnL:%12.8f", lnL);
\r
343 matout (F0, space, 1, np); matout (fout, space, 1, np);
\r
344 matout2 (F0, space+np, np, np, 8, 3);
\r
345 matout2 (fout, space+np, np, np, 8, 3);
\r
347 Hessian (np, x, lnL, space, space+np, lfunG, space+np+np*np);
\r
348 printf ("\nlnL:%12.8f", lnL); fprintf (fout, "\nlnL:%12.8f", lnL);
\r
349 matout2 (F0, space+np, np, np, 8, 3);
\r
350 matout2 (fout, space+np, np, np, 8, 3);
\r
355 int GetMem (int nbranch, int nR, int nitem)
\r
357 /* ns=4: 98KB, 5: 1.6MB, 6: 25MB, 7: 402MB (for HKY85, 3/3/93)
\r
358 nitem=1:ErSave; 2:SSave & ErSave; 3:SSave & ErSave & EaSave
\r
363 for(j=0,nm=1; j<nbranch; j++) nm*=nR;
\r
364 if(nm>1) memsize=(double)nm*nitem*sizeof(double);
\r
365 printf ("\nMemory required: %6.0fK bytes\n", memsize/1024);
\r
366 if(nm*nitem*sizeof(double)<=0 ||
\r
367 (com.conP=(double*)realloc(com.conP, nm*nitem*sizeof(double)))==NULL)
\r
368 error2("out of memory");
\r
370 com.ErSave = com.conP;
\r
371 if (nitem>1) com.SSave=com.ErSave+nm;
\r
372 if (nitem>2) com.EaSave=com.SSave+nm;
\r
377 void GetSave (int nitem, int *nm, int M[], double alpha, double c)
\r
379 /* correct for both clock=0 and 1
\r
380 nitem=1:ErSave; 2:SSave & ErSave; 3:SSave & ErSave & EaSave
\r
385 for(j=0,*nm=1; j<tree.nbranch; j++) *nm*=nR;
\r
386 for (im=0; im< *nm; im++) {
\r
387 for (j=0,it=im,S=0; j<tree.nbranch; j++) {
\r
388 M[j]=it%nR; it/=nR;
\r
389 if (M[j]) S+=nodes[tree.branches[j][1]].branch*Root[M[j]];
\r
391 com.ErSave[im] = pow(alpha/(alpha-c*S),alpha);
\r
392 if (nitem>1) com.SSave[im] = S;
\r
393 if (nitem>2) com.EaSave[im] = log(alpha/(alpha-c*S))-c*S/(alpha-c*S);
\r
397 double GetBmx (int ninter, int nsum, int im, int M[], int nodeb[])
\r
399 int isum, j, it, i, nR4=nR*4;
\r
402 for (j=0,it=im; j<tree.nbranch; M[j]=it%nR,it/=nR,j++) ;
\r
403 for (isum=0,Bmx=0; isum<nsum; isum++) {
\r
404 for (j=0,it=isum; j<ninter; nodeb[com.ns+j]=it&3,it>>=2,j++) ;
\r
405 for (j=0,y=com.pi[nodeb[tree.root]]; j<tree.nbranch; j++) {
\r
406 i = nodeb[tree.branches[j][0]]*nR4
\r
407 + nodeb[tree.branches[j][1]]*nR + M[j];
\r
408 if (CijkIs0[i]) goto nextone;
\r
417 int RhoRate (double x[])
\r
419 /* rate factors for all possible site patterns, and the correlation
\r
420 coefficient (Yang and Wang, in press).
\r
422 int i,j, h, nodeb[NNODE], M[NBRANCH], accurate=(com.ns<8);
\r
423 int im, nm, nsum=1<<(2*(tree.nnode-com.ns)), *fobs;
\r
424 int ninter=tree.nbranch-com.ns+1;
\r
425 double lnL, fh, sumfh=0, rh, Bmx, alpha, kappa;
\r
426 double mrh=0, mrh0=0, vrh=0, vrh0=0;
\r
428 if (com.ngene>1) error2 ("ngene>1");
\r
429 alpha=(com.fix_alpha ? com.alpha : x[com.np-1]);
\r
430 kappa=(com.fix_kappa ? com.kappa : x[com.ntime]);
\r
432 fprintf (flfh, "\n\nmodel:%6d\n kappa:%9.4f\nalpha:%9.4f\nBranches",
\r
433 com.model, kappa, alpha);
\r
434 matout (flfh, x, 1, tree.nbranch);
\r
435 if (com.model<=HKY85 && !com.fix_kappa)
\r
436 RootTN93 (com.model, kappa, kappa, com.pi, &rh, Root);
\r
438 if (com.model==REV)
\r
439 eigenREV (NULL, x+com.ntime+com.nrgene, com.pi, &nR, Root, Cijk);
\r
441 if (SetBranch (x)) puts ("\nx[] err..");
\r
442 GetSave (2, &nm, M, alpha, 1);
\r
443 fobs = (int*) malloc ((1<<(2*com.ns)) * sizeof (int));
\r
445 FOR (h,(1<<2*com.ns)) fobs[h]=0;
\r
446 for (h=0; h<com.npatt; h++) {
\r
447 for (j=0,im=0; j<com.ns; j++) im = im*4+com.z[j][h];
\r
448 fobs[im]=(int)com.fpatt[h];
\r
450 for (h=0,lnL=0; h<(1<<2*com.ns); h++) {
\r
451 if (accurate==0 && fobs[h]==0) continue;
\r
452 for (j=0,im=h; j<com.ns; nodeb[com.ns-1-j]=im%4,im/=4,j++);
\r
453 for (im=0,fh=0,rh=0; im<nm; im++) {
\r
454 Bmx=GetBmx (ninter, nsum, im, M, nodeb);
\r
455 fh += Bmx*com.ErSave[im];
\r
456 rh += Bmx*com.ErSave[im]*alpha/(alpha-com.SSave[im]);
\r
458 if (fh<=0) printf ("\a\nRhoRate: h=%4d fh=%9.4f \n", h, fh);
\r
459 rh /= fh; vrh += fh*rh*rh;
\r
460 sumfh += fh; mrh += fh*rh; mrh0+=rh*(double)fobs[h]/(double)com.ls;
\r
462 vrh0+= rh*rh*(double)fobs[h]/(double)com.ls;
\r
463 lnL -= log(fh)*(double)fobs[h];
\r
465 fprintf (flfh,"\n%6d%9.2f%8.4f ", fobs[h], fh*com.ls, rh);
\r
466 FOR (i,com.ns) fprintf (flfh, "%c", BASEs[nodeb[i]]);
\r
468 vrh-=1; vrh0-=mrh0*mrh0;
\r
469 fprintf (flfh, "\n%s Vrh", accurate?"accurate":"approximate");
\r
470 fprintf (flfh,"\nsumfh = 1 =%12.6f mrh = 1 =%12.6f\n", sumfh, mrh);
\r
471 fprintf (flfh, "\nVr :%12.6f Vr0:%12.6f mrh0:%12.6f", vrh, vrh0, mrh0);
\r
472 fprintf (flfh, "\nPEV:%12.6f %12.6f", 1/alpha-vrh, 1/alpha-vrh0);
\r
473 fprintf (flfh, "\nRHO:%12.6f %12.6f", sqrt(vrh*alpha), sqrt(vrh0*alpha));
\r
474 fprintf (flfh,"\nLn(L)=%12.6f\n", -lnL);
\r
479 int lfunG_print (double x[], int np)
\r
481 /* This prints log(f) into lfh, and rates into rates
\r
483 int i,j, h,hp, igene, lt, nodeb[NNODE], M[NBRANCH];
\r
484 int im, nm, nsum=1<<(2*(tree.nnode-com.ns));
\r
485 int ninter=tree.nbranch-com.ns+1;
\r
486 double lnL, fh, rh, mrh0=0,vrh0=0, Bmx, y, alpha,kappa, *fhs,*rates=NULL;
\r
488 fputs("\nEstimation of rates for sites by BASEMLG.\n",frate);
\r
490 if ((rates=(double*)malloc(com.npatt*2*sizeof(double)))==NULL) error2("oom");
\r
491 fhs=rates+com.npatt;
\r
493 FOR (i, com.nrgene) com.rgene[i+1]=x[com.ntime+i];
\r
494 kappa=(com.fix_kappa ? com.kappa : x[com.ntime+com.nrgene]);
\r
495 alpha=(com.fix_alpha ? com.alpha : x[com.np-1]);
\r
497 if (com.model<=HKY85 && !com.fix_kappa)
\r
498 RootTN93(com.model,kappa,kappa,com.pi,&y,Root);
\r
500 if (com.model==REV)
\r
501 eigenREV(NULL,x+com.ntime+com.nrgene,com.pi,&nR,Root,Cijk);
\r
503 if (SetBranch(x)) puts("\nx[] err..");
\r
504 for(j=0,nm=1; j<tree.nbranch; j++) nm*=nR;
\r
506 for (h=0,lnL=0,igene=-1,lt=0; h<com.npatt; lt+=(int)com.fpatt[h++]) {
\r
507 FOR(j,com.ns) nodeb[j] = com.z[j][h];
\r
508 if (h==0 || lt==com.lgene[igene])
\r
509 GetSave (2, &nm, M, alpha, com.rgene[++igene]);
\r
510 for (im=0,fh=0,rh=0; im<nm; im++) {
\r
511 Bmx=GetBmx (ninter, nsum, im, M, nodeb);
\r
512 fh += Bmx*com.ErSave[im];
\r
513 rh += Bmx*com.ErSave[im]*alpha/(alpha-com.SSave[im]);
\r
515 if (fh<=0) printf ("\a\nlfunG_print: h=%4d fh=%9.4f \n", h, fh);
\r
517 vrh0+=rh*rh*com.fpatt[h]/com.ls;
\r
518 mrh0+=rh*com.fpatt[h]/com.ls;
\r
521 fhs[h]=fh; fh=log(fh);
\r
522 lnL -= fh*com.fpatt[h];
\r
523 fprintf (flfh,"\n%4d%6.0f%14.8f%9.2f%9.4f ",
\r
524 h+1, com.fpatt[h], fh, com.ls*fhs[h], rh);
\r
525 FOR (i,com.ns) fprintf (flfh, "%c", BASEs[com.z[i][h]]);
\r
530 fprintf (flfh,"\n\nVrh0:%12.6f\nmrh0:%12.6f\nrho0:%12.6f\n", vrh0, mrh0, sqrt(vrh0*alpha));
\r
532 fputs("\n Site Freq Data Nexp. Rates\n\n",frate);
\r
536 fprintf(frate,"%4d %5.0f ",h+1,com.fpatt[hp]);
\r
537 FOR(j,com.ns) fprintf (frate,"%c", BASEs[com.z[j][hp]]);
\r
538 fprintf(frate,"%9.2f%9.4f\n", com.ls*fhs[hp],rates[hp]);
\r
540 fprintf(frate, "\nlnL = %12.6f", -lnL);
\r
542 fprintf(frate, "\nVr0:%12.6f mrh0:%12.6f", vrh0, mrh0);
\r
543 fprintf(frate, "\nApproximate corr(r,r^) =%12.6f\n",sqrt(vrh0*alpha));
\r
548 double lfunG (double x[], int np)
\r
550 /* likelihood with spatial rate variation
\r
552 int i,j, h, igene, lt, nodeb[NNODE], M[NBRANCH];
\r
553 int im, nm, nsum=1<<(2*(tree.nnode-com.ns));
\r
554 int ninter=tree.nbranch-com.ns+1;
\r
555 double lnL, fh, rh, Bmx, y, alpha,kappa;
\r
558 FOR (i, com.nrgene) com.rgene[i+1]=x[com.ntime+i];
\r
559 kappa=(com.fix_kappa ? com.kappa : x[com.ntime+com.nrgene]);
\r
560 alpha=(com.fix_alpha ? com.alpha : x[np-1]);
\r
561 if (com.model<=HKY85 && !com.fix_kappa)
\r
562 RootTN93 (com.model, kappa, kappa, com.pi, &y, Root);
\r
564 if (com.model==REV)
\r
565 eigenREV (NULL, x+com.ntime+com.nrgene, com.pi, &nR, Root, Cijk);
\r
567 if (SetBranch (x)) puts ("\nx[] err..");
\r
569 for (h=0,lnL=0,igene=-1,lt=0; h<com.npatt; lt+=(int)com.fpatt[h++]) {
\r
570 FOR(j,com.ns) nodeb[j] = com.z[j][h];
\r
571 if (h==0 || lt==com.lgene[igene])
\r
572 GetSave (1, &nm, M, alpha, com.rgene[++igene]);
\r
573 for (im=0,fh=0,rh=0; im<nm; im++) {
\r
574 Bmx=GetBmx (ninter, nsum, im, M, nodeb);
\r
575 fh += Bmx*com.ErSave[im];
\r
577 if (fh<=0) printf ("\a\nlfunG: h=%4d fh=%9.4f \n", h, fh);
\r
578 lnL -= log(fh)*com.fpatt[h];
\r
583 int lfunG_d (double x[], double *lnL, double dl[], int np)
\r
585 int nbranch=tree.nbranch, ninter=nbranch-com.ns+1;
\r
586 int i,j, nodeb[NNODE], M[NBRANCH], h, igene,lt;
\r
587 int im, nm, nsum=1<<(2*(tree.nnode-com.ns));
\r
588 double fh, y, Bmx, S, alpha, kappa, Er, dfh[NP];
\r
589 double drk1[4], c=1;
\r
590 double *p=com.pi, T=p[0],C=p[1],A=p[2],G=p[3],Y=T+C,R=A+G;
\r
592 if (com.clock||com.model>HKY85) error2 ("err lfunG_d");
\r
594 FOR (i, com.nrgene) com.rgene[i+1]=x[com.ntime+i];
\r
595 kappa=(com.fix_kappa ? com.kappa : x[com.ntime+com.nrgene]);
\r
596 alpha=(com.fix_alpha ? com.alpha : x[np-1]);
\r
597 if (SetBranch (x)) puts ("\nx[] err..");
\r
598 if (!com.fix_kappa) {
\r
599 RootTN93 (com.model, kappa, kappa, p, &S, Root);
\r
602 drk1[2]=2*(y-R*R)*Y*S*S;
\r
603 drk1[3]=2*(y-Y*Y)*R*S*S;
\r
604 if (com.model==F84) {
\r
607 drk1[2]=-S+(1+kappa)*y*S*S;
\r
610 *lnL=0; zero(dl,np);
\r
611 for (h=0,igene=-1,lt=0; h<com.npatt; lt+=(int)com.fpatt[h++]) {
\r
612 FOR(j,com.ns) nodeb[j] = com.z[j][h];
\r
613 if (h==0 || lt==com.lgene[igene])
\r
614 GetSave (2+!com.fix_alpha, &nm, M, alpha, (c=com.rgene[++igene]));
\r
615 for (im=0,fh=0,zero(dfh,np); im<nm; im++) {
\r
616 Bmx=GetBmx (ninter, nsum, im, M, nodeb);
\r
618 Er = com.ErSave[im]*Bmx;
\r
621 if (M[j]) dfh[j]+=c*Er*alpha*Root[M[j]]/(alpha-c*S); /* t */
\r
622 if (com.ngene>0 && igene>0)
\r
623 dfh[com.ntime+igene-1]+=Er*alpha/(alpha-c*S)*S; /* c */
\r
624 if (!com.fix_kappa) {
\r
625 for (j=0,y=0; j<nbranch; j++) if (M[j]) y+=x[j]*drk1[M[j]];
\r
626 dfh[com.ntime+com.nrgene] += Er*alpha/(alpha-c*S)*y*c; /* k */
\r
628 if (!com.fix_alpha) dfh[np-1] += Er*com.EaSave[im]; /* a */
\r
630 if (fh<=0) printf ("\a\nlfunG_d: h=%4d fh=%9.4f \n", h, fh);
\r
631 *lnL -= log(fh)*com.fpatt[h];
\r
632 FOR (j, np) dl[j] -= dfh[j]/fh*com.fpatt[h];
\r
637 int lfunG_dd (double x[], double *lnL, double dl[], double ddl[], int np)
\r
639 int nbranch=tree.nbranch, ninter=nbranch-com.ns+1;
\r
640 int i,j,k, nodeb[NNODE], M[NBRANCH], h, igene,lt;
\r
641 int im, nm, nsum=1<<(2*(tree.nnode-com.ns));
\r
642 double fh, y, Bmx,S,alpha,kappa, Er,Ea, y1,y2;
\r
643 double dfh[NP], ddfh[NP*NP], drk1[4], drk2[4], c=1, s1=0,s2=0;
\r
644 double T=com.pi[0],C=com.pi[1],A=com.pi[2],G=com.pi[3],Y=T+C,R=A+G;
\r
647 if (com.clock||com.model>HKY85) error2 ("err lfunG_dd");
\r
649 FOR (i, com.nrgene) com.rgene[i+1]=x[com.ntime+i];
\r
650 kappa=(com.fix_kappa ? com.kappa : x[com.ntime+com.nrgene]);
\r
651 alpha=(com.fix_alpha ? com.alpha : x[np-1]);
\r
652 if (SetBranch (x)) puts ("\nx[] err..");
\r
654 if (!com.fix_kappa) {
\r
655 RootTN93 (com.model, kappa, kappa, com.pi, &S, Root);
\r
659 drk1[2]=2*(y-R*R)*Y*S*S;
\r
660 drk1[3]=2*(y-Y*Y)*R*S*S;
\r
662 drk2[1]=-8*y*y*S*S*S;
\r
663 drk2[2]=-8*y*Y*(y-R*R)*S*S*S;
\r
664 drk2[3]=-8*y*R*(y-Y*Y)*S*S*S;
\r
666 if (com.model==F84) {
\r
669 drk1[2]=-S+(1+kappa)*y*S*S;
\r
671 drk2[1]=-2*y*y*S*S*S;
\r
672 drk2[2]=2*y*S*S*(1-(1+kappa)*y*S);
\r
675 *lnL=0, zero(dl,np), zero(ddl,np*np);
\r
676 for (h=0,igene=-1,lt=0; h<com.npatt; lt+=(int)com.fpatt[h++]) {
\r
677 FOR(j,com.ns) nodeb[j] = com.z[j][h];
\r
678 if (h==0 || lt==com.lgene[igene])
\r
679 GetSave (2+!com.fix_alpha, &nm, M, alpha, (c=com.rgene[++igene]));
\r
680 for (im=0,fh=0,zero(dfh,np),zero(ddfh,np*np); im<nm; im++) {
\r
681 Bmx=GetBmx (ninter, nsum, im, M, nodeb);
\r
683 Er = com.ErSave[im]*Bmx;
\r
687 for (j=0,y2=y1*(1+alpha)*y*c*c; j<nbranch; j++) {
\r
689 dfh[j]+=y1*Root[M[j]]*c; /* t */
\r
690 for (i=j; i<nbranch; i++) /* tt */
\r
691 if (M[i]) ddfh[i*np+j]+=y2*Root[M[i]]*Root[M[j]];
\r
694 if (!com.fix_kappa) {
\r
695 for (j=0,s1=0,s2=0; j<nbranch; j++)
\r
696 if (M[j]) { s1+=x[j]*drk1[M[j]]; s2+=x[j]*drk2[M[j]]; }
\r
697 k=com.ntime+com.nrgene;
\r
698 dfh[k] += c*y1*s1; /* k */
\r
699 for (j=0,y2=y*s1*(alpha+1); j<nbranch; j++) if (M[j])
\r
700 ddfh[k*np+j] += c*y1*(Root[M[j]]*y2*c+drk1[M[j]]); /* kt */
\r
701 ddfh[k*np+k] += y1*c*(c*s1*y2+s2); /* kk */
\r
703 if (com.ngene>0 && igene>0) {
\r
704 k=com.ntime+igene-1;
\r
705 dfh[k]+=y1*S; /* c */
\r
706 ddfh[k*np+k]+=y*y1*S*S*(1+alpha); /* cc */
\r
707 y2=alpha*y*y1*(1+c*S);
\r
708 FOR (j,nbranch) ddfh[k*np+j]+=y2*Root[M[j]]; /* ct */
\r
709 if (!com.fix_kappa)
\r
710 ddfh[(com.ntime+com.nrgene)*np+k]+=y2*s1; /* ck */
\r
712 if (!com.fix_alpha) {
\r
713 Ea = com.EaSave[im];
\r
714 dfh[np-1] += Er*Ea; /* a */
\r
715 ddfh[np*np-1]+=Er*(Ea*Ea+1/alpha-y+c*S*y*y); /* aa */
\r
716 y2=Er*y*(Ea*alpha-c*S*y);
\r
717 FOR (j,nbranch) ddfh[(np-1)*np+j]+=c*Root[M[j]]*y2; /* at */
\r
718 if (com.ngene>0 && igene>0)
\r
719 ddfh[(np-1)*np+nbranch+igene-1]+=S*y2; /* ac */
\r
720 if (!com.fix_kappa)
\r
721 ddfh[(np-1)*np+com.ntime+com.nrgene] += c*y2*s1; /* ak */
\r
724 if (fh<=0) printf ("\a\nlfunG_dd: h=%4d fh=%9.4f \n", h, fh);
\r
725 *lnL -= log(fh)*com.fpatt[h];
\r
726 FOR (j, np) dl[j] -= dfh[j]/fh*com.fpatt[h];
\r
727 FOR (j, np) for (i=j; i<np; i++) {
\r
729 (fh*ddfh[i*np+j]-dfh[i]*dfh[j])/(fh*fh)*com.fpatt[h];
\r
730 ddl[j*np+i] = ddl[i*np+j];
\r
736 int GetOptions (char *ctlf)
\r
738 int i, nopt=27, lline=255;
\r
739 char line[255], *pline, opt[32], *comment="*#";
\r
740 char *optstr[]={"seqfile","outfile","treefile", "noisy",
\r
741 "cleandata", "ndata", "verbose","runmode",
\r
742 "clock","fix_rgene","Mgene","nhomo","getSE","RateAncestor",
\r
743 "model","fix_kappa","kappa","fix_alpha","alpha","Malpha","ncatG",
\r
744 "fix_rho","rho","nparK", "Small_Diff", "method", "fix_blength"};
\r
746 FILE *fctl=fopen (ctlf, "r");
\r
749 if (noisy) printf ("\n\nReading options from %s..\n", ctlf);
\r
751 if (fgets (line, lline, fctl) == NULL) break;
\r
752 for (i=0,t=0,pline=line; i<lline&&line[i]; i++)
\r
753 if (isalnum(line[i])) { t=1; break; }
\r
754 else if (strchr(comment,line[i])) break;
\r
755 if (t==0) continue;
\r
756 sscanf (line, "%s%*s%lf", opt, &t);
\r
757 if ((pline=strstr(line, "="))==NULL) error2("option file.");
\r
759 for (i=0; i<nopt; i++) {
\r
760 if (strncmp(opt, optstr[i], 8)==0) {
\r
762 printf ("\n%3d %15s | %-20s %6.2f", i+1,optstr[i],opt,t);
\r
764 case ( 0): sscanf(pline+2, "%s", com.seqf); break;
\r
765 case ( 1): sscanf(pline+2, "%s", com.outf); break;
\r
766 case ( 2): sscanf(pline+2, "%s", com.treef); break;
\r
767 case ( 3): noisy=(int)t; break;
\r
768 case ( 4): com.cleandata=(int)t; break;
\r
769 case ( 5): break; /* ndata */
\r
770 case ( 6): com.verbose=(int)t; break;
\r
771 case ( 7): com.runmode=(int)t; break;
\r
772 case ( 8): com.clock=(int)t; break;
\r
773 case ( 9): break; /* fix_rgene */
\r
774 case (10): com.fix_rgene=(int)t; break;
\r
775 case (11): com.nhomo=(int)t; break;
\r
776 case (12): com.getSE=(int)t; break;
\r
777 case (13): com.print=(int)t; break;
\r
778 case (14): com.model=(int)t; break;
\r
779 case (15): com.fix_kappa=(int)t; break;
\r
780 case (16): com.kappa=t; break;
\r
781 case (17): com.fix_alpha=(int)t; break;
\r
782 case (18): com.alpha=t; break;
\r
783 case (19): break; /* Malpha */
\r
784 case (20): com.ncatG=(int)t; break;
\r
785 case (21): com.fix_rho=(int)t; break;
\r
786 case (22): com.rho=t; break;
\r
787 case (23): com.nparK=(int)t; break;
\r
788 case (24): break; /* smallDiff */
\r
789 case (25): break; /* method */
\r
790 case (26): break; /* fix_blength */
\r
796 { printf ("\noption %s in %s\n", opt, ctlf); exit (-1); }
\r
801 if (noisy) printf ("\nno ctl file..");
\r
803 if (com.runmode) error2("runmode!=0 for BASEMLG?");
\r
804 if (com.model!=F84 && com.kappa<=0) error2("init kappa..");
\r
805 if (com.alpha<=0) error2("init alpha..");
\r
806 if (com.alpha==0 || com.Malpha || com.rho>0 || com.nhomo>0 || com.nparK>0
\r
807 || com.model>HKY85)
\r
808 error2 ("\noptions in file baseml.ctl inappropriate.. giving up..");
\r
809 if (com.model==JC69 || com.model==F81) { com.fix_kappa=1; com.kappa=1; }
\r
810 if(com.cleandata==0) {
\r
811 puts("cleandata set to 1, with ambiguity data removed. ");
\r
820 I have not checked the program carefully since I implemented T92. Models up to
\r
821 HKY85 should be fine.
\r