2 Pairwise estimation of dS and dN by the method of Yang & Nielsen
\r
3 (2000 Mol. Biol. Evol. 17:32-43)
\r
5 Copyright, 1998, Ziheng Yang
\r
7 cc -o yn00 -fast yn00.c tools.o -lm
\r
8 cl -O2 yn00.c tools.o
\r
9 yn00 <SequenceFileName>
\r
11 Codon sequences are encoded as 0,1,...,61, as in codeml.c.
\r
19 int GetOptions (char *ctlf);
\r
20 int EncodeSeqCodon(void);
\r
21 int Statistics(FILE *fout, double space[]);
\r
22 int DistanceMatLWL85 (FILE *fout);
\r
23 int DistanceYN00(int is, int js, double*S, double*N, double*dS,double*dN,
\r
24 double *SEdS, double *SEdN, double *t,double space[]);
\r
25 int GetKappa (void);
\r
26 int GetFreqs(int is1, int is2, double f3x4[], double pi[]);
\r
27 int CountSites(char z[],double pi[],double*Stot,double*Ntot,
\r
28 double fbS[],double fbN[]);
\r
29 int GetPMatCodon(double P[],double t, double kappa, double omega, double space[]);
\r
30 int CountDiffs(char z1[],char z2[],
\r
31 double*Sdts,double*Sdtv,double*Ndts,double*Ndtv,double PMat[]);
\r
32 int DistanceF84(double n, double P, double Q, double pi[],
\r
33 double*k_HKY, double*t, double*SEt);
\r
34 double dsdnREV (int is, int js, double space[]);
\r
36 int ExpPattFreq(double t,double kappa,double omega,double pi[],double space[]);
\r
37 int ConsistencyMC(void);
\r
38 int InfiniteData(double t,double kappa,double omega,double f3x4_0[],
\r
40 void SimulateData2s64(FILE* fout, double f3x4_0[], double space[]);
\r
42 struct common_info {
\r
43 unsigned char *z[NS];
\r
44 char *spname[NS], seqf[512],outf[512];
\r
45 int ns,ls,npatt,codonf,icode,ncode,getSE,*pose,verbose, seqtype, readpattern;
\r
46 int cleandata, fcommon,kcommon, weighting, ndata, print;
\r
47 double *fpatt, pi[NCODE], f3x4s[NS][12], kappa, omega;
\r
48 int ngene,posG[NGENE+1],lgene[NGENE],fix_rgene, model;
\r
49 double rgene[NGENE],piG[NGENE][NCODE], alpha;
\r
53 int FROM61[64], FROM64[64], FourFold[4][4];
\r
54 double PMat[NCODE*NCODE];
\r
55 char *codonfreqs[]={"Fequal", "F1x4", "F3x4", "Fcodon"};
\r
56 enum {Fequal, F1x4, F3x4, Fcodon} CodonFreqs;
\r
58 FILE *frst, *frst1, *frub;
\r
59 extern char BASEs[], AAs[];
\r
60 extern int noisy, GeneticCode[][64];
\r
62 enum {NODEBUG, KAPPA, SITES, DIFF} DebugFunctions;
\r
65 double omega_NG, dN_NG, dS_NG; /* what are these for? */
\r
69 #define REALSEQUENCE
\r
70 #include "treesub.c"
\r
73 int main(int argc, char *argv[])
\r
75 char dsf[512]="2YN.dS", dnf[512]="2YN.dN", tf[512]="2YN.t";
\r
76 FILE *fout, *fseq, *fds, *fdn, *ft;
\r
77 char ctlf[96]="yn00.ctl", timestr[64];
\r
78 int n=com.ncode, is,js, j, idata, wname=20, sspace;
\r
79 double t=0.4, dS=0.1,dN=0.1, S,N, SEdS, SEdN, f3x4[12], *space=NULL;
\r
81 /* ConsistencyMC(); */
\r
83 printf("YN00 in %s\n", pamlVerStr);
\r
85 if (argc>1) strcpy(ctlf, argv[1]);
\r
86 com.seqtype=1; com.cleandata=1; /* works for clean data only? */
\r
87 com.ndata=1; com.print=0;
\r
88 noisy=1; com.icode=0; com.fcommon=0; com.kcommon=1;
\r
91 fout = fopen (com.outf, "w");
\r
92 frst = fopen("rst", "w");
\r
93 frst1 = fopen("rst1", "w");
\r
94 frub = fopen ("rub", "w");
\r
95 if (fout==NULL || frst==NULL) error2("outfile creation err.");
\r
96 fds = (FILE*)fopen(dsf, "w");
\r
97 fdn = (FILE*)fopen(dnf, "w");
\r
98 ft = (FILE*)fopen(tf, "w");
\r
99 if(fds==NULL || fdn==NULL || ft==NULL) error2("file open error");
\r
101 if((fseq=fopen (com.seqf,"r"))==NULL) {
\r
102 printf ("\n\nSequence file %s not found!\n", com.seqf);
\r
105 for (idata=0; idata<com.ndata; idata++) {
\r
107 printf("\nData set %d\n", idata+1);
\r
108 fprintf(fout, "\n\nData set %d\n", idata+1);
\r
109 fprintf(frst, "\t%d", idata+1);
\r
112 ReadSeq((com.verbose?fout:NULL), fseq, com.cleandata, 0);
\r
115 sspace = max2(200000,64*com.ns*sizeof(double));
\r
116 sspace = max2(sspace,64*64*5*sizeof(double));
\r
117 if ((space=(double*)realloc(space,sspace))==NULL) error2("oom space");
\r
121 fprintf(fout,"YN00 %15s", com.seqf);
\r
122 Statistics(fout, space);
\r
124 if(noisy) printf("\n\n(A) Nei-Gojobori (1986) method\n");
\r
125 fprintf(fout,"\n\n(A) Nei-Gojobori (1986) method\n");
\r
126 DistanceMatNG86 (fout, NULL, NULL, NULL, 0);
\r
129 if(noisy) printf("\n\n(B) Yang & Nielsen (2000) method\n\n");
\r
130 fprintf(fout,"\n\n(B) Yang & Nielsen (2000) method\n\n");
\r
131 fprintf(fout,"Yang Z, Nielsen R (2000) Estimating synonymous and nonsynonymous substitution rates under realistic evolutionary models. Mol. Biol. Evol. 17:32-43\n");
\r
132 if(!com.weighting) fputs("\n(equal weighting of pathways)\n",fout);
\r
134 if(com.fcommon) GetFreqs(-1, -1, f3x4, com.pi);
\r
137 printf("kappa = %.2f\n\n",com.kappa);
\r
138 /* puts("kappa?"); scanf("%lf", &com.kappa); */
\r
141 fputs("\nseq. seq. S N t kappa omega dN +- SE dS +- SE\n\n",fout);
\r
142 fprintf(fds,"%6d\n", com.ns);
\r
143 fprintf(fdn,"%6d\n", com.ns);
\r
144 fprintf(ft,"%6d\n", com.ns);
\r
145 for(is=0; is<com.ns; is++) {
\r
146 fprintf(fds,"%-*s ", wname,com.spname[is]);
\r
147 fprintf(fdn,"%-*s ", wname,com.spname[is]);
\r
148 fprintf(ft,"%-*s ", wname,com.spname[is]);
\r
149 for(js=0; js<is; js++) {
\r
150 if(noisy) printf("%3d vs. %3d\n", is+1, js+1);
\r
151 fprintf(fout, " %3d %3d ", is+1, js+1);
\r
153 if(!com.fcommon) GetFreqs(is, js, f3x4, com.pi);
\r
154 if(!com.kcommon) GetKappa();
\r
155 j = DistanceYN00(is, js, &S, &N, &dS,&dN, &SEdS, &SEdN, &t,space);
\r
157 fprintf(fout,"%7.1f %7.1f %8.4f %7.4f %7.4f %6.4f +- %6.4f %7.4f +- %6.4f\n",
\r
158 S,N,t,com.kappa,com.omega,dN,SEdN,dS,SEdS);
\r
159 fprintf(frst," YN: %8.4f%8.4f%8.4f %6.4f +- %6.4f %7.4f +- %6.4f\n",
\r
160 t,com.kappa,com.omega,dN,SEdN,dS,SEdS);
\r
162 fprintf(fds," %7.4f",dS); fprintf(fdn," %7.4f",dN); fprintf(ft," %7.4f",t);
\r
164 FPN(fds); FPN(fdn); FPN(ft);
\r
165 fflush(fds); fflush(fdn); fflush(ft);
\r
167 FPN(fds); FPN(fdn); FPN(ft);
\r
169 if(noisy) printf("\n\n(C) LWL85, LPB93 & LWLm methods\n\n");
\r
170 fprintf(fout,"\n\n(C) LWL85, LPB93 & LWLm methods\n\n");
\r
171 fprintf(fout,"Li W.-H., C.-I. Wu, Luo (1985) A new method for estimating synonymous and nonsynonymous rates of nucleotide substitutions considering the relative likelihood of nucleotide and codon changes. Mol. Biol. Evol. 2: 150-174.\n");
\r
172 fprintf(fout,"Li W-H (1993) Unbiased estimation of the rates of synonymous and nonsynonymous substitution. J. Mol. Evol. 36:96-99\n");
\r
173 fprintf(fout,"Pamilo P, Bianchi NO (1993) Evolution of the Zfx and Zfy genes - rates and interdependence between the genes. Mol. Biol. Evol. 10:271-281\n");
\r
174 fprintf(fout,"Yang Z (2006) Computational Molecular Evolution. Oxford University Press, Oxford. Eqs. 2.12 & 2.13\n");
\r
176 DistanceMatLWL85(fout);
\r
179 if(noisy) printf("\nTime used: %s\n", printtime(timestr));
\r
186 int GetOptions (char *ctlf)
\r
188 int i, nopt=9, lline=4096;
\r
189 char line[4096], *pline, opt[20], comment='*';
\r
190 char *optstr[]={"seqfile","outfile", "verbose", "noisy", "icode",
\r
191 "weighting","commonkappa", "commonf3x4", "ndata"};
\r
195 if((fctl=fopen(ctlf,"r"))==NULL) error2("\nctl file open error.\n");
\r
196 printf ("\nReading options from %s..\n", ctlf);
\r
198 if (fgets (line, lline, fctl) == NULL) break;
\r
199 for (i=0,t=0,pline=line; i<lline&&line[i]; i++)
\r
200 if (isalnum(line[i])) { t=1; break; }
\r
201 else if (line[i]==comment) break;
\r
202 if (t==0) continue;
\r
203 sscanf (line, "%s%*s%lf", opt, &t);
\r
204 if ((pline=strstr(line, "="))==NULL) error2("option file.");
\r
206 for (i=0; i<nopt; i++) {
\r
207 if (strncmp(opt, optstr[i], 8)==0) {
\r
209 printf ("\n%3d %15s | %-20s %6.2f", i+1,optstr[i],opt,t);
\r
211 case (0): sscanf(pline+2, "%s", com.seqf); break;
\r
212 case (1): sscanf(pline+2, "%s", com.outf); break;
\r
213 case (2): com.verbose=(int)t; break;
\r
214 case (3): noisy=(int)t; break;
\r
215 case (4): com.icode=(int)t; break;
\r
216 case (5): com.weighting=(int)t; break;
\r
217 case (6): com.kcommon=(int)t; break;
\r
218 case (7): com.fcommon=(int)t; break;
\r
219 case (8): com.ndata=(int)t; break;
\r
225 { printf ("\noption %s in %s\n", opt, ctlf); exit (-1); }
\r
228 for (i=0,Nsensecodon=0; i<64; i++)
\r
229 if (GeneticCode[com.icode][i]!=-1) Nsensecodon++;
\r
230 com.ncode = Nsensecodon;
\r
236 int DistanceYN00(int is, int js, double*S, double*N, double*dS,double*dN,
\r
237 double *SEdS, double *SEdN, double *t,double space[])
\r
239 /* calculates dS, dN, w, t by weighting.
\r
240 com.kappa & com.pi[] are calculated beforehand are not updated.
\r
242 int j,k,ir,nround=10, status=0;
\r
243 double fbS[4],fbN[4],fbSt[4],fbNt[4], St,Nt, Sdts,Sdtv,Ndts,Ndtv, kappaS,kappaN;
\r
244 double w0=0,dS0=0,dN0=0, accu=5e-4, minomega=1e-5,maxomega=99;
\r
247 if(com.omega<=0) com.omega=1;
\r
248 for(k=0; k<4; k++) fbS[k] = fbN[k] = 0;
\r
249 if(debug) printf("\nCountSites\n");
\r
250 if(noisy>3) printf("\n");
\r
251 for(k=0,*S=*N=0; k<2; k++) {
\r
252 CountSites(com.z[k==0?is:js], com.pi, &St, &Nt, fbSt, fbNt);
\r
255 for(j=0; j<4; j++) {
\r
256 fbS[j] += fbSt[j]/2;
\r
257 fbN[j] += fbNt[j]/2;
\r
259 if(noisy>3) printf("Seq. %d: S = %9.3f N=%9.3f\n",k+1,St,Nt);
\r
262 printf("Ave. : S = %9.3f N=%9.3f\n\n",*S,*N);
\r
263 printf("Base freqs at syn & nonsyn sites\n%10s%10s%10s%10s\n", "T", "C", "A", "G");
\r
264 for(k=0; k<4; k++) printf(" %9.6f", fbS[k]); FPN(F0);
\r
265 for(k=0; k<4; k++) printf(" %9.6f", fbN[k]); FPN(F0);
\r
268 printf(" # Sdts Sdtv Ndts Ndtv | t kappa w dN dS | kappaS kappaN\n");
\r
270 /* initial values? */
\r
271 if(com.weighting) {
\r
272 if(*t<0.001 || *t>5) *t=0.5;
\r
273 if(com.omega<0.01 || com.omega>5) com.omega=.5;
\r
275 for (ir=0; ir<(com.weighting?nround:1); ir++) { /* weighting or iteration */
\r
277 GetPMatCodon(PMat,*t,com.kappa,com.omega,space);
\r
279 for(j=0; j<com.ncode*com.ncode; j++)
\r
282 CountDiffs(com.z[is], com.z[js], &Sdts, &Sdtv, &Ndts, &Ndtv, PMat);
\r
284 if(DistanceF84(*S, Sdts/ *S, Sdtv/ *S, fbS, &kappaS, dS, SEdS)) status=-1;
\r
285 if(DistanceF84(*N, Ndts/ *N, Ndtv/ *N, fbN, &kappaN, dN, SEdN)) status=-1;
\r
289 com.omega = maxomega;
\r
292 com.omega= max2(minomega, *dN/ *dS);
\r
293 *t = *dS * 3 * *S/(*S + *N) + *dN * 3 * *N/(*S + *N);
\r
295 printf("%2d %7.2f%7.2f%7.2f%7.2f |", ir+1, Sdts,Sdtv,Ndts,Ndtv);
\r
296 printf("%8.4f%8.4f%8.4f%8.4f%8.4f", *t, com.kappa,com.omega,*dN,*dS);
\r
297 printf(" | %8.4f%8.4f\n", kappaS,kappaN);
\r
299 if(fabs(*dS-dS0)<accu && fabs(*dN-dN0)<accu && fabs(com.omega-w0)<accu)
\r
301 dS0=*dS; dN0=*dN; w0=com.omega;
\r
303 if(ir==nround) status=-2;
\r
304 /* if(status) printf("\n\tstatus: %d\n", status); */
\r
310 int Statistics(FILE *fout, double space[])
\r
312 /* This calculates base frequencies, using npatt & fpatt[]
\r
314 int h, is,j, c[3], wname=20;
\r
315 double f3x4tot[12], *fb3tot=com.pi, *fb3s=space;
\r
318 fprintf(fout, "\n\nns =%4d\tls =%4d", com.ns, com.ls);
\r
319 fprintf(fout,"\n\nCodon position x base (3x4) table for each sequence.");
\r
321 zero(f3x4tot,12); zero(fb3s,64*com.ns);
\r
322 for(is=0; is<com.ns; is++) zero(com.f3x4s[is], 12);
\r
323 for (is=0; is<com.ns; is++) {
\r
324 for (h=0; h<com.npatt; h++) {
\r
325 j = FROM61[com.z[is][h]];
\r
326 c[0]=j/16; c[1]=(j%16)/4; c[2]=j%4;
\r
327 fb3s[is*64+j] += com.fpatt[h];
\r
329 com.f3x4s[is][j*4+c[j]] += com.fpatt[h]/com.ls;
\r
331 for(j=0; j<12; j++) f3x4tot[j] += com.f3x4s[is][j]/com.ns;
\r
333 fprintf(fout,"\n\n%-*s", wname, com.spname[is]);
\r
334 for(j=0; j<3; j++) {
\r
335 fprintf (fout, "\nposition %2d:", j+1);
\r
337 fprintf (fout,"%5c:%7.5f", BASEs[h], com.f3x4s[is][j*4+h]);
\r
342 fprintf (fout, "\n\nAverage");
\r
343 for(j=0; j<3; j++) {
\r
344 fprintf (fout, "\nposition %2d:", j+1);
\r
346 fprintf (fout,"%5c:%7.5f", BASEs[h], f3x4tot[j*4+h]);
\r
348 for(is=0,zero(fb3tot,64);is<com.ns;is++)
\r
349 for(j=0; j<64; j++) fb3tot[j] += fb3s[is*64+j];
\r
350 fprintf (fout, "\n\nCodon usage for each species\n");
\r
351 printcums (fout, com.ns, fb3s, com.icode);
\r
352 fprintf (fout, "\nSums\n");
\r
353 printcums (fout, 1, fb3tot, com.icode);
\r
359 int GetFreqs(int is1, int is2, double f3x4[], double pi[])
\r
361 /* uses com.fcommon and com.f3x4s to calculate f3x4[] and pi[].
\r
362 Codon frequencies pi[] are calculated from the f3x4 table.
\r
363 The calculation is duplicated when com.fcommon=1.
\r
365 int n=com.ncode, j, k, ic, b[3];
\r
368 for(j=0,zero(f3x4,12);j<com.ns;j++)
\r
369 for(k=0; k<12; k++) f3x4[k]+=com.f3x4s[j][k]/com.ns;
\r
371 for(k=0; k<12; k++)
\r
372 f3x4[k] = (com.f3x4s[is1][k]+com.f3x4s[is2][k])/2;
\r
375 matout(F0, f3x4, 3, 4);
\r
376 for(j=0; j<n; j++) {
\r
377 ic=FROM61[j]; b[0]=ic/16; b[1]=(ic%16)/4; b[2]=ic%4;
\r
378 pi[j] = f3x4[b[0]] * f3x4[4+b[1]] * f3x4[8+b[2]];
\r
380 abyx(1/sum(pi,n), pi, n);
\r
386 int DistanceMatLWL85 (FILE *fout)
\r
388 /* This implements 3 methods: LWL85 (Li, Wu & Luo 1985), LPB (Li 1993,
\r
389 Pamilo & Bianchi 1993), and LWL85m (equation 12 in book; check other refs).
\r
392 int i,j,k, h, wname=15;
\r
393 char *codon1, *codon2, str[4]=" ";
\r
394 double L[3], sdiff[3], vdiff[3], Lt[3], sdifft[3], vdifft[3], A[3],B[3];
\r
395 double P[3],Q[3], a,b, dS,dN, pS2, S,N, Sd,Nd;
\r
397 for(i=0; i<com.ns; i++) {
\r
398 for(j=0; j<i; j++) { /* pair i and j */
\r
399 for(k=0; k<3; k++) L[k] = sdiff[k] = vdiff[k] = 0;
\r
401 for (h=0; h<com.npatt; h++) {
\r
402 codon1 = CODONs[com.z[i][h]];
\r
403 codon2 = CODONs[com.z[j][h]];
\r
404 difcodonLWL85(codon1, codon2, Lt, sdifft, vdifft, 0, com.icode);
\r
405 for(k=0; k<3; k++) {
\r
406 L[k] += Lt[k]*com.fpatt[h];
\r
407 sdiff[k] += sdifft[k]*com.fpatt[h];
\r
408 vdiff[k] += vdifft[k]*com.fpatt[h];
\r
412 for(k=0; k<3; k++) {
\r
413 P[k] = sdiff[k]/L[k];
\r
414 Q[k] = vdiff[k]/L[k];
\r
415 a = 1 - 2*P[k] - Q[k];
\r
417 A[k] = -log(a)/2 + log(b)/4;
\r
421 fprintf(fout, "\n%d (%s) vs. %d (%s)\n\n", i+1, com.spname[i], j+1, com.spname[j]);
\r
422 fprintf(fout,"L(i): %9.1f %9.1f %9.1f sum=%9.1f\n", L[0],L[1],L[2],L[0]+L[1]+L[2]);
\r
423 fprintf(fout,"Ns(i): %9.4f %9.4f %9.4f sum=%9.4f\n", sdiff[0],sdiff[1],sdiff[2], sdiff[0]+sdiff[1]+sdiff[2]);
\r
424 fprintf(fout,"Nv(i): %9.4f %9.4f %9.4f sum=%9.4f\n", vdiff[0],vdiff[1],vdiff[2], vdiff[0]+vdiff[1]+vdiff[2]);
\r
425 fprintf(fout,"A(i): %9.4f %9.4f %9.4f\n", A[0],A[1],A[2]);
\r
426 fprintf(fout,"B(i): %9.4f %9.4f %9.4f\n", B[0],B[1],B[2]);
\r
428 Sd = L[1]*A[1] + L[2]*(A[2]+B[2]);
\r
429 Nd = L[1]*B[1] + L[0]*(A[0]+B[0]);
\r
431 S = L[1]*pS2 + L[2];
\r
432 N = L[1]*(1-pS2) + L[0];
\r
435 fprintf(fout,"LWL85: dS = %7.4f dN = %7.4f w =%7.4f S =%7.1f N =%7.1f\n", dS,dN, dN/dS, S, N);
\r
436 pS2 = A[2]/(A[2]+B[2]);
\r
437 S = L[1]*pS2 + L[2];
\r
438 N = L[1]*(1-pS2) + L[0];
\r
441 fprintf(fout,"LWL85m: dS = %7.4f dN = %7.4f w =%7.4f S =%7.1f N =%7.1f (rho = %.3f)\n", dS,dN, dN/dS, S, N, pS2);
\r
443 dS = (L[1]*A[1]+L[2]*A[2])/(L[1]+L[2]) + B[2];
\r
444 dN = (L[0]*B[0]+L[1]*B[1])/(L[0]+L[1]) + A[0];
\r
445 fprintf(fout,"LPB93: dS = %7.4f dN = %7.4f w =%7.4f\n", dS, dN, dN/dS);
\r
448 if(noisy) printf(" %3d",i+1);
\r
451 if(fout) FPN(fout);
\r
459 /* This calculates mutational transition/transversion rate ratio kappa
\r
460 using 4-fold degenerate sites from pairwise comparisons
\r
461 under HKY85, weighting estimates by the numbers of sites
\r
463 int is,js,j,k,h, i1,pos,c[2],aa[2],b[2][3],a,ndeg,by[3]={16,4,1}, status=0;
\r
464 double ka[2], F[2][16],S[2],wk[2], t,P,Q,pi[4];
\r
465 /* F&S&wk [0]: non-degenerate; [1]:4-fold; S:sites */
\r
466 double kdefault=(com.kappa>0?com.kappa:(com.icode==1?10:2));
\r
467 char str1[4]=" ",str2[4]=" ", *sitestr[2]={"non-degenerate","4-fold"};
\r
469 for(is=0,com.kappa=0;is<com.ns;is++) {
\r
470 for(js=0; js<is; js++) {
\r
471 if(noisy>=9) printf ("\n%4d vs. %3d", is+1, js+1);
\r
472 for(k=0; k<2; k++) zero(F[k],16);
\r
473 for(h=0; h<com.npatt; h++) {
\r
474 c[0] = FROM61[com.z[is][h]];
\r
475 c[1] = FROM61[com.z[js][h]];
\r
476 for(k=0; k<2; k++) {
\r
478 b[k][1] = (c[k]%16)/4;
\r
480 aa[k] = GeneticCode[com.icode][c[k]];
\r
483 /* find non-degenerate sites */
\r
484 for(pos=0; pos<3; pos++) { /* check all positions */
\r
485 for(k=0,ndeg=0;k<2;k++) { /* two codons */
\r
486 for(i1=0; i1<4; i1++) {
\r
487 if(i1==b[k][pos]) continue;
\r
488 a = GeneticCode[com.icode][c[k]+(i1-b[k][pos])*by[pos]];
\r
489 if(a==aa[k]) break;
\r
494 F[0][b[0][pos]*4+b[1][pos]] += .5*com.fpatt[h];
\r
495 F[0][b[1][pos]*4+b[0][pos]] += .5*com.fpatt[h];
\r
499 /* find 4-fold degenerate sites at 3rd positions */
\r
500 for(k=0,ndeg=0;k<2;k++) { /* two codons */
\r
501 for(j=0,i1=c[k]-b[k][2]; j<4; j++)
\r
502 if(j!=b[k][2] && GeneticCode[com.icode][i1+j]!=aa[k]) break;
\r
503 if(aa[0]==aa[1] && j==4) ndeg++;
\r
505 if (ndeg<2) continue;
\r
506 F[1][b[0][2]*4+b[1][2]] += .5*com.fpatt[h];
\r
507 F[1][b[1][2]*4+b[0][2]] += .5*com.fpatt[h];
\r
509 for(k=0; k<2; k++) { /* two kinds of sites */
\r
511 if(noisy>3) printf("\n%s:\n",sitestr[k]);
\r
513 S[k] = sum(F[k],16);
\r
514 if(S[k]<=0) { wk[k]=0; continue; }
\r
515 for(j=0; j<16; j++) F[k][j]/=S[k];
\r
516 P = (F[k][0*4+1]+F[k][2*4+3])*2;
\r
517 Q = 1-(F[k][0*4+0]+F[k][1*4+1]+F[k][2*4+2]+F[k][3*4+3]) - P;
\r
519 pi[j] = sum(F[k]+j*4,4);
\r
520 DistanceF84(S[k], P,Q,pi, &ka[k], &t, NULL);
\r
521 wk[k] = (ka[k]>0?S[k]:0);
\r
523 /* matout(F0,F[k],4,4); matout(F0,pi,1,4); */
\r
526 printf("\nSPQkt:%9.4f%9.5f%9.5f%9.4f%9.4f\n",S[k],P,Q,ka[k],t);
\r
529 if(wk[0]+wk[1]==0) {
\r
532 if(noisy>3) printf("\ngot no kappa! fix it at %.4f\n",ka[0]);
\r
535 ka[0] = (ka[0]*wk[0]+ka[1]*wk[1])/(wk[0]+wk[1]);
\r
536 com.kappa += ka[0]/(com.ns*(com.ns-1.)/2);
\r
544 int CountSites(char z[],double pi[],double*Stot,double*Ntot,double fbS[],double fbN[])
\r
546 /* This calculates the total numbers of synonymous and nonsynonymous sites
\r
547 (Stot & Ntot) in the sequence z[] using com.kappa and pi[].
\r
548 It also count the base frequencies at the synonymous and nonsynonymous
\r
549 sites. Total number of sites is scaled to be equal to sequence length
\r
550 even if some changes are to stop codons. Since pi[] is scaled to sum
\r
551 to one, rates to stop codons are not considered.
\r
552 The counting goes through the sequence codon by codon, and so is different
\r
553 from the counting in codeml, which uses pi[] to count the sites.
\r
555 int h, j,k, c[2],aa[2], b[3], by[3]={16,4,1};
\r
556 double r, S,N, kappa=com.kappa;
\r
558 *Stot = *Ntot = 0;
\r
559 for(k=0; k<4; k++)
\r
560 fbS[k] = fbN[k] = 0;
\r
561 for (h=0; h<com.npatt; h++) {
\r
562 c[0] = FROM61[z[h]];
\r
563 b[0] = c[0]/16; b[1]=(c[0]%16)/4; b[2]=c[0]%4;
\r
564 aa[0] = GeneticCode[com.icode][c[0]];
\r
566 error2("stop codon");
\r
567 for (j=0,S=N=0; j<3; j++) {
\r
568 for(k=0; k<4; k++) { /* b[j] changes to k */
\r
569 if (k==b[j]) continue;
\r
570 c[1] = c[0]+(k-b[j])*by[j];
\r
571 aa[1] = GeneticCode[com.icode][c[1]];
\r
572 if(aa[1] == -1) continue;
\r
573 r = pi[FROM64[c[1]]];
\r
574 if (k+b[j]==1 || k+b[j]==5) r *= kappa; /* transition */
\r
575 if (aa[0]==aa[1]) { S += r; fbS[b[j]] += r*com.fpatt[h]; }
\r
576 else { N += r; fbN[b[j]] += r*com.fpatt[h]; }
\r
579 *Stot += com.fpatt[h]*S;
\r
580 *Ntot += com.fpatt[h]*N;
\r
582 r = 3*com.ls/(*Stot+*Ntot); *Stot*=r; *Ntot*=r;
\r
583 r = sum(fbS,4); for(k=0; k<4; k++) fbS[k] /= r;
\r
584 r = sum(fbN,4); for(k=0; k<4; k++) fbN[k] /= r;
\r
589 int GetPMatCodon(double P[],double t, double kappa, double omega, double space[])
\r
591 /* Get PMat=exp(Q*t) for weighting pathways
\r
593 int nterms=100, n=com.ncode, ic1, ic2, i,j,k, aa[2],ndiff,pos=0,from[3],to[3];
\r
594 double *Q=P, *U=space+n*n, *V=U+n*n, *Root=V+n*n, mr, spacesqrt[NCODE];
\r
596 for(i=0; i<n*n; i++) Q[i] = 0;
\r
597 for (i=0; i<n; i++) {
\r
598 ic1=FROM61[i]; from[0]=ic1/16; from[1]=(ic1/4)%4; from[2]=ic1%4;
\r
599 for(j=0; j<i; j++) {
\r
600 ic2=FROM61[j]; to[0]=ic2/16; to[1]=(ic2/4)%4; to[2]=ic2%4;
\r
601 aa[0] = GeneticCode[com.icode][ic1];
\r
602 aa[1] = GeneticCode[com.icode][ic2];
\r
603 if (aa[0]==-1 || aa[1]==-1) continue;
\r
604 for (k=0,ndiff=0; k<3; k++)
\r
605 if(from[k] != to[k]) { ndiff++; pos=k; }
\r
606 if (ndiff!=1) continue;
\r
608 if ((from[pos]+to[pos]-1)*(from[pos]+to[pos]-5)==0)
\r
610 if(aa[0] != aa[1]) Q[i*n+j] *= omega;
\r
611 Q[j*n+i] = Q[i*n+j];
\r
615 for(i=0; i<n; i++) for(j=0; j<n; j++)
\r
616 Q[i*n+j] *= com.pi[j];
\r
618 for (i=0,mr=0; i<n; i++) {
\r
619 Q[i*n+i] = -sum(Q+i*n,n);
\r
620 mr -= com.pi[i]*Q[i*n+i];
\r
623 eigenQREV(Q, com.pi, n, Root, U, V, spacesqrt);
\r
624 for(i=0; i<n; i++) Root[i] /= mr;
\r
625 PMatUVRoot(P, t, n, U, V, Root);
\r
627 testTransP(PMat, n);
\r
628 fprintf(frub,"\a\nP(%.5f)\n", t);
\r
629 for(i=0; i<n; i++,FPN(frub)) for(j=0; j<n; j++)
\r
630 fprintf(frub, " %9.5g", PMat[i*n+j]);
\r
638 int CountDiffs(char z1[],char z2[], double*Sdts,double*Sdtv,double*Ndts,double*Ndtv,double PMat[])
\r
640 /* Count the numbers of synonymous and nonsynonymous differences between
\r
641 sequences z1 and z2, weighting pathways with PMat. No weighting if PMat=NULL
\r
642 Modified from difcodon()
\r
643 dmark[i] (=0,1,2) is the i_th different codon position (i=0,1,ndiff).
\r
644 step[j] (=0,1,2) is the codon position to be changed at step j (j=0,1,ndiff).
\r
645 b[i][j] (=0,1,2,3) is the nucleotide at position j (0,1,2) in codon i (0,1)
\r
646 sts,stv,nts,ntv are syn ts & tv and nonsyn ts & tv at a codon site.
\r
647 stspath[k] stvpath[k] ntspath[k] ntvpath[k] are syn ts & tv and
\r
648 nonsyn ts & tv differences on path k (k=2,6).
\r
651 int n=com.ncode, h,i1,i2,i,k, transi, c[2],ct[2],aa[2], by[3]={16,4,1};
\r
652 int dmark[3], step[3], b[2][3], bt1[3], bt2[3];
\r
653 int ndiff, npath, nstop, stspath[6],stvpath[6],ntspath[6],ntvpath[6];
\r
654 double sts,stv,nts,ntv; /* syn ts & tv, nonsyn ts & tv for 2 codons */
\r
655 double ppath[6], sump,p;
\r
657 *Sdts = *Sdtv = *Ndts = *Ndtv = 0;
\r
658 for (h=0; h<com.npatt; h++) {
\r
659 c[0] = FROM61[z1[h]];
\r
660 c[1] = FROM61[z2[h]];
\r
661 if (c[0]==c[1]) continue;
\r
662 for(i=0; i<2; i++) {
\r
663 b[i][0]=c[i]/16; b[i][1]=(c[i]%16)/4; b[i][2]=c[i]%4;
\r
664 aa[i] = GeneticCode[com.icode][c[i]];
\r
666 if (aa[0]==-1 || aa[1]==-1)
\r
667 error2("stop codon in sequence.");
\r
668 ndiff=0; sts=stv=nts=ntv=0;
\r
669 for(k=0; k<3; k++) dmark[k] = -1;
\r
670 for(k=0; k<3; k++) if(b[0][k] != b[1][k]) dmark[ndiff++] = k;
\r
672 if(ndiff>1) npath = (ndiff==2 ? 2 : 6);
\r
674 transi = b[0][dmark[0]]+b[1][dmark[0]];
\r
675 transi = (transi==1 || transi==5);
\r
676 if (aa[0]==aa[1]) { if (transi) sts++; else stv++; }
\r
677 else { if (transi) nts++; else ntv++; }
\r
679 else { /* ndiff=2 or 3 */
\r
681 printf("\n\nh=%d %s (%c) .. ", h+1,getcodon(str,c[0]),AAs[aa[0]]);
\r
682 printf("%s (%c): ", getcodon(str,c[1]), AAs[aa[1]]);
\r
685 for(k=0; k<npath; k++) {
\r
686 if(debug==DIFF) printf("\npath %d: ", k+1);
\r
688 for(i1=0; i1<3; i1++) step[i1] = -1;
\r
690 step[0] = dmark[k];
\r
691 step[1] = dmark[1-k];
\r
696 if (step[0]<=step[1]) step[1]++;
\r
697 step[2] = 3-step[0]-step[1];
\r
699 for(i1=0; i1<3; i1++) bt1[i1] = bt2[i1]=b[0][i1];
\r
700 stspath[k] = stvpath[k] = ntspath[k] = ntvpath[k] = 0;
\r
701 /* mutations along each path */
\r
702 for (i1=0,ppath[k]=1; i1<ndiff; i1++) {
\r
703 bt2[step[i1]] = b[1][step[i1]];
\r
704 for (i2=0,ct[0]=ct[1]=0; i2<3; i2++) {
\r
705 ct[0] += bt1[i2]*by[i2];
\r
706 ct[1] += bt2[i2]*by[i2];
\r
708 ppath[k] *= PMat[ FROM64[ct[0]]*n + FROM64[ct[1]] ];
\r
709 for(i2=0; i2<2; i2++) aa[i2] = GeneticCode[com.icode][ct[i2]];
\r
711 if(debug==DIFF) printf("%s (%c) %.5f: ", getcodon(str,ct[1]),AAs[aa[1]],PMat[ct[0]*n+ct[1]]);
\r
714 nstop++; ppath[k]=0; break;
\r
716 transi = b[0][step[i1]]+b[1][step[i1]];
\r
717 transi = (transi==1 || transi==5); /* transition? */
\r
719 if(aa[0]==aa[1]) { if(transi) stspath[k]++; else stvpath[k]++; }
\r
720 else { if(transi) ntspath[k]++; else ntvpath[k]++; }
\r
721 for(i2=0; i2<3; i2++) bt1[i2] = bt2[i2];
\r
724 if(debug==DIFF) printf(" p =%.9f", ppath[k]);
\r
726 } /* for(k,npath) */
\r
727 if (npath==nstop) { /* all paths through stop codons */
\r
728 puts ("all paths through stop codons..");
\r
729 if (ndiff==2) { nts=.5; ntv=1.5; }
\r
730 else { nts=.5; ntv=2.5; }
\r
733 sump = sum(ppath,npath);
\r
735 printf("\nsump=0, npath=%4d\nh=%2d ", npath, h+1);
\r
736 printf("(%s ", getcodon(str,c[0]));
\r
737 printf("%s)", getcodon(str,c[1]));
\r
738 for(k=0; k<npath; k++) printf(" %9.6g", ppath[k]); FPN(F0);
\r
739 matout(frub, PMat, n, n);
\r
743 sump=1; FOR(k,npath) if(ppath[k]) ppath[k]=1./(npath-nstop);
\r
746 for(k=0; k<npath; k++) {
\r
748 sts += stspath[k]*p;
\r
749 stv += stvpath[k]*p;
\r
750 nts += ntspath[k]*p;
\r
751 ntv += ntvpath[k]*p;
\r
755 for(k=0; k<npath; k++) printf("\n p =%.5f", ppath[k]/sump); FPN(F0);
\r
756 printf(" syn ts & tv, nonsyn ts & tv:%9.5f%9.5f%9.5f%9.5f\n",sts,stv,nts,ntv);
\r
760 if(debug==DIFF) getchar();
\r
763 *Sdts += com.fpatt[h]*sts;
\r
764 *Sdtv += com.fpatt[h]*stv;
\r
765 *Ndts += com.fpatt[h]*nts;
\r
766 *Ndtv += com.fpatt[h]*ntv;
\r
772 int DistanceF84(double n, double P, double Q, double pi[],
\r
773 double*k_HKY, double*t, double*SEt)
\r
775 /* This calculates kappa and d from P (proportion of transitions) & Q
\r
776 (proportion of transversions) & pi under F84.
\r
777 When F84 fails, we try to use K80. When K80 fails, we try
\r
778 to use JC69. When JC69 fails, we set distance t to maxt.
\r
779 Variance formula under F84 is from Tateno et al. (1994), and briefly
\r
780 checked against simulated data sets.
\r
782 int failF84=0,failK80=0,failJC69=0;
\r
783 double tc,ag, Y,R, a=0,b=0, A=-1,B=-1,C=-1, k_F84;
\r
784 double Qsmall=min2(1e-10,0.1/n), maxkappa=999,maxt=99;
\r
787 Y=pi[0]+pi[1]; R=pi[2]+pi[3]; tc=pi[0]*pi[1]; ag=pi[2]*pi[3];
\r
788 if (P+Q>1) { *t=maxt; *k_HKY=1; return(3); }
\r
789 if (P<-1e-10 || Q<-1e-10 || fabs(Y+R-1)>1e-8) {
\r
790 printf("\nPQYR & pi[]: %9.5f%9.5f%9.5f%9.5f",P,Q,Y,R);
\r
792 error2("DistanceF84: input err.");
\r
794 if(Q<Qsmall) failF84=failK80=1;
\r
795 else if(Y<=0 || R<=0 || (tc<=0 && ag<=0)) failF84=1;
\r
797 A=tc/Y+ag/R; B=tc+ag; C=Y*R;
\r
798 a=(2*B+2*(tc*R/Y+ag*Y/R)*(1-Q/(2*C)) - P) / (2*A);
\r
800 if (a<=0 || b<=0) failF84=1;
\r
803 a=-.5*log(a); b=-.5*log(b);
\r
804 if(b<=0) failF84=1;
\r
807 *t = 4*b*(tc*(1+ k_F84/Y) + ag*(1+ k_F84/R)+C);
\r
808 *k_HKY = (B + (tc/Y+ag/R)* k_F84)/B; /* k_F84=>k_HKY85 */
\r
810 a = A*C/(A*C-C*P/2-(A-B)*Q/2);
\r
811 b = A*(A-B)/(A*C-C*P/2-(A-B)*Q/2) - (A-B-C)/(C-Q/2);
\r
812 *SEt = sqrt((a*a*P+b*b*Q-square(a*P+b*Q))/n);
\r
816 if(failF84 && !failK80) { /* try K80 */
\r
817 if (noisy>=9) printf("\na=%.5f b=%.5f, use K80\n", a,b);
\r
818 a=1-2*P-Q; b=1-2*Q;
\r
819 if (a<=0 || b<=0) failK80=1;
\r
821 a=-log(a); b=-log(b);
\r
822 if(b<=0) failK80=1;
\r
824 *k_HKY=(.5*a-.25*b)/(.25*b);
\r
828 a=1/(1-2*P-Q); b=(a+1/(1-2*Q))/2;
\r
829 *SEt = sqrt((a*a*P+b*b*Q-square(a*P+b*Q))/n);
\r
834 if((P+=Q)>=.75) { failJC69=1; P=.75*(n-1.)/n; }
\r
835 *t = -.75*log(1-P*4/3.);
\r
836 if(*t>maxt) *t=maxt;
\r
838 *SEt = sqrt(9*P*(1-P)/n) / (3-4*P);
\r
841 if(*k_HKY>maxkappa) *k_HKY=maxkappa;
\r
843 return(failF84 + failK80 + failJC69);
\r
850 double dsdnREV (int is, int js, double space[])
\r
852 /* This calculates ds and dn by recovering the Q*t matrix using the equation
\r
853 F(t) = PI * P(t) = PI * exp(Q*t)
\r
854 This is found not to work well and is not published.
\r
856 The code here is broken since I changed the coding. Codons are now coded 0, 1, ..., 60.
\r
858 int n=com.ncode, i,j, h;
\r
859 double *F=PMat, *Qt=F;
\r
860 double *Root=space+n*n,*pi=Root+n, *U=pi+n,*V=U+n*n;
\r
861 double *T1=V+n*n,*T2=T1+n*n, t, small=1e-6;
\r
863 fprintf(frst,"\npi in model\n");
\r
864 matout(frst,com.pi,1,n);
\r
866 FOR (h,com.npatt) {
\r
867 F[com.z[is][h]*n+com.z[js][h]]+=com.fpatt[h]/(2*com.ls);
\r
868 F[com.z[js][h]*n+com.z[is][h]]+=com.fpatt[h]/(2*com.ls);
\r
870 if(fabs(1-sum(F,n*n))>1e-6) error2("Sum F != 1 in dsdnREV");
\r
873 pi[i]=sum(F+i*n, n);
\r
875 if (F[i*n+i]<=small || F[i*n+i]<pi[i]/4)
\r
877 if (F[i*n+i]<=small) F[i*n+i]=1-pi[i]+F[i*n+i];
\r
878 else abyx(1/pi[i], F+i*n, n);
\r
880 if (eigen (1, F, n, Root, T1, U, V, T2)) error2 ("eigen jgl");
\r
882 matinv (V, n, n, T1);
\r
884 fprintf(frst,"\npi in data\n");
\r
885 matout (frst, pi, 1, n); FPN(F0);
\r
886 matout (frst, Root, 1, n);
\r
890 printf (" Root %d:%10.4f", i+1, Root[i]);
\r
891 Root[i]=log(Root[i]);
\r
893 FOR (i,n) FOR (j,n) T1[i*n+j]=U[i*n+j]*Root[j];
\r
894 matby (T1, V, Qt, n, n, n);
\r
895 for (i=0,t=0; i<n; i++) t-=pi[i]*Qt[i*n+i];
\r
896 if (t<=0) puts ("err: dsdnREV");
\r
898 FOR(i,n*n) Qt[i]+=1e-8; /* remove negative numbers from rounding errors */
\r
900 matout(frst,Qt,n,n);
\r
901 printf("\nt = %.5f\n", t);
\r