1 /* codeml.c (aaml.c & codonml.c)
\r
3 Maximum likelihood parameter estimation for codon sequences (seqtype=1)
\r
4 or amino-acid sequences (seqtype=2)
\r
5 Copyright, Ziheng YANG, 1993-2003
\r
7 cc -o codeml -fast codeml.c tools.o -lm
\r
8 codeml <ControlFileName>
\r
13 #define NSSITESBandits
\r
15 #define DSDN_MC_SITES 1
\r
21 #define NBRANCH (NS*2-2)
\r
22 #define NNODE (NS*2-1)
\r
23 #define MAXNSONS 100
\r
30 #define NP (NBRANCH*2+NGENE-1+2+NCODE+2)
\r
32 #define NP (NBRANCH+NGENE-1+189+2+NCODE+2)
\r
34 extern char BASEs[],AAs[];
\r
35 extern int noisy, NFunCall, NEigenQ, NPMatUVRoot, *ancestor, GeneticCode[][64];
\r
36 extern double *SeqDistance;
\r
37 extern double SS,NN,Sd,Nd; /* kostas, SS=# of syn. sites, NN=# of non-syn. sites, Sd=# of syn. subs., Nd=# of non-syn. subs. as defined in DistanceMatNG86 in treesub.c */
\r
39 int Forestry (FILE *fout);
\r
40 int GetMemPUVR(int nc, int nUVR);
\r
41 int sortwM3(double x[]);
\r
42 void DetailOutput(FILE *fout, double x[], double var[]);
\r
43 int GetOptions (char *ctlf);
\r
44 int testx (double x[], int np);
\r
45 int SetxBound (int np, double xb[][2]);
\r
46 int SetxInitials (int np, double x[], double xb[][2]);
\r
47 int GetInitials (double x[], int*fromfile);
\r
48 double *PointKappa (double xcom[], int igene);
\r
49 double *PointOmega (double xcom[], int igene, int inode, int isiteclass);
\r
50 int GetCodonFreqs (void);
\r
51 int SetParameters (double x[]);
\r
52 int SetParametersNSsites (double x[]);
\r
53 int Set_UVR_BranchSite (int iclass, int branchlabel);
\r
54 int SetPGene (int igene, int _pi, int _UVRoot, int _alpha, double x[]);
\r
55 int SetPSiteClass(int iclass, double x[]);
\r
56 int PMatJC69like (double P[], double t, int n);
\r
57 int printfcode (FILE *fout, double fb61[], double space[]);
\r
58 int InitializeCodon (FILE *fout, double space[]);
\r
59 int AA2Codonf (double faa[20], double fcodon[]);
\r
60 int DistanceMatAA (FILE *fout);
\r
61 int GetDaa(FILE *fout, double daa[]);
\r
62 void getpcodonClass(double x[], double pcodonClass[]);
\r
63 int SelectionCoefficients (FILE* fout, double kappa[], double ppi[], double omega);
\r
64 int eigenQcodon(int mode, double blength, double *S, double *dS, double *dN,
\r
65 double Root[], double U[], double V[], double *meanrate, double kappa[], double omega, double Q[]);
\r
66 int eigenQaa(FILE *fout, double Root[], double U[], double V[],double rate[]);
\r
67 int Qcodon2aa(double Qc[], double pic[], double Qaa[], double piaa[]);
\r
68 int SetAA1STEP(void);
\r
69 int GetOmegaAA(int OmegaAA[]);
\r
70 int TestModelQc(FILE *fout, double x[]);
\r
71 double lfun2dSdN(double x[], int np);
\r
72 int VariancedSdN(double t, double omega, double vtw[2*2], double vdSdN[2*2]);
\r
73 int GetCodonFreqs2 (void);
\r
74 int PairwiseCodon(FILE *fout, FILE*fds, FILE*fdn, FILE*dt, double space[]);
\r
75 int PairwiseAA(FILE *fout, FILE *f2AA);
\r
76 int lfunNSsites_rate(FILE* fout, double x[], int np);
\r
77 int lfunNSsites_M2M8(FILE* frst, double x[], int np);
\r
78 int lfunNSsites_AC(FILE* frst, double x[], int np);
\r
79 double GetBranchRate(int igene, int ibrate, double x[], int *ix);
\r
80 int GetPMatBranch(double Pt[], double x[], double t, int inode);
\r
81 int ConditionalPNode(int inode, int igene, double x[]);
\r
82 double CDFdN_dS(double x,double par[]);
\r
83 int DiscreteNSsites(double par[]);
\r
84 char GetAASiteSpecies(int species, int sitepatt);
\r
85 void finishup(void);
\r
86 int mergeSeqs(FILE*fout);
\r
87 void Get4foldSites(void);
\r
88 int AdHocRateSmoothing(FILE*fout, double x[NS*3], double xb[NS*3][2], double space[]);
\r
89 void DatingHeteroData(FILE* fout);
\r
91 int SlidingWindow(FILE*fout, FILE* fpair[], double space[]);
\r
93 void SimulateData2s61(void);
\r
95 void d4dSdN(FILE*fout);
\r
98 double logprior(double t, double w, double par[]);
\r
99 double logistic_transformation(double point, double logmean, double stdlogpar);
\r
100 double loglikelihood(double Ptmatrix[]);
\r
101 int EstVariances(double *var);
\r
102 int BayesPairwise(int is, int js, double x[], double var[], double maxlogl,
\r
103 int npoints, double xb[][2], double space[]);
\r
104 double logP(double x[], int np);
\r
105 double CDFLogis( double x, double m, double s );
\r
106 //end of kostas functions
\r
108 struct common_info {
\r
109 unsigned char *z[NS];
\r
110 char *spname[NS], seqf[512],outf[512],treef[512],daafile[512], cleandata;
\r
111 char oldconP[NNODE]; /* update conP for nodes? to save computation */
\r
112 int seqtype, ns, ls, ngene, posG[NGENE+1], lgene[NGENE], npatt,*pose, readpattern;
\r
113 int runmode,clock, verbose,print, codonf,aaDist,model,NSsites;
\r
114 int nOmega, nbtype, nOmegaType; /* branch partition, AA pair (w) partition */
\r
115 int method, icode, ncode, Mgene, ndata, bootstrap;
\r
116 int fix_rgene,fix_kappa,fix_omega,fix_alpha,fix_rho,nparK,fix_blength,getSE;
\r
117 int np, ntime, nrgene, nkappa, npi, nrate, nalpha, ncatG, hkyREV;
\r
118 size_t sconP, sspace;
\r
119 double *fpatt, *space, kappa,omega,alpha,rho,rgene[NGENE], TipDate, TipDate_TimeUnit;
\r
120 double pi[NCODE], piG[NGENE][64], fb61[64];
\r
121 double f3x4[NGENE][12], *pf3x4, piAA[20];
\r
122 double freqK[NCATG], rK[NCATG], MK[NCATG*NCATG],daa[20*20], *conP, *fhK;
\r
123 double (*plfun)(double x[],int np);
\r
124 double hyperpar[4]; /* kostas, the hyperparameters for the prior distribution of distance & omega */
\r
125 double omega_fix; /* fix the last w in the NSbranchB, NSbranch2 models
\r
126 for lineages. Useful for testing whether w>1 for some lineages. */
\r
127 int conPSiteClass; /* conPSiteClass=0 if (method==0) and =1 if (method==1)?? */
\r
129 char *nodeScale; /* nScale[ns-1] for interior nodes */
\r
130 double *nodeScaleF; /* nScaleF[npatt] for scale factors */
\r
131 /* pomega & pkappa are used to communicate between SetParameters & ConditionalPNode
\r
132 & eigenQcodon. Try to remove them? */
\r
133 double *pomega, pkappa[5], *ppi;
\r
136 int nbranch, nnode, root, branches[NBRANCH][2];
\r
140 int father, nson, sons[MAXNSONS], ibranch, ipop;
\r
141 double branch, age, omega, *conP, label;
\r
142 char *nodeStr, fossil, usefossil;
\r
143 } *nodes, **gnodes, nodes_t[2*NS-1];
\r
146 /* for sptree.nodes[].fossil: lower, upper, bounds, gamma, inverse-gamma */
\r
147 enum {LOWER_F=1, UPPER_F, BOUND_F} FOSSIL_FLAGS;
\r
148 char *fossils[]={" ", "L", "U", "B"};
\r
150 struct SPECIESTREE {
\r
151 int nbranch, nnode, root, nspecies, nfossil;
\r
153 char name[LSPNAME+1], fossil, usefossil; /* fossil: 0, 1, 2, 3 */
\r
154 int father, nson, sons[2];
\r
155 double age, pfossil[7]; /* lower and upper bounds or alpha & beta */
\r
156 double *lnrates; /* log rates for loci */
\r
159 /* all trees are binary & rooted, with ancestors unknown. */
\r
161 struct DATA { /* locus-specific data and tree information */
\r
162 int ns[NGENE], ls[NGENE], npatt[NGENE], ngene, lgene[NGENE];
\r
163 int root[NGENE+1], BlengthMethod, fix_nu, nbrate[NGENE], icode[NGENE];
\r
165 char *z[NGENE][NS], cleandata[NGENE];
\r
166 char idaafile[NGENE], daafile[NGENE][40];
\r
167 double *fpatt[NGENE], lnpT, lnpR, lnpDi[NGENE];
\r
168 double Qfactor[NGENE], pi[NGENE][NCODE];
\r
169 double rgene[NGENE], kappa[NGENE], alpha[NGENE], omega[NGENE];
\r
170 int NnodeScale[NGENE];
\r
171 char *nodeScale[NGENE]; /* nScale[data.ns[locus]-1] for interior nodes */
\r
174 extern double Small_Diff;
\r
175 int Nsensecodon, FROM61[64], FROM64[64], FourFold[4][4];
\r
176 int ChangedInIteration; /* 1: t changed, update P(t); 2: paras changed, update UVRoot */
\r
177 double *PMat, *U, *V, *Root, *_UU[NBTYPE+2], *_VV[NBTYPE+2], *_Root[NBTYPE+2];
\r
178 /* 5 sets for branchsite models (YN2002); 6 sets for clade models */
\r
180 double pcodon0[64],paa0[20], *pcodonClass; /* for aaDist=FIT1 or FIT2 */
\r
182 int BayesEB; /* =1 for site models M2a & M8; =2 for branch-site models A & C */
\r
186 int OmegaAA[190], AA1STEP[190];
\r
187 enum {DNA, AA, CODON, MORPHC} DATATYPE;
\r
189 double _rateSite=1;
\r
190 double Qfactor_NS, Qfactor_NS_branch[NBTYPE];
\r
191 int KGaussLegendreRule=16;
\r
193 double AAchem[][20+1]={ /* last element is the max */
\r
194 {8.1, 10.5, 11.6, 13, 5.5, 10.5, 12.3, 9, 10.4, 5.2,
\r
195 4.9, 11.3, 5.7, 5.2, 8, 9.2, 8.6, 5.4, 6.2, 5.9, 13}, /* p */
\r
196 { 31, 124, 56, 54, 55, 85, 83, 3, 96, 111,
\r
197 111, 119, 105, 132, 32.5, 32, 61, 170, 136, 84, 170}, /* v */
\r
198 {0, 0.65, 1.33, 1.38, 2.75, 0.89, 0.92, 0.74, 0.58,
\r
199 0, 0, 0.33, 0, 0, 0.39, 1.42, 0.71, 0.13, 0.2, 0, -999},/* c */
\r
200 {-0.11, 0.079, -0.136, -0.285, -0.184, -0.067, -0.246, -0.073, 0.32, 0.001,
\r
201 -0.008, 0.049, -0.041, 0.438, -0.016, -0.153, -0.208, 0.493, 0.381, -0.155} /* a */
\r
202 }; /* in the order p, v, c, a */
\r
205 FILE *fout, *frub, *flnf, *frst, *frst1, *frst2=NULL, *finitials;
\r
206 char *ratef="rates";
\r
207 enum {Fequal, F1x4, F3x4, Fcodon, F1x4MG, F3x4MG, FMutSel0, FMutSel} CodonFreqs;
\r
208 char *codonfreqs[]={"Fequal", "F1x4", "F3x4", "Fcodon", "F1x4MG", "F3x4MG", "FMutSel0", "FMutSel"};
\r
209 enum {NSbranchB=1, NSbranch2, NSbranch3} NSBranchModels;
\r
210 char *NSbranchmodels[]={"One dN/dS ratio",
\r
211 "free dN/dS Ratios for branches", "several dN/dS ratios for branches",
\r
213 enum {Poisson, EqualInput, Empirical, Empirical_F,
\r
214 FromCodon=6, REVaa_0=8, REVaa=9} AAModel;
\r
215 char *aamodels[]={"Poisson", "EqualInput", "Empirical", "Empirical_F", "",
\r
216 "", "FromCodon", "", "REVaa_0", "REVaa"};
\r
217 enum {NSnneutral=1, NSpselection, NSdiscrete, NSfreqs, NSgamma, NS2gamma,
\r
218 NSbeta, NSbetaw, NSbetagamma, NSbeta1gamma, NSbeta1normal, NS02normal,
\r
219 NS3normal, NSM2aRel=22, NSTgamma, NSTinvgamma, NSTgamma1, NSTinvgamma1} NSsitesModels;
\r
220 char *NSsitesmodels[]={"one-ratio","NearlyNeutral", "PositiveSelection","discrete","freqs",
\r
221 "gamma","2gamma","beta","beta&w>1","beta&gamma", "beta&gamma+1",
\r
222 "beta&normal>1", "0&2normal>0", "3normal>0", "", "", "", "", "", "", "", "",
\r
223 "M2a_rel", "Tgamma", "Tinvgamma", "Tgamma+1", "Tinvgamma+1"};
\r
224 int maxNSsitesModels=27;
\r
225 enum {FIT1=11, FIT2=12} SiteClassModels;
\r
226 enum {AAClasses=7 } aaDistModels;
\r
227 char *clockstr[]={"", "Global clock", "Local clock", "ClockCombined"};
\r
228 enum {GlobalClock=1, LocalClock, ClockCombined} ClockModels;
\r
231 #include "treesub.c"
\r
232 #include "treespace.c"
\r
235 /* variables for batch run of site models */
\r
236 int ncatG0=10, insmodel=0, nnsmodels=1, nsmodels[15]={0};
\r
237 /* used for sliding windows analysis */
\r
238 int windowsize0=20, offset0=1, npositive=0;
\r
241 int main (int argc, char *argv[])
\r
243 FILE *fseq=NULL, *fpair[6];
\r
244 char pairfs[6][32]={"2NG.dS","2NG.dN","2NG.t", "2ML.dS","2ML.dN","2ML.t"};
\r
245 char ctlf[96]="codeml.ctl", *pmodel, timestr[64];
\r
246 char *seqtypestr[3]={"CODONML", "AAML", "CODON2AAML"};
\r
247 char *Mgenestr[]={"diff. rate", "separate data", "diff. rate & pi",
\r
248 "diff. rate & k&w", "diff. rate & pi & k&w"};
\r
249 int getdistance=1, i, k, s2=0, idata, nc, nUVR, cleandata0;
\r
252 #ifdef NSSITESBandits
\r
258 printf("KGaussLegendreRule? ");
\r
259 scanf("%d", &KGaussLegendreRule);
\r
262 noisy=0; com.runmode=0;
\r
263 com.clock=0; com.fix_rgene=0; /* 0: estimate rate factors for genes */
\r
264 com.cleandata=0; /* 1: delete; 0:use missing data */
\r
266 com.model=Empirical_F;
\r
267 strcpy(com.daafile, "jones.dat");
\r
268 com.icode=0; com.nrate=0;
\r
269 com.fix_kappa=0; com.kappa=1; com.omega=2.1;
\r
270 com.fix_alpha=1; com.alpha=0.; com.ncatG=4; /* alpha=0 := inf */
\r
271 com.fix_rho=1; com.rho=0.;
\r
272 com.getSE=0; com.print=0; com.verbose=1; com.fix_blength=0;
\r
273 com.method=0; com.space=NULL;
\r
275 frub=gfopen("rub","w");
\r
276 frst=gfopen("rst","w");
\r
277 frst1=gfopen("rst1","w");
\r
280 mergeSeqs(frst); exit(0);
\r
285 #if (DSDN_MC || DSDN_MC_SITES)
\r
286 SimulateData2s61();
\r
289 if(argc>1) strncpy(ctlf, argv[1], 95);
\r
292 cleandata0 = com.cleandata;
\r
293 if(com.runmode!=-2 && com.runmode!=-3)
\r
294 finitials=fopen("in.codeml","r");
\r
298 fprintf(frst, "Supplemental results for CODEML (seqf: %s treef: %s)\n",
\r
299 com.seqf, com.treef);
\r
300 if(com.getSE==2) frst2=fopen("rst2","w");
\r
302 printf("%s in %s\n", seqtypestr[com.seqtype-1], pamlVerStr);
\r
304 fout = gfopen(com.outf, "w");
\r
306 if(noisy && com.seqtype==CODONseq)
\r
307 { printcu(F0,NULL,com.icode); puts("Nice code, uuh?"); }
\r
309 /* space for P&U&V&Root */
\r
310 if(com.clock==5 || com.clock==6)
\r
311 DatingHeteroData(fout);
\r
314 if(com.seqtype==CODONseq) {
\r
316 if(com.model>=1) nUVR = NBTYPE+2;
\r
318 else if (com.seqtype==CODONseq || com.model==FromCodon)
\r
321 GetMemPUVR(nc, nUVR);
\r
323 if((fseq=fopen(com.seqf,"r"))==NULL || com.seqf[0]=='\0') {
\r
324 printf ("\n\nSequence file %s not found!\n", com.seqf);
\r
328 /* d4dSdN(fout); */
\r
329 if (com.aaDist==AAClasses) {
\r
331 GetOmegaAA(OmegaAA);
\r
333 else if (com.seqtype==AAseq && com.model==REVaa_0)
\r
336 if(com.seqtype==1) {
\r
337 for(i=0; i<3; i++)
\r
338 fpair[i]=(FILE*)gfopen(pairfs[i],"w");
\r
339 if(com.runmode==-2 || com.runmode==-3)
\r
340 for(; i<6;i++) fpair[i]=(FILE*)gfopen(pairfs[i],"w");
\r
342 else if(com.runmode==-2)
\r
343 fpair[0]=(FILE*)gfopen("2AA.t","w");
\r
345 for (idata=0; idata<com.ndata; idata++) {
\r
347 printf ("\nData set %d ", idata+1);
\r
348 fprintf(fout, "\n\nData set %d\n", idata+1);
\r
349 fprintf(frst,"\t%d",idata+1);
\r
350 fprintf(frst1, "%d", idata+1);
\r
351 fprintf(frub,"\nData set %2d\n",idata+1);
\r
355 GetOptions(ctlf); /* warning: ndata, cleandata etc. are read again. */
\r
357 if(com.seqtype!=1) error2("batch run of site models requires codon seqs.");
\r
358 if(com.fix_omega) error2("fix omega during batch run?");
\r
359 if(com.model) error2("model should be 0 in the batch run?");
\r
360 if(com.runmode) error2("runmode?");
\r
362 /* for allocating memory com.fhK[] */
\r
363 com.NSsites=NSbetaw; com.ncatG=ncatG0+1;
\r
364 for(i=0; i<nnsmodels; i++)
\r
365 if(nsmodels[i]>=NSTgamma || nsmodels[i]<=NSTinvgamma1)
\r
366 com.ncatG = max2(com.ncatG, KGaussLegendreRule+1);
\r
367 printf("NSsites batch run (ncatG as in YNGP2000): ");
\r
368 for(i=0; i<nnsmodels; i++)
\r
369 printf(" %2d", nsmodels[i]);
\r
373 com.cleandata = cleandata0;
\r
375 /* ReadSeq may change seqtype*/
\r
376 ReadSeq((com.verbose?fout:NULL), fseq, com.cleandata, 0);
\r
379 /* AllPatterns(fout); */
\r
381 fprintf(frst1,"\t%d\t%d\t%d", com.ns, com.ls, com.npatt);
\r
386 if(com.seqtype==1 && com.npi)
\r
387 error2("codon models (estFreq) not implemented for ngene > 1");
\r
388 if(com.runmode==-2 && com.Mgene!=1) error2("use Mgene=1 for runmode=-2?");
\r
389 if(com.runmode==-3 && com.Mgene!=1) error2("use Mgene=1 for runmode=-3?");
\r
390 if(com.model) error2("NSbranchsites with ngene.");
\r
391 if(com.NSsites) error2("NSsites with ngene.");
\r
392 if(com.aaDist>=FIT1) /* because of pcodon0[] */
\r
393 { error2("ngene for amino acid fitness models"); }
\r
396 if(com.ndata==1) fclose(fseq);
\r
398 i = (com.ns*2-1)*sizeof(struct TREEN);
\r
399 if((nodes=(struct TREEN*)malloc(i))==NULL)
\r
400 error2("oom nodes");
\r
402 pmodel=(com.seqtype==CODONseq?NSbranchmodels[com.model]:aamodels[com.model]);
\r
403 fprintf(fout,"%s (in %s) %s\n",seqtypestr[com.seqtype-1], pamlVerStr, com.seqf);
\r
404 fprintf(fout,"Model: %s for branches, ", pmodel);
\r
405 if(com.clock) fprintf(fout," %s ",clockstr[com.clock]);
\r
406 if(com.seqtype==CODONseq||com.model==FromCodon) {
\r
407 if(com.fix_kappa) fprintf(fout, " kappa = %.3f fixed\n", com.kappa);
\r
408 if(com.fix_omega) fprintf(fout, " omega = %.3f fixed\n", com.omega);
\r
410 if(com.seqtype==AAseq && (com.model==Empirical||com.model==Empirical_F))
\r
411 fprintf (fout, " (%s) ", com.daafile);
\r
412 if(com.seqtype==AAseq&&com.nrate) fprintf(fout,"(nrate:%d) ", com.nrate);
\r
413 if(com.alpha && com.rho) fprintf (fout, "Auto-");
\r
414 if(com.alpha) fprintf (fout, "dGamma (ncatG=%d) ", com.ncatG);
\r
416 fprintf (fout, " (%d genes: %s) ", com.ngene, Mgenestr[com.Mgene]);
\r
418 if(com.alpha==0) com.nalpha=0;
\r
419 else com.nalpha=(com.nalpha?com.ngene:!com.fix_alpha);
\r
420 if(com.Mgene==1) com.nalpha=!com.fix_alpha;
\r
421 if(com.nalpha>1 && (!com.alpha || com.ngene==1 || com.fix_alpha))
\r
423 if(com.nalpha>1 && com.rho) error2("Malpha or rho");
\r
424 if(com.nalpha>1) fprintf (fout,"(%d gamma)", com.nalpha);
\r
426 if(com.Mgene && com.ngene==1) error2("Mgene for one gene.");
\r
427 if(com.seqtype==CODONseq) {
\r
428 fprintf (fout, "\nCodon frequency model: %s\n", codonfreqs[com.codonf]);
\r
430 fputs("Warning: Gamma model for codons. See documentation.",fout);
\r
432 if((com.seqtype==CODONseq||com.model==FromCodon)
\r
433 && (com.aaDist && com.aaDist<10 && com.aaDist!=AAClasses))
\r
434 fprintf(fout,"%s, %s\n",com.daafile,(com.aaDist>0?"geometric":"linear"));
\r
437 fprintf(fout,"Site-class models: ");
\r
438 if (nnsmodels==1) {
\r
439 fprintf(fout," %s",NSsitesmodels[com.NSsites]);
\r
440 if(com.NSsites>=NSdiscrete)fprintf(fout," (%d categories)",com.ncatG);
\r
442 if(com.nparK) fprintf(fout," & HMM");
\r
445 fprintf(fout,"\nFitness models: aaDist: %d\n",com.aaDist);
\r
447 fprintf(fout,"ns = %3d ls = %3d\n\n", com.ns, com.ls);
\r
449 com.sspace = max2(5000000,3*com.ncode*com.ncode*sizeof(double));
\r
451 if(com.sspace < 2*com.ncode*com.ncode+4*com.npatt*sizeof(double))
\r
452 com.sspace = 2*com.ncode*com.ncode+4*com.npatt*sizeof(double);
\r
454 k = com.ns*(com.ns-1)/2;
\r
456 com.sspace=max2(com.sspace,
\r
457 (int)sizeof(double)*((com.ns*2-2)*(com.ns*2-2+4+k)+k));
\r
459 if((com.space = (double*)realloc(com.space,com.sspace))==NULL) {
\r
460 printf("\nfailed to get %9lu bytes for space", com.sspace);
\r
461 error2("oom space");
\r
464 SeqDistance=(double*)realloc(SeqDistance, k*sizeof(double));
\r
465 ancestor=(int*)realloc(ancestor, k*sizeof(int));
\r
466 if(SeqDistance==NULL||ancestor==NULL) error2("oom distance&ancestor");
\r
467 for(i=0; i<k; i++) SeqDistance[i] = -1;
\r
469 if(com.seqtype==AAseq) {
\r
470 InitializeBaseAA (fout);
\r
471 if (com.model==FromCodon /* ||com.aaDist==AAClasses */)
\r
472 AA2Codonf(com.pi, com.fb61); /* get codon freqs from aa freqs */
\r
474 else { /* codon sequences */
\r
475 if(com.sspace < max2(com.ngene+1,com.ns)*(64+12+4)*sizeof(double)) {
\r
476 com.sspace = max2(com.ngene+1,com.ns)*(64+12+4)*sizeof(double);
\r
477 if((com.space = (double*)realloc(com.space,com.sspace))==NULL)
\r
478 error2("oom space for #c");
\r
480 if (InitializeCodon(fout,com.space))
\r
481 error2("giving up on stop codons");
\r
484 for(i=0; i<com.ngene; i++)
\r
485 xtoy(com.pi,com.piG[i],com.ncode);
\r
489 if(com.seqtype==CODONseq)
\r
490 DistanceMatNG86(fout,fpair[0],fpair[1],fpair[2],0);
\r
492 DistanceMatAA(fout);
\r
496 if(com.seqtype==AAseq && com.model==Poisson && !com.print)
\r
497 PatternWeightJC69like(fout);
\r
498 if(com.alpha || com.NSsites) {
\r
499 s2=com.npatt*com.ncatG*sizeof(double);
\r
500 if((com.fhK=(double*)realloc(com.fhK,s2))==NULL) error2("oom fhK");
\r
506 npositive += SlidingWindow(fout, fpair, com.space);
\r
507 FPN(frst1); fflush(frst1);
\r
511 if((com.runmode==-2 || com.runmode==-3) && com.Mgene!=1) {
\r
512 if(com.seqtype==CODONseq)
\r
513 PairwiseCodon(fout,fpair[3],fpair[4],fpair[5],com.space);
\r
515 PairwiseAA(fout, fpair[0]);
\r
518 com.sconP = 2L *com.ncode*com.npatt*sizeof(double);
\r
519 /* to be increased later in GetInitials() */
\r
520 /* com.sconP = (com.ns-1)*com.ncode*com.npatt*sizeof(double); */
\r
521 com.conP = (double*)realloc(com.conP, com.sconP);
\r
523 printf("\n%9u bytes for distance",com.ns*(com.ns-1)/2*sizeof(double));
\r
524 printf("\n%9u bytes for conP\n", com.sconP);
\r
525 printf ("%9u bytes for fhK\n%9u bytes for space\n", s2, com.sspace);
\r
527 error2("oom conP");
\r
530 for(insmodel=0; insmodel<nnsmodels; insmodel++) {
\r
531 com.NSsites = nsmodels[insmodel];
\r
532 if(com.NSsites<=NSpselection)
\r
533 com.ncatG = com.NSsites+1;
\r
534 else if(com.NSsites==NSM2aRel || com.NSsites==NSdiscrete)
\r
536 else if (com.NSsites==NSfreqs)
\r
538 else if (com.NSsites==NSbetaw||com.NSsites==NS02normal)
\r
539 com.ncatG = ncatG0 + 1;
\r
541 com.ncatG = ncatG0;
\r
542 if(com.NSsites==NSTgamma || com.NSsites==NSTinvgamma)
\r
543 com.ncatG=KGaussLegendreRule;
\r
544 if(com.NSsites==NSTgamma1 || com.NSsites==NSTinvgamma1)
\r
545 com.ncatG=KGaussLegendreRule+1;
\r
547 com.nrate = com.nkappa=(com.hkyREV?5:!com.fix_kappa);
\r
548 if(com.NSsites==0 || com.NSsites==NSbetaw) com.nrate += !com.fix_omega;
\r
549 else if(com.NSsites==NSnneutral) com.nrate ++;
\r
550 else if(com.NSsites==NSpselection || com.NSsites==NSM2aRel)
\r
551 com.nrate += 1+!com.fix_omega;
\r
552 else if(com.NSsites==NSdiscrete)
\r
553 com.nrate += com.ncatG;
\r
555 printf("\n\nModel %d: %s\n",com.NSsites, NSsitesmodels[com.NSsites]);
\r
556 fprintf(fout,"\n\nModel %d: %s",com.NSsites,NSsitesmodels[com.NSsites]);
\r
557 fprintf(frst,"\n\nModel %d: %s",com.NSsites,NSsitesmodels[com.NSsites]);
\r
558 fprintf(frub,"\n\nModel %d: %s",com.NSsites,NSsitesmodels[com.NSsites]);
\r
559 if(com.NSsites) fprintf(fout," (%d categories)",com.ncatG);
\r
562 #ifdef NSSITESBandits
\r
563 com.fix_blength = (com.NSsites>0 ? 2 : 1);
\r
564 if(com.NSsites>0) strcpy(com.treef,"M0tree");
\r
568 printf("\nTime used: %s\n", printtime(timestr));
\r
569 fprintf(fout,"\nTime used: %s\n", printtime(timestr));
\r
573 if (com.Mgene==1) MultipleGenes(fout, fpair, com.space);
\r
574 else if (com.runmode==0) Forestry(fout);
\r
575 else if (com.runmode==3) StepwiseAddition(fout, com.space);
\r
576 else if (com.runmode>=4) Perturbation(fout,(com.runmode==4),com.space);
\r
577 else StarDecomposition(fout, com.space);
\r
578 printf("\nTime used: %s\n", printtime(timestr));
\r
579 fprintf(fout,"\nTime used: %s\n", printtime(timestr));
\r
582 FPN(frst); fflush(frst);
\r
583 FPN(frst1); fflush(frst1);
\r
586 } /* for (idata) */
\r
591 printf("\nfalse positive: %6d\n", npositive);
\r
592 fprintf(frst1, " false positive: %6d\n", npositive);
\r
597 if(com.seqtype==1)
\r
598 k = ((com.runmode==-2 || com.runmode==-3) ? 6 : 3);
\r
599 else if (com.runmode==-2)
\r
601 FOR(i,k) fclose(fpair[i]);
\r
602 if(com.ndata>1 && fseq) fclose(fseq);
\r
603 fclose(fout); fclose(frub);
\r
604 if(finitials) fclose(finitials);
\r
607 for(i=0; i<com.ns; i++) free(com.z[i]);
\r
613 /* x[]: t[ntime]; rgene[ngene-1]; kappa; p[](NSsites); omega[];
\r
614 { alpha(for NSsites) !! alpha, rho || rK[], fK[] || rK[], MK[] }
\r
617 int Forestry (FILE *fout)
\r
619 static int times=0;
\r
620 FILE *ftree, *frate=NULL;
\r
621 int status=0, i,j=0,k, itree, ntree, np, iteration=1;
\r
622 int pauptree=0, haslength;
\r
623 double x[NP],xb[NP][2], xcom[NP-NBRANCH], lnL=0,lnL0=0, e=1e-8, tl=0, nchange=-1;
\r
624 double *g=NULL, *H=NULL;
\r
625 #ifdef NSSITESBandits
\r
629 if ((ftree=fopen(com.treef,"r"))==NULL) {
\r
630 printf("\ntree file %s not found.\n", com.treef);
\r
633 GetTreeFileType(ftree, &ntree, &pauptree, 0);
\r
635 frate=(FILE*)gfopen(ratef,"w");
\r
636 if (ntree>10 && com.npatt>10000 && com.print)
\r
637 puts("\nlnf file may be large");
\r
638 flnf=gfopen("lnf","w+");
\r
639 fprintf(flnf,"%6d %6d %6d\n", ntree, com.ls, com.npatt);
\r
641 if(com.seqtype==1 && com.aaDist>=FIT1) {
\r
642 xtoy(com.pi,pcodon0,64);
\r
644 FOR(i,com.ncode) paa0[GeneticCode[com.icode][FROM61[i]]]+=pcodon0[i];
\r
645 pcodonClass=(double*)malloc(com.ncatG*64*sizeof(double));
\r
646 if(pcodonClass==NULL) error2("oom pcodonClass");
\r
649 for(itree=0; ntree==-1||itree<ntree; itree++,iteration=1) {
\r
650 if(ReadTreeN(ftree,&haslength, &i,0,1))
\r
651 { puts("end of tree file."); break; }
\r
653 printf("\nTREE # %2d\n", itree+1);
\r
654 fprintf(fout,"\n\nTREE # %2d: ", itree+1);
\r
655 fprintf(flnf,"\n\n%2d\n", itree+1);
\r
656 if(com.print) fprintf (frst,"\n\nTREE # %2d\n", itree+1);
\r
657 fprintf(frub,"\n\nTREE #%2d\n", itree+1);
\r
659 if (com.fix_blength==2 && !haslength) error2("no branch lengths in tree");
\r
660 if (com.fix_blength>0 && !haslength) com.fix_blength=0;
\r
661 if (times++==0 && com.fix_blength>0 && haslength) {
\r
662 if(com.clock) puts("\nBranch lengths in tree are ignored");
\r
664 if(com.fix_blength==2)
\r
665 puts("\nBranch lengths in tree are fixed.");
\r
666 else if(com.fix_blength==1)
\r
667 puts("\nBranch lengths in tree used as initials.");
\r
668 if(com.fix_blength==1) {
\r
670 if((x[nodes[i].ibranch]=nodes[i].branch)<0)
\r
671 x[nodes[i].ibranch]=1e-5;
\r
677 nchange = MPScore(com.space);
\r
678 if(com.ns<40) { OutTreeN(F0,0,0); printf(" MP score: %.0f",nchange); }
\r
679 OutTreeN(fout,0,0); fprintf(fout," MP score: %.0f",nchange);
\r
681 if(!com.clock && nodes[tree.root].nson<=2 && com.ns>2) {
\r
682 puts("\nThis is a rooted tree, without clock. Check.");
\r
683 fputs("\nThis is a rooted tree. Please check!",fout);
\r
685 GetInitials(x, &i);
\r
688 if(noisy>=3 && np<100) matout(F0,x,1,np);
\r
689 if(i==-1) iteration = 0;
\r
690 if(np>NP || np-com.ntime>NP-NBRANCH) error2("raise NP");
\r
691 if(com.sspace < spaceming2(np)) {
\r
692 com.sspace = spaceming2(np);
\r
693 printf ("\nspace adjusted to %9u bytes\n",com.sspace);
\r
694 if((com.space=(double*)realloc(com.space,com.sspace))==NULL) {
\r
695 printf("\ntrying to get %d bytes for ming2", com.sspace);
\r
696 error2("oom space");
\r
699 printf("\nntime & nrate & np:%6d%6d%6d\n",com.ntime,com.nrate,com.np);
\r
702 if(itree && !finitials) for(i=0;i<np-com.ntime;i++) x[com.ntime+i] = xcom[i];
\r
704 if(iteration && np) {
\r
706 SetxInitials (np, x, xb); /* start within the feasible region */
\r
710 for(i=0; i<com.npatt; i++)
\r
711 com.fpatt[i] /= (double)com.ls;
\r
713 lnL = com.plfun (x,np);
\r
715 printf("\nnp =%6d", np);
\r
716 printf("\nlnL0 = %12.6f\n",-lnL);
\r
719 if(iteration && np) {
\r
720 if(com.method == 1)
\r
721 j = minB (noisy>2?frub:NULL, &lnL,x,xb, e, com.space);
\r
722 else if (com.method==3)
\r
723 j = minB2(noisy>2?frub:NULL, &lnL,x,xb, e, com.space);
\r
725 j = ming2(noisy>2?frub:NULL,&lnL,com.plfun,NULL,x,xb, com.space,e,np);
\r
727 if (j==-1 || lnL<=0 || lnL>1e7) status=-1;
\r
729 if(status) fprintf(fout,"\ncheck convergence..");
\r
732 printf("Out..\nlnL = %12.6f\n",-lnL);
\r
734 printf("%d lfun, %d eigenQcodon, %d P(t)\n",NFunCall, NEigenQ, NPMatUVRoot);
\r
736 { lnL0=lnL; FOR(i,np-com.ntime) xcom[i]=x[com.ntime+i]; }
\r
738 for (i=0; i<np-com.ntime; i++) xcom[i]=xcom[i]*.2+x[com.ntime+i]*0.8;
\r
740 if(!LASTROUND && (com.NSsites==NSpselection||com.NSsites==NSM2aRel||com.NSsites==NSdiscrete
\r
741 ||com.NSsites==NSfreqs||com.NSsites==NS3normal)) {
\r
742 /* transform back to p0, p1,... */
\r
743 k=com.ntime+com.nrgene+com.nkappa+com.npi;
\r
745 if(com.nparK) { /* HMM model for w */
\r
747 for(i=0; i<com.ncatG; i++,k+=com.ncatG-1)
\r
748 f_and_x(x+k,x+k,com.ncatG,0,0);
\r
751 j = (com.NSsites==NS3normal ? 3 : com.ncatG);
\r
752 if(com.model && com.model<=NSbranch2) j=3;
\r
753 f_and_x(x+k,x+k,j,0,0);
\r
757 if(com.NSsites==NSdiscrete && com.aaDist==0 && com.model==0)
\r
759 if(com.clock) { /* move times into x[] */
\r
760 for(i=0,j=!nodes[tree.root].fossil; i<tree.nnode; i++)
\r
761 if(i!=tree.root && nodes[i].nson && !nodes[i].fossil)
\r
762 x[j++] = nodes[i].age;
\r
765 fprintf (fout,"\nlnL(ntime:%3d np:%3d): %13.6f %+14.6f\n",
\r
766 com.ntime, np, -lnL, -lnL+lnL0);
\r
768 if(com.fix_blength<2) {
\r
769 OutTreeB(fout); FPN(fout);
\r
772 OutTreeB(fout); FPN(fout);
\r
773 if(com.fix_blength==2) {
\r
774 for(i=0; i<tree.nbranch; i++) fprintf(fout, " %8.5f", nodes[tree.branches[i][1]].branch);
\r
778 for(i=0; i<np; i++) fprintf(fout," %8.6f",x[i]);
\r
779 FPN(fout); fflush(fout);
\r
782 puts("Calculating SE's");
\r
783 if(com.sspace < np*(np+1)*sizeof(double)) {
\r
784 com.sspace = np*(np+1)*sizeof(double);
\r
785 if((com.space=(double*)realloc(com.space,com.sspace))==NULL)
\r
786 error2("oom space for SE");
\r
791 HessianSKT2004 (x, lnL, g, H);
\r
792 if(com.getSE>=2 && com.clock==0 && nodes[tree.root].nson==3) { /* g & H */
\r
793 fprintf(frst2,"\n %d\n\n", com.ns);
\r
794 OutTreeN(frst2, 1, 1); fprintf(frst2,"\n\n");
\r
795 for(i=0; i<com.ntime; i++)
\r
796 if(x[i]>0.0004 && fabs(g[i])<0.005) g[i] = 0;
\r
797 for(i=0; i<com.ntime; i++) fprintf(frst2," %9.6f", x[i]); fprintf(frst2, "\n\n");
\r
798 for(i=0; i<com.ntime; i++) fprintf(frst2," %9.6f", g[i]); fprintf(frst2, "\n\n");
\r
799 fprintf(frst2, "\nHessian\n\n");
\r
800 for(i=0; i<com.ntime; i++,FPN(frst2))
\r
801 for(j=0; j<com.ntime; j++)
\r
802 fprintf(frst2," %10.4g", H[i*np+j]);
\r
806 for(i=0; i<np*np; i++) H[i] *= -1;
\r
807 matinv(H, np, np, H+np*np);
\r
808 fprintf(fout,"SEs for parameters:\n");
\r
809 for(i=0; i<np; i++)
\r
810 fprintf(fout," %8.6f", (H[i*np+i]>0. ? sqrt(H[i*np+i]) : -1));
\r
814 if(com.seqtype==1 && com.ntime && com.clock==0)
\r
815 fprintf(fout,"\nNote: Branch length is defined as number of nucleotide substitutions per codon (not per neucleotide site).\n");
\r
817 fprintf(fout,"Note: Branch length is defined for the first gene (site partition).\n");
\r
818 fprintf(fout,"For other genes, look at \"rates for genes\".\n");
\r
821 /* if (com.clock) SetBranch (x); */
\r
822 if(com.clock && com.nbtype>1)
\r
823 fputs("\nWarning: branch rates are not yet applied in tree length and branch lengths",fout);
\r
825 fputs("\nNote: mutation rate is not applied to tree length. Tree has times, for TreeView",fout);
\r
826 for(i=0,tl=0; i<tree.nnode; i++)
\r
827 if(i!=tree.root) tl += nodes[i].branch;
\r
828 fprintf(fout,"\ntree length = %9.5f%s\n",tl,com.ngene>1?" (1st gene)":"");
\r
832 #ifdef NSSITESBandits
\r
833 if(com.NSsites==0) {
\r
834 for(i=com.ntime; i<com.np; i++) fprintf(frst1,"\t%.3f", x[i]);
\r
835 fprintf(frst1,"\t%.2f\t%.3f", tl, -lnL);
\r
837 fM0tree=(FILE*)gfopen("M0tree", (insmodel==0?"w":"a"));
\r
838 fprintf(fM0tree, "%d %d\n", com.ns, 1);
\r
839 OutTreeN(fM0tree,1,1); FPN(fM0tree);
\r
843 for(i=com.ntime; i<com.np; i++) fprintf(frst1,"\t%.3f",x[i]);
\r
844 fprintf(frst1,"\t%.3f",-lnL);
\r
848 for(i=0; i<com.np; i++) fprintf(frst1,"\t%.3f",x[i]);
\r
849 fprintf(frst1,"\t%.3f", -lnL);
\r
852 fprintf(frst1,"\t%.4f", (com.ns==2 ? x[0]*2 : 0));
\r
853 for(i=0; i<com.nkappa; i++) fprintf(frst1,"\t%.3f",x[com.ntime+i]);
\r
854 fprintf(frst1,"\t%.4f", com.omega);
\r
855 fprintf(frst1,"\t%.3f", -lnL);
\r
859 FPN(fout); OutTreeN(fout,0,1); FPN(fout);
\r
860 FPN(fout); OutTreeN(fout,1,1); FPN(fout);
\r
862 FPN(fout); OutTreeN(fout,1,PrNodeNum); FPN(fout);
\r
865 if(com.np-com.ntime || com.clock)
\r
866 DetailOutput(fout,x, H);
\r
868 if (com.seqtype==AAseq && com.model>=REVaa_0)
\r
869 eigenQaa(fout, Root, U, V, x+com.ntime+com.nrgene);
\r
872 lfunNSsites_rate(frst,x,np);
\r
874 if(com.rho==0 && com.nparK==0 && com.clock<=1)
\r
875 AncestralSeqs(frst,x);
\r
876 if(!com.NSsites && com.plfun!=lfun)
\r
877 lfunRates(frate,x,np);
\r
880 lnL = com.plfun(x,np);
\r
883 fflush(fout); fflush(flnf); fflush(frst); fflush(frst1);
\r
887 if(frate) fclose(frate);
\r
888 if (com.aaDist && com.aaDist<10 && com.aaDist!=AAClasses
\r
889 && (com.seqtype==CODONseq||com.model==FromCodon))
\r
890 printf("\n%s, %s.\n", com.daafile, (com.aaDist>0 ? "geometric" : "linear"));
\r
891 if(com.seqtype==1 && com.aaDist>=FIT1) free(pcodonClass);
\r
892 if(ntree==-1) ntree=itree;
\r
896 rell(flnf, fout, ntree);
\r
904 double *PointKappa (double xcom[], int igene)
\r
906 /* This points to the kappa parameters in xcom[], by looking at com.model,
\r
910 int nka=(com.hkyREV?5:1), nw=(com.aaDist==AAClasses?com.nOmegaType:1);
\r
912 if(com.Mgene>1 && com.Mgene>=3)
\r
913 k += igene*(nka + nw);
\r
915 if(com.fix_kappa) return(&com.kappa);
\r
920 double *PointOmega (double xcom[], int igene, int inode, int isiteclass)
\r
922 /* This points to the omega parameters in xcom[], by looking at com.model,
\r
923 com.NSsites and igene. This sometimes points to com.omega or com.rK[].
\r
924 This is called by SetParameters(), DetailOutput(), etc.
\r
926 Difficulties in using this with lfunt() etc.
\r
928 Trying to remove global variables com.pomega and com.pkappa through
\r
929 PointOmega and PointKappa, but was unsuccessful when too many changes were
\r
930 made at the same time. Perhaps look at this later. Note that some
\r
931 variables are passed over the head from lfunt() etc. to eigenQcodon().
\r
933 Ziheng Notes: 8 August 2003.
\r
936 int k = com.nrgene+com.nkappa, backfore;
\r
937 int nka=(com.hkyREV?5:1), nw=(com.aaDist==AAClasses?com.nOmegaType:1);
\r
939 if (com.seqtype!=CODONseq && com.model!=FromCodon)
\r
940 error2("should not be here.");
\r
942 if(com.NSsites==0 && com.model==0) { /* simple case: one ratio */
\r
944 if(com.fix_omega) return (&com.omega_fix); /* fix_omega */
\r
947 else if(com.Mgene>=3)
\r
948 k += igene*(nka + nw) + nka;
\r
950 else if(com.NSsites==0 && com.model) { /* branch model */
\r
951 if (com.aaDist==0) {
\r
952 if(com.fix_omega && nodes[inode].label==com.nbtype-1)
\r
953 return (&com.omega_fix);
\r
954 else k += (int)nodes[inode].label;
\r
956 else if(com.aaDist==AAClasses)
\r
957 k += (int)nodes[inode].label*com.nOmegaType;
\r
959 else if (com.NSsites && com.model==0) { /* site model */
\r
961 k += com.ncatG-1+2*isiteclass;
\r
962 else if(com.aaDist==FIT1)
\r
963 k += com.ncatG-1+4*isiteclass;
\r
964 else if(com.aaDist==FIT2)
\r
965 k += com.ncatG-1+5*isiteclass;
\r
967 return (&com.rK[isiteclass]);
\r
969 else if (com.NSsites && com.model<=NSbranch2) { /* branch&site models A&B */
\r
970 k += 2; /* skip the frequencies. */
\r
971 backfore = (int)nodes[inode].label;
\r
973 return(&com.rK[isiteclass]);
\r
974 else if(isiteclass==2) {
\r
975 if(com.fix_omega && backfore)
\r
976 return(&com.omega_fix);
\r
978 k += 2 + (com.NSsites==NSpselection?0:2) + backfore;
\r
981 else { /* NSbranch3: Clade models C and D */
\r
982 k += com.ncatG-1; /* skip the frequencies. */
\r
983 backfore = (int)nodes[inode].label;
\r
984 if(isiteclass<com.ncatG-1)
\r
985 return(&com.rK[isiteclass]);
\r
986 else if(isiteclass == com.ncatG-1) {
\r
987 if(com.fix_omega && backfore==com.nbtype-1)
\r
988 return(&com.omega_fix);
\r
990 k += 2 + (com.NSsites==NSpselection?0:2) + backfore;
\r
997 int sortwM3(double x[])
\r
999 /* sort the w values for NSsites=NSdiscrete
\r
1000 This assumes that com.freqK[] and com.rK[] have been initialized.
\r
1002 int i, k=com.ntime+com.nrgene+com.nkappa+com.npi, index[NCATG];
\r
1003 double space[NCATG];
\r
1005 if(com.NSsites!=NSdiscrete) error2("sortwM3");
\r
1006 if(fabs(1-sum(com.freqK,com.ncatG))>1e-6) error2("sortwM3: freqK");
\r
1008 if(com.nparK) { puts("\asortwM3 for HMM not implemented yet.."); return(-1); }
\r
1010 indexing(com.rK, com.ncatG, index, 0, (int*)space);
\r
1011 xtoy(com.rK,space,com.ncatG);
\r
1012 FOR(i,com.ncatG) com.rK[i]=space[index[i]];
\r
1013 xtoy(com.freqK,space,com.ncatG);
\r
1014 FOR(i,com.ncatG) com.freqK[i]=space[index[i]];
\r
1015 FOR(i,com.ncatG-1) x[k+i]=com.freqK[i];
\r
1016 FOR(i,com.ncatG) x[k+com.ncatG-1+i]=com.rK[i];
\r
1021 void printParametersNSsites (FILE* fout, double x[])
\r
1023 int i,j, k=com.ntime+com.nrgene+com.nkappa+com.npi;
\r
1024 double w[NBTYPE][3];
\r
1026 if(!com.NSsites) error2("should not be here");
\r
1028 fprintf(fout,"\n\ndN/dS (w) for site classes (K=%d)\n",com.ncatG);
\r
1029 if(com.model==0) {
\r
1030 fputs("\np: ",fout); for(i=0; i<com.ncatG; i++) fprintf(fout," %8.5f", com.freqK[i]);
\r
1031 fputs("\nw: ",fout); for(i=0; i<com.ncatG; i++) fprintf(fout," %8.5f", com.rK[i]);
\r
1033 if(com.freqK[i] < 1e-5 && com.rK[i] > 1)
\r
1034 fprintf(fout,"\n(note that p[%d] is zero)\n", i);
\r
1036 else if(com.model<=NSbranch2) {
\r
1037 fprintf(fout,"\nsite class 0 1 2a 2b");
\r
1038 fprintf(fout,"\nproportion ");
\r
1039 for(i=0; i<com.ncatG; i++) fprintf(fout," %8.5f", com.freqK[i]);
\r
1040 fprintf(fout,"\nbackground w ");
\r
1041 for(i=0; i<com.ncatG; i++) fprintf(fout," %8.5f", com.rK[i%2]);
\r
1042 fprintf(fout,"\nforeground w ");
\r
1043 for(i=0; i<com.ncatG-2; i++) fprintf(fout," %8.5f", com.rK[i%2]);
\r
1044 for(i=0; i<2; i++) fprintf(fout," %8.5f", (com.fix_omega?com.omega_fix:x[com.np-1]));
\r
1046 if(com.freqK[2] < 1e-5 && com.rK[2] > 1)
\r
1047 fprintf(fout, "\n(note that p[2] is zero)\n");
\r
1050 else if (com.model==NSbranch3) {
\r
1051 k += com.ncatG-1 + (com.NSsites==3 && com.ncatG>2) + 1; /* freqs & w0 & w1 */
\r
1052 for(i=0; i<com.nbtype; i++) {
\r
1053 for(j=0; j<com.ncatG-1; j++)
\r
1054 w[i][j] = com.rK[j];
\r
1055 w[i][com.ncatG-1] = (i==com.nbtype-1 && com.fix_omega ? com.omega_fix : x[k++]);
\r
1058 fprintf(fout,"\nsite class ");
\r
1059 for(i=0; i<com.ncatG; i++) fprintf(fout," %9d", i);
\r
1060 fprintf(fout,"\nproportion ");
\r
1061 for(i=0; i<com.ncatG; i++) fprintf(fout, " %9.5f", com.freqK[i]);
\r
1062 for(i=0; i<com.nbtype; i++) {
\r
1063 fprintf(fout,"\nbranch type %d: ", i);
\r
1064 for(j=0; j<com.ncatG; j++) fprintf(fout," %9.5f", w[i][j]);
\r
1067 if(com.freqK[i] < 1e-5) fprintf(fout,"\n(note that p[%d] is zero)\n", i);
\r
1069 fprintf(fout, "\n");
\r
1072 static int ijAAref=19*20+9;
\r
1073 /* reference aa pair: VI (for REVaa, REVaa_0, AAClasses to estimate Grantham)
\r
1074 The rate for this pair is set to 1, and other rates are relative to it.
\r
1076 #define E1N(m,s) (s/sqrt(PI*2)*exp(-square((1-m)/s)/2)+m*(1-CDFNormal((1-m)/s)))
\r
1079 void DetailOutput (FILE *fout, double x[], double var[])
\r
1081 /* var[] is used for codon models if com.getSE=1 to calculate the variances
\r
1084 int i,j,k=com.ntime, np=com.np,npclass, ibtype;
\r
1085 double om=-1,N=-1,S=0,dN=0,dS=0,dSt,dNt, mr=0, vtw[4],vSN[4], omclass[NCATG];
\r
1086 double phi1=0,phi2=0, t, *tdSdNb=NULL, y;
\r
1087 double mu[3]={0,1,2},sig[3]={-1}; /* 3normal: mu0=0 fixed. mu2 estimated */
\r
1090 fprintf(fout,"\nDetailed output identifying parameters\n");
\r
1092 if(com.clock) OutputTimesRates(fout, x, var);
\r
1095 fprintf (fout, "\nrates for %d genes:%6.0f", com.ngene, 1.);
\r
1096 for(i=0; i<com.nrgene; i++)
\r
1097 fprintf (fout, " %8.5f", x[k++]);
\r
1101 if (com.seqtype==CODONseq || com.model==FromCodon) {
\r
1103 fprintf(fout,"a (TC) & b (TA) & c (TG) & d (CA) & e (CG): ");
\r
1104 FOR(i,5) fprintf(fout,"%8.5f ", x[k++]); FPN(fout);
\r
1106 else if (!com.fix_kappa && com.Mgene<=2)
\r
1107 fprintf(fout,"\nkappa (ts/tv) = %8.5f\n", x[k++]);
\r
1110 if (com.codonf==F1x4 || com.codonf==F1x4MG || com.codonf>=FMutSel0) {
\r
1111 for(j=0,fb3x4[3]=1; j<3; j++) fb3x4[j] = x[k+j];
\r
1112 abyx(1/sum(fb3x4,4), fb3x4, 4);
\r
1113 fprintf(fout, "\nFrequency parameters:\n");
\r
1115 fprintf(fout, " %9.5f (%c)", fb3x4[j], BASEs[j]);
\r
1117 if(com.codonf==FMutSel)
\r
1118 for(j=0; j<4; j++)
\r
1119 fprintf(frst1, "\t%.4f", fb3x4[j]);
\r
1121 else if (com.codonf==F3x4 || com.codonf==F3x4MG) {
\r
1122 for(j=0;j<3;j++) {
\r
1123 xtoy(x+k+j*3, fb3x4+j*4, 3);
\r
1125 abyx(1/sum(fb3x4+j*4,4), fb3x4+j*4, 4);
\r
1127 fprintf(fout, "\nCodon frequency model: %s", codonfreqs[com.codonf]);
\r
1128 fprintf(fout, "\nFrequency parameters:\n");
\r
1129 for(i=0; i<3; i++,FPN(fout)) {
\r
1130 fprintf(fout, "Position %d: ", i+1);
\r
1132 fprintf(fout, " %9.5f (%c)", fb3x4[i*4+j], BASEs[j]);
\r
1135 if(com.npi>3 || com.codonf!=FMutSel) {
\r
1136 fprintf(fout, "\nEquilibrium codon frequencies (evolver-style):\n");
\r
1137 for(j=0; j<64; j++) {
\r
1138 fprintf(fout," %11.8f", GeneticCode[com.icode][j]==-1?0:com.pi[FROM64[j]]);
\r
1139 if((j+1)%4==0) FPN(fout);
\r
1142 if(com.npi>3 && com.codonf>=FMutSel0) {
\r
1143 if(com.codonf==FMutSel0) {
\r
1144 fprintf(fout, "\nEquilibrium amino acid frequencies:\n");
\r
1145 for(j=0; j<20; j++) {
\r
1146 fprintf(fout," %11.8f", com.piAA[j]);
\r
1147 if((j+1)%10==0) FPN(fout);
\r
1149 i = GeneticCode[com.icode][FROM61[com.ncode-1]];
\r
1150 y = (i == 19 ? 0 : com.ppi[3+i]);
\r
1151 fprintf(fout, "\nfitness for %d codons (amino acid %c has fitness 0)\n", com.ncode, AAs[i]);
\r
1152 for(j=0; j<com.ncode; j++) {
\r
1153 i = GeneticCode[com.icode][FROM61[j]];
\r
1154 fprintf(fout," %9.6f", (i == 19 ? 0 : com.ppi[3+i])-y);
\r
1158 fprintf(fout, "\nfitness for %d codons (GGG has fitness 0)\n", com.ncode-1);
\r
1159 for(j=0; j<com.ncode-1; j++)
\r
1160 fprintf(fout," %9.6f", com.ppi[3+j]);
\r
1167 if(com.codonf == FMutSel)
\r
1168 SelectionCoefficients(frst, com.pkappa, com.ppi, com.omega);
\r
1171 /* dN/dS by averaging over site classes.
\r
1172 Qfactor_NS was calculated during ML iteration and is used here..
\r
1174 if(com.NSsites && com.model==0) {
\r
1175 for(j=0,dS=dN=0; j<com.ncatG; j++) {
\r
1178 com.pomega = x+k+com.ncatG-1+2*j;
\r
1179 else if(com.aaDist >= FIT1) {
\r
1180 com.pomega = x+k+com.ncatG-1+j*(4+(com.aaDist==FIT2));
\r
1181 xtoy(pcodonClass+j*64, com.pi, com.ncode);
\r
1185 eigenQcodon(2,1,&S,&dSt,&dNt,NULL,NULL,NULL, &mr, com.pkappa,com.rK[j],PMat);
\r
1186 /* t=1 used here, and dS & dN used later for each branch */
\r
1187 dS += com.freqK[j]*dSt;
\r
1188 dN += com.freqK[j]*dNt;
\r
1189 omclass[j] = dNt/dSt;
\r
1198 if(!com.fix_omega && com.NSsites==0 && com.model==0 && com.aaDist!=7 && com.Mgene<=2)
\r
1199 fprintf(fout,"\nomega (dN/dS) = %8.5f\n", x[k++]);
\r
1201 /* dN/dS rate ratios for classes */
\r
1202 if (com.NSsites >= NSgamma) {
\r
1203 fprintf(fout,"\nParameters in M%d (%s):\n ", com.NSsites, NSsitesmodels[com.NSsites]);
\r
1204 if(com.NSsites == NSgamma)
\r
1205 fprintf(fout," a=%9.5f b=%9.5f\n",x[k],x[k+1]);
\r
1206 else if(com.NSsites == NS2gamma)
\r
1207 fprintf(fout," p0 = %9.5f a0 = %9.5f b0 = %9.5f\n(p1 = %9.5f) a1 = %9.5f (b1 = %9.5f)\n",
\r
1208 x[k],x[k+1],x[k+2], 1-x[k], x[k+3],x[k+3]);
\r
1209 else if(com.NSsites == NSbeta)
\r
1210 fprintf(fout,"p = %9.5f q = %9.5f\n",x[k],x[k+1]);
\r
1211 else if(com.NSsites == NSbetaw)
\r
1212 fprintf(fout," p0 = %9.5f p = %9.5f q = %9.5f\n (p1 = %9.5f) w = %9.5f\n",
\r
1213 x[k],x[k+1],x[k+2], 1-x[k], (com.fix_omega?com.omega:x[k+3]));
\r
1214 else if(com.NSsites == NSbetagamma)
\r
1215 fprintf(fout," p0 = %9.5f p = %9.5f q = %9.5f\n(p1 = %9.5f) a = %9.5f b = %9.5f\n",
\r
1216 x[k],x[k+1],x[k+2], 1-x[k], x[k+3],x[k+4]);
\r
1217 else if(com.NSsites == NSbeta1gamma)
\r
1218 fprintf(fout," p0 = %9.5f p = %9.5f q = %9.5f\n(p1 = %9.5f) a = %9.5f b = %9.5f\n",
\r
1219 x[k],x[k+1],x[k+2], 1-x[k], x[k+3],x[k+4]);
\r
1220 else if(com.NSsites == NSbeta1normal)
\r
1221 fprintf(fout," p0 = %9.5f p = %9.5f q = %9.5f\n(p1 = %9.5f) u = %9.5f s = %9.5f\n",
\r
1222 x[k],x[k+1],x[k+2], 1-x[k], x[k+3],x[k+4]);
\r
1223 else if(com.NSsites == NS02normal)
\r
1224 fprintf(fout,"p0 = %9.5f p1 = %9.5f u2 = %9.5f s1 = %9.5f s2 = %9.5f\n",
\r
1225 x[k],x[k+1],x[k+2],x[k+3],x[k+4]);
\r
1226 else if(com.NSsites == NS3normal)
\r
1227 fprintf(fout,"p0 = %9.5f p1 = %9.5f (p2 = %9.5f)\n u2 = %9.5f s0 = %9.5f s1 = %9.5f s2 = %9.5f\n",
\r
1228 x[k],x[k+1], 1-x[k]-x[k+1], x[k+2],x[k+3],x[k+4],x[k+5]);
\r
1229 else if(com.NSsites == NSTgamma)
\r
1230 fprintf(fout,"alpha = %9.5f beta = %9.5f T = %9.5f\n", x[k],x[k+1],(com.fix_omega ? com.omega_fix : x[k+2]));
\r
1231 else if(com.NSsites == NSTinvgamma)
\r
1232 fprintf(fout,"alpha = %9.5f beta = %9.5f T = %9.5f\n", x[k],x[k+1],(com.fix_omega ? com.omega_fix : x[k+2]));
\r
1233 else if(com.NSsites == NSTgamma1)
\r
1234 fprintf(fout,"p0 = %9.5f (p1 = %9.5f) alpha = %9.5f beta = %9.5f T = %9.5f\n", x[k],1-x[k],x[k+1],x[k+2],(com.fix_omega ? com.omega_fix : x[k+3]));
\r
1235 else if(com.NSsites==NSTinvgamma1)
\r
1236 fprintf(fout,"p0 = %9.5f (p1 = %9.5f) alpha = %9.5f beta = %9.5f T = %9.5f\n", x[k],1-x[k],x[k+1],x[k+2],(com.fix_omega ? com.omega_fix : x[k+3]));
\r
1239 if (com.NSsites==NSdiscrete && com.aaDist) { /* structural site classes */
\r
1240 npclass=(com.aaDist<10 ? 2 : (com.aaDist==FIT1?4:5));
\r
1241 fprintf(fout,"\nParameters in each class (%d)",npclass);
\r
1242 fprintf(fout,"%s:\n\n",
\r
1243 (com.aaDist<10 ? "(b, a)" : "(a_p, p*, a_v, v*, b)"));
\r
1244 for(j=0,k+=com.ncatG-1; j<com.ncatG; j++,FPN(fout)) {
\r
1245 fprintf(fout,"%d: f=%8.5f, ",j+1,com.freqK[j]);
\r
1246 FOR(i,npclass) fprintf(fout,"%9.5f",x[k++]);
\r
1247 fprintf(fout," dN/dS = %7.5f", omclass[j]);
\r
1250 else if (com.NSsites && com.aaDist==0) {
\r
1251 printParametersNSsites(fout,x);
\r
1253 fprintf(fout,"\nTransition matrix M in HMM: M_ij=Prob(i->j):\n");
\r
1254 matout(fout, com.MK, com.ncatG, com.ncatG);
\r
1257 else if(com.aaDist && com.aaDist<=6) { /* one class (YNH98, Genetics) */
\r
1258 k = com.ntime+com.nrgene+com.nkappa+com.npi;
\r
1259 fprintf (fout,"\nb = %9.5f", x[k++]);
\r
1260 if (com.seqtype==CODONseq) fprintf (fout,"\na = %9.5f\n", x[k++]);
\r
1262 else if(com.aaDist && com.aaDist>=11) { /* fitness, one class */
\r
1263 fprintf (fout,"\nfitness model (a_p, p*, a_v, v*, (and w0 for FIT2):\n");
\r
1264 k = com.ntime+com.nrgene+com.nkappa+com.npi;
\r
1265 FOR(i,4+(com.aaDist==FIT2)) fprintf(fout," %9.5f",x[k++]); FPN(fout);
\r
1267 else if(com.model==0 && com.NSsites==0 && !com.fix_omega && com.Mgene>2) {
\r
1268 if(!com.fix_kappa && !com.fix_omega) {
\r
1269 for(i=0; i<com.ngene; i++,k+=2)
\r
1270 fprintf(fout,"\ngene #%2d: kappa = %9.5f omega = %9.5f", i+1, x[k], x[k+1]);
\r
1272 else if(com.fix_kappa) {
\r
1273 for(i=0; i<com.ngene; i++,k++)
\r
1274 fprintf(fout,"\ngene #%2d: omega = %9.5f", i+1, x[k]);
\r
1276 else if(com.fix_omega) {
\r
1277 for(i=0; i<com.ngene; i++,k++)
\r
1278 fprintf(fout,"\ngene #%2d: kappa = %9.5f", i+1, x[k]);
\r
1285 for(j=0; j<com.nalpha; j++) {
\r
1286 if (!com.fix_alpha)
\r
1287 fprintf(fout,"\nalpha (gamma, K = %d) = %8.5f", com.ncatG,(com.alpha=x[k++]));
\r
1289 DiscreteGamma(com.freqK,com.rK,com.alpha,com.alpha,com.ncatG,DGammaUseMedian);
\r
1291 fprintf(fout,"\nrate: "); FOR(i,com.ncatG) fprintf(fout," %8.5f",com.rK[i]);
\r
1292 fprintf(fout,"\nfreq: "); FOR(i,com.ncatG) fprintf(fout," %8.5f",com.freqK[i]);
\r
1296 if (!com.fix_rho) fprintf (fout, "rho (correlation) = %8.5f\n", x[k]);
\r
1297 fprintf (fout, "transition probabilities between rate categories:\n");
\r
1298 for(i=0;i<com.ncatG;i++,FPN(fout)) FOR(j,com.ncatG)
\r
1299 fprintf(fout," %8.5f",com.MK[i*com.ncatG+j]);
\r
1302 if (com.aaDist==AAClasses) {
\r
1303 if(com.model==0) {
\r
1304 fprintf (fout, "\nw (dN/dS) classes for amino acid pairs:\n");
\r
1305 for(k=0; k<com.nOmegaType; k++) {
\r
1306 fprintf (fout, " %9.5f: ", x[com.ntime+com.nrgene+com.nkappa+k]);
\r
1307 for(i=0; i<20; i++) for(j=0; j<i; j++)
\r
1308 if (OmegaAA[i*(i-1)/2+j]==k) fprintf(fout," %c%c", AAs[i],AAs[j]);
\r
1309 if (k==0) fprintf(fout, " (background ratio)");
\r
1312 /* output for bubble plot */
\r
1313 if(com.seqtype==1) {
\r
1314 for(i=0; i<20; i++) for(j=0; j<i; j++) {
\r
1316 if(AA1STEP[i*(i-1)/2+j])
\r
1317 y = x[com.ntime+com.nrgene+com.nkappa + OmegaAA[i*(i-1)/2+j]]; /* omega */
\r
1318 fprintf(frst, "%c%c %3d %3d %8.5f\n", AAs[i], AAs[j], i+1, j+1, y);
\r
1323 fprintf (fout, "\nw (dN/dS) for branch-type and amino acid class:\n");
\r
1324 k = com.ntime+com.nrgene+com.nkappa+com.npi;
\r
1325 for(i=0; i<com.nbtype; i++) {
\r
1326 fprintf(fout, "Branch type %d: ", i);
\r
1327 for(j=0; j<com.nOmegaType; j++) {
\r
1328 fprintf (fout, " %9.5f", x[k++]);
\r
1335 /* dN & dS for each branch in the tree */
\r
1336 if(com.seqtype==CODONseq && com.ngene==1 && (com.model==0 || com.NSsites==0)
\r
1337 /*||com.model==FromCodon||com.aaDist==AAClasses */){
\r
1338 tdSdNb = (double*)malloc(tree.nnode*3*sizeof(double));
\r
1339 if(tdSdNb==NULL) error2("oom DetailOutput");
\r
1340 if(com.model && com.aaDist!=AAClasses ) { /* branch models */
\r
1341 fprintf(fout, "\nw (dN/dS) for branches: ");
\r
1342 k = com.ntime+com.nrgene+com.nkappa+com.npi;
\r
1343 for(i=0; i<com.nOmega-1; i++)
\r
1344 fprintf(fout, " %7.5f", x[k+i]);
\r
1345 fprintf(fout, " %7.5f", (com.fix_omega ? com.omega : x[k+i]));
\r
1348 fputs("\ndN & dS for each branch\n\n",fout);
\r
1349 fprintf(fout,"%7s%11s%8s%8s%8s%8s%8s %5s %5s\n\n",
\r
1350 "branch","t","N","S","dN/dS","dN","dS","N*dN","S*dS");
\r
1351 for(i=0,dNt=dSt=0; i<tree.nbranch; i++) {
\r
1352 fprintf(fout,"%4d..%-3d ",tree.branches[i][0]+1,tree.branches[i][1]+1);
\r
1353 k = com.ntime+com.nrgene+com.nkappa+com.npi;
\r
1354 /* if(com.codonf >= FMutSel0)
\r
1355 com.ppi = x+com.ntime+com.nrgene+com.nkappa;
\r
1357 t = nodes[tree.branches[i][1]].branch;
\r
1359 if(com.NSsites==0) {
\r
1360 if (com.aaDist) om=-1; /* not used in eigenQcodon() */
\r
1361 else if (com.model==0 || com.model==FromCodon)
\r
1362 om = (com.fix_omega?com.omega:x[k]);
\r
1363 else if (com.model==NSbranchB) om = x[k+i];
\r
1364 else if (com.model==NSbranch2) om = nodes[tree.branches[i][1]].omega;
\r
1366 if(com.model && com.aaDist)
\r
1367 com.pomega = x + com.ntime + com.nrgene + !com.fix_kappa + com.npi
\r
1368 + (int)nodes[tree.branches[i][1]].label*com.nOmegaType;
\r
1371 eigenQcodon(2,t,&S,&dS,&dN, NULL,NULL,NULL, &mr, com.pkappa,om,PMat); /* PMat destroyed! */
\r
1374 if (com.aaDist) om = dN/dS;
\r
1376 if(dS<.01/com.ls) om = -1;
\r
1377 else if(om==-1) om = dN/dS;
\r
1378 if(com.model==0) om = com.omega;
\r
1383 tdSdNb[tree.nnode+i] = dS;
\r
1384 tdSdNb[tree.nnode*2+i] = dN;
\r
1387 fprintf(fout," %7.3f %7.1f %7.1f %7.4f %7.4f %7.4f %5.1f %5.1f",
\r
1388 t,N,S,om,dN,dS,N*dN,S*dS);
\r
1389 /* fprintf(frst,"%8.1f%8.1f %9.5f%9.4f%9.4f",N,S,om,dN,dS); */
\r
1391 /* om not used in AAClasses model */
\r
1392 if(com.getSE>1&&com.fix_blength<2&&!com.clock&&com.aaDist!=AAClasses){
\r
1393 vtw[0] = var[i*np+i];
\r
1394 vtw[3] = var[k*np+k];
\r
1395 vtw[1] = vtw[2] = var[i*np+k];
\r
1396 VariancedSdN(t, om, vtw, vSN);
\r
1397 fprintf(fout," dN = %7.4f +- %.4f dS = %7.4f +- %.4f",
\r
1398 dN,(vSN[3]>0?sqrt(vSN[3]):-0),dS,(vSN[0]>0?sqrt(vSN[0]):-0));
\r
1399 fprintf(fout," (method 2)");
\r
1403 else if(com.model==0) { /* NSsites & other site-class models */
\r
1404 fprintf(fout,"%9.3f %8.1f %8.1f %8.4f %8.4f %8.4f %6.1f %6.1f\n",
\r
1405 t,N,S,om,dN*t,dS*t, N*dN*t,S*dS*t);
\r
1407 else { /* NSbranchsites models */
\r
1411 if(com.NSsites==0) {
\r
1412 fprintf(fout,"\ntree length for dN: %12.4f\ntree length for dS: %12.4f\n", dNt,dSt);
\r
1414 fprintf(frst1,"\t%.4f\t%.4f", dNt, dSt);
\r
1417 if(com.model && com.NSsites==0) {
\r
1418 fprintf(fout,"\ndS tree:\n");
\r
1419 for(i=0; i<tree.nbranch; i++)
\r
1420 nodes[tree.branches[i][1]].branch = tdSdNb[tree.nnode+i];
\r
1421 OutTreeN(fout,1,1);
\r
1422 fprintf(fout,"\ndN tree:\n");
\r
1423 for(i=0; i<tree.nbranch; i++)
\r
1424 nodes[tree.branches[i][1]].branch = tdSdNb[tree.nnode*2+i];
\r
1425 OutTreeN(fout,1,1); FPN(fout);
\r
1426 /* revert branch lengths to the original values */
\r
1427 for(i=0; i<tree.nbranch; i++)
\r
1428 nodes[tree.branches[i][1]].branch = tdSdNb[i];
\r
1431 /* the first label is the label assigned in the tree file. The second is w ratio */
\r
1432 if(com.aaDist==0) {
\r
1433 fprintf(fout,"\nw ratios as labels for TreeView:\n");
\r
1434 OutTreeN(fout, 1, PrOmega); FPN(fout);
\r
1437 } /* if codonseqs */
\r
1439 FPN(fout); fflush(fout);
\r
1444 void ReadNSsitesModels(char *line)
\r
1446 /* This reads the line NSsites = 0 1 2 3 7 8 in codeml.ctl.
\r
1451 if ((pline=strstr(line, "="))==NULL) error2(".ctl file error NSsites");
\r
1453 for (nnsmodels=0; nnsmodels<maxNSsitesModels; nnsmodels++) {
\r
1454 if(sscanf(pline, "%d", &nsmodels[nnsmodels]) != 1) break;
\r
1455 for(pop_digit=0; ; ) {
\r
1456 if(isdigit(*pline)) { pline++; pop_digit=1; }
\r
1457 else if(isspace(*pline)) {
\r
1459 if(pop_digit) break;
\r
1461 else error2(".ctl file NSsites line strange.");
\r
1463 if(nsmodels[nnsmodels]<0 || nsmodels[nnsmodels]>=maxNSsitesModels)
\r
1464 error2("NSsites model");
\r
1466 com.NSsites=nsmodels[0];
\r
1470 int ReadDaafiles(char *line)
\r
1472 /* This reads the daa files and set up the eigen matrices U,V,Root for combined
\r
1473 clock analyses of multiple protein data sets (clock = 5 or 6).
\r
1475 int i, ng=(com.ndata>1?com.ndata:NGENE), markgenes[NGENE];
\r
1477 splitline(line, markgenes);
\r
1478 for(i=0; i<ng; i++) {
\r
1479 if(!isalnum(line[markgenes[i]])) break;
\r
1480 sscanf(line+markgenes[i], "%s", data.daafile[i]);
\r
1481 printf("protein %2d uses %s\n", i+1, data.daafile[i]);
\r
1487 int GetOptions (char *ctlf)
\r
1489 int iopt, i,j, nopt=37, lline=255;
\r
1490 char line[255], *pline, opt[99], *comment="*#";
\r
1491 char *optstr[] = {"seqfile", "outfile", "treefile", "seqtype", "noisy",
\r
1492 "cleandata", "runmode", "method",
\r
1493 "clock", "TipDate", "getSE", "RateAncestor", "CodonFreq", "estFreq", "verbose",
\r
1494 "model", "hkyREV", "aaDist","aaRatefile",
\r
1495 "NSsites", "NShmm", "icode", "Mgene", "fix_kappa", "kappa",
\r
1496 "fix_omega", "omega", "fix_alpha", "alpha","Malpha", "ncatG",
\r
1497 "fix_rho", "rho", "ndata", "bootstrap", "Small_Diff", "fix_blength"};
\r
1500 int ng=-1, markgenes[NGENE+99];
\r
1501 char *daafiles[]={"", "grantham.dat", "miyata.dat",
\r
1502 "g1974c.dat","g1974p.dat","g1974v.dat","g1974a.dat"};
\r
1504 /* kostas, default prior for t & w */
\r
1505 com.hyperpar[0]=1.1; com.hyperpar[1]=1.1; com.hyperpar[2]=1.1; com.hyperpar[3]=2.2;
\r
1507 fctl=gfopen(ctlf,"r");
\r
1508 if (noisy) printf ("\n\nReading options from %s..\n", ctlf);
\r
1510 if (fgets(line, lline, fctl) == NULL) break;
\r
1511 for (i=0,t=0,pline=line; i<lline&&line[i]; i++)
\r
1512 if (isalnum(line[i])) { t=1; break; }
\r
1513 else if (strchr(comment,line[i])) break;
\r
1514 if (t==0) continue;
\r
1515 sscanf (line, "%s%*s%lf", opt,&t);
\r
1516 if ((pline=strstr(line, "="))==NULL)
\r
1517 error2("err: option file. add space around the equal sign?");
\r
1518 for (iopt=0; iopt<nopt; iopt++) {
\r
1519 if (strncmp(opt, optstr[iopt], 8)==0) {
\r
1521 printf ("\n%3d %15s | %-20s %6.2f", iopt+1,optstr[iopt],opt,t);
\r
1523 case ( 0): sscanf(pline+1, "%s", com.seqf); break;
\r
1524 case ( 1): sscanf(pline+1, "%s", com.outf); break;
\r
1525 case ( 2): sscanf(pline+1, "%s", com.treef); break;
\r
1526 case ( 3): com.seqtype=(int)t; break;
\r
1527 case ( 4): noisy=(int)t; break;
\r
1528 case ( 5): com.cleandata=(char)t; break;
\r
1530 sscanf(pline+1, "%d%lf%lf%lf%lf", &com.runmode, com.hyperpar, com.hyperpar+1, com.hyperpar+2, com.hyperpar+3);
\r
1532 case ( 7): com.method=(int)t; break;
\r
1533 case ( 8): com.clock=(int)t; break;
\r
1535 sscanf(pline+1, "%lf%lf", &com.TipDate, &com.TipDate_TimeUnit);
\r
1537 case (10): com.getSE=(int)t; break;
\r
1538 case (11): com.print=(int)t; break;
\r
1539 case (12): com.codonf=(int)t; break;
\r
1540 case (13): com.npi=(int)t; break;
\r
1541 case (14): com.verbose=(int)t; break;
\r
1542 case (15): com.model=(int)t; break;
\r
1543 case (16): com.hkyREV=(int)t; break;
\r
1544 case (17): com.aaDist=(int)t; break;
\r
1546 sscanf(pline+2,"%s",com.daafile);
\r
1547 if(com.seqtype==2 && com.ndata>1 && (com.clock==5 || com.clock==6)) {
\r
1548 ReadDaafiles(pline+2);
\r
1552 case (19): ReadNSsitesModels(line); break;
\r
1553 case (20): com.nparK=(int)t; break;
\r
1556 if(com.seqtype==1 && (com.clock==5 || com.clock==6)) {
\r
1557 ng = splitline (++pline, markgenes);
\r
1558 for(j=0; j<min2(ng,com.ndata); j++)
\r
1559 if(!sscanf(pline+markgenes[j],"%d",&data.icode[j])) break;
\r
1561 for(j=0; j<min2(ng,com.ndata); j++) printf("%4d", data.icode[j]); FPN(F0);
\r
1565 case (22): com.Mgene=(int)t; break;
\r
1566 case (23): com.fix_kappa=(int)t; break;
\r
1569 if(com.seqtype==1 && com.fix_kappa && (com.clock==5 || com.clock==6)) {
\r
1570 ng = splitline (++pline, markgenes);
\r
1571 for(j=0; j<min2(ng,com.ndata); j++)
\r
1572 if(!sscanf(pline+markgenes[j],"%lf",&data.kappa[j])) break;
\r
1574 matout(F0, data.kappa, 1, min2(ng,com.ndata));
\r
1577 case (25): com.fix_omega=(int)t; break;
\r
1580 if(com.seqtype==1 && com.fix_omega && (com.clock==5 || com.clock==6)) {
\r
1581 ng = splitline (++pline, markgenes);
\r
1582 for(j=0; j<min2(ng,com.ndata); j++)
\r
1583 if(!sscanf(pline+markgenes[j],"%lf",&data.omega[j])) break;
\r
1585 matout(F0, data.omega, 1, min2(ng,com.ndata));
\r
1589 case (27): com.fix_alpha=(int)t; break;
\r
1592 if(com.fix_alpha && t && (com.clock==5 || com.clock==6)) {
\r
1593 ng = splitline (++pline, markgenes);
\r
1594 for(j=0; j<min2(ng,com.ndata); j++)
\r
1595 if(!sscanf(pline+markgenes[j], "%lf", &data.alpha[j])) break;
\r
1597 matout(F0, data.alpha, 1, min2(ng,com.ndata));
\r
1600 case (29): com.nalpha=(int)t; break;
\r
1601 case (30): com.ncatG=(int)t; break;
\r
1602 case (31): com.fix_rho=(int)t; break;
\r
1603 case (32): com.rho=t; break;
\r
1604 case (33): com.ndata=(int)t; break;
\r
1605 case (34): com.bootstrap=(int)t; break;
\r
1606 case (35): Small_Diff=t; break;
\r
1607 case (36): com.fix_blength=(int)t; break;
\r
1613 { printf ("\noption %s in %s not recognised\n", opt,ctlf); exit(-1); }
\r
1617 if((com.fix_kappa || (com.fix_alpha&&com.alpha)) && (com.clock==5 || com.clock==6))
\r
1618 printf("Using parameters from the control file.");
\r
1620 if (noisy) FPN(F0);
\r
1621 if(com.seqtype==1 || com.model==FromCodon)
\r
1623 if (com.seqtype==AAseq || com.seqtype==CODON2AAseq) {
\r
1624 if(com.NSsites) error2("use NSsites=0 for amino acids?");
\r
1625 if(com.hkyREV && com.model!=FromCodon) /* REV & FromCodon not well-tested. */
\r
1626 error2("use hkyREV=0 for amino acids?");
\r
1628 if(com.aaDist==AAClasses)
\r
1629 com.nrate = com.nkappa=(com.hkyREV ? 5 : !com.fix_kappa);
\r
1631 switch (com.model) {
\r
1632 case (Poisson): case (EqualInput): case (Empirical): case (Empirical_F):
\r
1633 com.fix_kappa=1; com.kappa=0; com.nrate=0; break;
\r
1634 case (FromCodon):
\r
1635 com.nrate=com.nkappa = (com.hkyREV ? 5 : !com.fix_kappa);
\r
1636 if(com.aaDist) com.nrate++;
\r
1637 if(com.fix_omega) error2("fix_omega = 1");
\r
1639 com.codonf=0; puts("CodonFreq=0 reset for model=6.");
\r
1642 case (REVaa_0): com.fix_kappa=0; com.kappa=0; break;
\r
1643 case (REVaa): com.fix_kappa=0; com.kappa=0; com.nrate=189; break;
\r
1644 default: error2("model unavailable");
\r
1646 if(com.Mgene>2 || (com.Mgene==2 && (com.model==Fequal||com.model==2)))
\r
1647 error2 ("Mgene && model");
\r
1648 if(com.seqtype==2 && com.model!=FromCodon && com.model!=AAClasses)
\r
1649 { com.fix_kappa=com.fix_omega=1; com.kappa=com.omega=0; }
\r
1651 else if(com.seqtype==CODONseq) {
\r
1653 if (com.model||com.aaDist||com.NSsites!=NSdiscrete||com.alpha||com.rho)
\r
1654 error2("HMM model option");
\r
1655 if(com.Mgene>1 && com.model) error2("Mgene & model?");
\r
1656 if(com.fix_kappa) {
\r
1658 error2("can't fix kappa for the codon model you selected.");
\r
1660 com.pkappa[0] = com.kappa;
\r
1662 if(com.codonf>=FMutSel0 && com.Mgene>=2)
\r
1663 error2("model FMutSel + Mgene not implemented");
\r
1664 if(com.runmode==-2 && com.seqtype==1 && com.npi)
\r
1665 error2("runmode = -2 not implemented for codon models with frequencies");
\r
1666 if(com.runmode==-3 && com.seqtype==1 && com.npi)
\r
1667 error2("runmode = -3 not implemented for codon models with frequencies");
\r
1668 if(com.hkyREV && (com.aaDist || com.Mgene>1))
\r
1669 error2("hkyREV with aaDist or Mgene: check options?\a");
\r
1670 if(com.NSsites<0 || com.NSsites>maxNSsitesModels || (com.NSsites>13 && com.NSsites<22))
\r
1671 error2("option NSsites.");
\r
1672 if(com.aaDist && com.NSsites)
\r
1673 error2("aaDist & NSsites don't work together");
\r
1674 if((com.model && com.aaDist)
\r
1675 && (com.model>NSbranch2 || com.aaDist!=AAClasses))
\r
1676 error2("model & aaDist");
\r
1677 if(com.model==NSbranch3 && com.NSsites!=2 && com.NSsites!=3)
\r
1678 error2("clade model should have model = 3 NSsites = 2 or 3.");
\r
1680 if(com.aaDist && com.fix_omega)
\r
1681 error2("can't fix_omega for aaDist models");
\r
1683 com.nrate = com.nkappa = (com.hkyREV ? 5 : !com.fix_kappa);
\r
1685 /* pi_T, pi_C, pi_A are counted as frequency parameters pi. */
\r
1688 if(com.codonf==FMutSel0) /* FMutSel0: pi_TCA plus 20 AA freqs. */
\r
1689 com.npi = 3 + (com.npi ? 20-1 : 0);
\r
1690 else if(com.codonf==FMutSel) /* FMutSel: pi_TCA plus 60 codon freqs. */
\r
1691 com.npi = 3 + (com.npi ? com.ncode-1 : 0);
\r
1692 else if(com.npi) {
\r
1693 if (com.codonf==F1x4 || com.codonf==F1x4MG) com.npi = 3;
\r
1694 else if (com.codonf==F3x4 || com.codonf==F3x4MG) com.npi = 9;
\r
1695 else if (com.codonf==Fcodon) com.npi = com.ncode-1;
\r
1697 com.nrate += com.npi;
\r
1699 if (com.aaDist!=AAClasses) {
\r
1700 if(com.fix_kappa>1) error2("fix_kappa>1, not tested."); /** ???? */
\r
1701 if (com.model>0 && (com.alpha || !com.fix_alpha))
\r
1702 error2("dN/dS ratios among branches not implemented for gamma");
\r
1703 if (com.model>0 && com.clock)
\r
1704 error2("model and clock don't work together");
\r
1705 if (com.fix_omega) {
\r
1706 com.omega_fix=com.omega;
\r
1707 if((com.model==0 && com.NSsites==NSdiscrete)
\r
1708 || (com.model && com.NSsites && com.NSsites!=NSpselection
\r
1709 &&com.NSsites!=NSdiscrete && com.NSsites!=NSbetaw))
\r
1710 error2("\afix_omega?");
\r
1712 if (com.model>NSbranch3) error2("seqtype or model.");
\r
1714 if (com.model==NSbranch2 && com.clock==2)
\r
1715 error2("NSbranch & local clock.");
\r
1717 if (com.model==NSbranch3 && com.NSsites==NSpselection && com.ncatG!=3)
\r
1718 { com.ncatG=3; puts("ncatG=3 reset."); }
\r
1719 if(com.kappa<0) error2("kappa..");
\r
1720 if (com.runmode) com.fix_blength=0;
\r
1721 if((com.runmode==-2 || com.runmode==-3) && (com.NSsites||com.alpha||com.aaDist))
\r
1722 error2("wrong model for pairwise comparison.\ncheck NSsites, alpha, aaDist, model etc.");
\r
1723 if(com.runmode>0 && com.model==2) error2("tree search & model");
\r
1724 if(com.aaDist && com.NSsites!=0 && com.NSsites!=NSdiscrete)
\r
1725 error2("NSsites && aaDist.");
\r
1727 if((com.NSsites || nnsmodels>1) && (com.alpha || com.fix_alpha==0))
\r
1728 error2("NSsites & Gamma");
\r
1729 if(com.seqtype==1 && (com.alpha || com.fix_alpha==0))
\r
1730 puts("\aGamma codon model: are you sure this is the model you want to use? ");
\r
1732 if(com.aaDist==0) {
\r
1733 if((!com.fix_omega || (com.Mgene && com.Mgene>=3)) && !com.NSsites && !com.model)
\r
1737 if(com.aaDist<=6) com.nrate+=2; /* a & b, PSB2000 */
\r
1738 else if(com.aaDist==FIT1) com.nrate+=4; /* fitness models: */
\r
1739 else if(com.aaDist==FIT2) com.nrate+=5; /* ap, p*, av, v*, b */
\r
1740 if(com.aaDist>=FIT1)
\r
1741 for(i=0; i<2; i++)
\r
1742 for(j=0;j<20;j++) AAchem[i][j] /= AAchem[i][20];
\r
1746 if(com.NSsites==NSfreqs && com.ncatG!=5)
\r
1747 { puts("\nncatG changed to 5."); com.ncatG=5; }
\r
1748 if(com.model && com.NSsites)
\r
1749 if((com.model!=2 && com.model!=3)
\r
1750 || (com.NSsites!=NSpselection && com.NSsites!=NSdiscrete))
\r
1751 error2("only NSsites=2,3 & model=2,3 are compatible.");
\r
1752 switch(com.NSsites) {
\r
1753 case (NSnneutral): com.ncatG=2; break;
\r
1754 case (NSpselection):
\r
1756 com.ncatG=3; break;
\r
1757 case (NSbetaw): com.ncatG++; break;
\r
1758 case (NS02normal): com.ncatG++; break;
\r
1761 if(com.model==2) { /* branchsite models A & B */
\r
1762 if(com.ncatG!=3) puts("\abranch-site model: use ncatG=3 only.");
\r
1764 com.nrate += (com.NSsites==2?2:3);
\r
1766 else if(com.model==3) { /* Clade models C & D */
\r
1767 if(com.NSsites==NSpselection) {
\r
1768 com.ncatG=3; com.nrate+=3;
\r
1770 if(com.NSsites==NSdiscrete) {
\r
1771 if(com.ncatG!=2 && com.ncatG!=3)
\r
1772 error2("use 2 or 3 for ncatG for model=3?");
\r
1773 com.nrate += com.ncatG+1;
\r
1776 else if(com.NSsites==NSnneutral) {
\r
1777 if(!com.fix_omega) com.nrate++;
\r
1778 else { com.nrate++; com.omega_fix=com.omega; }
\r
1780 else if(com.NSsites==NSpselection || com.NSsites==NSM2aRel) {
\r
1781 if(!com.fix_omega) com.nrate+=2;
\r
1782 else { com.nrate++; com.omega_fix=com.omega; }
\r
1784 else if(com.NSsites==NSbetaw)
\r
1785 { if(!com.fix_omega) com.nrate++; else com.omega_fix=com.omega; }
\r
1786 else if(com.NSsites==NSdiscrete && com.aaDist) {
\r
1787 if (com.aaDist<=6) com.nrate+=com.ncatG; /* a&b PSB2000 */
\r
1788 else { /* fitness models */
\r
1789 com.nrate=!com.fix_kappa+4*com.ncatG;
\r
1790 if(com.aaDist==FIT2) com.nrate+=com.ncatG;
\r
1793 else if(com.NSsites==NSdiscrete)
\r
1794 com.nrate+=com.ncatG; /* omega's */
\r
1795 else if(com.NSsites==NSTgamma || com.NSsites==NSTinvgamma) {
\r
1796 com.nrate += 2 + !com.fix_omega; com.ncatG=KGaussLegendreRule;
\r
1798 else if(com.NSsites==NSTgamma1 || com.NSsites==NSTinvgamma1) {
\r
1799 com.nrate += 3+!com.fix_omega; com.ncatG=KGaussLegendreRule+1;
\r
1805 error2 ("seqtype..");
\r
1807 if((com.runmode==-2 || com.runmode==-3) && com.cleandata==0) {
\r
1809 if(noisy) puts("gaps are removed for pairwise comparison.");
\r
1811 if(com.method &&(com.clock||com.rho))
\r
1812 { com.method=0; puts("Iteration method reset: method = 0"); }
\r
1813 if(com.method && com.seqtype==2 && com.model==FromCodon)
\r
1814 { com.method=0; puts("\awork on method = 1 for model = 6"); }
\r
1816 if (com.clock && com.fix_blength==2)
\r
1817 error2("can't fix branch lengths under clock model.");
\r
1818 if (com.runmode==3 && (com.clock)) error2("runmode+clock");
\r
1819 if (com.aaDist<=6 && (com.seqtype==CODONseq || com.model==FromCodon))
\r
1820 strcpy(com.daafile, daafiles[abs(com.aaDist)]);
\r
1822 if (com.fix_alpha && com.alpha==0) {
\r
1823 if (com.rho) puts("rho set to 0."); com.fix_rho=1; com.rho=0;
\r
1826 if(!com.fix_alpha && com.alpha<=0)
\r
1827 error2("initial value alpha <= 0 for fix_alpha = 0");
\r
1828 if(!com.fix_rho && com.rho==0) { com.rho=0.001; puts("init rho reset"); }
\r
1829 if(com.alpha||com.NSsites)
\r
1830 { if(com.ncatG<2 || com.ncatG>NCATG) error2("ncatG"); }
\r
1831 else if (com.ncatG>1) com.ncatG=1;
\r
1833 if(com.ndata<=0) com.ndata=1;
\r
1834 if(com.bootstrap && com.ndata!=1) error2("ndata=1 for bootstrap.");
\r
1840 int testx (double x[], int np)
\r
1842 /* This is used for LS branch length estimation by nls2, called only if(clock==0)
\r
1845 double tb[]={.4e-6, 99};
\r
1847 FOR (i,com.ntime)
\r
1848 if (x[i]<tb[0] || x[i]>tb[1])
\r
1855 int SetxBound (int np, double xb[][2])
\r
1857 int i=-1,j,k, K=com.ncatG;
\r
1858 double tb[]={4e-6,50}, tb0=1e-8, rgeneb[]={0.01,99}, rateb[]={1e-4,999};
\r
1859 double alphab[]={0.02,49}, betab[]={0.005,99}, omegab[]={0.0001,999};
\r
1860 double rhob[]={0.01,0.99}, pb[]={.00001,.99999};
\r
1862 SetxBoundTimes (xb);
\r
1863 for(i=com.ntime;i<np;i++) FOR (j,2) xb[i][j]=rateb[j];
\r
1865 for(i=com.ntime;i<np;i++) { xb[i][0]=rateb[0]; xb[i][1]=rateb[1]; }
\r
1866 for(i=0; i<com.nrgene; i++) for(j=0;j<2;j++) xb[com.ntime+i][j]=rgeneb[j];
\r
1867 for(i=0; i<com.nrate; i++) for(j=0;j<2;j++) xb[com.ntime+com.nrgene+i][j]=rateb[j];
\r
1868 k = com.ntime+com.nrgene+com.nkappa;
\r
1870 /* codon frequency parameters */
\r
1872 k += j = (com.seqtype==CODONseq && com.codonf>=FMutSel0 ? 3 : 0);
\r
1873 if(com.seqtype==CODONseq && com.npi>3
\r
1874 && (com.codonf==Fcodon || com.codonf==FMutSel0 ||com.codonf==FMutSel)) {
\r
1875 for( ; j<com.npi; j++) {
\r
1876 xb[k][0] = -29; xb[k++][1] = 29;
\r
1880 /* omega parameters or those in the w distribution */
\r
1881 if (com.NSsites) { /* p's before w's in xb[] */
\r
1882 omegab[0] *= 0.01;
\r
1884 switch(com.NSsites) {
\r
1885 case(NSnneutral):
\r
1886 xb[k][0]=pb[0]; xb[k++][1]=pb[1]; /* p0 */
\r
1887 xb[k][0]=omegab[0]; xb[k++][1]=1; /* w0 < 1 */
\r
1889 case(NSpselection): /* for p0, p1, w2 */
\r
1890 case(NSM2aRel): /* for p0, p1, w2 */
\r
1891 FOR(j,2) { xb[k][0]=-99; xb[k++][1]=99; } /* transformed p */
\r
1892 xb[k][0]=omegab[0]; xb[k++][1]=1; /* w0 < 1 */
\r
1893 if(!com.fix_omega && (com.model==0 || com.model==2)) { /* w2 > 1 */
\r
1894 xb[k][0] = (com.NSsites==NSpselection ? 1 : omegab[0]);
\r
1895 xb[k++][1] = omegab[1];
\r
1897 else if (com.model==3)
\r
1898 for(j=0; j<1+!com.fix_omega; j++) {
\r
1899 xb[k][0]=omegab[0]; xb[k++][1]=omegab[1];
\r
1902 case(NSdiscrete): /* pK[] & rK[] */
\r
1903 if(com.model==3) { /* Clade model D */
\r
1904 if(com.nparK) error2("model & NSsites & nparK");
\r
1905 FOR(j,K-1) { xb[k][0]=-99; xb[k++][1]=99; }
\r
1906 FOR(j,K+1) { xb[k][0]=omegab[0]; xb[k++][1]=omegab[1]; }
\r
1908 else if(com.model==2) { /* branch-site model B */
\r
1911 FOR(j,K-1) { xb[k][0]=-99; xb[k++][1]=99; }
\r
1912 FOR(j,K) { xb[k][0]=omegab[0]; xb[k++][1]=omegab[1]; }
\r
1914 FOR(j,K*(K-1)) { xb[k][0]=-99; xb[k++][1]=99; }
\r
1916 else { /* NSsites models M3 */
\r
1917 FOR(j,K-1) { xb[k][0]=-99; xb[k++][1]=99; }
\r
1918 FOR(j,K) { xb[k][0]=omegab[0]; xb[k++][1]=omegab[1]; }
\r
1921 if(com.seqtype==CODONseq && com.aaDist)
\r
1922 FOR(j,K) { xb[k][0]=omegab[0]; xb[k++][1]=omegab[1]; }
\r
1924 case(NSfreqs): /* p0...pK */
\r
1925 FOR(j,K-1) { xb[k][0]=-99; xb[k++][1]=99; }
\r
1928 FOR(j,2) { xb[k][0]=alphab[0]; xb[k++][1]=alphab[1]; } break;
\r
1929 case(NS2gamma): /* p0, alpha1,beta1,alpha2=beta2 */
\r
1930 xb[k][0]=pb[0]; xb[k++][1]=pb[1];
\r
1931 FOR(j,3) { xb[k][0]=alphab[0]; xb[k++][1]=alphab[1]; }
\r
1933 case(NSbeta): /* p_beta,q_beta */
\r
1934 FOR(j,2) { xb[k][0]=betab[0]; xb[k++][1]=betab[1]; }
\r
1937 /* p0, p_beta, q_beta, w */
\r
1938 xb[k][0]=pb[0]; xb[k++][1]=pb[1]; /* p0 */
\r
1939 FOR(j,2) { xb[k][0]=betab[0]; xb[k++][1]=betab[1]; } /* p & q */
\r
1940 if(!com.fix_omega) { xb[k][0]=1; xb[k++][1]=omegab[1]; }
\r
1942 case(NSbetagamma): /* p0, p_beta, q_beta, alpha, beta */
\r
1943 xb[k][0]=pb[0]; xb[k++][1]=pb[1]; /* p0 */
\r
1944 FOR(j,4) { xb[k][0]=betab[0]; xb[k++][1]=betab[1]; } /* p&q, a&b */
\r
1946 case(NSbeta1gamma): /* p0, p_beta, q_beta, alpha, beta */
\r
1947 xb[k][0]=pb[0]; xb[k++][1]=pb[1]; /* p0 */
\r
1948 FOR(j,4) { xb[k][0]=betab[0]; xb[k++][1]=betab[1]; } /* p&q, a&b */
\r
1950 case(NSbeta1normal): /* p0, p_beta, q_beta, mu, s */
\r
1951 xb[k][0]=pb[0]; xb[k++][1]=pb[1]; /* p0 */
\r
1952 FOR(j,4) { xb[k][0]=betab[0]; xb[k++][1]=betab[1]; } /* p&q, mu&s */
\r
1953 xb[k-2][0]=1; xb[k-2][1]=9; /* mu */
\r
1955 case(NS02normal): /* p0, p1, mu2, s1, s2 */
\r
1956 FOR(j,2) { xb[k][0]=pb[0]; xb[k++][1]=pb[1]; } /* p0 & p1, */
\r
1957 FOR(j,3) { xb[k][0]=.0001; xb[k++][1]=29; } /* mu2,s1,s2 */
\r
1959 case(NS3normal): /* p0, p1, mu2, s0, s1, s2 */
\r
1960 FOR(j,2) { xb[k][0]=-49; xb[k++][1]=49; } /* p0 & p1, tranformed */
\r
1961 FOR(j,4) { xb[k][0]=.0001; xb[k++][1]=29; } /* mu2,s0,s1,s2 */
\r
1965 case(NSTinvgamma):
\r
1967 case(NSTinvgamma1):
\r
1968 if(com.NSsites==NSTgamma1 || com.NSsites==NSTinvgamma1) /* p0 for G(a,b,T) */
\r
1969 { xb[k][0]=0.001; xb[k++][1]=0.9999; }
\r
1972 if(com.NSsites==NSTinvgamma || com.NSsites==NSTinvgamma1)
\r
1974 xb[k++][1]=alphab[1]; /* alpha */
\r
1975 xb[k][0]=0.05; xb[k++][1]=betab[1]; /* beta */
\r
1976 if(!com.fix_omega)
\r
1977 { xb[k][0]=1; xb[k++][1]=29; } /* T */
\r
1981 else if((com.seqtype==CODONseq||com.model==FromCodon) && com.aaDist!=AAClasses)
\r
1982 { if(!com.fix_omega) { xb[k][0]=omegab[0]; xb[k][1]=omegab[1]; } }
\r
1984 if(com.seqtype==CODONseq && com.model)
\r
1985 for(j=0; j<com.nOmega-com.fix_omega; j++)
\r
1986 { xb[k+j][0]=omegab[0]; xb[k+j][1]=omegab[1]; }
\r
1988 if (com.aaDist<0 && (com.seqtype==1||com.model==FromCodon)) {
\r
1989 /* linear relationship between d_ij and w_ij */
\r
1990 if(com.nrate != !com.fix_kappa+1+(com.seqtype==1)) error2("in Setxbound");
\r
1991 xb[com.ntime+com.nrgene+!com.fix_kappa][1]=1; /* 0<b<1 */
\r
1994 k=com.ntime+com.nrgene+com.nrate;
\r
1995 for (i=0;i<com.nalpha;i++,k++) FOR (j,2) xb[k][j]=alphab[j];
\r
1996 if (!com.fix_rho) FOR (j,2) xb[np-1][j]=rhob[j];
\r
1998 if(noisy>=3 && np<100) {
\r
1999 printf("\nBounds (np=%d):\n",np);
\r
2000 for(i=0;i<np;i++) printf(" %10.6f", xb[i][0]); FPN(F0);
\r
2001 for(i=0;i<np;i++) printf(" %10.6f", xb[i][1]); FPN(F0);
\r
2007 void getpcodonClass(double x[], double pcodonClass[])
\r
2009 /* This uses pcodon0[], paa0[], and x[] to calculate pcodonclass[] and
\r
2010 com.pi[] for the fitness models.
\r
2011 pcodon0[] has the codon frequencies observed (codonFreq=3) or expected
\r
2012 (codonFreq=2 or 1 or 0) rootally. Under the fitness models, the expected
\r
2013 codon frequencies pcodonClass[] differs among site classes and from the
\r
2014 rootal pi[] (pcodon0[]).
\r
2015 This is called by SetParameters().
\r
2017 int i,iclass,iaa, k, nclass=(com.NSsites==0?1:com.ncatG);
\r
2018 double paaClass[20], *w,fit;
\r
2020 if(com.seqtype!=1 || com.aaDist<FIT1) error2("getpcodonClass");
\r
2021 k=com.ntime+com.nrgene+!com.fix_kappa+nclass-1;
\r
2022 FOR(iclass, nclass) {
\r
2023 w=x+k+iclass*(4+(com.aaDist==FIT2));
\r
2025 fit = -w[0]*square(AAchem[0][iaa]-w[1])
\r
2026 -w[2]*square(AAchem[1][iaa]-w[3]);
\r
2027 paaClass[iaa]=exp(2*fit);
\r
2029 abyx(1/sum(paaClass,20), paaClass, 20);
\r
2030 FOR(i,com.ncode) {
\r
2031 iaa=GeneticCode[com.icode][FROM61[i]];
\r
2032 pcodonClass[iclass*64+i]=pcodon0[i]/paa0[iaa]*paaClass[iaa];
\r
2035 if(fabs(1-sum(pcodonClass+iclass*64,com.ncode))>1e-5) error2("pcodon!=1");
\r
2037 fprintf(frst,"\nSite class %d: ",iclass+1);
\r
2038 matout (frst,paaClass,2, 10);
\r
2039 matout (frst,pcodonClass+iclass*64,16,4);
\r
2042 if(nclass==1) FOR(i,com.ncode) com.pi[i]=pcodonClass[i];
\r
2047 int GetInitialsCodon (double x[])
\r
2049 /* This sets the initials and count com.np for codon models.
\r
2051 int k=com.ntime+com.nrgene, i,j, K=com.ncatG, nsyncodon[20];
\r
2054 if(com.nrate) { /* either kappa, omega, or both for each gene */
\r
2055 if(com.Mgene<=2) {
\r
2057 x[k++]=.5+rndu();
\r
2058 for(i=0; i<4; i++) x[k++]=.1+rndu();
\r
2060 else if (!com.fix_kappa)
\r
2061 x[k++] = com.kappa;
\r
2062 if(com.codonf==FMutSel0 || com.codonf==FMutSel) {
\r
2063 for(i=0; i<3; i++) /* pi_TCA */
\r
2064 x[k++] = com.pf3x4[i]/(com.pf3x4[3]+.02*rndu());
\r
2066 if(com.npi>3 && com.codonf==FMutSel0) {
\r
2067 for(i=0; i<20; i++) nsyncodon[i]=0;
\r
2068 for(i=0; i<com.ncode; i++)
\r
2069 nsyncodon[GeneticCode[com.icode][FROM61[i]]] ++;
\r
2070 for(i=0; i<20-1; i++) /* amino acid fitness, ignoring nsyncodon */
\r
2071 x[k++] = log((com.piAA[i]/nsyncodon[i]+.001)/(com.piAA[19]/nsyncodon[19]+.002*rndu()));
\r
2073 else if(com.npi>3 && com.codonf==FMutSel) {
\r
2074 for(i=0;i<com.ncode-1;i++) /* codon fitness */
\r
2075 x[k++] = log((com.pi[i]+.001)/(com.pi[com.ncode-1]+.002*rndu()));
\r
2078 else if(com.npi) {
\r
2079 if(com.codonf==Fcodon)
\r
2080 for(i=0;i<com.ncode-1;i++) /* codon fitness */
\r
2081 x[k++] = log((com.pi[i]+.001)/(com.pi[com.ncode-1]+.002*rndu()));
\r
2082 else if(com.codonf==F1x4 || com.codonf==F1x4MG)
\r
2083 for(i=0;i<3;i++) /* pi_TCA */
\r
2084 x[k++] = com.pf3x4[i]/(com.pf3x4[3]+.02*rndu());
\r
2085 else if(com.codonf==F3x4 || com.codonf==F3x4MG)
\r
2086 for(j=0; j<3; j++)
\r
2087 for(i=0;i<3;i++) /* pi_TCA */
\r
2088 x[k++] = com.pf3x4[j*4+i]/(com.pf3x4[j*4+3]+.02*rndu());
\r
2090 if (com.NSsites==0 && com.model==0) {
\r
2092 { if(!com.fix_omega) x[k++]=com.omega; }
\r
2093 else if (com.aaDist==AAClasses)
\r
2094 for(i=0; i<com.nOmegaType; i++)
\r
2095 x[k++]=0.11+0.1*rndu();
\r
2097 { x[k++]=0.11+0.1*rndu(); x[k++]=0.22+0.1*rndu(); }
\r
2100 else { /* com.Mgene==3,4 */
\r
2101 if(com.Mgene>=3) {
\r
2102 com.nrate *= com.ngene;
\r
2103 if(com.fix_omega) com.nrate--;
\r
2105 for(i=0; i<com.ngene; i++) {
\r
2107 error2("hkyREV for ngene>1. Fix me.");
\r
2108 if(!com.fix_kappa && !com.fix_omega)
\r
2109 { x[k++] = com.kappa; x[k++] = com.omega; }
\r
2110 else if (com.fix_kappa)
\r
2111 x[k++] = com.omega;
\r
2112 else if (com.fix_omega) {
\r
2113 x[k++] = com.kappa;
\r
2114 if(i!=com.ngene-1) x[k++] = com.omega;
\r
2120 if(com.model && com.model<=NSbranch3) { /* branch models */
\r
2121 if (com.model==NSbranchB) {
\r
2122 com.nbtype = tree.nbranch;
\r
2123 for(i=0; i<tree.nbranch; i++)
\r
2124 nodes[(int)tree.branches[i][1]].label = i;
\r
2126 if(com.NSsites==0) {
\r
2127 com.nOmega = com.nbtype;
\r
2128 if (com.aaDist==AAClasses)
\r
2129 com.nrate += com.nOmegaType*com.nbtype;
\r
2130 else if (com.model==NSbranchB || com.model==NSbranch2)
\r
2131 com.nrate += (com.model==NSbranchB ? tree.nbranch : com.nOmega-1+!com.fix_omega);
\r
2132 else if(com.aaDist==0)
\r
2133 com.nrate += !com.fix_omega + com.nbtype - 1;
\r
2136 com.nrate = com.nkappa+!com.fix_omega+com.nbtype-1;
\r
2137 else if (com.aaDist==AAClasses)
\r
2138 com.nrate = com.nkappa + com.nOmegaType*com.nbtype;
\r
2139 else if (com.model==NSbranchB || com.model==NSbranch2)
\r
2140 com.nrate += (com.model==NSbranchB ? tree.nbranch : com.nOmega-1+!com.fix_omega);
\r
2142 k = com.ntime+com.nrgene;
\r
2143 for(i=0; i<com.nrate; i++)
\r
2144 x[k++] = com.omega * (0.8+0.4*rndu());
\r
2149 if (com.NSsites==0 && com.nrate==0)
\r
2150 eigenQcodon(1,-1,NULL,NULL,NULL,Root,U,V, &mr, (com.nkappa>1?x+com.ntime+com.nrgene:&com.kappa), com.omega,PMat);
\r
2153 /* branch-site and clade models
\r
2154 com.nOmega=2 different w's at a site (three w's in the model: w0,w1,w2) */
\r
2155 if(com.model && com.NSsites) {
\r
2156 if(com.model==NSbranch2) { /* branch-site models A & B */
\r
2158 if(com.NSsites==NSdiscrete)
\r
2159 com.nrate = com.nkappa +com.npi + 2 +!com.fix_omega+com.nbtype-1-1; /* add w0 and w1 */
\r
2161 com.nrate = com.nkappa +com.npi +1+!com.fix_omega+com.nbtype-1-1;
\r
2164 /* add p0 and p1. check that this works for NSbranch2 */
\r
2165 k = com.ntime+com.nrgene+com.nkappa+com.npi;
\r
2167 if(com.model<=NSbranch2) { /* branch-site models A & B */
\r
2168 /* p0 and p1: x[0,1]=1,0, for p[]=0.6 0.2 */
\r
2169 x[k++] = 1+0.5*rndu();
\r
2170 if(K==3) x[k++] = 0.2*rndu();
\r
2171 if(com.NSsites == 2) /* w0<1, w1=1 (if present) */
\r
2172 x[k++] = 0.2+0.1*rndu();
\r
2173 else if(com.NSsites == NSdiscrete) { /* w0 and w1 for model B */
\r
2174 x[k++] = 0.2*rndu();
\r
2175 if(K==3) x[k++] = 0.4+.8*rndu();
\r
2177 if(!com.fix_omega)
\r
2178 x[k++] = com.omega + 1 + rndu(); /* w2 */
\r
2180 else { /* NSbranch3: clade models C and D */
\r
2181 x[k++] = 1 + rndu();
\r
2182 if(com.ncatG == 3) x[k++] = .5+rndu(); /* p0 and p1 */
\r
2183 if(com.NSsites == NSpselection) /* w0<1, w1=1 (if present) */
\r
2184 x[k++] = 0.2+0.2*rndu();
\r
2185 else if(com.NSsites == NSdiscrete) { /* w0 and w1 */
\r
2186 x[k++] = 0.2+0.2*rndu();
\r
2187 if(com.ncatG==3) x[k++] = 0.5+.5*rndu();
\r
2189 for(i=0; i<com.nbtype-1; i++) /* additional w's */
\r
2190 x[k++] = com.omega*(1+0.5*rndu());
\r
2191 if(!com.fix_omega)
\r
2192 x[k++] = com.omega*(1+0.5*rndu());
\r
2195 else if (com.NSsites) { /* w's are counted in com.nrate */
\r
2196 switch(com.NSsites) {
\r
2198 x[k++]=0.5+0.4*rndu(); /* p0 for w0<1 */
\r
2199 x[k++]=0.1+0.5*rndu(); /* w0<1 */
\r
2201 case(NSpselection): /* for p0, p1. w is counted in nrate. */
\r
2203 x[k++] = 0.8+rndu(); x[k++]=.1+.5*rndu(); /* p0, p1 */
\r
2204 x[k++] = 0.1+0.4*rndu(); /* w0<1 */
\r
2205 if(!com.fix_omega) {
\r
2206 x[k++] = com.omega*(1+0.2*rndu()); /* w2 */
\r
2207 if(com.omega<1 && com.NSsites==NSpselection) {
\r
2208 puts("\ninitial w for M2:NSpselection reset.");
\r
2209 x[k-1] = 2+rndu();
\r
2215 for(i=0; i<com.ncatG-1; i++) x[k++]=0.;
\r
2216 if(com.aaDist<=6)
\r
2217 for(i=0;i<com.ncatG;i++) { x[k++]=1.1; x[k++]=1.2; }
\r
2218 for(i=0;i<com.ncatG;i++) /* ap,p*,av,v*, and b for each site class */
\r
2219 FOR(j,4+(com.aaDist==FIT2)) x[k++]=rndu();
\r
2221 else if(com.nparK) { /* K*(K-1) paras in HMM of dN/dS over sites */
\r
2222 zero(x+k,com.ncatG*(com.ncatG-1));
\r
2223 k += com.ncatG*(com.ncatG-1);
\r
2225 else { /* p0...pK. Note that w's are counted in nrate */
\r
2226 for(i=0;i<com.ncatG-1;i++) x[k++]=rndu();
\r
2227 for(i=0;i<com.ncatG;i++)
\r
2228 x[k++]=com.omega * (.5+i*2./com.ncatG*(0.8+0.4*rndu()));
\r
2231 case(NSfreqs): /* p0...pK. w's are fixed */
\r
2232 for(i=0;i<com.ncatG-1;i++) x[k++]=(com.ncatG-j)/2.;
\r
2234 case(NSgamma): x[k++]=1.1; x[k++]=1.1; break;
\r
2235 case(NS2gamma): /* p0, alpha1,beta1,alpha2=beta2 */
\r
2236 x[k++]=0.5; FOR(j,3) x[k++]=2*rndu()+j*0.1; break;
\r
2237 case(NSbeta): /* p_beta,q_beta */
\r
2238 x[k++]=.2+rndu(); x[k++]=1+rndu(); break;
\r
2240 /* p0, p_beta, q_beta. w is counted in nrate. */
\r
2241 x[k++]=.9; x[k++]=.2+rndu(); x[k++]=1+rndu();
\r
2242 if(!com.fix_omega) {
\r
2245 puts("\ninitial w for M8:NSbetaw>1 reset.");
\r
2250 case(NSbetagamma): /* p0, p_beta, q_beta, alpha, beta */
\r
2251 x[k++]=.9; x[k++]=.4; x[k++]=1.2; x[k++]=1.1; x[k++]=1.1;
\r
2253 case(NSbeta1gamma): /* p0, p_beta, q_beta, alpha, beta */
\r
2254 x[k++]=.9; x[k++]=.4; x[k++]=1.2; x[k++]=.1; x[k++]=1.1;
\r
2256 case(NSbeta1normal): /* p0, p_beta, q_beta, alpha, beta */
\r
2257 x[k++]=.95; x[k++]=.4; x[k++]=1.2; x[k++]=1.1; x[k++]=1.1;
\r
2259 case(NS02normal): /* p0, p1, mu2, s1, s2 */
\r
2260 x[k++]=.8; x[k++]=0.3; /* p0 & p1, not transformed */
\r
2261 x[k++]=.2; /* mu2 */
\r
2262 x[k++]=5; x[k++]=1.1; /* s1,s2 */
\r
2264 case(NS3normal): /* p0, p1, mu2, s0, s1, s2 */
\r
2265 x[k++]=.77; x[k++]=0.22; /* p0 & p1, transformed */
\r
2266 x[k++]=.2; /* mu2 */
\r
2267 x[k++]=0.5; x[k++]=5; x[k++]=1.1; /* s0,s1,s2 */
\r
2270 case(NSTgamma): /* alpha, beta, T */
\r
2271 case(NSTgamma1): /* p0, alpha, beta, T */
\r
2272 if(com.NSsites==NSTgamma1)
\r
2273 x[k++]=0.8+0.2*rndu(); /* p0, not transformed */
\r
2274 x[k++]=2+rndu(); x[k++]=3+rndu();
\r
2275 if(!com.fix_omega) x[k++]=1.+rndu();
\r
2277 case(NSTinvgamma): /* alpha, beta, T */
\r
2278 case(NSTinvgamma1): /* p0, alpha, beta, T */
\r
2279 if(com.NSsites==NSTinvgamma1)
\r
2280 x[k++]=0.8+0.2*rndu(); /* p0, not transformed */
\r
2281 x[k++]=3+rndu(); x[k++]=0.8+0.2*rndu(); /* mean = b/(a-1) */
\r
2282 if(!com.fix_omega) x[k++]=1.+rndu();
\r
2285 } /* if(com.NSsites) */
\r
2292 int GetInitials (double x[], int* fromfile)
\r
2294 /* This caculates the number of parameters (com.np) and get initial values.
\r
2295 This routine is too messy. Perhaps try to restruct the code and make
\r
2296 two sections for amino acids and codons?
\r
2297 com.nrate is initialised in getoptions().
\r
2299 static int times=0;
\r
2300 int i, j,k=0, naa=20;
\r
2301 int K=(com.model==2&&com.NSsites?com.ncatG-1:com.ncatG);
\r
2302 size_t sconP_new = (size_t)(tree.nnode-com.ns)*com.ncode*com.npatt*sizeof(double);
\r
2305 NFunCall = NPMatUVRoot = NEigenQ = 0;
\r
2306 if(com.clock==ClockCombined && com.ngene<=1)
\r
2307 error2("Combined clock model requires mutliple genes.");
\r
2308 GetInitialsTimes(x);
\r
2310 com.plfun = (com.alpha==0 ? lfun : (com.rho==0?lfundG:lfunAdG));
\r
2311 if(com.NSsites) com.plfun=lfundG;
\r
2312 if(com.nparK) com.plfun=lfunAdG;
\r
2314 if(com.plfun==lfun) com.conPSiteClass=0;
\r
2315 if(com.method && com.fix_blength!=2 && com.plfun==lfundG) {
\r
2316 com.conPSiteClass=1;
\r
2317 sconP_new *= com.ncatG;
\r
2319 if(com.sconP<0 || sconP_new<0) error2("data set too large.");
\r
2320 if(com.sconP<sconP_new) {
\r
2321 com.sconP = sconP_new;
\r
2322 printf("\n%9lu bytes for conP, adjusted\n", com.sconP);
\r
2323 if((com.conP=(double*)realloc(com.conP, com.sconP))==NULL)
\r
2324 error2("oom conP");
\r
2327 InitializeNodeScale();
\r
2330 if((com.aaDist && com.aaDist<10 && com.aaDist!=AAClasses &&
\r
2331 (com.seqtype==CODONseq||com.model==FromCodon)) ||
\r
2332 (com.seqtype==AAseq &&
\r
2333 (com.model==Empirical||com.model==Empirical_F||com.model>=REVaa_0))){
\r
2334 GetDaa(NULL,com.daa);
\r
2337 com.nrgene = (!com.fix_rgene)*(com.ngene-1);
\r
2338 for(j=0; j<com.nrgene; j++) x[com.ntime+j] = 1;
\r
2341 if(com.seqtype==CODONseq)
\r
2342 GetInitialsCodon(x);
\r
2344 com.np = com.ntime+com.nrgene+com.nrate;
\r
2346 k=com.ntime+com.nrgene;
\r
2347 if (com.aaDist==AAClasses) {
\r
2348 if (!com.fix_kappa) x[k++]=com.kappa;
\r
2349 for(i=0; i<com.nrate-!com.fix_kappa; i++)
\r
2350 x[k++] = com.omega;
\r
2351 if (com.nOmegaType>65)
\r
2352 puts("\a\nget better initial values for AAclasses?");
\r
2355 if (com.seqtype==AAseq) { /* AAseq */
\r
2357 eigenQaa(NULL, Root, U, V, &t); /* once for all */
\r
2358 if (com.model==REVaa_0) {
\r
2359 for(i=0;i<naa;i++) for(j=0;j<i;j++)
\r
2360 if (AA1STEP[i*(i-1)/2+j] && i*naa+j!=ijAAref)
\r
2361 x[k++] = com.daa[i*naa+j];
\r
2363 else if (com.model==REVaa) {
\r
2364 for (i=1; i<naa; i++) for(j=0; j<i; j++)
\r
2365 if(i*naa+j != ijAAref) x[k++] = com.daa[i*naa+j];
\r
2367 else if (com.model==FromCodon) {
\r
2368 for(j=0; j<com.nkappa; j++) x[k++] = com.kappa;
\r
2369 for(j=0; j<com.nrate-com.nkappa; j++) x[k++] = com.omega;
\r
2375 for (i=0; i<com.nalpha; i++) x[com.np++] = com.alpha;
\r
2377 if (!com.fix_rho) x[com.np++] = com.rho;
\r
2379 AutodGamma (com.MK, com.freqK, com.rK, &t, com.alpha, com.rho,com.ncatG);
\r
2380 else if (com.alpha && com.fix_alpha && !com.NSsites)
\r
2381 DiscreteGamma(com.freqK,com.rK,com.alpha,com.alpha,com.ncatG,DGammaUseMedian);
\r
2383 if(com.fix_blength==-1)
\r
2384 for(i=0; i<com.np; i++) x[i] = (i<com.ntime ? .1+0.5*rndu() : 0.5+rndu());
\r
2388 readx(x,fromfile);
\r
2389 if(com.runmode>0 && fromfile && com.NSsites) LASTROUND=1;
\r
2397 int SetPGene (int igene, int _pi, int _UVRoot, int _alpha, double x[])
\r
2399 /* xcom[] does not contain time parameters
\r
2400 Note that com.piG[][] have been homogeneized if (com.Mgene==3)
\r
2401 Note calculation of nr1 for (com.Mgene>=3 && com.fix_omega), as only the
\r
2402 w for the last partition is fixed.
\r
2404 int nr1=(com.nrate+1)/com.ngene, k=com.nrgene+(com.Mgene>=3)*igene*nr1;
\r
2405 double *xcom=x+com.ntime, mr=0;
\r
2408 xtoy (com.piG[igene],com.pi,com.ncode);
\r
2409 #if(defined(CODEML))
\r
2410 if(com.codonf==F1x4MG || com.codonf==F3x4MG)
\r
2411 com.pf3x4 = com.f3x4[igene];
\r
2415 if (com.seqtype==CODONseq) {
\r
2416 if(!com.fix_kappa) com.kappa=xcom[k++];
\r
2417 if(!com.fix_omega) com.omega=xcom[k++];
\r
2419 com.omega = (com.Mgene>2&&igene<com.ngene-1?xcom[k++]:com.omega_fix);
\r
2421 eigenQcodon(1,-1,NULL,NULL,NULL,Root,U,V, &mr,
\r
2422 (com.hkyREV||com.codonf==FMutSel?&xcom[com.nrgene]:&com.kappa),com.omega,PMat);
\r
2425 eigenQaa(NULL, Root, U, V, xcom+k);
\r
2428 com.alpha=xcom[com.nrgene+com.nrate+igene];
\r
2429 DiscreteGamma (com.freqK, com.rK, com.alpha, com.alpha, com.ncatG, DGammaUseMedian);
\r
2435 int SetParametersNSsites (double x[])
\r
2437 /* for NSsites and NSbranchsite models including HMM, NSclade models
\r
2438 p's are before w's in x[].
\r
2439 w[2][3] holds omegas; w[i][j] for fore (i=0) or back (i=1) branches
\r
2442 A & B: branch-site models: (model=2, NSsites=2 or 3)
\r
2449 C & D: clade-site models: (model=3, NSsites=2 or 3)
\r
2459 int k0=com.ntime+com.nrgene+com.nkappa+com.npi, k=k0;
\r
2460 int K=com.ncatG, i,j, off;
\r
2461 double w[NBTYPE][3], t, S,dS,dN, spaceP2PI[NCATG*(NCATG+1)], small=1e-4;
\r
2463 double p0=1, c,e,eT, dwy, y,z, a, b, T, C, lnGa, ww, sign, *xI=NULL, *wI=NULL; /* truncated NSsites models */
\r
2465 if(com.NSsites==0) error2("SetParametersNSsites : strange.");
\r
2467 switch(com.NSsites) {
\r
2469 com.freqK[0] = x[k++];
\r
2470 com.freqK[1] = 1-com.freqK[0];
\r
2471 com.rK[0] = x[k++];
\r
2474 case(NSpselection):
\r
2477 if(com.model == NSbranch2) /* branch-site A&B (Y&N2002) */
\r
2479 if(com.nparK) { /* HMM models, setting up p[] & w[] */
\r
2480 for(j=0; j<K; j++) /* w's for site classes */
\r
2481 com.rK[j] = x[k++];
\r
2482 for (i=0; i<K; i++, k+=K-1) {
\r
2483 if (!LASTROUND) f_and_x(x+k,com.MK+i*K,K,0,0); /* x->f */
\r
2484 else xtoy (x+k,com.MK+i*K,K-1);
\r
2485 com.MK[i*K+K-1] = 1-sum(com.MK+i*K,K-1);
\r
2487 PtoPi(com.MK, com.freqK, K, spaceP2PI);
\r
2491 /* *** Note: Falling through.
\r
2492 This sets up p[] for NSpselection, NSdiscrete, NSfreqs
\r
2496 f_and_x(x+k,com.freqK,K,0,1); /* x->f */
\r
2500 for(j=0,com.freqK[K-1]=1; j<K-1; j++)
\r
2501 com.freqK[K-1] -= (com.freqK[j] = x[k++]);
\r
2502 if(com.freqK[K-1]<-small || com.freqK[K-1]>1+small) {
\r
2503 matout(F0, com.freqK, 1, K);
\r
2504 error2("freqK[]");
\r
2508 /* setting up w[] */
\r
2509 if(com.NSsites == NSfreqs) {
\r
2510 if(com.ncatG!=5) error2("NSfreqs, ncatG?");
\r
2512 com.rK[1] = 1./3;
\r
2513 com.rK[2] = 2./3;
\r
2517 else if(com.NSsites == NSpselection || com.NSsites == NSM2aRel) {
\r
2518 com.rK[0] = x[k++];
\r
2520 com.rK[2] = (com.fix_omega && com.model<=2 ? com.omega_fix : x[k++]);
\r
2522 else if(com.NSsites == NSdiscrete && com.aaDist == 0) {
\r
2523 for(j=0; j<K; j++)
\r
2524 com.rK[j] = x[k++];
\r
2526 if(com.model) { /* branch-site and clade models */
\r
2527 if(com.model == NSbranch2) { /* branch-site models */
\r
2528 w[0][0] = w[1][0] = com.rK[0]; /* site class 0 */
\r
2529 w[0][1] = w[1][1] = com.rK[1]; /* site class 1 */
\r
2531 w[1][2] = com.rK[2];
\r
2533 else { /* clade models */
\r
2535 for(i=0; i<com.nbtype; i++) {
\r
2536 for(j=0; j<K-1; j++)
\r
2537 w[i][j] = com.rK[j];
\r
2538 w[i][K-1] = (i==com.nbtype-1 && com.fix_omega ? com.omega_fix : x[k++]);
\r
2548 case(NSbetagamma):
\r
2549 case(NSbeta1gamma):
\r
2550 case(NSbeta1normal):
\r
2553 DiscreteNSsites(x+k);
\r
2557 /* rK[] & freqK[] for truncated nssites models. */
\r
2558 if(com.NSsites>=NSTgamma && com.NSsites<=NSTinvgamma1) {
\r
2559 off = (com.NSsites==NSTgamma1||com.NSsites==NSTinvgamma1);
\r
2564 com.freqK[K] = 1 - p0;
\r
2568 T = (com.fix_omega ? com.omega_fix : x[k+off+2]);
\r
2569 K = com.ncatG-off;
\r
2570 lnGa = LnGamma(a);
\r
2571 if(com.NSsites==NSTgamma || com.NSsites==NSTgamma1) {
\r
2572 C = CDFGamma(T, a, b);
\r
2573 mr = a/(C*b)*CDFGamma(T, a+1, b);
\r
2576 C = 1 - CDFGamma(1/T, a, b);
\r
2577 mr = b/(C*(a-1))*( 1 - CDFGamma(1/T, a-1, b) );
\r
2580 GaussLegendreRule(&xI, &wI, K);
\r
2581 /* w changes monotonically from 0 to T. */
\r
2582 for(j=0; j<K; j++) {
\r
2583 if(j<K/2) { i = K/2-1-j; sign=-1; }
\r
2584 else { i = j-K/2; sign=1; }
\r
2585 #if(0) /* linear transform */
\r
2587 com.rK[j] = ww = (1+y)*T/2;
\r
2589 #else /* exponential transform */
\r
2593 z = 1 + eT + y - y*eT;
\r
2594 com.rK[j] = ww = -1/c*log(z/2);
\r
2595 dwy = (1 - eT)/(c*z);
\r
2597 if(com.NSsites==NSTgamma || com.NSsites==NSTgamma1)
\r
2598 com.freqK[j] = exp( a*log(b*ww)-lnGa-b*ww )/(ww*C) * p0*wI[i]*dwy;
\r
2600 com.freqK[j] = exp( a*log(b/ww)-lnGa-b/ww ) /(ww*C) * p0*wI[i]*dwy;
\r
2603 printf("\na b T lnGa=%9.5f%9.5f%9.5f %9.5f\nf & w:\n", a,b,T, lnGa);
\r
2604 FOR(j,com.ncatG) printf("%13.5f", com.freqK[j]); FPN(F0);
\r
2605 FOR(j,com.ncatG) printf("%13.5f", com.rK[j]); FPN(F0);
\r
2609 /* For NSsites models, calculates Qfactor_NS, to be used in eigenQcodon().
\r
2610 For branch-site and clade models, calculate Qfactor_NS[] and also
\r
2611 UVRoot for different omega's.
\r
2614 if(com.model == 0) { /* NSsites models */
\r
2615 if(com.aaDist==0) {
\r
2616 if(com.NSsites<NSTgamma || com.NSsites>NSTinvgamma1) /* mr already calculated for truncated models */
\r
2617 for(j=0,mr=0; j<com.ncatG; j++)
\r
2618 mr += com.freqK[j]*com.rK[j];
\r
2620 eigenQcodon(0,-1,&S,&dS,&dN,NULL,NULL,NULL, &Qfactor_NS, com.pkappa, mr, PMat);
\r
2623 for(j=0,Qfactor_NS=0; j<com.ncatG; j++) {
\r
2625 com.pomega = x+k+com.ncatG-1+2*j;
\r
2626 else if(com.aaDist >= FIT1) {
\r
2627 com.pomega = x+k+com.ncatG-1+j*(4+(com.aaDist==FIT2));
\r
2628 xtoy(pcodonClass+j*64, com.pi, com.ncode);
\r
2631 eigenQcodon(0,-1,&S,&dS,&dN,NULL,NULL,NULL, &mr, com.pkappa, com.rK[j], PMat);
\r
2632 Qfactor_NS += com.freqK[j]*mr;
\r
2635 Qfactor_NS = 1/Qfactor_NS;
\r
2636 if(NFunCall==1) printf("Qfactor_NS = %.6f\n", Qfactor_NS);
\r
2638 else if (com.model == NSbranch2) { /* branch&site models */
\r
2639 t = com.freqK[0] + com.freqK[1];
\r
2641 error2("p0 + p1 too small for branch&site model?");
\r
2642 com.freqK[2] = (1-t)*com.freqK[0]/t;
\r
2643 com.freqK[3] = (1-t)*com.freqK[1]/t;
\r
2644 /* calculates scale factors: background branches has two site classes
\r
2645 while foreground branches has 3 site classes */
\r
2647 for(i=0; i<2; i++) { /* i=0 back (2 site classes); i=1 fore (3 classes) */
\r
2648 for(j=0,mr=0; j<(i==0?2:3); j++) {
\r
2649 com.omega = w[i][j];
\r
2651 if(i==0) f = com.freqK[j]/t;
\r
2652 else if(j==2) f = 1-t;
\r
2653 if(NFunCall==1) printf("branch=%d freq=%.6f w%d = %.6f\n", i,f,j,com.omega);
\r
2654 mr += f*com.omega;
\r
2656 Qfactor_NS_branch[i] = -1;
\r
2657 eigenQcodon(0,-1,&S,&dS,&dN,NULL,NULL,NULL, &Qfactor_NS_branch[i], com.pkappa, mr, PMat);
\r
2658 Qfactor_NS_branch[i] = 1/Qfactor_NS_branch[i];
\r
2659 if(NFunCall==1) printf("\t\t\tQfactor for branch %d = %.6f\n", i,Qfactor_NS_branch[i]);
\r
2661 /* calculates 3 sets of U&V&Root vectors (w0,w1,w2), for GetPMatBranch().
\r
2662 No eigenQcodon() calls are needed in ConditionalPNode() or minbranches().
\r
2664 for(i=0; i<3; i++) { /* (w0,w1,w2) */
\r
2665 if(NFunCall==1) printf("w[%d] = %.6f\n", i, w[1][i]);
\r
2667 eigenQcodon(1,-1,NULL,NULL,NULL,_Root[i],_UU[i],_VV[i], &mr, com.pkappa,w[1][i],PMat);
\r
2670 else { /* NSbranch3: Clade models C and D */
\r
2671 /* calculates Qfactor_NS_branch[nbtype]: each branch has K=com.ncatG site classes */
\r
2672 for(i=0; i<com.nbtype; i++) {
\r
2673 for(j=0,mr=0; j<K; j++)
\r
2674 mr += com.freqK[j] * w[i][j];
\r
2675 Qfactor_NS_branch[i] = -1;
\r
2676 eigenQcodon(0,-1,NULL,NULL,NULL,NULL,NULL,NULL, &Qfactor_NS_branch[i], com.pkappa,mr,PMat);
\r
2677 Qfactor_NS_branch[i] = 1/Qfactor_NS_branch[i];
\r
2679 if(NFunCall==1) printf("\t\t\tQfactor for branch=%d = %.6f\n", i,Qfactor_NS_branch[i]);
\r
2681 /* calculates K-1+nbtype sets of U&V&Root vectors (w0,w1,w2, w3,...), for GetPMatBranch().
\r
2683 for(i=0; i<K-1+com.nbtype; i++) {
\r
2685 com.omega = (i < K-1 ? w[0][i] : w[i-K+1][K-1]);
\r
2686 eigenQcodon(1,-1,NULL,NULL,NULL,_Root[i],_UU[i],_VV[i], &mr, com.pkappa,com.omega,PMat);
\r
2693 int Set_UVR_BranchSite (int iclass, int branchlabel)
\r
2695 /* There are 3 different w's in the branch-site models A & B, and nbtype+2
\r
2696 different w's in the clade models C & B, so there are the same number of
\r
2697 sets of U&V&Root. This routine points out the right set.
\r
2701 if(com.model==0 || com.NSsites==0) error2("should not be here.");
\r
2703 if(com.model<=NSbranch2) { /* branch-site models A & B */
\r
2704 if(branchlabel==0) iUVR = iclass%2; /* back, w0 w1 */
\r
2705 else iUVR = (iclass<=1 ? iclass : 2); /* fore, w0 w1 w2 */
\r
2707 else { /* clade models C & D */
\r
2708 if(iclass<com.ncatG-1) iUVR = iclass;
\r
2709 else iUVR = com.ncatG-1 + branchlabel;
\r
2713 Root = _Root[iUVR];
\r
2719 int GetCodonFreqs (void)
\r
2721 /* This is called by SetParameters() and calculates the expected base or codon frequencies
\r
2722 (com.pf3x4[] & com.pi[]) using the parameters under the model com.codonf.
\r
2723 This is used for models in which codon frequency parameters are estimated from
\r
2724 the data by ML. Modified from GetCodonFreqs2().
\r
2725 com.pi[] is modified.
\r
2726 The routine does not work if com.ngene>1.
\r
2728 int n=com.ncode, i,j,k, ic,iaa,b[3];
\r
2729 double *ppi=com.ppi, mutbias[20], y;
\r
2731 if (com.codonf==Fcodon) {
\r
2732 for(i=0; i<n; i++)
\r
2733 com.pi[i] = (i==n-1 ? 1 : exp(com.ppi[i]));
\r
2734 abyx (1./sum(com.pi,n), com.pi, n);
\r
2738 for(j=0;j<3;j++) {
\r
2739 xtoy(ppi, com.pf3x4+j*4, 3);
\r
2740 com.pf3x4[j*4+3] = 1;
\r
2741 abyx (1./sum(com.pf3x4+j*4,4), com.pf3x4+j*4, 4);
\r
2742 if(com.codonf==F3x4 || com.codonf==F3x4MG)
\r
2745 if(com.codonf==FMutSel && com.npi==3) return(0);
\r
2747 if ((com.codonf>=F1x4 && com.codonf<=F3x4MG) || com.npi>3) {
\r
2748 for (i=0; i<n; i++) {
\r
2749 ic=FROM61[i]; b[0]=ic/16; b[1]=(ic/4)%4; b[2]=ic%4;
\r
2750 com.pi[i] = com.pf3x4[b[0]]*com.pf3x4[4+b[1]]*com.pf3x4[8+b[2]];
\r
2754 if (com.codonf==FMutSel && com.npi>3) {
\r
2755 for(i=0; i<n-1; i++) /* last codon has fitness 0 */
\r
2756 com.pi[i] *= exp(com.ppi[3+i]);
\r
2758 else if (com.codonf==FMutSel0 && com.npi>3) {
\r
2759 for(i=0; i<n; i++) { /* last amino acid has fitness 0 */
\r
2760 iaa = GeneticCode[com.icode][FROM61[i]];
\r
2761 if(iaa<19) com.pi[i] *= exp(com.ppi[3+iaa]);
\r
2763 for(i=0,zero(com.piAA,20); i<n; i++)
\r
2764 com.piAA[GeneticCode[com.icode][FROM61[i]]] += com.pi[i];
\r
2765 abyx(1./sum(com.piAA,20), com.piAA, 20);
\r
2767 else if (com.codonf==FMutSel0 && com.npi==3) {
\r
2768 for (i=0,zero(mutbias,20); i<n; i++) {
\r
2770 iaa = GeneticCode[com.icode][ic];
\r
2771 b[0]=ic/16; b[1]=(ic/4)%4; b[2]=ic%4;
\r
2772 mutbias[iaa] += com.pf3x4[b[0]]*com.pf3x4[b[1]]*com.pf3x4[b[2]];
\r
2774 for(i=0; i<n; i++) {
\r
2775 ic=FROM61[i]; iaa = GeneticCode[com.icode][ic];
\r
2776 b[0]=ic/16; b[1]=(ic/4)%4; b[2]=ic%4;
\r
2777 y = com.pf3x4[b[0]]*com.pf3x4[b[1]]*com.pf3x4[b[2]];
\r
2778 com.pi[i] = y/mutbias[iaa] * com.piAA[iaa];
\r
2780 y = sum(com.pi, n);
\r
2782 abyx (1./sum(com.pi,n), com.pi, n);
\r
2787 int SetParameters (double x[])
\r
2789 /* Set com. variables and initialize U, V, Root etc. before each calculation
\r
2790 of the likelihood function.
\r
2791 Is it a good idea to restruct this and/or Getinitials into two parts,
\r
2792 one for aa's and another for codons?
\r
2793 When (com.NSsites==NS02normal || NS3normal), p's are before w's in x[];
\r
2796 int i,j,k, ik=0, nUVR=NBTYPE+2;
\r
2797 double t,w0=-1, mr=0;
\r
2799 if(com.clock>=5) return(0);
\r
2800 if(com.fix_blength<2) SetBranch(x);
\r
2801 if(com.np<=com.ntime) return(0);
\r
2803 if(com.seqtype==1 || com.model==FromCodon || com.aaDist==AAClasses) {
\r
2804 k = com.ntime+com.nrgene;
\r
2805 if(com.hkyREV==0) {
\r
2806 if(com.fix_kappa==1) { com.pkappa[0]=com.kappa; ik=1; }
\r
2807 else com.kappa=x[k];
\r
2809 for(i=0; i<com.nkappa; i++)
\r
2810 com.pkappa[ik++] = x[k++];
\r
2813 com.ppi = x+com.ntime+com.nrgene+com.nkappa;
\r
2816 com.pomega = x+com.ntime+com.nrgene+com.nkappa+com.npi;
\r
2818 for(j=0;j<com.nrgene;j++)
\r
2819 com.rgene[j+1] = x[com.ntime+j];
\r
2820 if(com.clock && AbsoluteRate) com.rgene[0] = x[0]; /* so that rgene are abs rates */
\r
2822 if(com.seqtype==1 && com.aaDist>=FIT1)
\r
2823 getpcodonClass(x, pcodonClass);
\r
2825 k=com.ntime+com.nrgene+com.nkappa+com.npi;
\r
2828 if(!com.model && !com.aaDist && !com.fix_omega && !com.NSsites)
\r
2830 if(com.seqtype==AAseq)
\r
2831 eigenQaa(NULL, Root, U, V, x+com.ntime+com.nrgene);
\r
2832 else if(com.model==0 && com.NSsites==0 && com.Mgene<=1)
\r
2833 eigenQcodon(1,-1,NULL,NULL,NULL,Root,U,V, &mr, com.pkappa, com.omega,PMat);
\r
2834 else if((com.model==NSbranchB || com.model==NSbranch2)
\r
2835 && com.NSsites==0 && com.nbtype<=nUVR) {
\r
2836 for(i=0; i<com.nbtype; i++) {
\r
2837 if(com.aaDist == AAClasses)
\r
2838 com.pomega = x+com.ntime+com.nrgene+com.nkappa+com.npi+i*com.nOmegaType;
\r
2840 w0 = (i==com.nOmega-1&&com.fix_omega?com.omega_fix:com.pomega[i]);
\r
2841 eigenQcodon(1,-1,NULL,NULL,NULL,_Root[i],_UU[i],_VV[i], &mr, com.pkappa,w0,PMat);
\r
2844 k = com.ntime+com.nrgene+com.nrate;
\r
2846 if (com.seqtype==CODONseq && com.NSsites)
\r
2847 SetParametersNSsites(x);
\r
2849 /* to force crash in case or error
\r
2850 if(com.model) com.omega=-1;
\r
2853 /* branch models */
\r
2854 if(com.seqtype==CODONseq && com.model && com.NSsites==0 && com.aaDist==0) {
\r
2855 FOR(j,tree.nnode) {
\r
2856 if (j==tree.root) continue;
\r
2857 if (com.fix_omega && (int)nodes[j].label==com.nOmega-1)
\r
2858 nodes[j].omega = com.omega_fix;
\r
2860 nodes[j].omega = com.pomega[(int)nodes[j].label];
\r
2863 if (!com.fix_alpha && com.NSsites==0) {
\r
2864 com.alpha = x[k++];
\r
2866 DiscreteGamma(com.freqK,com.rK,com.alpha,com.alpha,com.ncatG,DGammaUseMedian);
\r
2868 if (!com.fix_rho) {
\r
2870 AutodGamma(com.MK, com.freqK, com.rK, &t, com.alpha, com.rho, com.ncatG);
\r
2876 int DiscreteNSsites(double par[])
\r
2878 /* This discretizes the continuous distribution for dN/dS ratios among sites
\r
2879 and calculates freqK[] and rK[], using the median method.
\r
2880 par[] contains all paras in the w distribution. par[0] is the
\r
2881 proportion of beta if (com.NSsites==betaw), or the proportion of w=0 if
\r
2882 (com.NSsites=NS02normal).
\r
2883 This routine uses com.NSsites, com.ncatG, com.freqK, com.rK.
\r
2884 betaw has com.ncatG-1 site classes in the beta distribution, and 02normal
\r
2885 has com.ncatG-1 site classes in the mixed normal distribution.
\r
2886 See the function CDFdN_dS() for definitions of parameters.
\r
2888 int status=0, i,j,off, K=com.ncatG-(com.NSsites==NSbetaw || com.NSsites==NS02normal);
\r
2889 double xb[2]={1e-7,99}; /* bounds for omega. */
\r
2890 double p01=0, p, w0, lnbeta;
\r
2892 if(com.NSsites==NSbeta || com.NSsites==NSbetaw) xb[1]=1;
\r
2894 if(com.NSsites==NSbeta || com.NSsites==NSbetaw) {
\r
2895 off = (com.NSsites==NSbetaw); /* par[0] is proportion for beta for M8 */
\r
2896 lnbeta = LnGamma(par[off])+LnGamma(par[off+1])-LnGamma(par[off]+par[off+1]);
\r
2897 for(j=0; j<K; j++) {
\r
2898 p = (j*2.+1)/(2.*K);
\r
2899 com.rK[j] = QuantileBeta(p, par[off], par[off+1], lnbeta);
\r
2903 for(j=0; j<K; j++) {
\r
2904 p = (j*2. + 1)/(2.*K);
\r
2906 if(com.rK[j]) w0 = (w0 + com.rK[j])/2;
\r
2907 com.rK[j] = Quantile(CDFdN_dS, p, w0, par, xb); /* median */
\r
2910 for(j=0; j<K; j++) com.freqK[j] = 1.0/K;
\r
2912 if(com.NSsites==NSbetaw) {
\r
2913 if(!com.fix_omega) com.rK[com.ncatG-1] = par[3];
\r
2914 else com.rK[com.ncatG-1] = com.omega_fix;
\r
2915 com.freqK[K] = 1-par[0];
\r
2916 for(j=0; j<K; j++) com.freqK[j] *= par[0];
\r
2918 if(com.NSsites==NS02normal) {
\r
2919 for(j=K-1;j>=0;j--) /* shift to right by 1 to make room for spike at 0*/
\r
2920 { com.rK[j+1]=com.rK[j]; com.freqK[j+1]=com.freqK[j]; }
\r
2921 com.rK[0]=0; com.freqK[0]=par[0];
\r
2922 for(j=1;j<K+1;j++) com.freqK[j]*=(1-par[0]);
\r
2925 if(com.NSsites>=NSgamma){
\r
2926 if(!status && com.NSsites==NSbeta)
\r
2927 for(j=1;j<com.ncatG;j++) if(com.rK[j]+1e-7<com.rK[j-1]) status=1;
\r
2930 printf("\nwarning: DiscreteNSsites\nparameters: ");
\r
2931 FOR(j,(com.NSsites==7?2:4)) printf(" %12.6f", par[j]); FPN(F0);
\r
2932 FOR(j,com.ncatG) printf("%13.5f", com.freqK[j]); FPN(F0);
\r
2933 FOR(j,com.ncatG) printf("%13.5e", com.rK[j]); FPN(F0);
\r
2940 double CDFdN_dS(double x,double p[])
\r
2942 /* This calculates the CDF of the continuous dN/dS distribution over sites,
\r
2943 to be used as argument to the routine Quantile(). When the distribution
\r
2944 has spikes, the spikes are ignored in this routine, and the scaling
\r
2945 is done outside this routine, for example, in DiscreteNSsites().
\r
2946 All parameters (par) for the w distribution are passed to this routine,
\r
2947 although some (p0 for the spike at 0) are not used in this routine.
\r
2948 Parameters are arranged in the following order:
\r
2950 NSgamma (2): alpha, beta
\r
2951 NS2gamma (4): p0, alpha1, beta1, alpha2 (=beta2)
\r
2952 NSbeta (2): p_beta, q_beta
\r
2953 NSbetaw (4): p0, p_beta, q_beta, w (if !com.fix_omega, not used here)
\r
2954 NSbetagamma (5): p0, p_beta, q_beta, alpha, beta
\r
2955 NSbeta1gamma (5): p0, p_beta, q_beta, alpha, beta (1+gamma)
\r
2956 NSbeta1normal (5): p0, p_beta, q_beta, mu, s (normal>1)
\r
2957 NS02normal (5): p0, p1, mu2, s1, s2 (s are sigma's)
\r
2958 NS3normal (6): p0, p1, mu2, s0, s1, s2 (s are sigma's)
\r
2960 Parameters p0 & p1 are transformed if (!LASTROUND)
\r
2964 double z, f[3],mu[3]={0,1,2},sig[3]; /* 3normal: mu0=0 fixed. mu2 estimated */
\r
2966 switch(com.NSsites) {
\r
2967 case(NSgamma): cdf=CDFGamma(x,p[0],p[1]); break;
\r
2969 cdf=p[0] *CDFGamma(x,p[1],p[2])+(1-p[0])*CDFGamma(x,p[3],p[3]); break;
\r
2970 case(NSbeta): cdf=CDFBeta(x,p[0],p[1],0); break;
\r
2971 case(NSbetaw): cdf=CDFBeta(x,p[1],p[2],0); break;
\r
2972 case(NSbetagamma):
\r
2973 cdf=p[0]*CDFBeta(x,p[1],p[2],0)+(1-p[0])*CDFGamma(x,p[3],p[4]); break;
\r
2975 case(NSbeta1gamma):
\r
2976 if(x<=1) cdf=p[0]*CDFBeta(x,p[1],p[2],0);
\r
2977 else cdf=p[0]+(1-p[0])*CDFGamma(x-1,p[3],p[4]);
\r
2979 case(NSbeta1normal):
\r
2980 if(x<=1) cdf=p[0]*CDFBeta(x,p[1],p[2],0);
\r
2982 cdf=CDFNormal((p[3]-1)/p[4]);
\r
2984 matout(F0,p,1,5);;
\r
2985 printf("PHI(%.6f)=%.6f\n",(p[3]-1)/p[4],cdf); getchar();
\r
2987 cdf=p[0]+(1-p[0])*(1- CDFNormal((p[3]-x)/p[4])/cdf);
\r
2991 mu[2]=p[2]; sig[1]=p[3]; sig[2]=p[4];
\r
2992 f[1]=p[1]; f[2]=1-f[1];
\r
2993 cdf = 1 - f[1]* CDFNormal(-(x-mu[1])/sig[1])/CDFNormal(mu[1]/sig[1])
\r
2994 - f[2]* CDFNormal(-(x-mu[2])/sig[2])/CDFNormal(mu[2]/sig[2]);
\r
2997 mu[2]=p[2]; sig[0]=p[3]; sig[1]=p[4]; sig[2]=p[5];
\r
2999 if(LASTROUND) { f[0]=p[0]; f[1]=p[1]; }
\r
3000 else { z=(f[0]=exp(p[0]))+(f[1]=exp(p[1]))+1; f[0]/=z; f[1]/=z;}
\r
3002 cdf = 1 - f[0]* 2*CDFNormal(-x/sig[0])
\r
3003 - f[1]* CDFNormal(-(x-mu[1])/sig[1])/CDFNormal(mu[1]/sig[1])
\r
3004 - f[2]* CDFNormal(-(x-mu[2])/sig[2])/CDFNormal(mu[2]/sig[2]);
\r
3011 void GetSNphysical(double pi[], double *Sphysical, double *Nphysical, double *S4)
\r
3013 /* this calculates the synonymous and nonsynonymous sites according to the
\r
3014 physical-site definition (Yang 2006 Computational Molecular Evolution, Section 2.5.4).
\r
3015 S and N are sites per codon.
\r
3016 It is not clear how to deal with stop codons.
\r
3018 int i,j,k, ic,b[3], aa0,aa1, *code=GeneticCode[com.icode];
\r
3019 int by[3]={16,4,1}, nstop,s,n;
\r
3022 for(i=0,*Sphysical=*Nphysical=*S4=0; i<com.ncode; i++) {
\r
3023 ic=FROM61[i]; b[0]=ic/16; b[1]=(ic/4)%4; b[2]=ic%4;
\r
3024 /* no need to check the first and second positions here */
\r
3025 if(FourFold[b[0]][b[1]]) *S4 += pi[i];
\r
3028 for(j=0,s=n=nstop=0; j<3; j++) FOR(k,3) {
\r
3029 aa1 = code[ic + ((b[j]+k+1)%4 - b[j])*by[j]];
\r
3030 if(aa1==-1) nstop++;
\r
3031 else if(aa0==aa1) s++;
\r
3035 *Sphysical += pi[i]*s/9.*3.;
\r
3036 *Nphysical += pi[i]*n/9.*3.;
\r
3038 y = (*Sphysical + *Nphysical)/3;
\r
3039 *Sphysical /= y; *Nphysical /= y;
\r
3043 double GetOmega (int aa1, int aa2, double omega, double pomega[])
\r
3045 /* this gets the omega (w) value under different models for eigenQcodon().
\r
3047 double w=1, fit1,fit2;
\r
3050 if (com.aaDist==AAClasses) {
\r
3051 if (aa1<aa2) { k=aa2; aa2=aa1; aa1=k; }
\r
3052 k=aa1*(aa1-1)/2+aa2;
\r
3053 if (pomega[OmegaAA[k]]<0) {
\r
3054 if (noisy) printf("aa1 & aa2 & iw & w: %d %d %d %.5f\n",
\r
3055 aa1,aa2,OmegaAA[k],pomega[OmegaAA[k]]);
\r
3056 pomega[OmegaAA[k]]=0;
\r
3058 if (com.seqtype==AAseq && com.nrate>65 && aa1*20+aa2==ijAAref)
\r
3059 ; /* if estimating grantham's matrix with aa sequences */
\r
3060 else w = pomega[OmegaAA[k]];
\r
3062 else if (com.aaDist==0) w = omega; /* NSsites==0 or >0 */
\r
3063 else if (com.aaDist<=6) { /* chemical properties: a & b */
\r
3064 w = pomega[0]*com.daa[aa1*20+aa2];
\r
3065 if(com.aaDist>0) w = exp(-w); /* geometric */
\r
3066 else w = 1-w; /* linear */
\r
3067 if (com.seqtype==CODONseq) w *= pomega[1];
\r
3069 else if (com.aaDist>=FIT1) { /* ap,p*,av,v* (and w0 for FIT2) */
\r
3070 fit1 = -pomega[0]*square(AAchem[0][aa1]-pomega[1])
\r
3071 -pomega[2]*square(AAchem[1][aa1]-pomega[3]);
\r
3072 fit2 = -pomega[0]*square(AAchem[0][aa2]-pomega[1])
\r
3073 -pomega[2]*square(AAchem[1][aa2]-pomega[3]);
\r
3075 w = exp(-fit1-fit2);
\r
3076 if(com.aaDist==FIT2) w *= pomega[4];
\r
3083 double GetMutationMultiplier (int i, int j, int pos, int from[3], int to[3])
\r
3085 /* This sets the mutation-bias multipliers for F1x4MG, F3x4MG, FMutSel0, FMutSel.
\r
3086 com.pi[], com.pf3x4[], and com.piAA[] are set correctly before this routine is called.
\r
3088 int n=com.ncode, b1,b2;
\r
3089 double q, eFit1, eFit2, small=min2(1e-6, 1./com.ls);
\r
3091 /* b1 and b2 are the 2 unchanged positions */
\r
3092 if (pos==0) { b1=1; b2=2; }
\r
3093 else if(pos==1) { b1=2; b2=0; }
\r
3094 else { b1=0; b2=1; }
\r
3095 q = 1 / (com.pf3x4[b1*4+to[b1]] * com.pf3x4[b2*4+to[b2]]);
\r
3097 if(com.npi && (com.codonf==FMutSel || com.codonf==FMutSel0)) {
\r
3098 eFit1 = max2(com.pi[i], small);
\r
3099 eFit2 = max2(com.pi[j], small);
\r
3100 eFit1 /= com.pf3x4[from[0]] * com.pf3x4[from[1]] * com.pf3x4[from[2]];
\r
3101 eFit2 /= com.pf3x4[ to[0]] * com.pf3x4[ to[1]] * com.pf3x4[to[2]];
\r
3103 if(fabs(eFit2-eFit1)>1e-10)
\r
3104 q *= (log(eFit2)-log(eFit1))/(eFit2-eFit1);
\r
3112 int SelectionCoefficients (FILE* fout, double kappa[], double ppi[], double omega)
\r
3114 /* This calculates the distribution of S or 2Ns under the FMutSel or FMutSel0 models.
\r
3115 Qsubw[] is not correct if (com.NSsites) and the results are not printed.
\r
3117 int n=Nsensecodon, i,j,k, ic1,ic2,b1,b2;
\r
3118 int ndiff,pos=0,from[3],to[3];
\r
3119 double q, summut=0, summutp=0, sumsub=0, sumsubw=0, eF1,eF2, fb[4];
\r
3120 double bigS=2, sumbadmut=0,sumgoodmut=0;
\r
3121 double Qmut[NCODE*NCODE], Qsub[NCODE*NCODE], Qsubw[NCODE*NCODE], Ns[NCODE*NCODE], mNs=0,mNsp=0,mNsn=0;
\r
3122 double maxNs=0, fNsMut[50]={0}, fNsSub[50]={0}, fNsSubw[50]={0}, small=min2(1e-6, 1./com.ls);
\r
3125 if(com.codonf<FMutSel0)
\r
3126 error2("codonf incorrect");
\r
3128 fprintf(fout, "\nI\tJ\tij\t2Ns_IJ\tpMut_IJ\tpSub_IJ\t2Ns_JI\tpMut_JI\tpSub_JI\n\n");
\r
3129 fb[0]=ppi[0]; fb[1]=ppi[1]; fb[2]=ppi[2]; fb[3]=1;
\r
3130 for (i=0;i<n*n;i++) Qmut[i]=Qsub[i]=Qsubw[i]=0;
\r
3131 for (i=1; i<n; i++) {
\r
3132 ic1=FROM61[i]; from[0]=ic1/16; from[1]=(ic1/4)%4; from[2]=ic1%4;
\r
3133 for(j=0; j<i; j++) {
\r
3134 ic2=FROM61[j]; to[0]=ic2/16; to[1]=(ic2/4)%4; to[2]=ic2%4;
\r
3135 for(k=0,ndiff=0; k<3; k++)
\r
3136 if(from[k]!=to[k]) { ndiff++; pos=k; }
\r
3137 if(ndiff!=1) continue;
\r
3139 if(com.hkyREV) { /* REV-GTR model */
\r
3140 b1 = min2(from[pos],to[pos]); /* b1 and b2 are changed nucleotides */
\r
3141 b2 = max2(from[pos],to[pos]);
\r
3142 if (b1==0 && b2==1) q = kappa[0]; /* TC or CT, relative to AG */
\r
3143 else if (b1==0 && b2==2) q = kappa[1]; /* TA or AT */
\r
3144 else if (b1==0 && b2==3) q = kappa[2]; /* TG or GT */
\r
3145 else if (b1==1 && b2==2) q = kappa[3]; /* CA or AC */
\r
3146 else if (b1==1 && b2==3) q = kappa[4]; /* CG or GC */
\r
3148 else { /* HKY model */
\r
3149 if(from[pos]+to[pos]==1 || from[pos]+to[pos]==5)
\r
3153 eF1 = max2(com.pi[i], small) / (fb[from[0]] * fb[from[1]] * fb[from[2]]);
\r
3154 eF2 = max2(com.pi[j], small) / (fb[ to[0]] * fb[ to[1]] * fb[to[2]]);
\r
3156 Ns[i*n+j] = log(eF2/eF1);
\r
3157 Ns[j*n+i] = -Ns[i*n+j];
\r
3159 if(maxNs < fabs(Ns[i*n+j])) maxNs = fabs(Ns[i*n+j]);
\r
3161 Qmut[i*n+j] = Qsub[i*n+j] = com.pi[i] * q * fb[ to[pos]];
\r
3162 Qmut[j*n+i] = Qsub[j*n+i] = com.pi[j] * q * fb[from[pos]];
\r
3164 if(fabs(Ns[i*n+j]) > 1e-20) { /* non-neutral mutations */
\r
3165 Qsub[i*n+j] *= Ns[i*n+j]/(1 - exp(-Ns[i*n+j]));
\r
3166 Qsub[j*n+i] *= Ns[j*n+i]/(1 - exp(-Ns[j*n+i]));
\r
3168 Qsubw[i*n+j] = Qsub[i*n+j];
\r
3169 Qsubw[j*n+i] = Qsub[j*n+i];
\r
3170 if(!com.NSsites && GeneticCode[com.icode][ic1] != GeneticCode[com.icode][ic2]) {
\r
3171 Qsubw[i*n+j] *= com.omega;
\r
3172 Qsubw[j*n+i] *= com.omega;
\r
3175 summut += Qmut[i*n+j] + Qmut[j*n+i];
\r
3176 sumsub += Qsub[i*n+j] + Qsub[j*n+i];
\r
3177 sumsubw += Qsubw[i*n+j] + Qsubw[j*n+i];
\r
3179 if(fabs(Ns[i*n+j]) > 1e-20) { /* non-neutral mutations */
\r
3180 summutp += (Ns[i*n+j]>0 ? Qmut[i*n+j] : Qmut[j*n+i]);
\r
3181 mNsp += (Ns[i*n+j]>0 ? Qmut[i*n+j]*Ns[i*n+j] : Qmut[j*n+i]*Ns[j*n+i]);
\r
3182 mNsn += (Ns[i*n+j]<0 ? Qmut[i*n+j]*Ns[i*n+j] : Qmut[j*n+i]*Ns[j*n+i]);
\r
3184 else { /* neutral mutation. Ns = 0 makes no contribution to mNsp & mNsn */
\r
3185 summutp += (Qmut[i*n+j] + Qmut[j*n+i])/2;
\r
3187 mNs += (Qmut[i*n+j]+Qmut[j*n+i])*fabs(Ns[i*n+j]);
\r
3189 if (fabs(Ns[i*n+j])>bigS) {
\r
3190 if (Ns[i*n+j]>0) {
\r
3191 sumgoodmut += Qmut[i*n+j];
\r
3192 sumbadmut += Qmut[j*n+i];
\r
3195 sumgoodmut += Qmut[j*n+i];
\r
3196 sumbadmut += Qmut[i*n+j];
\r
3200 fprintf(fout, "%c%c%c\t", BASEs[from[0]],BASEs[from[1]],BASEs[from[2]]);
\r
3201 fprintf(fout, "%c%c%c\t", BASEs[ to[0]],BASEs[ to[1]],BASEs[ to[2]]);
\r
3202 fprintf(fout, "%c%c", BASEs[from[pos]],BASEs[to[pos]]);
\r
3203 fprintf(fout, "\t%.5f\t%.5f\t%.5f", Ns[i*n+j], Qmut[i*n+j], Qsub[i*n+j]);
\r
3204 fprintf(fout, "\t%.5f\t%.5f\t%.5f", Ns[j*n+i], Qmut[j*n+i], Qsub[j*n+i]);
\r
3207 fprintf(fout, "\t%.5f\t%.5f", Qsubw[i*n+j], Qsubw[j*n+i]);
\r
3214 sumgoodmut /= summut;
\r
3215 sumbadmut /= summut;
\r
3218 mNsn /= summut-summutp;
\r
3220 fprintf(fout, "\n\nHistograms\n2Ns\tFMut\tFSub(CodonUsage)\tFSubw(after w)\n\n");
\r
3222 for(i=0; i<n; i++) {
\r
3223 for(j=0; j<n; j++) {
\r
3224 if(Qmut[i*n+j] == 0) continue;
\r
3225 for(k=0; k<ncat-1; k++) {
\r
3226 if(Ns[i*n+j] < (-1 + (k+1.)*2/ncat)*maxNs) break;
\r
3228 fNsMut[k] += Qmut[i*n+j]/summut;
\r
3229 fNsSub[k] += Qsub[i*n+j]/sumsub;
\r
3230 fNsSubw[k] += Qsubw[i*n+j]/sumsubw;
\r
3233 for(k=0; k<ncat; k++) {
\r
3234 fprintf(fout, "%.5f\t%.5f\t%.5f", (-1 + (k+0.5)*2/ncat)*maxNs, fNsMut[k], fNsSub[k]);
\r
3236 fprintf(fout, "\t%.5f", fNsSubw[k]);
\r
3240 fprintf(fout, "\nProportion of advantageous (S > 0) mutations:\n %.5f\n", summutp/summut);
\r
3241 fprintf(fout, "\nProportions of good & bad mutations (|S| > %.4f) among mutations:\n%.5f %.5f\n",
\r
3242 bigS, sumgoodmut, sumbadmut);
\r
3243 fprintf(fout, "\nmean |Ns| = %.5f\tmean Ns+ = %.5f\tmean Ns- = %.5f\n", mNs,mNsp,mNsn);
\r
3245 fprintf(frst1, "\t%.4f\t%.4f\t%.4f", mNs, mNsp, mNsn);
\r
3251 int eigenQcodon (int mode, double blength, double *S, double *dS, double *dN,
\r
3252 double Root[], double U[], double V[], double *meanrate, double kappa[], double omega, double Q[])
\r
3254 /* This contructs the rate matrix Q for codon substitution and gets the eigen
\r
3255 values and vectors if getstats==0, or get statistics (dS & dN etc.) if
\r
3257 The routine is also called by Qcodon2aa for mechanistic amino acid
\r
3258 substitution models.
\r
3259 Input parameters are kappa, omega and com.pi (or com.fb61).
\r
3261 Statistics calculated include S, dS & dN.
\r
3262 c0[0,1,2] and c[0,1,2] are rates for the 3 codon positions before and after
\r
3263 selection. c4 is for 4-fold rates. ts[3] and tv[3] are transition/
\r
3264 transversion rates for the three codon positions, not calculated.
\r
3266 mode=0: construct Q; 1: calculate UVRoot; 2:calculate statistics
\r
3268 *Qfactor or *meanrate:
\r
3269 =0 means that Q is scaled as usual;
\r
3270 <0 means that the scale factor will be calculated and returned
\r
3271 >0 the given scale factor is applied (1 means no scaling).
\r
3273 Note that under NSsites or branch&site models, scaling is done for all Q
\r
3274 matrices for the whole branch.
\r
3276 aaDist=FIT1 & FIT2: ap,p*,av,v*, (and w0 for FIT2)
\r
3277 The argument omega is used only if the model assumes one omega. For
\r
3278 AAClasses, com.pomega is used instead.
\r
3280 int n=Nsensecodon, i,j,k, ic1,ic2,aa1,aa2, b1,b2;
\r
3281 int ndiff,pos=0,from[3],to[3];
\r
3282 double q, mr, rs0,ra0,rs,ra, y;
\r
3283 double Sphysical, Nphysical, S4, dSnew, dNnew;
\r
3284 double d4=0, d0[3], d[3], ts[3], tv[3]; /* rates at positions and 4-fold sites */
\r
3285 double *pi=(com.seqtype==AAseq?com.fb61:com.pi), w=-1, piQij;
\r
3286 double space[NCODE*(NCODE+1)];
\r
3288 /* Delete this after the MutSel project. */
\r
3289 static int times=0;
\r
3290 if(mode==1) times=0;
\r
3294 if(blength>=0 && (S==NULL||dS==NULL||dN==NULL)) error2("eigenQcodon");
\r
3295 for (i=0;i<n*n;i++) Q[i]=0;
\r
3296 for (i=1; i<n; i++) {
\r
3297 ic1=FROM61[i]; from[0]=ic1/16; from[1]=(ic1/4)%4; from[2]=ic1%4;
\r
3298 for(j=0; j<i; j++) {
\r
3300 ic2=FROM61[j]; to[0]=ic2/16; to[1]=(ic2/4)%4; to[2]=ic2%4;
\r
3301 for(k=0,ndiff=0; k<3; k++)
\r
3302 if(from[k]!=to[k]) { ndiff++; pos=k; }
\r
3303 if(ndiff!=1) continue;
\r
3305 if(com.hkyREV) { /* REV-GTR model */
\r
3306 b1 = min2(from[pos],to[pos]); /* b1 and b2 are changed nucleotides */
\r
3307 b2 = max2(from[pos],to[pos]);
\r
3308 if (b1==0 && b2==1) q = kappa[0]; /* TC or CT, relative to AG */
\r
3309 else if (b1==0 && b2==2) q = kappa[1]; /* TA or AT */
\r
3310 else if (b1==0 && b2==3) q = kappa[2]; /* TG or GT */
\r
3311 else if (b1==1 && b2==2) q = kappa[3]; /* CA or AC */
\r
3312 else if (b1==1 && b2==3) q = kappa[4]; /* CG or GC */
\r
3314 else { /* HKY model */
\r
3315 if(from[pos]+to[pos]==1 || from[pos]+to[pos]==5)
\r
3318 if (com.codonf>=F1x4MG && com.codonf<=FMutSel && com.codonf!=Fcodon)
\r
3319 q *= GetMutationMultiplier (i, j, pos, from, to);
\r
3321 aa1 = GeneticCode[com.icode][ic1];
\r
3322 aa2 = GeneticCode[com.icode][ic2];
\r
3324 q *= GetOmega(aa1, aa2, omega, com.pomega);
\r
3325 Q[i*n+j] = q*pi[j];
\r
3326 Q[j*n+i] = q*pi[i];
\r
3331 for (i=0; i<n; i++)
\r
3332 Q[i*n+i] = -sum(Q+i*n,n);
\r
3333 for (i=0,mr=0; i<n; i++)
\r
3334 mr -= pi[i]*Q[i*n+i];
\r
3336 if(mode==1) { /* get Root, U, & V */
\r
3337 if (com.seqtype==AAseq) return (0);
\r
3338 eigenQREV(Q, pi, n, Root, U, V, space);
\r
3339 if(*meanrate>= 0) { /* apply scaling if meanrate>0 */
\r
3342 for (i=0; i<n; i++)
\r
3346 else if(mode==2) { /* get statistics */
\r
3347 for(i=0;i<3;i++) d[i] = d0[i] = ts[i] = tv[i]=0;
\r
3348 rs0 = ra0 = rs = ra = 0;
\r
3349 for (i=0; i<n; i++) {
\r
3350 ic1=FROM61[i]; from[0]=ic1/16; from[1]=(ic1/4)%4; from[2]=ic1%4;
\r
3351 for(j=0; j<n; j++) {
\r
3352 if(i==j || Q[i*n+j]==0) continue;
\r
3353 ic2=FROM61[j]; to[0]=ic2/16; to[1]=(ic2/4)%4; to[2]=ic2%4;
\r
3354 aa1 = GeneticCode[com.icode][ic1];
\r
3355 aa2 = GeneticCode[com.icode][ic2];
\r
3356 for(k=0,ndiff=0; k<3; k++)
\r
3357 if(from[k] != to[k]) { ndiff++; pos=k; }
\r
3358 if(ndiff!=1) error2("jgl");
\r
3360 piQij = pi[i]*Q[i*n+j];
\r
3361 if(pos==2 && FourFold[to[0]][to[1]])
\r
3370 w = GetOmega(aa1, aa2, omega, com.pomega);
\r
3372 d0[pos] += piQij/w;
\r
3378 if(fabs(mr-(rs+ra)) > 1e-6)
\r
3379 error2("mr should be = rs+ra");
\r
3382 w = (rs0+ra0); rs0 /= w; ra0 /= w; *S = rs0*3*com.ls;
\r
3383 if(com.NSsites==0 && blength>=0) { /* calculates dS & dN */
\r
3384 if(blength==0) *dS = *dN = 0;
\r
3387 *dS = blength*rs/(3*rs0);
\r
3388 *dN = blength*ra/(3*ra0);
\r
3389 w = (*dS>0 ? *dN/ *dS : -1);
\r
3390 GetSNphysical(com.pi, &Sphysical, &Nphysical, &S4);
\r
3391 for(i=0;i<3;i++) {
\r
3392 d[i] *= blength/mr;
\r
3393 d0[i] *= blength/mr;
\r
3395 d4 *= blength/mr/S4;
\r
3396 dNnew = blength*ra/Nphysical;
\r
3397 dSnew = blength*rs/Sphysical;
\r
3400 printf("\nd123[*] =%9.5f%9.5f%9.5f average%9.5f\n", d[0],d[1],d[2], (d[0]+d[1]+d[2])/3);
\r
3401 printf( " [B] =%9.5f%9.5f%9.5f average%9.5f\n", d0[0],d0[1],d0[2], (d0[0]+d0[1]+d0[2])/3);
\r
3402 printf("accept =%9.5f%9.5f%9.5f\n\n", d[0]/d0[0],d[1]/d0[1],d[2]/d0[2]);
\r
3403 printf("w =%9.5f dN =%9.5f dS =%9.5f d4 =%9.5f (%.1f four-fold sites)\n", w, *dN,*dS, d4, S4*com.ls);
\r
3404 printf("%12s dN*=%9.5f dS*=%9.5f S* =%7.2f N* =%7.2f\n", "", dNnew, dSnew, Sphysical*com.ls, Nphysical*com.ls);
\r
3407 /* print out dN* dS* d4 d3B */
\r
3408 if(com.verbose && times==1 && com.ns==2)
\r
3409 fprintf(frst1, "\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f",
\r
3410 *dN*2, *dS*2, dNnew*2, dSnew*2, d0[2]*2, d4*2);
\r
3412 else if (com.NSsites) {
\r
3418 if(*meanrate<0) *meanrate = mr;
\r
3423 int eigenQaa (FILE *fout, double Root[], double U[], double V[], double rate[])
\r
3425 /* Codon-based AA model must use FromCodon, even if com.aaDist==AAClasses.
\r
3427 int naa=20, i,j,k;
\r
3428 double Q[20*20], mr=0, t=0;
\r
3429 double space[NCODE*NCODE*2+NCODE],*Qc=space+NCODE*NCODE, *space_pisqrt=Qc+NCODE*NCODE;
\r
3430 char aa3[4]="", AAratefile[96]="AAratefile.dat";
\r
3433 for(i=0; i<naa*naa; i++) Q[i]=0;
\r
3434 switch (com.model) {
\r
3435 case (Poisson) : case (EqualInput) :
\r
3436 fillxc (Q, 1., naa*naa); break;
\r
3437 case (Empirical) : case (Empirical_F):
\r
3438 for(i=0; i<naa; i++) for(j=0; j<i; j++)
\r
3439 Q[i*naa+j]=Q[j*naa+i]=com.daa[i*naa+j];
\r
3441 case (FromCodon): /* eigenQcodon check mode value */
\r
3442 eigenQcodon(0,-1,NULL,NULL,NULL,Root,U,V, &mr,
\r
3443 (com.hkyREV||com.codonf==FMutSel?rate:&com.kappa),com.omega,Qc);
\r
3444 Qcodon2aa(Qc, com.fb61, Q, space);
\r
3447 for (i=1,k=0; i<naa; i++)
\r
3448 for (j=0; j<i; j++)
\r
3449 if (AA1STEP[i*(i-1)/2+j] && i*naa+j!=ijAAref)
\r
3450 Q[i*naa+j] = Q[j*naa+i] = rate[k++];
\r
3452 Q[(k/naa)*naa+k%naa] = Q[(k%naa)*naa+k/naa] = 1;
\r
3455 for (i=0,k=0; i<naa; i++)
\r
3456 for (j=0; j<i; j++)
\r
3457 if (i*naa+j != ijAAref) Q[i*naa+j] = Q[j*naa+i] = rate[k++];
\r
3458 Q[ijAAref] = Q[(ijAAref%naa)*naa+(ijAAref/naa)] = 1;
\r
3461 for(i=0; i<naa; i++) for(j=0; j<naa; j++)
\r
3462 Q[i*naa+j] *= com.pi[j];
\r
3463 for (i=0,mr=0; i<naa; i++) {
\r
3465 Q[i*naa+i] = -sum(Q+i*naa,naa);
\r
3466 mr -= com.pi[i]*Q[i*naa+i];
\r
3469 if (fout && com.model>=REVaa_0) {
\r
3470 printf("\nAA substitution rate matrix printed into %s\n", AAratefile);
\r
3471 fAArate=(FILE*)gfopen(AAratefile,"w");
\r
3472 fprintf (fout, "\n\nRate matrix (symmetrical part, Sij)\n");
\r
3473 for(i=0,t=0; i<naa; i++) {
\r
3474 if(com.pi[i]==0) error2("eigenQaa: do this now");
\r
3475 for(j=0; j<i; j++)
\r
3476 t += Q[i*naa+j]/com.pi[j]/(naa*(naa-1)/2.);
\r
3478 for(i=0; i<naa; i++) {
\r
3479 fprintf (fout, "\n%-5s", getAAstr(aa3,i));
\r
3480 for(j=0; j<i; j++) fprintf(fout, " %8.2f", Q[i*naa+j]/t/com.pi[j]*100);
\r
3481 for(j=0; j<i; j++) fprintf(fAArate, " %8.2f", Q[i*naa+j]/t/com.pi[j]*100);
\r
3484 fputs("\n ",fout);
\r
3485 for(i=0; i<naa; i++)
\r
3486 fprintf(fout,"%5s", getAAstr(aa3,i));
\r
3489 matout(fAArate, com.pi, 1, naa);
\r
3490 for(i=0; i<naa; i++)
\r
3491 fprintf(fAArate,"%12s", getAAstr(aa3,i));
\r
3493 fprintf(fAArate,"\n\nNote: Amino acid rate matrix estimated from %s\n", com.seqf);
\r
3497 if (fout && frst1 && com.model>=REVaa_0) {
\r
3498 fprintf(frst1, "\nRate matrix (symmetrical part, Sij) for bubble plot\n");
\r
3499 for(i=0; i<naa; i++) for(j=0; j<i; j++)
\r
3500 fprintf(frst1, "\t%d\t%d\t%.2f\n", i+1,j+1,Q[i*naa+j]/t/com.pi[j]*100);
\r
3503 eigenQREV(Q, com.pi, naa, Root, U, V, space_pisqrt);
\r
3504 for(i=0; i<naa; i++)
\r
3505 Root[i] = Root[i]/mr;
\r
3510 int Qcodon2aa (double Qc[], double pic[], double Qaa[], double piaa[])
\r
3514 This routine constructs the rate matrix for amino acid replacement from
\r
3515 the rate matrix for codon substitution, by congregating states in the
\r
3516 Markov chain. Both processes are time reversible, and only the
\r
3517 symmetrical part of the rate matrix are constructed. Codon frequencies
\r
3518 pic[] are used. They are constructed by assigning equal frequencies for
\r
3519 synonymous codons in the routine AA2Codonf().
\r
3520 Qaa(aai,aaj) = SUMi SUMj (piC[i]*piC[j]]*Qc[i][j]) / (piAA[i]*piAA[j])
\r
3522 int i, j, aai, aaj, nc=Nsensecodon, naa=20;
\r
3526 zero(Qaa,naa*naa);
\r
3527 for(i=0; i<nc; i++)
\r
3528 piaa[GeneticCode[com.icode][FROM61[i]]] += pic[i];
\r
3529 for(i=0; i<nc; i++) {
\r
3530 aai = GeneticCode[com.icode][FROM61[i]];
\r
3531 if(piaa[aai]==0) ti = 0;
\r
3532 else ti = pic[i]/piaa[aai];
\r
3533 for(j=0; j<i; j++) {
\r
3534 aaj = GeneticCode[com.icode][FROM61[j]];
\r
3535 if (Qc[i*nc+j]==0 || aai==aaj) continue;
\r
3539 tij = ti*pic[j]*Qc[i*nc+j]/piaa[aaj];
\r
3540 Qaa[aai*naa+aaj] += tij;
\r
3541 Qaa[aaj*naa+aai] += tij;
\r
3550 int ConditionalPNode (int inode, int igene, double x[])
\r
3552 int n=com.ncode, i,j,k,h, ison, pos0=com.posG[igene], pos1=com.posG[igene+1];
\r
3555 for(i=0; i<nodes[inode].nson; i++)
\r
3556 if(nodes[nodes[inode].sons[i]].nson>0 && !com.oldconP[nodes[inode].sons[i]])
\r
3557 ConditionalPNode(nodes[inode].sons[i], igene, x);
\r
3560 for(h=pos0*n; h<pos1*n; h++)
\r
3561 nodes[inode].conP[h] = 0; /* young ancestor */
\r
3563 for(h=pos0*n; h<pos1*n; h++)
\r
3564 nodes[inode].conP[h] = 1;
\r
3565 if (com.cleandata && inode<com.ns)
\r
3566 for(h=pos0; h<pos1; h++)
\r
3567 nodes[inode].conP[h*n+com.z[inode][h]] = 1;
\r
3569 for (i=0; i<nodes[inode].nson; i++) {
\r
3570 ison = nodes[inode].sons[i];
\r
3571 t = nodes[ison].branch * _rateSite;
\r
3573 if(com.clock) t *= GetBranchRate(igene,(int)nodes[ison].label,x,NULL);
\r
3574 else t *= com.rgene[igene];
\r
3577 GetPMatBranch(PMat, x, t, ison);
\r
3579 if (nodes[ison].nson<1 && com.cleandata) { /* tip && clean */
\r
3580 for(h=pos0; h<pos1; h++)
\r
3581 for(j=0; j<n; j++)
\r
3582 nodes[inode].conP[h*n+j] *= PMat[j*n+com.z[ison][h]];
\r
3584 else if (nodes[ison].nson<1 && !com.cleandata) { /* tip & unclean */
\r
3585 for(h=pos0; h<pos1; h++)
\r
3586 for(j=0; j<n; j++) {
\r
3587 for(k=0,t=0; k<nChara[com.z[ison][h]]; k++)
\r
3588 t += PMat[j*n+CharaMap[com.z[ison][h]][k]];
\r
3589 nodes[inode].conP[h*n+j] *= t;
\r
3592 else { /* internal node */
\r
3593 for(h=pos0; h<pos1; h++)
\r
3594 for(j=0; j<n; j++) {
\r
3595 for(k=0,t=0; k<n; k++)
\r
3596 t += PMat[j*n+k]*nodes[ison].conP[h*n+k];
\r
3597 nodes[inode].conP[h*n+j] *= t;
\r
3601 } /* for (ison) */
\r
3602 if(com.NnodeScale && com.nodeScale[inode])
\r
3603 NodeScale(inode, pos0, pos1);
\r
3609 int PMatJC69like (double P[], double t, int n)
\r
3612 double pii=1./n+(1.-1./n)*exp(-n/(n-1.)*t), pij=(1.-pii)/(n-1.);
\r
3613 for(i=0; i<n*n; i++) P[i] = pij;
\r
3614 for(i=0; i<n; i++) P[i*n+i] = pii;
\r
3619 int Fcodon_3x4 (double fcodon[], double fb3x4[]);
\r
3620 void OutFb3x4(FILE*fout, double fb3x4[]);
\r
3621 void CountCodons (FILE *fout,double fcodonsg[],double fb3x4sg[],double fb4g[]);
\r
3623 int Fcodon_3x4(double fcodon[], double fb3x4[])
\r
3625 /* this converts the codon frequencies into a fb3x4 table. fcodon has 64 codons.
\r
3627 int b[3], k,j, nc=64, status=0;
\r
3631 for(k=0; k<nc; k++) {
\r
3632 b[0]=k/16; b[1]=(k%16)/4; b[2]=k%4;
\r
3633 for(j=0; j<3; j++) {
\r
3634 fb3x4[j*4+b[j]] += fcodon[k];
\r
3637 for(j=0; j<3; j++) {
\r
3638 t = sum(fb3x4+j*4, 4);
\r
3639 if(t<1e-20) status=-1;
\r
3640 abyx(1/t, fb3x4+j*4, 4);
\r
3645 void OutFb3x4 (FILE*fout, double fb3x4[])
\r
3648 for(j=0; j<3; j++) {
\r
3649 fprintf(fout, "\nposition %2d:", j+1);
\r
3651 fprintf(fout,"%5c:%7.5f", BASEs[k],fb3x4[j*4+k]);
\r
3653 fprintf(fout,"\nAverage ");
\r
3654 for(k=0; k<4; k++)
\r
3655 fprintf(fout,"%5c:%7.5f", BASEs[k],(fb3x4[0*4+k]+fb3x4[1*4+k]+fb3x4[2*4+k])/3);
\r
3659 void CountCodons (FILE *fout,double fcodonsg[],double fb3x4sg[],double fb4g[])
\r
3661 /* Outputs codon counts and f3x4 tables, called from InitializeCodon(), where
\r
3662 more notes are found.
\r
3664 int h, j,k, nc=NCODE, ig, wname=15, nb[3], ib[3][4], ic, nempty, status=0;
\r
3666 /* counts codons for output, species first, genes next */
\r
3667 fputs("Codon usage in sequences\n",fout);
\r
3668 zero(fcodonsg, com.ns*nc);
\r
3669 for(j=0; j<com.ns; j++) {
\r
3670 for(h=0; h<com.npatt; h++) {
\r
3671 for(k=0; k<3; k++)
\r
3672 NucListall(CODONs[com.z[j][h]][k], &nb[k], ib[k]);
\r
3673 k = nb[0]*nb[1]*nb[2];
\r
3675 ic = ib[0][0]*16+ib[1][0]*4+ib[2][0];
\r
3676 fcodonsg[j*nc+ic] += com.fpatt[h];
\r
3678 status += Fcodon_3x4(fcodonsg+j*nc, fb3x4sg+j*12);
\r
3680 if(-status/(double)com.ns > 0.9) {
\r
3681 printf("\n%d out of %d sequences do not have any resolved nucleotides. Giving up.\n",-status,com.ns);
\r
3684 printcums(fout, com.ns, fcodonsg, com.icode);
\r
3685 fputs("Codon position x base (3x4) table for each sequence.",fout);
\r
3686 for(j=0; j<com.ns; j++) {
\r
3687 fprintf (fout,"\n\n#%d: %-*s", j+1,wname,com.spname[j]);
\r
3688 OutFb3x4(fout, fb3x4sg+j*12);
\r
3691 zero(fcodonsg, (com.ngene+1)*nc);
\r
3692 zero(fb4g, (com.ngene+1)*4);
\r
3693 for(ig=0; ig<com.ngene; ig++) {
\r
3694 for(j=0; j<com.ns; j++) {
\r
3695 for(h=com.posG[ig]; h<com.posG[ig+1]; h++) {
\r
3696 for(k=0; k<3; k++)
\r
3697 NucListall(CODONs[com.z[j][h]][k], &nb[k], ib[k]);
\r
3699 k = nb[0]*nb[1]*nb[2];
\r
3701 ic = ib[0][0]*16+ib[1][0]*4+ib[2][0];
\r
3702 fcodonsg[ig*nc+ic] += com.fpatt[h];
\r
3705 if(Fcodon_3x4(fcodonsg+ig*nc, fb3x4sg+ig*12)) {
\r
3706 printf("All sequences are empty? ");
\r
3707 if(com.ngene>1) printf(" in Gene %d\n", ig+1);
\r
3712 fputs("\n\nCodon usage in genes\n",fout);
\r
3713 printcums(fout, com.ngene, fcodonsg, com.icode);
\r
3714 fputs("Codon position x base (3x4) table for each gene.\n",fout);
\r
3715 for(ig=0; ig<com.ngene; ig++) {
\r
3716 fprintf (fout,"\n\nGene #%d", ig+1);
\r
3717 OutFb3x4(fout, fb3x4sg+ig*12);
\r
3721 for(ig=0; ig<com.ngene; ig++)
\r
3722 for(k=0;k<nc;k++) fcodonsg[com.ngene*nc+k]+=fcodonsg[ig*nc+k];
\r
3723 Fcodon_3x4(fcodonsg+com.ngene*nc, fb3x4sg+com.ngene*12);
\r
3724 for(ig=0; ig<com.ngene+1; ig++)
\r
3725 for(j=0;j<3;j++) for(k=0;k<4;k++) fb4g[ig*4+k]+=fb3x4sg[ig*12+j*4+k]/3;
\r
3727 fputs("\n\nSums of codon usage counts",fout);
\r
3728 printcu(fout, fcodonsg+com.ngene*nc, com.icode);
\r
3729 if(!com.cleandata) fputs("\n(Ambiguity data are not used in the counts.)\n",fout);
\r
3730 fputs("\n\nCodon position x base (3x4) table, overall\n",fout);
\r
3731 OutFb3x4(fout, fb3x4sg+com.ngene*12);
\r
3735 double *fb3x4 = fb3x4sg+com.ngene*12, GC3;
\r
3736 GC3 = (fb3x4[0*4+1] + fb3x4[1*4+1] + fb3x4[2*4+1])/3
\r
3737 + (fb3x4[0*4+3] + fb3x4[1*4+3] + fb3x4[2*4+3])/3;
\r
3738 fprintf(frst1, "\t%.4f", GC3);
\r
3744 void AddCodonFreqSeqGene (int js, int ig, double fcodon0[], double fcodon[],
\r
3745 double fb3x40[], double fb3x4[],
\r
3746 double fb40[], double fb4[]);
\r
3748 void AddCodonFreqSeqGene (int js, int ig, double fcodon0[], double fcodon[],
\r
3749 double fb3x40[], double fb3x4[],
\r
3750 double fb40[], double fb4[])
\r
3752 /* This adds codon and nucleotide counts in sequence js in gene ig to fcodon,
\r
3753 fb3x4, and fb4, using fcodon0, fb3x40, and fb40 to resolve ambiguities
\r
3754 Similar to AddFreqSeqGene().
\r
3756 int h, k, i0,i1,i2, nc=NCODE;
\r
3757 int nb[3],ib[3][4],ic=-1;
\r
3759 char str[4]=" ", codon[4]=" ", ft[64];
\r
3761 for(h=com.posG[ig]; h<com.posG[ig+1]; h++) {
\r
3762 for(k=0; k<3; k++)
\r
3763 NucListall(CODONs[com.z[js][h]][k], &nb[k], ib[k]);
\r
3764 k = nb[0]*nb[1]*nb[2];
\r
3765 for(k=0; k<3; k++) { /* f3x4 & f1x4, no regard for stop codons */
\r
3766 for(i0=0,t=t1=0; i0<nb[k]; i0++) {
\r
3767 t += fb3x40[k*4+ib[k][i0]];
\r
3768 t1 += fb40[ib[k][i0]];
\r
3770 for(i0=0; i0<nb[k]; i0++) {
\r
3771 fb3x4[k*4+ib[k][i0]] += com.fpatt[h] * fb3x40[k*4+ib[k][i0]]/t;
\r
3772 fb4[ib[k][i0]] += com.fpatt[h]* fb40[ib[k][i0]]/t1;
\r
3775 for(i0=0; i0<64; i0++) ft[i0]=0;
\r
3776 for(i0=k=0,t=0; i0<nb[0]; i0++) FOR(i1,nb[1]) FOR(i2,nb[2]) {
\r
3777 ic = ib[0][i0]*16+ib[1][i1]*4+ib[2][i2];
\r
3778 if(FROM64[ic]==-1) continue;
\r
3782 if(k==0) printf("%s in seq. %d is stop (icode=%d)\n",
\r
3783 getcodon(str,ic),js+1,com.icode);
\r
3785 printf("difficulty in resolving codon %s.\n", codon);
\r
3786 for(ic=0; ic<nc; ic++) if(ft[ic])
\r
3787 fcodon[ic] += (t>0 ? com.fpatt[h]*fcodon0[ic]/t : com.fpatt[h]/k);
\r
3792 int InitializeCodon (FILE *fout, double space[])
\r
3794 /* Count codons for genes, calculate site patterns and fpatt.
\r
3795 Sequences com.z[] are not coded and may contain ambiguity characters
\r
3796 Space requirement for fcodonsg & fb3x4sg: max(ngene+1,ns)*(64+12+4).
\r
3797 First we count codons for output, with ambiguity characters ignored.
\r
3798 Then we recount to resolve ambiguity characters, to be used for ML
\r
3799 calculation later on.
\r
3800 set up com.pi[NCODE], com.piG[NGENE][64], according to com.codonf
\r
3801 com.pi[] has freqs for all codon sites in the seqs if ngene>1.
\r
3802 Space use is not economical as com.piG and fcodonsg are separate and
\r
3805 int j,k, nc=NCODE, ig, ic[3], wrongorder[4]={2,1,3,0};
\r
3807 double *fcodonsg=space, *fb3x4sg=space+max2((com.ngene+1),com.ns)*nc;
\r
3808 double *fb4g=space+(com.ngene+1)*(64+12);
\r
3809 double *ppi, fcodon0[64],fb3x40[12],fb40[4], d1,d2,d3;
\r
3811 /* counts codons for output, species first, genes next */
\r
3812 if(noisy) puts("Counting codons..");
\r
3813 CountCodons(fout, fcodonsg, fb3x4sg, fb4g);
\r
3815 /* Now to count fcodonsg, fb3x4sg, fb4g, to set up pi's for ML calculation.
\r
3816 Three iterations are going on at the same time.
\r
3818 if (com.codonf!=Fequal && !com.cleandata) { /* iteration to resolve ambiguities */
\r
3819 for(ig=0; ig<com.ngene; ig++) { /* calculate com.piG[] */
\r
3820 axtoy(1/sum(fcodonsg+ig*nc,nc), fcodonsg+ig*nc, fcodon0, nc);
\r
3821 xtoy(fb3x4sg+ig*12, fb3x40, 12);
\r
3822 xtoy(fb4g+ig*4, fb40, 4);
\r
3824 for(irf=0; irf<nrf; irf++) {
\r
3825 zero(fcodonsg + ig*nc, nc);
\r
3826 zero(fb3x4sg + ig*12, 12);
\r
3827 zero(fb4g+ig*4, 4);
\r
3828 for(j=0; j<com.ns; j++) {
\r
3829 AddCodonFreqSeqGene (j, ig, fcodon0, fcodonsg+ig*nc,
\r
3830 fb3x40, fb3x4sg+ig*12, fb40, fb4g+ig*4);
\r
3832 abyx(1/sum(fcodonsg+ig*nc,nc), fcodonsg + ig*nc, nc);
\r
3833 for(k=0; k<3; k++)
\r
3834 abyx(1/sum(fb3x4sg+ig*12+k*4,4), fb3x4sg+ig*12+k*4, 4);
\r
3835 abyx(1/sum(fb4g+ig*4,4), fb4g+ig*4, 4);
\r
3836 d1 = distance(fcodonsg+ig*nc, fcodon0, nc);
\r
3837 d2 = distance(fb3x4sg+ig*12, fb3x40, 12);
\r
3838 d3 = distance(fb4g+ig*4, fb40, 4);
\r
3839 if(d1<1e-8 && d2<1e-8 && d3<1e-8)
\r
3841 xtoy(fcodonsg+ig*nc, fcodon0, nc);
\r
3842 xtoy(fb3x4sg+ig*12, fb3x40, 12);
\r
3843 xtoy(fb4g+ig*4, fb40, 4);
\r
3847 axtoy(1/sum(fcodonsg+com.ngene*nc,nc), fcodonsg+com.ngene*nc, fcodon0, nc);
\r
3848 xtoy(fb3x4sg+com.ngene*12, fb3x40, 12);
\r
3849 xtoy(fb4g+com.ngene*4, fb40, 4);
\r
3850 for(irf=0; irf<nrf; irf++) { /* calculate com.pi[] */
\r
3851 zero(fcodonsg + com.ngene*nc, nc);
\r
3852 zero(fb3x4sg + com.ngene*12, 12);
\r
3853 zero(fb4g + com.ngene*4, 4);
\r
3854 for(ig=0; ig<com.ngene; ig++)
\r
3855 for(j=0; j<com.ns; j++) {
\r
3856 AddCodonFreqSeqGene(j, ig, fcodon0, fcodonsg+com.ngene*nc,
\r
3857 fb3x40, fb3x4sg+com.ngene*12, fb40, fb4g+com.ngene*4);
\r
3859 abyx(1/sum(fcodonsg+com.ngene*nc,nc), fcodonsg+com.ngene*nc, nc);
\r
3861 abyx(1/sum(fb3x4sg+com.ngene*12+k*4,4), fb3x4sg+com.ngene*12+k*4, 4);
\r
3862 abyx(1/sum(fb4g+com.ngene*4,4), fb4g+com.ngene*4, 4);
\r
3863 d1 = distance(fcodonsg+com.ngene*nc, fcodon0, nc);
\r
3864 d2 = distance(fb3x4sg+com.ngene*12, fb3x40, 12);
\r
3865 d3 = distance(fb4g+com.ngene*4, fb40, 4);
\r
3866 if(d1<1e-8 && d2<1e-8 && d3<1e-8) break;
\r
3867 xtoy(fcodonsg+com.ngene*nc, fcodon0, nc);
\r
3868 xtoy(fb3x4sg+com.ngene*12, fb3x40, 12);
\r
3869 xtoy(fb4g+com.ngene*4, fb40, 4);
\r
3873 /* edit com.pi & com.piG according to com.codonf */
\r
3874 for(ig=0; ig<com.ngene+1; ig++) {
\r
3875 ppi = (ig<com.ngene?com.piG[ig]:com.pi);
\r
3877 if (com.codonf==Fequal)
\r
3878 fillxc(ppi,1,com.ncode);
\r
3879 else if (com.codonf==Fcodon || com.codonf==FMutSel0 || com.codonf==FMutSel) {
\r
3880 for(k=0; k<nc; k++)
\r
3881 if(FROM64[k]>-1) ppi[FROM64[k]] = fcodonsg[ig*nc+k];
\r
3883 else if (com.codonf==F3x4 || com.codonf==F3x4MG) {
\r
3884 for(k=0; k<nc; k++)
\r
3886 ppi[FROM64[k]] = fb3x4sg[ig*12+k/16]*fb3x4sg[ig*12+4+(k/4)%4]*fb3x4sg[ig*12+8+k%4];
\r
3888 else if (com.codonf==F1x4 || com.codonf==F1x4MG) {
\r
3889 for(k=0; k<nc; k++)
\r
3891 ppi[FROM64[k]] = fb4g[ig*4+k/16]*fb4g[ig*4+(k/4)%4]*fb4g[ig*4+k%4];
\r
3893 abyx(1/sum(ppi,com.ncode), ppi, com.ncode); /* ncode != nc */
\r
3895 if(ig<com.ngene) {
\r
3896 if (com.codonf>=F1x4 && com.codonf<=FMutSel)
\r
3897 xtoy(fb3x4sg+ig*12, com.f3x4[ig], 12);
\r
3898 /* write 1x4 tables into 3x4 tables */
\r
3899 if (com.codonf==FMutSel0 || com.codonf==FMutSel || com.codonf==F1x4 || com.codonf==F1x4MG) {
\r
3900 for(k=0; k<4; k++) {
\r
3901 d1 = com.f3x4[ig][0*4+k] + com.f3x4[ig][1*4+k] + com.f3x4[ig][2*4+k];
\r
3902 for(j=0; j<3; j++)
\r
3903 com.f3x4[ig][j*4+k] = d1/3;
\r
3909 if(com.codonf==FMutSel0) {
\r
3910 for(j=0,zero(com.piAA,20); j<com.ncode; j++)
\r
3911 com.piAA[GeneticCode[com.icode][FROM61[j]]] += com.pi[j];
\r
3912 matout(F0, com.piAA, 1, 20);
\r
3915 if(com.codonf>=F1x4 && com.codonf<=FMutSel)
\r
3916 com.pf3x4 = com.f3x4[0];
\r
3918 if(com.verbose && com.ngene==1) {
\r
3919 fprintf(fout,"\n\nCodon frequencies under model, for use in evolver (TTT TTC TTA TTG ... GGG):\n");
\r
3920 for(k=0; k<64; k++) {
\r
3921 fprintf(fout,"%12.8f",GeneticCode[com.icode][k]==-1?0:com.pi[FROM64[k]]);
\r
3922 if((k+1)%4==0) FPN(fout);
\r
3925 fprintf(fout, "\nWrong order: AAA AAC AAG AAT ... TTT\n");
\r
3926 for(k=0; k<64; k++) {
\r
3927 ic[0] = wrongorder[k/16];
\r
3928 ic[1] = wrongorder[(k/4)%4];
\r
3929 ic[2] = wrongorder[k%4];
\r
3930 j = ic[0]*16+ic[1]*4+ic[2];
\r
3931 if(GeneticCode[com.icode][j]!=-1)
\r
3932 fprintf(fout,"%.8f, ", com.pi[FROM64[j]]);
\r
3942 int AA2Codonf(double faa[20], double fcodon[])
\r
3944 /* get codon freqs from amino acid freqs, assuming equal freq. for each syn
\r
3945 codon. Used in codon-based amino acid substitution models.
\r
3947 int ic, iaa, i, NCsyn[20];
\r
3949 FOR(i,20) NCsyn[i]=0;
\r
3950 FOR(ic,64) if((iaa=GeneticCode[com.icode][ic])!=-1) NCsyn[iaa]++;
\r
3952 for(ic=0; ic<Nsensecodon; ic++) {
\r
3953 iaa=GeneticCode[com.icode][FROM61[ic]];
\r
3954 fcodon[ic]+=faa[iaa]/NCsyn[iaa];
\r
3956 if(fabs(1-sum(fcodon,64))>1e-6) printf("\n1 == %12.7f\n", sum(fcodon,64));
\r
3961 int DistanceMatAA (FILE *fout)
\r
3966 if(fout) fprintf(fout,"\nAA distances (raw proportions of different sites)\n");
\r
3967 for(h=0,lst=0; h<com.npatt; h++) lst+=com.fpatt[h];
\r
3969 if(fout) fprintf(fout, "\n%-15s", com.spname[i]);
\r
3971 for(h=0,p=0; h<com.npatt; h++)
\r
3972 if (com.z[i][h] != com.z[j][h]) p += com.fpatt[h];
\r
3974 SeqDistance[i*(i-1)/2+j]=p;
\r
3975 if(fout) fprintf(fout, " %7.4f", p);
\r
3978 if(fout) FPN(fout);
\r
3983 int GetDaa (FILE* fout, double daa[])
\r
3985 /* Get the amino acid distance (or substitution rate) matrix
\r
3986 (grantham, dayhoff, jones, etc).
\r
3991 double dmax=0, dmin=1e40;
\r
3993 if(noisy>3) printf("\n\nReading matrix from %s", com.daafile);
\r
3994 if (com.model==REVaa_0||com.model==REVaa) puts(", to get initial values.");
\r
3995 fdaa = gfopen(com.daafile, "r");
\r
3997 for (i=0; i<naa; i++)
\r
3998 for (j=0,daa[i*naa+i]=0; j<i; j++) {
\r
3999 fscanf(fdaa, "%lf", &daa[i*naa+j]);
\r
4000 daa[j*naa+i] = daa[i*naa+j];
\r
4001 if (dmax<daa[i*naa+j]) dmax = daa[i*naa+j];
\r
4002 if (dmin>daa[i*naa+j]) dmin = daa[i*naa+j];
\r
4004 if(com.aaDist && (com.seqtype==1||com.model==FromCodon)) { /* codon model */
\r
4005 if(noisy) printf("\ndistance: %.2f --- %.2f\n", dmin, dmax);
\r
4006 for(i=0; i<naa; i++)
\r
4007 for(j=0; j<naa; j++)
\r
4008 com.daa[i*naa+j] /= dmax;
\r
4010 else if (com.seqtype==AAseq) {
\r
4011 for(i=0; i<naa; i++)
\r
4012 for(j=0; j<i; j++)
\r
4013 if(i*naa+j!=ijAAref)
\r
4014 daa[j*naa+i] = daa[i*naa+j] /= com.daa[ijAAref];
\r
4015 daa[ijAAref] = daa[(ijAAref%naa)*naa+(ijAAref/naa)] = 1;
\r
4017 if(com.model==Empirical) {
\r
4018 for(i=0; i<naa; i++)
\r
4019 if(fscanf(fdaa,"%lf",&com.pi[i])!=1)
\r
4020 error2("aaRatefile");
\r
4021 if (fabs(1-sum(com.pi,20))>1e-5) {
\r
4022 printf("\nSum of freq. = %.6f != 1 in aaRateFile\n", sum(com.pi,naa));
\r
4030 fprintf (fout, "\n%s\n", com.daafile);
\r
4031 for(i=0; i<naa; i++) {
\r
4032 fprintf (fout, "\n%4s", getAAstr(aa3,i));
\r
4033 for(j=0; j<i; j++)
\r
4034 fprintf (fout, "%5.0f", daa[i*naa+j]);
\r
4041 for(i=0,FPN(frst);i<naa;i++,FPN(frst))
\r
4042 FOR(j,i) fprintf(frst,"%3d",AA1STEP[i*(i-1)/2+j]);
\r
4044 for(i=0,k=0;i<naa;i++)
\r
4045 FOR(j,i) if(AA1STEP[i*(i-1)/2+j]) {
\r
4046 fprintf(frst,"%c%c\t%.2f\n",AAs[i],AAs[j],com.daa[i*naa+j]);
\r
4049 fprintf(frst,"\n%d one-step amino acid pairs\n", k);
\r
4057 int SetAA1STEP (void)
\r
4059 /* Sets the global variable AA1STEP[19*20/2].
\r
4060 Sets com.nrate for models like AAClasses and REVaa_0.
\r
4061 AA1STEP[k] marks the k_th pair of amino acids that differ at one position,
\r
4062 Q[i*naa+j] is the k_th nonzero element if AA1STEP[k]=i*naa+j;
\r
4063 Lower diagonal of Q is visited, with i>j.
\r
4065 int ncode0=com.ncode, nc, naa=20, i,j,k, ic1,ic2, ndiff, from[3],to[3];
\r
4066 int *Q=(int*)PMat;
\r
4069 nc=Nsensecodon; com.ncode=ncode0;
\r
4070 for(i=0; i<naa*naa; i++) Q[i]=0;
\r
4071 for (i=0; i<nc; i++)
\r
4072 for(j=0; j<i; j++) {
\r
4073 ic1=FROM61[i]; from[0]=ic1/16; from[1]=(ic1/4)%4; from[2]=ic1%4;
\r
4074 ic2=FROM61[j]; to[0]=ic2/16; to[1]=(ic2/4)%4; to[2]=ic2%4;
\r
4075 for (k=0,ndiff=0; k<3; k++) if (from[k]!=to[k]) ndiff++;
\r
4076 if (ndiff!=1) continue;
\r
4077 ic1 = GeneticCode[com.icode][ic1];
\r
4078 ic2 = GeneticCode[com.icode][ic2];
\r
4079 Q[ic1*naa+ic2]++;
\r
4084 for (i=0,FPN(F0); i<naa; i++,FPN(F0)) FOR(j,i)printf("%3d",Q[i*naa+j]);
\r
4087 for (i=0,k=0; i<naa; i++)
\r
4088 for(j=0; j<i; j++) {
\r
4089 if (Q[i*naa+j]>0) { AA1STEP[i*(i-1)/2+j] = 1; k++; }
\r
4090 else AA1STEP[i*(i-1)/2+j] = 0;
\r
4093 for(i=0,FPN(F0);i<naa;i++,FPN(F0)) FOR(j,i)printf("%3d",AA1STEP[i*(i-1)/2+j]);
\r
4095 if(com.seqtype==2) com.nrate = k-1; /* one element (ijAAref) is fixed */
\r
4100 int GetOmegaAA (int OmegaAA[])
\r
4102 /* This routine reads the file OmegaAA.dat to initialize the
\r
4103 lower diagonal matrix OmegaAA, which specifies the aa substituion
\r
4104 rate classes. To be used with the codon substitution model
\r
4105 AAClasses, which specifies several classes of the dN/dS ratio.
\r
4107 OmegaAA[iaa*(iaa-1)/2+jaa]= -1 if no one-step change is possible;
\r
4108 = 0 for the first, background, class
\r
4109 = i (1,..,nclass) if iaa and jaa are in class i
\r
4111 char *OmegaAAf="OmegaAA.dat", line[1024];
\r
4113 int iomega, n1step=0, i,j,k, iaa,jaa, npair, naa=20, nline=1024;
\r
4115 for(i=0,n1step=0; i<naa; i++) for(j=0; j<i; j++)
\r
4116 if (AA1STEP[i*(i-1)/2+j]) { OmegaAA[i*(i-1)/2+j] = 0; n1step++; }
\r
4117 else OmegaAA[i*(i-1)/2+j] = -1;
\r
4119 printf("\n\n%d one-step aa pairs.\n", n1step);
\r
4120 printf("Reading omega class from %s.\n", OmegaAAf);
\r
4122 com.nOmegaType = -1;
\r
4123 fin=fopen(OmegaAAf,"r");
\r
4124 if(fin) fscanf(fin, "%d", &com.nOmegaType);
\r
4125 if (com.nOmegaType<1 || com.nOmegaType>65-1) {
\r
4126 if (com.seqtype!=CODONseq) puts("\nTo be tested.\a");
\r
4128 if (com.seqtype==AAseq) {
\r
4129 for(i=0; i<naa; i++) for(j=0; j<i; j++) if(i*naa+j != ijAAref && AA1STEP[i*(i-1)/2+j])
\r
4130 OmegaAA[i*(i-1)/2+j] = com.nOmegaType++;
\r
4133 for(i=0; i<naa; i++) for(j=0; j<i; j++)
\r
4134 if(AA1STEP[i*(i-1)/2+j]) OmegaAA[i*(i-1)/2+j] = com.nOmegaType++;
\r
4135 printf("%d dN/dS ratios estimated from data.\n",com.nOmegaType);
\r
4138 printf("%d dN/dS ratios estimated from data.\n",com.nOmegaType);
\r
4139 for(iomega=0; iomega<com.nOmegaType-1; iomega++) {
\r
4140 fscanf(fin, "%d", &j);
\r
4141 if (j!=iomega+1) { printf("err data file %s.", OmegaAAf); exit(-1); }
\r
4142 printf ("\nClass #%d: ", j);
\r
4143 j = fgetc (fin); if (j!=':') error2("err expecting :");
\r
4144 fgets (line, nline, fin);
\r
4146 printf ("%s\n", line);
\r
4147 for (j=0,npair=0; j<nline-1 && line[j] && line[j]!='\n'; j++) {
\r
4149 if (!isalpha(iaa)) continue;
\r
4150 jaa = line[++j]; if(!isalpha(jaa)) error2("err jaa");
\r
4153 printf ("\npair %2d: |%c%c| ", npair, iaa,jaa);
\r
4154 iaa=CodeChara((char)iaa,AAseq); jaa=CodeChara((char)jaa,AAseq);
\r
4155 if(iaa<0||iaa>19||jaa<0||jaa>19) error2("aa not found");
\r
4156 if (iaa<jaa) { k=jaa, jaa=iaa; iaa=k; }
\r
4158 printf ("|%c%c (%2d,%2d)| ", AAs[iaa], AAs[jaa],iaa,jaa);
\r
4159 if (iaa==jaa) puts("This pair has no effect.");
\r
4160 if (OmegaAA[iaa*(iaa-1)/2+jaa]==-1) {
\r
4161 puts("\nThis pair cannot change in one step and is ignored!");
\r
4164 else if (OmegaAA[iaa*(iaa-1)/2+jaa])
\r
4165 error2("This pair has already been specified?");
\r
4166 OmegaAA[iaa*(iaa-1)/2+jaa]=iomega+1;
\r
4167 printf (" in class %d ",iomega+1);
\r
4171 if(fin) fclose(fin);
\r
4172 com.nrate = com.nkappa = (com.hkyREV ? 5 : !com.fix_kappa);
\r
4173 com.nrate += (com.nOmega = com.nOmegaType);
\r
4175 for (i=0; i<naa; i++,FPN(F0))
\r
4176 for(j=0; j<i; j++) printf ("%3d", OmegaAA[i*(i-1)/2+j]);
\r
4183 int GetCodonFreqs2 (void)
\r
4185 /* Recalcualte the expected codon frequencies (com.pi[]) using the control
\r
4186 variable com.codonf, and the observed codon frequencies in com.pi[].
\r
4187 com.pi[] is both input (observed codon frequencies) and output (expected
\r
4188 frequencies under the model codonf).
\r
4189 This is used by PairwiseCodon().
\r
4191 int n=com.ncode, i,j, ic,b[3];
\r
4192 double *pi=com.pi, fb3x4[12], fb4[4], GC[3]={0};
\r
4194 if (com.codonf==Fequal)
\r
4195 { fillxc(pi,1./n,n); return 0; }
\r
4196 if (com.codonf!=Fcodon && com.codonf!=FMutSel) {
\r
4197 for (i=0,zero(fb3x4,12),zero(fb4,4); i<n; i++) {
\r
4198 ic=FROM61[i]; b[0]=ic/16; b[1]=(ic/4)%4; b[2]=ic%4;
\r
4200 { fb3x4[j*4+b[j]] += pi[i]; fb4[b[j]] += pi[i]/3.; }
\r
4202 for (i=0; i<n; i++) {
\r
4203 ic=FROM61[i]; b[0]=ic/16; b[1]=(ic/4)%4; b[2]=ic%4;
\r
4204 if (com.codonf==F3x4 || com.codonf==F3x4MG)
\r
4205 pi[i] = fb3x4[b[0]]*fb3x4[4+b[1]]*fb3x4[8+b[2]];
\r
4207 pi[i] = fb4[b[0]]*fb4[b[1]]*fb4[b[2]];
\r
4210 if(com.codonf==F1x4MG)
\r
4212 xtoy(fb4, com.pf3x4+j*4, 4);
\r
4213 else if(com.codonf==F3x4MG)
\r
4214 xtoy(fb3x4, com.pf3x4, 12);
\r
4216 abyx (1./sum(pi,n), pi, n);
\r
4218 GC[0] = (fb3x4[0+1]+fb3x4[0+3])*100;
\r
4219 GC[1] = (fb3x4[4+1]+fb3x4[4+3])*100;
\r
4220 GC[2] = (fb3x4[8+1]+fb3x4[8+3])*100;
\r
4221 /* fprintf(frst1, "\tGC123\t%.1f\t%.1f\t%.1f", GC[0],GC[1],GC[2]); */
\r
4227 double lfun2dSdN (double x[], int np)
\r
4229 /* likelihood function for calculating dS and dN between 2 sequences,
\r
4230 com.z[0] & com.z[1:
\r
4231 f(i,j) = \pi_i * p_{ij}(t)
\r
4233 Data are clean and coded.
\r
4234 Transition probability pijt is calculated for observed patterns only.
\r
4236 int n=com.ncode, h,i,k, ik, z0,z1;
\r
4237 double lnL=0, fh,expt[NCODE], mr=0;
\r
4241 if(com.hkyREV==0) {
\r
4242 if(com.fix_kappa==1) { com.pkappa[0] = com.kappa; ik = 1; }
\r
4243 else com.kappa = x[k]; /* Is this necessary? */
\r
4245 for(i=0; i<(com.hkyREV ? 5 : !com.fix_kappa); i++)
\r
4246 com.pkappa[ik++] = x[k++];
\r
4247 if(com.codonf==FMutSel)
\r
4248 for(i=0; i<3; i++)
\r
4249 com.pkappa[ik++] = x[k++];
\r
4251 if(!com.fix_omega) com.omega = x[1+com.nkappa];
\r
4252 if(!com.fix_kappa || !com.fix_omega)
\r
4253 eigenQcodon(1,-1,NULL,NULL,NULL,Root,U,V, &mr, com.pkappa, com.omega,PMat);
\r
4255 for(k=0; k<n; k++)
\r
4256 expt[k] = exp(x[0]*Root[k]);
\r
4257 for (h=0; h<com.npatt; h++) {
\r
4258 if(com.fpatt[h]<1e-20) continue;
\r
4261 for(k=0,fh=0;k<n;k++)
\r
4262 fh += U[z0*n+k]*expt[k]*V[k*n+z1];
\r
4265 matout(F0,x,1,np);
\r
4266 printf("lfun2dSdN: fh = %.9f\n",fh);
\r
4269 lnL -= log(fh) * com.fpatt[h];
\r
4275 int VariancedSdN (double t, double omega, double vtw[2*2], double vdSdN[2*2])
\r
4277 /* This calculates the covariance matrix of dS & dN, using the
\r
4278 difference approximation, from the covariance matrix of t and
\r
4279 omega (vtw). com.kappa and com.pi are used. Sampling errors
\r
4280 in parameters other than t and omega, such as kappa and pi[],
\r
4282 JacobiSN = {{dS/dt, dS/dw}, {dN/dt,dN/dw}}
\r
4285 double JacobiSN[2*2],T1[2*3],T2[2*3], S,dS,dN, dS1,dN1,dS2,dN2, eh, mr=0;
\r
4287 if(vtw[0]<=0 || vtw[3]<=0) {
\r
4288 puts("var(dS,dN) not calculable.");
\r
4293 /* printf("\nt & w: %.5f %.5f\n", t, omega);
\r
4294 matout(F0,vtw, 2,2); */
\r
4295 eigenQcodon(2,t,&S,&dS,&dN,NULL,NULL,NULL, &mr, com.pkappa,omega,PMat);
\r
4297 eh = (t+1)*Small_Diff;
\r
4298 eigenQcodon(2,t+eh,&S,&dS1,&dN1,NULL,NULL,NULL, &mr, com.pkappa,omega,PMat);
\r
4299 eigenQcodon(2,t-eh,&S,&dS2,&dN2,NULL,NULL,NULL, &mr, com.pkappa,omega,PMat);
\r
4300 JacobiSN[0*np+0] = (dS1 - dS2)/(2*eh);
\r
4301 JacobiSN[1*np+0] = (dN1 - dN2)/(2*eh);
\r
4303 eh = (omega+1)*Small_Diff;
\r
4304 eigenQcodon(2,t,&S,&dS1,&dN1,NULL,NULL,NULL, &mr, com.pkappa,omega+eh,PMat);
\r
4305 eigenQcodon(2,t,&S,&dS2,&dN2,NULL,NULL,NULL, &mr, com.pkappa,omega-eh,PMat);
\r
4306 JacobiSN[0*np+1] = (dS1 - dS2)/(2*eh);
\r
4307 JacobiSN[1*np+1] = (dN1 - dN2)/(2*eh);
\r
4309 matby(JacobiSN,vtw,T1,2,2,2);
\r
4310 mattransp2 (JacobiSN, T2, 2, 2);
\r
4311 matby(T1,T2,vdSdN,2,2,2);
\r
4313 /* matout(F0,vdSdN, 2,2); */
\r
4318 double distanceHKY85 (double x[], double *kappa, double alpha);
\r
4319 int distance3pos(double dHKY[], double kHKY[], int *sites4, char *z1, char *z2);
\r
4321 int distance3pos(double dHKY[], double kHKY[], int *sites4, char *z1, char *z2)
\r
4323 /* This calculates nucleotide-based distances between two protein-coding
\r
4324 DNA sequences z1 and z2, both of which are coded. com.cleandata = 1 is
\r
4327 int i,j, h, k, ic1, ic2, from[3], to[3];
\r
4328 double fij[4][16]={{0}}, pi4[4]={0};
\r
4329 /* [0,1,2] are for 3 positions, [3] is for 4-fold */
\r
4331 for (h=0; h<com.npatt; h++) {
\r
4332 ic1=FROM61[(int)z1[h]]; from[0]=ic1/16; from[1]=(ic1/4)%4; from[2]=ic1%4;
\r
4333 ic2=FROM61[(int)z2[h]]; to[0]=ic2/16; to[1]=(ic2/4)%4; to[2]=ic2%4;
\r
4334 for(k=0; k<3; k++)
\r
4335 fij[k][from[k]*4+to[k]] += com.fpatt[h]/com.ls;
\r
4336 if(from[0]==to[0] && from[1]==to[1] && FourFold[to[0]][to[1]])
\r
4337 fij[3][from[2]*4+to[2]] += com.fpatt[h];
\r
4339 *sites4 = (int) sum(fij[3], 16);
\r
4342 FOR(k,16) fij[3][k] /= *sites4;
\r
4343 FOR(i,4) FOR(j,4) pi4[i] += fij[3][i*4+j]/2;
\r
4344 FOR(i,4) FOR(j,4) pi4[j] += fij[3][i*4+j]/2;
\r
4346 for(k=0; k<4; k++)
\r
4347 dHKY[k] = distanceHKY85(fij[k], &kHKY[k], 0);
\r
4352 int PairwiseCodon (FILE *fout, FILE*fds, FILE*fdn, FILE*ft, double space[])
\r
4354 /* Calculates ds & dn for all pairwise codon sequence comparisons.
\r
4355 It uses different npatt for different pairs.
\r
4356 The data com.z[] should be encoded clean data, with ambiguity characters
\r
4357 removed. Think of what to do with raw unclean data.
\r
4358 JacobiSN has two columns, the 1st are deratives of dS (dS/dt, dS/dk, dS/dw)
\r
4359 and the second of dN.
\r
4361 char *pz0[NS],codon[2][3]; /* pz0, npatt0, & fpatt0 hold the old information */
\r
4362 int npatt0=com.npatt;
\r
4363 double *fpatt0, ls0=com.ls;
\r
4364 float fp[NCODE*NCODE];
\r
4365 int n=com.ncode, is,js,j,k,h, i0,np, wname=15;
\r
4366 int nb[3],ib[3][4],ic[2], missing=0, sites4;
\r
4367 double x[10]={.9,1,.5,.5,.5,.5,.3}, xb[10][2]={{1e-5,50}}, large=50;
\r
4368 double kappab[2]={.01,999}, omegab[2]={.001,99};
\r
4369 double lnL, e=1e-7, *var=space+NP, S,dS,dN, mr=0;
\r
4370 double JacobiSN[2*3],T1[2*3],T2[2*3],vSN[2*2], dS1,dN1,dS2,dN2,y[3],eh;
\r
4371 /* for calculating SEs of dS & dN */
\r
4372 double dHKY[4], kHKY[4];
\r
4374 fpatt0=(double*)malloc(npatt0*3*sizeof(double));
\r
4375 FOR(k,com.ns) pz0[k]=com.z[k];
\r
4376 com.z[0] = (char*)(fpatt0+npatt0);
\r
4377 com.z[1] = com.z[0]+npatt0;
\r
4378 FOR (k,npatt0) fpatt0[k] = (float)com.fpatt[k];
\r
4380 if(!com.cleandata) puts("\nPairwiseCodon: pairwise deletion.");
\r
4381 if (com.ngene>1 && com.Mgene==1) puts("ngene>1 to be tested.");
\r
4382 if (noisy>1) printf("\npairwise comparison (Goldman & Yang 1994).\n");
\r
4383 fprintf(fout,"\npairwise comparison, codon frequencies: %s.\n",
\r
4384 codonfreqs[com.codonf]);
\r
4386 FOR(j,com.nkappa) { xb[1+j][0]=kappab[0]; xb[1+j][1]=kappab[1]; }
\r
4387 if(!com.fix_omega) { k=1+com.nkappa; xb[k][0]=omegab[0]; xb[k][1]=omegab[1]; }
\r
4389 fprintf(fds,"%6d\n", com.ns); fprintf(fdn,"%6d\n", com.ns);
\r
4390 fprintf(ft,"%6d\n", com.ns);
\r
4391 fprintf(frst, "\n\npairwise comparison (Goldman & Yang 1994)");
\r
4393 "\nseq seq N S dN dS dN/dS Paras.\n");
\r
4395 for(is=0;is<com.ns;is++) {
\r
4396 fprintf(fds,"%-*s ", wname,com.spname[is]);
\r
4397 fprintf(fdn,"%-*s ", wname,com.spname[is]);
\r
4398 fprintf(ft,"%-*s ", wname,com.spname[is]);
\r
4399 for(js=0; js<is; js++) {
\r
4401 puts("\nInput the pair i & j (i>j) for dN-dS calculation? ");
\r
4402 scanf("%d%d",&is,&js);
\r
4404 if(is>com.ns || js<0 || is<js) error2("invalid pair");
\r
4406 if(noisy>1) printf ("\n%4d vs. %3d", is+1, js+1);
\r
4407 fprintf(fout,"\n\n%d (%s) ... %d (%s)",
\r
4408 is+1,com.spname[is], js+1,com.spname[js]);
\r
4409 fprintf (frst, "%3d %3d ", is+1, js+1);
\r
4410 if(noisy>2) fprintf(frub, "\n\n%d (%s) ... %d (%s)",
\r
4411 is+1,com.spname[is], js+1,com.spname[js]);
\r
4412 for(k=0; k<n*n; k++) fp[k]=0;
\r
4413 if(com.cleandata) {
\r
4414 for(h=0; h<npatt0; h++) {
\r
4415 j = max2(pz0[is][h],pz0[js][h]);
\r
4416 k = min2(pz0[is][h],pz0[js][h]);
\r
4417 fp[j*n+k] += (float)fpatt0[h];
\r
4421 for(h=0,com.ls=0; h<npatt0; h++) {
\r
4422 FOR(i0,2) FOR(k,3) codon[i0][k] = pz0[i0==0 ? is : js][h*3+k];
\r
4423 for(i0=0,missing=0; i0<2; i0++) {
\r
4424 for(k=0; k<3; k++)
\r
4425 NucListall(codon[i0][k], &nb[k], ib[k]);
\r
4426 if(nb[0]*nb[1]*nb[2]!=1)
\r
4427 { missing=1; break; }
\r
4429 ic[i0] = FROM64[ ib[0][0]*16+ib[1][0]*4+ib[2][0] ];
\r
4431 if(missing) continue;
\r
4432 com.ls += (int)fpatt0[h];
\r
4434 j = max2(ic[0],ic[1]);
\r
4435 k = min2(ic[0],ic[1]);
\r
4436 fp[j*n+k] += (float)fpatt0[h];
\r
4440 for(j=0,com.npatt=0;j<n;j++) {
\r
4441 for(k=0; k<j+1; k++)
\r
4443 com.z[0][com.npatt] = (char)j;
\r
4444 com.z[1][com.npatt] = (char)k;
\r
4445 com.fpatt[com.npatt++] = fp[j*n+k];
\r
4448 if(noisy>2) printf("\n npatt=%d ",com.npatt);
\r
4449 for(j=0,zero(com.pi,n); j<com.npatt; j++) {
\r
4450 com.pi[(int)com.z[0][j]] += com.fpatt[j]/(2.*com.ls);
\r
4451 com.pi[(int)com.z[1][j]] += com.fpatt[j]/(2.*com.ls);
\r
4454 GetCodonFreqs2 ();
\r
4456 distance3pos(dHKY, kHKY, &sites4, com.z[0], com.z[1]);
\r
4458 np = com.np = (com.ntime=1) + com.nkappa + !com.fix_omega;
\r
4461 /* initial values and bounds */
\r
4462 x[0] = SeqDistance[is*(is-1)/2+js]*(0.8+0.3*rndu());
\r
4463 if(x[0]>3) x[0]=1.5+rndu();
\r
4464 if(x[0]<1e-6) x[0]=.5*rndu();
\r
4465 if(com.nkappa==1) { /* HKY type model */
\r
4466 if(is==0 && js==1) x[1] = (com.icode==1?4:1.5)+rndu();
\r
4467 else x[1] = (x[1]*2+2+rndu())/3;
\r
4468 if(x[1]>10) x[1] = 5;
\r
4471 else /* REV or FMutSel models, do something later */
\r
4472 for(j=1,x[1]=.8+.4*rndu(); j<com.nkappa; j++)
\r
4473 x[1+j] = .2+.4*rndu();
\r
4475 if(!com.fix_omega) {
\r
4477 if(is==0 && js==0) x[k] = 0.2+0.2*rndu();
\r
4478 else x[k] = (3*x[k]+0.6*rndu())/4;
\r
4479 x[k] = max2(x[k],0.01);
\r
4480 x[k] = min2(x[k],2);
\r
4484 FPN(F0); FOR(k,np) printf(" %12.6f",x[k]); FPN(F0);
\r
4485 FOR(k,np) printf(" %12.6f",xb[k][0]); FPN(F0);
\r
4486 FOR(k,np) printf(" %12.6f",xb[k][1]); FPN(F0);
\r
4489 if(com.fix_kappa && com.fix_omega)
\r
4490 eigenQcodon(1,-1,NULL,NULL,NULL,Root,U,V, &mr, com.pkappa,com.omega,PMat);
\r
4493 if( com.runmode == -3 ){ //kostas, save values
\r
4500 ming2(noisy>3?frub:NULL, &lnL, lfun2dSdN, NULL, x, xb, space, e, np);
\r
4501 else { x[1]=x[2]=com.kappa=com.omega=0; lnL=0; }
\r
4504 fprintf(fout,"\nlnL =%12.6f\n",-lnL);
\r
4505 FOR(k,np) fprintf(fout," %8.5f",x[k]); FPN(fout);
\r
4508 printf("\n\nt_NG = %.5f\tMLEs: ", SeqDistance[is*(is-1)/2+js]);
\r
4509 for(k=0;k<np;k++) printf(" %.5f", x[k]);
\r
4512 if (np && com.getSE) {
\r
4513 Hessian(np, x, lnL, space, var, lfun2dSdN, var+np*np);
\r
4514 matinv(var, np, np, var+np*np);
\r
4515 fprintf(fout,"SEs for parameters:\n");
\r
4516 FOR(k,np) fprintf(fout," %8.5f",(var[k*np+k]>0.?sqrt(var[k*np+k]):-0));
\r
4520 eigenQcodon(2,x[0],&S,&dS,&dN, NULL,NULL,NULL, &mr, com.pkappa,com.omega,PMat);
\r
4523 puts("\nNucleotide-based analysis (approximate MLEs; use baseml to get proper MLEs):");
\r
4524 printf("\ndHKY (123-4):"); FOR (k,4) printf(" %8.5f", dHKY[k]);
\r
4525 printf("\nkHKY (123-4):"); FOR (k,4) printf(" %8.5f", kHKY[k]);
\r
4526 printf(" (%d four-fold sites)\n", sites4);
\r
4529 fprintf(fds," %7.4f",dS); fprintf(fdn," %7.4f",dN);
\r
4530 fprintf(ft," %7.4f",x[0]);
\r
4533 "t= %6.4f S= %7.1f N= %7.1f dN/dS= %7.4f dN =%7.4f dS =%7.4f\n",
\r
4534 x[0], S, com.ls*3-S, com.omega, dN, dS);
\r
4536 fprintf(frst,"%8.1f %8.1f %8.4f %8.4f %8.4f", com.ls*3-S, S, dN, dS, com.omega);
\r
4537 for(k=0; k<np; k++) fprintf(frst," %8.4f",x[k]);
\r
4539 for(k=0; k<np; k++) fprintf(frst1,"\t%.4f",x[k]);
\r
4540 fprintf(frst1,"\t%.3f", -lnL);
\r
4541 fprintf(frst1,"\t%.4f\t%.4f", dN, dS);
\r
4545 fprintf(frst," +-%6.4f",(var[k*np+k]>0.?sqrt(var[k*np+k]):-1));
\r
4546 fprintf(frst," %9.3f\n",-lnL);
\r
4547 if(com.getSE && !com.fix_omega) {
\r
4549 FOR(j,np) y[j] = x[j];
\r
4550 y[k] += (eh=(x[k]+1)*Small_Diff);
\r
4551 if(!com.fix_kappa) com.kappa = y[1];
\r
4552 com.omega = y[1+!com.fix_kappa];
\r
4553 eigenQcodon(2,y[0],&S,&dS1,&dN1,NULL,NULL,NULL, &mr, com.pkappa,com.omega,PMat);
\r
4555 if(!com.fix_kappa) com.kappa = y[1];
\r
4556 com.omega = y[1+!com.fix_kappa];
\r
4557 eigenQcodon(2,y[0],&S,&dS2,&dN2,NULL,NULL,NULL, &mr, com.pkappa,com.omega,PMat);
\r
4559 JacobiSN[0*np+k] = (dS1-dS2)/(2*eh);
\r
4560 JacobiSN[1*np+k] = (dN1-dN2)/(2*eh);
\r
4563 matby(JacobiSN, var, T1, 2, np, np);
\r
4564 mattransp2(JacobiSN, T2, 2, np);
\r
4565 matby(T1,T2,vSN,2,np,2);
\r
4567 fputs("\nvar(dS,dN):\n", fout);
\r
4568 matout(fout,vSN,2,2);
\r
4570 fprintf(fout,"dN = %7.5f +- %.5f dS = %7.5f +- %.5f",
\r
4571 dN,(vSN[3]>0?sqrt(vSN[3]):-0),dS,(vSN[0]>0?sqrt(vSN[0]):-0));
\r
4572 fprintf(fout," (by method 1)\n");
\r
4575 T1[1] = T1[2] = var[0*np+np-1];
\r
4576 T1[3] = var[(np-1)*np+(np-1)];
\r
4577 if(com.getSE && !com.fix_omega)
\r
4578 VariancedSdN(x[0], x[np-1], T1, vSN);
\r
4580 fprintf(fout,"dN = %7.5f +- %.5f dS = %7.5f +- %.5f",
\r
4581 dN,(vSN[3]>0?sqrt(vSN[3]):-0),dS,(vSN[0]>0?sqrt(vSN[0]):-0));
\r
4582 fprintf(fout," (by method 2)\n");
\r
4587 if(com.runmode == -3) BayesPairwise( is, js, x, var, lnL, 32, xb, space ); //kostas
\r
4589 fflush(frst); fflush(fout);
\r
4591 FPN(fds); FPN(fdn); FPN(ft);
\r
4592 fflush(fds); fflush(fdn); fflush(ft);
\r
4595 com.ls = (int)ls0; FOR(k,com.ns) com.z[k] = pz0[k];
\r
4596 com.npatt = npatt0; FOR(h,npatt0) com.fpatt[h] = fpatt0[h]; free(fpatt0);
\r
4606 int BayesPairwise(int is, int js, double x[], double var[], double maxlogl,
\r
4607 int npoints, double xb[][2], double space[])
\r
4609 /*This function returns estimates of E[ t | x ], E[ w | x ], Var[ t | x ], Var[ w | x ],
\r
4610 Cov[ w,t | x ], Corr[ w,t | x], P[ w>1 | x ]
\r
4612 double *nodes = NULL, *weights = NULL; //contain the points and weights
\r
4613 double interm_results[7] = {0,0,0,0,0,0,0}; //contain the normalizing_constant, E[w|x], E[t|x], E[w^2|x], E[t^2|x], E[w*t|x], P[w>1|x]
\r
4614 register int i = 0, j = 0;
\r
4615 int w_index = 0, t_index = 0, way = 0, setp = 0;
\r
4616 double w_value, t_value, w_weight, t_weight, sign, scalefactor = 0, Qmatrix[64*64],
\r
4617 z1, z2, logl, logposterior, hvalue, m1, m2, s1, s2, rvalue,
\r
4618 bayes_est[7], maxlogP, e = 1e-7, xp[2], kappa[1], pS, pN, sdiff[3] = {1e-7, 1e-8, 1e-9};
\r
4619 double FL, alpha, u, w_p, Rp[64*64], Up[64*64], Vp[64*64], PMatp[64*64], logl_p, logposterior_p,
\r
4620 hvalue_p, qvalue; //For calculation of P(w>1|x)
\r
4621 char ch1[] = "E[t]", ch2[] = "E[w]", ch3[] = "SE[t]", ch4[] = "SE[w]",
\r
4622 ch5[] = "Cov[t,w]", ch6[] = "Corr[t,w]", ch7[] = "P[w > 1]";
\r
4625 if( com.fix_kappa == 1 && com.fix_omega == 0 ){
\r
4627 x[1] = com.kappa;
\r
4629 if( com.fix_omega == 1 )
\r
4632 pS = Sd/SS; pN = Nd/NN;
\r
4633 if( pS == 0 && pN == 0 ){ x[1] = 2; } //fix value of k at 2 when sequences are identical
\r
4635 if( com.codonf == 3 ){
\r
4636 for( i = 0; i < 61; i++ ){
\r
4637 if( com.pi[i] < 1e-15 ) com.pi[i] = 1e-15;
\r
4641 if( ( pS>0 && pS<0.74 ) && ( pN>0 && pN<0.74 ) && x[0] > 0.001 && x[0] < 10 && x[2] > 0.005 && x[2] < 5 ){
\r
4642 // Choice of 0.001, 0.005, 5 and 10 is arbitrary. We just need to exclude extreme
\r
4643 // values because extreme values are shrinked and if we use the MLES to define
\r
4644 // the location parameters of logistic distributions the transformation won't be proper
\r
4646 EstVariances( var );
\r
4651 if( x[0] < 10 ) xp[0] = x[0]; else xp[0] = x[4]; //initial t value for ming2. MAP is usually far from an extreme ML value for a prior mean ~1.
\r
4652 if( x[2] < 10 ) xp[1] = x[2]; else xp[1] = x[5]; //initial w value for ming2. MAP is usually far from an extreme ML value for a prior mean ~0.5.
\r
4654 if( pS == 0 && pN == 0 ) xp[1] = com.hyperpar[2] / com.hyperpar[3]; //in case of identical sequences posterior mean would be close to the prior mean
\r
4655 //Set the boundaries for minimization
\r
4656 xb[0][0] = 1e-5; xb[0][1] = 100;
\r
4657 xb[1][0] = 1e-5; xb[1][1] = 200;
\r
4659 if( com.fix_kappa == 0 ){
\r
4660 com.fix_kappa = 1; com.kappa = x[1]; com.nkappa = 0, com.np = 2;
\r
4663 x[5] = Small_Diff; //Store the value of Small_Diff. I return the initial value back below.
\r
4664 Small_Diff = 1e-9;
\r
4666 ming2( NULL, &maxlogP, logP, NULL, xp, xb, space, e, 2 );
\r
4667 printf( "\nMAPs: %.5f %.5f\n", xp[0], xp[1] );
\r
4669 Small_Diff = 1e-6;
\r
4671 Hessian( 2, xp, maxlogP, space, var, logP, var+2*2 );
\r
4672 //Explores whether different values 1e-9 <= Small_Diff <= 1e-7 give appropriate Hessian matrix (No negative or zero variance after Hessian inversion)
\r
4673 if( ( var[0] * var[3] - var[1] * var[2] ) < 0 || var[0] < 0 ){ //check condition for local minimum
\r
4674 for( i=0; i<3; i++){
\r
4675 Small_Diff = sdiff[i];
\r
4676 Hessian( 2, xp, maxlogP, space, var, logP, var+2*2 );
\r
4677 if( ( var[0] * var[3] - var[1] * var[2] ) > 0 && var[0] > 0 ) break;
\r
4679 fprintf( fout, "MAPs:\n %.5f %.5f\n", xp[0], xp[1] );
\r
4680 printf( "Can't estimate Hessian" );
\r
4686 Small_Diff = x[5]; //Restore the initial value of Small_Diff
\r
4687 matinv( var, 2, 2, var+2*2 );
\r
4688 x[0] = xp[0]; x[2] = xp[1]; var[1] = var[3];
\r
4691 m1 = log( x[0] ); //location parameter of logistic distribution for transformation of t
\r
4692 m2 = log( x[2] ); //location parameter of logistic distribution for transformation of w
\r
4693 s1 = ( 1 / x[0] ) * sqrt( var[0] ); //scale parameter of logistic distribution for transformation of t
\r
4694 s2 = ( 1 / x[2] ) * sqrt( var[1] ); //scale parameter of logistic distribution for transformation of w
\r
4696 GaussLegendreRule( &nodes, &weights, npoints );
\r
4698 //For P(w>1|x) calculation
\r
4699 FL = CDFLogis( 0, m2, s2 );
\r
4700 if( FL > 1 - 1e-5 ){ bayes_est[6] = 0, setp = 1; } //If all the mass of posterior density is below 1, set P(w>1|x)=0
\r
4701 if( FL < 1e-5 ){ bayes_est[6] = 1, setp = 1; } //If all the mass of posterior density is above 1, set P(w>1|x)=1
\r
4702 if( setp == 0 ) alpha = 2 * FL - 1;
\r
4704 for(i = 0; i < npoints; i++) { //count for omega
\r
4705 if( i < npoints / 2 ) { w_index = npoints/2 - 1 - i; sign = -1; }
\r
4706 else { w_index = i - npoints/2; sign = 1; }
\r
4707 z2 = sign*nodes[ w_index ];
\r
4708 w_weight = weights[ w_index ]; //weight for omega value
\r
4709 w_value = logistic_transformation( z2, m2, s2 ); //value of omega after logistic transformation
\r
4711 eigenQcodon( 1, -1, NULL, NULL, NULL, Root, U, V, &scalefactor, kappa, w_value, Qmatrix ); //estimate Root,U,V for the scaled Qmatrix
\r
4713 //For P(w>1|x) calculation
\r
4715 u = ( (1 - alpha) * z2 + 1 + alpha ) / 2;
\r
4716 w_p = logistic_transformation( u, m2, s2 );
\r
4717 eigenQcodon(1, -1, NULL, NULL, NULL, Rp, Up, Vp, &scalefactor, kappa, w_p, Qmatrix ); //estimate Rp,Up,Vp for the scaled Qmatrix for the calculation of P(w>1|x)
\r
4720 for( j=0; j<npoints; j++ ){ //count for distance
\r
4721 if( j < npoints/2 ) { t_index = npoints/2 - 1 - j; sign = -1; }
\r
4722 else { t_index = j - npoints/2; sign = 1; }
\r
4723 z1 = sign * nodes[ t_index ];
\r
4724 t_weight = weights[ t_index ];
\r
4725 t_value = logistic_transformation( z1, m1, s1 ); //value of distance after logistic transformation
\r
4727 PMatUVRoot( PMat, t_value, com.ncode, U, V, Root ); //calculate Ptmatrix
\r
4729 logl = loglikelihood( PMat );
\r
4730 logposterior = logl + logprior( t_value, w_value, com.hyperpar );
\r
4732 if( way == 0 ) hvalue = exp( logposterior + maxlogl ); //way is 0 or 1
\r
4733 else hvalue = exp( logposterior + maxlogP );
\r
4735 rvalue = ( hvalue * 2 * t_value * s1 * 2 * w_value * s2 ) / ( (1 - square(z1)) * (1 - square(z2) ) );
\r
4738 // For P(w>1|x) calculation
\r
4740 PMatUVRoot( PMatp, t_value, com.ncode, Up, Vp, Rp );
\r
4741 logl_p = loglikelihood( PMatp );
\r
4742 logposterior_p = logl_p + logprior( t_value, w_p, com.hyperpar );
\r
4744 if( way == 0) hvalue_p = exp( logposterior_p + maxlogl ); //way is 0 or 1
\r
4745 else hvalue_p = exp( logposterior_p + maxlogP );
\r
4747 qvalue = ( hvalue_p * 2 * t_value * s1 * 2 * w_p * s2 * ( 1 - alpha ) ) / ( (1 - square(z1)) * (1 - square(u) ) * 2 );
\r
4750 interm_results[0] += w_weight * t_weight * rvalue; //normalizing constant
\r
4751 interm_results[1] += w_weight * t_weight * w_value * rvalue; //unscaled post. mean omega
\r
4752 interm_results[2] += w_weight * t_weight * t_value * rvalue; //unscaled post. mean time
\r
4753 interm_results[3] += w_weight * t_weight * w_value * w_value * rvalue; //unscaled post. mean omega^2
\r
4754 interm_results[4] += w_weight * t_weight * t_value * t_value * rvalue; //unscaled post. mean time^2
\r
4755 interm_results[5] += w_weight * t_weight * w_value * t_value * rvalue; //unscaled post. mean omega*time
\r
4756 if( setp == 0 ) interm_results[6] += w_weight * t_weight * qvalue; //for P[w>1|x] calculation
\r
4758 } //2nd for closes
\r
4759 } //1st for closes
\r
4761 for( i=1; i<=6; i++ )
\r
4762 interm_results[i] /= interm_results[0]; //divide everything by the normalizing constant
\r
4764 bayes_est[0] = interm_results[2]; //posterior mean distance
\r
4765 bayes_est[1] = interm_results[1]; //posterior mean omega
\r
4766 bayes_est[2] = interm_results[4] - square( interm_results[2] ); //posterior Var distance
\r
4767 if( bayes_est[2] < 0 ) bayes_est[2] = 0;
\r
4769 bayes_est[3] = interm_results[3] - square( interm_results[1] ); //posterior Var omega
\r
4770 if( bayes_est[3] < 0 ) bayes_est[3] = 0;
\r
4772 bayes_est[4] = interm_results[5] - interm_results[1] * interm_results[2]; //posterior Cov[t,w]
\r
4774 if( bayes_est[2] != 0 && bayes_est[3] != 0 ){
\r
4775 bayes_est[5] = bayes_est[4] / sqrt( bayes_est[2] * bayes_est[3] ); //posterior Corr[t,w]
\r
4777 if( setp == 0 ) bayes_est[6] = interm_results[6]; // P[w>1|x]
\r
4780 fprintf( fout, "\n\nBayesian estimates of distance and omega\n" );
\r
4781 fprintf( fout, "%14s%14s%14s%14s%15s", ch1, ch2, ch3, ch4, ch5 );
\r
4783 if( bayes_est[2] != 0 && bayes_est[3] != 0){
\r
4784 fprintf( fout, "%15s%15s", ch6, ch7 );
\r
4787 fprintf( fout, "%15s", ch7 );
\r
4790 fprintf( fout, "\n%14.6g%14.6g%14.6g%14.6g%15.6g", bayes_est[0],
\r
4791 bayes_est[1], sqrt( bayes_est[2] ), sqrt( bayes_est[3] ), bayes_est[4] );
\r
4793 if( bayes_est[2] != 0 && bayes_est[3] != 0){
\r
4794 fprintf( fout, "%15.6g%15.3g", bayes_est[5], bayes_est[6] );
\r
4797 fprintf( fout, "%15.3g", bayes_est[6] );
\r
4801 } //end of function
\r
4805 double loglikelihood( double Ptmatrix[] )
\r
4806 { /*This function returns the logl for 2 sequences given the number of different site patterns (com.npatt),
\r
4807 the number of possible different codons (com.ncode), the positions of codons of the site patterns
\r
4808 into the genetic code (*com.z[]), the frequences of site patterns (com.fpatt[]), the probabilities
\r
4809 of codons in equilibrium (com.pi[]) and the Pt matrix(Ptmatrix[])*/
\r
4812 for( i=0; i<com.npatt; i++ )
\r
4813 logl += com.fpatt[i] * log( com.pi[ (int)com.z[0][i] ] * Ptmatrix[ com.ncode * (int)com.z[0][i] + (int)com.z[1][i] ] );
\r
4818 double logistic_transformation( double point, double logmean, double stdlogpar )
\r
4821 result = exp( logmean + stdlogpar * log( ( 1 + point ) / ( 1 - point ) ) );
\r
4825 double logprior( double t, double w, double par[] )
\r
4826 { /*This function returns the log prior function of time and omega(dN/dS)
\r
4827 par vector contains the shape & scale parameters for time for omega.*/
\r
4828 double logtprior = - par[1] * t + ( par[0] - 1 ) * log(t) + par[0] * log( par[1] ) - LnGamma( par[0] );
\r
4829 double logwprior = - par[3] * w + ( par[2] - 1 ) * log(w) + par[2] * log( par[3] ) - LnGamma( par[2] );
\r
4830 return logtprior + logwprior;
\r
4833 double logP( double x[], int np )
\r
4834 { /*This function returns the log-posterior. It is used to find the
\r
4835 mode of the log-posterior. x[0] is for distance and x[1] for w.*/
\r
4836 return lfun2dSdN( x, np ) - logprior( x[0], x[1], com.hyperpar );
\r
4839 int EstVariances( double *var ){
\r
4840 /*This function estimates Var(t) & Var(w) using the
\r
4841 Nei-Gojobori counting method and stores them in the 1st
\r
4842 and 2nd place of *var vector respectively*/
\r
4843 double pS, pN, VarpS, VarpN, hN, hS, VardS, VardN, dS, dN;
\r
4848 hN = 1 - ( (double)4 / 3 ) * pN;
\r
4849 hS = 1 - ( (double)4 / 3 ) * pS;
\r
4851 dS = - 0.75 * log( hS );
\r
4852 dN = - 0.75 * log( hN );
\r
4854 VarpS = ( pS * (1-pS) ) / SS;
\r
4855 VarpN = ( pN * (1-pN) ) / NN;
\r
4857 VardS = square( 1 / hS ) * VarpS;
\r
4858 VardN = square( 1 / hN ) * VarpN;
\r
4860 var[0] = square( ( 3 * SS ) / ( SS + NN ) ) * VardS + square( ( 3 * NN ) / ( SS + NN) ) * VardN;
\r
4861 var[1] = VardN / square( dS ) + ( square( dN ) * VardS ) / square( square( dS ) );
\r
4866 double CDFLogis( double x, double m, double s ){
\r
4867 return 1 / ( 1 + exp( -( x - m ) / s ) );
\r
4869 //End of kostas code
\r
4875 double lfun2AA (double t)
\r
4877 /* likelihood function for two amino acid sequences
\r
4878 prob(i,j) = PI_i * p(i,j,t)
\r
4880 The data are clean & coded (com.z[0] & com.z[1]).
\r
4881 Transition probability pijt is calculated for observed patterns only.
\r
4883 int n=20, h,k, aa0,aa1;
\r
4884 double lnL=0, pijt,expt[20],al=com.alpha;
\r
4886 if(al==0) FOR(k,n) expt[k] = exp(t*Root[k]);
\r
4887 else FOR(k,n) expt[k] = pow(al/(al-t*Root[k]),al);
\r
4888 for(h=0; h<com.npatt; h++) {
\r
4889 aa0=com.z[0][h]; aa1=com.z[1][h];
\r
4890 for(k=0,pijt=0; k<n; k++)
\r
4891 pijt += U[aa0*n+k] * expt[k]*V[k*n+aa1];
\r
4892 lnL -= log(com.pi[aa0]*pijt)*com.fpatt[h];
\r
4898 int _nestS=0; /* 189= estimate the S elements, 0= use those from com.daa[] */
\r
4899 static double *_Fij;
\r
4901 double lfun2AArev (double x[], int np)
\r
4903 /* np = _nestS + 19*3 + (1 or 2);
\r
4904 x[]: Q matrix, 3*pi, 1 or 2 blength
\r
4905 pi[0] is for the root, pi[1] & pi[2] are for Q[1] & Q[2] for the 2 branches.
\r
4906 See notes in PairwiseAArev().
\r
4909 double pi[3][20], *p, Q[3][400], *T=Q[0], *Fe=Q[0], t,t1,t2, m1,m2, lnL=0;
\r
4910 double space[20*20*2+20*2];
\r
4913 for(k=0; k<3; k++) {
\r
4914 for(i=0,p=pi[k],p[19]=1; i<n-1; i++) p[i] = x[_nestS+k*19+i];
\r
4915 for(i=0,t=0; i<n; i++) t+=p[i];
\r
4916 for(i=0; i<n; i++) p[i]/=t;
\r
4920 for(i=0,k=0; i<n; i++) {
\r
4921 for(j=0,Q[1][i*n+i]=0; j<i; j++)
\r
4922 if(i*n+j != ijAAref)
\r
4923 Q[1][i*n+j]=Q[1][j*n+i] = x[k++];
\r
4925 Q[1][ijAAref] = Q[1][(ijAAref%n)*n+(ijAAref/n)] = 1;
\r
4928 for(i=0; i<n; i++)
\r
4929 for(j=0,Q[1][i*n+i]=0; j<=i; j++)
\r
4930 Q[1][i*n+j] = Q[1][j*n+i] = com.daa[i*n+j];
\r
4933 for(i=0,m1=m2=0; i<n; i++) {
\r
4934 for(j=0,t1=t2=0;j<n;j++) {
\r
4935 Q[2][i*n+j] = Q[1][i*n+j]*pi[2][j];
\r
4936 Q[1][i*n+j] *= pi[1][j];
\r
4937 t1 += Q[1][i*n+j];
\r
4938 t2 += Q[2][i*n+j];
\r
4940 Q[1][i*n+i] = -t1;
\r
4941 Q[2][i*n+i] = -t2;
\r
4942 m1 += pi[1][i]*t1;
\r
4943 m2 += pi[2][i]*t2;
\r
4946 if(com.ntime==1) { t1 = x[np-1]/2/m1; t2 = x[np-1]/2/m2; }
\r
4947 else { t1 = x[np-2]/m1; t2 = x[np-1]/m2; }
\r
4948 PMatQRev(Q[1], pi[1], t1, n, space);
\r
4949 PMatQRev(Q[2], pi[2], t2, n, space);
\r
4951 for(i=0; i<n*n; i++) Fe[i]=0;
\r
4954 for(j=0,t=pi[0][k]*Q[1][k*n+i]; j<n; j++)
\r
4955 Fe[i*n+j] += t*Q[2][k*n+j];
\r
4957 /* if(fabs((t=sum(Fe,n*n))-1)>1e-6) printf("Fe = %.9f != 1\n", t); */
\r
4959 for(i=0; i<n*n; i++) {
\r
4960 if(_Fij[i]<=1e-15) continue;
\r
4962 lnL -= _Fij[i]*log(Fe[i]);
\r
4964 printf("Fij_exp = %.10f < 0\n", Fe[i]);
\r
4969 double PairwiseAArev (int is, int js)
\r
4971 /* This calculates pairwise distance under a nonstationary model. It assumes
\r
4972 three sets of amino acid frequencies: pi[0] for the root, pi[1] and pi[2]
\r
4973 are for the Q matrices for branches 1 and 2.
\r
4974 It estimate the symmetrical part of the rate matrix if _nestS==189.
\r
4975 If _nestS==0, it uses the symmetrical part read from the com.daa file.
\r
4976 It can estimate 1 or 2 distances depending on com.ntime=1 or 2.
\r
4978 np = 189 + 19*3 + (1 or 2);
\r
4979 x[]: Q matrix, 3*pi, 1 or 2 blength
\r
4981 int n=com.ncode, h,i,j,k, np=_nestS+ 19*3 + 1;
\r
4982 double Fij[400], x[248], xb[248][2], lnL, e=1e-9, t=0, p[20];
\r
4984 com.ntime=1; /* 1: t1=t2; 2: t1 and t2 */
\r
4985 if(com.ntime==2) np++;
\r
4987 if(com.cleandata!=1) error2("cleandata");
\r
4989 if(com.sspace < spaceming2(np)) {
\r
4990 com.sspace = spaceming2(np);
\r
4991 printf ("\nspace adjusted to %9lu bytes\n",com.sspace);
\r
4992 if((com.space=(double*)realloc(com.space,com.sspace))==NULL)
\r
4993 error2("oom space");
\r
4996 for(h=0,zero(Fij,n*n); h<com.npatt; h++) {
\r
4997 Fij[com.z[is][h]*n+com.z[js][h]] += com.fpatt[h]/com.ls;
\r
5001 for (i=1,k=0; i<n; i++) FOR(j,i)
\r
5002 if(i*n+j!=ijAAref) x[k++] = (com.daa[i*n+j]+.001)*(0.8+0.4*rndu());
\r
5004 for(i=0;i<np;i++) {
\r
5005 x[i]=rndu(); xb[i][0]=1e-5; xb[i][1]=100;
\r
5007 lnL = lfun2AArev(x,np);
\r
5008 printf("\nlnL0 = %12.6f\n",-lnL);
\r
5010 ming2(noisy>2?frub:NULL,&lnL,lfun2AArev,NULL,x,xb, com.space, e, np);
\r
5014 for(k=0; k<3; k++) {
\r
5015 for(i=0,p[19]=1; i<n-1; i++) p[i]=x[_nestS+k*19+i];
\r
5016 for(i=0,t=0; i<n; i++) t+=p[i];
\r
5017 for(i=0; i<n; i++) p[i]/=t;
\r
5018 matout2(F0, p, 1, n, 7, 4);
\r
5021 return (x[_nestS + 19*3]);
\r
5025 int PairwiseAA (FILE *fout, FILE*f2AA)
\r
5027 /* Calculates pairwise distances using amino acid seqs.
\r
5028 Data (com.z[]) are clean and coded.
\r
5029 com.npatt for the whole data set is used which may be greater than
\r
5030 the number of patterns for each pair.
\r
5031 SE is not calculated.
\r
5034 int n=com.ncode, j, is,js;
\r
5035 double x, xb[2]={0,19}, lnL, step;
\r
5037 if (com.ngene>1 && com.Mgene==1) error2("ngene>1 to be tested.");
\r
5038 if (noisy) printf("\npairwise ML distances of AA seqs.\n\n");
\r
5040 if(com.model>Empirical_F) error2("PairwiseAA: model wrong");
\r
5042 if(com.model==0) fillxc(com.pi,1./n, n);
\r
5043 if(com.model>=Empirical) GetDaa(NULL, com.daa);
\r
5044 if(com.model==0 || com.model==Empirical)
\r
5045 eigenQaa(NULL, Root, U, V, NULL);
\r
5047 FOR(j,com.ns) pz0[j]=com.z[j];
\r
5048 fprintf(fout,"\nML distances of aa seqs.\n");
\r
5050 fprintf(fout,"\nContinuous gamma with alpha = %.3f is used (ncatG is ignored).\n\n",com.alpha);
\r
5052 fprintf(f2AA,"%6d\n", com.ns);
\r
5053 for(is=0; is<com.ns; is++,FPN(F0),FPN(fout),FPN(f2AA)) {
\r
5054 printf ("%4d vs", is+1);
\r
5055 fprintf(f2AA,"%-14s ", com.spname[is]);
\r
5056 fprintf(fout,"%-14s ", com.spname[is]);
\r
5057 for(js=0; js<is; js++) {
\r
5059 if(com.model==REVaa) {
\r
5060 x = PairwiseAArev(is, js);
\r
5061 fprintf(f2AA," %7.4f",x); fprintf(fout," %7.4f",x);
\r
5065 com.z[0]=pz0[is]; com.z[1]=pz0[js];
\r
5066 printf (" %2d", js+1);
\r
5067 if(com.model==1||com.model==Empirical_F) {
\r
5068 for (j=0,zero(com.pi,n); j<com.npatt; j++) {
\r
5069 com.pi[(int)com.z[0][j]]+=com.fpatt[j];
\r
5070 com.pi[(int)com.z[1][j]]+=com.fpatt[j];
\r
5072 abyx(1./sum(com.pi,n), com.pi, n);
\r
5073 eigenQaa(NULL,Root,U,V,NULL);
\r
5075 /* com.posG[1]=com.npatt; */
\r
5077 xb[0]=SeqDistance[is*(is-1)/2+js]; x=xb[0]*1.5; step=xb[0];
\r
5078 LineSearch(lfun2AA, &lnL, &x, xb, step, 1e-7);
\r
5079 fprintf(f2AA," %7.4f",x); fprintf(fout," %7.4f",x);
\r
5084 FOR(j,com.ns) com.z[j]=pz0[j];
\r
5089 char GetAASiteSpecies(int species, int sitepatt)
\r
5091 /* this returns the amino acid encoded by the codon at sitepatt in species.
\r
5092 Returns '*' if more than two amino acids or '-' if codon is --- or ***.
\r
5094 int n=com.ncode, c, naa, k;
\r
5097 if(com.seqtype!=1)
\r
5098 error2("GetAASiteSpecies() right now works for codon seqs only. Check.");
\r
5099 c = com.z[species][sitepatt];
\r
5101 aa = AAs[ GeneticCode[com.icode][FROM61[c]] ];
\r
5103 else { /* naa is = 1 or >1, precise value being incorrect. */
\r
5104 for(k=0,aa=-1; k<nChara[c]; k++) {
\r
5105 newaa = GeneticCode[com.icode][FROM61[ CharaMap[c][k] ]];
\r
5106 if(newaa==-1) continue;
\r
5107 newaa = AAs[newaa];
\r
5113 if(newaa != aa) naa++;
\r
5115 if(nChara[c]==n) aa = '-';
\r
5116 else if(naa>1) aa = '*';
\r
5123 int PrintProbNSsites (FILE* frst, double prob[], double meanw[], double varw[], int ncat, int refsp)
\r
5125 /* This prints out posterior probabilities that each site is from a site class
\r
5126 under the NSsites mdoels (model=0).
\r
5127 This is called by both the old empirical Bayes routine (NEB) and also the new
\r
5128 Bayes empirical Bayes (BEB) routine.
\r
5130 int h, hp, it, ir, lst=(com.readpattern?com.npatt:com.ls);
\r
5131 double psel=0, wpos=1, cutoff=0.5;
\r
5132 double mpostp[NCATG];
\r
5135 char codons[2][4];
\r
5136 double St, Nt, ns, na, ndiff;
\r
5138 if(com.model==0) {
\r
5139 fprintf(frst," & postmean_w");
\r
5140 if(!BayesEB && com.rK[ncat-1]>1) fprintf(frst," & P(w>1)");
\r
5142 fprintf(frst,"\n(amino acids refer to 1st sequence: %s)\n\n", com.spname[refsp]);
\r
5143 zero(mpostp, com.ncatG);
\r
5144 for(h=0; h<lst; h++,FPN(frst)) {
\r
5145 hp = (!com.readpattern ? com.pose[h] : h);
\r
5146 aa = GetAASiteSpecies(refsp, hp);
\r
5147 fprintf(frst,"%4d %c ", h+1, aa);
\r
5148 for (ir=0,it=0,psel=0; ir<ncat; ir++) {
\r
5149 fprintf(frst," %5.5f", prob[ir*com.npatt+hp]);
\r
5150 if(prob[ir*com.npatt+hp] > prob[it*com.npatt+hp])
\r
5152 if(!BayesEB && com.model==0)
\r
5153 if(com.rK[ir] > 1) psel += prob[ir*com.npatt+hp];
\r
5154 mpostp[ir] += prob[ir*com.npatt+hp]/com.ls;
\r
5157 fprintf(frst, " (%2d)", it+1);
\r
5158 if(com.model==0) {
\r
5159 fprintf(frst, " %6.3f", meanw[hp]);
\r
5160 if(!BayesEB && psel) fprintf(frst, " %6.3f", psel);
\r
5161 if(BayesEB==1 && com.model==0)
\r
5162 fprintf(frst, " +- %6.3f", varw[hp]);
\r
5168 printf("\nmean posterior probabilities for site classes");
\r
5169 matout(F0, mpostp, 1, com.ncatG);
\r
5170 matout(F0, com.freqK, 1, com.ncatG);
\r
5174 /* list of positively selected sites */
\r
5175 if(com.model==0) { /* NSsites models */
\r
5176 if(com.NSsites!=1 && com.NSsites!=7)
\r
5177 fprintf(frst,"\nPositively selected sites\n\n\tProb(w>1) mean w\n\n");
\r
5179 for(ir=0,it=0; ir<ncat; ir++)
\r
5180 if(BayesEB==1 || (com.freqK[ir]>.1/com.ls && com.rK[ir]>wpos)) it=1;
\r
5181 if(!com.aaDist && it) {
\r
5182 fprintf(fout,"\nPositively selected sites (*: P>95%%; **: P>99%%)\n");
\r
5183 fprintf(fout,"(amino acids refer to 1st sequence: %s)\n\n", com.spname[refsp]);
\r
5184 fprintf(fout," Pr(w>1) %25s\n\n", "post mean +- SE for w");
\r
5185 for(h=0; h<lst; h++) {
\r
5186 hp=(!com.readpattern ? com.pose[h] : h);
\r
5188 psel = prob[(ncat-1)*com.npatt+hp];
\r
5190 for (ir=0,psel=0; ir<ncat; ir++)
\r
5191 if(com.rK[ir]>wpos) psel+=prob[ir*com.npatt+hp];
\r
5195 if(psel>.95) sig = "* ";
\r
5196 if(psel>.99) sig = "**";
\r
5197 aa = GetAASiteSpecies(refsp, hp);
\r
5198 fprintf(fout,"%6d %c %10.3f%-8s %.3f", h+1, aa, psel, sig, meanw[hp]);
\r
5199 fprintf(frst,"%6d %c %10.3f%-8s %.3f", h+1, aa, psel, sig, meanw[hp]);
\r
5201 if(BayesEB==1 && com.model==0) {
\r
5202 fprintf(fout, " +- %5.3f", varw[hp]);
\r
5203 fprintf(frst, " +- %5.3f", varw[hp]);
\r
5206 /*********** print out both codons if 2 sequences ******/
\r
5209 codons[0] = CODONs[com.z[0][hp]]);
\r
5210 codons[1] = CODONs[com.z[1][hp]]);
\r
5211 ndiff=difcodonNG(codons[0], codons[1], &St,&Nt,&ns,&na,0,com.icode);
\r
5212 fprintf(fout,"\t%3s %3s %2.0f diff (ps pn: %5.3f %5.3f)", codons[0], codons[1], ndiff, ns/St, na/Nt);
\r
5216 FPN(fout); FPN(frst);
\r
5220 if(!BayesEB==1 && com.rK[ncat-2]>wpos)
\r
5221 fputs("\nNote: more than one w>1. Check rst for details\n",fout);
\r
5227 int lfunNSsites_rate (FILE* frst, double x[], int np)
\r
5229 /* This calculates the dN/dS rates for sites under models with variabel dN/dS
\r
5230 ratios among sites (Nielsen and Yang 1998). Modified from lfundG()
\r
5231 com.fhK[] holds the posterior probabilities.
\r
5233 int h,hp, ir, it=0, refsp=0, k=com.ntime+com.nrgene+com.nkappa;
\r
5235 double w2=x[com.np-1],psel=0, *meanw, maxmw, minmw, wpos=1.1, cutoff=0.5;
\r
5238 FILE *fsites, *fras;
\r
5239 int continuous=0, R,G,B;
\r
5240 int lst=(com.readpattern?com.npatt:com.ls);
\r
5241 int ncolors=5; /* continuous = 0 uses the specified colors */
\r
5242 char sitelabel[96], *colors[5]={"darkblue", "lightblue", "purple", "pinkred", "red"};
\r
5243 char *colorvalues[5]={"[2,2,120]", "[133,57,240]", "[186,60,200]", "[200,60,160]", "[250,5,5]"};
\r
5246 if(com.nparK) error2("lfunNSsites_rate to be done for HMM.");
\r
5248 if((meanw=(double*)malloc(com.npatt*sizeof(double)))==NULL)
\r
5249 error2("oom lfunNSsites_rate"); /* meanw useful for NSsites only */
\r
5251 printParametersNSsites(frst,x);
\r
5253 fputs("\nCheck main result file for parameter estimates\n", frst);
\r
5256 if(com.NnodeScale)
\r
5257 FOR(h,com.npatt) {
\r
5258 for(ir=1,fh=com.fhK[h]; ir<com.ncatG; ir++)
\r
5259 if(com.fhK[ir*com.npatt+h]>fh) fh=com.fhK[ir*com.npatt+h];
\r
5260 for(ir=0; ir<com.ncatG; ir++)
\r
5261 com.fhK[ir*com.npatt+h]=exp(com.fhK[ir*com.npatt+h]-fh);
\r
5262 lnL-=fh*com.fpatt[h];
\r
5265 for(h=0; h<com.npatt; h++) {
\r
5266 for (ir=0,fh=meanw[h]=0; ir<com.ncatG; ir++) {
\r
5267 fh += (com.fhK[ir*com.npatt+h]*=com.freqK[ir]); /* prior=>posterior */
\r
5268 meanw[h] += com.fhK[ir*com.npatt+h]*com.rK[ir];
\r
5270 for (ir=0,meanw[h]/=fh; ir<com.ncatG; ir++) com.fhK[ir*com.npatt+h]/=fh;
\r
5271 lnL -= com.fpatt[h]*log(fh);
\r
5274 fprintf(frst,"\nNaive Empirical Bayes (NEB) probabilities for %d classes",com.ncatG);
\r
5275 if(com.model==0 && com.NSsites && com.NSsites!=1 && com.NSsites!=7)
\r
5276 fprintf(fout,"\nNaive Empirical Bayes (NEB) analysis");
\r
5277 PrintProbNSsites(frst, com.fhK, meanw, NULL, com.ncatG, refsp);
\r
5279 if(com.model && com.model<=NSbranch2) { /* branch&site models */
\r
5280 if(com.rK[0]>wpos || com.rK[1]>wpos) { /* positive sites for all lineages */
\r
5281 fputs("\n\nPositive sites for all lineages Prob(w>1):\n",fout);
\r
5282 for(h=0; h<lst; h++) {
\r
5283 hp=(!com.readpattern ? com.pose[h] : h);
\r
5284 aa = GetAASiteSpecies(refsp, hp);
\r
5286 if(com.rK[0]>wpos) psel = com.fhK[0*com.npatt+hp];
\r
5287 if(com.rK[1]>wpos) psel += com.fhK[1*com.npatt+hp];
\r
5290 if(psel>.95) sig = "*";
\r
5291 if(psel>.99) sig = "**";
\r
5292 fprintf(fout, "%6d %c %.3f%s\n", h+1, aa, psel, sig);
\r
5296 if(w2>wpos && (com.freqK[com.ncatG-1]>1e-6)) { /* for foreground branches */
\r
5297 fprintf(fout,"\nNaive Empirical Bayes (NEB) analysis (please use the BEB results.)");
\r
5298 fprintf(fout,"\nPositive sites for foreground lineages Prob(w>1):\n\n");
\r
5299 for(h=0; h<lst; h++) {
\r
5300 hp=(!com.readpattern ? com.pose[h] : h);
\r
5301 aa = GetAASiteSpecies(refsp, hp);
\r
5302 psel = com.fhK[2*com.npatt+hp]+com.fhK[3*com.npatt+hp];
\r
5305 if(psel>.95) sig = "*";
\r
5306 if(psel>.99) sig = "**";
\r
5307 fprintf(fout, "%6d %c %.3f%s\n", h+1, aa, psel, sig);
\r
5312 fprintf (frst,"\n\nlnL = %12.6f\n", -lnL);
\r
5314 /* RasMol script for coloring structure */
\r
5315 if(com.verbose && com.model==0) {
\r
5316 fsites=(FILE*)fopen("SiteNumbering.txt", "r");
\r
5318 puts("\nCollecting RasMol commands for coloring structure into RasMol.txt");
\r
5320 printf("Choose color scheme (0: %d colors, 1: white->red, 2: rainbow) ",ncolors);
\r
5321 scanf("%d", &continuous);
\r
5323 fras = (FILE*)gfopen("RasMol.txt", "w");
\r
5324 for(h=0,maxmw=0,minmw=99; h<com.npatt; h++) {
\r
5325 if(maxmw < meanw[h]) maxmw = meanw[h];
\r
5326 if(minmw > meanw[h]) minmw = meanw[h];
\r
5328 if(continuous == 0)
\r
5329 for (it=0; it<ncolors; it++)
\r
5330 printf("\t%-10s %-20s mean_w < %7.5f\n",
\r
5331 colors[it], colorvalues[it], (it+1)*(maxmw-minmw)/ncolors);
\r
5332 fprintf(fras, "cartoon\nset background white\n");
\r
5333 for(h=0; h<lst; h++) {
\r
5334 fscanf(fsites, "%d%s", &it, sitelabel);
\r
5335 if(it-1!=h) { puts("Site number wrong. Giving up."); break; }
\r
5336 if(strchr(sitelabel, '?')) continue;
\r
5337 hp = (!com.readpattern ? com.pose[h] : h);
\r
5339 if(continuous==0) {
\r
5340 for (it=0; it<ncolors; it++)
\r
5341 if(meanw[hp]<minmw+(it+1.)*(maxmw-minmw)/ncolors+1e-9) break;
\r
5342 fprintf(fras,"select %s\n\t\tcolor %s\n", sitelabel, colorvalues[it]);
\r
5344 else if (continuous==1) {
\r
5345 it = 5+(int)(245*(meanw[hp]-minmw)/(maxmw-minmw+1e-9));
\r
5346 fprintf(fras,"select %s\n\t\tcolor [250, %d, %d]\n", sitelabel, 255-it,255-it);
\r
5349 rainbowRGB((meanw[hp]-minmw)/(maxmw-minmw+1e-9), &R, &G, &B);
\r
5350 fprintf(fras, "select %s\n\t\tcolor [%d, %d, %d]\n", sitelabel, R,G,B);
\r
5353 fclose(fsites); fclose(fras);
\r
5358 if(com.model==0 && (com.NSsites==NSpselection || com.NSsites==NSbetaw)
\r
5359 && (com.fix_omega!=1 || com.omega!=1)) /* BEB for M2 & M8 */
\r
5360 lfunNSsites_M2M8(frst, x, com.np);
\r
5361 if(!com.fix_omega && (com.model==2 || com.model==3) && com.NSsites==2) /* BEB for branchsite A & clade C */
\r
5362 lfunNSsites_AC(frst, x, com.np);
\r
5368 #ifdef NSSITESBandits
\r
5369 void finishup(void)
\r
5372 fend=(FILE*)gfopen("finished","w");
\r
5380 (*) Codon models for variable dN/dS ratios among sites
\r
5381 (com.nrate includes kappa & omega) (see also CDFdN_dS)
\r
5385 0 one-ratio 0: one ratio for all sites
\r
5386 1 neutral 1: p0 (w0=0, w1=1)
\r
5387 2 selection 3: p0, p1, w2 (w0=0, w1=1)
\r
5388 3 discrete 2K-1: p0,p1,..., and w0,w1,...
\r
5389 4 freqs K: p's (w's are fixed)
\r
5390 5 gamma 2: alpha, beta
\r
5391 6 2gamma 4: p0, alpha1,beta1, alpha2=beta2
\r
5392 7 beta 2: p_beta, q_beta
\r
5393 8 beta&w 4: p0, p_beta, q_beta, w estimated
\r
5394 9 beta&gamma 5: p0, p_beta, q_beta, alpha, beta
\r
5395 10 beta&1+gamma 5: p0, p_beta, q_beta, alpha, beta (1+gamma used)
\r
5396 11 beta&1>normal 5: p0, p_beta, q_beta, mu, s (normal truncated w>1)
\r
5397 12 0&2normal 5: p0, p1, mu2, s1, s2
\r
5398 13 3normal 6: p0, p1, mu2, s0, s1, s2
\r
5399 14 M8a:beta&w=1 3: p0, p_beta, q_beta, w=1 fixed
\r
5400 15 M8a:beta&w>=1 4: p0, p_beta, q_beta, w>=1 estimated
\r
5402 NSsites = 14 forces change to fix_omega so we can't have 2 models in one run.
\r
5403 NSsites = 15 would not set omegab[0] correctly for the next tree.
\r
5406 (*) Codon models for variable dN/dS ratios among both branches and sites
\r
5407 (model=2, NSsites=3 or 2)
\r
5408 (com.nrate includes kappa & omega)
\r
5409 Parameters include branchlens, kappa, p0, p1, w0, w1, w2
\r
5411 method = 0: SetPSiteClass copies w's to nodes[].omega and PMat is calculated
\r
5412 in ConditionalPNode().
\r
5413 method = 1: PMat for branch of interest is calulated in lfuntdd_SiteClass().
\r
5414 The two types of branches have different Qfactor_NS: Qfactor_NS_branch[2].
\r
5418 (*) Codon (perhaps aa as well) site-class models
\r
5420 NSsites=3, ncatG=3 or 2 etc
\r
5423 1-6 for G1974,Miyata,c,p,v,a
\r
5424 FIT1 & FIT2 (11, 12): fitness model F_j = a_p*(p-p*)^2+a_v*(v-v*)^2
\r
5425 FIT1: w_ij = exp(F_j - F_i)
\r
5426 FIT2: w_ij = b*exp(F_j - F_i)
\r
5428 FIT1 & FIT2 are also implemented for NSsites=0
\r
5431 (*) Amino acid models
\r
5433 REVaa: The symmetrical part (S) of the rate matrix Q=S*PI are estimated,
\r
5434 making up 19*20/2-1=189 rate parameters for the matrix. The aa
\r
5435 frequencies are estimated using the observed ones. The Sij for
\r
5436 ijAAref=19*naa+9 (I-V) is set to one and others are relative rates;
\r
5437 REVaa_0: AA1STEP[i*(i+1)+j] marks the aa pair i & j that are
\r
5438 interchangeable. Sij for ijAAref=19*naa+9 (I-V) is set to one
\r
5439 and others are relative rates;
\r
5443 Codon & amino acid models
\r
5445 AAClasses: OmegaAA[i*(i-1)/2+j] marks the dN/dS ratio class for the pair
\r
5446 i & j. Note kappa is before omega's in x[].
\r
5447 OmegaAA[i*(i-1)/2+j]=-1, if AAs i & j are not interchangeable
\r
5448 =0, for the background ratio
\r
5449 =1,...,nclass for AAs i & j specified in OmegaAA.dat.
\r
5450 The total number of classes (com.nOmega) is one plus the number
\r
5451 specified in the file OmegaAAf.
\r
5453 com.nOmega is the number of different dN/dS ratios in the NSbranchB, NSbranch2 models
\r
5455 nodes[].label marks the dN/dS ratio for the node in the NSbranchB NSbranch2 models
\r
5456 AA1STEP[i*(i-1)/2+j] =1 if AAs i & j differ at one codon position;
\r
5459 (*) Codon and amino acid models
\r
5461 aaDist = -5,-4,-3,-2,-1,1,2,3,4,5:
\r
5462 Geometric and linear relationships between amino acid distance and
\r
5463 substitution rate:
\r
5464 wij = a*(1-b*dij/dmax)
\r
5465 wij = a*exp(-b*dij/dmax)
\r
5466 aaDist = 0:equal, +:geometric; -:linear, {1-5:Grantham,Miyata,c,p,v}
\r
5468 aaDist = 11, 12: fitness models, see above.
\r
5475 #if 0 /* routines for testing codon-models */
\r
5477 int GetCategoryQc (char z[NS])
\r
5479 /* the category ID for a codon site with z[NS], transformed
\r
5480 classified into 19 categories
\r
5482 int i,j, icat, ncat=19, it, b[NS][3], nbase[3], markb[4];
\r
5484 puts("\nDo not work with missing data, GetCategoryQc.");
\r
5485 for (j=0; j<com.ns; j++) {
\r
5486 it=FROM61[(int)z[j]]; b[j][0]=it/16; b[j][1]=(it/4)%4; b[j][2]=it%4;
\r
5489 FOR (j,4) markb[j]=0;
\r
5490 FOR (j,com.ns) markb[b[j][i]]=1;
\r
5491 nbase[i]=markb[0]+markb[1]+markb[2]+markb[3]-1;
\r
5493 if(nbase[1]>=2) icat=ncat-1;
\r
5495 if(nbase[0]>2) nbase[0]=2; if(nbase[2]>2) nbase[2]=2;
\r
5496 icat = nbase[1]*9+nbase[0]*3+nbase[2];
\r
5501 int TestModelQc (FILE * fout, double x[])
\r
5503 /* Test the Qc model, slower than simulations
\r
5506 int h, npatt, it, icat, j, nodeb[NS], imposs;
\r
5507 int n=Nsensecodon, isum, nsum, ncat=19;
\r
5508 double fh, y, nobs[19], pexp[19], Pt[8][NCODE*NCODE];
\r
5510 puts("\nDo not work with missing data, GetCategoryQc.");
\r
5511 puts("\ntest Qc..\n");
\r
5512 for (h=0,zero(nobs,ncat); h<com.npatt; h++) {
\r
5513 for (j=0; j<com.ns; j++) z[j]=com.z[j][h]-1;
\r
5514 icat = GetCategoryQc(z);
\r
5515 nobs[icat]+=com.fpatt[h];
\r
5518 printf("cat #%4d: %4d%4d%4d%6.0f\n", j+1,j/9+1,(j/3)%3+1,j%3+1,nobs[j]);
\r
5520 if (com.ns>5 || com.alpha || com.ngene>1)
\r
5521 error2 ("TestModelQc: ns>5 || alpha>0.");
\r
5522 if (SetParameters (x)) puts ("\npar err..");
\r
5523 for (j=0,npatt=1; j<com.ns; j++) npatt*=n;
\r
5524 for (isum=0,nsum=1; isum<tree.nnode-com.ns; nsum*=n,isum++) ;
\r
5525 printf("\nTest Qc: npatt = %d\n", npatt);
\r
5526 FOR (j, tree.nbranch)
\r
5527 PMatUVRoot (Pt[j], nodes[tree.branches[j][1]].branch,n,U,V,Root);
\r
5529 for (h=0,zero(pexp,ncat); h<npatt; h++) {
\r
5530 for (j=0,it=h; j<com.ns; nodeb[com.ns-1-j]=it%n,it/=n,j++) ;
\r
5531 for (j=0,imposs=0; j<com.ns; j++)
\r
5532 { z[j]=nodeb[j]; if (com.pi[(int)z[j]]==0) imposs=1; }
\r
5533 if (imposs) continue;
\r
5535 if ((icat=GetCategoryQc(z)) == ncat-1) continue;
\r
5536 if ((h+1)%100==0)
\r
5537 printf("\rTest Qc:%9d%4d%9.2f%%", h+1, icat, 100.*(h+1.)/npatt);
\r
5539 for (isum=0,fh=0; isum<nsum; isum++) {
\r
5540 for (j=0,it=isum; j<tree.nbranch-com.ns+1; j++)
\r
5541 { nodeb[com.ns+j]=it%n; it/=n; }
\r
5542 for (j=0,y=com.pi[nodeb[tree.root]]; j<tree.nbranch; j++)
\r
5543 y*=Pt[j][nodeb[tree.branches[j][0]]*n+nodeb[tree.branches[j][1]]];
\r
5547 matout (F0, x, 1, com.np);
\r
5548 printf ("\a\ntest Qc: h=%4d fh=%9.4f \n", h, fh);
\r
5552 pexp[ncat-1]=1-sum(pexp,ncat-1);
\r
5555 fprintf(fout, "\ncat # %4d%4d%4d%4d%6.0f%10.5f%10.2f",
\r
5556 j+1, j/9+1, (j/3)%3+1, j%3+1, nobs[j], pexp[j], com.ls*pexp[j]);
\r
5562 #if (DSDN_MC || DSDN_MC_SITES)
\r
5564 void SimulateData2s61(void)
\r
5566 /* This simulates two codon sequences and analyze using ML (GY94).
\r
5567 It generates site pattern freqs and then samples from them
\r
5568 to generate the seq data. Codons are coded as 0,1,...,60. There
\r
5569 is another routine of a similar name in the file dsdn.c where the
\r
5570 codons are coded as 0,1,...,63. The two routines should not be
\r
5572 Note that com.pi[] is changed in the analysis but not reused to
\r
5574 Ifdef (DSDN_MC_SITES), the data will be simulated with the NSsites models
\r
5575 but analysed assuming one omega for all sites, so the model is wrong.
\r
5577 char infile[32]="in.codon2s", seqfile[32]="codonseq.tmp",str[4]="";
\r
5579 int ir,nr=100, ip, i,j,k,h, n=Nsensecodon;
\r
5580 int npatt0=n*(n+1)/2, nobs[NCODE*NCODE];
\r
5581 int il,nil, ls[50]={0,200,500};
\r
5582 double y, x[6]={1,1,1},xb[6][2], S,dN,dS,dNt,dSt,om,lnL, mr=0;
\r
5583 double t0,kappa0,omega0=.5,pi0[NCODE], mx[6],vx[6],mse[6]; /* t,k,w,dS,dN */
\r
5584 double Efij[NCODE*(NCODE+1)/2], space[50000];
\r
5586 com.icode=0; com.seqtype=1; com.ns=2;
\r
5587 com.ncode=n; com.cleandata=1; setmark_61_64 ();
\r
5588 for(j=0; j<com.ns; j++)
\r
5589 com.z[j] = (char*) malloc(npatt0*sizeof(char));
\r
5590 if(com.z[com.ns-1]==NULL) error2("oom z");
\r
5591 if((com.fpatt=(double*)malloc(npatt0*sizeof(double)))==NULL)
\r
5592 error2("oom fpatt");
\r
5593 for(j=0; j<3; j++) { xb[j][0]=.0001; xb[j][1]=99; }
\r
5595 #if (DSDN_MC_SITES)
\r
5596 strcpy(infile,"in.codon2sSites");
\r
5598 printf("\nTwo codon seq. simulation for ML (GY94), input from %s\n",infile);
\r
5599 fin=gfopen(infile,"r");
\r
5601 fscanf (fin,"%d%d%d%d", &k,&nr, &com.codonf, &nil);
\r
5602 printf("\n%d replicates, %s model for analysis\nLc:",
\r
5603 nr, codonfreqs[com.codonf]);
\r
5604 for(il=0; il<nil; il++)
\r
5605 fscanf(fin, "%d", &ls[il+1]);
\r
5606 matIout(F0, ls+1, 1, nil);
\r
5607 for(i=0,k=0; i<NCODE; i++) {
\r
5608 fscanf(fin,"%lf",&y);
\r
5609 if(GeneticCode[com.icode][i]>-1) pi0[k++]=y;
\r
5611 error2("stop codon freq !=0");
\r
5613 printf("sum pi = 1 = %.6f\n", sum(pi0,n));
\r
5615 for(ip=0; ip<99; ip++) {
\r
5616 fscanf(fin, "%lf%lf", &t0,&kappa0);
\r
5618 printf("\n\nParameter set %d\nt0 =%.2f kappa0 =%.2f\n",ip+1,t0,kappa0);
\r
5619 fprintf(frst,"\n\nParameter set %d\nt0 =%.2f kappa0 =%.2f\n",ip+1,t0,kappa0);
\r
5621 for(j=0; j<n; j++) com.pi[j] = pi0[j];
\r
5623 #if (DSDN_MC_SITES)
\r
5625 fscanf(fin,"%d", &com.ncatG);
\r
5626 for(i=0; i<com.ncatG; i++) fscanf(fin,"%lf", &com.freqK[i]);
\r
5627 for(i=0; i<com.ncatG; i++) fscanf(fin,"%lf", &com.rK[i]);
\r
5629 printf("\nSite classe model (K=%d)\np: ",com.ncatG);
\r
5630 for(i=0; i<com.ncatG; i++)
\r
5631 printf("%7.4f",com.freqK[i]);
\r
5632 printf("\nw: "); FOR(i,com.ncatG) printf("%7.4f",com.rK[i]); FPN(F0);
\r
5633 fprintf(frst,"\nSite classe model (K=%d)\np: ",com.ncatG);
\r
5634 for(i=0; i<com.ncatG; i++)
\r
5635 fprintf(frst,"%7.4f",com.freqK[i]);
\r
5636 fputs("\nw: ",frst); FOR(i,com.ncatG) fprintf(frst,"%7.4f",com.rK[i]); FPN(frst);
\r
5638 if(1-sum(com.freqK,com.ncatG))
\r
5639 error2("freqs do not sum to 1");
\r
5640 for(j=0,Qfactor_NS=0,dS=dN=0; j<com.ncatG; j++) {
\r
5641 freqK_NS = com.freqK[j];
\r
5642 eigenQcodon(2,1,&S,&dSt,&dNt,NULL,NULL,NULL, &mr, &kappa0,com.rK[j],PMat);
\r
5643 dS += freqK_NS*dSt;
\r
5644 dN += freqK_NS*dNt;
\r
5646 Qfactor_NS = 1/Qfactor_NS;
\r
5647 om = (dS>0?dN/dS:-1);
\r
5648 dS *= t0*Qfactor_NS;
\r
5649 dN *= t0*Qfactor_NS;
\r
5652 fscanf(fin,"%lf", &omega0);
\r
5653 eigenQcodon(2,t0,&S,&dS,&dN, NULL,NULL,NULL, &mr, &kappa0,omega0,space);
\r
5656 printf("\nCorrect values");
\r
5657 printf("\nS%%=%7.4f dS=%7.4f dN=%7.4f w=%7.4f\n",S/3,dS,dN,om);
\r
5658 fprintf(frst,"\nCorrect values");
\r
5659 fprintf(frst,"\nS%%=%7.4f dS=%7.4f dN=%7.4f w=%7.4f\n",S/3,dS,dN,om);
\r
5661 /* calculate Efij[], the site pattern probabilities */
\r
5662 FOR(j,n) com.pi[j]=pi0[j];
\r
5663 #if (DSDN_MC_SITES)
\r
5665 for(k=0,zero(Efij,npatt0); k<com.ncatG; k++) {
\r
5666 eigenQcodon(1,-1,NULL,NULL,NULL,Root,U,V, &mr, &kappa0,com.rK[k],PMat);
\r
5667 PMatUVRoot(PMat, t0, n, U, V, Root);
\r
5668 for(i=0,h=0;i<n;i++) for(j=0;j<=i;j++) {
\r
5669 y=com.pi[i]*PMat[i*n+j];
\r
5670 Efij[h++] += (i==j?y:2*y) * com.freqK[k];
\r
5675 eigenQcodon(1,-1,NULL,NULL,NULL,Root, U, V, &mr, &kappa0, omega0, PMat);
\r
5676 PMatUVRoot (PMat, t0, n, U, V, Root);
\r
5677 for(i=0,h=0;i<n;i++) for(j=0;j<=i;j++) { /* why for each il? */
\r
5678 y=com.pi[i]*PMat[i*n+j];
\r
5679 Efij[h++]=(i==j?y:2*y);
\r
5682 for(i=h=0,com.ls=1,com.npatt=npatt0;i<n;i++) for(j=0;j<=i;j++) {
\r
5683 com.z[0][h]=(char)i; com.z[1][h]=(char)j;
\r
5684 com.fpatt[h]=Efij[h]; h++;
\r
5686 if(fabs(1-sum(Efij,npatt0))>1e-6) error2("sum Efij != 1");
\r
5688 for(il=0; il<nil+1; il++) {
\r
5691 puts("\nML estimates from infinite data");
\r
5693 x[0]=t0*rndu(); x[1]=kappa0; x[2]=omega0*rndu();
\r
5694 GetCodonFreqs2 ();
\r
5695 ming2(NULL,&lnL,lfun2dSdN,NULL,x,xb, space,1e-10,3);
\r
5696 printf("lnL = %.6f\n",-lnL);
\r
5697 eigenQcodon(2,x[0],&S,&dS,&dN, NULL,NULL,NULL, &mr, &x[1],x[2],space);
\r
5698 printf("S%%=%7.4f dS=%7.4f dN=%7.4f w=%7.4f\n",S/3,dS,dN,x[2]);
\r
5699 fprintf(frst,"ML estimates from infinite data\nt=%7.4f k=%7.4f",x[0],x[1]);
\r
5700 fprintf(frst," S%%=%7.4f dS=%7.4f dN=%7.4f w=%7.4f\n",S/3,dS,dN,x[2]);
\r
5702 for(h=1;h<npatt0; h++) Efij[h]+=Efij[h-1];
\r
5703 puts("\nt & k & w & dS & dN");
\r
5704 fputs("\nLc & t & k & w & dS & dN\n",frst); fflush(frst);
\r
5708 printf("\nls = %d\n", com.ls);
\r
5709 for(ir=0,zero(mx,6),zero(vx,6),zero(mse,6); ir<nr; ir++) {
\r
5710 MultiNomial(com.ls, npatt0, Efij, nobs, NULL);
\r
5711 for(i=0,com.npatt=0,zero(com.pi,n);i<n;i++) for(j=0;j<=i;j++)
\r
5712 if(nobs[k=i*(i+1)/2+j]) {
\r
5713 com.z[0][com.npatt]=i; com.z[1][com.npatt]=j;
\r
5714 com.fpatt[com.npatt++]=nobs[k];
\r
5716 for(i=0,zero(com.pi,n); i<com.npatt; i++) {
\r
5717 com.pi[com.z[0][i]]+=com.fpatt[i]/(2.*com.ls);
\r
5718 com.pi[com.z[1][i]]+=com.fpatt[i]/(2.*com.ls);
\r
5720 GetCodonFreqs2 ();
\r
5722 x[0]=t0; x[1]=kappa0; x[2]=omega0;
\r
5723 /* printf("\nlnL=%9.6f\n",-lfun2dSdN(x,3)); */
\r
5724 ming2((noisy?F0:NULL),&lnL,lfun2dSdN,NULL,x,xb, space,1e-7,3);
\r
5725 eigenQcodon(2,x[0],&S,&x[3],&x[4], NULL,NULL,NULL, &mr, &x[1],x[2],space);
\r
5727 vx[j] += (x[j]-mx[j])*(x[j]-mx[j]);
\r
5728 mx[j] = (mx[j]*ir+x[j])/(ir+1.);
\r
5730 mse[0]+=square(x[2]-omega0);
\r
5731 printf("\r%4d%8.4f%8.4f%8.4f %8.4f%8.4f%8.4f%8.4f%8.4f",
\r
5732 ir+1,x[0],x[1],x[2],mx[0],mx[1],mx[2],mx[3],mx[4]);
\r
5735 fseq=gfopen(seqfile,"w");
\r
5736 fprintf(fseq,"%6d %6d\n", com.ns,com.ls*3);
\r
5737 for(i=0;i<2;i++,FPN(fseq),fflush(fseq)) {
\r
5738 fprintf(fseq,"seq.%-5d ", i+1);
\r
5739 FOR(h,com.npatt) FOR(k,(int)com.fpatt[h])
\r
5740 fprintf(fseq,"%s", getcodon(str,FROM61[com.z[i][h]]));
\r
5742 fclose(fseq); exit(0);
\r
5746 if(nr>1) { FOR(j,5) vx[j]=sqrt(vx[j]/(nr-1.)/nr); mse[0]=sqrt(mse[0]/nr); }
\r
5747 fprintf(frst,"%4d ", com.ls);
\r
5748 FOR(i,5) fprintf(frst,"%7.4f +%7.4f", mx[i],vx[i]); FPN(frst);
\r
5757 /* This simulates two codon sequences and analyze them using Ina's method.
\r
5758 Ina's program is modified to output result in Ina1.tmp. Consistency
\r
5759 analysis is done by generating long sequences.
\r
5760 Note that com.pi[] is not changed in the analysis, which is done outside
\r
5763 char seqfile[32]="codonseq.tmp",tmpfile[32]="Ina1.tmp",str[4]="";
\r
5764 FILE *fseq, *ftmp;
\r
5765 int ip,ir,nr=500, i,j,k,h, n=Nsensecodon;
\r
5766 int npatt0=n*(n+1)/2, nobs[NCODE*NCODE];
\r
5767 int il,nil=1, ls[]={500,100,200,300,400,500,600,800,1000}, fcodon=1;
\r
5768 double y, t=.5,f3x4[12], x[3]={1,1,1}, S,dS,dN, mr=0;
\r
5769 double t0=1,kappa0=1,omega0=1, mx[6],vx[6],mse[6]; /* t,k,w,dS,dN */
\r
5770 double Efij[NCODE*NCODE], space[50000];
\r
5771 double f3x4_data[][3*4]={
\r
5772 {0.25, 0.25, 0.25, 0.25,
\r
5773 0.25, 0.25, 0.25, 0.25,
\r
5774 0.25, 0.25, 0.25, 0.25},
\r
5776 {0.20517, 0.28293, 0.30784, 0.20406, /* mt pri */
\r
5777 0.40979, 0.27911, 0.18995, 0.12116,
\r
5778 0.15105, 0.43290, 0.37123, 0.04482},
\r
5780 {0.19020, 0.16201, 0.36655, 0.28124, /* hiv */
\r
5781 0.28889, 0.18805, 0.30179, 0.22127,
\r
5782 0.24875, 0.16894, 0.36822, 0.21410},
\r
5784 {0.14568, 0.24519, 0.33827, 0.27086,
\r
5785 0.35556, 0.18765, 0.24049, 0.21630,
\r
5786 0.26444, 0.25728, 0.21012, 0.26815} /* lysinNew*/
\r
5789 puts("\nThis simulates data and analyses them using Ina95.");
\r
5791 printf ("fcodon? ");
\r
5792 scanf ("%d", &fcodon);
\r
5794 FOR(j,12) f3x4[j]=f3x4_data[fcodon][j];
\r
5795 for(j=0,h=0,y=0; j<64; j++) {
\r
5796 if (GeneticCode[com.icode][j]==-1) continue;
\r
5797 com.pi[h]=f3x4[j/16]*f3x4[4+(j%16)/4]*f3x4[8+j%4];
\r
5800 FOR(j,n) com.pi[j]/=y;
\r
5801 printf("fcodon: %d\n",fcodon);
\r
5802 matout(frst,f3x4,3,4);
\r
5803 com.icode=0; com.seqtype=1; com.ns=2; com.ls=1; npatt0=n*(n+1)/2;
\r
5804 com.ncode=n; setmark_61_64 ();
\r
5805 FOR(j,com.ns) com.z[j]=(char*) malloc(npatt0*sizeof(char));
\r
5806 if(com.z[com.ns-1]==NULL) error2 ("oom z");
\r
5807 if((com.fpatt=(double*)malloc(npatt0*sizeof(double)))==NULL)
\r
5808 error2("oom fpatt");
\r
5810 printf("\nInfinite sequences.\nsum pi=1=%.6f\n",sum(com.pi,NCODE));
\r
5811 noisy=0; FOR(i,6) x[i]=0;
\r
5814 printf("\nt0 & kappa0 & omega0? ");
\r
5815 scanf("%lf%lf%lf", &t0,&kappa0,&omega0);
\r
5817 printf("t0 =%.2f & kappa0 =%.2f & omega0 =%.2f\n",t0,kappa0,omega0);
\r
5818 fprintf(frst, "\nt & k & w: %8.2f%8.2f%8.2f\n\n", t0,kappa0,omega0);
\r
5819 eigenQcodon(2,t0,&S,&dS,&dN, NULL,NULL,NULL, &mr, &kappa0,omega0,space);
\r
5820 fprintf(frst,"\nS/(S+N)=%7.4f dS=%7.4f dN=%7.4f\n",S/3,dS,dN);
\r
5821 fputs("Lc & t & k & w & dS & dN\n",frst);
\r
5823 eigenQcodon(1,-1,NULL,NULL,NULL,Root, U, V, &mr, &kappa0, omega0, PMat);
\r
5824 PMatUVRoot (PMat, t0, n, U, V, Root);
\r
5825 for(i=0,h=0;i<n;i++) for(j=0;j<=i;j++) {
\r
5826 y=com.pi[i]*PMat[i*n+j];
\r
5827 Efij[h++]=(i==j?y:2*y);
\r
5829 for(i=h=0,com.ls=1,com.npatt=npatt0;i<n;i++) for(j=0;j<=i;j++) {
\r
5830 com.z[0][h]=(char)i; com.z[1][h]=(char)j;
\r
5831 com.fpatt[h]=Efij[h]; h++;
\r
5833 for(h=1;h<npatt0; h++) Efij[h]+=Efij[h-1];
\r
5834 if(fabs(1-Efij[npatt0-1])>1e-6) puts("Sum p_ij != 1.");
\r
5835 for(il=0; il<nil; il++) {
\r
5838 printf("\nls = %d\n", com.ls);
\r
5839 for(ir=0,zero(mx,6),zero(vx,6),zero(mse,6); ir<nr; ir++) {
\r
5840 printf("\r%4d", ir+1);
\r
5841 MultiNomial (com.ls, npatt0, Efij, nobs, NULL);
\r
5842 for(i=0,com.npatt=0;i<n;i++) for(j=0;j<=i;j++)
\r
5843 if(nobs[k=i*(i+1)/2+j]) {
\r
5844 com.z[0][com.npatt]=i; com.z[1][com.npatt]=j;
\r
5845 com.fpatt[com.npatt++]=nobs[k];
\r
5847 fseq=gfopen(seqfile,"w");
\r
5848 fprintf(fseq,"> %6d %6d\n", com.ns,com.ls*3);
\r
5849 for(i=0;i<2;i++,FPN(fseq),fflush(fseq)) {
\r
5850 fprintf(fseq,"seq.%-5d ", i+1);
\r
5851 FOR(h,com.npatt) FOR(k,(int)com.fpatt[h])
\r
5852 fprintf(fseq,"%s", getcodon(str,FROM61[com.z[i][h]]));
\r
5855 if(com.ls>2000) system("Ina1Large codonseq.tmp >t");
\r
5856 else system("Ina1 codonseq.tmp >t");
\r
5857 ftmp=gfopen(tmpfile,"r");
\r
5858 if(fscanf(ftmp,"%lf%lf%lf",&x[0],&x[1],&x[2]) !=3)
\r
5859 error2("reading tmpf");
\r
5862 vx[j] += (x[j]-mx[j])*(x[j]-mx[j]);
\r
5863 mx[j] = (mx[j]*ir+x[j])/(ir+1.);
\r
5865 mse[0]+=square(x[0]-omega0);
\r
5867 printf("%7.4f%7.4f%7.4f %7.4f%7.4f%7.4f%7.4f%7.4f",
\r
5868 x[0],x[1],x[2],mx[0],mx[1],mx[2],mx[3],mx[4]);
\r
5870 /* fprintf(frst1,"%7.4f%7.4f%7.4f %7.4f%7.4f%7.4f%7.4f%7.4f\n",
\r
5871 x[0],x[1],x[2],mx[0],mx[1],mx[2],mx[3],mx[4]);
\r
5874 if(nr>1) { FOR(j,5) vx[j]=sqrt(vx[j]/(nr-1.)/nr); mse[0]=sqrt(mse[0]/nr); }
\r
5875 fprintf(frst,"%4d ", com.ls);
\r
5876 FOR(i,5) fprintf(frst,"%7.3f +%7.4f", mx[i],vx[i]);
\r
5877 FPN(frst); fflush(frst);
\r
5879 fprintf(frst1,"%6d%6d %7.2f%7.2f%7.2f: %8.3f +%7.3f\n",
\r
5880 com.ls,nr, t0,kappa0,omega0, mx[0],mse[0]);
\r
5891 int mergeSeqs(FILE*fout)
\r
5893 /* This concatenate multiple genes (data sets) for the same set of species
\r
5894 into one file of a long gene. Used to process Anne Yoders' alignment.
\r
5897 char *filenames[12]={"NADH1.fin","NADH2.fin","COI.fin","COII.fin","ATPase8.fin",
\r
5898 "ATPase6.fin","COIII.fin","NADH3.fin","NADH4L.fin","NADH4.fin",
\r
5899 "NADH5.fin", "Cytb.fin"};
\r
5901 int ns0=32, nfile=12, ifile, ls0, lswhole=20000, i,h, lgene0[32];
\r
5902 char *z0[32], *spname0[32]={"Artibeus", "B.musculus", "B.physalus", "Bos",
\r
5903 "Canis", "Cavia", "Ceratother", "Dasypus", "Didelphis", "E.asinus",
\r
5904 "E.caballus","Erinaceus", "Felis", "Glis", "Gorilla", "Halichoeru", "Homo",
\r
5905 "Hylobates", "Macropus", "Mus", "Ornithorhy", "Oryctolagu", "Ovis",
\r
5906 "P.paniscus", "P.troglody", "Papio", "Phoca", "P.abelii",
\r
5907 "P.pygmaeus", "Rattus", "Rhinoceros", "Sus"};
\r
5911 FOR(i,ns0) if((z0[i]=(char*)malloc(lswhole*sizeof(char)))==NULL)
\r
5913 for(ifile=0,ls0=0; ifile<nfile; ifile++) {
\r
5914 printf("Reading data set %2d/%2d (%s)", ifile+1,nfile,filenames[ifile]);
\r
5915 fseq=gfopen (filenames[ifile],"r");
\r
5916 ReadSeq(NULL,fseq,1);
\r
5917 lgene0[ifile]=com.ls; com.ls*=3;
\r
5918 FOR(i,ns0) if(strcmp(spname0[i],com.spname[i])) error2("spname different");
\r
5919 FOR(i,ns0) FOR(h,com.ls) z0[i][ls0+h]=com.z[i][h];
\r
5921 printf(" + %5d = %5d\n", com.ls, ls0);
\r
5923 fprintf(fout,"%6d %6d G\nG %4d ", ns0,ls0,nfile);
\r
5924 FOR(ifile,nfile) fprintf(fout, " %4d", lgene0[ifile]); FPN(fout);
\r
5926 for(i=0;i<ns0;i++,FPN(fout)) {
\r
5927 fprintf(fout,"%-12s ", spname0[i]);
\r
5929 fprintf(fout,"%c", z0[i][h]);
\r
5930 if((h+1)%3==0) fprintf(fout," ");
\r
5939 int SlidingWindow(FILE*fout, FILE* fpair[], double space[])
\r
5941 /* sliding window analysis, clean data, 2 sequences only */
\r
5942 int wlen=windowsize0, offset=offset0, wstart, n=com.ncode, j, h, positive=0;
\r
5943 int ls0=com.ls, npatt0=com.npatt;
\r
5945 double *fpatt0, pi0[NCODE], lnL0=0, lnL1=0;
\r
5947 if(com.seqtype!=1) error2("implemented for codon sequences only.");
\r
5948 if(com.runmode!=-2) error2("this version of sliding windows requires runmode=-2");
\r
5949 if(!com.cleandata || com.ngene>1)
\r
5950 error2("clean data & one gene only for sliding window analysis");
\r
5952 error2("Choose RateAncestor=0 for sliding window analysis");
\r
5953 for(j=0; j<com.ns; j++)
\r
5955 for(j=0; j<com.ns; j++)
\r
5956 if((com.z[j]=malloc(npatt0*sizeof(char)))==NULL) error2("oom z");
\r
5957 if((fpatt0=(double*)malloc(npatt0*sizeof(double)))==NULL) error2("oom fp");
\r
5958 for(h=0; h<com.npatt; h++)
\r
5959 fpatt0[h] = com.fpatt[h];
\r
5960 for(j=0; j<n; j++)
\r
5961 pi0[j] = com.pi[j];
\r
5963 for (wstart=0; wstart+wlen<=ls0; wstart+=offset) {
\r
5964 for(h=0; h<npatt0; h++)
\r
5966 for(h=wstart; h<wstart+wlen; h++)
\r
5967 com.fpatt[com.pose[h]]++;
\r
5969 for(h=0,com.npatt=0,zero(com.pi,n); h<npatt0;h++) if(com.fpatt[h]>0) {
\r
5970 for(j=0; j<com.ns; j++)
\r
5971 com.z[j][com.npatt] = z0[j][h];
\r
5972 com.fpatt[com.npatt] = com.fpatt[h];
\r
5976 com.posG[0] = 0; com.posG[1] = com.npatt;
\r
5978 com.fix_omega = 1; com.omega = 1;
\r
5979 PairwiseCodon(fout,fpair[3],fpair[4],fpair[5],com.space);
\r
5980 lnL0 = -lnLmodel; /* lnLmodel passed overhead from PairwiseCodon() */
\r
5982 com.fix_omega = 0; com.omega = 0.5;
\r
5983 PairwiseCodon(fout,fpair[3],fpair[4],fpair[5],com.space);
\r
5986 if(com.omega>1 && (lnL1-lnL0)>2.71/2) {
\r
5992 printf("sites %3d -- %3d (%d) npatt:%4d w=%.4f\n",wstart+1,wstart+wlen,ls0,com.npatt, com.omega);
\r
5993 fprintf(fout,"\nsites %3d -- %3d %4d",wstart+1,wstart+wlen,com.npatt);
\r
5995 /* Forestry(fout); */
\r
5997 fprintf(frst1, " %2d", positive);
\r
5998 printf(" %2d", positive);
\r
6000 com.ls = ls0; com.posG[1] = com.npatt = npatt0;
\r
6001 for(h=0; h<com.npatt; h++)
\r
6002 com.fpatt[h] = fpatt0[h];
\r
6003 xtoy(pi0, com.pi, n);
\r
6005 for(j=0; j<com.ns; j++) {
\r
6007 com.z[j] = z0[j];
\r
6014 void Get4foldSites(void)
\r
6016 /* This collects the four-fold degerate sites into a file named 4fold.nuc.
\r
6017 The data are not coded yet, and the routine is called from ReadSeq().
\r
6019 int ls4, j,k,h, ib[3][4], nb[3];
\r
6020 char file4[12]="4fold.nuc", *mark4;
\r
6023 f4=gfopen(file4,"w");
\r
6024 if ((mark4=(char*)malloc(com.ls*sizeof(char)))==NULL) error2("oom mark");
\r
6025 FOR(h,com.ls) mark4[h]=0;
\r
6027 for (h=0,ls4=0; h<com.ls; h++) {
\r
6028 for(k=0; k<3; k++)
\r
6029 NucListall(com.z[0][h*3+k], &nb[k], ib[k]);
\r
6030 if(nb[0]==1 && nb[2]==1 && FourFold[ib[0][0]][ib[1][0]]) {
\r
6031 for(j=1; j<com.ns; j++)
\r
6032 for(k=0; k<2; k++) if(com.z[j][h*3+k]!=com.z[0][h*3+k]) goto nextsite;
\r
6033 mark4[h]=1; ls4++;
\r
6038 fprintf (f4, "%6d %6d\n", com.ns, ls4);
\r
6039 for (j=0; j<com.ns; j++) {
\r
6040 fprintf (f4, "\n%s\n", com.spname[j]);
\r
6041 for (h=0; h<com.ls; h++)
\r
6042 if(mark4[h]) fprintf (f4, "%c", com.z[j][h*3+2]);
\r
6045 fprintf(f4, "\n\ncodons included\n");
\r
6046 for(h=0; h<com.ls; h++)
\r
6047 if(mark4[h]) fprintf(f4, " %2d", h+1);
\r
6050 fclose(f4); free(mark4);
\r
6054 double distanceHKY85 (double x[], double *kappa, double alpha);
\r
6056 void d4dSdN(FILE* fout)
\r
6058 /* This looks at the 4-fold degerenate sites.
\r
6060 char str1[4]=" ", str2[4]=" ";
\r
6061 int i,j,k, n=com.ncode, b[2][3], ic1,ic2,iaa;
\r
6062 double pS4,d4,kappa4fold;
\r
6063 double fij, fij4f[4*4], pi4f[4], pstop,t, S,dS,dN,dN_dS, mr=0;
\r
6064 double fb3x4[12]={.25, .25, .25, .25,
\r
6065 .25, .25, .25, .25,
\r
6066 .25, .25, .25, .25};
\r
6069 double t0[]={0.001, 0.01,0.05, .1, .2, .3, .4, .5, .6, .7, .8, .9, 1, 1.2, 1.5, 2,2.5,3};
\r
6072 com.ls=1; com.kappa=3; com.omega=1;
\r
6074 fb3x4[0*4+0]=0.35;
\r
6075 fb3x4[0*4+1]=0.15;
\r
6076 fb3x4[0*4+2]=0.35;
\r
6077 fb3x4[0*4+3]=0.15;
\r
6079 fb3x4[1*4+0]=0.35;
\r
6080 fb3x4[1*4+1]=0.15;
\r
6081 fb3x4[1*4+2]=0.35;
\r
6082 fb3x4[1*4+3]=0.15;
\r
6084 fb3x4[2*4+0]=0.35;
\r
6085 fb3x4[2*4+1]=0.15;
\r
6086 fb3x4[2*4+2]=0.35;
\r
6087 fb3x4[2*4+3]=0.15;
\r
6090 printf("\tt\tS\tdS\tdN\tdN/dS\tS4\td4\tk_4f\tpT_4f\n");
\r
6093 FOR(k,64) if(FROM64[k]>-1)
\r
6094 com.pi[FROM64[k]]=fb3x4[k/16]*fb3x4[4+(k/4)%4]*fb3x4[8+k%4];
\r
6095 pstop=1-sum(com.pi,n);
\r
6096 abyx(1/(1-pstop),com.pi,n);
\r
6098 eigenQcodon(1,-1,NULL,NULL,NULL,Root,U,V, &mr, &com.kappa,com.omega,PMat);
\r
6100 matout(frst,com.pi,16,4);
\r
6104 eigenQcodon(2,t,&S,&dS,&dN,NULL,NULL,NULL, &mr, &com.kappa,com.omega,PMat);
\r
6105 PMatUVRoot (PMat, t, n, U, V, Root);
\r
6106 if(testTransP(PMat,n)) error2("testP");
\r
6108 matout(frst,PMat,n,n);
\r
6110 for(i=0,zero(fij4f,16);i<n;i++) {
\r
6111 ic1=FROM61[i]; b[0][0]=ic1/16; b[0][1]=(ic1/4)%4; b[0][2]=ic1%4;
\r
6112 iaa=GeneticCode[com.icode][ic1];
\r
6114 FOR(k,4) if(GeneticCode[com.icode][ic1+k]!=iaa) break;
\r
6117 fij=com.pi[i]*PMat[i*n+j];
\r
6118 ic2=FROM61[j]; b[1][0]=ic2/16; b[1][1]=(ic2/4)%4; b[1][2]=ic2%4;
\r
6120 if(b[0][0]!=b[1][0] || b[0][1]!=b[1][1]) continue;
\r
6121 fij4f[b[0][2]*4+b[1][2]] += fij;
\r
6123 /* printf("%c %s %s %.8f\n",AAs[iaa],getcodon(str1,ic1+b[0][2]),getcodon(str2,ic2),fij);
\r
6129 pS4=sum(fij4f,16)/3;
\r
6130 abyx(1/sum(fij4f,16),fij4f,16);
\r
6131 FOR(k,4) pi4f[k]=sum(fij4f+k*4,4);
\r
6133 /* matout(F0,fij4f,4,4); */
\r
6135 d4 = distanceHKY85 (fij4f, &kappa4fold, 0);
\r
6136 dN_dS = (dS>0 ? dN/dS : -1);
\r
6137 printf("\t%.4f\t%.5f\t%.5f\t%.5f\t%.5f\t%.3f\t%.5f\t%.3f\t%.4f\n",
\r
6138 t,S/3,dS,dN,dN_dS, pS4,d4,kappa4fold,pi4f[0]);
\r
6142 printf("\nproportion of stop codons: %.4f\n", pstop);
\r
6148 double distanceHKY85 (double x[], double *kappa, double alpha)
\r
6150 /* This is from SeqDivergence(), copied here to avoid linking to SeqDivergence.
\r
6153 double p[4], Y,R, a1,a2,b, P1,P2,Q,tc,ag;
\r
6154 double largek=999, larged=9;
\r
6156 if (testXMat(x) && noisy) {
\r
6158 puts("X err.. Perhaps no sites to compare?");
\r
6161 for (i=0,zero(p,4); i<4; i++) {
\r
6162 FOR (j,4) { p[i]+=x[i*4+j]/2; p[j]+=x[i*4+j]/2; }
\r
6164 P1=x[0*4+1]+x[1*4+0];
\r
6165 P2=x[2*4+3]+x[3*4+2];
\r
6166 Q = x[0*4+2]+x[0*4+3]+x[1*4+2]+x[1*4+3]+ x[2*4+0]+x[2*4+1]+x[3*4+0]+x[3*4+1];
\r
6170 if(P1+P2+Q<1e-100) {
\r
6171 *kappa=-1; return(0);
\r
6177 a1=1-Y*P1/(2*tc)-Q/(2*Y);
\r
6178 a2=1-R*P2/(2*ag)-Q/(2*R);
\r
6180 if (a1<=0 || a2<=0 || b<=0) return (larged);
\r
6181 if (alpha<=0) { a1=-log(a1); a2=-log(a2); b=-log(b); }
\r
6182 else { a1=-gammap(a1,alpha); a2=-gammap(a2,alpha); b=-gammap(b,alpha);}
\r
6183 a1 = -R/Y*b + a1/Y;
\r
6184 a2 = -Y/R*b + a2/R;
\r
6185 if (b>0) *kappa = min2((a1+a2)/(2*b), largek);
\r
6186 return 2*(p[0]*p[1] + p[2]*p[3])*(a1+a2)/2 + 2*Y*R*b;
\r
6192 void get_pclassM_iw_M2M8(int *iw, double *pclassM,
\r
6193 int iclassM, int ip[], double para[4][100], int n1d, int M2a, int ternary);
\r
6194 void get_grid_para_like_M2M8(double para[4][100], int n1d, int dim, int M2a, int ternary,
\r
6195 double p0b[], double p1b[], double w0b[], double wsb[],
\r
6196 double p_beta_b[], double q_beta_b[], double x[], double *S);
\r
6197 void GetIndexTernary(int *ix, int *iy, double *x, double *y, int itriangle, int K);
\r
6201 void get_grid_para_like_M2M8 (double para[4][100], int n1d, int dim, int M2a, int ternary,
\r
6202 double p0b[], double p1b[], double w0b[], double wsb[],
\r
6203 double p_beta_b[], double q_beta_b[], double x[], double *S)
\r
6205 /* This sets up the grid (para[][]) according to the priors. It also copies all
\r
6206 possible w values into com.rK[].
\r
6207 The bounds on parameters are used to set up the uniform priors for parameters.
\r
6209 int i,k,h, site=10;
\r
6212 if(com.NSsites==NSbetaw) /* can't control the range of w from the beta */
\r
6213 { w0b[0]=0; w0b[1]=1; }
\r
6215 for(i=0; i<n1d; i++) {
\r
6216 para[0][i] = p0b[0]+(i+0.5)*(p0b[1]-p0b[0])/n1d; /* p0 */
\r
6217 if(com.NSsites==2) { /* M2 & M2a */
\r
6218 para[1][i] = p1b[0]+(i+0.5)*(p1b[1]-p1b[0])/n1d; /* p1 */
\r
6219 if(ternary) para[0][i] = para[1][i] = -1;
\r
6221 para[2][i] = w0b[0]+(i+0.5)*(w0b[1]-w0b[0])/n1d; /* w0 */
\r
6222 para[2+M2a][i] = wsb[0]+(i+0.5)*(wsb[1]-wsb[0])/n1d; /* w2 */
\r
6225 para[1][i] = p_beta_b[0]+(i+0.5)*(p_beta_b[1]-p_beta_b[0])/n1d; /* p */
\r
6226 para[2][i] = q_beta_b[0]+(i+0.5)*(q_beta_b[1]-q_beta_b[0])/n1d; /* q */
\r
6227 para[3][i] = wsb[0]+(i+0.5)*(wsb[1]-wsb[0])/n1d; /* ws */
\r
6232 if(com.NSsites==2 && M2a==0)
\r
6234 else /* w0 in M2a or w0 from beta in M8 */
\r
6235 for(i=0; i<n1d; i++)
\r
6236 com.rK[k++] = w0b[0]+(i+0.5)*(w0b[1]-w0b[0])/n1d;
\r
6237 if(com.NSsites==2)
\r
6238 com.rK[k++]=1; /* w1 for M2 & M2a */
\r
6239 for(i=0; i<n1d; i++)
\r
6240 com.rK[k++] = wsb[0]+(i+0.5)*(wsb[1]-wsb[0])/n1d; /* w2 in M2 or ws */
\r
6242 /* calculates the likelihood com.fhK[] */
\r
6243 printf("\nCalculating f(x_h|w): %d categories %d w sets.\n", n1d, com.ncatG);
\r
6244 com.conPSiteClass=0; *S=0;
\r
6248 for(k=0; k<com.ncatG; k++)
\r
6249 printf("S%d w log{f(x|w)}: %9.4f %12.6f\n",
\r
6250 site,com.rK[k], (com.NnodeScale?com.fhK[k*com.npatt+site]:log(com.fhK[k*com.npatt+site])));
\r
6252 if(com.NnodeScale)
\r
6253 for(h=0; h<com.npatt; h++) {
\r
6254 for(k=1,fh=com.fhK[h]; k<com.ncatG; k++)
\r
6255 fh = max2(fh,com.fhK[k*com.npatt+h]);
\r
6256 for(k=0; k<com.ncatG; k++)
\r
6257 com.fhK[k*com.npatt+h] = exp(com.fhK[k*com.npatt+h]-fh);
\r
6258 *S += fh*com.fpatt[h];
\r
6261 for(h=0; h<com.npatt; h++) {
\r
6262 for(k=1,fh=com.fhK[h]; k<com.ncatG; k++)
\r
6263 fh = max2(fh,com.fhK[k*com.npatt+h]);
\r
6264 for(k=0; k<com.ncatG; k++)
\r
6265 com.fhK[k*com.npatt+h] /= fh;
\r
6266 *S += log(fh)*com.fpatt[h];
\r
6272 void get_pclassM_iw_M2M8(int *iw, double *pclassM,
\r
6273 int iclassM, int ip[], double para[][100], int n1d, int M2a, int ternary)
\r
6275 /* Given the point on the grid (ip[]), this returns iw and pclassM, where iw
\r
6276 locates the w ratio in com.rK[] and f(x_h|w) stored in com.fhK[],
\r
6277 and pclassM is the proportion of the site class under the model.
\r
6278 Look at get_grid_para_like() for more info about the setup of com.rK[], which
\r
6279 accounts for the setup of iw here in this function.
\r
6281 M8 used to use 10 categories to approximate the beta, each of probability
\r
6282 10%. Here we use n1d categories, equally spaced, and the
\r
6283 probabilities for categories are calculated using CDFBeta.
\r
6285 Parameters for grid integration:
\r
6287 Parameters Parameter dependence
\r
6288 Model 0 1 2 3 iw pclassM
\r
6289 -------------------------------------------------------------------
\r
6290 M2 p0 p1 w2 iclassM w0 w2 iclassM p0 p1
\r
6291 M2a p0 p1 w0 w2 iclassM w2 iclassM p0 p1
\r
6292 M8 p0 p q ws iclassM p q ws iclassM p0 p q
\r
6293 -------------------------------------------------------------------
\r
6295 If M2 or M2a and ternary, the n1d*n1d grid for p0-p1 is mapped onto the
\r
6296 triangle specified by p0-p1-p2. First the index i and j are retrieved
\r
6297 from the label for the point (ip[0]*n1d+ip[1]). Then the coordinates
\r
6298 p0 and p1 at the point is worked out. With this scheme, p0 and p1 each
\r
6299 takes on 2*n1d-1 possible values.
\r
6302 double p0,p1, p,q, cdf0=0,cdf1=1;
\r
6304 if(com.NSsites==NSpselection) { /* M2 & M2a */
\r
6306 GetIndexTernary(&i, &j, &p0, &p1, ip[0]*n1d+ip[1], n1d);
\r
6307 *pclassM = (iclassM==0 ? p0 : (iclassM==1 ? p1 : 1-p0-p1));
\r
6310 if(iclassM<2) *pclassM = para[iclassM][ip[iclassM]]; /* p0 or p1 */
\r
6311 else *pclassM = 1-para[0][ip[0]]-para[1][ip[1]]; /* p2 */
\r
6312 *pclassM = max2(*pclassM,0);
\r
6315 if(M2a==0) { /*M2 */
\r
6316 if(iclassM<2) *iw = iclassM; /* w0 or w1 */
\r
6317 else *iw = 2+ip[2]; /* w2 */
\r
6320 if(iclassM==0) *iw = ip[2]; /* w0 */
\r
6321 else if(iclassM==1) *iw = n1d; /* w1 */
\r
6322 else *iw = n1d+1+ip[3]; /* w2 */
\r
6326 p0 = para[0][ip[0]];
\r
6327 if(iclassM<n1d) { /* w from beta */
\r
6328 p = para[1][ip[1]];
\r
6329 q = para[2][ip[2]];
\r
6330 if(iclassM>0) cdf0 = CDFBeta(iclassM/(double)n1d, p, q, 0);
\r
6331 if(iclassM<n1d-1) cdf1 = CDFBeta((iclassM+1.0)/n1d, p, q, 0);
\r
6332 *pclassM = p0*(cdf1-cdf0);
\r
6344 int lfunNSsites_M2M8 (FILE* frst, double x[], int np)
\r
6346 /* Bayes empirical Bayes (BEB) correction for the posterior of w for each site
\r
6347 under M2 or M8. The integral is 3-d for M2, and 4-d for M2a or M8,
\r
6348 approximated using n1d=10 categories in each dimension. The ngrid=n1d^dim
\r
6349 points make up the grid.
\r
6351 com.ncatG is the number of all possible w's ever needed. They are copied
\r
6352 into com.rK[], to be used to calculate f(x_h|w), stored in com.fhK[], before
\r
6353 entering the grid of 4-d integration. iw[ngrid*nclassM] identifies the
\r
6354 position of w in com.rK[], and pclassM[ngrid*nclassM] is the proportion
\r
6355 of sites under the model. Those are set up in get_pclassM_iw().
\r
6357 The priors are set up in get_grid_para_like(). See notes there.
\r
6358 Some control variables:
\r
6359 M2a=1 for M2a=0 for M2.
\r
6360 ternary=1: use ternary triangles to specify prior for p0-p1 under M2 or M2a
\r
6361 =0: break p0 and p1 into 10 bins and skip the unfeasible points
\r
6363 Parameters and their priors are as follows:
\r
6364 M2 (p0 p1 w2) : p0,p1~U(0,1), w2~U(1,11)
\r
6365 M2a(p0 p1 w0 w2): p0,p1~U(0,1), w0~U(0,1), w2~U(1,11)
\r
6366 M8 (p0 p q ws): p0~U(0,1), p,q~U(0,2), ws~U(1,11)
\r
6368 Ziheng, Copenhagen, 17 May 2004.
\r
6370 int n1d=10, M2a=1, ternary=1, trianglePriorM8=0;
\r
6371 double p0b[]={0,1}, p1b[]={0,1}, w0b[]={0,1}; /* w0b for M2a only. */
\r
6372 double wsb[]={1,11}; /* for w2 in M2 & M2a, or for ws in M8 */
\r
6373 double p_beta_b[]={0,2}, q_beta_b[]={0,2};
\r
6375 int dim=(com.NSsites==8||M2a?4:3), ngrid,igrid, ip[4]={0}, j,k,h, it;
\r
6376 int refsp=0, ncatG0=com.ncatG;
\r
6377 /* # of site classes under model and index for site class */
\r
6378 int nclassM = (com.NSsites==NSpselection?3:n1d+1), iclassM, *iw;
\r
6379 double para[4][100]={{0}}, postpara[4][100]; /* paras on grid for 4-d integral: n1d<=100! */
\r
6380 /* lnfXs is log of term in equation 5 in YWN05, which sums over those terms. */
\r
6381 double fh, fX, *lnfXs,S1,S2, *lnprior, *pclassM, *meanw, *varw, *postSite, *postp0p1=NULL;
\r
6382 double fh1site, t,v;
\r
6383 char timestr[32], *paras[4];
\r
6385 printf("\nBEBing (dim = %d). This may take several minutes.", dim);
\r
6387 if(com.NSsites==8) { paras[0]="p0"; paras[1]="p"; paras[2]="q"; paras[3]="ws"; }
\r
6388 else if(!M2a) { paras[0]="p0"; paras[1]="p1"; paras[2]="w2"; }
\r
6389 else { paras[0]="p0"; paras[1]="p1"; paras[2]="w0"; paras[3]="w2"; }
\r
6391 ngrid=n1d*n1d*n1d*(dim==4?n1d:1);
\r
6392 if(com.NSsites==8) com.ncatG = n1d+n1d; /* w from beta & ws */
\r
6393 else com.ncatG = (M2a ? n1d+1+n1d : 2+n1d); /* w0, w1=1, w2 */
\r
6394 if((meanw=(double*)malloc(com.npatt*(2+nclassM)*sizeof(double)))==NULL)
\r
6395 error2("oom meanw");
\r
6396 varw=meanw+com.npatt; postSite=varw+com.npatt;
\r
6397 ternary=(com.NSsites==2 && ternary);
\r
6399 if(ternary && (postp0p1=(double*)malloc(n1d*n1d*sizeof(double)))==NULL)
\r
6400 error2("oom postp0p1");
\r
6401 if((lnfXs=(double*)malloc(ngrid*sizeof(double)))==NULL)
\r
6402 error2("oom lnfXs");
\r
6403 if((pclassM=(double*)malloc(ngrid*nclassM*(sizeof(double)+sizeof(int))))==NULL)
\r
6404 error2("oom pclassM"); /* this wastes space */
\r
6405 iw = (int*)(pclassM+ngrid*nclassM);
\r
6406 if((lnprior=(double*)malloc(n1d*n1d*sizeof(double)))==NULL)
\r
6407 error2("oom lnprior"); /* this wastes space */
\r
6409 k=com.npatt*com.ncatG*sizeof(double);
\r
6410 if((com.fhK=(double*)realloc(com.fhK,k))==NULL) error2("oom fhK");
\r
6412 for(j=0; j<n1d*n1d; j++) lnprior[j]=0;
\r
6413 if(com.NSsites==8 && trianglePriorM8) {
\r
6414 /* for(j=0; j<n1d; j++) lnprior[j]=(2-1./n1d-j*2./n1d)/n1d; */
\r
6415 for(j=0; j<n1d; j++) lnprior[j]=(2*j+1.)/(n1d*n1d);
\r
6416 printf("triangular prior for p0 under M8\n");
\r
6417 for(j=0; j<n1d; j++) printf("%9.4f", (2*j+1.)/(2*n1d)); FPN(F0);
\r
6418 for(j=0; j<n1d; j++) printf("%9.4f", lnprior[j]); FPN(F0);
\r
6422 get_grid_para_like_M2M8(para, n1d, dim, M2a, ternary, p0b, p1b, w0b, wsb, p_beta_b, q_beta_b, x, &S1);
\r
6424 /* Set up im and pclassM, for each igrid and iclassM. */
\r
6425 for(igrid=0; igrid<ngrid; igrid++) {
\r
6426 for(j=dim-1,it=igrid; j>=0; j--) { ip[j]=it%n1d; it/=n1d; }
\r
6427 if(com.NSsites==2 && !ternary && para[0][ip[0]]+para[1][ip[1]]>1) continue;
\r
6428 for(k=0; k<nclassM; k++) {
\r
6429 get_pclassM_iw_M2M8(&iw[igrid*nclassM+k], &pclassM[igrid*nclassM+k],k,ip,para,n1d,M2a,ternary);
\r
6433 /* calculate log{fX}, where fX is the marginal probability of data,
\r
6434 and posterior of parameters postpara[]. S2 is the scale factor. */
\r
6435 printf("Calculating f(X), the marginal probability of data.\n");
\r
6437 FOR(j,dim) FOR(k,n1d) postpara[j][k]=1;
\r
6438 if(ternary) FOR(k,n1d*n1d) postp0p1[k]=1;
\r
6439 for(igrid=0; igrid<ngrid; igrid++) {
\r
6440 for(j=dim-1,it=igrid; j>=0; j--) { ip[j]=it%n1d; it/=n1d; }
\r
6441 if(com.NSsites==2 && !ternary && para[0][ip[0]]+para[1][ip[1]]>1)
\r
6443 for(h=0,lnfXs[igrid]=0; h<com.npatt; h++) {
\r
6444 for(k=0,fh=0; k<nclassM; k++)
\r
6445 fh += pclassM[igrid*nclassM+k]*com.fhK[iw[igrid*nclassM+k]*com.npatt+h];
\r
6448 printf("strange: f[%3d] = %12.6g very small.\n",h,fh);
\r
6452 lnfXs[igrid] += log(fh)*com.fpatt[h];
\r
6454 lnfXs[igrid] += (com.NSsites==8 ? lnprior[ip[0]] : lnprior[ip[0]*n1d+ip[1]]);
\r
6455 t=lnfXs[igrid]-S2;
\r
6456 if(t>0) { /* change scale factor S2 */
\r
6457 t = (t<200 ? exp(-t) : 0);
\r
6459 FOR(j,dim) FOR(k,n1d)
\r
6460 postpara[j][k] *= t;
\r
6462 postpara[j][ip[j]] ++;
\r
6465 FOR(k,n1d*n1d) postp0p1[k] *= t;
\r
6466 postp0p1[ip[0]*n1d+ip[1]] ++;
\r
6469 S2 = lnfXs[igrid];
\r
6474 for(j=0; j<dim; j++)
\r
6475 postpara[j][ip[j]] += t;
\r
6476 if(ternary) postp0p1[ip[0]*n1d+ip[1]] += t;
\r
6479 for(j=0; j<dim; j++)
\r
6480 for(k=0; k<n1d; k++)
\r
6481 postpara[j][k]/=fX;
\r
6483 for(k=0; k<n1d*n1d; k++)
\r
6487 printf("\tlog(fX) = %12.6f S = %12.6f %12.6f\n", fX+S1-dim*log(n1d*1.),S1,S2);
\r
6489 /* calculate posterior probabilities and mean w for each site pattern.
\r
6490 S1 and S2 are scale factors for probabilities and for w. */
\r
6491 printf("Calculating f(w|X), posterior probabilities of site classes.\n");
\r
6492 for(h=0; h<com.npatt; h++) {
\r
6493 S1=-1e300; FOR(j,nclassM) postSite[j*com.npatt+h]=1;
\r
6494 S2=-1e300; meanw[h]=varw[h]=1;
\r
6495 for(iclassM=0; iclassM<nclassM; iclassM++) {
\r
6496 for(igrid=0; igrid<ngrid; igrid++) {
\r
6497 for(j=dim-1,it=igrid; j>=0; j--) { ip[j]=it%n1d; it/=n1d; }
\r
6498 if(com.NSsites==2 && !ternary && para[0][ip[0]]+para[1][ip[1]]>1)
\r
6501 for(k=0,fh=0; k<nclassM; k++) /* duplicated calculation */
\r
6502 fh += pclassM[igrid*nclassM+k]*com.fhK[iw[igrid*nclassM+k]*com.npatt+h];
\r
6504 it = igrid*nclassM+iclassM;
\r
6505 fh1site = pclassM[it]*com.fhK[iw[it]*com.npatt+h];
\r
6507 if(fh1site<1e-300) continue;
\r
6510 t = log(fh1site)+lnfXs[igrid]; /* t is log of term on grid */
\r
6511 if(t>S1) { /* change scale factor S1 */
\r
6512 for(j=0; j<nclassM; j++)
\r
6513 postSite[j*com.npatt+h] = postSite[j*com.npatt+h]*exp(S1-t);
\r
6516 postSite[iclassM*com.npatt+h] += exp(t-S1);
\r
6518 t = fh1site*com.rK[iw[it]];
\r
6519 v = fh1site*square(com.rK[iw[it]]);
\r
6520 if(t<1e-300) continue;
\r
6521 t = log(t)+lnfXs[igrid]; /* t is log of mean */
\r
6522 v = log(v)+lnfXs[igrid];
\r
6523 if(t>S2) { /* change scale factor S2 */
\r
6524 meanw[h] = meanw[h]*exp(S2-t);
\r
6525 varw[h] = varw[h]*exp(S2-t);
\r
6528 meanw[h] += exp(t-S2);
\r
6529 varw[h] += exp(v-S2);
\r
6533 for(j=0; j<nclassM; j++)
\r
6534 postSite[j*com.npatt+h] *= exp(S1-fX);
\r
6535 meanw[h] *= exp(S2-fX);
\r
6536 varw[h] *= exp(S2-fX);
\r
6537 varw[h] -= meanw[h]*meanw[h];
\r
6538 varw[h] = (varw[h]>0?sqrt(varw[h]):0);
\r
6540 if((h+1)%10==0 || h==com.npatt-1)
\r
6541 printf("\r\tdid %3d / %3d patterns %s", h+1,com.npatt,printtime(timestr));
\r
6544 /* print out posterior probabilities */
\r
6545 fprintf(frst,"\nBayes Empirical Bayes (BEB) probabilities for %d classes (class)", nclassM);
\r
6546 fprintf(fout,"\nBayes Empirical Bayes (BEB) analysis");
\r
6547 fprintf(fout," (Yang, Wong & Nielsen 2005. Mol. Biol. Evol. 22:1107-1118)");
\r
6549 com.ncatG = ncatG0;
\r
6550 PrintProbNSsites(frst, postSite, meanw, varw, nclassM, refsp);
\r
6552 fprintf(fout, "\n\nThe grid %s\n\n", (ternary?"(see ternary graph for p0-p1)":""));
\r
6553 for(j=(ternary?2:0); j<dim; j++,FPN(fout)) {
\r
6554 fprintf(fout, "%-2s: ", paras[j]);
\r
6555 for(k=0; k<n1d; k++)
\r
6556 fprintf(fout, " %6.3f", para[j][k]);
\r
6558 if(ternary) for(k=0; k<n1d; k++) postpara[0][k]=postpara[1][k]=-1;
\r
6559 fprintf(fout, "\n\nPosterior on the grid\n\n");
\r
6560 for(j=(ternary?2:0); j<dim; j++,FPN(fout)) {
\r
6561 fprintf(fout, "%-2s: ", paras[j]);
\r
6562 for(k=0;k<n1d;k++)
\r
6563 fprintf(fout, " %6.3f", postpara[j][k]);
\r
6566 fprintf(fout,"\nPosterior for p0-p1 (see the ternary graph)\n\n");
\r
6567 for(k=0;k<n1d*n1d;k++) {
\r
6568 fprintf(fout," %5.3f", postp0p1[k]);
\r
6569 if(fabs(square((int)sqrt(k+1.))-(k+1))<1e-5) FPN(fout);
\r
6571 fprintf(fout,"\nsum of density on p0-p1 = %10.6f\n", sum(postp0p1,n1d*n1d));
\r
6575 free(meanw); free(lnfXs); free(pclassM); free(lnprior);
\r
6576 if(ternary) free(postp0p1);
\r
6582 /********************************************************************/
\r
6583 void get_grid_para_like_AC(double para[][100], int n1d, int dim,
\r
6584 double w0b[], double w2b[], double x[], double *S);
\r
6586 void get_pclassM_iw_AC(int *iw, double *pclassM, int iclassM, int ip[], double para[][100], int n1d);
\r
6589 void get_grid_para_like_AC(double para[][100], int n1d, int dim,
\r
6590 double w0b[], double w2b[], double x[], double *S)
\r
6592 /* This sets up the grid (para[][]) according to the priors.
\r
6593 It calculates the probability of data at each site given w: f(f_h|w).
\r
6594 This is calculated using the branch model (NSsites = 0 model = 2), with
\r
6595 BayesEB=2 used to force the use of the correct scale factors in GetPMatBranch().
\r
6597 Order of site classes for iw or f(x_h|w):
\r
6599 Branchsite A (121 sets)
\r
6600 site class 0: w0 w0 10
\r
6601 site class 1: w1=1 w1=1 1
\r
6602 site class 2a: w0 w2 100
\r
6603 site class 2b: w1=1 w2 10
\r
6605 Clade C (111 sets)
\r
6606 site class 0: w0 w0 10
\r
6607 site class 1: w1=1 w1=1 1
\r
6608 site class 2: w2 w3 10*10*10...
\r
6610 int modelA=(com.model==2), i,k,h, iw, site=10;
\r
6611 double fh, wbranches[NBTYPE]; /* w for back and fore branches */
\r
6612 int NSsites0=com.NSsites, model0=com.model;
\r
6614 for(i=0; i<n1d; i++) {
\r
6615 para[0][i] = para[1][i] = -1; /* p0 & p1 */
\r
6616 para[2][i] = w0b[0] + (i+0.5)*(w0b[1]-w0b[0])/n1d; /* w0 */
\r
6617 para[3][i] = w2b[0] + (i+0.5)*(w2b[1]-w2b[0])/n1d; /* w2 */
\r
6618 if(com.model==3) /* w3 w4 ... in model C */
\r
6619 for(k=1; k<com.nbtype; k++)
\r
6620 para[3+k][i] = para[3][i];
\r
6623 /* calculates the likelihood com.fhK[] */
\r
6624 printf("\nCalculating f(x_h|w) for %d w's\n", com.ncatG);
\r
6625 com.conPSiteClass = 0;
\r
6629 com.pomega = wbranches;
\r
6630 for(iw=0; iw<com.ncatG; iw++) {
\r
6631 if(modelA) { /* model A: 10 + 1 + 100 + 10 */
\r
6632 if(iw<n1d) wbranches[0] = wbranches[1] = para[2][iw]; /* class 0: w0 */
\r
6633 else if(iw==n1d) wbranches[0] = wbranches[1] = 1; /* class 1: w1 */
\r
6634 else if(iw<n1d+1+n1d*n1d) { /* class 2a: w0 w2 */
\r
6635 wbranches[0] = para[2][(iw-n1d-1)/n1d];
\r
6636 wbranches[1] = para[3][(iw-n1d-1)%n1d];
\r
6638 else { /* class 2b: w1 w2 */
\r
6640 wbranches[1] = para[3][iw-n1d-1-n1d*n1d];
\r
6643 else { /* model C: 10 + 1 + 10*10*... */
\r
6644 if(iw<n1d) /* class 0: w0 */
\r
6645 for(i=0; i<com.nbtype; i++) wbranches[i] = para[2][iw];
\r
6646 else if(iw==n1d) /* class 1: w1 */
\r
6647 for(i=0; i<com.nbtype; i++) wbranches[i] = 1;
\r
6648 else { /* class 2: w2 w3 */
\r
6649 for(i=com.nbtype-1,k=iw-n1d-1; i>=0; i--) {
\r
6650 wbranches[i] = para[3+i][k%n1d];
\r
6655 printf("\nw%-2d: ", iw+1);
\r
6656 for(i=0; i<com.nbtype; i++) printf(" %10.6f", wbranches[i]);
\r
6659 ConditionalPNode(tree.root, 0, x);
\r
6660 for(h=0; h<com.npatt; h++) {
\r
6661 for(i=0,fh=0; i<com.ncode; i++)
\r
6662 fh += com.pi[i]*nodes[tree.root].conP[h*com.ncode+i];
\r
6664 if(fh<-1e-5) printf("\nfh = %.6f negative\n",fh);
\r
6668 for(k=0; k<com.NnodeScale; k++)
\r
6669 fh += com.nodeScaleF[k*com.npatt+h];
\r
6670 com.fhK[iw*com.npatt+h] = fh;
\r
6672 if((iw+1)%10==0 || iw==com.ncatG-1)
\r
6673 printf("\r\t %4d / %d sets.", iw+1, com.ncatG);
\r
6677 for(h=0,*S=0; h<com.npatt; h++) {
\r
6678 for(k=1,fh=com.fhK[h]; k<com.ncatG; k++)
\r
6679 fh = max2(fh,com.fhK[k*com.npatt+h]);
\r
6680 for(k=0; k<com.ncatG; k++)
\r
6681 com.fhK[k*com.npatt+h] = exp(com.fhK[k*com.npatt+h]-fh);
\r
6682 *S += fh*com.fpatt[h];
\r
6684 com.NSsites=NSsites0; com.model=model0;
\r
6687 void get_pclassM_iw_AC(int *iw, double *pclassM, int iclassM, int ip[], double para[][100], int n1d)
\r
6689 /* Given the point on the grid ip[] and iclassM, this returns iw and pclassM,
\r
6690 where iw locates the correct f(x_h|w) stored in com.fhK[], and pclassM is
\r
6691 the proportion of the site class under the model.
\r
6692 The n1d*n1d grid for p0-p1 is mapped onto the ternary graph for p0-p1-p2.
\r
6694 See get_grid_para_like_AC() for order of iw or site classes.
\r
6696 Parameters are model A: (p0 p1 w0 w2)
\r
6697 model C: (p0 p1 w0 w2 w3 ...)
\r
6699 int modelA=(com.model==2), i,j;
\r
6702 GetIndexTernary(&i, &j, &p[0], &p[1], ip[0]*n1d+ip[1], n1d);
\r
6703 p[2] = 1-p[0]-p[1];
\r
6704 *pclassM = p[iclassM<=2 ? iclassM : 2];
\r
6705 if(modelA && iclassM>=2) *pclassM = p[2]*p[iclassM-2]/(1-p[2]);
\r
6707 if(iclassM==0) *iw = ip[2]; /* class 0: w0 */
\r
6708 else if(iclassM==1) *iw = n1d; /* class 1: w1 */
\r
6709 else if(modelA==0) { /* clade model C site class 2: w2 w3 w4 ... */
\r
6710 for(i=0,*iw=0; i<com.nbtype; i++)
\r
6711 *iw = *iw*n1d + ip[3+i];
\r
6714 else if(iclassM==2) *iw = n1d+1+ip[2]*n1d+ip[3]; /* class 2a model A: w0 & w2 */
\r
6715 else *iw = n1d+1+n1d*n1d+ip[3]; /* class 2b model A: w1 & w2 */
\r
6719 int lfunNSsites_AC (FILE* frst, double x[], int np)
\r
6721 /* Bayes empirical Bayes (BEB) calculation of posterior probabilities for site
\r
6722 classes under the branch-site model A (Yang & Nielsen 2002) and clade model C
\r
6723 (Bielawski & Yang 2004). The dimension of integral is 4 for A and (3+nbtype)
\r
6724 for C. Each dimension is approximated using n1d=10 categories, and the grid
\r
6725 is made up of ngrid=n1d^dim points.
\r
6727 For branch-site model A, the probability of data at a site f(x_h|w) needs to
\r
6728 be calculated for 121=(d+1+d*d+d) sets of w's. For model C, it needs to be
\r
6729 calculated for 111 (d+1+d^nbtype) sets.
\r
6730 Those are calculated and stored in com.fhK[], before entering the grid.
\r
6731 iw[ngrid*nclassM] identifies the right f(x_h|w), and pclassM[ngrid*nclassM]
\r
6732 is the proportion of sites under the model, f(w|ita). Those are set up in
\r
6733 get_pclassM_iw_AC().
\r
6735 The priors are set up in get_grid_para_like_AC(). See notes there.
\r
6737 Parameters and priors are as follows:
\r
6738 model A (p0 p1 w0 w2): p0,p1~U(0,1), w0~U(0,1), w2~U(1,11)
\r
6739 model C (p0 p1 w0 w2 w3): p0,p1~U(0,1), w0~U(0,1), w2,w3~U(0,5)
\r
6741 Ziheng, UCL, 22 September 2004, modified Nov 2008 to use more than 2 branch types
\r
6742 under clade model C.
\r
6744 int n1d=10, debug=0, site=10;
\r
6745 double w0b[]={0,1}; /* w0b for w0. */
\r
6746 double wsb[]={1,11}; /* for w2 in model A */
\r
6747 double w2b[]={0,3}; /* for w2-w3-w4 in model C */
\r
6749 int modelA=(com.model==2), dim=(modelA?4:3+com.nbtype), ngrid,igrid, ip[3+NBTYPE], j,k,h,hp,it;
\r
6750 int refsp=0, ncatG0=com.ncatG, lst=(com.readpattern?com.npatt:com.ls);
\r
6751 /* # of site classes under model and index for site class */
\r
6752 int nclassM = (modelA?4:3), iclassM, *iw, i;
\r
6753 double para[3+NBTYPE][100]={{0}}, postpara[3+NBTYPE][100]; /* paras on grid : n1d<=100! */
\r
6754 double fh, fX, *lnfXs,S1,S2, *pclassM, *postSite, *postp0p1;
\r
6755 double fhk[4], t, cutoff=0.5;
\r
6756 char timestr[32], paras[3+NBTYPE][5]={"p0","p1","w0","w2","w3"}, *sig, aa;
\r
6758 printf("\nBEBing (dim = %d). This may take many minutes.", dim);
\r
6761 for(i=2; i<com.nbtype; i++) sprintf(paras[3+i], "w%d", i+2);
\r
6763 for(i=0,ngrid=1; i<dim; i++) ngrid *= n1d;
\r
6765 com.ncatG = n1d + 1 + n1d*n1d + n1d; /* branch-site model A: table 1 YWN05 */
\r
6766 else { /* clade model C: table 2 YWN05 */
\r
6767 for(i=0,com.ncatG=1; i<com.nbtype; i++) com.ncatG *= n1d; /* w2 w3 w4 ... */
\r
6768 com.ncatG += n1d + 1; /* w0 & w1=1 */
\r
6771 k = (n1d*n1d + com.npatt*nclassM + ngrid + ngrid*nclassM)*sizeof(double)
\r
6772 + ngrid*nclassM*sizeof(int);
\r
6773 if(noisy) printf("\nTrying to get %dM memory in lfunNSsites_A\n", k);
\r
6774 if((postp0p1=(double*)malloc(k)) == NULL)
\r
6775 error2("oom in lfunNSsites_AC");
\r
6776 postSite = postp0p1 + n1d*n1d;
\r
6777 lnfXs = postSite + com.npatt*nclassM;
\r
6778 pclassM = lnfXs + ngrid;
\r
6779 iw = (int*)(pclassM + ngrid*nclassM);
\r
6781 k = com.npatt*com.ncatG*sizeof(double);
\r
6782 if((com.fhK=(double*)realloc(com.fhK,k)) == NULL) error2("oom fhK");
\r
6785 get_grid_para_like_AC(para, n1d, dim, w0b, (modelA?wsb:w2b), x, &S1);
\r
6787 /* Set up im and pclassM, for each igrid and iclassM. */
\r
6788 for(igrid=0; igrid<ngrid; igrid++) {
\r
6789 for(j=dim-1,it=igrid; j>=0; j--) { ip[j]=it%n1d; it/=n1d; }
\r
6790 for(k=0; k<nclassM; k++) {
\r
6791 get_pclassM_iw_AC(&iw[igrid*nclassM+k], &pclassM[igrid*nclassM+k],k,ip,para,n1d);
\r
6795 /* calculate marginal prob of data, fX, and postpara[]. S2 is scale. */
\r
6796 printf("Calculating f(X), the marginal probability of data.\n");
\r
6798 for(j=0; j<dim; j++) /* postpara[0-1] for p0p1 ignored */
\r
6799 for(k=0; k<n1d; k++)
\r
6800 postpara[j][k] = 1;
\r
6801 for(k=0; k<n1d*n1d; k++)
\r
6803 for(igrid=0; igrid<ngrid; igrid++) {
\r
6804 for(j=dim-1,it=igrid; j>=0; j--) {
\r
6808 for(h=0,lnfXs[igrid]=0; h<com.npatt; h++) {
\r
6809 for(k=0,fh=0; k<nclassM; k++)
\r
6810 fh += pclassM[igrid*nclassM+k]*com.fhK[iw[igrid*nclassM+k]*com.npatt+h];
\r
6812 printf("strange: f[%3d] = %12.6g very small.\n",h,fh);
\r
6815 lnfXs[igrid] += log(fh)*com.fpatt[h];
\r
6817 t = lnfXs[igrid]-S2;
\r
6818 if(t>0) { /* change scale factor S2 */
\r
6819 t = (t<200 ? exp(-t) : 0);
\r
6821 for(j=0; j<dim; j++) for(k=0; k<n1d; k++)
\r
6822 postpara[j][k] *= t;
\r
6823 for(k=0; k<n1d*n1d; k++)
\r
6826 for(j=0; j<dim; j++)
\r
6827 postpara[j][ip[j]] ++;
\r
6828 postp0p1[ip[0]*n1d+ip[1]] ++;
\r
6829 S2 = lnfXs[igrid];
\r
6834 for(j=0; j<dim; j++)
\r
6835 postpara[j][ip[j]] += t;
\r
6836 postp0p1[ip[0]*n1d+ip[1]] += t;
\r
6838 if((igrid+1)%500==0 || igrid==ngrid-1)
\r
6839 printf("\t%3d / %3d grid points\r", igrid+1,ngrid);
\r
6842 for(j=0; j<dim; j++) for(k=0; k<n1d; k++)
\r
6843 postpara[j][k] /= fX;
\r
6844 for(k=0; k<n1d*n1d; k++)
\r
6848 printf("\tlog(fX) = %12.6f S = %12.6f %12.6f\n", fX+S1-dim*log(n1d*1.),S1,S2);
\r
6850 /* calculate posterior probabilities for sites. S1 is scale factor */
\r
6851 printf("Calculating f(w|X), posterior probs of site classes.\n");
\r
6852 for(h=0; h<com.npatt; h++) {
\r
6854 for(j=0; j<nclassM; j++)
\r
6855 postSite[j*com.npatt+h] = 1;
\r
6856 for(igrid=0; igrid<ngrid; igrid++) {
\r
6857 for(j=dim-1,it=igrid; j>=0; j--) { ip[j]=it%n1d; it/=n1d; }
\r
6858 for(k=0,fh=0; k<nclassM; k++) /* duplicated calculation */
\r
6859 fh += fhk[k] = pclassM[igrid*nclassM+k]*com.fhK[iw[igrid*nclassM+k]*com.npatt+h];
\r
6861 for(iclassM=0; iclassM<nclassM; iclassM++) {
\r
6862 fhk[iclassM] /= fh;
\r
6863 t = log(fhk[iclassM]) + lnfXs[igrid]; /* t is log of term on grid */
\r
6864 if(t>S1 + 50) { /* change scale factor S1 */
\r
6865 for(j=0; j<nclassM; j++)
\r
6866 postSite[j*com.npatt+h] *= exp(S1-t);
\r
6869 postSite[iclassM*com.npatt+h] += exp(t-S1);
\r
6872 for(j=0; j<nclassM; j++)
\r
6873 postSite[j*com.npatt+h] *= exp(S1-fX);
\r
6875 if((h+1)%10==0 || h==com.npatt-1)
\r
6876 printf("\r\tdid %3d / %3d site patterns %s", h+1,com.npatt,printtime(timestr));
\r
6880 for(k=0,printf("\nS%d: ",site); k<nclassM; k++) printf("%7.4f",postSite[k*com.npatt+site]);
\r
6882 /* print out posterior probabilities */
\r
6883 fprintf(frst,"\nBayes Empirical Bayes (BEB) probabilities for %d classes (class)", nclassM);
\r
6884 fprintf(fout,"\nBayes Empirical Bayes (BEB) analysis");
\r
6885 fprintf(fout," (Yang, Wong & Nielsen 2005. Mol. Biol. Evol. 22:1107-1118)");
\r
6887 com.ncatG = ncatG0;
\r
6889 PrintProbNSsites(frst, postSite, NULL, NULL, nclassM, refsp);
\r
6890 if(com.model==2) { /* branch&site model A */
\r
6891 fprintf(fout,"\nPositive sites for foreground lineages Prob(w>1):\n");
\r
6892 for(h=0; h<lst; h++) {
\r
6893 hp = (!com.readpattern ? com.pose[h] : h);
\r
6894 aa = GetAASiteSpecies(refsp, hp);
\r
6895 t = postSite[2*com.npatt+hp] + postSite[3*com.npatt+hp];
\r
6897 sig=""; if(t>.95) sig="*"; if(t>.99) sig="**";
\r
6898 fprintf(fout,"%6d %c %.3f%s\n",h+1, aa, t, sig);
\r
6903 fprintf(fout, "\n\nThe grid (see ternary graph for p0-p1)\n\n");
\r
6904 for(j=2; j<dim; j++,FPN(fout)) {
\r
6905 fprintf(fout, "%-2s: ", paras[j]);
\r
6906 for(k=0; k<n1d; k++)
\r
6907 fprintf(fout, " %6.3f", para[j][k]);
\r
6909 for(k=0; k<n1d; k++)
\r
6910 postpara[0][k] = postpara[1][k]=-1;
\r
6911 fprintf(fout, "\n\nPosterior on the grid\n\n");
\r
6912 for(j=2; j<dim; j++,FPN(fout)) {
\r
6913 fprintf(fout, "%-2s: ", paras[j]);
\r
6914 for(k=0; k<n1d; k++)
\r
6915 fprintf(fout, " %6.3f", postpara[j][k]);
\r
6917 fprintf(fout,"\nPosterior for p0-p1 (see the ternary graph)\n\n");
\r
6918 for(k=0; k<n1d*n1d; k++) {
\r
6919 fprintf(fout," %5.3f", postp0p1[k]);
\r
6920 if(fabs(square((int)sqrt(k+1.))-(k+1))<1e-5) FPN(fout);
\r
6922 fprintf(fout,"\nsum of density on p0-p1 = %10.6f\n", sum(postp0p1,n1d*n1d));
\r