2 Copyright, Ziheng Yang, April 1995.
\r
4 cl -Ot -O2 evolver.c tools.c
\r
5 cl -Ot -O2 -DCodonNSbranches -FeevolverNSbranches.exe evolver.c tools.c
\r
6 cl -Ot -O2 -DCodonNSsites -FeevolverNSsites.exe evolver.c tools.c
\r
7 cl -Ot -O2 -DCodonNSbranchsites -FeevolverNSbranchsites.exe evolver.c tools.c
\r
9 cc -fast -o evolver evolver.c tools.c -lm
\r
10 cc -O4 -DCodonNSbranches -o evolverNSbranches evolver.c tools.c -lm
\r
11 cc -O4 -DCodonNSsites -o evolverNSsites evolver.c tools.c -lm
\r
12 cc -O4 -DCodonNSbranchsites -o evolverNSbranchsites evolver.c tools.c -lm
\r
15 evolver 5 MCbase.dat
\r
16 evolver 6 MCcodon.dat
\r
18 evolver 9 <TreesFile> <MasterTreeFile>
\r
22 #define CodonNSbranches
\r
23 #define CodonNSsites
\r
24 #define CodonNSbranchsites
\r
30 #define NBRANCH (NS*2-2)
\r
37 unsigned char *z[2*NS-1];
\r
38 char spname[NS][LSPNAME+1], daafile[512], cleandata, readpattern;
\r
39 int ns, ls, npatt, np, ntime, ncode, clock, rooted, model, icode;
\r
40 int seqtype, *pose, ncatG, NSsites;
\r
41 double *fpatt, kappa, omega, alpha, pi[64], *conP, daa[20*20];
\r
42 double freqK[NCATG], rK[NCATG];
\r
43 char *siteID; /* used if ncatG>1 */
\r
44 double *siterates; /* rates for gamma or omega for site or branch-site models */
\r
45 double *omegaBS, *QfactorBS; /* omega IDs for branch-site models */
\r
48 int nbranch, nnode, root, branches[NBRANCH][2];
\r
51 int father, nson, sons[MAXNSONS], ibranch;
\r
52 double branch, age, omega, label, *conP;
\r
53 char *nodeStr, fossil;
\r
56 extern char BASEs[];
\r
57 extern int GeneticCode[][64], noisy;
\r
58 int LASTROUND=0; /* not used */
\r
61 #define NODESTRUCTURE
\r
63 #include "treesub.c"
\r
64 #include "treespace.c"
\r
66 void TreeDistances(FILE* fout);
\r
67 void Simulate(char*ctlf);
\r
68 void MakeSeq(char*z, int ls);
\r
69 int EigenQbase(double rates[], double pi[],
\r
70 double Root[],double U[],double V[],double Q[]);
\r
71 int EigenQcodon (int getstats, double kappa,double omega,double pi[],
\r
72 double Root[], double U[], double V[], double Q[]);
\r
73 int EigenQaa(double pi[],double Root[], double U[], double V[],double Q[]);
\r
74 void CladeMrBayesProbabilities (char treefile[]);
\r
75 int between_f_and_x(void);
\r
76 void LabelClades(FILE *fout);
\r
78 char *MCctlf0[]={"MCbase.dat","MCcodon.dat","MCaa.dat"};
\r
79 char *seqf[3]={"mc.paml", "mc.paml", "mc.nex"};
\r
81 enum {JC69, K80, F81, F84, HKY85, T92, TN93, REV} BaseModels;
\r
82 char *basemodels[]={"JC69","K80","F81","F84","HKY85","T92","TN93","REV"};
\r
83 enum {Poisson, EqualInput, Empirical, Empirical_F} AAModels;
\r
84 char *aamodels[]={"Poisson", "EqualInput", "Empirical", "Empirical_F"};
\r
87 double PMat[NCODE*NCODE], U[NCODE*NCODE], V[NCODE*NCODE], Root[NCODE];
\r
88 static double Qfactor=-1, Qrates[5]; /* Qrates[] hold kappa's for nucleotides */
\r
91 int main (int argc, char*argv[])
\r
93 char *MCctlf=NULL, outf[512]="evolver.out", treefile[512]="mcmc.txt", mastertreefile[512]="\0";
\r
94 int i, option=-1, ntree=1,rooted, BD=0, gotoption=0, pick1tree=-1;
\r
95 double bfactor=1, birth=-1,death=-1,sample=-1,mut=-1, *space;
\r
96 FILE *fout=gfopen(outf,"w");
\r
98 printf("EVOLVER in %s\n", pamlVerStr);
\r
99 com.alpha=0; com.cleandata=1; com.model=0; com.NSsites=0;
\r
102 gotoption=1; sscanf(argv[1], "%d", &option);
\r
105 printf("Results for options 1-4 & 8 go into %s\n",outf);
\r
106 else if(option!=5 && option!=6 && option!=7 && option!=9) {
\r
107 puts("Usage: \n\tevolver \n\tevolver option# MyDataFile"); exit(-1);
\r
109 if(option>=4 && option<=6)
\r
111 else if(option==9) {
\r
112 strcpy(treefile, argv[2]);
\r
113 if(argc>3) strcpy(mastertreefile, argv[3]);
\r
114 if(argc>4) sscanf(argv[4], "%d", &pick1tree);
\r
117 #if defined (CodonNSbranches)
\r
118 option=6; com.model=1;
\r
119 MCctlf = (argc==3 ? argv[2] : "MCcodonNSbranches.dat");
\r
121 #elif defined (CodonNSsites)
\r
122 option=6; com.NSsites=3;
\r
123 MCctlf = (argc==3 ? argv[2] : "MCcodonNSsites.dat");
\r
125 #elif defined (CodonNSbranchsites)
\r
126 option=6; com.model=1; com.NSsites=3;
\r
127 MCctlf = (argc==3 ? argv[2] : "MCcodonNSbranchsites.dat");
\r
134 printf("\n\t(1) Get random UNROOTED trees?\n");
\r
135 printf("\t(2) Get random ROOTED trees?\n");
\r
136 printf("\t(3) List all UNROOTED trees?\n");
\r
137 printf("\t(4) List all ROOTED trees?\n");
\r
138 printf("\t(5) Simulate nucleotide data sets (use %s)?\n",MCctlf0[0]);
\r
139 printf("\t(6) Simulate codon data sets (use %s)?\n",MCctlf0[1]);
\r
140 printf("\t(7) Simulate amino acid data sets (use %s)?\n",MCctlf0[2]);
\r
141 printf("\t(8) Calculate identical bi-partitions between trees?\n");
\r
142 printf("\t(9) Calculate clade support values (evolver 9 treefile mastertreefile <pick1tree>)?\n");
\r
143 printf("\t(11) Label clades?\n");
\r
144 printf("\t(0) Quit?\n");
\r
147 scanf("%d", &option);
\r
149 if(option==0) exit(0);
\r
150 if(option>=5 && option<=7) break;
\r
152 printf ("No. of species: ");
\r
153 scanf ("%d", &com.ns);
\r
155 if(com.ns>NS) error2 ("Too many species. Raise NS.");
\r
156 if((space=(double*)malloc(10000*sizeof(double)))==NULL) error2("oom");
\r
157 rooted = !(option%2);
\r
159 printf("\nnumber of trees & random number seed? ");
\r
160 scanf("%d%d", &ntree, &i);
\r
162 printf ("Want branch lengths from the birth-death process (0/1)? ");
\r
166 if(com.ns<3) error2("no need to do this?");
\r
167 i = (com.ns*2-1)*sizeof(struct TREEN);
\r
168 if((nodes=(struct TREEN*)malloc(i)) == NULL)
\r
172 case(1): /* random UNROOTED trees */
\r
173 case(2): /* random ROOTED trees */
\r
174 /* default names */
\r
176 for(i=0; i<com.ns; i++) sprintf(com.spname[i], "%c", (i<26 ? 'A'+i : 'a'+i-26));
\r
178 for(i=0; i<com.ns; i++) sprintf(com.spname[i], "S%d", i+1);
\r
181 printf ("\nbirth rate, death rate, sampling fraction, and ");
\r
182 printf ("mutation rate (tree height)?\n");
\r
183 scanf ("%lf%lf%lf%lf", &birth, &death, &sample, &mut);
\r
185 for(i=0;i<ntree;i++) {
\r
186 RandomLHistory (rooted, space);
\r
188 BranchLengthBD (1, birth, death, sample, mut);
\r
189 if(com.ns<20&&ntree<10) { OutTreeN(F0, 0, BD); puts("\n"); }
\r
190 OutTreeN(fout, 1, BD); FPN(fout);
\r
193 for (i=0; i<com.ns-2-!rooted; i++)
\r
194 Ib[i] = (int)((3.+i)*rndu());
\r
195 MakeTreeIb (com.ns, Ib, rooted);
\r
200 ListTrees(fout, com.ns, rooted);
\r
202 case(8): TreeDistances(fout); break;
\r
204 printf("tree file names? ");
\r
205 scanf("%s%s", treefile, mastertreefile);
\r
207 case(10): between_f_and_x(); break;
\r
208 case(11): LabelClades(fout); break;
\r
214 if(option>=5 && option<=7) {
\r
215 com.seqtype = option-5; /* 0, 1, 2 for bases, codons, & amino acids */
\r
216 Simulate(MCctlf ? MCctlf : MCctlf0[option-5]);
\r
218 else if(option==9) {
\r
219 CladeSupport(fout, treefile, 1, mastertreefile, pick1tree);
\r
220 /* CladeMrBayesProbabilities("/papers/BPPJC3sB/Karol.trees"); */
\r
226 int between_f_and_x (void)
\r
228 /* this helps with the exponential transform for frequency parameters */
\r
233 printf("\ndirection (0:x=>f; 1:f=>x; -1:end) & #classes? ");
\r
234 scanf("%d",&fromf);
\r
235 if(fromf==-1) return(0);
\r
236 scanf("%d", &n); if(n>100) error2("too many classes");
\r
237 printf("input the first %d values for %s? ",n-1,(fromf?"f":"x"));
\r
238 FOR(i,n-1) scanf("%lf",&x[i]);
\r
239 x[n-1]=(fromf?1-sum(x,n-1):0);
\r
240 f_and_x(x, x, n, fromf, 1);
\r
246 void LabelClades(FILE *fout)
\r
248 /* This reads in a tree and scan species names to check whether they form a
\r
249 paraphyletic group and then label the clade.
\r
250 It assumes that the tree is unrooted, and so goes through two rounds to check
\r
251 whether the remaining seqs form a monophyletic clade.
\r
254 int unrooted=1,iclade, sizeclade, mrca, paraphyl, is, imrca, i,j,k, lasts, haslength;
\r
255 char key[96]="A", treef[64]="/A/F/flu/HA.all.prankcodon.tre", *p,chosen[NS], *endstr="end";
\r
256 int *anc[NS-1], loc, bitmask, SI=sizeof(int)*8;
\r
259 printf("Tree file name? ");
\r
260 scanf ("%s", treef);
\r
261 printf("Treat tree as unrooted (0 no, 1 yes)? ");
\r
262 scanf ("%d", &unrooted);
\r
264 ftree = gfopen (treef,"r");
\r
265 fscanf (ftree, "%d%d", &com.ns, &j);
\r
266 if(com.ns<=0) error2("need ns in tree file");
\r
267 debug = (com.ns<20);
\r
269 i = (com.ns*2-1)*sizeof(struct TREEN);
\r
270 if((nodes=(struct TREEN*)malloc(i))==NULL) error2("oom");
\r
271 for(i=0; i<com.ns*2-1; i++) nodes[i].nodeStr = NULL;
\r
272 for(i=0; i<com.ns-1; i++) {
\r
273 anc[i] = (int*)malloc((com.ns/SI+1)*sizeof(int));
\r
274 if(anc[i]==NULL) error2("oom");
\r
276 ReadTreeN(ftree, &haslength, &j, 1, 0);
\r
278 if(debug) { OutTreeN(F0, 1, PrNodeNum); FPN(F0); }
\r
280 for(iclade=0; iclade<com.ns-1; iclade++) {
\r
281 printf("\nString for selecting sequences (followed by non-digit) (end to end)? ");
\r
283 if(strcmp(endstr, key) == 0)
\r
285 for(i=0; i<com.ns; i++)
\r
290 for(i=0; i<com.ns; i++) {
\r
291 if( (p=strstr(com.spname[i], key))
\r
292 && !isdigit(p[k]) )
\r
297 for(i=0; i<com.ns; i++)
\r
298 if(strstr(com.spname[i], key)) chosen[i] = 1;
\r
301 /* look for MRCA, going through two rounds, assuming unrooted tree */
\r
302 for(imrca=0; imrca<1+unrooted; imrca++) {
\r
304 for(i=0; i<com.ns; i++) chosen[i] = 1 - chosen[i];
\r
306 for(i=0,sizeclade=0; i<com.ns; i++)
\r
312 if(sizeclade <= 1 || sizeclade >= com.ns-1) {
\r
313 puts("unable to form a clade. <2 seqs.");
\r
316 for(i=0; i<com.ns-1; i++) for(j=0; j<com.ns/SI+1; j++)
\r
318 for(is=0; is<com.ns; is++) {
\r
319 if(chosen[is]==0) continue;
\r
320 loc = is/SI; bitmask = 1 << (is%SI);
\r
321 for(j=nodes[is].father; j!=-1; j=nodes[j].father) {
\r
322 anc[j-com.ns][loc] |= bitmask;
\r
324 for(i=0,k=0; i<com.ns; i++)
\r
325 if(anc[j-com.ns][i/SI] & (1<<(i%SI)))
\r
333 if(imrca==0 && mrca!=tree.root) /* 1st round is enough */
\r
337 if(sizeclade <= 1 || sizeclade >= com.ns-1 || mrca==tree.root) {
\r
338 printf("Unable to label. Ignored.");
\r
343 for(is=0; is<com.ns-1; is++) {
\r
344 printf("\nnode %4d: ", is+com.ns);
\r
345 for(j=0; j<com.ns; j++) {
\r
346 loc = j/SI; bitmask = 1 << (j%SI);
\r
347 printf(" %d", (anc[is][loc] & bitmask) != 0);
\r
351 printf("\nClade #%d (%s): %d seqs selected, MRCA is %d\n", iclade+1, key, sizeclade, mrca+1);
\r
352 for(is=0,paraphyl=0; is<com.ns; is++) {
\r
353 if(chosen[is] == 0)
\r
354 for(j=nodes[is].father; j!=-1; j=nodes[j].father)
\r
355 if(j==mrca) { paraphyl++; break; }
\r
358 printf("\nThis clade is paraphyletic, & includes %d other sequences\n", paraphyl);
\r
360 nodes[mrca].label = iclade+1;
\r
361 if(debug) OutTreeN(F0, 1, haslength|PrLabel);
\r
364 for(i=0; i<com.ns-1; i++) free(anc[i]);
\r
365 OutTreeN(fout, 1, haslength|PrLabel); FPN(fout);
\r
366 printf("Printed final tree with labels in evolver.out\n");
\r
370 void TreeDistanceDistribution (FILE* fout)
\r
372 /* This calculates figure 3.7 of Yang (2006).
\r
373 This reads the file of all trees (such as 7s.all.trees), and calculates the
\r
374 distribution of partition distance in all pairwise comparisons.
\r
376 int i,j,ntree, k,*nib, nsame, IBsame[NS], lpart=0;
\r
377 char treef[64]="5s.all.trees", *partition;
\r
379 double mPD[NS], PD1[NS]; /* distribution of partition distances */
\r
381 puts("Tree file name?");
\r
382 scanf ("%s", treef);
\r
384 ftree=gfopen (treef,"r");
\r
385 fscanf (ftree, "%d%d", &com.ns, &ntree);
\r
386 printf("%2d sequences %2d trees.\n", com.ns, ntree);
\r
387 i=(com.ns*2-1)*sizeof(struct TREEN);
\r
388 if((nodes=(struct TREEN*)malloc(i))==NULL) error2("oom");
\r
390 lpart = (com.ns-1)*com.ns*sizeof(char);
\r
392 printf("\n%d bytes of space requested.\n", i);
\r
393 partition = (char*)malloc(i);
\r
394 nib = (int*)malloc(ntree*sizeof(int));
\r
395 if (partition==NULL || nib==NULL) error2("out of memory");
\r
397 puts("\ntree #: mean prop of tree pairs with 0 1 2 ... shared bipartitions\n");
\r
398 fputs("\ntree #: prop of tree pairs with 0 1 2 ... shared bipartitions\n",fout);
\r
399 for (i=0; i<ntree; i++) {
\r
400 ReadTreeN (ftree, &j, &k, 0, 1);
\r
401 nib[i]=tree.nbranch-com.ns;
\r
402 Tree2Partition(partition+i*lpart);
\r
404 for(k=0; k<com.ns-3; k++) mPD[k]=0;
\r
405 for (i=0; i<ntree; i++,FPN(fout)) {
\r
406 for(k=0; k<com.ns-3; k++) PD1[k]=0;
\r
407 for (j=0; j<ntree; j++) {
\r
409 nsame=NSameBranch(partition+i*lpart,partition+j*lpart, nib[i],nib[j],IBsame);
\r
412 for(k=0; k<com.ns-3; k++) PD1[k] /= (ntree-1.);
\r
413 for(k=0; k<com.ns-3; k++) mPD[k] = (mPD[k]*i+PD1[k])/(i+1.);
\r
414 printf("%8d (%5.1f%%):", i+1,(i+1.)/ntree*100);
\r
415 for(k=0; k<com.ns-3; k++) printf(" %7.4f", mPD[k]);
\r
416 fprintf(fout, "%8d:", i+1); for(k=0; k<com.ns-3; k++) fprintf(fout, " %7.4f", PD1[k]);
\r
417 printf("%s", (com.ns<8||(i+1)%100==0 ? "\n" : "\r"));
\r
419 free(partition); free(nodes); free(nib); fclose(ftree);
\r
424 void TreeDistances (FILE* fout)
\r
426 /* I think this is broken after i changed the routine Tree2Partition().
\r
428 int i,j,ntree, k,*nib, parti2B[NS], nsame, IBsame[NS],nIBsame[NS], lpart=0;
\r
429 char treef[64]="5s.all.trees", *partition;
\r
431 double psame, mp, vp;
\r
434 TreeDistanceDistribution(fout);
\r
437 puts("\nNumber of identical bi-partitions between trees.\nTree file name?");
\r
438 scanf ("%s", treef);
\r
440 ftree=gfopen (treef,"r");
\r
441 fscanf (ftree, "%d%d", &com.ns, &ntree);
\r
442 printf("%2d sequences %2d trees.\n", com.ns, ntree);
\r
443 i=(com.ns*2-1)*sizeof(struct TREEN);
\r
444 if((nodes=(struct TREEN*)malloc(i))==NULL) error2("oom");
\r
446 if(ntree<2) error2("ntree");
\r
447 printf ("\n%d species, %d trees\n", com.ns, ntree);
\r
448 puts("\n\t1: first vs. rest?\n\t2: all pairwise comparisons?\n");
\r
452 lpart=(com.ns-1)*com.ns*sizeof(char);
\r
453 i=(k==1?2:ntree)*lpart;
\r
454 printf("\n%d bytes of space requested.\n", i);
\r
455 partition=(char*)malloc(i);
\r
456 nib=(int*)malloc(ntree*sizeof(int));
\r
457 if (partition==NULL || nib==NULL) error2("out of memory");
\r
459 if(k==2) { /* pairwise comparisons */
\r
460 fputs("Number of identical bi-partitions in pairwise comparisons\n",fout);
\r
461 for (i=0; i<ntree; i++) {
\r
462 ReadTreeN (ftree, &j, &k, 0, 1);
\r
463 nib[i]=tree.nbranch-com.ns;
\r
464 Tree2Partition(partition+i*lpart);
\r
466 for (i=0; i<ntree; i++,FPN(F0),FPN(fout)) {
\r
467 printf("%2d (%2d):", i+1,nib[i]);
\r
468 fprintf(fout,"%2d (%2d):", i+1,nib[i]);
\r
469 for (j=0; j<i; j++) {
\r
470 nsame=NSameBranch(partition+i*lpart,partition+j*lpart, nib[i],nib[j],IBsame);
\r
471 printf(" %2d", nsame);
\r
472 fprintf(fout," %2d", nsame);
\r
476 else { /* first vs. others */
\r
477 ReadTreeN (ftree, &j, &k, 0, 1);
\r
478 nib[0]=tree.nbranch-com.ns;
\r
479 if (nib[0]==0) error2("1st tree is a star tree..");
\r
480 Tree2Partition (partition);
\r
481 fputs ("Comparing the first tree with the others\nFirst tree:\n",fout);
\r
482 OutTreeN(fout,0,0); FPN(fout); OutTreeB(fout); FPN(fout);
\r
483 fputs ("\nInternal branches in the first tree:\n",fout);
\r
486 fprintf(fout,"%3d (%2d..%-2d): ( ",
\r
487 i+1,tree.branches[k][0]+1,tree.branches[k][1]+1);
\r
488 FOR(j,com.ns) if(partition[i*com.ns+j]) fprintf(fout,"%d ",j+1);
\r
491 if(nodes[tree.root].nson<=2)
\r
492 fputs("\nRooted tree, results may not be correct.\n",fout);
\r
493 fputs("\nCorrect internal branches compared with the 1st tree:\n",fout);
\r
494 FOR(k,nib[0]) nIBsame[k]=0;
\r
495 for (i=1,mp=vp=0; i<ntree; i++,FPN(fout)) {
\r
496 ReadTreeN (ftree, &j, &k, 0, 1);
\r
497 nib[1]=tree.nbranch-com.ns;
\r
498 Tree2Partition(partition+lpart);
\r
499 nsame=NSameBranch (partition,partition+lpart, nib[0],nib[1],IBsame);
\r
501 psame=nsame/(double)nib[0];
\r
502 FOR(k,nib[0]) nIBsame[k]+=IBsame[k];
\r
503 fprintf(fout,"1 vs. %3d: %4d: ", i+1,nsame);
\r
504 FOR(k,nib[0]) if(IBsame[k]) fprintf(fout," %2d", k+1);
\r
505 printf("1 vs. %5d: %6d/%d %10.4f\n", i+1,nsame,nib[0],psame);
\r
506 vp += square(psame - mp)*(i-1.)/i;
\r
507 mp=(mp*(i-1.) + psame)/i;
\r
509 vp=(ntree<=2 ? 0 : sqrt(vp/((ntree-1-1)*(ntree-1.))));
\r
510 fprintf(fout,"\nmean and S.E. of proportion of identical partitions\n");
\r
511 fprintf(fout,"between the 1st and all the other %d trees ", ntree-1);
\r
512 fprintf(fout,"(ignore these if not revelant):\n %.4f +- %.4f\n", mp, vp);
\r
513 fprintf(fout,"\nNumbers of times, out of %d, ", ntree-1);
\r
514 fprintf(fout,"interior branches of tree 1 are present");
\r
515 fputs("\n(This may be bootstrap support for nodes in tree 1)\n",fout);
\r
517 i=tree.branches[parti2B[k]][0]+1; j=tree.branches[parti2B[k]][1]+1;
\r
518 fprintf(fout,"%3d (%2d..%-2d): %6d (%5.1f%%)\n",
\r
519 k+1,i,j,nIBsame[k],nIBsame[k]*100./(ntree-1.));
\r
522 free(partition); free(nodes); free(nib); fclose(ftree);
\r
527 int EigenQbase(double rates[], double pi[],
\r
528 double Root[],double U[],double V[],double Q[])
\r
530 /* Construct the rate matrix Q[] for nucleotide model REV.
\r
533 double mr, space[4];
\r
536 for (i=0,k=0; i<3; i++) for (j=i+1; j<4; j++)
\r
537 if (i*4+j!=11) Q[i*4+j]=Q[j*4+i]=rates[k++];
\r
538 for (i=0,Q[3*4+2]=Q[2*4+3]=1; i<4; i++) FOR (j,4) Q[i*4+j] *= pi[j];
\r
539 for (i=0,mr=0; i<4; i++)
\r
540 { Q[i*4+i]=0; Q[i*4+i]=-sum(Q+i*4, 4); mr-=pi[i]*Q[i*4+i]; }
\r
541 abyx (1/mr, Q, 16);
\r
543 eigenQREV(Q, com.pi, 4, Root, U, V, space);
\r
548 static double freqK_NS=-1;
\r
550 int EigenQcodon (int getstats, double kappa, double omega, double pi[],
\r
551 double Root[], double U[], double V[], double Q[])
\r
553 /* Construct the rate matrix Q[].
\r
554 64 codons are used, and stop codons have 0 freqs.
\r
556 int n=com.ncode, i,j,k, c[2],ndiff,pos=0,from[3],to[3];
\r
557 double mr, space[64];
\r
559 for(i=0; i<n*n; i++) Q[i] = 0;
\r
560 for (i=0; i<n; i++) FOR (j,i) {
\r
561 from[0]=i/16; from[1]=(i/4)%4; from[2]=i%4;
\r
562 to[0]=j/16; to[1]=(j/4)%4; to[2]=j%4;
\r
563 c[0]=GeneticCode[com.icode][i]; c[1]=GeneticCode[com.icode][j];
\r
564 if (c[0]==-1 || c[1]==-1) continue;
\r
565 for (k=0,ndiff=0; k<3; k++) if (from[k]!=to[k]) { ndiff++; pos=k; }
\r
566 if (ndiff!=1) continue;
\r
568 if ((from[pos]+to[pos]-1)*(from[pos]+to[pos]-5)==0) Q[i*n+j]*=kappa;
\r
569 if(c[0]!=c[1]) Q[i*n+j]*=omega;
\r
572 for(i=0; i<n; i++) for(j=0; j<n; j++)
\r
573 Q[i*n+j] *= com.pi[j];
\r
574 for(i=0,mr=0;i<n;i++) {
\r
575 Q[i*n+i] = -sum(Q+i*n,n);
\r
576 mr -= pi[i]*Q[i*n+i];
\r
580 Qfactor += freqK_NS * mr;
\r
582 if(com.ncatG==0) FOR(i,n*n) Q[i]*=1/mr;
\r
583 else FOR(i,n*n) Q[i]*=Qfactor; /* NSsites models */
\r
584 eigenQREV(Q, com.pi, n, Root, U, V, space);
\r
591 int EigenQaa (double pi[], double Root[], double U[], double V[], double Q[])
\r
593 /* Construct the rate matrix Q[]
\r
596 double mr, space[20];
\r
598 FOR (i,n*n) Q[i]=0;
\r
599 switch (com.model) {
\r
600 case (Poisson) : case (EqualInput) :
\r
601 fillxc (Q, 1., n*n); break;
\r
602 case (Empirical) : case (Empirical_F):
\r
603 FOR(i,n) FOR(j,i) Q[i*n+j]=Q[j*n+i]=com.daa[i*n+j]/100;
\r
606 FOR (i,n) FOR (j,n) Q[i*n+j]*=com.pi[j];
\r
607 for (i=0,mr=0; i<n; i++) {
\r
608 Q[i*n+i]=0; Q[i*n+i]=-sum(Q+i*n,n); mr-=com.pi[i]*Q[i*n+i];
\r
611 eigenQREV(Q, com.pi, n, Root, U, V, space);
\r
612 FOR(i,n) Root[i]=Root[i]/mr;
\r
618 int GetDaa (FILE* fout, double daa[])
\r
620 /* Get the amino acid substitution rate matrix (grantham, dayhoff, jones, etc).
\r
626 fdaa=gfopen(com.daafile, "r");
\r
627 printf("\nReading rate matrix from %s\n", com.daafile);
\r
629 for (i=0; i<n; i++) for (j=0,daa[i*n+i]=0; j<i; j++) {
\r
630 fscanf(fdaa, "%lf", &daa[i*n+j]);
\r
631 daa[j*n+i]=daa[i*n+j];
\r
633 if (com.model==Empirical) {
\r
634 FOR(i,n) if(fscanf(fdaa,"%lf",&com.pi[i])!=1) error2("err aaRatefile");
\r
635 if (fabs(1-sum(com.pi,20))>1e-4) error2("\nSum of aa freq. != 1\n");
\r
640 fprintf (fout, "\n%s\n", com.daafile);
\r
642 fprintf (fout, "\n%4s", getAAstr(aa3,i));
\r
643 FOR (j,i) fprintf (fout, "%5.0f", daa[i*n+j]);
\r
654 void MakeSeq(char*z, int ls)
\r
656 /* generate a random sequence of nucleotides, codons, or amino acids by
\r
657 sampling com.pi[], or read the ancestral sequence from the file RootSeq.txt
\r
658 if the file exists.
\r
660 int i,j,h, n=com.ncode, ch, n31=(com.seqtype==1?3:1), lst;
\r
661 double p[64],r, small=1e-5;
\r
662 char *pch=(com.seqtype==2?AAs:BASEs);
\r
663 char rootseqf[]="RootSeq.txt", codon[4]=" ";
\r
664 FILE *fseq=(FILE*)fopen(rootseqf,"r");
\r
665 static int times=0;
\r
668 if(times++==0) printf("Reading sequence at the root from file.\n\n");
\r
669 if(com.siterates && com.ncatG>1)
\r
670 error2("sequence for root doesn't work for site-class models");
\r
673 for(i=0; i<n31; i++) {
\r
674 while((ch=fgetc(fseq)) !=EOF && !isalpha(ch)) ;
\r
675 if(ch==EOF) error2("EOF when reading root sequence.");
\r
677 codon[i]=(char)(ch=CodeChara((char)ch, com.seqtype));
\r
679 if(com.seqtype==1) ch = codon[0]*16 + codon[1]*4 + codon[2];
\r
680 if(ch<0 || ch>n-1)
\r
681 printf("error when reading site %d\n", lst+1);
\r
682 if(com.seqtype==1 && com.pi[ch]==0)
\r
683 printf("you seem to have a stop codon in the root sequence\n");
\r
685 z[lst++] = (char)ch;
\r
686 if(lst==com.ls) break;
\r
691 for(j=0; j<n; j++) p[j] = com.pi[j];
\r
692 for(j=1; j<n; j++) p[j] += p[j-1];
\r
693 if(fabs(p[n-1]-1) > small)
\r
694 { printf("\nsum pi = %.6f != 1!\n", p[n-1]); exit(-1); }
\r
695 for(h=0; h<com.ls; h++) {
\r
696 for(j=0,r=rndu();j<n-1;j++)
\r
705 void Evolve1 (int inode)
\r
707 /* evolve sequence com.z[tree.root] along the tree to generate com.z[],
\r
708 using nodes[].branch, nodes[].omega, & com.model
\r
709 Needs com.z[0,1,...,nnode-1], while com.z[0] -- com.z[ns-1] constitute
\r
711 For codon sequences, com.siterates[] has w's for NSsites and NSbranchsite models.
\r
713 int is, h,i,j, ison, from, n=com.ncode, longseq=100000;
\r
716 for (is=0; is<nodes[inode].nson; is++) {
\r
717 ison=nodes[inode].sons[is];
\r
718 memcpy(com.z[ison],com.z[inode],com.ls*sizeof(unsigned char));
\r
719 t=nodes[ison].branch;
\r
721 if(com.seqtype==1 && com.model && com.NSsites) { /* branch-site models */
\r
722 Qfactor = com.QfactorBS[ison];
\r
723 for(h=0; h<com.ls; h++)
\r
724 com.siterates[h] = com.omegaBS[ison*com.ncatG+com.siteID[h]];
\r
727 for(h=0; h<com.ls; h++) {
\r
728 /* decide whether to recalcualte PMat[]. */
\r
729 if (h==0 || (com.siterates && com.siterates[h]!=com.siterates[h-1])) {
\r
730 rw = (com.siterates?com.siterates[h]:1);
\r
732 switch(com.seqtype) {
\r
734 if(com.model<=TN93)
\r
735 PMatTN93(PMat, t*Qfactor*rw*Qrates[0],
\r
736 t*Qfactor*rw*Qrates[1], t*Qfactor*rw, com.pi);
\r
737 else if(com.model==REV)
\r
738 PMatUVRoot(PMat, t*rw, com.ncode, U,V,Root);
\r
741 case (CODONseq): /* Watch out for NSsites model */
\r
742 if(com.model || com.NSsites) { /* no need to update UVRoot if M0 */
\r
743 if(com.model && com.NSsites==0) /* branch */
\r
744 rw = nodes[ison].omega; /* should be equal to com.rK[nodes[].label] */
\r
746 EigenQcodon(0, com.kappa, rw, com.pi, Root, U, V, PMat);
\r
748 PMatUVRoot(PMat, t, com.ncode, U, V, Root);
\r
752 PMatUVRoot(PMat, t*rw, com.ncode, U, V, Root);
\r
757 PMat[i*n+j] += PMat[i*n+j-1];
\r
759 for(j=0,from=com.z[ison][h],rw=rndu(); j<n-1; j++)
\r
760 if(rw < PMat[from*n+j]) break;
\r
761 com.z[ison][h] = j;
\r
764 if(com.ls>longseq) printf("\r nodes %2d -> %2d, evolving . . ", inode+1, ison+1);
\r
766 if(nodes[ison].nson) Evolve1(ison);
\r
769 if(inode==tree.root && com.ls>longseq) printf("\r%s", strc(50,' '));
\r
773 int PatternWeightSimple (int CollapsJC)
\r
775 /* This is modified from PatternWeight() and collaps sites into patterns,
\r
776 for nucleotide, amino acid, or codon sequences.
\r
777 This relies on \0 being the end of the string so that sequences should not be
\r
778 encoded before this routine is called.
\r
779 com.pose[i] has labels for genes as input and maps sites to patterns in return.
\r
780 com.fpatt, a vector of doubles, wastes space as site pattern counts are integers.
\r
781 Sequences z[ns*ls] are copied into patterns zt[ls*lpatt], and bsearch is used
\r
782 twice to avoid excessive copying, to count npatt first & to generate fpatt etc.
\r
784 int maxnpatt=com.ls, h, l,u, ip, j, k, same;
\r
785 /* int n31 = (com.seqtype==CODONseq ? 3 : 1); */
\r
787 int lpatt = com.ns*n31+1; /* extra 0 used for easy debugging, can be avoided */
\r
788 int *p2s; /* point patterns to sites in zt */
\r
790 double nc = (com.seqtype == 1 ? 64 : com.ncode) + !com.cleandata+1;
\r
794 /* (A) Collect and sort patterns. Get com.npatt.
\r
795 Move sequences com.z[ns][ls] into sites zt[ls*lpatt].
\r
796 Use p2s to map patterns to sites in zt to avoid copying.
\r
799 if((com.seqtype==1 && com.ns<5) || (com.seqtype!=1 && com.ns<7))
\r
800 maxnpatt = (int)(pow(nc, (double)com.ns) + 0.5);
\r
801 if(maxnpatt>com.ls) maxnpatt = com.ls;
\r
802 p2s = (int*)malloc(maxnpatt*sizeof(int));
\r
803 zt = (char*)malloc(com.ls*lpatt*sizeof(char));
\r
804 if(p2s==NULL || zt==NULL) error2("oom p2s or zt");
\r
805 memset(zt, 0, com.ls*lpatt*sizeof(char));
\r
806 for(j=0; j<com.ns; j++)
\r
807 for(h=0; h<com.ls; h++)
\r
808 for(k=0; k<n31; k++)
\r
809 zt[h*lpatt+j*n31+k] = com.z[j][h*n31+k];
\r
811 com.npatt = l = u = ip = 0;
\r
812 for(h=0; h<com.ls; h++) {
\r
813 if(debug) printf("\nh %3d %s", h, zt+h*lpatt);
\r
814 /* bsearch in existing patterns. Knuth 1998 Vol3 Ed2 p.410
\r
815 ip is the loc for match or insertion. [l,u] is the search interval.
\r
818 if(h != 0) { /* not 1st pattern? */
\r
819 for(l=0, u=com.npatt-1; ; ) {
\r
822 k = strcmp(zt+h*lpatt, zt+p2s[ip]*lpatt);
\r
823 if(k<0) u = ip - 1;
\r
824 else if(k>0) l = ip + 1;
\r
825 else { same = 1; break; }
\r
829 if(com.npatt>maxnpatt)
\r
830 error2("npatt > maxnpatt");
\r
831 if(l > ip) ip++; /* last comparison in bsearch had k > 0. */
\r
832 /* Insert new pattern at ip. This is the expensive step. */
\r
835 memmove(p2s+ip+1, p2s+ip, (com.npatt-ip)*sizeof(int));
\r
841 printf(": %3d (%c ilu %3d%3d%3d) ", com.npatt, DS[same], ip, l, u);
\r
842 for(j=0; j<com.npatt; j++)
\r
843 printf(" %s", zt+p2s[j]*lpatt);
\r
847 /* (B) count pattern frequencies */
\r
848 com.fpatt = (double*)realloc(com.fpatt, com.npatt*sizeof(double));
\r
849 if(com.fpatt==NULL) error2("oom fpatt");
\r
850 for(ip=0; ip<com.npatt; ip++) com.fpatt[ip] = 0;
\r
851 for(h=0; h<com.ls; h++) {
\r
852 for(same=0, l=0, u=com.npatt-1; ; ) {
\r
855 k = strcmp(zt+h*lpatt, zt+p2s[ip]*lpatt);
\r
856 if(k<0) u = ip - 1;
\r
857 else if(k>0) l = ip + 1;
\r
858 else { same = 1; break; }
\r
861 error2("ghost pattern?");
\r
865 for(j=0; j<com.ns; j++) {
\r
866 for(ip=0,p=com.z[j]; ip<com.npatt; ip++)
\r
867 for(k=0; k<n31; k++)
\r
868 *p++ = zt[p2s[ip]*lpatt+j*n31+k];
\r
870 free(p2s); free(zt);
\r
876 void Simulate (char*ctlf)
\r
878 /* simulate nr data sets of nucleotide, codon, or AA sequences.
\r
879 ls: number of nucleotides, codons, or AAs in each sequence.
\r
880 All 64 codons are used for codon sequences.
\r
881 When com.alpha or com.ncatG>1, sites are randomized after sequences are
\r
883 space[com.ls] is used to hold site marks.
\r
884 format (0: paml sites; 1: paml patterns; 2: paup nex)
\r
886 char *ancf="ancestral.txt", *siteIDf="siterates.txt";
\r
887 FILE *fin, *fseq, *ftree=NULL, *fanc=NULL, *fsiteID=NULL;
\r
888 char *paupstart="paupstart",*paupblock="paupblock",*paupend="paupend";
\r
890 int lline=32000, i,j,k, ir,n,nr, fixtree=1, sspace=10000, rooted=1;
\r
891 int h=0,format=0, b[3]={0}, nrate=1, counts[NCATG];
\r
892 int *siteorder=NULL;
\r
893 char *tmpseq=NULL, *pc;
\r
894 double birth=0, death=0, sample=1, mut=1, tlength, *space, *blengthBS;
\r
895 double T,C,A,G,Y,R, Falias[NCATG];
\r
899 printf("\nReading options from data file %s\n", ctlf);
\r
900 com.ncode = n = (com.seqtype==0 ? 4 : (com.seqtype==1?64:20));
\r
901 fin = (FILE*)gfopen(ctlf,"r");
\r
902 fscanf(fin, "%d", &format); fgets(line, lline, fin);
\r
903 printf("\nSimulated data will go into %s.\n", seqf[format]);
\r
904 if(format==2) printf("%s, %s, & %s will be appended if existent.\n",
\r
905 paupstart,paupblock,paupend);
\r
907 fscanf (fin, "%d", &i);
\r
908 fgets(line, lline, fin);
\r
910 fscanf (fin, "%d%d%d", &com.ns, &com.ls, &nr);
\r
911 fgets(line, lline, fin);
\r
912 i=(com.ns*2-1)*sizeof(struct TREEN);
\r
913 if((nodes=(struct TREEN*)malloc(i))==NULL) error2("oom");
\r
915 if(com.ns>NS) error2("too many seqs?");
\r
916 printf ("\n%d seqs, %d sites, %d replicate(s)\n", com.ns, com.ls, nr);
\r
917 k=(com.ns*com.ls* (com.seqtype==CODONseq?4:1) *nr)/1000+1;
\r
918 printf ("Seq file will be about %dK bytes.\n",k);
\r
919 for(i=0; i<com.ns; i++) /* default spname */
\r
920 sprintf(com.spname[i],"S%d",i+1);
\r
923 fscanf(fin,"%lf",&tlength); fgets(line, lline, fin);
\r
924 if(ReadTreeN(fin,&i,&j, 1, 1)) /* might overwrite spname */
\r
925 error2("err tree..");
\r
927 if(i==0) error2("use : to specify branch lengths in tree");
\r
928 for(i=0,T=0; i<tree.nnode; i++)
\r
929 if(i!=tree.root) T += nodes[i].branch;
\r
931 for(i=0; i<tree.nnode; i++)
\r
932 if(i!=tree.root) nodes[i].branch *= tlength/T;
\r
934 printf("tree length = %.3f\n", (tlength>0?tlength:T));
\r
936 printf("\nModel tree & branch lengths:\n");
\r
937 OutTreeN(F0,1,1); FPN(F0);
\r
938 OutTreeN(F0,0,1); FPN(F0);
\r
940 if(com.seqtype==CODONseq && com.model && !com.NSsites) { /* branch model */
\r
941 FOR(i,tree.nnode) nodes[i].omega=nodes[i].label;
\r
942 FPN(F0); OutTreeN(F0, 1, PrBranch&PrLabel); FPN(F0);
\r
945 else { /* random trees, broken or need testing? */
\r
946 printf ("\nbirth rate, death rate, sampling fraction, mutation rate (tree height)?\n");
\r
947 fscanf (fin, "%lf%lf%lf%lf", &birth, &death, &sample, &mut);
\r
948 fgets(line, lline, fin);
\r
949 printf("%9.4f %9.4f %9.4f %9.4f\n", birth, death, sample, mut);
\r
952 if(com.seqtype==BASEseq) {
\r
953 fscanf(fin,"%d",&com.model);
\r
954 fgets(line, lline, fin);
\r
955 if(com.model<0 || com.model>REV) error2("model err");
\r
956 if(com.model==T92) error2("T92: please use HKY85 with T=A and C=G.");
\r
958 printf("\nModel: %s\n", basemodels[com.model]);
\r
959 if(com.model==REV) nrate=5;
\r
960 else if(com.model==TN93) nrate=2;
\r
961 FOR(i,nrate) fscanf(fin,"%lf",&Qrates[i]);
\r
962 fgets(line, lline, fin);
\r
963 if(nrate<=2) FOR(i,nrate) printf("kappa %9.5f\n",Qrates[i]); FPN(F0);
\r
965 printf("a & b & c & d & e: ");
\r
966 FOR(i,nrate) printf("%9.5f",Qrates[i]); FPN(F0);
\r
968 if((com.model==JC69 || com.model==F81)&&Qrates[0]!=1)
\r
969 error2("kappa should be 1 for this model");
\r
971 else if(com.seqtype==CODONseq) {
\r
972 for(i=0; i<64; i++)
\r
973 getcodon(CODONs[i], i);
\r
974 if(com.model==0 && com.NSsites) { /* site model */
\r
975 fscanf(fin,"%d", &com.ncatG); fgets(line, lline, fin);
\r
976 if(com.ncatG>NCATG) error2("ncatG>NCATG");
\r
977 FOR(i,com.ncatG) fscanf(fin,"%lf",&com.freqK[i]); fgets(line, lline, fin);
\r
978 FOR(i,com.ncatG) fscanf(fin,"%lf",&com.rK[i]); fgets(line, lline, fin);
\r
979 printf("\n\ndN/dS (w) for site classes (K=%d)", com.ncatG);
\r
980 printf("\nf: "); FOR(i,com.ncatG) printf("%9.5f",com.freqK[i]);
\r
981 printf("\nw: "); FOR(i,com.ncatG) printf("%9.5f",com.rK[i]); FPN(F0);
\r
983 else if(com.model && com.NSsites) { /* branchsite model */
\r
984 fscanf(fin,"%d",&com.ncatG); fgets(line, lline, fin);
\r
985 if(com.ncatG>min2(NCATG,127)) error2("ncatG too large");
\r
986 FOR(i,com.ncatG) fscanf(fin,"%lf",&com.freqK[i]); fgets(line,lline,fin);
\r
987 printf("\n%d site classes.\nFreqs: ", com.ncatG);
\r
988 FOR(i,com.ncatG) printf("%9.5f",com.freqK[i]);
\r
990 if((com.omegaBS=(double*)malloc((com.ncatG+2)*tree.nnode*sizeof(double)))==NULL)
\r
992 com.QfactorBS = com.omegaBS + com.ncatG*tree.nnode;
\r
993 blengthBS = com.QfactorBS + tree.nnode;
\r
995 for(i=0; i<tree.nnode; i++)
\r
996 blengthBS[i] = nodes[i].branch;
\r
997 for(k=0; k<com.ncatG; k++) {
\r
998 ReadTreeN(fin, &i, &j, 0, 1);
\r
999 if(i) error2("do not include branch lengths except in the first tree.");
\r
1000 if(!j) error2("Use # to specify omega's for branches");
\r
1001 for(i=0; i<tree.nnode; i++) com.omegaBS[i*com.ncatG+k]=nodes[i].label;
\r
1003 for(i=0; i<tree.nnode; i++)
\r
1004 { nodes[i].branch=blengthBS[i]; nodes[i].label=nodes[i].omega=0; }
\r
1005 for(i=0; i<tree.nnode; i++) { /* print out omega as node labels. */
\r
1006 nodes[i].nodeStr=pc=(char*)malloc(20*com.ncatG*sizeof(char));
\r
1007 sprintf(pc, "'[%.2f", com.omegaBS[i*com.ncatG+0]);
\r
1008 for(k=1,pc+=strlen(pc); k<com.ncatG; k++,pc+=strlen(pc))
\r
1009 sprintf(pc, ", %.2f", com.omegaBS[i*com.ncatG+k]);
\r
1010 sprintf(pc, "]'");
\r
1012 FPN(F0); OutTreeN(F0,1,PrBranch|PrLabel); FPN(F0);
\r
1014 else if(com.model==0) { /* M0 */
\r
1015 fscanf(fin,"%lf",&com.omega);
\r
1016 fgets(line, lline, fin);
\r
1017 printf("omega = %9.5f\n",com.omega);
\r
1018 for(i=0; i<tree.nbranch; i++)
\r
1019 nodes[tree.branches[i][1]].omega = com.omega;
\r
1022 fscanf(fin, "%lf", &com.kappa); fgets(line, lline, fin);
\r
1023 printf("kappa = %9.5f\n",com.kappa);
\r
1026 if(com.seqtype==BASEseq || com.seqtype==AAseq) {
\r
1027 fscanf(fin,"%lf%d", &com.alpha, &com.ncatG);
\r
1028 fgets(line, lline, fin);
\r
1030 printf("Gamma rates, alpha =%.4f (K=%d)\n", com.alpha, com.ncatG);
\r
1033 puts("Rates are constant over sites.");
\r
1036 if(com.alpha || com.ncatG) { /* this is used for codon NSsites as well. */
\r
1038 if(com.seqtype==1 && com.model && com.NSsites) k *= tree.nnode;
\r
1039 if((com.siterates=(double*)malloc(k*sizeof(double)))==NULL) error2("oom1");
\r
1040 if((siteorder=(int*)malloc(com.ls*sizeof(int)))==NULL) error2("oom2");
\r
1043 if(com.seqtype==AAseq) { /* get aa substitution model and rate matrix */
\r
1044 fscanf(fin,"%d",&com.model);
\r
1045 printf("\nmodel: %s",aamodels[com.model]);
\r
1046 if(com.model>=2) { fscanf(fin,"%s",com.daafile); GetDaa(NULL,com.daa); }
\r
1047 fgets(line, lline, fin);
\r
1050 /* get freqs com.pi[] */
\r
1051 if((com.seqtype==BASEseq && com.model>K80) ||
\r
1052 com.seqtype==CODONseq ||
\r
1053 (com.seqtype==AAseq && (com.model==1 || com.model==3)))
\r
1054 for(k=0; k<com.ncode; k++) fscanf(fin,"%lf", &com.pi[k]);
\r
1055 else if(com.model==0 || (com.seqtype==BASEseq && com.model<=K80))
\r
1056 fillxc(com.pi, 1./com.ncode, com.ncode);
\r
1058 printf("sum pi = 1 = %.6f:", sum(com.pi,com.ncode));
\r
1059 matout2(F0, com.pi, com.ncode/4, 4, 9, 6);
\r
1060 if(com.seqtype==CODONseq) {
\r
1061 fscanf(fin, "%d", &com.icode); fgets(line, lline, fin);
\r
1062 printf("genetic code = %d\n", com.icode);
\r
1063 for(k=0; k<com.ncode; k++)
\r
1064 if(GeneticCode[com.icode][k] == -1 && com.pi[k])
\r
1065 error2("stop codons should have frequency 0?");
\r
1068 if(com.seqtype==BASEseq) {
\r
1069 if(com.model<REV) {
\r
1070 T=com.pi[0]; C=com.pi[1]; A=com.pi[2]; G=com.pi[3]; Y=T+C; R=A+G;
\r
1071 if (com.model==F84) {
\r
1072 Qrates[1]=1+Qrates[0]/R; /* kappa2 */
\r
1073 Qrates[0]=1+Qrates[0]/Y; /* kappa1 */
\r
1075 else if (com.model<=HKY85) Qrates[1]=Qrates[0];
\r
1076 Qfactor = 1/(2*T*C*Qrates[0] + 2*A*G*Qrates[1] + 2*Y*R);
\r
1079 if(com.model==REV) EigenQbase(Qrates, com.pi, Root,U,V,PMat);
\r
1082 /* get Qfactor for NSsites & NSbranchsite models */
\r
1083 if(com.seqtype==CODONseq && com.NSsites) {
\r
1084 if(!com.model) { /* site models */
\r
1085 for(k=0,Qfactor=0; k<com.ncatG; k++) {
\r
1086 freqK_NS=com.freqK[k];
\r
1087 EigenQcodon(1, com.kappa,com.rK[k],com.pi, NULL,NULL,NULL, PMat);
\r
1089 Qfactor=1/Qfactor;
\r
1090 printf("Qfactor for NSsites model = %9.5f\n", Qfactor);
\r
1092 else { /* branch-site models */
\r
1093 for(i=0; i<tree.nnode; i++) {
\r
1094 if(i==tree.root) { com.QfactorBS[i]=-1; continue; }
\r
1095 for(k=0,Qfactor=0; k<com.ncatG; k++) {
\r
1096 freqK_NS=com.freqK[k];
\r
1097 EigenQcodon(1, com.kappa,com.omegaBS[i*com.ncatG+k],com.pi, NULL,NULL,NULL, PMat);
\r
1099 com.QfactorBS[i]=1/Qfactor; Qfactor=0;
\r
1100 printf("node %2d: Qfactor = %9.5f\n", i+1, com.QfactorBS[i]);
\r
1104 if(com.seqtype==CODONseq && com.ncatG<=1 && com.model==0)
\r
1105 EigenQcodon(0, com.kappa,com.omega, com.pi, Root, U, V, PMat);
\r
1106 else if(com.seqtype==AAseq)
\r
1107 EigenQaa(com.pi, Root, U, V,PMat);
\r
1109 puts("\nAll parameters are read. Ready to simulate\n");
\r
1110 for(j=0; j<com.ns*2-1; j++)
\r
1111 com.z[j] = (unsigned char*)malloc(com.ls*sizeof(unsigned char));
\r
1112 sspace = max2(sspace, com.ls*(int)sizeof(double));
\r
1113 space = (double*)malloc(sspace);
\r
1114 if(com.alpha || com.ncatG) tmpseq=(char*)space;
\r
1115 if (com.z[com.ns*2-1-1]==NULL) error2("oom for seqs");
\r
1116 if (space==NULL) {
\r
1117 printf("oom for space, %d bytes needed.", sspace);
\r
1121 fseq = gfopen(seqf[format], "w");
\r
1122 if(format==2) appendfile(fseq, paupstart);
\r
1124 fanc = (FILE*)gfopen(ancf, "w");
\r
1126 fputs("\nAncestral sequences generated during simulation ",fanc);
\r
1127 fprintf(fanc,"(check against %s)\n",seqf[format]);
\r
1128 OutTreeN(fanc,0,0); FPN(fanc); OutTreeB(fanc); FPN(fanc);
\r
1130 if(com.alpha || com.NSsites) {
\r
1131 fsiteID=(FILE*)gfopen(siteIDf,"w");
\r
1132 if(com.seqtype==1) fprintf(fsiteID, "\nSite class IDs\n");
\r
1133 else fprintf(fsiteID, "\nRates for sites\n");
\r
1134 if(com.seqtype==CODONseq && com.NSsites) {
\r
1135 if(!com.model) matout(fsiteID,com.rK, 1,com.ncatG);
\r
1136 if((com.siteID=(char*)malloc(com.ls*sizeof(char)))==NULL)
\r
1137 error2("oom siteID");
\r
1141 for (ir=0; ir<nr; ir++) {
\r
1142 if (!fixtree) { /* right now tree is fixed */
\r
1143 RandomLHistory (rooted, space);
\r
1144 if (rooted && com.ns<10) j = GetIofLHistory ();
\r
1145 BranchLengthBD (1, birth, death, sample, mut);
\r
1147 printf ("\ntree used: ");
\r
1152 MakeSeq(com.z[tree.root], com.ls);
\r
1155 Rates4Sites(com.siterates,com.alpha,com.ncatG,com.ls, 0,space);
\r
1156 else if(com.seqtype==1 && com.NSsites) { /* for NSsites */
\r
1157 /* the table for the alias algorithm is the same, but ncatG is small. */
\r
1158 MultiNomialAliasSetTable(com.ncatG, com.freqK, Falias, Lalias, space);
\r
1159 MultiNomialAlias(com.ls, com.ncatG, Falias, Lalias, counts);
\r
1161 for (i=0,h=0; i<com.ncatG; i++)
\r
1162 for (j=0; j<counts[i]; j++) {
\r
1163 com.siteID[h]=(char)i;
\r
1164 com.siterates[h++]=com.rK[i]; /* overwritten later for branchsite */
\r
1168 Evolve1(tree.root);
\r
1170 /* randomize sites for site-class model */
\r
1171 if(com.siterates && com.ncatG>1) {
\r
1172 if(format==1 && ir==0)
\r
1173 puts("\nrequested site pattern counts as output for site-class model.\n");
\r
1174 randorder(siteorder, com.ls, (int*)space);
\r
1175 for(j=0; j<tree.nnode; j++) {
\r
1176 memcpy(tmpseq,com.z[j],com.ls*sizeof(char));
\r
1177 for(h=0; h<com.ls; h++) com.z[j][h]=tmpseq[siteorder[h]];
\r
1179 if(com.alpha || com.ncatG>1) {
\r
1180 memcpy(space,com.siterates,com.ls*sizeof(double));
\r
1181 for(h=0; h<com.ls; h++) com.siterates[h]=space[siteorder[h]];
\r
1184 memcpy((char*)space,com.siteID,com.ls*sizeof(char));
\r
1185 for(h=0; h<com.ls; h++) com.siteID[h]=*((char*)space+siteorder[h]);
\r
1189 /* print sequences*/
\r
1191 for(i=0; i<com.ns; i++) for(h=0; h<com.ls; h++) com.z[i][h] ++;
\r
1192 PatternWeightSimple(0);
\r
1193 for(i=0; i<com.ns; i++) for(h=0; h<com.npatt; h++) com.z[i][h] --;
\r
1195 if(format==2) fprintf(fseq,"\n\n[Replicate # %d]\n", ir+1);
\r
1196 printSeqs(fseq, NULL, NULL, format); /* printsma not usable as it codes into 0,1,...,60. */
\r
1197 if(format==2 && !fixtree) {
\r
1198 fprintf(fseq,"\nbegin tree;\n tree true_tree = [&U] ");
\r
1199 OutTreeN(fseq,1,1); fputs(";\n",fseq);
\r
1200 fprintf(fseq,"end;\n\n");
\r
1202 if(format==2) appendfile(fseq,paupblock);
\r
1204 /* print ancestral seqs, rates for sites. */
\r
1206 j = (com.seqtype==CODONseq?3*com.ls:com.ls);
\r
1207 fprintf(fanc,"[replicate %d]\n",ir+1);
\r
1211 { OutTreeN(fanc,1,1); FPN(fanc); FPN(fanc); }
\r
1214 fprintf(fanc,"%6d %6d\n",tree.nnode-com.ns,j);
\r
1215 for(j=com.ns; j<tree.nnode; j++,FPN(fanc)) {
\r
1216 fprintf(fanc,"node%-26d ", j+1);
\r
1217 print1seq(fanc, com.z[j], com.ls, NULL);
\r
1222 if(com.seqtype==CODONseq && com.NSsites && com.model==0) { /* site model */
\r
1224 if(com.rK[com.ncatG-1]>1)
\r
1225 FOR(h,com.ls) if(com.rK[com.siteID[h]]>1) k++;
\r
1226 fprintf(fsiteID, "\n[replicate %d: %2d]\n",ir+1, k);
\r
1227 if(k) for(h=0,k=0; h<com.ls; h++) {
\r
1228 if(com.rK[com.siteID[h]]>1) {
\r
1229 fprintf(fsiteID,"%4d ",h+1);
\r
1230 if(++k%15==0) FPN(fsiteID);
\r
1235 else if(com.seqtype==CODONseq && com.NSsites && com.model) { /* branchsite */
\r
1236 fprintf(fsiteID, "\n[replicate %d]\n",ir+1);
\r
1237 for(h=0; h<com.ls; h++) {
\r
1238 fprintf(fsiteID," %4d ", com.siteID[h]+1);
\r
1239 if(h==com.ls-1 || (h+1)%15==0) FPN(fsiteID);
\r
1242 else { /* gamma rates */
\r
1243 fprintf(fsiteID,"\n[replicate %d]\n",ir+1);
\r
1244 for(h=0; h<com.ls; h++) {
\r
1245 fprintf(fsiteID,"%7.4f ",com.siterates[h]);
\r
1246 if(h==com.ls-1 || (h+1)%10==0) FPN(fsiteID);
\r
1253 printf ("\rdid data set %d %s", ir+1, (com.ls>100000||nr<100? "\n" : ""));
\r
1255 if(format==2) appendfile(fseq, paupend);
\r
1257 fclose(fseq); if(!fixtree) fclose(fanc);
\r
1258 if(com.alpha || com.NSsites) fclose(fsiteID);
\r
1259 for(j=0; j<com.ns*2-1; j++) free(com.z[j]);
\r
1261 if(com.model && com.NSsites) /* branch-site model */
\r
1262 for(i=0; i<tree.nnode; i++) free(nodes[i].nodeStr);
\r
1264 if(com.alpha || com.ncatG) {
\r
1265 free(com.siterates); com.siterates=NULL;
\r
1267 if(com.siteID) free(com.siteID); com.siteID=NULL;
\r
1269 if(com.seqtype==1 && com.model && com.NSsites) free(com.omegaBS);
\r
1270 com.omegaBS = NULL;
\r
1276 int GetSpnamesFromMB (FILE *fmb, char line[], int lline)
\r
1278 /* This reads species names from MrBayes output file fmb, like the following.
\r
1280 Taxon 1 -> 1_Arabidopsis_thaliana
\r
1281 Taxon 2 -> 2_Taxus_baccata
\r
1284 char *p=NULL, *mbstr1="Taxon ", *mbstr2="->";
\r
1286 puts("Reading species names from mb output file.\n");
\r
1288 for(ispecies=0; ; ) {
\r
1289 if(fgets(line, lline, fmb)==NULL) return(-1);
\r
1290 if(strstr(line, mbstr1) && strstr(line, mbstr2)) {
\r
1291 p=strstr(line, mbstr1)+5;
\r
1292 sscanf(p, "%d", &ispecies);
\r
1293 p=strstr(line, mbstr2)+3;
\r
1294 if(com.spname[ispecies-1][0])
\r
1295 error2("species name already read?");
\r
1297 for(j=0; isgraph(*p)&&j<lline; ) com.spname[ispecies-1][j++] = *p++;
\r
1298 com.spname[ispecies-1][j]=0;
\r
1300 printf("\tTaxon %2d: %s\n", ispecies, com.spname[ispecies-1]);
\r
1302 else if (ispecies)
\r
1311 char *GrepLine (FILE*fin, char*query, char* line, int lline)
\r
1313 /* This greps infile to search for query[], and returns NULL or line[].
\r
1319 if(fgets(line, lline, fin)==NULL) return(NULL);
\r
1320 if(strstr(line, query)) return(line);
\r
1326 void CladeMrBayesProbabilities (char treefile[])
\r
1328 /* This reads a tree from treefile and then scans a set of MrBayes output files
\r
1329 (mbfiles) to retrieve posterior probabilities for every clade in that tree.
\r
1330 It first scans the first mb output file to get the species names.
\r
1333 6 -- ...........................************* 8001 1.000 0.005 (0.000)
\r
1334 7 -- ....................******************** 8001 1.000 0.006 (0.000)
\r
1336 Note 4 Jan 2014: This uses parti2B[], and is broken after i rewrote
\r
1337 Tree2Partition().
\r
1339 int lline=100000, i,j,k, nib, inode, parti2B[NS];
\r
1340 char line[100000], *partition, *p;
\r
1341 char symbol[2]=".*", cladestr[NS+1]={0};
\r
1342 FILE *ftree, *fmb[20];
\r
1343 double *Pclade, t;
\r
1346 char *mbfiles[]={"mb-1e-5.out", "mb-2e-5.out", "mb-3e-5.out", "mb-4e-5.out",
\r
1347 "mb-5e-5.out", "mb-6e-5.out", "mb-7e-5.out", "mb-8e-5.out",
\r
1348 "mb-9e-5.out", "mb-1e-4.out", "mb-2e-4.out", "mb-3e-4.out",
\r
1349 "mb-5e-4.out", "mb-1e-3.out", "mb-1e-2.out"};
\r
1352 char *mbfiles[]={"mb-1e-4.out", "mb-1e-1.out"};
\r
1354 printf("tree file is %s\nmb output files:\n", treefile);
\r
1355 ftree=gfopen(treefile,"r");
\r
1356 for(k=0; k<nmbfiles; k++)
\r
1357 fmb[k]=gfopen(mbfiles[k],"r");
\r
1358 for(k=0; k<nmbfiles; k++) printf("\t%s\n", mbfiles[k]);
\r
1360 GetSpnamesFromMB(fmb[0], line, lline); /* read species names from mb output */
\r
1362 fscanf (ftree, "%d%d", &i, &k);
\r
1363 if(i && i!=com.ns) error2("do you mean to specify ns in the tree file?");
\r
1364 i=(com.ns*2-1)*sizeof(struct TREEN);
\r
1365 if((nodes=(struct TREEN*)malloc(i))==NULL) error2("oom");
\r
1366 ReadTreeN (ftree, &i, &j, 0, 1);
\r
1368 FPN(F0); OutTreeN(F0, 0, 0); FPN(F0); FPN(F0);
\r
1369 nib=tree.nbranch-com.ns;
\r
1370 for(i=0;i<tree.nnode;i++) {
\r
1371 nodes[i].nodeStr = NULL;
\r
1372 if(i>com.ns) nodes[i].nodeStr=(char*)malloc(100*sizeof(char));
\r
1375 partition=(char*)malloc(nib*com.ns*sizeof(char));
\r
1376 if (partition==NULL) error2("oom");
\r
1377 if((Pclade=(double*)malloc(nib*nmbfiles*sizeof(double)))==NULL)
\r
1379 for(i=0;i<nib*nmbfiles; i++) Pclade[i]=0;
\r
1381 Tree2Partition(partition);
\r
1383 for(i=0; i<nib; i++) {
\r
1384 inode=tree.branches[parti2B[i]][1];
\r
1385 if(partition[i*com.ns+0])
\r
1386 for(j=0; j<com.ns; j++) cladestr[j]=symbol[1-partition[i*com.ns+j]];
\r
1388 for(j=0; j<com.ns; j++) cladestr[j]=symbol[partition[i*com.ns+j]];
\r
1389 printf("#%2d branch %2d node %2d %s", i+1, parti2B[i], inode, cladestr);
\r
1391 for(k=0; k<nmbfiles; k++) {
\r
1392 if(GrepLine(fmb[k], cladestr, line, lline)) {
\r
1393 p=strstr(line,cladestr);
\r
1394 sscanf(p+com.ns, "%lf%lf\0", &t, &Pclade[i*nmbfiles+k]);
\r
1397 for(k=0; k<nmbfiles; k++) printf("%6.2f", Pclade[i*nmbfiles+k]);
\r
1399 for(k=0,p=nodes[inode].nodeStr; k<nmbfiles; k++) {
\r
1400 sprintf(p, "%3.0f%s", Pclade[i*nmbfiles+k]*100,(k<nmbfiles-1?"/":""));
\r
1404 FPN(F0); OutTreeN(F0,1,PrLabel); FPN(F0);
\r
1406 for(i=0; i<tree.nnode; i++) free(nodes[i].nodeStr);
\r
1407 free(nodes); free(partition); free(Pclade);
\r
1409 for(k=0; k<nmbfiles; k++) fclose(fmb[k]);
\r