2 subroutines that operate on trees, inserted into other programs
\r
3 such as baseml, basemlg, codeml, and pamp.
\r
6 extern char BASEs[], *EquateBASE[], BASEs5[], *EquateBASE5[], AAs[], BINs[], CODONs[][4], nChara[], CharaMap[][64];
\r
11 #define REALSEQUENCE
\r
12 #define NODESTRUCTURE
\r
16 #define RECONSTRUCTION
\r
17 #define MINIMIZATION
\r
21 #define REALSEQUENCE
\r
22 #define NODESTRUCTURE
\r
26 #define RECONSTRUCTION
\r
27 #define MINIMIZATION
\r
31 #define REALSEQUENCE
\r
32 #define NODESTRUCTURE
\r
36 #ifdef RECONSTRUCTION
\r
41 #define REALSEQUENCE
\r
42 #define NODESTRUCTURE
\r
46 #if(defined CODEML || defined YN00)
\r
47 double SS, NN, Sd, Nd; /* kostas, # of syn. sites,# of non syn. sites,# of syn. subst.,# of non syn. subst. */
\r
54 int hasbase (char *str)
\r
56 char *p=str, *eqdel=".-?";
\r
58 if (*p==eqdel[0] || *p==eqdel[1] || *p==eqdel[2] || isalpha(*p++))
\r
64 int GetSeqFileType(FILE *fseq, int *paupseq);
\r
65 int IdenticalSeqs(void);
\r
66 void RemoveEmptySequences(void);
\r
68 int GetSeqFileType(FILE *fseq, int *format)
\r
70 /* paupstart="begin data" and paupend="matrix" identify nexus file format.
\r
71 Modify if necessary.
\r
72 format: 0: alignment; 1: fasta; 2: nexus.
\r
75 int lline=1000, ch, aligned;
\r
76 char fastastarter='>';
\r
77 char line[1000], *paupstart="begin data",*paupend="matrix", *p;
\r
78 char *ntax="ntax",*nchar="nchar";
\r
80 while (isspace(ch=fgetc(fseq)))
\r
83 if(ch == fastastarter) {
\r
85 ScanFastaFile(fseq, &com.ns, &com.ls, &aligned);
\r
89 error2("The seq file appears to be in fasta format, but not aligned?");
\r
91 if(fscanf(fseq,"%d%d", &com.ns, &com.ls)==2) {
\r
92 *format = 0; return(0);
\r
95 printf("\nseq file is not paml/phylip format. Trying nexus format.");
\r
98 if(fgets(line,lline,fseq)==NULL) error2("seq err1: EOF");
\r
100 if(strstr(line,paupstart)) break;
\r
103 if(fgets(line,lline,fseq)==NULL) error2("seq err2: EOF");
\r
105 if((p=strstr(line,ntax))!=NULL) {
\r
106 while (*p != '=') { if(*p==0) error2("seq err"); p++; }
\r
107 sscanf(p+1,"%d", &com.ns);
\r
108 if((p=strstr(line,nchar))==NULL) error2("expect nchar");
\r
109 while (*p != '=') { if(*p==0) error2("expect ="); p++; }
\r
110 sscanf(p+1,"%d", &com.ls);
\r
114 /* printf("\nns: %d\tls: %d\n", com.ns, com.ls); */
\r
116 if(fgets(line,lline,fseq)==NULL) error2("seq err1: EOF");
\r
118 if (strstr(line,paupend)) break;
\r
123 int PopupComment(FILE *fseq)
\r
125 int ch, comment1=']';
\r
128 if(ch==EOF) error2("expecting ]");
\r
129 if(ch==comment1) break;
\r
130 if(noisy) putchar(ch);
\r
138 int ReadMorphology (FILE *fout, FILE *fin)
\r
140 int i,j, locus=data.nmorphloci;
\r
141 char line[1024], str[64];
\r
143 if((data.zmorph[locus][0] = (double*)malloc((com.ns*2-1)*com.ls*sizeof(double))) == NULL)
\r
144 error2("oom zmorph");
\r
145 if((data.Rmorph[locus] = (double*)malloc(com.ls*com.ls*sizeof(double))) == NULL)
\r
146 error2("oom Rmorph");
\r
148 if((data.nmorphloci = locus+1) > NMORPHLOCI) error2("raise NMORPHLOCI and recompile.");
\r
149 for(i=1; i<com.ns*2-1; i++) {
\r
150 data.zmorph[locus][i] = data.zmorph[locus][0] + i*com.ls;
\r
152 for(i=0; i<com.ns; i++) {
\r
153 fscanf(fin, "%s", com.spname[i]);
\r
154 printf ("Reading data for species #%2d: %s \r", i+1, com.spname[i]);
\r
155 for(j=0; j<com.ls; j++)
\r
156 fscanf(fin, "%lf", &data.zmorph[locus][i][j]);
\r
159 for(i=0; i<com.ns; i++) {
\r
160 fprintf(fout, "%-10s ", com.spname[i]);
\r
161 for(j=0; j<com.ls; j++)
\r
162 fprintf(fout, " %8.5f", data.zmorph[locus][i][j]);
\r
167 fscanf(fin, "%s", str);
\r
168 fgets(line, 1024, fin);
\r
170 if(strstr("Correlation", str)) {
\r
171 for(i=0; i<com.ls; i++) {
\r
172 for(j=0; j<com.ls; j++)
\r
173 if(fscanf(fin, "%lf", &data.Rmorph[locus][i*com.ls+j]) != 1) break;
\r
174 if(j<com.ls) break;
\r
177 if(i!=com.ls || j!=com.ls) {
\r
178 printf("\ndid not find a good R matrix. Setting it to identity matrix I.\n");
\r
179 for(i=0; i<com.ls; i++)
\r
180 for(j=0; j<com.ls; j++)
\r
181 data.Rmorph[locus][i*com.ls+j] = (i==j);
\r
189 int ReadSeq (FILE *fout, FILE *fseq, int cleandata, int locus)
\r
191 /* read in sequence, translate into protein (CODON2AAseq), and
\r
192 This counts ngene but does not initialize lgene[].
\r
193 It also codes (transforms) the sequences.
\r
194 com.seqtype: 0=nucleotides; 1=codons; 2:AAs; 3:CODON2AAs; 4:BINs
\r
195 com.pose[] is used to store gene or site-partition labels.
\r
196 ls/3 gene marks for codon sequences.
\r
197 char opt_c[]="GIPM";
\r
198 G:many genes; I:interleaved format; P:patterns; M:morphological characters
\r
200 Use cleandata=1 to clean up ambiguities. In return, com.cleandata=1 if the
\r
201 data are clean or are cleaned, and com.cleandata=0 if the data are unclean.
\r
203 char *p,*p1, eq='.', comment0='[', *line;
\r
204 int format=0; /* 0: paml/phylip, 1: fasta; 2: paup/nexus */
\r
205 int i,j,k, ch, noptline=0, lspname=LSPNAME, miss=0, nb;
\r
206 int lline=10000,lt[NS], igroup, Sequential=1, basecoding=0;
\r
207 int n31=(com.seqtype==CODONseq||com.seqtype==CODON2AAseq?3:1);
\r
208 int gap=(n31==3?3:10), nchar=(com.seqtype==AAseq?20:4);
\r
210 char *pch=((com.seqtype<=1||com.seqtype==CODON2AAseq) ? BASEs : (com.seqtype==2 ? AAs: (com.seqtype==5 ? BASEs5 : BINs)));
\r
214 data.datatype[locus] = com.seqtype;
\r
216 str[0]=0; h=-1; b[0]=-1; /* avoid warning */
\r
217 com.readpattern = 0;
\r
218 if (com.seqtype==4) error2("seqtype==BINs, check with author");
\r
219 if (noisy>=9 && (com.seqtype<=CODONseq||com.seqtype==CODON2AAseq)) {
\r
220 puts("\n\nAmbiguity character definition table:\n");
\r
221 for(i=0; i<(int)strlen(BASEs); i++) {
\r
222 nb = strlen(EquateBASE[i]);
\r
223 printf("%c (%d): ", BASEs[i], nb);
\r
224 for(j=0; j<nb; j++) printf("%c ", EquateBASE[i][j]);
\r
228 GetSeqFileType(fseq, &format);
\r
230 if (com.ns>NS) error2("too many sequences.. raise NS?");
\r
231 if (com.ls%n31!=0) {
\r
232 printf ("\n%d nucleotides, not a multiple of 3!", com.ls); exit(-1);
\r
234 if (noisy) printf ("\nns = %d \tls = %d\n", com.ns, com.ls);
\r
236 for(j=0; j<com.ns; j++) {
\r
237 if(com.spname[j]) free(com.spname[j]);
\r
238 com.spname[j] = (char*)malloc((lspname+1)*sizeof(char));
\r
239 for(i=0; i<lspname+1; i++) com.spname[j][i]=0;
\r
240 if((com.z[j] = (unsigned char*)realloc(com.z[j],com.ls*sizeof(unsigned char))) == NULL)
\r
243 com.rgene[0] = 1; com.ngene = 1;
\r
244 lline = max2(lline, com.ls/n31*(n31+1)+lspname+50);
\r
245 if((line=(char*)malloc(lline*sizeof(char))) == NULL) error2("oom line");
\r
249 if(!fgets(line,lline,fseq)) error2("ReadSeq: first line");
\r
250 com.readpattern = (strchr(line, 'P') || strchr(line, 'p'));
\r
252 if(strchr(line, 'M') || strchr(line, 'm')) data.datatype[locus] = MORPHC;
\r
256 if(data.datatype[locus] == MORPHC) { /* morhpological data */
\r
257 ReadMorphology(fout, fseq);
\r
262 if(!com.readpattern) {
\r
263 if((com.pose=(int*)realloc(com.pose, com.ls/n31*sizeof(int)))==NULL)
\r
264 error2("oom pose");
\r
265 for(j=0; j<com.ls/n31; j++) com.pose[j]=0; /* gene #1, default */
\r
268 if(com.pose) free(com.pose);
\r
271 if(format) goto readseq;
\r
273 for (j=0; j<lline && line[j] && line[j]!='\n'; j++) {
\r
274 if (!isalnum(line[j])) continue;
\r
275 line[j]=(char)toupper(line[j]);
\r
277 case 'G': noptline++; break;
\r
278 case 'C': basecoding=1; break;
\r
279 case 'S': Sequential=1; break;
\r
280 case 'I': Sequential=0; break;
\r
281 case 'P': break; /* already dealt with. */
\r
283 printf ("\nBad option '%c' in first line of seqfile\n", line[j]);
\r
287 if (strchr(line,'C')) { /* protein-coding DNA sequences */
\r
288 if(com.seqtype==2) error2("option C?");
\r
289 if(com.seqtype==0) {
\r
290 if (com.ls%3!=0 || noptline<1) error2("option C?");
\r
292 for(i=0;i<3;i++) com.lgene[i]=com.ls/3;
\r
293 #if(defined(BASEML) || defined(BASEMLG))
\r
295 if(com.readpattern)
\r
296 error2("partterns for coding sequences (G C P) not implemented.");
\r
298 for (i=0;i<com.ls;i++) com.pose[i]=(char)(i%3);
\r
306 for(j=0; j<noptline; j++) {
\r
308 ch = (char)fgetc(fseq);
\r
309 if(ch == comment0)
\r
310 PopupComment(fseq);
\r
311 if(isalnum(ch)) break;
\r
314 ch = (char)toupper(ch);
\r
317 if(basecoding) error2("Error in sequence data file: incorrect option format, use GC?\n");
\r
318 if (fscanf(fseq,"%d",&com.ngene)!=1) error2("expecting #gene here..");
\r
319 if (com.ngene>NGENE) error2("raise NGENE?");
\r
321 fgets(line,lline,fseq);
\r
322 if (!blankline(line)) { /* #sites in genes on the 2nd line */
\r
323 for (i=0,p=line; i<com.ngene; i++) {
\r
324 while (*p && !isalnum(*p)) p++;
\r
325 if (sscanf(p,"%d",&com.lgene[i])!=1) break;
\r
326 while (*p && isalnum(*p)) p++;
\r
328 /* if ngene is large and some lgene is on the next line */
\r
329 for (; i<com.ngene; i++)
\r
330 if (fscanf(fseq,"%d", &com.lgene[i])!=1) error2("EOF at lgene");
\r
332 for(i=0,k=0; i<com.ngene; i++)
\r
334 if(k!=com.ls/n31) {
\r
335 matIout(F0, com.lgene, 1, com.ngene);
\r
336 printf("\n%6d != %d", com.ls/n31, k);
\r
337 puts("\nOption G: total length over genes is not correct");
\r
338 if(com.seqtype==1) {
\r
339 puts("Note: gene length is in number of codons.");
\r
341 puts("Sequence length in number of nucleotides.");
\r
344 if(!com.readpattern)
\r
345 for(i=0,k=0; i<com.ngene; k+=com.lgene[i],i++)
\r
346 for(j=0; j<com.lgene[i]; j++)
\r
350 else { /* site marks on later line(s) */
\r
351 if(com.readpattern)
\r
352 error2("option PG: use number of patterns in each gene and not site marks");
\r
353 for(k=0; k<com.ls/n31; ) {
\r
354 if (com.ngene>9) fscanf(fseq,"%d", &ch);
\r
356 do ch=fgetc(fseq); while (!isdigit(ch));
\r
357 ch=ch-(int)'1'+1; /* assumes 1,2,...,9 are consecutive */
\r
359 if (ch<1 || ch>com.ngene)
\r
360 { printf("\ngene mark %d at %d?\n", ch, k+1); exit (-1); }
\r
361 com.pose[k++]=ch-1;
\r
363 if(!fgets(line,lline,fseq)) error2("sequence file, gene marks");
\r
367 printf ("Bad option '%c' in option lines in seqfile\n", line[0]);
\r
373 /* read sequence */
\r
374 if (Sequential) { /* sequential */
\r
375 if (noisy) printf ("Reading sequences, sequential format..\n");
\r
376 for (j=0; j<com.ns; j++) {
\r
378 for (i=0; i<2*lspname; i++) line[i]='\0';
\r
379 if (!fgets (line, lline, fseq)) error2("EOF?");
\r
380 if (blankline(line)) {
\r
381 if (PopEmptyLines (fseq, lline, line))
\r
382 { printf("error in sequence data file: empty line (seq %d)\n",j+1); exit(-1); }
\r
384 p = line+(line[0]=='=' || line[0]=='>') ;
\r
385 while(isspace(*p)) p++;
\r
386 if ((ch=strstr(p," ")-p)<lspname && ch>0) lspname=ch;
\r
387 strncpy (com.spname[j], p, lspname);
\r
388 k = strlen(com.spname[j]);
\r
389 p += (k<lspname?k:lspname);
\r
391 for (; k>0; k--) /* trim spaces */
\r
392 if (!isgraph(com.spname[j][k])) com.spname[j][k]=0;
\r
395 if (noisy>=2) printf ("Reading seq #%2d: %s \r", j+1, com.spname[j]);
\r
396 for (k=0; k<com.ls; p++) {
\r
397 while (*p=='\n' || *p=='\0') {
\r
398 p=fgets(line, lline, fseq);
\r
400 { printf("\nEOF at site %d, seq %d\n", k+1,j+1); exit(-1); }
\r
402 *p = (char)toupper(*p);
\r
403 if((com.seqtype==BASEseq || com.seqtype==CODONseq) && *p=='U')
\r
405 p1 = strchr(pch, *p);
\r
406 if (p1 && p1-pch>=nchar)
\r
409 if (j==0) error2("Error in sequence data file: . in 1st seq.?");
\r
410 com.z[j][k] = com.z[0][k]; k++;
\r
413 com.z[j][k++] = *p;
\r
414 else if (isalpha(*p)) {
\r
415 printf("\nError in sequence data file: %c at %d seq %d.\n",*p,k+1,j+1);
\r
416 puts("Make sure to separate the sequence from its name by 2 or more spaces.");
\r
419 else if (*p == (char)EOF) error2("EOF?");
\r
421 if(strchr(p,'\n')==NULL) /* pop up line return */
\r
422 while((ch=fgetc(fseq))!='\n' && ch!=EOF) ;
\r
423 } /* for (j,com.ns) */
\r
425 else { /* interlaved */
\r
426 if (noisy) printf ("Reading sequences, interlaved format..\n");
\r
427 FOR (j, com.ns) lt[j]=0; /* temporary seq length */
\r
428 for (igroup=0; ; igroup++) {
\r
430 printf ("\nreading block %d ", igroup+1); matIout(F0,lt,1,com.ns);*/
\r
432 FOR (j, com.ns) if (lt[j]<com.ls) break;
\r
433 if (j==com.ns) break;
\r
435 if (!fgets(line,lline,fseq)) {
\r
436 printf("\nerr reading site %d, seq %d group %d\nsites read in each seq:",
\r
437 lt[j]+1,j+1,igroup+1);
\r
440 if (!hasbase(line)) {
\r
442 printf ("\n%d, seq %d group %d", lt[j]+1, j+1, igroup+1);
\r
443 error2("empty line.");
\r
446 if (PopEmptyLines(fseq,lline,line)==-1) {
\r
447 printf ("\n%d, seq %d group %d", lt[j]+1, j+1, igroup+1);
\r
454 while(isspace(*p)) p++;
\r
455 if ((ch=strstr(p," ")-p)<lspname && ch>0)
\r
457 strncpy (com.spname[j], p, lspname);
\r
458 k = strlen(com.spname[j]);
\r
459 p += (k<lspname?k:lspname);
\r
461 for (; k>0; k--) /* trim spaces */
\r
462 if (!isgraph(com.spname[j][k]))
\r
463 com.spname[j][k]=0;
\r
466 if(noisy>=2) printf("Reading seq #%2d: %s \r",j+1,com.spname[j]);
\r
468 for (; *p && *p!='\n'; p++) {
\r
469 if (lt[j]==com.ls) break;
\r
470 *p = (char)toupper(*p);
\r
471 if((com.seqtype==BASEseq || com.seqtype==CODONseq) && *p=='U')
\r
473 p1 = strchr(pch, *p);
\r
474 if (p1 && p1-pch>=nchar)
\r
478 printf("err: . in 1st seq, group %d.\n",igroup);
\r
481 com.z[j][lt[j]] = com.z[0][lt[j]];
\r
485 com.z[j][lt[j]++]=*p;
\r
486 else if (isalpha(*p)) {
\r
487 printf("\nerr: unrecognised character %c at %d seq %d block %d.",
\r
488 *p,lt[j]+1,j+1,igroup+1);
\r
491 else if (*p==(char)EOF) error2("EOF");
\r
493 } /* for (j,com.ns) */
\r
496 printf("\nblock %3d:", igroup+1);
\r
497 for(j=0;j<com.ns;j++) printf(" %6d",lt[j]);
\r
500 } /* for (igroup) */
\r
505 /* mask stop codons as ???. */
\r
506 if(com.seqtype==1 && MarkStopCodons())
\r
512 else if (cleandata) { /* forced removal of ambiguity characters */
\r
513 if(noisy>2) puts("\nSites with gaps or missing data are removed.");
\r
515 fprintf(fout,"\nBefore deleting alignment gaps\n");
\r
516 fprintf(fout, " %6d %6d\n", com.ns, com.ls);
\r
517 printsma(fout,com.spname,com.z,com.ns,com.ls,com.ls,gap,com.seqtype,0,0,NULL);
\r
520 if(fout) fprintf(fout,"\nAfter deleting gaps. %d sites\n",com.ls);
\r
523 if(fout && !com.readpattern) {/* verbose=1, listing sequences again */
\r
524 fprintf(fout, " %6d %6d\n", com.ns, com.ls);
\r
525 printsma(fout,com.spname,com.z,com.ns,com.ls,com.ls,gap,com.seqtype,0,0,NULL);
\r
528 if(n31==3) com.ls/=n31;
\r
530 /* IdenticalSeqs(); */
\r
533 if(com.seqtype==1 && com.verbose) Get4foldSites();
\r
535 if(com.seqtype==CODON2AAseq) {
\r
536 if (noisy>2) puts("\nTranslating into AA sequences\n");
\r
537 for(j=0; j<com.ns; j++) {
\r
538 if (noisy>2) printf("Translating sequence %d\n",j+1);
\r
539 DNA2protein(com.z[j], com.z[j], com.ls,com.icode);
\r
544 fputs("\nTranslated AA Sequences\n",fout);
\r
545 fprintf(fout,"%4d %6d",com.ns,com.ls);
\r
546 printsma(fout,com.spname,com.z,com.ns,com.ls,com.ls,10,com.seqtype,0,0,NULL);
\r
551 #if (defined CODEML || defined BASEML)
\r
552 if(com.ngene==1 && com.Mgene==1) com.Mgene=0;
\r
553 if(com.ngene>1 && com.Mgene==1 && com.verbose) printSeqsMgenes ();
\r
555 if(com.bootstrap) { BootstrapSeq("boot.txt"); exit(0); }
\r
559 #if (defined CODEML)
\r
560 /* list sites with 2 types of serine codons: TC? and TCY. 19 March 2014, Ziheng. */
\r
564 for(h=0; h<com.ls; h++) {
\r
565 for(i=0,nbox0=nbox1=0; i<com.ns; i++) {
\r
566 codon[0]=com.z[i][h*3+0]; codon[1]=com.z[i][h*3+1]; codon[2]=com.z[i][h*3+2];
\r
567 if(codon[0]=='T' && codon[1]=='C') nbox0++;
\r
568 else if(codon[0]=='A' && codon[1]=='G' && (codon[2]=='T' || codon[2]=='C')) nbox1++;
\r
570 if(nbox0 && nbox1 && nbox0+nbox1==com.ns) {
\r
571 printf("\ncodon %7d: ", h+1);
\r
572 for(i=0; i<com.ns; i++)
\r
573 printf("%c%c%c ", com.z[i][h*3+0], com.z[i][h*3+1], com.z[i][h*3+2]);
\r
581 if(noisy>=2) printf ("\nSequences read..\n");
\r
583 puts("no sites. Got nothing to do");
\r
587 #if (defined MCMCTREE)
\r
588 /* Check and remove empty sequences. */
\r
590 if(com.cleandata==0)
\r
591 RemoveEmptySequences();
\r
595 if(!com.readpattern)
\r
597 else { /* read pattern counts */
\r
598 com.npatt = com.ls;
\r
599 if((com.fpatt=(double*)realloc(com.fpatt, com.npatt*sizeof(double))) == NULL)
\r
600 error2("oom fpatt");
\r
601 for(h=0,lst=0; h<com.npatt; h++) {
\r
602 fscanf(fseq, "%lf", &com.fpatt[h]);
\r
603 lst += com.fpatt[h];
\r
604 if(com.fpatt[h]<0 || com.fpatt[h]>1e6)
\r
605 printf("fpatth[%d] = %.6g\n", h+1, com.fpatt[h]);
\r
609 if(noisy) printf("\n%d site patterns read, %d sites\n", com.npatt, com.ls);
\r
611 if(com.ngene==1) {
\r
612 com.lgene[0] = com.ls;
\r
614 com.posG[1] = com.npatt;
\r
617 for(j=0,com.posG[0]=0; j<com.ngene; j++)
\r
618 com.posG[j+1] = com.posG[j] + com.lgene[j];
\r
620 for(j=0; j<com.ngene; j++) {
\r
621 com.lgene[j] = (j==0 ? 0 : com.lgene[j-1]);
\r
622 for(h=com.posG[j]; h<com.posG[j+1]; h++)
\r
623 com.lgene[j] += (int)com.fpatt[h];
\r
631 fprintf(fout,"\nPrinting out site pattern counts\n\n");
\r
632 printPatterns(fout);
\r
639 #if(defined CODEML)
\r
641 int MarkStopCodons(void)
\r
643 /* this converts the whole column into ??? if there is a stop codon in one sequence.
\r
644 Data in com.z[] are just read in and not encoded yet.
\r
646 int i,j,h,k, NColumnEdited=0;
\r
647 char codon[4]="", stops[6][4]={"","",""}, nstops=0;
\r
649 if(com.seqtype!=1) error2("should not be here");
\r
651 for(i=0; i<64; i++)
\r
652 if(GeneticCode[com.icode][i]==-1)
\r
653 getcodon(stops[nstops++], i);
\r
655 for(h=0; h<com.ls/3; h++) {
\r
656 for(i=0; i<com.ns; i++) {
\r
657 codon[0] = com.z[i][h*3+0];
\r
658 codon[1] = com.z[i][h*3+1];
\r
659 codon[2] = com.z[i][h*3+2];
\r
660 for(j=0; j<nstops; j++)
\r
661 if(strcmp(codon, stops[j])==0) {
\r
662 printf("stop codon %s in seq. # %3d (%s)\r", codon, i+1, com.spname[i]);
\r
665 if(j<nstops) break;
\r
668 for(i=0; i<com.ns; i++)
\r
669 com.z[i][h*3+0] = com.z[i][h*3+1] = com.z[i][h*3+2] = '?';
\r
673 if(NColumnEdited) {
\r
674 printf("\n%2d columns are converted into ??? because of stop codons\nPress Enter to continue", NColumnEdited);
\r
677 return(NColumnEdited);
\r
683 void RemoveEmptySequences(void)
\r
685 /* this removes empty sequences (? or - only) and adjusts com.ns
\r
690 for(j=0; j<com.ns; j++) {
\r
692 for(h=0; h<com.ls*(com.seqtype==1?3:1); h++)
\r
693 if(com.z[j][h] != '?' && com.z[j][h] != '-') {
\r
698 for(j=0,nsnew=0; j<com.ns; j++) {
\r
700 printf("seq #%3d: %-30s is removed\n", j+1, com.spname[j]);
\r
702 free(com.spname[j]);
\r
705 com.z[nsnew] = com.z[j];
\r
706 com.spname[nsnew] = com.spname[j];
\r
709 for(j=nsnew; j<com.ns; j++) {
\r
711 com.spname[j] = NULL;
\r
717 int printPatterns(FILE *fout)
\r
719 int j,h, n31 = (com.seqtype==CODONseq||com.seqtype==CODON2AAseq ? 3 : 1);
\r
720 int gap=(n31==3?3:10), n=(com.seqtype==AAseq?20:4);
\r
722 fprintf(fout,"\n%10d %10d P", com.ns, com.npatt*n31);
\r
724 fprintf (fout," G\nG %d ", com.ngene);
\r
725 for(j=0; j<com.ngene; j++)
\r
726 fprintf(fout,"%7d", com.posG[j+1]-com.posG[j]);
\r
730 if(com.seqtype==1 && com.cleandata) {
\r
731 ; /* nothing is printed out for yn00, as the coding is different. */
\r
732 #if(defined CODEML || defined YN00)
\r
733 printsmaCodon (fout, com.z, com.ns, com.npatt, com.npatt, 1);
\r
737 printsma(fout,com.spname,com.z,com.ns, com.npatt,com.npatt, gap, com.seqtype, 1, 0, NULL);
\r
738 if(com.ls>1.0001) {
\r
739 fprintf(fout, "\n");
\r
740 for(h=0; h<com.npatt; h++) {
\r
741 fprintf(fout," %4.0f", com.fpatt[h]);
\r
742 if((h+1)%15 == 0) FPN(fout);
\r
744 fprintf(fout, "\n\n");
\r
751 void EncodeSeqs (void)
\r
753 /* This encodes sequences and sets up com.TipMap[][], called after sites are collapsed
\r
756 int n=com.ncode, nA, is,h, i, j, k,ic, indel=0, ch, b[3];
\r
757 char *pch = ((com.seqtype==0||com.seqtype==1) ? BASEs : (com.seqtype==2 ? AAs : (com.seqtype==5 ? BASEs5: BINs)));
\r
758 unsigned char c[4]="", str[4]=" ";
\r
760 if(com.seqtype != 1) {
\r
761 for(is=0; is<com.ns; is++) {
\r
762 for (h=0; h<com.npatt; h++) {
\r
764 com.z[is][h] = (char)(k = strchr(pch, ch) - pch);
\r
766 printf("strange character %c in seq %d site %d\n", ch, is+1, h+1);
\r
772 #if (defined CODEML || defined YN00)
\r
773 else if(com.seqtype==1) {
\r
774 /* collect all observed codons into CODONs, with a maximum of 256 distinct codons. */
\r
775 memset(&CODONs[0][0], 0, 256*4*sizeof(char));
\r
776 for(nA=0; nA<n; nA++) {
\r
777 ic=FROM61[nA]; b[0]=ic/16; b[1]=(ic/4)%4; b[2]=ic%4;
\r
778 for(i=0; i<3; i++) CODONs[nA][i] = BASEs[b[i]];
\r
780 for(j=0,nA=n; j<com.ns; j++) {
\r
781 for(h=0; h<com.npatt; h++) {
\r
782 for(k=0; k<3; k++) {
\r
783 c[k] = com.z[j][h*3+k];
\r
784 b[k] = strchr(BASEs,c[k]) - BASEs;
\r
785 if(b[k]<0) printf("strange nucleotide %c in seq %d\n", c[k], j+1);
\r
787 if(b[0]<4 && b[1]<4 && b[2]<4) {
\r
788 k = FROM64[b[0]*16 + b[1]*4 + b[2]];
\r
790 printf("\nstop codon %s in seq #%2d: %s\n", c, j+1, com.spname[j]);
\r
791 printf("\ncodons in other sequences are\n");
\r
792 for(i=0; i<com.ns; i++) {
\r
793 for(k=0; k<3; k++) c[k] = com.z[i][h*3+k];
\r
794 printf("seq #%2d %-30s %s\n", i+1, com.spname[i], c);
\r
799 else { /* an ambiguous codon */
\r
800 for(k=n; k<nA; k++)
\r
801 if(strcmp(CODONs[k], c) == 0) break;
\r
805 error2("too many ambiguity codons in the data. Contact author");
\r
806 strcpy(CODONs[nA-1], c);
\r
808 com.z[j][h] = (unsigned char)k;
\r
810 com.z[j] = (unsigned char*)realloc(com.z[j], com.npatt);
\r
813 printf("%d ambiguous codons are seen in the data:\n", nA - n);
\r
814 for(k=n; k<nA; k++) printf("%4s", CODONs[k]);
\r
822 void SetMapAmbiguity (void)
\r
824 /* This sets up CharaMap, the map from the ambiguity characters to resolved characters.
\r
826 int n=com.ncode, i,j, i0,i1,i2, nb[3], ib[3][4], ic;
\r
827 char *pch = (com.seqtype==0 ? BASEs : (com.seqtype==2 ? AAs : (com.seqtype==5 ? BASEs5: BINs)));
\r
828 char *pbases = (com.seqtype==0 ? BASEs : (com.seqtype==5 ? BASEs5: NULL));
\r
829 char **pEquateBASE = (com.seqtype==0 ? EquateBASE : (com.seqtype==5 ? EquateBASE5 : NULL));
\r
832 for(j=0; j<n; j++) { /* basic characters, coded according to the definition in pch. */
\r
833 nChara[j] = (char)1;
\r
834 CharaMap[j][0] = (char)j;
\r
837 if(com.seqtype != 1) {
\r
838 for(j=n,pch+=n; *pch; j++,pch++) {
\r
839 if(com.seqtype==0 || com.seqtype==5) { /* ambiguities are allowed for those 2 types */
\r
840 nChara[j] = (char)strlen(pEquateBASE[j]);
\r
841 for(i=0; i<nChara[j]; i++)
\r
842 CharaMap[j][i] = (char)(strchr(pbases, pEquateBASE[j][i]) - pbases);
\r
844 else { /* for non-nucleotide characters, ambiguity characters must be ? or -. */
\r
845 nChara[j] = (char)n;
\r
847 CharaMap[j][i] = (char)i;
\r
850 printf("character %c (%d): ", pbases[j], nChara[j]);
\r
851 for(i=0; i<nChara[j]; i++)
\r
852 printf("%c", pbases[CharaMap[j][i]]);
\r
859 for(j=n; j<256 && CODONs[j][0]; j++) {
\r
860 nChara[j] = (char)0;
\r
862 NucListall(CODONs[j][i], &nb[i], ib[i]);
\r
863 for(i0=0; i0<nb[0]; i0++) {
\r
864 for(i1=0; i1<nb[1]; i1++)
\r
865 for(i2=0; i2<nb[2]; i2++) {
\r
866 ic = ib[0][i0]*16+ib[1][i1]*4+ib[2][i2];
\r
867 if(GeneticCode[com.icode][ic] != -1)
\r
868 CharaMap[j][nChara[j]++] = FROM64[ic];
\r
872 printf("\ncodon %s is stop codon", CODONs[j]);
\r
881 int IdenticalSeqs(void)
\r
883 /* This checks for identical sequences and creates a data set of unique
\r
884 sequences. The file name is <SeqDataFile>.unique. This is casually
\r
885 written and needs more testing.
\r
886 The routine is called right after the sequence data are read.
\r
887 For codon sequences, com.ls has the number of codons, which are NOT
\r
890 char tmpf[96], keep[NS];
\r
892 int is,js,h, same,nkept=com.ns;
\r
893 int ls1=com.ls*(com.seqtype==CODONseq||com.seqtype==CODON2AAseq?3:1);
\r
895 puts("\nIdenticalSeqs: not tested\a");
\r
896 for(is=0; is<com.ns; is++)
\r
898 for(is=0; is<com.ns; is++) {
\r
899 if(!keep[is]) continue;
\r
900 for(js=0; js<is; js++) {
\r
901 for(h=0,same=1; h<ls1; h++)
\r
902 if(com.z[is][h] != com.z[js][h]) break;
\r
904 printf("Seqs. %3d & %3d (%s & %s) are identical!\n",
\r
905 js+1,is+1,com.spname[js],com.spname[is]);
\r
910 for(is=0; is<com.ns; is++)
\r
911 if(!keep[is]) nkept--;
\r
913 strcpy(tmpf, com.seqf);
\r
914 strcat(tmpf, ".unique");
\r
915 if((ftmp=fopen(tmpf,"w"))==NULL) error2("IdenticalSeqs: file error");
\r
916 printSeqs(ftmp, NULL, keep, 1);
\r
918 printf("\nUnique sequences collected in %s.\n", tmpf);
\r
924 void AllPatterns (FILE* fout)
\r
926 /* This prints out an alignment containing all possible site patterns, and then exits.
\r
927 This alignment may be useful to generate a dataset of infinitely long sequences,
\r
928 summarized in the site pattern probabilities.
\r
929 Because the PatternWeight() function changes the order of patterns, this routine
\r
930 prints out the alignment as one of patterns, with lots of 1's below it, to avoid
\r
931 baseml or codeml calling that routine to collapse sites.
\r
932 You then replace those 1's with the calculated pattern probabilities for further
\r
937 int n31=(com.seqtype==CODONseq||com.seqtype==CODON2AAseq?3:1);
\r
938 int gap=(n31==3?3:10);
\r
941 for(j=0,com.npatt=1; j<com.ns; j++) com.npatt*=com.ncode;
\r
942 printf ("%3d species, %d site patterns\n", com.ns, com.npatt);
\r
944 for(j=0; j<com.ns; j++) {
\r
945 com.spname[j] = (char*)realloc(com.spname[j], 11*sizeof(char));
\r
946 sprintf(com.spname[j], "%c ", 'a'+j);
\r
948 for(j=0; j<com.ns; j++)
\r
949 if((com.z[j]=(unsigned char*) malloc(com.npatt*sizeof(char))) == NULL)
\r
950 error2("oom in AllPatterns");
\r
951 for (h=0; h<com.npatt; h++) {
\r
952 for (j=0,it=h; j<com.ns; j++) {
\r
955 com.z[com.ns-1-j][h] = (char)ic;
\r
958 com.ls = com.npatt;
\r
960 fprintf(fout, " %6d %6d P\n", com.ns, com.ls*n31);
\r
961 if(com.seqtype==1) {
\r
962 #if(defined CODEML || defined YN00)
\r
963 printsmaCodon (fout, com.z, com.ns, com.ls, com.ls, 0);
\r
967 printsma(fout,com.spname,com.z,com.ns, com.ls, com.ls, gap, com.seqtype, 1, 0, NULL);
\r
969 for(h=0; h<com.npatt; h++) {
\r
970 fprintf(fout, " 1");
\r
971 if((h+1)%40==0) FPN(fout);
\r
978 int PatternWeight (void)
\r
980 /* This collapses sites into patterns, for nucleotide, amino acid, or codon sequences.
\r
981 This relies on \0 being the end of the string so that sequences should not be
\r
982 encoded before this routine is called.
\r
983 com.pose[i] has labels for genes as input and maps sites to patterns in return.
\r
984 com.fpatt, a vector of doubles, wastes space as site pattern counts are integers.
\r
985 Sequences z[ns*ls] are copied into patterns zt[ls*lpatt], and bsearch is used
\r
986 twice to avoid excessive copying, to count npatt first & to generate fpatt etc.
\r
988 int maxnpatt=com.ls, h, ip,l,u, j, k, same, ig, *poset;
\r
989 int gap = (com.seqtype==CODONseq ? 3 : 10);
\r
990 int n31 = (com.seqtype==CODONseq ? 3 : 1);
\r
991 int lpatt=com.ns*n31+1; /* extra 0 used for easy debugging, can be voided */
\r
992 int *p2s; /* point patterns to sites in zt */
\r
993 char *zt, *p, timestr[36];
\r
994 double nc = (com.seqtype == 1 ? 64 : com.ncode) + !com.cleandata+1;
\r
999 Collect and sort patterns. Get com.npatt, com.lgene, com.posG.
\r
1000 Move sequences com.z[ns][ls] into sites zt[ls*lpatt].
\r
1001 Use p2s to map patterns to sites in zt to avoid copying.
\r
1003 if(noisy) printf("Counting site patterns.. %s\n", printtime(timestr));
\r
1005 if((com.seqtype==1 && com.ns<5) || (com.seqtype!=1 && com.ns<7))
\r
1006 maxnpatt = (int)(pow(nc, (double)com.ns) + 0.5) * com.ngene;
\r
1007 if(maxnpatt>com.ls) maxnpatt = com.ls;
\r
1008 p2s = (int*)malloc(maxnpatt*sizeof(int));
\r
1009 zt = (char*)malloc(com.ls*lpatt*sizeof(char));
\r
1010 if(p2s==NULL || zt==NULL) error2("oom p2s or zt");
\r
1011 memset(zt, 0, com.ls*lpatt*sizeof(char));
\r
1012 for(j=0; j<com.ns; j++)
\r
1013 for(h=0; h<com.ls; h++)
\r
1014 for(k=0; k<n31; k++)
\r
1015 zt[h*lpatt+j*n31+k] = com.z[j][h*n31+k];
\r
1017 for(j=0; j<com.ns; j++) free(com.z[j]);
\r
1019 for(ig=0; ig<com.ngene; ig++) com.lgene[ig] = 0;
\r
1020 for(ig=0,com.npatt=0; ig<com.ngene; ig++) {
\r
1021 com.posG[ig] = l = u = ip = com.npatt;
\r
1022 for(h=0; h<com.ls; h++) {
\r
1023 if(com.pose[h] != ig) continue;
\r
1024 if(debug) printf("\nh %3d %s", h, zt+h*lpatt);
\r
1026 /* bsearch in existing patterns. Knuth 1998 Vol3 Ed2 p.410
\r
1027 ip is the loc for match or insertion. [l,u] is the search interval.
\r
1030 if(com.lgene[ig]++ != 0) { /* not 1st pattern? */
\r
1031 for(l=com.posG[ig], u=com.npatt-1; ; ) {
\r
1034 k = strcmp(zt+h*lpatt, zt+p2s[ip]*lpatt);
\r
1035 if(k<0) u = ip - 1;
\r
1036 else if(k>0) l = ip + 1;
\r
1037 else { same = 1; break; }
\r
1041 if(com.npatt>maxnpatt)
\r
1042 error2("npatt > maxnpatt");
\r
1043 if(l > ip) ip++; /* last comparison in bsearch had k > 0. */
\r
1044 /* Insert new pattern at ip. This is the expensive step. */
\r
1047 memmove(p2s+ip+1, p2s+ip, (com.npatt-ip)*sizeof(int));
\r
1050 for(j=com.npatt; j>ip; j--)
\r
1051 p2s[j] = p2s[j-1];
\r
1058 printf(": %3d (%c ilu %3d%3d%3d) ", com.npatt, DS[same], ip, l, u);
\r
1059 for(j=0; j<com.npatt; j++)
\r
1060 printf(" %s", zt+p2s[j]*lpatt);
\r
1062 if(noisy && ((h+1)%10000==0 || h+1==com.ls))
\r
1063 printf("\r%12d patterns at %8d / %8d sites (%.1f%%), %s",
\r
1064 com.npatt, h+1, com.ls, (h+1.)*100/com.ls, printtime(timestr));
\r
1068 if(noisy) FPN(F0);
\r
1070 /* (B) count pattern frequencies and collect pose[] */
\r
1071 com.posG[com.ngene] = com.npatt;
\r
1072 for(j=0; j<com.ngene; j++)
\r
1073 if(com.lgene[j]==0)
\r
1074 error2("some gene labels are missing");
\r
1075 for(j=1; j<com.ngene; j++)
\r
1076 com.lgene[j] += com.lgene[j-1];
\r
1078 com.fpatt = (double*)realloc(com.fpatt, com.npatt*sizeof(double));
\r
1079 poset = (int*)malloc(com.ls*sizeof(int));
\r
1080 if(com.fpatt==NULL || poset==NULL) error2("oom poset");
\r
1081 for(ip=0; ip<com.npatt; ip++) com.fpatt[ip] = 0;
\r
1083 for(ig=0; ig<com.ngene; ig++) {
\r
1084 for(h=0; h<com.ls; h++) {
\r
1085 if(com.pose[h] != ig) continue;
\r
1086 for(same=0, l=com.posG[ig], u=com.posG[ig+1]-1; ; ) {
\r
1089 k = strcmp(zt+h*lpatt, zt+p2s[ip]*lpatt);
\r
1090 if(k<0) u = ip - 1;
\r
1091 else if(k>0) l = ip + 1;
\r
1092 else { same = 1; break; }
\r
1095 error2("ghost pattern?");
\r
1101 if(com.seqtype==CODONseq && com.ngene==3 &&com.lgene[0]==com.ls/3) {
\r
1102 puts("\nCheck option G in data file? (Enter)\n");
\r
1105 for(j=0; j<com.ns; j++) {
\r
1106 com.z[j] = (unsigned char*)malloc(com.npatt*n31*sizeof(char));
\r
1107 for(ip=0,p=com.z[j]; ip<com.npatt; ip++)
\r
1108 for(k=0; k<n31; k++)
\r
1109 *p++ = zt[p2s[ip]*lpatt + j*n31 + k];
\r
1111 memcpy(com.pose, poset, com.ls*sizeof(int));
\r
1112 free(poset); free(p2s); free(zt);
\r
1118 void AddFreqSeqGene(int js,int ig,double pi0[],double pi[]);
\r
1121 void Chi2FreqHomo(double f[], int ns, int nc, double X2G[2])
\r
1123 /* This calculates a chi-square like statistic for testing that the base
\r
1124 or amino acid frequencies are identical among sequences.
\r
1125 f[ns*nc] where ns is #sequences (rows) and nc is #states (columns).
\r
1128 double mf[64]={0}, small=1e-50;
\r
1131 for(i=0; i<ns; i++)
\r
1132 for(j=0; j<nc; j++)
\r
1133 mf[j]+=f[i*nc+j]/ns;
\r
1135 for(i=0; i<ns; i++) {
\r
1136 for(j=0; j<nc; j++) {
\r
1138 X2G[0] += square(f[i*nc+j]-mf[j])/mf[j];
\r
1140 X2G[1] += 2*f[i*nc+j]*log(f[i*nc+j]/mf[j]);
\r
1146 int InitializeBaseAA (FILE *fout)
\r
1148 /* Count site patterns (com.fpatt) and calculate base or amino acid frequencies
\r
1149 in genes and species. This works on raw (uncoded) data.
\r
1150 Ambiguity characters in sequences are resolved by iteration.
\r
1151 For frequencies in each species, they are resolved within that sequence.
\r
1152 For average base frequencies among species, they are resolved over all
\r
1155 This routine is called by baseml and aaml. codonml uses another
\r
1156 routine InitializeCodon()
\r
1158 char *pch = (com.seqtype==0 ? BASEs : (com.seqtype==2 ? AAs : (com.seqtype==5 ? BASEs5: BINs)));
\r
1159 char indel[]="-?";
\r
1160 int wname=30, h,js,k, ig, nconstp, n=com.ncode;
\r
1162 double pi0[20], t,lmax=0, X2G[2], *pisg; /* freq for species & gene, for X2 & G */
\r
1164 if(noisy) printf("Counting frequencies..");
\r
1165 if(fout) fprintf(fout,"\nFrequencies..");
\r
1166 if((pisg=(double*)malloc(com.ns*n*sizeof(double))) == NULL)
\r
1167 error2("oom pisg");
\r
1168 for(h=0,nconstp=0; h<com.npatt; h++) {
\r
1169 for (js=1; js<com.ns; js++)
\r
1170 if(com.z[js][h] != com.z[0][h]) break;
\r
1171 if (js==com.ns && com.z[0][h]!=indel[0] && com.z[0][h]!=indel[1])
\r
1172 nconstp += (int)com.fpatt[h];
\r
1174 for (ig=0,zero(com.pi,n); ig<com.ngene; ig++) {
\r
1176 fprintf (fout,"\n\nGene %2d (len %4d)", ig+1, com.lgene[ig]-(ig==0?0:com.lgene[ig-1]));
\r
1177 fprintf(fout,"\n%*s", wname, "");
\r
1178 for(k=0; k<n; k++) fprintf(fout,"%7c", pch[k]);
\r
1180 /* The following block calculates freqs in each species for each gene.
\r
1181 Ambiguities are resolved in each species. com.pi and com.piG are
\r
1182 used for output only, and are not used later with missing data.
\r
1184 zero(com.piG[ig], n);
\r
1185 zero(pisg, com.ns*n);
\r
1186 for(js=0; js<com.ns; js++) {
\r
1187 fillxc(pi0, 1.0/n, n);
\r
1188 for(irf=0; irf<nrf; irf++) {
\r
1190 AddFreqSeqGene(js, ig, pi0, com.pi);
\r
1191 t = sum(com.pi, n);
\r
1193 printf("Some sequences are empty.\n");
\r
1194 fillxc(com.pi, 1.0/n, n);
\r
1197 abyx(1/t, com.pi, n);
\r
1198 if(com.cleandata || com.cleandata || (t=distance(com.pi,pi0,n))<1e-8)
\r
1200 xtoy(com.pi, pi0, n);
\r
1202 fprintf(fout,"\n%-*s", wname, com.spname[js]);
\r
1203 for(k=0; k<n; k++) fprintf(fout, "%8.5f", com.pi[k]);
\r
1204 if(com.ncode==4 && com.ngene==1) fprintf(fout, " GC = %5.3f", com.pi[1]+com.pi[3]);
\r
1205 for(k=0; k<n; k++) com.piG[ig][k] += com.pi[k]/com.ns;
\r
1206 xtoy(com.pi, pisg+js*n, n);
\r
1207 } /* for(js,ns) */
\r
1209 fprintf(fout,"\n\n%-*s", wname, "Mean");
\r
1210 for(k=0; k<n; k++) fprintf(fout, "%7.4f", com.piG[ig][k]);
\r
1213 Chi2FreqHomo(pisg, com.ns, n, X2G);
\r
1215 fprintf(fout,"\n\nHomogeneity statistic: X2 = %.5f G = %.5f ",X2G[0], X2G[1]);
\r
1217 /* fprintf(frst1,"\t%.5f", X2G[1]); */
\r
1220 if(noisy) printf("\n");
\r
1222 /* If there are missing data, the following block calculates freqs
\r
1223 in each gene (com.piG[]), as well as com.pi[] for the entire sequence.
\r
1224 Ambiguities are resolved over entire data sets across species (within
\r
1225 each gene for com.piG[]). These are used in ML calculation later.
\r
1227 if(com.cleandata) {
\r
1228 for (ig=0,zero(com.pi,n); ig<com.ngene; ig++) {
\r
1229 t = (ig==0 ? com.lgene[0] : com.lgene[ig]-com.lgene[ig-1])/(double)com.ls;
\r
1230 for(k=0; k<n; k++) com.pi[k] += com.piG[ig][k]*t;
\r
1234 for (ig=0; ig<com.ngene; ig++) {
\r
1235 xtoy(com.piG[ig], pi0, n);
\r
1236 for(irf=0; irf<nrf; irf++) { /* com.piG[] */
\r
1237 zero(com.piG[ig], n);
\r
1238 for(js=0; js<com.ns; js++)
\r
1239 AddFreqSeqGene(js, ig, pi0, com.piG[ig]);
\r
1240 t = sum(com.piG[ig], n);
\r
1242 puts("empty sequences?");
\r
1243 abyx(1/t, com.piG[ig], n);
\r
1244 if(distance(com.piG[ig], pi0, n)<1e-8) break;
\r
1245 xtoy(com.piG[ig], pi0, n);
\r
1249 for(k=0; k<n; k++) for(ig=0; ig<com.ngene; ig++)
\r
1250 pi0[k] += com.piG[ig][k]/com.ngene;
\r
1251 for(irf=0; irf<nrf; irf++) { /* com.pi[] */
\r
1253 for(ig=0; ig<com.ngene; ig++) for(js=0; js<com.ns; js++)
\r
1254 AddFreqSeqGene(js, ig, pi0, com.pi);
\r
1255 abyx(1/sum(com.pi,n), com.pi, n);
\r
1256 if(distance(com.pi, pi0, n)<1e-8) break;
\r
1257 xtoy(com.pi, pi0, n);
\r
1260 fprintf (fout, "\n\n%-*s", wname, "Average");
\r
1261 for(k=0; k<n; k++) fprintf(fout,"%8.5f", com.pi[k]);
\r
1262 if(!com.cleandata) fputs("\n(Ambiguity characters are used to calculate freqs.)\n",fout);
\r
1264 fprintf (fout,"\n\n# constant sites: %6d (%.2f%%)",
\r
1265 nconstp, (double)nconstp*100./com.ls);
\r
1267 if (com.model==0 || (com.seqtype==BASEseq && com.model==1)) {
\r
1268 fillxc(com.pi, 1./n, n);
\r
1269 for(ig=0; ig<com.ngene; ig++)
\r
1270 xtoy(com.pi, com.piG[ig], n);
\r
1272 if (com.seqtype==BASEseq && com.model==5) { /* T92 model */
\r
1273 com.pi[0] = com.pi[2] = (com.pi[0] + com.pi[2])/2;
\r
1274 com.pi[1] = com.pi[3] = (com.pi[1] + com.pi[3])/2;
\r
1275 for(ig=0; ig<com.ngene; ig++) {
\r
1276 com.piG[ig][0] = com.piG[ig][2] = (com.piG[ig][0] + com.piG[ig][2])/2;
\r
1277 com.piG[ig][1] = com.piG[ig][3] = (com.piG[ig][1] + com.piG[ig][3])/2;
\r
1281 /* this is used only for REV & REVu in baseml and model==3 in aaml */
\r
1282 if(com.seqtype==AAseq) {
\r
1283 for (k=0,t=0; k<n; k++) t += (com.pi[k]>0);
\r
1285 puts("\n\a\t\tAre these a.a. sequences?");
\r
1287 if(com.cleandata && com.ngene==1) {
\r
1288 for(h=0,lmax=-(double)com.ls*log((double)com.ls); h<com.npatt; h++)
\r
1289 if(com.fpatt[h]>1) lmax += com.fpatt[h]*log((double)com.fpatt[h]);
\r
1292 if(lmax) fprintf(fout, "\nln Lmax (unconstrained) = %.6f\n", lmax);
\r
1301 void AddFreqSeqGene(int js, int ig, double pi0[], double pi[])
\r
1303 /* This adds the character counts in sequence js in gene ig to pi,
\r
1304 using pi0, by resolving ambiguities. The data are coded. com.cleandata==1 or 0.
\r
1305 This is for nucleotide and amino acid sequences only.
\r
1307 char *pch = (com.seqtype==0 ? BASEs : (com.seqtype==2 ? AAs : (com.seqtype==5 ? BASEs5: BINs)));
\r
1308 int k, h, b, n=com.ncode;
\r
1311 if(com.cleandata) {
\r
1312 for(h=com.posG[ig]; h<com.posG[ig+1]; h++)
\r
1313 pi[com.z[js][h]] += com.fpatt[h];
\r
1316 for(h=com.posG[ig]; h<com.posG[ig+1]; h++) {
\r
1319 pi[b] += com.fpatt[h];
\r
1322 if(com.seqtype==BASEseq) {
\r
1323 NucListall(BASEs[b], &nb, ib);
\r
1324 for(k=0,t=0; k<nb; k++) t += pi0[ib[k]];
\r
1325 for(k=0; k<nb; k++)
\r
1326 pi[ib[k]] += pi0[ib[k]]/t * com.fpatt[h];
\r
1329 if(com.seqtype==BASEseq) {
\r
1330 for(k=0,t=0; k<nChara[b]; k++)
\r
1331 t += pi0[CharaMap[b][k]];
\r
1332 for(k=0; k<nChara[b]; k++)
\r
1333 pi[CharaMap[b][k]] += pi0[CharaMap[b][k]]/t * com.fpatt[h];
\r
1335 else if(com.seqtype==AAseq) /* unrecognized AAs are treated as "?". */
\r
1336 for(k=0; k<n; k++) pi[k] += pi0[k]*com.fpatt[h];
\r
1343 int RemoveIndel(void)
\r
1345 /* Remove ambiguity characters and indels in the untransformed sequences,
\r
1346 Changing com.ls and com.pose[] (site marks for multiple genes).
\r
1347 For codonml, com.ls is still 3*#codons
\r
1348 Called at the end of ReadSeq, when com.pose[] are still site marks.
\r
1349 All characters in com.z[][] not found in the character string pch are
\r
1350 considered ambiguity characters and are removed.
\r
1352 int n=com.ncode, h,k, j,js,lnew,nindel, n31=1;
\r
1353 char b, *miss; /* miss[h]=1 if site (codon) h is missing, 0 otherwise */
\r
1354 char *pch=((com.seqtype<=1||com.seqtype==CODON2AAseq)?BASEs:(com.seqtype==2?AAs: (com.seqtype==5?BASEs5:BINs)));
\r
1356 if(com.seqtype==CODONseq || com.seqtype==CODON2AAseq) {
\r
1360 if (com.ls%n31) error2("ls in RemoveIndel.");
\r
1361 if((miss=(char*)malloc(com.ls/n31 *sizeof(char)))==NULL)
\r
1362 error2("oom miss");
\r
1363 for(h=0; h<com.ls/n31; h++)
\r
1365 for (js=0; js<com.ns; js++) {
\r
1366 for (h=0,nindel=0; h<com.ls/n31; h++) {
\r
1367 for (k=0; k<n31; k++) {
\r
1368 b = (char)toupper(com.z[js][h*n31+k]);
\r
1369 for(j=0; j<n; j++)
\r
1370 if(b==pch[j]) break;
\r
1372 miss[h]=1; nindel++;
\r
1376 if (noisy>2 && nindel)
\r
1377 printf("\n%6d ambiguity characters in seq. %d", nindel,js+1);
\r
1380 for(h=0,k=0; h<com.ls/n31; h++) if(miss[h]) k++;
\r
1381 printf("\n%d sites are removed. ", k);
\r
1383 for(h=0; h<com.ls/n31; h++) if(miss[h]) printf(" %2d", h+1);
\r
1386 for (h=0,lnew=0; h<com.ls/n31; h++) {
\r
1387 if(miss[h]) continue;
\r
1388 for (js=0; js<com.ns; js++) {
\r
1389 for (k=0; k<n31; k++)
\r
1390 com.z[js][lnew*n31+k]=com.z[js][h*n31+k];
\r
1392 com.pose[lnew]=com.pose[h];
\r
1402 int MPInformSites (void)
\r
1404 /* Outputs parsimony informative and noninformative sites into
\r
1405 two files named MPinf.seq and MPninf.seq
\r
1406 Uses transformed sequences.
\r
1407 Not used for a long time. Does not work if com.pose is NULL.
\r
1410 char *pch=(com.seqtype==0 ? BASEs : (com.seqtype==2 ? AAs: (com.seqtype==5?BASEs5:BINs)));
\r
1411 int h, i, markb[NS], inf, lsinf;
\r
1412 FILE *finf, *fninf;
\r
1414 puts("\nMPInformSites: missing data not dealt with yet?\n");
\r
1416 finf=fopen("MPinf.seq","w");
\r
1417 fninf=fopen("MPninf.seq","w");
\r
1418 if (finf==NULL || fninf==NULL) error2("MPInformSites: file creation error");
\r
1420 puts ("\nSorting parsimony-informative sites: MPinf.seq & MPninf.seq");
\r
1421 if ((imark=(char*)malloc(com.ls*sizeof(char)))==NULL) error2("oom imark");
\r
1422 for (h=0,lsinf=0; h<com.ls; h++) {
\r
1423 for (i=0; i<com.ns; i++) markb[i]=0;
\r
1424 for (i=0; i<com.ns; i++) markb[(int)com.z[i][com.pose[h]]]++;
\r
1426 for (i=0,inf=0; i<com.ncode; i++) if (markb[i]>=2) inf++;
\r
1427 if (inf>=2) { imark[h]=1; lsinf++; }
\r
1430 fprintf (finf, "%6d%6d\n", com.ns, lsinf);
\r
1431 fprintf (fninf, "%6d%6d\n", com.ns, com.ls-lsinf);
\r
1432 for (i=0; i<com.ns; i++) {
\r
1433 fprintf (finf, "\n%s\n", com.spname[i]);
\r
1434 fprintf (fninf, "\n%s\n", com.spname[i]);
\r
1435 for (h=0; h<com.ls; h++)
\r
1436 fprintf ((imark[h]?finf:fninf), "%c", pch[(int)com.z[i][com.pose[h]]]);
\r
1437 FPN (finf); FPN(fninf);
\r
1440 fclose(finf); fclose(fninf);
\r
1445 int PatternWeightJC69like (FILE *fout)
\r
1447 /* This collapses site patterns further for JC69-like models, called after
\r
1448 PatternWeight(). This is used for JC and poisson amino acid models.
\r
1449 The routine could be merged into PatternWeight(), which should lead to
\r
1450 faster computation, but this is not done because right now
\r
1451 InitializeBaseAA() prints out base or amino acid frequencies after
\r
1452 PatternWeight() and before this routine.
\r
1454 If the data have no ambiguities (com.cleandata=1), the routine recodes
\r
1455 the data, for example, changing data at a site 1120 (CCAT) into 0012
\r
1456 (TTCA) before checking against old patterns already found. If the data
\r
1457 contain ambiguities, they are not encoded. In that case, for every
\r
1458 site, the routine changes ? or N into - first. It then checks whether there
\r
1459 are any other ambiguities and will recode if and only if there are not
\r
1460 any other ambiguities. For example, a site with data CC?T will be
\r
1461 changed into CC-T first and then recoded into TT-C and checked against
\r
1462 old patterns found. A site with data CCRT will not be recoded. In theory
\r
1463 such sites may be packed as well, but perhaps the effort is not worthwhile.
\r
1464 The routine checks data like CCRT against old patterns already found,
\r
1466 If com.pose is not NULL, the routine also updates com.pose. This allows
\r
1467 the program to work if com.readpattern==1.
\r
1469 char zh[NS], b, gap;
\r
1470 char *pch=(com.seqtype==0 ? BASEs : (com.seqtype==2 ? AAs: (com.seqtype==5?BASEs5:BINs)));
\r
1471 int npatt0=com.npatt, h, ht, j,k, same=0, ig, recode;
\r
1473 if(com.seqtype==1)
\r
1474 error2("PatternWeightJC69like does not work for codon seqs");
\r
1475 if(noisy) printf("Counting site patterns again, for JC69.\n");
\r
1476 gap = (char) (strchr(pch, (int)'-') - pch);
\r
1477 for (h=0,com.npatt=0,ig=-1; h<npatt0; h++) {
\r
1478 if (ig<com.ngene-1 && h==com.posG[ig+1])
\r
1479 com.posG[++ig] = com.npatt;
\r
1481 if(com.cleandata) { /* clean data, always recode */
\r
1484 for (j=1; j<com.ns; j++) {
\r
1485 for(k=0; k<j; k++)
\r
1486 if (com.z[j][h]==com.z[k][h]) break;
\r
1487 zh[j] = (k<j ? zh[k] : b++);
\r
1490 else { /* recode only if there are no non-gap ambiguity characters */
\r
1491 for(j=0; j<com.ns; j++)
\r
1492 zh[j] = com.z[j][h];
\r
1494 /* After this loop, recode = 0 or 1 decides whether to recode. */
\r
1495 for (j=0,recode=1; j<com.ns; j++) {
\r
1496 if (zh[j] < com.ncode)
\r
1498 if (nChara[zh[j]] == com.ncode) {
\r
1509 for (j=1; j<com.ns; j++) {
\r
1510 if(zh[j] != gap) {
\r
1511 for(k=0; k<j; k++)
\r
1512 if (zh[j] == com.z[k][h]) break;
\r
1513 if(k<j) zh[j] = zh[k];
\r
1520 for (ht=com.posG[ig],same=0; ht<com.npatt; ht++) {
\r
1521 for (j=0,same=1; j<com.ns; j++)
\r
1522 if (zh[j]!=com.z[j][ht]) {
\r
1528 com.fpatt[ht] += com.fpatt[h];
\r
1530 for(j=0; j<com.ns; j++) com.z[j][com.npatt] = zh[j];
\r
1531 com.fpatt[com.npatt++] = com.fpatt[h];
\r
1534 for(k=0; k<com.ls; k++)
\r
1535 if(com.pose[k]==h) com.pose[k] = ht;
\r
1537 com.posG[com.ngene] = com.npatt;
\r
1538 if (noisy) printf ("new no. site patterns:%7d\n", com.npatt);
\r
1541 fprintf(fout, "\n\nPrinting out site pattern counts\n");
\r
1542 printPatterns(fout);
\r
1547 int Site2Pattern (FILE *fout)
\r
1550 fprintf(fout,"\n\nMapping site to pattern (i.e. site %d has pattern %d):\n",
\r
1551 com.ls-1, com.pose[com.ls-2]+1);
\r
1553 fprintf (fout, "%6d", com.pose[h]+1);
\r
1554 if ((h+1)%10==0) FPN (fout);
\r
1565 int print1seq (FILE*fout, char *z, int ls, int pose[])
\r
1567 /* This prints out one sequence, and the sequences are encoded.
\r
1568 z[] contains patterns if (pose!=NULL)
\r
1569 This uses com.seqtype.
\r
1571 int h, hp, gap=10;
\r
1572 char *pch=(com.seqtype==0 ? BASEs : (com.seqtype==2 ? AAs: (com.seqtype==5?BASEs5:BINs)));
\r
1574 int nb = (com.seqtype==CODONseq?3:1);
\r
1576 for(h=0; h<ls; h++) {
\r
1577 hp = (pose ? pose[h] : h);
\r
1578 if(com.seqtype != CODONseq) {
\r
1579 fprintf(fout, "%c", pch[(int)z[hp]]);
\r
1580 if((h+1)%gap==0) fputc(' ', fout);
\r
1583 fprintf(fout, "%s ", CODONs[z[hp]]);
\r
1588 void printSeqs (FILE *fout, int *pose, char keep[], int format)
\r
1590 /* Print sequences into fout, using paml (format=0 or 1) or paup (format=2)
\r
1592 Use pose=NULL if called before site patterns are collapsed.
\r
1593 keep[] marks the sequences to be printed. Use NULL for keep if all sequences
\r
1594 are to be printed.
\r
1595 Sequences may (com.cleandata==1) or may not (com.cleandata==0) be coded.
\r
1596 com.z[] has site patterns if pose!=NULL.
\r
1597 This uses com.seqtype, and com.ls is the number of codons for codon seqs.
\r
1598 See notes in print1seq()
\r
1600 format = 0,1: PAML sites or patterns
\r
1601 2: PAUP Nexus format.
\r
1603 This is used by evolver. Check and merge with printsma().
\r
1606 int h, j, ls1, n31=(com.seqtype==1?3:1), nskept=com.ns, wname=30;
\r
1607 char *dt=(com.seqtype==AAseq?"protein":"dna");
\r
1609 ls1 = (format==1 ? com.npatt : com.ls);
\r
1611 for(j=0; j<com.ns; j++) nskept -= !keep[j];
\r
1612 if(format==0 || format==1)
\r
1613 fprintf(fout, "\n\n%6d %7d %s\n\n", nskept, ls1*n31, (format==1?" P":""));
\r
1614 else if(format==2) { /* NEXUS format */
\r
1615 fprintf(fout,"\nbegin data;\n");
\r
1616 fprintf(fout," dimensions ntax=%d nchar=%d;\n", nskept, ls1*n31);
\r
1617 fprintf(fout," format datatype=%s missing=? gap=-;\n matrix\n",dt);
\r
1620 for(j=0; j<com.ns; j++,FPN(fout)) {
\r
1621 if(keep && !keep[j]) continue;
\r
1622 fprintf(fout,"%s%-*s ", (format==2?" ":""), wname, com.spname[j]);
\r
1623 print1seq(fout, com.z[j], (format==1?com.npatt:com.ls), pose);
\r
1625 if(format==2) fprintf(fout, " ;\nend;");
\r
1626 else if (format==1) {
\r
1627 for(h=0,FPN(fout); h<com.npatt; h++) {
\r
1628 /* fprintf(fout," %12.8f", com.fpatt[h]/(double)com.ls); */
\r
1629 fprintf(fout," %4.0f", com.fpatt[h]);
\r
1630 if((h+1)%15==0) FPN(fout);
\r
1634 fprintf(fout,"\n\n");
\r
1638 #define gammap(x,alpha) (alpha*(1-pow(x,-1.0/alpha)))
\r
1639 /* DistanceREV () used to be here, moved to pamp.
\r
1642 #if (defined BASEML || defined BASEMLG || defined MCMCTREE || defined PROBTREE || defined YULETREE)
\r
1644 double SeqDivergence (double x[], int model, double alpha, double *kappa)
\r
1646 /* alpha=0 if no gamma
\r
1647 return -1 if in error.
\r
1648 Check DistanceF84() if variances are wanted.
\r
1651 double p[4], Y,R, a1,a2,b, P1,P2,Q,fd,tc,ag, GC;
\r
1652 double small=1e-10/com.ls,largek=999, larged=9;
\r
1654 if (testXMat(x)) {
\r
1655 matout(F0, x, 4, 4);
\r
1656 printf("\nfrequency matrix error, setting distance to large d");
\r
1659 for (i=0,fd=1,zero(p,4); i<4; i++) {
\r
1661 FOR (j,4) { p[i]+=x[i*4+j]/2; p[j]+=x[i*4+j]/2; }
\r
1663 P1 = x[0*4+1]+x[1*4+0];
\r
1664 P2 = x[2*4+3]+x[3*4+2];
\r
1665 Q = x[0*4+2]+x[0*4+3]+x[1*4+2]+x[1*4+3]+ x[2*4+0]+x[2*4+1]+x[3*4+0]+x[3*4+1];
\r
1668 if(P1<small) P1=0;
\r
1669 if(P2<small) P2=0;
\r
1671 Y=p[0]+p[1]; R=p[2]+p[3]; tc=p[0]*p[1]; ag=p[2]*p[3];
\r
1675 FOR (i,4) p[i]=.25;
\r
1677 for (i=0,b=0; i<4; i++) b += p[i]*(1-p[i]);
\r
1678 if (1-fd/b<=0) return (larged);
\r
1680 if (alpha<=0) return (-b*log (1-fd/b));
\r
1681 else return (-b*gammap(1-fd/b,alpha));
\r
1684 printf("\nP Q = %.6f %.6f\n", P1+P2,Q);
\r
1685 printf("\nP1 P2 Q = %.6f %.6f %.6f\n", P1,P2,Q);
\r
1687 a1=1-2*(P1+P2)-Q; b=1-2*Q;
\r
1688 /* if (a1<=0 || b<=0) return (-1); */
\r
1689 if (a1<=0 || b<=0) return (larged);
\r
1690 if (alpha<=0) { a1=-log(a1); b=-log(b); }
\r
1691 else { a1=-gammap(a1,alpha); b=-gammap(b,alpha); }
\r
1692 a1=.5*a1-.25*b; b=.25*b;
\r
1693 if(b>small) *kappa = a1/b; else *kappa=largek;
\r
1696 if(Y<small || R<small)
\r
1697 error2("Y or R = 0.");
\r
1699 a1=(2*(tc+ag)+2*(tc*R/Y+ag*Y/R)*(1-Q/(2*Y*R)) -P1-P2) / (2*tc/Y+2*ag/R);
\r
1700 b = 1 - Q/(2*Y*R);
\r
1701 /* if (a1<=0 || b<=0) return (-1); */
\r
1702 if (a1<=0 || b<=0) return (larged);
\r
1703 if (alpha<=0) { a1=-log(a1); b=-log(b); }
\r
1704 else { a1=-gammap(a1,alpha); b=-gammap(b,alpha); }
\r
1707 *kappa = max2(*kappa, -.5);
\r
1708 return 4*b*(tc*(1+ *kappa/Y)+ag*(1+ *kappa/R)+Y*R);
\r
1709 case (HKY85): /* HKY85, from Rzhetsky & Nei (1995 MBE 12, 131-51) */
\r
1710 if(Y<small || R<small)
\r
1711 error2("Y or R = 0.");
\r
1714 a1=1-Y*P1/(2*tc)-Q/(2*Y);
\r
1715 a2=1-R*P2/(2*ag)-Q/(2*R);
\r
1717 if (a1<=0 || a2<=0 || b<=0) return (larged);
\r
1718 if (alpha<=0) { a1=-log(a1); a2=-log(a2); b=-log(b); }
\r
1719 else { a1=-gammap(a1,alpha); a2=-gammap(a2,alpha); b=-gammap(b,alpha);}
\r
1720 a1 = -R/Y*b + a1/Y;
\r
1721 a2 = -Y/R*b + a2/R;
\r
1722 if (b>0) *kappa = min2((a1+a2)/(2*b), largek);
\r
1723 return 2*(p[0]*p[1] + p[2]*p[3])*(a1+a2)/2 + 2*Y*R*b;
\r
1727 a1 = 1 - Q - (P1+P2)/(2*GC*(1-GC)); b=1-2*Q;
\r
1728 if (a1<=0 || b<=0) return (larged);
\r
1729 if (alpha<=0) { a1=-log(a1); b=-log(b); }
\r
1730 else { a1=-gammap(a1,alpha); b=-gammap(b,alpha);}
\r
1731 if(Q>0) *kappa = 2*a1/b-1;
\r
1732 return 2*GC*(1-GC)*a1 + (1-2*GC*(1-GC))/2*b;
\r
1733 case (TN93): /* TN93 */
\r
1734 if(Y<small || R<small)
\r
1735 error2("Y or R = 0.");
\r
1736 a1=1-Y*P1/(2*tc)-Q/(2*Y);
\r
1737 a2=1-R*P2/(2*ag)-Q/(2*R);
\r
1739 /* if (a1<=0 || a2<=0 || b<=0) return (-1); */
\r
1740 if (a1<=0 || a2<=0 || b<=0) return (larged);
\r
1741 if (alpha<=0) { a1=-log(a1); a2=-log(a2); b=-log(b); }
\r
1742 else { a1=-gammap(a1,alpha); a2=-gammap(a2,alpha); b=-gammap(b,alpha);}
\r
1743 a1=.5/Y*(a1-R*b); a2=.5/R*(a2-Y*b); b=.5*b;
\r
1746 printf("\nk1&k2 = %.6f %.6f\n", a1/b,a2/b);
\r
1748 if (b>0) *kappa = min2((a1+a2)/(2*b), largek);
\r
1749 return 4*p[0]*p[1]*a1 + 4*p[2]*p[3]*a2 + 4*Y*R*b;
\r
1755 double DistanceIJ (int is, int js, int model, double alpha, double *kappa)
\r
1757 /* Distance between sequences is and js.
\r
1758 See DistanceMatNuc() for more details.
\r
1761 int h, n=4, missing=0;
\r
1762 double x[16], sumx, larged=9;
\r
1765 if(com.cleandata && com.seqtype==0) {
\r
1766 for (h=0; h<com.npatt; h++)
\r
1767 x[com.z[is][h]*n+com.z[js][h]] += com.fpatt[h];
\r
1770 for (h=0; h<com.npatt; h++) {
\r
1771 b0 = com.z[is][h];
\r
1772 b1 = com.z[js][h];
\r
1774 x[b0*n+b1] += com.fpatt[h];
\r
1781 if(sumx<=0) return(larged); /* questionable??? */
\r
1782 abyx(1./sum(x,16),x,16);
\r
1783 return SeqDivergence(x, model, alpha, kappa);
\r
1787 #if (defined LSDISTANCE && defined REALSEQUENCE)
\r
1789 extern double *SeqDistance;
\r
1791 int DistanceMatNuc (FILE *fout, FILE*f2base, int model, double alpha)
\r
1793 /* This calculates pairwise distances. The data may be clean and coded
\r
1794 (com.cleandata==1) or not. In the latter case, ambiguity sites are not
\r
1795 used (pairwise deletion). Site patterns are used.
\r
1797 int is,js, status=0;
\r
1798 double kappat=0, t,bigD=9;
\r
1800 if(f2base) fprintf(f2base,"%6d\n", com.ns);
\r
1801 if(model>=REV) model=TN93; /* TN93 here */
\r
1803 fprintf(fout,"\nDistances:%5s", models[model]);
\r
1804 if (model!=JC69 && model!=F81) fprintf (fout, " (kappa) ");
\r
1805 fprintf(fout," (alpha set at %.2f)\n", alpha);
\r
1806 fprintf(fout,"This matrix is not used in later m.l. analysis.\n");
\r
1807 if(!com.cleandata) fprintf(fout, "\n(Pairwise deletion.)");
\r
1809 for(is=0; is<com.ns; is++) {
\r
1810 if(fout) fprintf(fout,"\n%-15s ", com.spname[is]);
\r
1811 if(f2base) fprintf(f2base,"%-15s ", com.spname[is]);
\r
1812 for(js=0; js<is; js++) {
\r
1813 t = DistanceIJ(is, js, model, alpha, &kappat);
\r
1814 if(t<0) { t=bigD; status=-1; }
\r
1815 SeqDistance[is*(is-1)/2+js] = t;
\r
1816 if(f2base) fprintf(f2base," %7.4f", t);
\r
1817 if(fout) fprintf(fout,"%8.4f", t);
\r
1818 if(fout && (model==K80 || model>=F84))
\r
1819 fprintf(fout,"(%7.4f)", kappat);
\r
1821 if(f2base) FPN(f2base);
\r
1823 if(fout) FPN(fout);
\r
1824 if(status) puts("\ndistance formula sometimes inapplicable..");
\r
1834 extern int CijkIs0[];
\r
1838 extern double Cijk[], Root[];
\r
1840 int QTN93 (int model, double Q[], double kappa1, double kappa2, double pi[])
\r
1843 double T=pi[0],C=pi[1],A=pi[2],G=pi[3],Y=T+C,R=A+G, scalefactor;
\r
1845 if (model==JC69 || model==F81) kappa1=kappa2=com.kappa=1;
\r
1846 else if (com.model<TN93) kappa2=kappa1;
\r
1847 if(model==F84) { kappa2=1+kappa1/R; kappa1=1+kappa1/Y; }
\r
1848 scalefactor = 1/(2*T*C*kappa1+2*A*G*kappa2 + 2*Y*R);
\r
1850 for(i=0; i<4; i++) for(j=0; j<4; j++) Q[i*4+j] = (i==j ? 0 : 1);
\r
1851 Q[0*4+1] = Q[1*4+0] = kappa1;
\r
1852 Q[2*4+3] = Q[3*4+2] = kappa2;
\r
1853 for(i=0; i<4; i++) for(j=0; j<4; j++) Q[i*4+j] *= pi[j]*scalefactor;
\r
1854 for(i=0; i<4; i++) { Q[i*4+i] = 0; Q[i*4+i] = -sum(Q+i*4, 4); }
\r
1859 int RootTN93 (int model, double kappa1, double kappa2, double pi[],
\r
1860 double *scalefactor, double Root[])
\r
1862 double T=pi[0],C=pi[1],A=pi[2],G=pi[3],Y=T+C,R=A+G;
\r
1864 if (model==JC69 || model==F81) kappa1=kappa2=com.kappa=1;
\r
1865 else if (com.model<TN93) kappa2=kappa1;
\r
1866 if(model==F84) { kappa2=1+kappa1/R; kappa1=1+kappa1/Y; }
\r
1868 *scalefactor = 1/(2*T*C*kappa1+2*A*G*kappa2 + 2*Y*R);
\r
1871 Root[1] = - (*scalefactor);
\r
1872 Root[2] = -(Y+R*kappa2) * (*scalefactor);
\r
1873 Root[3] = -(Y*kappa1+R) * (*scalefactor);
\r
1878 int eigenTN93 (int model, double kappa1, double kappa2, double pi[],
\r
1879 int *nR, double Root[], double Cijk[])
\r
1881 /* initialize Cijk[] & Root[], which are the only part to be changed
\r
1882 for a new substitution model
\r
1883 for JC69, K80, F81, F84, HKY85, TN93
\r
1884 Root: real Root divided by v, the number of nucleotide substitutions.
\r
1887 double scalefactor, U[16],V[16], t;
\r
1888 double T=pi[0],C=pi[1],A=pi[2],G=pi[3],Y=T+C,R=A+G;
\r
1890 if (model==JC69 || model==F81) kappa1=kappa2=com.kappa=1;
\r
1891 else if (com.model<TN93) kappa2=kappa1;
\r
1892 RootTN93(model, kappa1, kappa2, pi, &scalefactor, Root);
\r
1894 *nR = nr = 2 + (model==K80||model>=F84) + (model>=HKY85);
\r
1895 U[0*4+0]=U[1*4+0]=U[2*4+0]=U[3*4+0]=1;
\r
1896 U[0*4+1]=U[1*4+1]=1/Y; U[2*4+1]=U[3*4+1]=-1/R;
\r
1897 U[0*4+2]=U[1*4+2]=0; U[2*4+2]=G/R; U[3*4+2]=-A/R;
\r
1898 U[2*4+3]=U[3*4+3]=0; U[0*4+3]=C/Y; U[1*4+3]=-T/Y;
\r
1901 V[1*4+0]=R*T; V[1*4+1]=R*C;
\r
1902 V[1*4+2]=-Y*A; V[1*4+3]=-Y*G;
\r
1903 V[2*4+0]=V[2*4+1]=0; V[2*4+2]=1; V[2*4+3]=-1;
\r
1904 V[3*4+0]=1; V[3*4+1]=-1; V[3*4+2]=V[3*4+3]=0;
\r
1906 for(i=0; i<4; i++) for(j=0; j<4; j++) {
\r
1907 Cijk[i*4*nr+j*nr+0]=U[i*4+0]*V[0*4+j];
\r
1911 for (k=1,t=0; k<4; k++) t += U[i*4+k]*V[k*4+j];
\r
1912 Cijk[i*4*nr+j*nr+1] = t;
\r
1916 Cijk[i*4*nr+j*nr+1]=U[i*4+1]*V[1*4+j];
\r
1917 for (k=2,t=0; k<4; k++) t += U[i*4+k]*V[k*4+j];
\r
1918 Cijk[i*4*nr+j*nr+2]=t;
\r
1920 case HKY85: case T92: case TN93:
\r
1921 for (k=1; k<4; k++) Cijk[i*4*nr+j*nr+k] = U[i*4+k]*V[k*4+j];
\r
1924 error2("model in eigenTN93");
\r
1928 FOR (i,64) CijkIs0[i] = (Cijk[i]==0);
\r
1938 #if (defined(CODEML) || defined(YN00))
\r
1940 int printfcode (FILE *fout, double fb61[], double space[])
\r
1944 int i, n=Nsensecodon;
\r
1946 fprintf (fout, "\nCodon freq., x 10000\n");
\r
1948 for(i=0; i<n; i++) space[FROM61[i]] = fb61[i]*10000;
\r
1949 printcu(fout, space, com.icode);
\r
1954 int printsmaCodon (FILE *fout, unsigned char * z[],int ns,int ls,int lline,int simple)
\r
1956 /* print, in blocks, multiple aligned and transformed codon sequences.
\r
1958 This is needed as codons are coded 0,1, 2, ..., 60, and
\r
1959 printsma won't work.
\r
1961 int ig, ngroup, lt, il,is, i,b, lspname=30;
\r
1962 char equal='.',*pz, c0[4],c[4];
\r
1964 if(ls==0) return(1);
\r
1965 ngroup = (ls-1)/lline + 1;
\r
1966 for (ig=0,FPN(fout); ig<ngroup; ig++) {
\r
1967 /* fprintf (fout,"%-8d\n", ig*lline+1); */
\r
1968 for (is=0; is<ns; is++) {
\r
1969 fprintf(fout,"%-*s ", lspname, com.spname[is]);
\r
1971 for(il=ig*lline,pz=z[is]+il; lt<lline && il<ls; il++,lt++,pz++) {
\r
1974 c[0] = (char)(b/16);
\r
1975 c[1] = (char)((b%16)/4);
\r
1976 c[2] = (char)(b%4);
\r
1978 for(i=0; i<3; i++)
\r
1979 c[i] = BASEs[(int)c[i]];
\r
1980 if (is && simple) {
\r
1983 c0[0]=(char)(b/16); c0[1]=(char)((b%16)/4); c0[2]=(char)(b%4);
\r
1984 for(i=0; i<3; i++)
\r
1985 if (c[i]==BASEs[(int)c0[i]]) c[i]=equal;
\r
1987 fprintf(fout,"%3s ", c);
\r
1996 int setmark_61_64 (void)
\r
1998 /* This sets two matrices FROM61[], and FROM64[], which translate between two
\r
1999 codings of codons. In one coding, codons go from 0, 1, ..., 63 while in
\r
2000 the other codons range from 0, 1, ..., 60 with the three stop codons removed.
\r
2001 FROM61[] translates from the 61-state coding to the 64-state coding, while
\r
2002 FROM64[] translates from the 64-state coding to the 61-state coding.
\r
2004 This routine also sets up FourFold[4][4], which defines the 4-fold codon
\r
2007 int i,j,k, *code=GeneticCode[com.icode];
\r
2008 int c[3],aa0,aa1, by[3]={16,4,1};
\r
2009 double nSilent, nStop, nRepl;
\r
2012 for (i=0; i<64; i++) {
\r
2013 if (code[i]==-1) FROM64[i]=-1;
\r
2014 else { FROM61[Nsensecodon]=i; FROM64[i]=Nsensecodon++; }
\r
2016 com.ncode=Nsensecodon;
\r
2018 for(i=0; i<4; i++) for(j=0; j<4; j++) {
\r
2020 FourFold[i][j] = (code[k]==code[k+1] && code[k]==code[k+2] && code[k]==code[k+3]);
\r
2023 for (i=0,nSilent=nStop=nRepl=0; i<64; i++) {
\r
2024 c[0]=i/16; c[1]=(i/4)%4; c[2]=i%4;
\r
2025 if((aa0=code[i])==-1) continue;
\r
2026 for(j=0; j<3; j++) for(k=0; k<3; k++) {
\r
2027 aa1 = code[i + ((c[j]+k+1)%4 - c[j])*by[j]];
\r
2028 if(aa1==-1) nStop++;
\r
2029 else if(aa0==aa1) nSilent++;
\r
2034 printf("\ncode Stop Silent Replace\n");
\r
2035 printf("%3d (%d) %6.0f%6.0f%6.0f %12.6f%12.6f\n",
\r
2036 com.icode, 64-com.ncode, nStop,nSilent,nRepl,nStop*3/(com.ncode*9),nSilent*3/(com.ncode*9));
\r
2041 int DistanceMatNG86 (FILE *fout, FILE*fds, FILE*fdn, FILE*ft, double alpha)
\r
2043 /* Estimation of dS and dN by the method of Nei & Gojobori (1986)
\r
2044 This works with both coded (com.cleandata==1) and uncoded data.
\r
2045 In the latter case (com.cleandata==0), the method does pairwise deletion.
\r
2047 alpha for gamma rates is used for dN only.
\r
2050 int is,js, i,k,h, wname=20, status=0, ndiff,nsd[4];
\r
2051 int nb[3],ib[3][4], missing;
\r
2052 double ns,na, nst,nat, S,N, St,Nt, dS,dN,dN_dS,y, bigD=3, lst;
\r
2053 double SEds, SEdn, p;
\r
2056 fputs("\n\n\nNei & Gojobori 1986. dN/dS (dN, dS)",fout);
\r
2057 if(com.cleandata==0) fputs("\n(Pairwise deletion)",fout);
\r
2058 fputs("\n(Note: This matrix is not used in later ML. analysis.\n",fout);
\r
2059 fputs("Use runmode = -2 for ML pairwise comparison.)\n",fout);
\r
2063 fprintf(fds,"%6d\n",com.ns);
\r
2064 fprintf(fdn,"%6d\n",com.ns);
\r
2065 fprintf(ft,"%6d\n",com.ns);
\r
2067 if(noisy>1 && com.ns>10) puts("NG distances for seqs.:");
\r
2068 for(is=0; is<com.ns; is++) {
\r
2070 fprintf(fout,"\n%-*s", wname,com.spname[is]);
\r
2072 fprintf(fds, "%-*s ",wname,com.spname[is]);
\r
2073 fprintf(fdn, "%-*s ",wname,com.spname[is]);
\r
2074 fprintf(ft, "%-*s ",wname,com.spname[is]);
\r
2076 for(js=0; js<is; js++) {
\r
2077 for(k=0; k<4; k++) nsd[k] = 0;
\r
2078 for (h=0,lst=0,nst=nat=S=N=0; h<com.npatt; h++) {
\r
2079 if(com.z[is][h]>=com.ncode || com.z[js][h]>=com.ncode)
\r
2081 codon[0] = CODONs[com.z[is][h]];
\r
2082 codon[1] = CODONs[com.z[js][h]];
\r
2083 lst += com.fpatt[h];
\r
2084 ndiff = difcodonNG(codon[0], codon[1], &St, &Nt, &ns, &na, 0, com.icode);
\r
2085 nsd[ndiff] += (int)com.fpatt[h];
\r
2086 S += St*com.fpatt[h];
\r
2087 N += Nt*com.fpatt[h];
\r
2088 nst += ns*com.fpatt[h];
\r
2089 nat += na*com.fpatt[h];
\r
2093 else { /* rescale for stop codons */
\r
2099 printf("\n%3d %3d:Sites %7.1f +%7.1f =%7.1f\tDiffs %7.1f +%7.1f =%7.1f",
\r
2100 is+1,js+1,S,N,S+N,nst,nat, nst+nat);
\r
2102 dS = (S<=0 ? 0 : 1-4./3*nst/S);
\r
2103 dN = (N<=0 ? 0 : 1-4./3*nat/N);
\r
2104 if(noisy>=9 && (dS<=0 || dN<=0))
\r
2105 { puts("\nNG86 unusable."); status=-1;}
\r
2107 else dS = (dS<=0 ? -1 : 3./4*(-log(dS)));
\r
2109 else dN = (dN<=0 ? -1 : 3./4*(alpha==0?-log(dN):alpha*(pow(dN,-1/alpha)-1)));
\r
2111 dN_dS = (dS>0 && dN>0 ? dN/dS : -1);
\r
2112 if(fout) fprintf(fout,"%7.4f (%5.4f %5.4f)", dN_dS, dN, dS);
\r
2114 if(N>0 && dN<0) dN = bigD;
\r
2115 if(S>0&&dS<0) dS = bigD;
\r
2118 SeqDistance[is*(is-1)/2+js] = (S<=0||N<=0 ? 0 : (S*dS+N*dN)*3/(S+N));
\r
2122 fprintf(fds," %7.4f", dS);
\r
2123 fprintf(fdn," %7.4f", dN);
\r
2124 fprintf(ft," %7.4f", (S*dS+N*dN)*3/(S+N));
\r
2126 if(alpha==0 && dS<bigD) { p=nst/S; SEds=sqrt(9*p*(1-p)/(square(3-4*p)*S)); }
\r
2127 if(alpha==0 && dN<bigD) { p=nat/N; SEdn=sqrt(9*p*(1-p)/(square(3-4*p)*N)); }
\r
2130 FPN(fds); FPN(fdn); FPN(ft);
\r
2132 if(noisy>1 && com.ns>10) printf(" %3d", is+1);
\r
2135 if(fout) FPN(fout);
\r
2136 if(status) fprintf (fout, "NOTE: -1 means that NG86 is inapplicable.\n");
\r
2138 SS=S, NN=N, Sd=nst, Nd=nat; /* kostas */
\r
2150 int eigenQREVbase (FILE* fout, double Q[NCODE*NCODE], double kappa[], double pi[], int *nR, double Root[], double Cijk[])
\r
2152 /* pi[] is constant.
\r
2153 This returns the Q matrix in Q.
\r
2155 int n=com.ncode, i,j,k;
\r
2156 int nr = (com.ngene>1 && com.Mgene>=3 ? com.nrate/com.ngene : com.nrate);
\r
2157 double Q0[NCODE*NCODE], U[NCODE*NCODE], V[NCODE*NCODE], mr, space_pisqrt[NCODE*NCODE];
\r
2162 if(com.model==REV) {
\r
2163 if(n!=4) error2("ncode != 4 for REV");
\r
2164 Q[3*n+2] = Q[2*n+3] = 1; /* r_AG = r_GA = 1. */
\r
2165 for(i=0,k=0; i<n-1; i++) for (j=i+1; j<n; j++)
\r
2166 if(i*n+j != 2*n+3)
\r
2167 Q[i*n+j] = Q[j*n+i] = kappa[k++];
\r
2169 else /* (model==REVu) */
\r
2170 for(i=0; i<n-1; i++) for(j=i+1; j<n; j++)
\r
2171 Q[i*n+j]=Q[j*n+i] = (StepMatrix[i*n+j] ? kappa[StepMatrix[i*n+j]-1] : 1);
\r
2173 for(i=0; i<n; i++) for(j=0; j<n; j++)
\r
2174 Q[i*n+j] *= pi[j];
\r
2176 for (i=0,mr=0; i<n; i++) {
\r
2178 Q[i*n+i] = -sum(Q+i*n, n);
\r
2179 mr -= pi[i]*Q[i*n+i];
\r
2181 abyx(1/mr, Q, n*n);
\r
2184 mr = 2*pi[0]*Q[0*n+1] + 2*pi[2]*Q[2*n+3];
\r
2185 if(com.nhomo==0) {
\r
2186 fprintf(fout, "\nRate parameters: ");
\r
2187 for(j=0; j<nr; j++)
\r
2188 fprintf(fout, " %8.5f", kappa[j]);
\r
2189 fprintf(fout, "\nBase frequencies: ");
\r
2190 for(j=0; j<n; j++)
\r
2191 fprintf(fout," %8.5f", pi[j]);
\r
2193 fprintf (fout, "\nRate matrix Q, Average Ts/Tv =%9.4f", mr/(1-mr));
\r
2194 matout (fout, Q, n, n);
\r
2197 xtoy (Q, Q0, n*n);
\r
2198 eigenQREV(Q0, pi, n, Root, U, V, space_pisqrt);
\r
2199 for(i=0; i<n; i++) for(j=0; j<n; j++) for(k=0; k<n; k++)
\r
2200 Cijk[i*n*n+j*n+k] = U[i*n+k]*V[k*n+j];
\r
2206 int QUNREST (FILE *fout, double Q[], double rate[], double pi[])
\r
2208 /* This constructs the rate matrix Q for the unrestricted model.
\r
2209 pi[] is changed in the routine.
\r
2211 int n=com.ncode, i,j,k;
\r
2212 double mr, ts, space[20];
\r
2214 if(com.model==UNREST) {
\r
2215 if(n!=4) error2("ncode != 4 for UNREST");
\r
2216 for (i=0,k=0,Q[14]=1; i<n; i++) for(j=0; j<n; j++)
\r
2217 if (i!=j && i*n+j != 14) Q[i*n+j] = rate[k++];
\r
2219 else /* (model==UNRESTu) */
\r
2220 for(i=0; i<n; i++) for(j=0; j<n; j++)
\r
2222 Q[i*n+j] = (StepMatrix[i*n+j] ? rate[StepMatrix[i*n+j]-1] : 1);
\r
2224 for(i=0; i<n; i++) {
\r
2226 Q[i*n+i] = -sum(Q+i*n, n);
\r
2230 QtoPi(Q, com.pi, n, space);
\r
2232 for (i=0,mr=0; i<n; i++) mr -= pi[i]*Q[i*n+i];
\r
2233 for (i=0; i<n*n; i++) Q[i] /= mr;
\r
2236 ts = pi[0]*Q[0*n+1] + pi[1]*Q[1*n+0] + pi[2]*Q[2*n+3] + pi[3]*Q[3*n+2];
\r
2238 fprintf(fout, "Rate parameters: ");
\r
2239 for(j=0; j<com.nrate; j++) fprintf(fout, " %8.5f", rate[j]);
\r
2240 fprintf(fout, "\nBase frequencies: ");
\r
2241 for(j=0; j<n; j++) fprintf(fout," %8.5f", pi[j]);
\r
2243 fprintf (fout,"\nrate matrix Q, Average Ts/Tv (similar to kappa/2) =%9.4f", ts/(1-ts));
\r
2245 fprintf (fout,"\nrate matrix Q");
\r
2246 matout (fout, Q, n, n);
\r
2256 double *SeqDistance=NULL;
\r
2257 int *ancestor=NULL;
\r
2261 /* This finds the most recent common ancestor of species is and js.
\r
2263 int is, js, it, a1, a2;
\r
2265 for(is=0; is<com.ns; is++) for(js=0; js<is; js++) {
\r
2266 it = is*(is-1)/2+js;
\r
2267 ancestor[it] = -1;
\r
2268 for (a1=is; a1!=-1; a1=nodes[a1].father) {
\r
2269 for (a2=js; a2!=-1; a2=nodes[a2].father)
\r
2270 if (a1==a2) { ancestor[it] = a1; break; }
\r
2271 if (ancestor[it] != -1) break;
\r
2273 if (ancestor[it] == -1) error2("no ancestor");
\r
2278 int fun_LS (double x[], double diff[], int np, int npair);
\r
2280 int fun_LS (double x[], double diff[], int np, int npair)
\r
2282 int i,j, aa, it=-np;
\r
2285 if (SetBranch(x) && noisy>2) puts ("branch len.");
\r
2286 if (npair != com.ns*(com.ns-1)/2) error2("# seq pairs err.");
\r
2287 for(i=0; i<com.ns; i++) for(j=0; j<i; j++) {
\r
2289 for (aa=i,dexp=0; aa!=ancestor[it]; aa=nodes[aa].father)
\r
2290 dexp += nodes[aa].branch;
\r
2291 for (aa=j; aa!=ancestor[it]; aa=nodes[aa].father)
\r
2292 dexp += nodes[aa].branch;
\r
2293 diff[it] = SeqDistance[it] - dexp;
\r
2295 if(fabs(diff[it])>1000) {
\r
2296 printf("\ndistances very different: diff = %12.6f ", diff[it]);
\r
2303 int LSDistance (double *ss,double x[],int (*testx)(double x[],int np))
\r
2305 /* get Least Squares estimates of branch lengths for a given tree topology
\r
2306 This uses nls2, a general least squares algorithm for nonlinear programming
\r
2307 to estimate branch lengths, and is thus inefficient.
\r
2311 if ((*testx)(x, com.ntime)) {
\r
2312 matout (F0, x, 1, com.ntime);
\r
2313 puts ("initial err in LSDistance()");
\r
2316 i = nls2((com.ntime>20&&noisy>=3?F0:NULL),
\r
2317 ss,x,com.ntime,fun_LS,NULL,testx,com.ns*(com.ns-1)/2,1e-6);
\r
2322 double PairDistanceML(int is, int js)
\r
2324 /* This calculates the ML distance between is and js, the sum of ML branch
\r
2325 lengths along the path between is and js.
\r
2326 LSDistance() has to be called once to set ancestor before calling this
\r
2332 if(is==js) return(0);
\r
2333 if(is<js) { it=is; is=js; js=it; }
\r
2335 it = is*(is-1)/2 + js;
\r
2336 for (a=is; a!=ancestor[it]; a=nodes[a].father)
\r
2337 dij += nodes[a].branch;
\r
2338 for (a=js; a!=ancestor[it]; a=nodes[a].father)
\r
2339 dij += nodes[a].branch;
\r
2344 int GroupDistances()
\r
2346 /* This calculates average group distances by summing over the ML
\r
2348 int newancestor=0, i,j, ig,jg;
\r
2349 /* int ngroup=2, Ningroup[10], group[200]={1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
\r
2350 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
\r
2351 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
\r
2352 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
\r
2353 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
\r
2354 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
\r
2355 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
\r
2356 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
\r
2357 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}; */ /* dloop for HC200.paup */
\r
2358 int ngroup=10, Ningroup[10], group[115]={
\r
2359 10, 9, 9, 9, 9, 9, 9, 9, 9, 10,
\r
2360 9, 9, 3, 3, 1, 1, 1, 1, 1, 1,
\r
2361 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
\r
2362 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
\r
2363 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
\r
2364 1, 2, 2, 2, 2, 2, 2, 4, 4, 4,
\r
2365 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
\r
2366 4, 4, 4, 4, 4, 4, 5, 5, 5, 5,
\r
2367 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
\r
2368 5, 5, 5, 5, 6, 6, 6, 6, 6, 6,
\r
2369 6, 7, 7, 7, 7, 7, 7, 7, 7, 7,
\r
2370 8, 8, 8, 8, 8}; /* dloop data for Anne Yoder, ns=115 */
\r
2371 double dgroup, npairused;
\r
2373 /* ngroup=2; for(j=0;j<com.ns; j++) group[j]=1+(group[j]>2); */
\r
2375 for(j=0;j<ngroup;j++) Ningroup[j]=0;
\r
2376 for(j=0;j<com.ns; j++) Ningroup[group[j]-1]++;
\r
2377 printf("\n# sequences in group:");
\r
2378 matIout(F0,Ningroup,1,ngroup);
\r
2379 if(ancestor==NULL) {
\r
2381 ancestor=(int*)realloc(ancestor, com.ns*(com.ns-1)/2*sizeof(int));
\r
2382 if(ancestor==NULL) error2("oom ancestor");
\r
2386 for(ig=0; ig<ngroup; ig++) {
\r
2387 printf("\ngroup %2d",ig+1);
\r
2388 for(jg=0; jg<ig+1; jg++) {
\r
2389 dgroup=0; npairused=0;
\r
2390 for(i=0;i<com.ns;i++) for(j=0;j<com.ns;j++) {
\r
2391 if(i!=j && group[i]==ig+1 && group[j]==jg+1) {
\r
2392 dgroup += PairDistanceML(i, j);
\r
2396 dgroup/=npairused;
\r
2397 printf("%9.4f", dgroup);
\r
2399 /* printf("%6.1f", dgroup/0.2604*5); */ /* 0.2604, 0.5611 */
\r
2402 if(newancestor==1) free(ancestor);
\r
2408 #ifdef NODESTRUCTURE
\r
2410 void BranchToNode (void)
\r
2412 /* tree.root needs to be specified before calling this
\r
2416 tree.nnode=tree.nbranch+1;
\r
2417 for(i=0; i<tree.nnode; i++)
\r
2418 { nodes[i].father=nodes[i].ibranch=-1; nodes[i].nson=0; }
\r
2419 for (i=0; i<tree.nbranch; i++) {
\r
2420 from=tree.branches[i][0];
\r
2421 to =tree.branches[i][1];
\r
2422 nodes[from].sons[nodes[from].nson++]=to;
\r
2423 nodes[to].father=from;
\r
2424 nodes[to].ibranch=i;
\r
2426 /* nodes[tree.root].branch=0; this breaks method=1 */
\r
2429 void NodeToBranchSub (int inode);
\r
2431 void NodeToBranchSub (int inode)
\r
2435 for(i=0; i<nodes[inode].nson; i++) {
\r
2436 tree.branches[tree.nbranch][0] = inode;
\r
2437 tree.branches[tree.nbranch][1] = ison = nodes[inode].sons[i];
\r
2438 nodes[ison].ibranch = tree.nbranch++;
\r
2439 if(nodes[ison].nson>0) NodeToBranchSub(ison);
\r
2443 void NodeToBranch (void)
\r
2446 NodeToBranchSub (tree.root);
\r
2447 if(tree.nnode != tree.nbranch+1)
\r
2448 error2("nnode != nbranch + 1?");
\r
2452 void ClearNode (int inode)
\r
2454 /* a source of confusion. Try not to use this routine.
\r
2456 nodes[inode].father = nodes[inode].ibranch = -1;
\r
2457 nodes[inode].nson = 0;
\r
2458 nodes[inode].branch = nodes[inode].age = 0;
\r
2459 /* nodes[inode].label = -1; */
\r
2460 /* nodes[inode].branch = 0; clear node structure only, not branch lengths */
\r
2461 /* for(i=0; i<com.ns; i++) nodes[inode].sons[i]=-1; */
\r
2464 int ReadTreeB (FILE *ftree, int popline)
\r
2467 int nodemark[NS*2-1]={0}; /* 0: absent; 1: father only (root); 2: son */
\r
2468 int i,j, state=0, YoungAncestor=0;
\r
2471 puts("\nbranch representation of tree might not work with clock model.");
\r
2475 fscanf (ftree, "%d", &tree.nbranch);
\r
2476 for(j=0; j<tree.nbranch; j++) {
\r
2477 for(i=0; i<2; i++) {
\r
2478 if (fscanf (ftree, "%d", & tree.branches[j][i]) != 1) state=-1;
\r
2479 tree.branches[j][i]--;
\r
2480 if(tree.branches[j][i]<0 || tree.branches[j][i]>com.ns*2-1)
\r
2481 error2("ReadTreeB: node numbers out of range");
\r
2483 nodemark[tree.branches[j][1]]=2;
\r
2484 if(nodemark[tree.branches[j][0]]!=2) nodemark[tree.branches[j][0]]=1;
\r
2485 if (tree.branches[j][0]<com.ns) YoungAncestor=1;
\r
2487 printf ("\nBranch #%3d: %3d -> %3d",j+1,tree.branches[j][0]+1,tree.branches[j][1]+1);
\r
2490 if(popline) fgets(line, 254, ftree);
\r
2491 for(i=0,tree.root=-1; i<tree.nbranch; i++)
\r
2492 if(nodemark[tree.branches[i][0]]!=2) tree.root=tree.branches[i][0];
\r
2493 if(tree.root==-1) error2("root err");
\r
2494 for(i=0; i<com.ns; i++)
\r
2495 if(nodemark[i]==0) {
\r
2496 matIout(F0,nodemark,1,com.ns);
\r
2497 error2("branch specification of tree");
\r
2500 if(YoungAncestor) {
\r
2501 puts("\nAncestors in the data? Take care.");
\r
2502 if(!com.cleandata) {
\r
2503 puts("This kind of tree does not work with unclean data.");
\r
2509 com.ntime = com.clock ? (tree.nbranch+1)-com.ns+(tree.root<com.ns)
\r
2518 int OutTreeB (FILE *fout)
\r
2521 char *fmt[]={" %3d..%-3d", " %2d..%-2d"};
\r
2522 FOR (j, tree.nbranch)
\r
2523 fprintf(fout, fmt[0], tree.branches[j][0]+1,tree.branches[j][1]+1);
\r
2527 int GetTreeFileType(FILE *ftree, int *ntree, int *pauptree, int shortform);
\r
2529 int GetTreeFileType(FILE *ftree, int *ntree, int *pauptree, int shortform)
\r
2531 /* paupstart="begin trees" and paupend="translate" identify paup tree files.
\r
2532 paupch=";" will be the last character before the list of trees.
\r
2533 Modify if necessary.
\r
2535 int i,k, lline=32000, ch=0, paupch=';';
\r
2537 char *paupstart="begin tree", *paupend="translate";
\r
2540 k=fscanf(ftree,"%d%d",&i,ntree);
\r
2542 if(i==com.ns) return(0); /* old paml style */
\r
2543 else error2("Number of sequences different in tree and seq files.");
\r
2545 else if(k==1) { *ntree=i; return(0); } /* phylip & molphy style */
\r
2546 while(ch!='(' && !isalnum(ch) && ch!=EOF) ch=fgetc(ftree); /* treeview style */
\r
2547 if(ch=='(') { *ntree=-1; ungetc(ch,ftree); return(0); }
\r
2549 puts("\n# seqs in tree file does not match. Read as the nexus format.");
\r
2551 if(fgets(line,lline,ftree)==NULL) error2("tree err1: EOF");
\r
2553 if (strstr(line,paupstart)) { *pauptree=1; *ntree=-1; break; }
\r
2555 if(shortform) return(0);
\r
2557 if(fgets(line,lline,ftree)==NULL) error2("tree err2: EOF");
\r
2559 if (strstr(line,paupend)) break;
\r
2562 if((ch=fgetc(ftree))==EOF) error2("tree err3: EOF");
\r
2563 if (ch==paupch) break;
\r
2565 if(fgets(line,lline,ftree)==NULL) error2("tree err4: EOF");
\r
2570 int PopPaupTreeRubbish(FILE *ftree);
\r
2571 int PopPaupTreeRubbish(FILE *ftree)
\r
2573 /* This reads out the string in front of the tree in the nexus format,
\r
2574 typically "tree PAUP_1 = [&U]" with "[&U]" optional
\r
2581 { ungetc(ch,ftree); return(0); }
\r
2582 else if(ch==EOF || ch=='/')
\r
2589 static int *CladeLabel = NULL;
\r
2591 void DownTreeCladeLabel (int inode, int cLabel)
\r
2593 /* This goes down the tree to change $ labels (stored in CladeLabel[]) into
\r
2594 # labels (stored in nodes[].label). To deal with nested clade labels,
\r
2595 branches within a clade are labeled by negative numbers initially, and
\r
2596 converted to positive labels at the end of the algorithm.
\r
2598 nodes[].label and CladeLabel[] are initialized to -1 before this routine
\r
2604 if(CladeLabel[inode] != -1)
\r
2605 label = CladeLabel[inode];
\r
2606 if(inode != tree.root && nodes[inode].label == -1)
\r
2607 nodes[inode].label = label;
\r
2608 for(i=0; i<nodes[inode].nson; i++)
\r
2609 DownTreeCladeLabel(nodes[inode].sons[i], label);
\r
2612 int IsNameNumber(char line[])
\r
2614 /* returns 0 if line has species number; 1 if it has name.
\r
2617 int isname=1, alldigits=1, n;
\r
2621 if(!isdigit(*p++)) { alldigits=0; break; }
\r
2624 if(n>=1 && n<=com.ns) isname = 0;
\r
2631 int ReadTreeN (FILE *ftree, int *haslength, int *haslabel, int copyname, int popline)
\r
2633 /* Read a tree from ftree, using the parenthesis node representation of trees.
\r
2634 Branch lengths are read in nodes[].branch, and branch (node) labels
\r
2635 (integers) are preceded by # and read in nodes[].label. If the clade label
\r
2636 $ is used, the label is read into CladeLabel[] first and then moved into
\r
2637 nodes[].label in the routine DownTreeCladeLabel().
\r
2639 This assumes that com.ns is known.
\r
2640 Species names are considered case-sensitive, with trailing spaces ignored.
\r
2642 copyname = 0: species numbers and names are both accepted, but names have
\r
2643 to match the names in com.spname[], which are from the
\r
2644 sequence data file. Used by baseml and codeml, for example.
\r
2645 1: species names are copied into com.spname[], but species
\r
2646 numbers are accepted. Used by evolver for simulation,
\r
2647 in which case no species names were read before.
\r
2648 2: the tree must have species names, which are copied into com.spname[].
\r
2649 Note that com.ns is assumed known. To remove this restriction,
\r
2650 one has to consider the space for nodes[], CladeLabel, starting
\r
2653 isname = 0: species number; 1: species name;
\r
2655 Ziheng note (18/12/2011): I have changed the code so that sequence number is not used
\r
2656 anymore. isname = 1 always.
\r
2658 int cnode, cfather=-1; /* current node and father */
\r
2659 int inodeb=0; /* node number that will have the next branch length */
\r
2660 int cladeLabels=0, i,j,k, level=0, isname, ch=' ', icurspecies=0;
\r
2661 char check[NS], delimiters[]="(),:#$=@><;", quote[]="\"\'";
\r
2663 char line[32000], *pch;
\r
2665 if(com.ns<=0) error2("you should specify # seqs in the tree file.");
\r
2667 if((CladeLabel=(int*)malloc((com.ns*2-1)*sizeof(int)))==NULL)
\r
2668 error2("oom trying to get space for cladelabel");
\r
2669 for(i=0; i<2*com.ns-1; i++)
\r
2670 CladeLabel[i] = -1;
\r
2672 /* initialization */
\r
2673 for(i=0; i<com.ns; i++) check[i]=0;
\r
2674 *haslength = 0; *haslabel = 0;
\r
2675 tree.nnode = com.ns; tree.nbranch = 0;
\r
2676 for(i=0; i<2*com.ns-1; i++) {
\r
2677 nodes[i].father = nodes[i].ibranch = -1;
\r
2678 nodes[i].nson = 0; nodes[i].label = -1; nodes[i].branch = 0;
\r
2679 nodes[i].age = 0; /* TipDate models set this for each tree later. */
\r
2680 #if (defined(BASEML) || defined(CODEML))
\r
2681 nodes[i].fossil = 0;
\r
2684 while(isspace(ch))
\r
2685 ch=fgetc(ftree); /* skip spaces */
\r
2688 { ReadTreeB(ftree,popline); return(0); }
\r
2690 if(PopPaupTreeRubbish(ftree) == -1) return(-1);
\r
2693 ch = fgetc (ftree);
\r
2694 if (ch==EOF) return(-1);
\r
2695 else if (ch == ';') {
\r
2696 if(level!=0) error2("; in treefile");
\r
2699 else if (ch==',') ;
\r
2700 else if (!isgraph(ch))
\r
2702 else if (ch == '(') { /* left ( */
\r
2704 cnode=tree.nnode++;
\r
2705 if(tree.nnode>2*com.ns-1)
\r
2706 error2("check #seqs and tree: perhaps too many '('?");
\r
2707 if (cfather >= 0) {
\r
2708 if(nodes[cfather].nson >= MAXNSONS) {
\r
2709 printf("there are at least %d daughter nodes, raise MAXNSONS?", nodes[cfather].nson);
\r
2712 nodes[cfather].sons[nodes[cfather].nson++] = cnode;
\r
2713 nodes[cnode].father = cfather;
\r
2714 tree.branches[tree.nbranch][0] = cfather;
\r
2715 tree.branches[tree.nbranch][1] = cnode;
\r
2716 nodes[cnode].ibranch = tree.nbranch++;
\r
2719 tree.root = cnode;
\r
2722 /* treating : and > in the same way is risky. */
\r
2723 else if (ch==')') {
\r
2724 level--; inodeb=cfather; cfather=nodes[cfather].father;
\r
2726 else if (ch==':'||ch=='>') {
\r
2727 if(ch==':') *haslength=1;
\r
2728 fscanf(ftree, "%lf", &nodes[inodeb].branch);
\r
2730 else if (ch==quote[0] || ch==quote[1]) {
\r
2731 for (k=0; ; k++) { /* read notes into line[] */
\r
2732 line[k] = (char)fgetc(ftree);
\r
2733 if((int)line[k] == EOF)
\r
2734 error2("EOF when reading node label");
\r
2735 if(line[k] == quote[0] || line[k] == quote[1])
\r
2739 nodes[inodeb].nodeStr = (char*)malloc(k*sizeof(char));
\r
2740 if (nodes[inodeb].nodeStr == NULL) error2("oom nodeStr");
\r
2741 strcpy(nodes[inodeb].nodeStr, line);
\r
2742 if((pch = strchr(line,'#')) || (pch = strchr(line,'<'))) {
\r
2743 *haslabel=1; sscanf(pch+1, "%lf", &nodes[inodeb].label);
\r
2745 if((pch = strchr(line,'>'))) {
\r
2746 sscanf(pch+1, "%lf", &nodes[inodeb].branch);
\r
2748 if((pch = strchr(line,'$'))) {
\r
2749 *haslabel=1; sscanf(pch+1, "%d", &CladeLabel[inodeb]);
\r
2751 if((pch = strchr(line,'=')) || (pch = strchr(line,'@'))) {
\r
2752 sscanf(pch+1, "%lf", &nodes[inodeb].age);
\r
2753 #if (defined(BASEML) || defined(CODEML))
\r
2754 if(com.clock) nodes[inodeb].fossil = 1;
\r
2756 #if (defined(CODEML))
\r
2757 nodes[inodeb].omega = 0;
\r
2761 else if (ch=='#' || ch=='<') { *haslabel=1; fscanf(ftree, "%lf", &nodes[inodeb].label); }
\r
2762 else if (ch=='$') { *haslabel=1; fscanf(ftree, "%d", &CladeLabel[inodeb]); }
\r
2763 else if (ch=='@' || ch=='=') {
\r
2764 fscanf(ftree,"%lf", &nodes[inodeb].age);
\r
2765 #if (defined(BASEML) || defined(CODEML))
\r
2766 if(com.clock) nodes[inodeb].fossil = 1;
\r
2768 #if (defined(CODEML))
\r
2769 nodes[inodeb].omega = 0;
\r
2772 else { /* read species name or number */
\r
2774 error2("expecting ; in the tree file");
\r
2775 line[0]=(char)ch; line[1]=(char)fgetc(ftree);
\r
2776 /* if(line[1]==(char)EOF) error2("eof in tree file"); */
\r
2778 for (i=1; i<lline; ) { /* read species name into line[] until delimiter */
\r
2779 if ((strchr(delimiters,line[i]) && line[i]!='@')
\r
2780 || line[i]==(char)EOF || line[i]=='\n')
\r
2781 { ungetc(line[i],ftree); line[i]=0; break; }
\r
2782 line[++i]=(char)fgetc(ftree);
\r
2784 for(j=i-1;j>0;j--) /* trim spaces*/
\r
2785 if(isgraph(line[j])) break; else line[j]=0;
\r
2788 isname = 1; /* numbers are part of names. */
\r
2790 isname = IsNameNumber(line);
\r
2792 if (isname==0) { /* number */
\r
2793 if(copyname==2) error2("Use names in tree.");
\r
2794 sscanf(line, "%d", &cnode);
\r
2799 for(i=0; i<com.ns; i++) if (!strcmp(line,com.spname[i])) break;
\r
2800 if((cnode=i)==com.ns)
\r
2801 { printf("\nSpecies %s?\n", line); exit(-1); }
\r
2804 if(icurspecies>com.ns-1) {
\r
2805 error2("error in tree: too many species in tree");
\r
2807 strcpy(com.spname[cnode=icurspecies++], line);
\r
2810 nodes[cnode].father=cfather;
\r
2811 if(nodes[cfather].nson>=MAXNSONS)
\r
2812 error2("too many daughter nodes, raise MAXNSONS");
\r
2814 nodes[cfather].sons[nodes[cfather].nson++] = cnode;
\r
2815 tree.branches[tree.nbranch][0] = cfather;
\r
2816 tree.branches[tree.nbranch][1] = cnode;
\r
2817 nodes[cnode].ibranch = tree.nbranch++;
\r
2824 fgets(line, lline, ftree);
\r
2825 for(i=0; i<com.ns; i++) {
\r
2827 printf("\nSeq #%d occurs more than once in the tree\n",i+1); exit(-1);
\r
2829 else if(check[i]<1) {
\r
2830 printf("\nSeq #%d (%s) is missing in the tree\n", i+1, com.spname[i]);
\r
2834 if(tree.nbranch>2*com.ns-2) {
\r
2835 printf("nbranch %d", tree.nbranch); puts("too many branches in tree?");
\r
2837 if (tree.nnode != tree.nbranch+1) {
\r
2838 printf ("\nnnode%6d != nbranch%6d + 1\n", tree.nnode, tree.nbranch);
\r
2842 /* check that it is o.k. to comment out this line
\r
2843 com.ntime = com.clock ? (tree.nbranch+1)-com.ns+(tree.root<com.ns)
\r
2848 /* check and convert clade labels $ */
\r
2849 #if(defined(BASEML) || defined(CODEML))
\r
2850 #if(defined(BASEML))
\r
2851 if(com.seqtype==0 && com.nhomo==5) cladeLabels = 1;
\r
2853 if(com.clock>1 || (com.seqtype==1 && com.model>=2)) cladeLabels = 1;
\r
2855 for(i=0,j=0; i<tree.nnode; i++) {
\r
2856 if(CladeLabel[i] != -1) j++;
\r
2858 if(j) { /* j is number of clade labels */
\r
2859 DownTreeCladeLabel(tree.root, 0);
\r
2862 /*** Somehow some labels are still -1 after this, so I changed this. Needs checking. ***/
\r
2863 for(i=0; i<tree.nnode; i++)
\r
2864 if(i!=tree.root && nodes[i].label==-1) nodes[i].label = 0;
\r
2866 /* OutTreeN(F0,1,PrBranch|PrNodeNum); FPN(F0); */
\r
2867 /* FPN(F0); OutTreeN(F0,1,PrLabel); FPN(F0); */
\r
2869 for(i=0,com.nbtype=0; i<tree.nnode; i++) {
\r
2870 if(i == tree.root) continue;
\r
2871 j = (int)nodes[i].label;
\r
2872 if(j+1 > com.nbtype) com.nbtype = j+1;
\r
2873 if(j<0 || j>tree.nbranch-1)
\r
2874 error2("branch label in the tree (note labels start from 0 and are consecutive)");
\r
2876 if (com.nbtype<=1)
\r
2877 error2("need branch labels in the tree for the model.");
\r
2879 printf("\n%d branch types are in tree. Stop if wrong.", com.nbtype);
\r
2882 #if(defined(CODEML))
\r
2883 if(com.seqtype==1 && com.NSsites==2 && com.model==3 && com.nbtype>NBTYPE)
\r
2884 error2("nbtype too large. Raise NBTYPE");
\r
2885 else if(com.seqtype==1 && com.NSsites && com.model==2 && com.nbtype!=2)
\r
2886 error2("only two branch types are allowed for branch models.");
\r
2898 int OutSubTreeN (FILE *fout, int inode, int spnames, int printopt, char *labelfmt);
\r
2900 int OutSubTreeN (FILE *fout, int inode, int spnames, int printopt, char *labelfmt)
\r
2902 int i, dad = nodes[inode].father, nsib = (inode==tree.root ? 0 : nodes[dad].nson);
\r
2904 if(inode != tree.root && inode == nodes[dad].sons[0])
\r
2905 fputc ('(', fout);
\r
2907 for(i=0; i<nodes[inode].nson; i++)
\r
2908 OutSubTreeN(fout, nodes[inode].sons[i], spnames, printopt, labelfmt);
\r
2910 if(nodes[inode].nson==0) { /* inode is tip */
\r
2912 if(printopt & PrNodeNum) fprintf(fout, "%d_", inode+1);
\r
2913 fprintf(fout, "%s", com.spname[inode]);
\r
2916 fprintf(fout, "%d", inode+1);
\r
2918 if((printopt & PrNodeNum) && nodes[inode].nson)
\r
2919 fprintf(fout," %d ", inode+1);
\r
2920 if((printopt & PrLabel) && nodes[inode].label>0)
\r
2921 fprintf(fout, labelfmt, nodes[inode].label);
\r
2922 if((printopt & PrAge) && nodes[inode].age)
\r
2923 fprintf(fout, " @%.6f", nodes[inode].age);
\r
2925 /* Add branch labels to be read by Rod Page's TreeView. */
\r
2926 #if (defined CODEML)
\r
2927 if((printopt & PrOmega) && inode != tree.root)
\r
2928 fprintf(fout, " #%.4f ", nodes[inode].omega);
\r
2929 #elif (defined (EVOLVER) || defined (MCMCTREE))
\r
2930 if((printopt & PrLabel) && nodes[inode].nodeStr && nodes[inode].nodeStr[0])
\r
2931 fprintf(fout, " %s", nodes[inode].nodeStr);
\r
2934 if((printopt & PrBranch) && (inode!=tree.root || nodes[inode].branch>0))
\r
2935 fprintf(fout, ": %.6f", nodes[inode].branch);
\r
2937 if((printopt & PrBranch) && nodes[inode].age>0) // print node ages instead of branch lengths
\r
2938 fprintf(fout, ": %.6f", nodes[inode].age);
\r
2941 if(nsib == 0) /* root */
\r
2943 else if (inode == nodes[dad].sons[nsib-1]) /* last sib */
\r
2945 else /* not last sib */
\r
2946 fprintf(fout, ", ");
\r
2952 int OutTreeN (FILE *fout, int spnames, int printopt)
\r
2954 /* print the current tree.
\r
2955 Can the block of print statements be moved inside the recursive function?
\r
2957 int i, intlabel=1;
\r
2958 char* labelfmt[2]={"#%.6f", "#%.0f"};
\r
2960 if(printopt & PrLabel) {
\r
2961 for(i=0; i<tree.nnode; i++)
\r
2962 if(nodes[i].label-(int)nodes[i].label != 0) intlabel=0;
\r
2965 OutSubTreeN(fout, tree.root, spnames, printopt, labelfmt[intlabel]);
\r
2973 /* This changes the bifurcation at the root into a trifurcation, by setting one of
\r
2974 the sons to be the new root. The new root is the first son that is not a tip.
\r
2975 tree.nnode is updated, but the routine does not re-number the nodes, so the new
\r
2976 node labels do not go from ns, ns + 1, ..., as they normally should.
\r
2978 int i, ison, sib, root = tree.root;
\r
2980 if(nodes[root].nson!=2) error2("in DeRoot?");
\r
2982 ison = nodes[root].sons[i = 0];
\r
2983 if(nodes[ison].nson==0)
\r
2984 ison = nodes[root].sons[i = 1];
\r
2985 sib = nodes[root].sons[1 - i];
\r
2986 nodes[sib].branch += nodes[ison].branch;
\r
2987 nodes[sib].father = tree.root = ison;
\r
2988 nodes[tree.root].father = -1;
\r
2989 nodes[tree.root].sons[nodes[tree.root].nson++] = sib; /* sib added as the last child of the new root */
\r
2990 nodes[tree.root].branch = 0;
\r
2991 tree.nnode --; /* added 2007/4/9 */
\r
2995 int PruneSubTreeN (int inode, int keep[])
\r
2997 /* This prunes tips from the tree, using keep[com.ns]. Removed nodes in the
\r
2998 big tree have nodes[].father=-1 and nodes[].nson=0.
\r
2999 Do not change nodes[inode].nson and nodes[inode].sons[] until after the
\r
3000 node's descendent nodes are all processed. So when a son is deleted,
\r
3001 only the father node's nson is changed, but not
\r
3003 int i,j, ison, father=nodes[inode].father, nson0=nodes[inode].nson;
\r
3005 nodes[inode].label = 0;
\r
3006 for(i=0; i<nson0; i++)
\r
3007 PruneSubTreeN(nodes[inode].sons[i], keep);
\r
3009 /* remove inode because of no descendents.
\r
3010 Note this does not touch the father node */
\r
3011 if(inode<com.ns && keep[inode]==0)
\r
3012 nodes[inode].father = -1;
\r
3013 else if(inode>=com.ns) {
\r
3014 for(i=0,nodes[inode].nson=0; i<nson0; i++) {
\r
3015 ison = nodes[inode].sons[i];
\r
3016 if(nodes[ison].father!=-1)
\r
3017 nodes[inode].sons[ nodes[inode].nson++ ] = nodes[inode].sons[i];
\r
3019 if(nodes[inode].nson == 0)
\r
3020 nodes[inode].father = -1;
\r
3023 /* remove inode if it has a single descendent ison */
\r
3024 if(inode>=com.ns && nodes[inode].nson==1 && inode!=tree.root) {
\r
3025 ison = nodes[inode].sons[0];
\r
3026 nodes[ison].father = father;
\r
3027 nodes[ison].branch += nodes[inode].branch;
\r
3028 nodes[ison].label ++; /* records # deleted nodes for branch ison */
\r
3029 for(j=0; j<nodes[father].nson; j++) {
\r
3030 if(nodes[father].sons[j]==inode)
\r
3031 { nodes[father].sons[j] = ison; break; }
\r
3033 nodes[inode].nson = 0;
\r
3034 nodes[inode].father = -1;
\r
3036 else if(nodes[inode].nson==1 && inode==tree.root) { /* move down root if root has 1 descendent */
\r
3037 nodes[inode].father = -1;
\r
3038 nodes[inode].nson = 0;
\r
3039 ison = nodes[tree.root].sons[0];
\r
3041 nodes[tree.root].father = -1;
\r
3042 nodes[tree.root].branch = 0;
\r
3046 printf("\nVisiting inode %d\n", inode);
\r
3047 FOR(i, tree.nnode) printf(" %2d", i); FPN(F0);
\r
3048 FOR(i, tree.nnode) printf(" %2.0f", nodes[i].label); FPN(F0);
\r
3054 int GetSubTreeN (int keep[], int space[])
\r
3056 /* This removes some tips to generate the subtree. Branch lengths are
\r
3057 preserved by summing them up when some nodes are removed.
\r
3058 The algorithm uses post-order tree traversal to remove tips and nodes. It
\r
3059 then switches to the branch representation to renumber nodes.
\r
3060 space[] can be NULL. If not, it returns newnodeNO[], which holds the
\r
3061 new node numbers; for example, newnodeNO[12]=5 means that old node 12 now
\r
3064 The routine does not change com.ns or com.spname[], which have to be updated
\r
3067 CHANGE OF ROOT happens if the root in the old tree had >=3 sons, but has 2
\r
3068 sons in the new tree and if (!com.clock). In that case, the tree is derooted.
\r
3070 This routine does not work if a current seq is ancestral to some others
\r
3071 and if that sequence is removed. (***check this comment ***)
\r
3073 Different formats for keep[] are used. Suppose the current tree is for
\r
3074 nine species: a b c d e f g h i.
\r
3076 (A) keep[]={1,0,1,1,1,0,0,1,0} means that a c d e h are kept in the tree.
\r
3077 The old tip numbers are not changed, so that OutTreeN(?,1,?) gives the
\r
3078 correct species names or OutTreeN(?,0,?) gives the old species numbers.
\r
3080 (B) keep[]={1,0,2,3,4,0,0,5,0} means that a c d e h are kept in the tree, and
\r
3081 they are renumbered 0 1 2 3 4 and all the internal nodes are renumbered
\r
3082 as well to be consecutive. Note that the positive numbers have to be
\r
3083 consecutive natural numbers.
\r
3085 keep[]={5,0,2,1,4,0,0,3,0} means that a c d e h are kept in the tree.
\r
3086 However, the order of the sequences is changed to d c h e a, so that the
\r
3087 numbers are now 0 1 2 3 4 for d c h e a. This is useful when the subtree
\r
3088 is extracted from a big tree for a subset of the sequence data, while the
\r
3089 species are ordered d c h e a in the sequence data file.
\r
3090 This option can be used to renumber the tips in the complete tree.
\r
3092 int nsnew, i,j,k, nnode0=tree.nnode, sumnumber=0, newnodeNO[2*NS-1], ison, sib;
\r
3093 int unrooted = (nodes[tree.root].nson>=3); /* com.clock is not checked here! */
\r
3097 if(debug) { FOR(i,com.ns) printf("%-30s %2d\n", com.spname[i], keep[i]); }
\r
3098 for(i=0,nsnew=0; i<com.ns; i++)
\r
3099 if(keep[i]) { nsnew++; sumnumber+=keep[i]; }
\r
3100 if(nsnew<2) return(-1);
\r
3102 /* mark removed nodes in the big tree by father=-1 && nson=0.
\r
3103 nodes[].label records the number of nodes collapsed.
\r
3105 PruneSubTreeN(tree.root, keep);
\r
3106 /* If unrooted tree has a bifurcation at the new root, collapse root. */
\r
3107 if (unrooted && nodes[tree.root].nson==2) {
\r
3108 ison = nodes[tree.root].sons[i = 0];
\r
3109 if(nodes[ison].nson==0)
\r
3110 ison = nodes[tree.root].sons[i = 1];
\r
3111 sib = nodes[tree.root].sons[1 - i];
\r
3113 nodes[sib].branch += nodes[ison].branch;
\r
3114 nodes[sib].label += nodes[ison].label + 2;
\r
3115 nodes[sib].father = tree.root = ison;
\r
3116 nodes[tree.root].father = -1;
\r
3117 nodes[tree.root].sons[nodes[tree.root].nson++] = sib; /* sib added as the last child of the new root */
\r
3118 nodes[tree.root].branch = 0;
\r
3120 if(debug) printtree(1);
\r
3122 for(i=0,k=1; i<tree.nnode; i++) if(nodes[i].father!=-1) k++;
\r
3126 /* to renumber the nodes */
\r
3127 if(sumnumber>nsnew) {
\r
3128 if(sumnumber != nsnew*(nsnew+1)/2)
\r
3129 error2("keep[] not right in GetSubTreeN");
\r
3131 if((branch0=(double*)malloc(nnode0*sizeof(double)))==NULL) error2("oom#");
\r
3132 FOR(i,nnode0) branch0[i] = nodes[i].branch;
\r
3133 FOR(i,nnode0) newnodeNO[i] = -1;
\r
3134 FOR(i,com.ns) if(keep[i]) newnodeNO[i] = keep[i]-1;
\r
3136 newnodeNO[tree.root] = k = nsnew;
\r
3138 for( ; i<nnode0; i++) {
\r
3139 if(nodes[i].father==-1) continue;
\r
3140 for(j=0; j<tree.nbranch; j++)
\r
3141 if(i==tree.branches[j][1]) break;
\r
3142 if(j==tree.nbranch)
\r
3143 error2("strange here");
\r
3144 newnodeNO[i] = k++;
\r
3146 for(j=0; j<tree.nbranch; j++) FOR(i,2)
\r
3147 tree.branches[j][i] = newnodeNO[tree.branches[j][i]];
\r
3149 for(i=0; i<nnode0; i++) {
\r
3150 if(newnodeNO[i]>-1)
\r
3151 nodes[newnodeNO[i]].branch = branch0[i];
\r
3156 if(space) memmove(space, newnodeNO, (com.ns*2-1)*sizeof(int));
\r
3161 void printtree (int timebranches)
\r
3165 printf("\nns = %d nnode = %d", com.ns, tree.nnode);
\r
3166 printf("\n%7s%7s", "father","node");
\r
3167 if(timebranches) printf("%10s%10s%10s", "age", "branch", "label");
\r
3168 printf(" %7s%7s", "nson:","sons");
\r
3169 FOR (i, tree.nnode) {
\r
3170 printf ("\n%7d%7d", nodes[i].father, i);
\r
3172 printf(" %9.6f %9.6f %9.0f", nodes[i].age, nodes[i].branch,nodes[i].label);
\r
3174 printf ("%7d: ", nodes[i].nson);
\r
3175 FOR(j,nodes[i].nson) printf(" %2d", nodes[i].sons[j]);
\r
3178 OutTreeN(F0,0,0); FPN(F0);
\r
3179 OutTreeN(F0,1,0); FPN(F0);
\r
3180 OutTreeN(F0,1,1); FPN(F0);
\r
3184 void PointconPnodes (void)
\r
3186 /* This points the nodes[com.ns+inode].conP to the right space in com.conP.
\r
3187 The space is different depending on com.cleandata (0 or 1)
\r
3188 This routine updates internal nodes com.conP only.
\r
3189 End nodes (com.conP0) are updated in InitConditionalPNode().
\r
3193 for(i=0; i<tree.nbranch+1; i++)
\r
3194 if(nodes[i].nson>0) /* more thinking */
\r
3195 nodes[i].conP = com.conP + com.ncode*com.npatt*nintern ++;
\r
3199 int SetxInitials (int np, double x[], double xb[][2])
\r
3201 /* This forces initial values into the boundary of the space
\r
3205 for (i=com.ntime; i<np; i++) {
\r
3206 if (x[i]<xb[i][0]*1.005) x[i]=xb[i][0]*1.05;
\r
3207 if (x[i]>xb[i][1]/1.005) x[i]=xb[i][1]/1.05;
\r
3209 for (i=0; i<com.np; i++) {
\r
3210 if (x[i]<xb[i][0]) x[i]=xb[i][0]*1.2;
\r
3211 if (x[i]>xb[i][1]) x[i]=xb[i][1]*.8;
\r
3217 #if(defined(BASEML) || defined(CODEML) || defined(MCMCTREE))
\r
3219 int GetTipDate (double *TipDate, double *TipDate_TimeUnit)
\r
3221 /* This scans sequence names to collect the sampling dates. The last field of
\r
3222 the sequence name is assumed to contain the date.
\r
3223 Divergence times are rescaled by using TipDate_TimeUnit.
\r
3225 int i, j, indate, ndates=0;
\r
3226 double young=-1, old=-1;
\r
3230 for(i=0,ndates=0; i<com.ns; i++) {
\r
3232 j = strlen(com.spname[i]);
\r
3233 for(indate=0,p=com.spname[i]+j-1; j>=0; j--,p--) {
\r
3234 if(isdigit(*p) || *p=='.') indate=1;
\r
3238 sscanf(p+1, "%lf", &nodes[i].age);
\r
3239 if(nodes[i].age<=0)
\r
3240 error2("Tip date <= 0");
\r
3245 young = old = nodes[i].age;
\r
3247 old = min2(old, nodes[i].age);
\r
3248 young = max2(young, nodes[i].age);
\r
3252 if(*TipDate_TimeUnit == -1) *TipDate_TimeUnit = 1;
\r
3255 else if (ndates!=com.ns) {
\r
3256 printf("TipDate model requires date for each sequence.");
\r
3259 /* TipDate models */
\r
3260 if(ndates != com.ns)
\r
3261 error2("TipDate model: each sequence must have a date");
\r
3263 if(*TipDate_TimeUnit <= 0)
\r
3264 *TipDate_TimeUnit = (young - old)*2.5;
\r
3265 if(young - old < 1e-30)
\r
3266 error2("TipDate: all sequences are of the same age?");
\r
3267 for(i=0; i<tree.nnode; i++) {
\r
3268 if(i<com.ns || nodes[i].fossil) {
\r
3269 nodes[i].age = (young - nodes[i].age) / *TipDate_TimeUnit;
\r
3270 if(nodes[i].age<1e-20) nodes[i].age = 0;
\r
3274 if(noisy) printf("\nTipDate model\nDate range: (%.2f, %.2f) => (0, %.2f). TimeUnit = %.2f.\n",
\r
3275 young, old, (young-old)/ *TipDate_TimeUnit, *TipDate_TimeUnit);
\r
3283 #if(defined(BASEML) || defined(CODEML))
\r
3285 double *AgeLow=NULL;
\r
3286 int NFossils=0, AbsoluteRate=0;
\r
3287 /* TipDate models:
\r
3288 MutationRate = mut/TipDate_TimeUnit;
\r
3289 age = age*TipDate_TimeUnit
\r
3292 void SetAge(int inode, double x[]);
\r
3293 void GetAgeLow (int inode);
\r
3294 /* number of internal node times, used to deal with known ancestors. Broken? */
\r
3295 static int innode_time=0;
\r
3297 /* Ziheng Yang, 25 January 2003
\r
3298 The following routines deal with clock and local clock models, including
\r
3299 Andrew Rambaut's TipDate models (Rambaut 2000 Bioinformatics 16:395-399;
\r
3300 Yoder & Yang 2000 Mol Biol Evol 17:1081-1090; Yang & Yoder 2003 Syst Biol).
\r
3301 The tree is rooted. The routine SetAge assumes that ancestral nodes are
\r
3302 arranged in the increasing order and so works only if the input tree uses
\r
3303 the parenthesis notation and not the branch notation. The option of known
\r
3304 ancestors is probably broken.
\r
3306 The flag AbsoluteRate=1 if(TipDate || NFossils). This could be removed
\r
3307 as the flags TipDate and NFossils are sufficient.
\r
3308 clock = 1: global clock, deals with TipDate with no or many fossils,
\r
3309 ignores branch rates (#) in tree if any.
\r
3310 = 2: local clock models, as above, but requires branch rates #
\r
3312 = 3: as 2, but requires Mgene and option G in sequence file.
\r
3314 Order of variables in x[]: divergence times, rates for branches, rgene, ...
\r
3315 In the following ngene=4, com.nbtype=3, with r_ij to be the rate
\r
3316 of gene i and branch class j.
\r
3319 [times, r00(if absolute) r01 r02 rgene1 rgene2 rgene3]
\r
3320 NOTE: rgene[] has relative rates
\r
3322 [times, r00(if absolute) r01 r02 r11 r12 r21 r22 r31 r32 rgene1 rgene2 rgene3]
\r
3323 NOTE: rgene1=r10, rgene2=r20, rgene3=r30
\r
3325 If(nodes[tree.root].fossil==0) x[0] has absolute time for the root.
\r
3326 Otherwise x[0] has proportional ages.
\r
3330 double GetBranchRate(int igene, int ibrate, double x[], int *ix)
\r
3332 /* This finds the right branch rate in x[]. The rate is absolute if AbsoluteRate.
\r
3333 ibrate=0,1,..., indicates the branch rate class.
\r
3334 This routine is used in the likelihood calculation and in formatting output.
\r
3335 ix (k) has the position in x[] for the branch rate if the rate is a parameter.
\r
3336 and is -1 if the rate is not a parameter in the ML iteration. This is
\r
3339 int nage=tree.nnode-com.ns-NFossils, k=nage+AbsoluteRate;
\r
3340 double rate00=(AbsoluteRate?x[nage]:1), brate=rate00;
\r
3342 if(igene==0 && ibrate==0)
\r
3343 k = (AbsoluteRate?nage:-1);
\r
3344 else if(com.clock==GlobalClock) {
\r
3345 brate = x[k=com.ntime+igene-1]; /* igene>0, rgene[] has absolute rates */
\r
3347 else if(com.clock==LocalClock) { /* rgene[] has relative rates */
\r
3348 if(igene==0 && ibrate) { brate = x[k+=ibrate-1]; }
\r
3349 else if(igene && ibrate==0){ brate = rate00*x[com.ntime+igene-1]; k=-1; }
\r
3350 else if(igene && ibrate) { brate = x[k+ibrate-1]*x[com.ntime+igene-1]; k=-1; }
\r
3352 else if(com.clock==ClockCombined) {
\r
3353 if(ibrate==0 && igene) brate = x[k=com.ntime+igene-1];
\r
3354 else brate = x[k+=ibrate-1+igene*(com.nbtype-1)]; /* ibrate>0 */
\r
3362 void SetAge (int inode, double x[])
\r
3364 /* This is called from SetBranch(), to set up age for nodes under clock
\r
3365 models (clock=1,2,3).
\r
3366 if(TipDate||NFossil), that is, if(AbsoluteRate), this routine sets up
\r
3367 times (nodes[].age) and then SetBranch() sets up branch lengths by
\r
3368 multiplying times with rate:
\r
3369 [].age[i] = AgeLow[i] + ([father].age - AgeLow[i])*x[i]
\r
3371 The routine assumes that times are arranged in the order of node numbers,
\r
3372 and should work if parenthesis notation of tree is used in the tree file,
\r
3373 but not if the branch notation is used.
\r
3377 FOR (i,nodes[inode].nson) {
\r
3378 ison=nodes[inode].sons[i];
\r
3379 if(nodes[ison].nson) {
\r
3380 if(AbsoluteRate) {
\r
3381 if(!nodes[ison].fossil)
\r
3382 nodes[ison].age = AgeLow[ison]
\r
3383 +(nodes[inode].age - AgeLow[ison])*x[innode_time++];
\r
3386 nodes[ison].age = nodes[inode].age*x[innode_time++];
\r
3392 void GetAgeLow (int inode)
\r
3394 /* This sets AgeLow[], the minimum age of each node. It moves down the tree to
\r
3395 scan [].age, which has tip dates and fossil dates. It is needed if(AbsoluteRate)
\r
3396 and is called by GetInitialsTimes().
\r
3401 for(i=0; i<nodes[inode].nson; i++) {
\r
3402 ison = nodes[inode].sons[i];
\r
3403 if(nodes[ison].nson)
\r
3405 tlow = max2(tlow, nodes[ison].age);
\r
3407 if(nodes[inode].fossil) {
\r
3408 if(nodes[inode].age < tlow)
\r
3409 error2("age in tree is in conflict.");
\r
3410 AgeLow[inode] = nodes[inode].age;
\r
3413 AgeLow[inode] = nodes[inode].age = tlow;
\r
3418 int SetBranch (double x[])
\r
3420 /* if(AbsoluteRate), mutation rate is not multiplied here, but during the
\r
3421 likelihood calculation. It is copied into com.rgene[0].
\r
3424 double small=-1e-5;
\r
3426 if(com.clock==0) {
\r
3427 for(i=0; i<tree.nnode; i++) {
\r
3429 if((nodes[i].branch=x[nodes[i].ibranch])<small) status = -1;
\r
3434 if(!LASTROUND) { /* transformed variables (proportions) are used */
\r
3435 if(!nodes[tree.root].fossil) /* note order of times in x[] */
\r
3436 nodes[tree.root].age = x[innode_time++];
\r
3437 SetAge(tree.root, x);
\r
3439 else { /* times are used */
\r
3440 for(i=com.ns; i<tree.nnode; i++)
\r
3441 if(!nodes[i].fossil) nodes[i].age = x[innode_time++];
\r
3444 for(i=0; i<tree.nnode; i++) { /* [].age to [].branch */
\r
3445 if(i==tree.root) continue;
\r
3446 nodes[i].branch = nodes[nodes[i].father].age-nodes[i].age;
\r
3447 if(nodes[i].branch<small)
\r
3454 int GetInitialsTimes (double x[])
\r
3456 /* this counts com.ntime and initializes x[] under clock and local clock models,
\r
3457 including TipDate and ClockCombined models. See above for notes.
\r
3458 Under local clock models, com.ntime includes both times and rates for
\r
3460 A recursive algorithm is used to specify initials if(TipDate||NFossil).
\r
3466 if(com.fix_blength==2)
\r
3467 { com.ntime=0; com.method=0; return(0); }
\r
3468 else if(com.clock==0) {
\r
3469 com.ntime = tree.nbranch;
\r
3470 if(com.fix_blength==1) return(0);
\r
3471 for(i=0; i<com.ntime; i++)
\r
3472 x[i] = rndu()*0.1+0.01;
\r
3474 if(com.fix_blength==0 && com.clock<5 && ancestor && com.ntime<100)
\r
3475 LSDistance (&t, x, testx);
\r
3480 /* clock models: check branch rate labels and fossil dates first */
\r
3484 for(i=0; i<tree.nnode; i++) nodes[i].label=0;
\r
3486 for(i=0; i<tree.nnode; i++) {
\r
3487 if(i!=tree.root && (j=(int)nodes[i].label+1)>com.nbtype) {
\r
3489 if(j<0 || j>tree.nbranch-1) error2("branch label in the tree.");
\r
3492 for(j=0; j<com.nbtype; j++) {
\r
3493 for(i=0; i<tree.nnode; i++)
\r
3494 if(i!=tree.root && j==(int)nodes[i].label) break;
\r
3496 printf("\nNot all branch labels (0, ..., %d) are found on tree?", com.nbtype-1);
\r
3498 if(noisy) printf("\nfound %d branch rates in tree.\n", com.nbtype);
\r
3499 if(com.nbtype<=1) error2("use clock = 1 or add branch rate labels in tree");
\r
3501 for(i=0; i<tree.nbranch; i++)
\r
3502 printf("%3.0f",nodes[tree.branches[i][1]].label); FPN(F0);
\r
3505 for(i=0,NFossils=0,maxage=0; i<tree.nnode; i++) {
\r
3506 if(nodes[i].nson && nodes[i].fossil) {
\r
3508 maxage = max2(maxage,nodes[i].age);
\r
3511 if(NFossils && maxage>10)
\r
3512 error2("Change time unit so that fossil dates fall in (0.00001, 10).");
\r
3515 GetTipDate(&com.TipDate, &com.TipDate_TimeUnit);
\r
3517 AbsoluteRate = (com.TipDate || NFossils);
\r
3518 if(com.clock>=5 && AbsoluteRate==0)
\r
3519 error2("needs fossil calibrations");
\r
3521 com.ntime = AbsoluteRate + (tree.nnode-com.ns-NFossils) + (com.nbtype-1);
\r
3522 if(com.clock == ClockCombined)
\r
3523 com.ntime += (com.ngene-1)*(com.nbtype-1);
\r
3524 com.ntime += (tree.root<com.ns); /* root is a known sequence. Broken? */
\r
3526 /* DANGER! AgeLow is not freed in the program. Fix this? */
\r
3528 if(AbsoluteRate) {
\r
3529 AgeLow = (double*)realloc(AgeLow, tree.nnode*sizeof(double));
\r
3530 GetAgeLow(tree.root);
\r
3532 if(!nodes[tree.root].fossil)
\r
3533 x[k++] = (AbsoluteRate?nodes[tree.root].age*(1.2+rndu()) : rndu()*.5+.1); /* root age */
\r
3534 for(; k<tree.nnode-com.ns-NFossils; k++) /* relative times */
\r
3535 x[k] = 0.4+.5*rndu();
\r
3536 if(com.clock!=6) /* branch rates */
\r
3537 for( ; k<com.ntime; k++)
\r
3538 x[k] = 0.1*(.5+rndu());
\r
3540 for(j=0,k=com.ntime-1; j<data.ngene; j++,k++)
\r
3541 x[k] = 0.1*(.5+rndu());
\r
3545 int OutputTimesRates (FILE *fout, double x[], double var[])
\r
3547 /* SetBranch() has been called before calling this, so that [].age is up
\r
3550 int i,j,k=AbsoluteRate+tree.nnode-com.ns-NFossils, jeffnode;
\r
3551 double scale=(com.TipDate ? com.TipDate_TimeUnit : 1);
\r
3554 if(AbsoluteRate && com.clock<5) {
\r
3555 fputs("\nSubstitution rate is per time unit\n", fout);
\r
3556 if(com.nbtype>1) fprintf(fout,"Rates for branch groups\n");
\r
3557 for(i=0; i<com.ngene; i++,FPN(fout)) {
\r
3558 if(com.ngene>1) fprintf(fout,"Gene %2d: ", i+1);
\r
3559 for(j=0; j<com.nbtype; j++) {
\r
3560 fprintf(fout,"%12.6f", GetBranchRate(i,j,x,&k));
\r
3561 if(i==0 && j==0 && !AbsoluteRate) continue;
\r
3562 if((com.clock!=LocalClock||com.ngene==1) && com.getSE) {
\r
3563 if(k==-1) error2("we are in trouble. k should not be -1 here.");
\r
3564 fprintf(fout," +- %8.6f", sqrt(var[k*com.np+k]));
\r
3570 if(com.clock==2) {
\r
3571 fprintf (fout,"rates for branches: 1");
\r
3572 for(k=tree.nnode-com.ns; k<com.ntime; k++) fprintf(fout," %8.5f",x[k]);
\r
3577 if(AbsoluteRate) {
\r
3578 fputs("\nNodes and Times\n",fout);
\r
3579 fputs("(JeffNode is for Thorne's multidivtime. ML analysis uses ingroup data only.)\n\n",fout);
\r
3581 if(com.TipDate) { /* DANGER! SE not printed if(TipDate && NFossil). */
\r
3582 for(i=0,k=0; i<tree.nnode; i++,FPN(fout)) {
\r
3583 jeffnode=(i<com.ns?i:tree.nnode-1+com.ns-i);
\r
3584 fprintf(fout,"Node %3d (Jeffnode %3d) Time %7.2f ",i+1, jeffnode,
\r
3585 com.TipDate - nodes[i].age*scale);
\r
3586 if(com.getSE && i>=com.ns && !nodes[i].fossil) {
\r
3587 fprintf(fout," +- %6.2f", sqrt(var[k*com.np+k])*scale);
\r
3592 else if(AbsoluteRate) {
\r
3593 for(i=com.ns,k=0; i<tree.nnode; i++,FPN(fout)) {
\r
3594 jeffnode=tree.nnode-1+com.ns-i;
\r
3595 fprintf(fout,"Node %3d (Jeffnode %3d) Time %9.5f", i+1, tree.nnode-1+com.ns-i,
\r
3597 if(com.getSE && i>=com.ns && !nodes[i].fossil) {
\r
3598 fprintf(fout," +- %7.5f", sqrt(var[k*com.np+k]));
\r
3599 if(fabs(nodes[i].age-x[k])>1e-5) error2("node order wrong.");
\r
3608 int SetxBoundTimes (double xb[][2])
\r
3610 /* This sets bounds for times (or branch lengths) and branch rates
\r
3613 double tb[]={4e-6,50}, rateb[]={1e-4,99}, pb[]={.000001,.999999};
\r
3615 if(com.clock==0) {
\r
3616 for(i=0;i<com.ntime;i++) {
\r
3622 k=0; xb[0][0]=tb[0]; xb[0][1]=tb[1];
\r
3623 if(!nodes[tree.root].fossil) {
\r
3624 if(AbsoluteRate) xb[0][0]=AgeLow[tree.root];
\r
3627 for( ; k<tree.nnode-com.ns-NFossils; k++) /* proportional ages */
\r
3628 { xb[k][0]=pb[0]; xb[k][1]=pb[1]; }
\r
3629 for(; k<com.ntime; k++) /* rate and branch rates */
\r
3630 FOR(j,2) xb[k][j]=rateb[j];
\r
3638 #if(defined(BASEML) || defined(BASEMLG) || defined(CODEML))
\r
3641 int readx(double x[], int *fromfile)
\r
3643 /* this reads parameters from file, used as initial values
\r
3644 if(runmode>0), this reads common substitution parameters only into x[], which
\r
3645 should be copied into another place before heuristic tree search. This is broken
\r
3646 right now. Ziheng, 9 July 2003.
\r
3647 fromfile=0: if nothing read from file, 1: read from file, -1:fix parameters
\r
3649 static int times=0;
\r
3653 times++; *fromfile=0;
\r
3654 if(finitials==NULL || (com.runmode>0 && times>1)) return(0);
\r
3655 if(com.runmode<=0) { npin=com.np; xin=x; }
\r
3656 else { npin=com.np-com.ntime; xin=x+com.ntime; }
\r
3658 if(npin<=0) return(0);
\r
3659 if(com.runmode>0&&com.seqtype==1&&com.model) error2("option or in.codeml");
\r
3660 printf("\nReading initials/paras from file (np=%d). Stop if wrong.\n",npin);
\r
3661 fscanf(finitials,"%lf",&xin[i=0]);
\r
3663 if(xin[0]==-1) { *fromfile=-1; LASTROUND=1; }
\r
3665 for( ; i<npin; i++)
\r
3666 if(fscanf(finitials, "%lf", &xin[i])!=1) break;
\r
3668 { printf("err at #%d. Edit or remove it.\n",i+1); exit(-1); }
\r
3669 if(com.runmode>0) {
\r
3670 matout(F0,xin,1,npin);
\r
3671 puts("Those are fixed for tree search. Stop if wrong.");
\r
3678 #if(defined(BASEML) || defined(CODEML))
\r
3680 int CollapsNode (int inode, double x[])
\r
3682 /* Merge inode to its father. Update the first com.ntime elements of
\r
3683 x[] only if (x!=NULL), by using either x[] if clock=1 or
\r
3684 nodes[].branch if clock=0. So when clock=0, the routine works
\r
3685 properly only if SetBranch() is called before this routine, which
\r
3686 is true if m.l. or l.s. has been used to estimate branch lengths.
\r
3688 int i,j, ifather, ibranch, ison;
\r
3690 if (inode==tree.root || inode<com.ns) error2("err CollapsNode");
\r
3691 ibranch=nodes[inode].ibranch; ifather=nodes[inode].father;
\r
3692 for (i=0; i<nodes[inode].nson; i++) {
\r
3693 ison=nodes[inode].sons[i];
\r
3694 tree.branches[nodes[ison].ibranch][0]=ifather;
\r
3696 for (i=ibranch+1; i<tree.nbranch; i++)
\r
3697 for (j=0; j<2; j++) tree.branches[i-1][j]=tree.branches[i][j];
\r
3698 tree.nbranch--; com.ntime--;
\r
3699 for (i=0; i<tree.nbranch; i++) for (j=0; j<2; j++)
\r
3700 if (tree.branches[i][j]>inode) tree.branches[i][j]--;
\r
3705 for (i=inode+1; i<tree.nnode+1; i++) x[i-1-com.ns]=x[i-com.ns];
\r
3707 for (i=ibranch+1; i<tree.nbranch+1; i++) x[i-1]=x[i];
\r
3717 #if(defined(BPP) || defined(EVOLVER))
\r
3719 void Tree2PartitionDescentTree (int inode, char split[])
\r
3723 for(i=0; i<nodes[inode].nson; i++) {
\r
3724 ison = nodes[inode].sons[i];
\r
3726 split[ison] = '1';
\r
3728 Tree2PartitionDescentTree(ison, split);
\r
3732 void Tree2Partition (char splits[])
\r
3734 /* This generates branch partitions in splits[nib*(com.ns+1)].
\r
3735 splits[0,...,ns-1] is the first split, splits[ns,...,2*ns-1] the second, and so on.
\r
3736 For large trees, the algorithm is inefficient.
\r
3737 The root node has 2 sons if the tree is rooted and >=3 sons if the tree
\r
3738 is unrooted. For unrooted tree, the mark for the first species is set to 0.
\r
3739 For rooted trees, the mark for the first species can be either 0 or 1.
\r
3741 int unrooted = (nodes[tree.root].nson>=3);
\r
3742 int s=com.ns, lsplit=s+1, nsplit=tree.nnode-s-1, i, j, k;
\r
3746 memset(splits, 0, nsplit*lsplit*sizeof(char));
\r
3747 for(i=com.ns,k=0; i<tree.nnode; i++) {
\r
3748 if(i==tree.root) continue;
\r
3749 split = splits+k*lsplit;
\r
3750 for(j=0; j<s; j++) split[j] = '0';
\r
3751 Tree2PartitionDescentTree(i, split);
\r
3752 /* If the tree is unrooted, set the mark for the first species to 0 */
\r
3753 if(unrooted && split[0]=='1')
\r
3754 for(j=0; j<s; j++) split[j] = '0' + '1' - split[j];
\r
3759 int Partition2Tree (char splits[], int lsplit, int ns, int nsplit, double label[])
\r
3761 /* This generates the tree in nodes[] using splits or branch partitions.
\r
3762 It constructs pptable[(2*s-1)*(2*s-1)] to generate the tree.
\r
3763 Split i corresponds to node ns+i, while the root is ns + nsplit.
\r
3764 label[] has labels for splits and become labels for nodes on the tree.
\r
3765 This works even if ns=1 and ns=2.
\r
3767 int i,j,k, s21=ns*2-1, a, minndesc, ndesc[NS]={0}; /* clade size */
\r
3768 char debug=0, *p, *pptable;
\r
3770 if(nsplit>ns-2) error2("too many splits for ns");
\r
3771 if(nsplit<0) nsplit=0;
\r
3772 if((pptable=(char*)malloc((s21*s21+1)*sizeof(char))) == NULL)
\r
3773 error2("oom in Partition2Tree");
\r
3774 memset(pptable, 0, s21*s21*sizeof(char));
\r
3776 /* initialize tree */
\r
3777 tree.nnode = ns+nsplit+1;
\r
3778 tree.root = ns+nsplit;
\r
3780 for(i=0; i<tree.nnode; i++) {
\r
3781 nodes[i].father = nodes[i].ibranch = -1;
\r
3782 nodes[i].nson = 0; nodes[i].label = -1; nodes[i].branch = nodes[i].age = 0;
\r
3785 /* set up pptable */
\r
3786 for(i=0,p=splits,ndesc[tree.root-ns]=ns; i<nsplit; i++, p+=lsplit) {
\r
3787 for(j=0; j<ns; j++) {
\r
3788 if(p[j] == '1') { /* clade (split) i includes tip j */
\r
3789 pptable[j*s21 + ns+i] = 1;
\r
3794 for(i=0; i<tree.nnode-1; i++) pptable[i*s21+tree.root] = 1;
\r
3795 for(i=0; i<nsplit; i++) {
\r
3796 for(j=0; j<i; j++) {
\r
3797 if(pptable[(ns+i)*s21+ns+j] || pptable[(ns+j)*s21+ns+i] || ndesc[i] == ndesc[j])
\r
3799 for(k=0; k<ns; k++)
\r
3800 if(pptable[k*s21+ns+i]==1 && pptable[k*s21+ns+j]==1) break;
\r
3801 if(k<ns) { /* i and j are ancestral to k, and are ancestral to each other. */
\r
3802 if(ndesc[i] < ndesc[j]) pptable[(ns+i)*s21+ns+j] = 1;
\r
3803 else pptable[(ns+j)*s21+ns+i] = 1;
\r
3808 printf("\npptable\n");
\r
3809 for(i=0; i<s21; i++,FPN(F0))
\r
3810 for(j=0; j<s21; j++)
\r
3811 printf(" %1d", (int)pptable[i*s21+j]);
\r
3812 printf("ndesc: ");
\r
3813 for(i=0; i<nsplit; i++) printf(" %2d", ndesc[i]); FPN(F0);
\r
3816 /* generate tree nodes and labels. For each nonroot node, youngest ancestor is dad. */
\r
3817 for(i=0; i<tree.nnode-1; i++) {
\r
3818 minndesc=ns+1; a=-1;
\r
3819 for(j=ns; j<tree.nnode; j++) {
\r
3820 if(pptable[i*s21+j]==1 && minndesc>ndesc[j-ns])
\r
3821 { minndesc = ndesc[j-ns]; a=j; }
\r
3825 nodes[i].father = a;
\r
3826 nodes[a].sons[nodes[a].nson++] = i;
\r
3827 if(a!=tree.root && label) nodes[a].label = label[a-ns];
\r
3831 OutTreeN(F0,1,0); FPN(F0);
\r
3838 int GetNSfromTreeFile(FILE *ftree, int *ns, int *ntree)
\r
3840 /* This gets the sequence names from the tree file.
\r
3842 char separators[]="(,):#";
\r
3843 int inname=0, k, c;
\r
3846 *ns = *ntree = -1;
\r
3847 k = fscanf(ftree, "%d%d", ns, ntree);
\r
3848 if(k==1) { *ntree = *ns; *ns = -1; }
\r
3851 while((c = fgetc(ftree)) != ';') {
\r
3852 if(c==EOF) return(-1);
\r
3853 if(strchr(separators, c)) {
\r
3854 if (c == ':') fscanf(ftree, "%lf", &y);
\r
3855 else if(c == '#') fscanf(ftree, "%lf", &y);
\r
3856 if(inname) { inname=0; (*ns)++; }
\r
3858 else if(isgraph(c))
\r
3866 void CladeSupport (FILE *fout, char treef[], int getSnames, char mastertreef[], int pick1tree)
\r
3868 /* This reads all bootstrap or Bayesian trees from treef to identify the best trees
\r
3869 (trees with the highest posterior probabilities), and to construct the majority-rule
\r
3870 consensus tree. The set of the best trees constitutes the 95% or 99% credibility set
\r
3872 A tree (ptree) is represented by its splits, ordered lexicographically, and concatenated.
\r
3873 It can also read a master tree file, go through the master trees, and attach support
\r
3874 values on splits on each master tree.
\r
3875 split1 is for one tree, and split50 is for the majority-rule consensus tree.
\r
3877 int i,j,k, i1, ntreeM,ntree, itreeM, sizetree, found, lline=1024;
\r
3878 int *index, *indexspace, maxnsplits, nsplits=0, s, nsplit1, nsplit50, lsplit, same;
\r
3879 int maxnptree, nptree=0, sizeptree;
\r
3880 char *split1, *splits=NULL, *splitM=NULL, *split50=NULL, *pM, *ptree, *ptrees=NULL, *pc;
\r
3881 double *countsplits=NULL, *countptree=NULL, *Psplit50, *Psame, y, cdf;
\r
3882 char pick1treef[32]="pick1tree.tre", line[1024];
\r
3883 struct TREEN *nodes_t;
\r
3884 FILE *ft, *fM=NULL, *f1tree=NULL;
\r
3887 /* Count trees and splits */
\r
3888 printf("\nRead tree sample, count trees & splits \n");
\r
3889 ft = gfopen(treef, "r");
\r
3891 if(getSnames) /* species ordered as in first tree in file */
\r
3892 GetNSfromTreeFile(ft, &com.ns, &k);
\r
3893 if(com.ns<3) error2("need >=3 species to justify this much effort.");
\r
3894 s=com.ns; lsplit=s+1; maxnsplits=maxnptree=s; sizeptree=(s-2)*lsplit;
\r
3895 if((split1=(char*)malloc(3*(s-2) * lsplit * sizeof(char))) == NULL)
\r
3896 error2("oom splits");
\r
3897 ptree = split1 + (s-2)*lsplit;
\r
3898 split50 = ptree + (s-2)*lsplit;
\r
3899 memset(split1, 0, 3*(s-2) * lsplit * sizeof(char));
\r
3900 if((Psplit50=(double*)malloc(s*sizeof(double))) == NULL)
\r
3901 error2("oom Psplit50");
\r
3903 sizetree=(s*2-1)*sizeof(struct TREEN);
\r
3904 if((nodes=(struct TREEN*)malloc(sizetree*2))==NULL) error2("oom");
\r
3905 for(i=0; i<s*2-1; i++) nodes[i].nodeStr=NULL;
\r
3906 nodes_t = nodes + s*2-1;
\r
3908 for(ntree=0; ; ntree++) {
\r
3909 if(nptree+s >= maxnptree) {
\r
3910 maxnptree = (int)(maxnptree*(ntree<1000 ? 2 : 1.2));
\r
3911 ptrees = (char *)realloc(ptrees, maxnptree*sizeptree);
\r
3912 countptree = (double*)realloc(countptree, maxnptree*sizeof(double));
\r
3913 if(ptrees==NULL || countptree==NULL) error2("oom ptrees || countptree");
\r
3914 memset(ptrees+nptree*sizeptree, 0, (maxnptree-nptree)*sizeptree);
\r
3915 memset(countptree+nptree, 0, (maxnptree-nptree)*sizeof(double));
\r
3917 if(nsplits+s >= maxnsplits) {
\r
3919 splits = (char*)realloc(splits, maxnsplits * lsplit * sizeof(char));
\r
3920 countsplits = (double*)realloc(countsplits, maxnsplits*sizeof(double));
\r
3921 if(splits==NULL || countsplits==NULL) error2("oom splits realloc");
\r
3924 /* if (getSnames), copy species/sequence names from first tree in file. */
\r
3925 if(ReadTreeN(ft, &i, &j, (getSnames && ntree==0), 1)) break;
\r
3926 if(debug || (ntree+1)%5000==0) {
\r
3927 printf("\rtree %5d ", ntree+1);
\r
3928 if(s<15) OutTreeN(F0, 1, 0);
\r
3930 Tree2Partition(split1);
\r
3931 nsplit1 = tree.nnode - s - 1;
\r
3933 /* Process the tree */
\r
3934 qsort(split1, nsplit1, lsplit, (int(*)(const void *, const void *))strcmp);
\r
3936 { for(i=0; i<nsplit1; i++) printf(" %s", split1+i*lsplit); printf("\n"); }
\r
3937 for(i=0,pc=ptree; i<nsplit1; i++)
\r
3938 for(j=0; j<s; j++) *pc++ = split1[i*lsplit+j];
\r
3939 j = binarysearch(ptree, ptrees, nptree, sizeptree, (int(*)(const void *, const void *))strcmp, &found);
\r
3944 memmove(ptrees+(j+1)*sizeptree, ptrees+j*sizeptree, (nptree-j)*sizeptree);
\r
3945 memmove(countptree+j+1, countptree+j, (nptree-j)*sizeof(double));
\r
3947 memmove(ptrees+j*sizeptree, ptree, sizeptree);
\r
3952 /* Process the splits in the tree */
\r
3953 for(i=0; i<nsplit1; i++) { /* going through splits in current tree */
\r
3954 j = binarysearch(split1+i*lsplit, splits, nsplits, lsplit, (int(*)(const void *, const void *))strcmp, &found);
\r
3955 if(found) /* found */
\r
3958 if(j<nsplits) { /* check the size of the moved block here */
\r
3959 memmove(splits+(j+1)*lsplit, splits+j*lsplit, (nsplits-j)*lsplit*sizeof(char));
\r
3960 memmove(countsplits+(j+1), countsplits+j, (nsplits-j)*sizeof(double));
\r
3962 memcpy(splits+j*lsplit, split1+i*lsplit, lsplit*sizeof(char));
\r
3968 printf("%4d splits: ", nsplits);
\r
3969 for(k=0; k<nsplits; k++) printf(" %s (%.0f)", splits+k*lsplit, countsplits[k]);
\r
3973 printf("\n%6d trees read, %d distinct trees.\n", ntree, nptree);
\r
3975 k = max2(nsplits, nptree);
\r
3976 if((index=(int*)malloc(k*2*sizeof(int)))==NULL) error2("oom index");
\r
3977 indexspace = index+k;
\r
3979 printf("\nSpecies in order:\n");
\r
3980 for(i=0; i<s; i++) printf("%2d. %s\n", i+1, com.spname[i]);
\r
3981 printf("\n(A) Best trees in the sample (%d distinct trees in all)\n", nptree);
\r
3982 fprintf(fout, "\n\nSpecies in order:\n");
\r
3983 for(i=0; i<s; i++) fprintf(fout, "%2d. %s\n", i+1, com.spname[i]);
\r
3984 fprintf(fout, "\n(A) Best trees in the sample (%d distinct trees in all)\n", nptree);
\r
3986 indexing(countptree, nptree, index, 1, indexspace);
\r
3988 for(k=0,cdf=0; k<nptree; k++) {
\r
3989 j = index[k]; y=countptree[j];
\r
3990 for(i=0,pc=split1; i<nsplit1; i++,*pc++='\0') for(i1=0; i1<s; i1++)
\r
3991 *pc++ = ptrees[j*sizeptree + i*s + i1];
\r
3992 Partition2Tree(split1, lsplit, s, nsplit1, NULL);
\r
3993 printf(" %6.0f %8.5f %8.5f ", y, y/ntree, (cdf+=y/ntree));
\r
3994 OutTreeN(F0, 1, 0);
\r
3995 /* for(i=0; i<nsplit1; i++) printf(" %s", split1+i*lsplit); */
\r
3998 fprintf(fout, " %6.0f %8.5f %8.5f ", y, y/ntree, cdf);
\r
3999 OutTreeN(fout, 1, 0);
\r
4000 /* for(i=0; i<nsplit1; i++) fprintf(fout, " %s", split1+i*lsplit); */
\r
4001 fprintf(fout, "\n");
\r
4003 if(cdf > 0.999) break;
\r
4006 printf("\n(B) Best splits in the sample of trees (%d splits in all)\n", nsplits);
\r
4007 indexing(countsplits, nsplits, index, 1, indexspace);
\r
4008 for(k=0; k<nsplits; k++) {
\r
4009 j = index[k]; y=countsplits[j];
\r
4010 printf(" %6.0f %9.5f %s\n", y, y/ntree, splits+j*lsplit);
\r
4011 if(y/ntree < 0.001) break;
\r
4013 fprintf(fout, "\n(B) Best splits in the sample of trees (%d splits in all)\n", nsplits);
\r
4014 for(k=0; k<nsplits; k++) {
\r
4015 j = index[k]; y=countsplits[j];
\r
4016 fprintf(fout, " %6.0f %9.5f %s\n", y, y/ntree, splits+j*lsplit);
\r
4017 if(y/ntree < 0.001) break;
\r
4020 /* Majority-rule consensus tree */
\r
4021 for(k=0,nsplit50=0; k<nsplits; k++)
\r
4022 if(countsplits[k]/ntree >= 0.5) nsplit50++;
\r
4023 for(k=0,nsplit50=0; k<nsplits; k++) {
\r
4024 if(countsplits[k]/ntree > 0.5) {
\r
4025 memmove(split50+nsplit50*lsplit, splits+k*lsplit, lsplit);
\r
4026 Psplit50[nsplit50 ++] = countsplits[k]/ntree;
\r
4029 Partition2Tree(split50, lsplit, s, nsplit50, Psplit50);
\r
4030 printf("\n(C) Majority-rule consensus tree\n");
\r
4031 OutTreeN(F0, 1, PrLabel); FPN(F0);
\r
4032 fprintf(fout, "\n(C) Majority-rule consensus tree\n");
\r
4033 OutTreeN(fout, 1, PrLabel); FPN(fout);
\r
4035 if(mastertreef) fM = fopen(mastertreef, "r");
\r
4036 if(fM==NULL) goto CleanUp;
\r
4038 fscanf(fM, "%d%d", &i, &ntreeM);
\r
4039 if(i!=s || ntreeM<1) error2("<ns> <ntree> on the first line in master tree.");
\r
4041 /* Probabilities of trees in the master tree file */
\r
4042 splitM = (char*)malloc(ntreeM * (s-2)*lsplit * sizeof(char));
\r
4043 Psame = (double*)malloc(ntreeM * sizeof(double));
\r
4044 if(splitM==NULL || Psame==NULL) error2("oom splitM");
\r
4045 zero(Psame, ntreeM);
\r
4046 if(pick1tree>=1 && pick1tree<=ntreeM && (f1tree=(FILE*)fopen(pick1treef,"w"))==NULL)
\r
4048 for(itreeM=0,pM=splitM; itreeM<ntreeM; itreeM++,pM+=(s-2)*lsplit) {
\r
4049 if(ReadTreeN(fM, &i, &j, 0, 1)) break;
\r
4050 if(tree.nnode<s*2-2 || tree.nnode>s*2-1) error2("Master trees have to be binary");
\r
4051 Tree2Partition(pM);
\r
4052 qsort(pM, tree.nnode-s-1, lsplit, (int(*)(const void *, const void *))strcmp);
\r
4054 printf("\nMaster tree %2d: ", itreeM+1);
\r
4055 OutTreeN(F0, 1, 0);
\r
4056 for(i=0; i<tree.nnode-s-1; i++) printf(" %s", pM+i*lsplit);
\r
4059 /* read the tree sample again */
\r
4061 for(ntree=0; ; ntree++) {
\r
4062 if(ReadTreeN(ft, &i, &j, 0, 0)) break;
\r
4063 fgets(line, lline, ft);
\r
4064 Tree2Partition(split1);
\r
4065 for(itreeM=0,pM=splitM; itreeM<ntreeM; itreeM++,pM+=(s-2)*lsplit) {
\r
4066 for(i=0,same=1; i<tree.nnode-s-1; i++) {
\r
4067 if(bsearch(split1+i*lsplit, pM, tree.nnode-s-1, lsplit, (int(*)(const void *, const void *))strcmp) == NULL)
\r
4068 { same=0; break; }
\r
4072 if(pick1tree-1==itreeM) {
\r
4073 OutTreeN(f1tree, 1, 1); fprintf(f1tree, "%s", line);
\r
4079 printf("\n(D) Probabilities of trees in the master tree file\n");
\r
4080 fprintf(fout, "\n(D) Probabilities of trees in the master tree file\n");
\r
4081 /* read the master trees another round just for printing. */
\r
4083 fscanf(fM, "%d%d", &s, &ntreeM);
\r
4084 for(itreeM=0; itreeM<ntreeM; itreeM++) {
\r
4085 if(ReadTreeN(fM, &i, &j, 0, 1)) break;
\r
4086 Tree2Partition(split1);
\r
4087 for(i=s,k=0; i<tree.nnode; i++) {
\r
4088 if(i==tree.root) continue;
\r
4089 j = binarysearch(split1+k*lsplit, splits, nsplits, lsplit, (int(*)(const void *, const void *))strcmp, &found);
\r
4090 if(found) nodes[i].label = countsplits[j]/ntree;
\r
4093 printf(" P = %6.4f ", Psame[itreeM]/ntree);
\r
4094 OutTreeN(F0, 1, PrLabel); FPN(F0);
\r
4096 fprintf(fout, " P = %6.4f ", Psame[itreeM]/ntree);
\r
4097 OutTreeN(fout, 1, PrLabel); FPN(fout);
\r
4100 if(pick1tree) printf("\ntree #%d collected into %s\n", pick1tree, pick1treef);
\r
4104 free(splitM); free(Psame);
\r
4105 fclose(fM); if(f1tree) fclose(f1tree);
\r
4107 free(split1); free(splits); free(countsplits); free(Psplit50);
\r
4108 free(ptrees); free(countptree); free(index);
\r
4117 int NSameBranch (char partition1[],char partition2[], int nib1,int nib2, int IBsame[])
\r
4119 /* counts the number of correct (identical) bipartitions.
\r
4120 nib1 and nib2 are the numbers of interior branches in the two trees
\r
4121 correctIB[0,...,(correctbranch-1)] lists the correct interior branches,
\r
4122 that is, interior branches in tree 1 that are also in tree 2.
\r
4123 IBsame[i]=1 if interior branch i is correct.
\r
4125 int i,j,k=0, nsamebranch;
\r
4128 for (i=0,nsamebranch=0; i<nib1; i++)
\r
4129 for(j=0,IBsame[i]=0; j<nib2; j++) {
\r
4130 if(strcmp(partition1+i*(com.ns+1), partition2+j*(com.ns+1)) == 0) {
\r
4131 nsamebranch++; IBsame[i]=1; break;
\r
4135 for (i=0,nsamebranch=0; i<nib1; i++)
\r
4136 for(j=0,IBsame[i]=0; j<nib2; j++) {
\r
4137 for (k=0;k<com.ns;k++)
\r
4138 if(partition1[i*(com.ns+1)+k] != partition2[j*(com.ns+1)+k]) break;
\r
4140 nsamebranch++; IBsame[i]=1; break;
\r
4144 return (nsamebranch);
\r
4148 int AddSpecies (int is, int ib)
\r
4150 /* Add species (is) to tree at branch ib. The tree currently has
\r
4151 is+1-1 species. Interior node numbers are increased by 2 to make
\r
4152 room for the new nodes.
\r
4153 if(com.clock && ib==tree.nbranch), the new species is added as an
\r
4154 outgroup to the rooted tree.
\r
4158 if(ib>tree.nbranch+1 || (ib==tree.nbranch && !com.clock)) return(-1);
\r
4160 if(ib==tree.nbranch && com.clock) {
\r
4161 FOR(i,tree.nbranch) FOR(j,2)
\r
4162 if (tree.branches[i][j]>=is) tree.branches[i][j]+=2;
\r
4163 it=tree.root; if(tree.root>=is) it+=2;
\r
4164 FOR(i,2) tree.branches[tree.nbranch+i][0]=tree.root=is+1;
\r
4165 tree.branches[tree.nbranch++][1]=it;
\r
4166 tree.branches[tree.nbranch++][1]=is;
\r
4169 FOR(i,tree.nbranch) FOR(j,2)
\r
4170 if (tree.branches[i][j]>=is) tree.branches[i][j]+=2;
\r
4171 it=tree.branches[ib][1];
\r
4172 tree.branches[ib][1]=is+1;
\r
4173 tree.branches[tree.nbranch][0]=is+1;
\r
4174 tree.branches[tree.nbranch++][1]=it;
\r
4175 tree.branches[tree.nbranch][0]=is+1;
\r
4176 tree.branches[tree.nbranch++][1]=is;
\r
4177 if (tree.root>=is) tree.root+=2;
\r
4186 static struct TREE
\r
4187 {struct TREEB tree; struct TREEN nodes[2*NS-1]; double x[NP]; }
\r
4188 treebest, treestar;
\r
4190 static struct TREE
\r
4191 {struct TREEB tree; struct TREEN nodes[2*NS-1];} treestar;
\r
/* Perturbation: heuristic tree search by NNI moves around the best tree.
   fout      - result file; every stage is logged to it.
   initialMP - start-tree selector.  Two start paths are visible, one calling
               StepwiseAdditionMP(space) and one reading the tree from
               com.treef; presumably initialMP chooses between them, but the
               test itself is on a line missing from this listing.
   space     - work space handed on to StepwiseAdditionMP() and TreeScore().
   Calls error2() when com.clock is set, or when initialMP is nonzero while
   com.cleandata is 0.
   The start tree is scored with TreeScore() and stored in treebest.  Each
   step restores treebest, applies NeighborNNI(ineighb) for every ineighb
   below (tree.nbranch-com.ns)*2, rescores, and copies the candidate into
   treebest whenever tree.lnL <= treebest.tree.lnL, so ties are accepted.
   The search leaves the step loop when improve is still 0, and treebest is
   copied back into the global tree and nodes before the final report.
   NOTE(review): the first report prints -tree.lnL, while the better-tree and
   best-tree reports print tree.lnL unnegated - confirm the intended sign.
   NOTE(review): the return value of fscanf() is not checked, and no
   fclose(ftree) is visible in this listing.
   NOTE(review): this listing carries fused line numbers and lacks several
   lines (braces, comment terminators, the declaration of ftree); restore
   them from the upstream file before compiling.
*/
4194 int Perturbation(FILE* fout, int initialMP, double space[]);
\r
4196 int Perturbation(FILE* fout, int initialMP, double space[])
\r
4198 /* heuristic tree search by the NNI tree perturbation algorithm.
\r
4199 Some trees are evaluated multiple times as no trees are kept.
\r
4200 This needs more work.
\r
4202 int step=0, ntree=0, nmove=0, improve=0, ineighb, i,j;
\r
4203 int sizetree=(2*com.ns-1)*sizeof(struct TREEN);
\r
4204 double *x=treestar.x;
\r
4207 if(com.clock) error2("\n\aerr: pertubation does not work with a clock yet.\n");
\r
4208 if(initialMP&&!com.cleandata)
\r
4209 error2("\ncannot get initial parsimony tree for gapped data yet.");
\r
4211 fprintf(fout, "\n\nHeuristic tree search by NNI perturbation\n");
\r
4213 if (noisy) printf("\nInitial tree from stepwise addition with MP:\n");
\r
4214 fprintf(fout, "\nInitial tree from stepwise addition with MP:\n");
\r
4215 StepwiseAdditionMP (space);
\r
4218 if (noisy) printf ("\nInitial tree read from file %s:\n", com.treef);
\r
4219 fprintf(fout, "\nInitial tree read from file.\n");
\r
4220 if ((ftree=fopen (com.treef,"r"))==NULL) error2("treefile not exist?");
\r
4221 fscanf (ftree, "%d%d", &i, &ntree);
\r
4222 if (i!=com.ns) error2("ns in the tree file");
\r
4223 if(ReadTreeN(ftree, &i, &j, 0, 1)) error2("err tree..");
\r
4226 if (noisy) { FPN (F0); OutTreeN(F0,0,0); FPN(F0); }
\r
4227 tree.lnL=TreeScore(x, space);
\r
4228 if (noisy) { OutTreeN(F0,0,1); printf("\n lnL = %.4f\n",-tree.lnL); }
\r
4229 OutTreeN(fout,1,1); fprintf(fout, "\n lnL = %.4f\n",-tree.lnL);
\r
4230 if (com.np>com.ntime) {
\r
4231 fprintf(fout, "\tparameters:");
\r
4232 for(i=com.ntime; i<com.np; i++) fprintf(fout, "%9.5f", x[i]);
\r
4236 treebest.tree=tree; memcpy(treebest.nodes, nodes, sizetree);
\r
4238 for (step=0; ; step++) {
\r
4239 for (ineighb=0,improve=0; ineighb<(tree.nbranch-com.ns)*2; ineighb++) {
\r
4240 tree=treebest.tree; memcpy (nodes, treebest.nodes, sizetree);
\r
4241 NeighborNNI (ineighb);
\r
4243 printf("\nTrying tree # %d (%d move[s]) \n", ++ntree,nmove);
\r
4244 OutTreeN(F0,0,0); FPN(F0);
\r
4246 tree.lnL=TreeScore(x, space);
\r
4247 if (noisy) { OutTreeN(F0,1,1); printf("\n lnL = %.4f\n",-tree.lnL);}
\r
4248 if (noisy && com.np>com.ntime) {
\r
4249 printf("\tparameters:");
\r
4250 for(i=com.ntime; i<com.np; i++) printf("%9.5f", x[i]);
\r
4253 if (tree.lnL<=treebest.tree.lnL) {
\r
4254 treebest.tree=tree; memcpy (treebest.nodes, nodes, sizetree);
\r
4255 improve=1; nmove++;
\r
4256 if (noisy) printf(" moving to this tree\n");
\r
4258 fprintf(fout, "\nA better tree:\n");
\r
4259 OutTreeN(fout,0,0); FPN(fout); OutTreeN(fout,1,1); FPN(fout);
\r
4260 fprintf(fout, "\nlnL = %.4f\n", tree.lnL);
\r
4261 if (com.np>com.ntime) {
\r
4262 fprintf(fout,"\tparameters:");
\r
4263 for(i=com.ntime; i<com.np; i++) fprintf(fout,"%9.5f", x[i]);
\r
4270 if (!improve) break;
\r
4272 tree=treebest.tree; memcpy (nodes, treebest.nodes, sizetree);
\r
4274 printf("\n\nBest tree found:\n");
\r
4275 OutTreeN(F0,0,0); FPN(F0); OutTreeN(F0,1,1); FPN(F0);
\r
4276 printf("\nlnL = %.4f\n", tree.lnL);
\r
4279 fprintf(fout, "\n\nBest tree found:\n");
\r
4280 OutTreeN(fout,0,0); FPN(fout); OutTreeN(fout,1,1); FPN(fout);
\r
4281 fprintf(fout, "\nlnL = %.4f\n", tree.lnL);
\r
/* _U0 and _step0 are scratch tables allocated by StepwiseAdditionMP() with
   com.npatt*_mnnode ints each, where _mnnode = 2*com.ns-1 at entry.

   StepwiseAdditionMP: parsimony tree search by adding one species at a time.
   space - work space passed to MPScoreStepwiseAddition().
   The search starts from the three-species star tree (tree.nbranch =
   tree.root = com.ns = 3), with _U0 seeded to the bit 1<<(com.z[i][h]-1)
   and _step0 to 0.  For each further species is, the current tree is saved
   in treestar, AddSpecies(is, j) is tried on every branch j of the saved
   tree and scored with MPScoreStepwiseAddition(is, space, 0).  Branch j
   becomes the current best when j==0, when its score is lower, or when the
   score ties and rndu()<.1.  The species is then attached at bestbranch and
   rescored with save=1.
   Before returning, tree.lnL is set to the last bestscore and both tables
   are freed.
   NOTE(review): the conversion %9ld is given an argument of type size_t;
   the types do not match on platforms where long and size_t differ.
   NOTE(review): the progress line prints -bestscore as the step count -
   confirm the intended sign.
   NOTE(review): this listing carries fused line numbers and lacks several
   lines (braces, comment terminators, the declaration of z0, the statement
   that re-increments com.ns inside the branch loop); restore them from the
   upstream file before compiling.
*/
4287 static int *_U0, *_step0, _mnnode;
\r
4288 /* up pass characters and changes for the star tree: each of size npatt*nnode*/
\r
4290 int StepwiseAdditionMP (double space[])
\r
4292 /* tree search by species addition.
\r
4295 int ns0=com.ns, is, i,j,h, tiestep=0,tie,bestbranch=0;
\r
4296 int sizetree=(2*com.ns-1)*sizeof(struct TREEN);
\r
4297 double bestscore=0,score;
\r
4299 _mnnode=com.ns*2-1;
\r
4300 _U0=(int*)malloc(com.npatt*_mnnode*sizeof(int));
\r
4301 _step0=(int*)malloc(com.npatt*_mnnode*sizeof(int));
\r
4303 printf("\n%9ld bytes for MP (U0 & N0)\n", 2*com.npatt*_mnnode*sizeof(int));
\r
4304 if (_U0==NULL || _step0==NULL) error2("oom U0&step0");
\r
4306 FOR (i,ns0) z0[i]=com.z[i];
\r
4307 tree.nbranch=tree.root=com.ns=3;
\r
4308 FOR (i, tree.nbranch) { tree.branches[i][0]=com.ns; tree.branches[i][1]=i; }
\r
4310 FOR (h, com.npatt)
\r
4312 { _U0[h*_mnnode+i]=1<<(com.z[i][h]-1); _step0[h*_mnnode+i]=0; }
\r
4313 for (is=com.ns,tie=0; is<ns0; is++) {
\r
4314 treestar.tree=tree; memcpy (treestar.nodes, nodes, sizetree);
\r
4316 for (j=0; j<treestar.tree.nbranch; j++,com.ns--) {
\r
4317 tree=treestar.tree; memcpy (nodes, treestar.nodes, sizetree);
\r
4319 AddSpecies (is, j);
\r
4320 score=MPScoreStepwiseAddition(is, space, 0);
\r
4322 OutTreeN(F0, 0, 0);
\r
4323 printf(" Add sp %d (ns=%d) at branch %d, score %.0f\n", is+1,com.ns,j+1,score);
\r
4325 if (j && score==bestscore) tiestep=1;
\r
4326 if (j==0 || score<bestscore || (score==bestscore&&rndu()<.1)) {
\r
4328 bestscore=score; bestbranch=j;
\r
4332 tree=treestar.tree; memcpy (nodes, treestar.nodes, sizetree);
\r
4334 AddSpecies (is, bestbranch);
\r
4335 score=MPScoreStepwiseAddition(is, space, 1);
\r
4338 { printf("\r Added %d [%5.0f steps]",is+1,-bestscore); fflush(F0);}
\r
4340 if (noisy>2) printf(" %d stages with ties, ", tie);
\r
4341 tree.lnL=bestscore;
\r
4342 free(_U0); free(_step0);
\r
/* MPScoreStepwiseAddition: parsimony score of the tree after species is has
   been attached, reusing the per-pattern state sets in _U0 and the step
   counts in _step0.
   is    - index of the newly added species.
   space - work space reinterpreted as int: U = (int*)space, N = U+_mnnode.
   save  - per the original note, 1 for the best tree so that _U0 and _step0
           are updated; the two memcpy calls perform that update (their
           guarding condition is on a line missing from this listing).
   For each pattern h, U and N are loaded from the tables, with node indices
   at or above is shifted down by 2.  The new tip gets
   U[is] = 1<<(com.z[is][h]-1) and N[is] = 0.  Only the ancestors of is
   below the root are recomputed: the intersection of the two son sets, or
   their union plus one step when the intersection is empty.  At the root
   the sets of its three sons are compared: the root adds 0 steps when all
   three share a state and 1 when some pair does, then the step counts of the
   three sons are added.  N[tree.root]*com.fpatt[h] is accumulated in score.
   NOTE(review): this listing carries fused line numbers and lacks several
   lines (braces, comment terminators, the declaration of score, the default
   root step count, the return); restore them before compiling.
*/
4346 double MPScoreStepwiseAddition (int is, double space[], int save)
\r
4348 /* this changes only the part of the tree affected by the newly added
\r
4350 save=1 for the best tree, so that _U0 & _step0 are updated
\r
4352 int *U,*N,U3[3], h,ist, i,father,son2,*pU0=_U0,*pN0=_step0;
\r
4355 U=(int*)space; N=U+_mnnode;
\r
4356 for (h=0,score=0; h<com.npatt; h++,pU0+=_mnnode,pN0+=_mnnode) {
\r
4357 FOR (i, tree.nnode) { U[i]=pU0[i-2*(i>=is)]; N[i]=pN0[i-2*(i>=is)]; }
\r
4358 U[is]=1<<(com.z[is][h]-1); N[is]=0;
\r
4359 for (ist=is; (father=nodes[ist].father)!=tree.root; ist=father) {
\r
4360 if ((son2=nodes[father].sons[0])==ist) son2=nodes[father].sons[1];
\r
4361 N[father]=N[ist]+N[son2];
\r
4362 if ((U[father]=U[ist]&U[son2])==0)
\r
4363 { U[father]=U[ist]|U[son2]; N[father]++; }
\r
4365 FOR (i,3) U3[i]=U[nodes[tree.root].sons[i]];
\r
4367 if (U3[0]&U3[1]&U3[2]) N[tree.root]=0;
\r
4368 else if (U3[0]&U3[1] || U3[1]&U3[2] || U3[0]&U3[2]) N[tree.root]=1;
\r
4369 FOR(i,3) N[tree.root]+=N[nodes[tree.root].sons[i]];
\r
4372 memcpy (pU0, U, tree.nnode*sizeof(int));
\r
4373 memcpy (pN0, N, tree.nnode*sizeof(int));
\r
4375 score+=N[tree.root]*com.fpatt[h];
\r
/* TreeScore: score the current global tree.
   x     - parameter vector; GetInitials() is called on it first.
   space - work space passed to the optimizer (ming2 or minB).
   Calls error2() when com.clock==2.  com.ntime is set to tree.nnode-com.ns
   when com.clock is nonzero and to tree.nbranch otherwise.
   Visible calls, in order: SetxBound() when com.method==0 or fromfile is 0,
   com.plfun(x,com.np) for a direct evaluation, ming2() when com.method==0
   or com.ntime==0, and minB().
   NOTE(review): com.np is assigned com.ntime at one point; the conditions
   around that assignment, around the plfun call and around the minB call are
   on lines missing from this listing, as are the declaration of i and the
   return statement.  Restore them from the upstream file before compiling.
*/
4381 double TreeScore(double x[], double space[])
\r
4383 static int fromfile=0;
\r
4385 double xb[NP][2], e=1e-9, lnL=0;
\r
4387 if(com.clock==2) error2("local clock in TreeScore");
\r
4388 com.ntime = com.clock ? tree.nnode-com.ns : tree.nbranch;
\r
4390 GetInitials(x, &i); /* this should be improved??? */
\r
4394 if(com.method==0 || !fromfile) SetxBound(com.np, xb);
\r
4397 lnL = com.plfun(x,com.np);
\r
4398 com.np = com.ntime;
\r
4401 if(com.method==0 || com.ntime==0)
\r
4402 ming2(NULL,&lnL,com.plfun,NULL,x,xb, space,e,com.np);
\r
4404 minB(NULL, &lnL, x, xb, e, space);
\r
/* StepwiseAddition: heuristic tree search that adds species one at a time
   and scores every candidate tree with TreeScore().
   fout  - result file; it is tested against NULL before the banner is
           written.
   space - work space passed to TreeScore().
   Calls error2() when com.ns<3 and prints a hint when com.ns>50.
   com.z and com.spname are saved in z0 and spname0 and then reassigned in
   the order given by order[], which starts as the identity; a swap
   statement that shuffles order[] is visible, but its loop header and
   guard are not.  The start tree has 2 species when com.clock is nonzero
   and 3 otherwise.  For each further species the current tree is saved in
   treestar, and treestar.tree.nbranch+(com.clock>0) candidate positions
   are scored.  A candidate replaces treebest (tree, nodes and x) when j==0,
   when its score is lower, or when the score ties and rndu()<.2.  treebest
   is then restored as the current tree and reported.  tree.lnL receives
   the last bestscore.
   NOTE(review): this listing carries fused line numbers and lacks several
   lines (braces, comment terminators, the call that attaches the species,
   the statement that re-increments com.ns, guards around the output);
   restore them from the upstream file before compiling.
*/
4410 int StepwiseAddition (FILE* fout, double space[])
\r
4412 /* heuristic tree search by species addition. Species are added in the order
\r
4413 of occurrence in the data.
\r
4414 Try to get good initial values.
\r
4416 char *z0[NS], *spname0[NS];
\r
4417 int ns0=com.ns, is, i,j, bestbranch=0, randadd=0, order[NS];
\r
4418 int sizetree=(2*com.ns-1)*sizeof(struct TREEN);
\r
4419 double bestscore=0,score, *x=treestar.x;
\r
4421 if(com.ns>50) printf("if this crashes, increase com.sspace?");
\r
4423 if(com.ns<3) error2("2 sequences, no need for tree search");
\r
4424 if (noisy) printf("\n\nHeuristic tree search by stepwise addition\n");
\r
4425 if (fout) fprintf(fout, "\n\nHeuristic tree search by stepwise addition\n");
\r
4426 FOR (i,ns0) { z0[i]=com.z[i]; spname0[i]=com.spname[i]; }
\r
4427 tree.nbranch=tree.root=com.ns=(com.clock?2:3);
\r
4429 FOR(i,ns0) order[i]=i;
\r
4432 { j=(int)(ns0*rndu()); is=order[i]; order[i]=order[j]; order[j]=is; }
\r
4433 if(noisy) FOR(i,ns0) printf(" %d", order[i]+1);
\r
4435 fputs("\nOrder of species addition:\n",fout);
\r
4436 FOR(i,ns0)fprintf(fout,"%3d %-s\n", order[i]+1,com.spname[order[i]]);
\r
4438 for(i=0; i<ns0; i++) {
\r
4439 com.z[i]=z0[order[i]];
\r
4440 com.spname[i]=spname0[order[i]];
\r
4444 for(i=0; i<tree.nbranch; i++) {
\r
4445 tree.branches[i][0]=com.ns; tree.branches[i][1]=i;
\r
4448 for (is=com.ns; is<ns0; is++) { /* add the is_th species */
\r
4449 treestar.tree=tree; memcpy (treestar.nodes, nodes, sizetree);
\r
4451 for (j=0; j<treestar.tree.nbranch+(com.clock>0); j++,com.ns--) {
\r
4452 tree=treestar.tree; memcpy(nodes, treestar.nodes, sizetree);
\r
4455 score=TreeScore(x, space);
\r
4457 { printf("\n "); OutTreeN(F0, 0, 0); printf("%12.3f",-score); }
\r
4459 if (j==0 || score<bestscore || (score==bestscore&&rndu()<.2)) {
\r
4460 treebest.tree=tree; memcpy(treebest.nodes, nodes, sizetree);
\r
4461 xtoy (x, treebest.x, com.np);
\r
4462 bestscore=score; bestbranch=j;
\r
4465 tree=treebest.tree; memcpy(nodes,treebest.nodes, sizetree);
\r
4466 xtoy (treebest.x, x, com.np);
\r
4470 printf("\n\nAdded sp. %d, %s [%.3f]\n",is+1,com.spname[is],-bestscore);
\r
4471 OutTreeN(F0,0,0); FPN(F0); OutTreeN(F0,1,0); FPN(F0);
\r
4472 if (com.np>com.ntime) {
\r
4473 printf("\tparameters:");
\r
4474 for(i=com.ntime; i<com.np; i++) printf("%9.5f", x[i]);
\r
4479 fprintf(fout,"\n\nAdded sp. %d, %s [%.3f]\n",
\r
4480 is+1, com.spname[is], -bestscore);
\r
4481 OutTreeN(fout,0,0); FPN(fout);
\r
4482 OutTreeN(fout,1,1); FPN(fout);
\r
4483 if (com.np>com.ntime) {
\r
4484 fprintf(fout, "\tparameters:");
\r
4485 for(i=com.ntime; i<com.np; i++) fprintf(fout, "%9.5f", x[i]);
\r
4491 tree.lnL=bestscore;
\r
/* hdID(i,j) maps a pair of indices to max*(max-1)/2+min; its arguments
   appear several times in the expansion, so they are evaluated repeatedly.

   StarDecomposition: tree search by star decomposition.
   fout  - result file; tested against NULL where visible.
   space - work space passed to TreeScore().
   Per the original note it returns (0,1,2,3) for the four-species problem;
   the visible code returns best when com.ns<=4.
   With com.runmode==1 the start tree is read from com.treef, otherwise the
   star tree of com.ns species is built.  The start tree is scored and
   stored in treebest together with its parameters.  Each stage scans the
   nodes of treebest from the highest index down and stops at the first
   node that is still multifurcating: more than 3 sons, or more than 2 sons
   when com.clock is nonzero, or more than 2+(node is root) sons otherwise.
   The search ends when no such node is found or when improve is 0.
   Otherwise treebest becomes treestar, com.ntime and com.np grow by one,
   and DecompTree() joins pairs of sons (ison1 < ison2) of that node in
   turn, up to ntreet = nson*(nson-1)/2 trees, or 3 for an unclocked root
   with 4 sons.  Each candidate is scored with TreeScore() and replaces
   treebest when tree.lnL < treebest.tree.lnL.
   A node-collapsing pass (CollapsNode on internal non-root nodes whose
   branch is below 1e-7) is guarded by collaps, which is initialised to 0
   and not assigned anywhere in the visible code.
   NOTE(review): the return value of fscanf() is not checked, and no
   fclose(ftree) is visible in this listing.
   NOTE(review): this listing carries fused line numbers and lacks several
   lines (braces, comment terminators, the declaration of x, guards around
   output and around the three ways of filling x); restore them from the
   upstream file before compiling.
*/
4497 int DecompTree (int inode, int ison1, int ison2);
\r
4498 #define hdID(i,j) (max2(i,j)*(max2(i,j)-1)/2+min2(i,j))
\r
4500 int StarDecomposition (FILE *fout, double space[])
\r
4502 /* automatic tree search by star decomposition, nhomo<=1
\r
4503 returns (0,1,2,3) for the 4s problem.
\r
4505 int status=0,stage=0, i,j, itree,ntree=0,ntreet,best=0,improve=1,collaps=0;
\r
4506 int inode, nson=0, ison1,ison2, son1, son2;
\r
4507 int sizetree=(2*com.ns-1)*sizeof(struct TREEN);
\r
4509 FILE *ftree, *fsum=frst;
\r
4511 if (com.runmode==1) { /* read the star-like tree from tree file */
\r
4512 if ((ftree=fopen (com.treef,"r"))==NULL)
\r
4513 error2("no treefile");
\r
4514 fscanf (ftree, "%d%d", &i, &ntree);
\r
4515 if (ReadTreeN(ftree, &i, &j, 0, 1)) error2("err tree file");
\r
4518 else { /* construct the star tree of ns species */
\r
4519 tree.nnode = (tree.nbranch=tree.root=com.ns)+1;
\r
4520 for (i=0; i<tree.nbranch; i++)
\r
4521 { tree.branches[i][0]=com.ns; tree.branches[i][1]=i; }
\r
4522 com.ntime = com.clock?1:tree.nbranch;
\r
4525 if (noisy) { printf("\n\nstage 0: "); OutTreeN(F0,0,0); }
\r
4526 if (fsum) { fprintf(fsum,"\n\nstage 0: "); OutTreeN(fsum,0,0); }
\r
4527 if (fout) { fprintf(fout,"\n\nstage 0: "); OutTreeN(fout,0,0); }
\r
4529 tree.lnL=TreeScore(x,space);
\r
4531 if (noisy) printf("\nlnL:%14.6f%6d", -tree.lnL, NFunCall);
\r
4532 if (fsum) fprintf(fsum,"\nlnL:%14.6f%6d", -tree.lnL, NFunCall);
\r
4534 fprintf(fout,"\nlnL(ntime:%3d np:%3d):%14.6f\n",
\r
4535 com.ntime, com.np, -tree.lnL);
\r
4536 OutTreeB (fout); FPN(fout);
\r
4537 FOR (i, com.np) fprintf (fout,"%9.5f", x[i]); FPN (fout);
\r
4539 treebest.tree=tree; memcpy(treebest.nodes,nodes,sizetree);
\r
4540 FOR (i,com.np) treebest.x[i]=x[i];
\r
4541 for (ntree=0,stage=1; ; stage++) {
\r
4542 for (inode=treebest.tree.nnode-1; inode>=0; inode--) {
\r
4543 nson=treebest.nodes[inode].nson;
\r
4544 if (nson>3) break;
\r
4545 if (com.clock) { if (nson>2) break; }
\r
4546 else if (nson>2+(inode==treebest.tree.root)) break;
\r
4548 if (inode==-1 || /*stage>com.ns-3+com.clock ||*/ !improve) { /* end */
\r
4549 tree=treebest.tree; memcpy (nodes, treebest.nodes, sizetree);
\r
4552 printf("\n\nbest tree: "); OutTreeN(F0,0,0);
\r
4553 printf(" lnL:%14.6f\n", -tree.lnL);
\r
4556 fprintf(fsum, "\n\nbest tree: "); OutTreeN(fsum,0,0);
\r
4557 fprintf(fsum, " lnL:%14.6f\n", -tree.lnL);
\r
4560 fprintf(fout, "\n\nbest tree: "); OutTreeN(fout,0,0);
\r
4561 fprintf(fout, " lnL:%14.6f\n", -tree.lnL);
\r
4562 OutTreeN(fout,1,1); FPN(fout);
\r
4566 treestar=treebest; memcpy(nodes,treestar.nodes,sizetree);
\r
4568 if (collaps && stage) {
\r
4569 printf ("\ncollapsing nodes\n");
\r
4570 OutTreeN(F0, 1, 1); FPN(F0);
\r
4572 tree=treestar.tree; memcpy(nodes, treestar.nodes, sizetree);
\r
4573 for (i=com.ns,j=0; i<tree.nnode; i++)
\r
4574 if (i!=tree.root && nodes[i].branch<1e-7)
\r
4575 { CollapsNode (i, treestar.x); j++; }
\r
4576 treestar.tree=tree; memcpy(treestar.nodes, nodes, sizetree);
\r
4579 fprintf (fout, "\n%d node(s) collapsed\n", j);
\r
4580 OutTreeN(fout, 1, 1); FPN(fout);
\r
4583 printf ("\n%d node(s) collapsed\n", j);
\r
4584 OutTreeN(F0, 1, 1); FPN(F0);
\r
4585 /* if (j) getchar (); */
\r
4589 ntreet = nson*(nson-1)/2;
\r
4590 if (!com.clock && inode==treestar.tree.root && nson==4) ntreet=3;
\r
4591 com.ntime++; com.np++;
\r
4594 printf ("\n\nstage %d:%6d trees, ntime:%3d np:%3d\nstar tree: ",
\r
4595 stage, ntreet, com.ntime, com.np);
\r
4596 OutTreeN(F0, 0, 0);
\r
4597 printf (" lnL:%10.3f\n", -treestar.tree.lnL);
\r
4600 fprintf (fsum, "\n\nstage %d:%6d trees, ntime:%3d np:%3d\nstar tree: ",
\r
4601 stage, ntreet, com.ntime, com.np);
\r
4602 OutTreeN(fsum, 0, 0);
\r
4603 fprintf (fsum, " lnL:%10.6f\n", -treestar.tree.lnL);
\r
4606 fprintf (fout,"\n\nstage %d:%6d trees\nstar tree: ", stage, ntreet);
\r
4607 OutTreeN(fout, 0, 0);
\r
4608 fprintf (fout, " lnL:%14.6f\n", -treestar.tree.lnL);
\r
4609 OutTreeN(fout, 1, 1); FPN (fout);
\r
4612 for (ison1=0,itree=improve=0; ison1<nson; ison1++)
\r
4613 for (ison2=ison1+1; ison2<nson&&itree<ntreet; ison2++,itree++,ntree++) {
\r
4614 DecompTree (inode, ison1, ison2);
\r
4615 son1=nodes[tree.nnode-1].sons[0];
\r
4616 son2=nodes[tree.nnode-1].sons[1];
\r
4618 for(i=com.np-1; i>0; i--) x[i]=treestar.x[i-1];
\r
4620 for (i=0; i<tree.nbranch; i++)
\r
4621 x[i]=max2(nodes[tree.branches[i][1]].branch*0.99, 0.0001);
\r
4623 for (i=1,x[0]=max2(x[0],.01); i<com.ntime; i++) x[i]=.5;
\r
4626 printf("\nS=%d:%3d/%d T=%4d ", stage,itree+1,ntreet,ntree+1);
\r
4627 OutTreeN(F0, 0, 0);
\r
4630 fprintf(fsum, "\nS=%d:%3d/%d T=%4d ", stage,itree+1,ntreet,ntree+1);
\r
4631 OutTreeN(fsum, 0, 0);
\r
4634 fprintf(fout,"\nS=%d:%4d/%4d T=%4d ",stage,itree+1,ntreet,ntree+1);
\r
4635 OutTreeN(fout, 0, 0);
\r
4637 tree.lnL=TreeScore(x, space);
\r
4639 if (tree.lnL<treebest.tree.lnL) {
\r
4640 treebest.tree=tree; memcpy (treebest.nodes, nodes, sizetree);
\r
4641 FOR(i,com.np) treebest.x[i]=x[i];
\r
4642 best=itree+1; improve=1;
\r
4645 printf("%6d%2c %+8.6f", NFunCall,(status?'?':'X'),treestar.tree.lnL-tree.lnL);
\r
4647 fprintf(fsum, "%6d%2c", NFunCall, (status?'?':'X'));
\r
4648 for (i=com.ntime; i<com.np; i++) fprintf(fsum, "%7.3f", x[i]);
\r
4649 fprintf(fsum, " %+8.6f", treestar.tree.lnL-tree.lnL);
\r
4653 fprintf(fout,"\nlnL(ntime:%3d np:%3d):%14.6f\n",
\r
4654 com.ntime, com.np, -tree.lnL);
\r
4655 OutTreeB (fout); FPN(fout);
\r
4656 FOR (i,com.np) fprintf(fout,"%9.5f", x[i]);
\r
4657 FPN(fout); fflush(fout);
\r
4659 } /* for (itree) */
\r
4660 son1=treebest.nodes[tree.nnode-1].sons[0];
\r
4661 son2=treebest.nodes[tree.nnode-1].sons[1];
\r
4662 } /* for (stage) */
\r
4664 if (com.ns<=4 && !improve && best) error2("strange");
\r
4666 if (com.ns<=4) return (best);
\r
/* DecompTree: restore treestar into tree and nodes, then join two sons of
   node inode under a new internal node.
   inode        - node of treestar to be resolved.
   ison1, ison2 - positions within nodes[inode].sons of the two sons to
                  join; the caller in this file passes ison1 < ison2.
   bt sums nodes[i].branch/tree.nbranch over all non-root nodes.  The new
   node is written at index tree.nnode with the two chosen sons and father
   inode.  It takes the slot of ison1 in the son list of inode, the entries
   from ison2 on are shifted down by one, and nson is decremented.
   The later assignments address the new node as tree.nnode-1 and give it
   branch = bt*fmid and age = nodes[inode].age*(1-fclock); presumably
   tree.nnode is updated on lines missing from this listing - confirm.
   NOTE(review): the shift loop reads sons[i+1] for i up to nson-1, that is
   one slot past the sons in use; confirm the array has room for it.
   NOTE(review): this listing carries fused line numbers and lacks several
   lines (braces, comment terminators, the return); restore them from the
   upstream file before compiling.
*/
4670 int DecompTree (int inode, int ison1, int ison2)
\r
4672 /* decompose treestar at NODE inode into tree and nodes[]
\r
4674 int i, son1, son2;
\r
4675 int sizetree=(2*com.ns-1)*sizeof(struct TREEN);
\r
4676 double bt, fmid=0.001, fclock=0.0001;
\r
4678 tree=treestar.tree; memcpy (nodes, treestar.nodes, sizetree);
\r
4679 for (i=0,bt=0; i<tree.nnode; i++)
\r
4680 if (i!=tree.root) bt+=nodes[i].branch/tree.nbranch;
\r
4682 nodes[tree.nnode].nson=2;
\r
4683 nodes[tree.nnode].sons[0]=son1=nodes[inode].sons[ison1];
\r
4684 nodes[tree.nnode].sons[1]=son2=nodes[inode].sons[ison2];
\r
4685 nodes[tree.nnode].father=inode;
\r
4686 nodes[son1].father=nodes[son2].father=tree.nnode;
\r
4688 nodes[inode].sons[ison1]=tree.nnode;
\r
4689 for (i=ison2; i<nodes[inode].nson; i++)
\r
4690 nodes[inode].sons[i]=nodes[inode].sons[i+1];
\r
4691 nodes[inode].nson--;
\r
4696 nodes[tree.nnode-1].branch=bt*fmid;
\r
4698 nodes[tree.nnode-1].age=nodes[inode].age*(1-fclock);
\r
4704 #ifdef REALSEQUENCE
\r
/* MultipleGenes: analyse each gene (partition) of a multi-gene data set
   separately.
   fout  - result file.
   fpair - array of pairwise-output files; entries 0 to 5 are used here.
   space - work space passed to PairwiseCodon() and StepwiseAddition().
   Calls error2() when com.ndata>1.
   com.ngene, com.npatt, com.lgene and com.posG are saved on entry.  For
   each gene ig, com.ls, com.lgene[0], com.npatt and com.posG[0..1] are set
   to describe that gene alone, every com.z[j] is advanced by posG0[ig]*nb
   and com.fpatt by posG0[ig], and com.pi is copied from com.piG[ig].  nb is
   3 when com.seqtype==1 and com.cleandata is 0, and 1 otherwise.
   A distance matrix is written (DistanceMatNG86 for CODONseq; a
   DistanceMatNuc call is also visible).  The analysis then dispatches on
   com.runmode: 0 calls Forestry(), -2 calls PairwiseCodon() or
   PairwiseAA(), and the final else calls StepwiseAddition().  Afterwards
   the pointers are moved back, and once the loop is done the saved values
   are restored, with com.ls set to lgene0[ngene0-1].
   NOTE(review): lgene0 is used as a cumulative count
   (lgene0[ig]-lgene0[ig-1]) - confirm against where com.lgene is filled.
   NOTE(review): the return value of scanf() is not checked, and the guard
   around the start-gene prompt is not visible.
   NOTE(review): this listing carries fused line numbers and lacks several
   lines (braces, comment terminators, else branches, the return); restore
   them from the upstream file before compiling.
*/
4707 int MultipleGenes (FILE* fout, FILE*fpair[], double space[])
\r
4709 /* This does the separate analysis of multiple-gene data.
\r
4710 Note that com.pose[] is not correct and so RateAncestor = 0 should be set
\r
4711 in baseml and codeml.
\r
4713 int ig=0, j, ngene0, npatt0, lgene0[NGENE], posG0[NGENE+1];
\r
4714 int nb = ((com.seqtype==1 && !com.cleandata) ? 3 : 1);
\r
4716 if(com.ndata>1) error2("multiple data sets & multiple genes?");
\r
4718 ngene0=com.ngene; npatt0=com.npatt;
\r
4719 for(ig=0; ig<ngene0; ig++) lgene0[ig]=com.lgene[ig];
\r
4720 for(ig=0; ig<ngene0+1; ig++) posG0[ig]=com.posG[ig];
\r
4724 printf("\nStart from gene (1-%d)? ", com.ngene);
\r
4725 scanf("%d", &ig);
\r
4729 for ( ; ig<ngene0; ig++) {
\r
4732 com.ls=com.lgene[0]= ig==0?lgene0[0]:lgene0[ig]-lgene0[ig-1];
\r
4733 com.npatt = ig==ngene0-1 ? npatt0-posG0[ig] : posG0[ig+1]-posG0[ig];
\r
4734 com.posG[0]=0; com.posG[1]=com.npatt;
\r
4735 FOR (j,com.ns) com.z[j]+=posG0[ig]*nb; com.fpatt+=posG0[ig];
\r
4736 xtoy (com.piG[ig], com.pi, com.ncode);
\r
4738 printf ("\n\nGene %2d ls:%4d npatt:%4d\n",ig+1,com.ls,com.npatt);
\r
4739 fprintf(fout,"\nGene %2d ls:%4d npatt:%4d\n",ig+1,com.ls,com.npatt);
\r
4740 fprintf(frst,"\nGene %2d ls:%4d npatt:%4d\n",ig+1,com.ls,com.npatt);
\r
4741 fprintf(frst1,"%d\t%d\t%d",ig+1,com.ls,com.npatt);
\r
4744 if(com.seqtype==CODONseq) {
\r
4745 DistanceMatNG86(fout,fpair[0],fpair[1],fpair[2],0);
\r
4746 if(com.codonf>=F1x4MG) com.pf3x4 = com.f3x4[ig];
\r
4750 DistanceMatNuc(fout,fpair[0],com.model,com.alpha);
\r
4753 if (com.runmode==0) Forestry(fout);
\r
4755 else if (com.runmode==-2) {
\r
4756 if(com.seqtype==CODONseq) PairwiseCodon(fout,fpair[3],fpair[4],fpair[5],space);
\r
4757 else PairwiseAA(fout,fpair[0]);
\r
4760 else StepwiseAddition(fout, space);
\r
4762 for(j=0; j<com.ns; j++) com.z[j] -= posG0[ig]*nb;
\r
4763 com.fpatt -= posG0[ig];
\r
4766 com.ngene = ngene0;
\r
4767 com.npatt = npatt0;
\r
4768 com.ls = lgene0[ngene0-1];
\r
4769 for(ig=0; ig<ngene0; ig++)
\r
4770 com.lgene[ig] = lgene0[ig];
\r
4771 for(ig=0; ig<ngene0+1; ig++)
\r
4772 com.posG[ig] = posG0[ig];
\r
/* printSeqsMgenes: write the sites of each gene to its own sequence file.
   For every gene ig the sites h with com.pose[h]==ig are selected, a file
   named Gene%d.seq (with ig+1) is opened for writing, and a header line
   with com.ns and lg*n31 is written.  For every sequence the name is then
   followed by the selected characters: com.z[j][h] when n31==1, or the
   three characters com.z[j][h*3+i] when the sequence type is CODONseq or
   CODON2AAseq.  lg is reported as the number of sites in the gene.
   Calls error2() when a file cannot be created.
   NOTE(review): the original note says posec or posei is used and that
   com.pose is not yet allocated, yet the visible code indexes com.pose -
   confirm which is current.
   NOTE(review): no fclose(fseq) is visible in this listing.
   NOTE(review): this listing carries fused line numbers and lacks several
   lines (braces, comment terminators, the declarations of seqf and fseq,
   the statement that counts lg, loop headers over h); restore them from the
   upstream file before compiling.
*/
4776 void printSeqsMgenes (void)
\r
4778 /* separate sites from different partitions (genes) into different files.
\r
4779 called before sequences are coded.
\r
4780 Note that this is called before PatternWeight and so posec or posei is used
\r
4781 and com.pose is not yet allocated.
\r
4782 In case of codons, com.ls is the number of codons.
\r
4786 int ig, lg, i,j,h;
\r
4787 int n31=(com.seqtype==CODONseq||com.seqtype==CODON2AAseq?3:1);
\r
4789 puts("Separating sites in genes into different files.\n");
\r
4790 for (ig=0, FPN(F0); ig<com.ngene; ig++) {
\r
4791 for (h=0,lg=0; h<com.ls; h++)
\r
4792 if(com.pose[h]==ig)
\r
4794 sprintf(seqf, "Gene%d.seq", ig+1);
\r
4795 if((fseq=fopen(seqf,"w"))==NULL) error2("file creation err.");
\r
4796 printf("%d sites in gene %d go to file %s\n", lg, ig+1,seqf);
\r
4798 fprintf (fseq, "%8d%8d\n", com.ns, lg*n31);
\r
4799 for (j=0; j<com.ns; j++) {
\r
4801 /* fprintf(fseq,"*\n>\n%s\n", com.spname[j]); */
\r
4802 fprintf(fseq,"%-20s ", com.spname[j]);
\r
4803 if (n31==1) { /* nucleotide or aa sequences */
\r
4805 if(com.pose[h]==ig)
\r
4806 fprintf(fseq, "%c", com.z[j][h]);
\r
4808 else { /* codon sequences */
\r
4810 if(com.pose[h]==ig) {
\r
4811 FOR (i,3) fprintf(fseq,"%c", com.z[j][h*3+i]);
\r
/* printSeqsMgenes2: write the sites of selected genes into the single file
   named by seqf, initialised to newseqs.
   genenames and wantgene are tables of 44 entries hard-coded for one
   particular data set; error2() is called when com.ngene!=44.  The header
   line carries com.ns, lg*n31 and the option letter G, followed by the
   number of kept genes and the com.lgene value of each kept gene; lg is
   the sum of com.lgene over the kept genes.  For every sequence the name
   is followed by the characters of each site whose gene com.pose[h] is
   flagged in wantgene: one character per site when n31==1, three
   otherwise.  The names of the kept genes are written at the end.
   NOTE(review): one visible statement inverts every wantgene flag; the
   condition guarding it is not visible in this listing.
   NOTE(review): in the visible order wantgene is indexed up to com.ngene-1
   before the com.ngene!=44 check is made.
   NOTE(review): no fclose(fseq) is visible in this listing.
   NOTE(review): this listing carries fused line numbers and lacks several
   lines (braces, comment terminators, the ends of both initialisers, the
   declarations of fseq and ngenekept, loop headers over h); restore them
   from the upstream file before compiling.
*/
4822 void printSeqsMgenes2 (void)
\r
4824 /* This print sites from certain genes into one file.
\r
4825 called before sequences are coded.
\r
4826 In case of codons, com.ls is the number of codons.
\r
4829 char seqf[20]="newseqs";
\r
4830 int ig, lg, i,j,h;
\r
4831 int n31=(com.seqtype==CODONseq||com.seqtype==CODON2AAseq?3:1);
\r
4834 char *genenames[44]={"atpa", "atpb", "atpe", "atpf", "atph", "petb", "petg", "psaa",
\r
4835 "psab", "psac", "psaj", "psba", "psbb", "psbc", "psbd", "psbe",
\r
4836 "psbf", "psbh", "psbi", "psbj", "psbk", "psbl", "psbn", "psbt",
\r
4837 "rl14", "rl16", "rl2", "rl20", "rl36", "rpob", "rpoc", "rpod", "rs11",
\r
4838 "rs12", "rs14", "rs18", "rs19", "rs2", "rs3", "rs4", "rs7", "rs8",
\r
4840 int wantgene[44]={0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
\r
4841 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
\r
4844 for(ig=0,lg=0; ig<com.ngene; ig++) wantgene[ig]=!wantgene[ig];
\r
4847 if(com.ngene!=44) error2("ngene!=44");
\r
4849 printf("%3d",com.pose[h]);
\r
4850 if((h+1)%20==0) FPN(F0); if((h+1)%500==0) getchar();
\r
4852 matIout(F0,com.lgene,1,com.ngene);
\r
4853 matIout(F0,wantgene,1,com.ngene);
\r
4855 for(ig=0,lg=0; ig<com.ngene; ig++)
\r
4856 if(wantgene[ig]) { ngenekept++; lg+=com.lgene[ig]; }
\r
4858 if((fseq=fopen(seqf,"w"))==NULL) error2("file creation err.");
\r
4859 fprintf(fseq,"%4d %4d G\nG %d ", com.ns, lg*n31, ngenekept);
\r
4860 FOR(ig,com.ngene) if(wantgene[ig]) fprintf(fseq," %3d", com.lgene[ig]);
\r
4863 for (j=0; j<com.ns; FPN(fseq),j++) {
\r
4864 fprintf(fseq,"%-20s ", com.spname[j]);
\r
4865 if (n31==1) { /* nucleotide or aa sequences */
\r
4867 if(wantgene[ig=com.pose[h]]) fprintf(fseq,"%c",com.z[j][h]);
\r
4869 else { /* codon sequences */
\r
4871 if (wantgene[ig=com.pose[h]]) {
\r
4872 FOR (i,3) fprintf(fseq,"%c", com.z[j][h*3+i]);
\r
4878 FOR(ig,com.ngene) if(wantgene[ig]) fprintf(fseq," %s", genenames[ig]);
\r
4885 #endif /* ifdef REALSEQUENCE */
\r
4886 #endif /* ifdef TREESEARCH */
\r
4887 #endif /* ifdef NODESTRUCTURE */
\r
/* File-scope work pointers for the score-only parsimony passes, followed by
   UpPassScoreOnly.  MPScore() in this file points them into its space
   argument.

   UpPassScoreOnly: post-order pass that computes, for the subtree rooted at
   inode, the list of most frequent states and the number of changes.
   inode - node to process; sons with nson>0 are processed first by
           recursion.
   K[s] counts the entries equal to s in the state lists chU of the sons.
   States whose count equals the maximum maxK form the new list of inode:
   NchU[inode] entries starting at chU[inode*com.ncode].
   Nsteps[inode] is nson-maxK plus the Nsteps of every son.
   NOTE(review): this listing carries fused line numbers and lacks several
   lines (braces, a comment terminator, the declaration of ison, i and j);
   restore them from the upstream file before compiling.
*/
4893 void UpPassScoreOnly (int inode);
\r
4894 void UpPassScoreOnlyB (int inode);
\r
4896 static int *Nsteps, *chUB; /* MM */
\r
4897 static char *Kspace, *chU, *NchU;
\r
4898 /* Elements of chU are character states (there are NchU of them). This
\r
4899 representation is used to speed up calculation for large trees.
\r
4900 Bit operations on chUB are performed for binary trees
\r
4903 void UpPassScoreOnly (int inode)
\r
4905 /* => VU, VL, & MM, theorem 2 */
\r
4907 char *K=Kspace, maxK; /* chMark (VV) not used in up pass */
\r
4909 FOR (i,nodes[inode].nson)
\r
4910 if (nodes[nodes[inode].sons[i]].nson>0)
\r
4911 UpPassScoreOnly (nodes[inode].sons[i]);
\r
4913 FOR (i,com.ncode) K[i]=0;
\r
4914 FOR (i,nodes[inode].nson)
\r
4915 for (j=0,ison=nodes[inode].sons[i]; j<NchU[ison]; j++)
\r
4916 K[(int)chU[ison*com.ncode+j]]++;
\r
4917 for (i=0,maxK=0; i<com.ncode; i++) if (K[i]>maxK) maxK=K[i];
\r
4918 for (i=0,NchU[inode]=0; i<com.ncode; i++)
\r
4919 if (K[i]==maxK) chU[inode*com.ncode+NchU[inode]++]=(char)i;
\r
4920 Nsteps[inode]=nodes[inode].nson-maxK;
\r
4921 FOR (i, nodes[inode].nson) Nsteps[inode]+=Nsteps[nodes[inode].sons[i]];
\r
/* UpPassScoreOnlyB: post-order parsimony pass on bit sets held in chUB.
   inode - node to process; sons with nson>0 are processed first by
           recursion, and only sons[0] and sons[1] are combined.
   chUB[inode] is the intersection of the bit sets of the two sons, or their
   union with one change counted when the intersection is empty.
   Nsteps[inode] is that change (0 or 1) plus the Nsteps of the two sons.
   NOTE(review): this listing carries fused line numbers and lacks the
   braces and the comment terminator; restore them before compiling.
*/
4924 void UpPassScoreOnlyB (int inode)
\r
4926 /* uses bit operation, for binary trees only
\r
4928 int ison1,ison2, i, change=0;
\r
4930 FOR (i,nodes[inode].nson)
\r
4931 if (nodes[nodes[inode].sons[i]].nson>0)
\r
4932 UpPassScoreOnlyB (nodes[inode].sons[i]);
\r
4934 ison1=nodes[inode].sons[0]; ison2=nodes[inode].sons[1];
\r
4935 if ((chUB[inode]=(chUB[ison1] & chUB[ison2]))==0)
\r
4936 { chUB[inode]=(chUB[ison1] | chUB[ison2]); change=1; }
\r
4937 Nsteps[inode]=change+Nsteps[ison1]+Nsteps[ison2];
\r
/* MPScore: parsimony score of the current tree, summed over site patterns
   with each pattern weighted by com.fpatt[h].
   space - work space carved into Nsteps[tree.nnode], followed either by
           chUB (bit-set path) or by chU, NchU and Kspace (character path).
   The bit-set path is taken when tree.nnode equals 2*com.ns-1, less 1 if
   the root has exactly three sons, and com.ncode<32.  Tips get the bit
   1<<com.z[i][h], UpPassScoreOnlyB() does the pass, and a root with more
   than two sons is finished here from the sets of its first three sons:
   0 extra changes when all three share a state, 1 when some pair does.
   On the other path each tip gets a one-entry state list and
   UpPassScoreOnly() is used.
   NOTE(review): a diagnostic block prints each pattern and calls exit(1)
   after every tenth one; its guard is not visible in this listing -
   confirm that it is disabled.
   NOTE(review): this listing carries fused line numbers and lacks several
   lines (braces, comment terminators, the declaration of score, the default
   value of change, an else, the return); restore them before compiling.
*/
4941 double MPScore (double space[])
\r
4943 /* calculates MP score for a given tree using Hartigan's (1973) algorithm.
\r
4944 sizeof(space) = nnode*sizeof(int)+(nnode+2)*ncode*sizeof(char).
\r
4945 Uses Nsteps[nnode], chU[nnode*ncode], NchU[nnode].
\r
4946 if(BitOperation), bit operations are used on binary trees.
\r
4948 int h,i, BitOperation,U[3],change;
\r
4951 Nsteps=(int*)space;
\r
4952 BitOperation=(tree.nnode==2*com.ns-1 - (nodes[tree.root].nson==3));
\r
4953 BitOperation=(BitOperation&&com.ncode<32);
\r
4954 if (BitOperation) chUB=Nsteps+tree.nnode;
\r
4956 chU=(char*)(Nsteps+tree.nnode);
\r
4957 NchU=chU+tree.nnode*com.ncode; Kspace=NchU+tree.nnode;
\r
4959 for (h=0,score=0; h<com.npatt; h++) {
\r
4960 FOR (i,tree.nnode) Nsteps[i]=0;
\r
4961 if (BitOperation) {
\r
4962 FOR (i,com.ns) chUB[i]=1<<(com.z[i][h]);
\r
4963 UpPassScoreOnlyB (tree.root);
\r
4964 if (nodes[tree.root].nson>2) {
\r
4965 FOR (i,3) U[i]=chUB[nodes[tree.root].sons[i]];
\r
4967 if (U[0]&U[1]&U[2]) change=0;
\r
4968 else if (U[0]&U[1] || U[1]&U[2] || U[0]&U[2]) change=1;
\r
4969 for (i=0,Nsteps[tree.root]=change; i<3; i++)
\r
4970 Nsteps[tree.root]+=Nsteps[nodes[tree.root].sons[i]];
\r
4973 else { /* polytomies, use characters */
\r
4975 {chU[i*com.ncode]=(char)(com.z[i][h]); NchU[i]=(char)1; }
\r
4976 for (i=com.ns; i<tree.nnode; i++) NchU[i]=0;
\r
4977 UpPassScoreOnly (tree.root);
\r
4979 score+=Nsteps[tree.root]*com.fpatt[h];
\r
4981 printf("\nh %3d: ", h+1);
\r
4982 FOR(i,com.ns) printf("%2d ", com.z[i][h]);
\r
4983 printf(" %6d ", Nsteps[tree.root]);
\r
4984 if((h+1)%10==0) exit(1);
\r
/* RemoveMPNinfSites: drop parsimony-noninformative site patterns in place.
   nsiteNinf - out: summed com.fpatt of the patterns classed as
               non-informative.
   Returns MPScoreNinf, the number of changes attributed to those patterns.
   For each pattern, markb[s] counts the sequences showing state s.  The
   pattern is non-informative when fewer than two states reach a count of
   2.  For such a pattern every state seen exactly once adds com.fpatt[h]
   to MPScoreNinf, and when no state reaches a count of 2 one com.fpatt[h]
   is taken off again.  Two further statements copy column h of com.z and
   entry h of com.fpatt to position com.npatt and advance com.npatt;
   presumably they form the else branch for informative patterns, but that
   else is on a line missing from this listing - confirm.
   NOTE(review): this listing carries fused line numbers and lacks several
   lines (braces, comment terminators); restore them before compiling.
*/
4991 double RemoveMPNinfSites (double *nsiteNinf)
\r
4993 /* Removes parsimony-noninformative sites and return the number of changes
\r
4995 Changes .z[], .fpatt[], .npatt, etc.
\r
4997 int h,j, it, npatt0=com.npatt, markb[NCODE], gt2;
\r
4998 double MPScoreNinf;
\r
5000 for (h=0,com.npatt=0,MPScoreNinf=0,*nsiteNinf=0; h<npatt0; h++) {
\r
5001 FOR (j, com.ncode) markb[j]=0;
\r
5002 FOR (j, com.ns) markb[(int)com.z[j][h]]++;
\r
5003 for (j=0,it=gt2=0; j<com.ncode; j++)
\r
5004 if (markb[j]>=2) { it++; gt2=1; }
\r
5005 if (it<2) { /* non-informative */
\r
5006 *nsiteNinf+=com.fpatt[h];
\r
5007 FOR (j,com.ncode) if(markb[j]==1) MPScoreNinf+=com.fpatt[h];
\r
5008 if (!gt2) MPScoreNinf-=com.fpatt[h];
\r
5011 FOR (j, com.ns) com.z[j][com.npatt]=com.z[j][h];
\r
5012 com.fpatt[com.npatt++]=com.fpatt[h];
\r
5015 return (MPScoreNinf);
\r
5021 #ifdef RECONSTRUCTION
\r
/* File-scope work pointers for parsimony reconstruction, followed by UpPass.
   InteriorStatesMP() in this file points chMark, chMarkU and chMarkL into
   its space argument; PathwayMP() allocates the PATHWay group.

   UpPass: post-order pass that fills chMarkU and chMarkL (VU and VL) and
   Nsteps for the subtree rooted at inode.
   inode - node to process; sons with nson>0 are processed first by
           recursion.
   K aliases the start of chMark and is used as a counter array: K[j] is the
   number of sons that have state j set in chMarkU.  With maxK the largest
   count, chMarkU[inode*n+i] is set where K[i]==maxK and chMarkL[inode*n+i]
   where K[i]==maxK-1.  Nsteps[inode] is nson-maxK plus the Nsteps of every
   son.
   NOTE(review): this listing carries fused line numbers and lacks several
   lines (braces, a comment terminator, the return); restore them from the
   upstream file before compiling.
*/
5023 static char *chMark, *chMarkU, *chMarkL; /* VV, VU, VL */
\r
5024 /* chMark, chMarkU, chMarkL (VV, VU, VL) have elements 0 or 1, marking
\r
5025 whether the character state is present in the set */
\r
5026 static char *PATHWay, *NCharaCur, *ICharaCur, *CharaCur;
\r
5027 /* PATHWay, NCharaCur, ICharaCur, CharaCur are for the current
\r
5031 int UpPass (int inode);
\r
5032 int DownPass (int inode);
\r
5034 int UpPass (int inode)
\r
5036 /* => VU, VL, & MM, theorem 2 */
\r
5037 int n=com.ncode, i, j;
\r
5038 char *K=chMark, maxK; /* chMark (VV) not used in up pass */
\r
5040 FOR (i,nodes[inode].nson)
\r
5041 if (nodes[nodes[inode].sons[i]].nson>0) UpPass (nodes[inode].sons[i]);
\r
5043 FOR (i, n) K[i]=0;
\r
5044 FOR (i,nodes[inode].nson)
\r
5045 FOR (j, n) if(chMarkU[nodes[inode].sons[i]*n+j]) K[j]++;
\r
5046 for (i=0,maxK=0; i<n; i++) if (K[i]>maxK) maxK=K[i];
\r
5047 for (i=0; i<n; i++) {
\r
5048 if (K[i]==maxK) chMarkU[inode*n+i]=1;
\r
5049 else if (K[i]==maxK-1) chMarkL[inode*n+i]=1;
\r
5051 Nsteps[inode]=nodes[inode].nson-maxK;
\r
5052 FOR (i, nodes[inode].nson) Nsteps[inode]+=Nsteps[nodes[inode].sons[i]];
\r
/* DownPass: pre-order pass that derives chMark (VV) for every descendant of
   inode from the chMark of its father and its own chMarkU and chMarkL.
   inode - node whose sons are updated; sons with nson>0 are then processed
           recursively.
   For each son the first loop searches for a state j that is set in the
   chMark of inode but not in the chMarkU of the son.  Two updates are
   visible: copying the chMark row of inode to the son, and setting each
   chMark entry of the son to
   chMarkU[son] || (chMark[inode] && chMarkL[son]).
   NOTE(review): the test that chooses between the two updates is on a line
   missing from this listing; presumably the copy applies when the search
   loop ran to completion - confirm against the upstream file.
   NOTE(review): this listing carries fused line numbers and lacks braces
   and the return; restore them before compiling.
*/
5056 int DownPass (int inode)
\r
5058 /* VU, VL => VV, theorem 3 */
\r
5059 int n=com.ncode, i, j, ison;
\r
5061 FOR (i,nodes[inode].nson) {
\r
5062 ison=nodes[inode].sons[i];
\r
5063 FOR (j,n) if (chMark[inode*n+j]>chMarkU[ison*n+j]) break;
\r
5065 FOR (j,n) chMark[ison*n+j]=chMark[inode*n+j];
\r
5068 chMark[ison*n+j] =
\r
5069 (char)(chMarkU[ison*n+j]||(chMark[inode*n+j]&&chMarkL[ison*n+j]));
\r
5071 FOR (i,nodes[inode].nson)
\r
5072 if (nodes[nodes[inode].sons[i]].nson>0) DownPass (nodes[inode].sons[i]);
\r
/* DownStates: call DownStatesOneNode(son, inode) for every son of inode
   whose index is at least com.ns; sons with a smaller index are skipped.
   NOTE(review): this listing carries fused line numbers and lacks the
   braces, the declaration of i and the return; restore them from the
   upstream file before compiling.
*/
5077 int DownStates (int inode)
\r
5079 /* VU, VL => NCharaCur, CharaCur, theorem 4 */
\r
5082 FOR (i,nodes[inode].nson)
\r
5083 if (nodes[inode].sons[i]>=com.ns)
\r
5084 DownStatesOneNode (nodes[inode].sons[i], inode);
\r
/* DownStatesOneNode: build the candidate state list of node ison given the
   state currently chosen for its father, then descend.
   ison   - node to update; returns 0 at once when ison is below com.ns.
   father - its father; chi = PATHWay[father-com.ns] is the father state.
   Three ways of filling the candidate list are visible:
     chi is in chMarkU of ison          -> the single candidate chi;
     else chi is in chMarkL of ison     -> every state in chMarkU, plus chi;
     a third loop                       -> every state in chMarkU
   (the else introducing the third loop is on a line missing here).
   PATHWay[in] is then set to the first candidate and ICharaCur[in] to 0,
   and DownStates(ison) is called when some son has an index of at least
   com.ns.
   NOTE(review): this listing carries fused line numbers and lacks braces,
   an else and the final return; restore them before compiling.
*/
5088 int DownStatesOneNode (int ison, int father)
\r
5090 /* States down inode, given father */
\r
5091 char chi=PATHWay[father-com.ns];
\r
5092 int n=com.ncode, j, in;
\r
/* in is the offset of ison within the per-interior-node arrays */
5094 if((in=ison-com.ns)<0) return (0);
\r
5095 if (chMarkU[ison*n+chi]) {
\r
5096 NCharaCur[in]=1; CharaCur[in*n+0]=chi;
\r
5098 else if (chMarkL[ison*n+chi]) {
\r
5099 for (j=0,NCharaCur[in]=0; j<n; j++)
\r
5100 if (chMarkU[ison*n+j] || j==chi) CharaCur[in*n+NCharaCur[in]++]=(char)j;
\r
5103 for (j=0,NCharaCur[in]=0; j<n; j++)
\r
5104 if (chMarkU[ison*n+j]) CharaCur[in*n+NCharaCur[in]++]=(char)j;
\r
/* start from the first candidate and reset the candidate cursor */
5106 PATHWay[in]=CharaCur[in*n+(ICharaCur[in]=0)];
\r
5107 FOR (j, nodes[ison].nson) if (nodes[ison].sons[j]>=com.ns) break;
\r
5108 if (j<nodes[ison].nson) DownStates (ison);
\r
/* InteriorStatesMP: parsimony reconstruction of interior-node states for
   site pattern h.
   job     - 0 returns 0 right after the up pass, with only *nchange set;
             otherwise the down pass is run and NChara and Chara are filled.
   h       - pattern index used as com.z[i][h].
   nchange - out: Nsteps[tree.root] after UpPass().
   NChara  - out: for interior node i (node index i+com.ns), the number of
             states set in chMark.
   Chara   - out: those states, with n = com.ncode slots per interior node.
   space   - carved into Nsteps[tree.nnode] followed by chMark, chMarkU and
             chMarkL of tree.nnode*n chars each; Nsteps and all three mark
             arrays are zeroed before the tips are marked.
   Before DownPass() the chMark row of the root is copied from its chMarkU
   row.
   NOTE(review): this listing carries fused line numbers and lacks braces,
   the comment terminator and the final return; restore them from the
   upstream file before compiling.
*/
5113 int InteriorStatesMP (int job, int h, int *nchange, char NChara[NS-1],
\r
5114 char Chara[(NS-1)*NCODE], double space[]);
\r
5116 int InteriorStatesMP (int job, int h, int *nchange, char NChara[NS-1],
\r
5117 char Chara[(NS-1)*NCODE], double space[])
\r
5119 /* sizeof(space) = nnode*sizeof(int)+3*nnode*ncode*sizeof(char)
\r
5120 job: 0=# of changes; 1:equivocal states
\r
5122 int n=com.ncode, i,j;
\r
5124 Nsteps=(int*)space; chMark=(char*)(Nsteps+tree.nnode);
\r
5125 chMarkU=chMark+tree.nnode*n; chMarkL=chMarkU+tree.nnode*n;
\r
5126 FOR (i,tree.nnode) Nsteps[i]=0;
\r
5127 FOR (i,3*n*tree.nnode) chMark[i]=0;
\r
5128 FOR (i,com.ns) chMark[i*n+com.z[i][h]]=chMarkU[i*n+com.z[i][h]]=1;
\r
5129 UpPass (tree.root);
\r
5130 *nchange=Nsteps[tree.root];
\r
5131 if (job==0) return (0);
\r
5132 FOR (i,n) chMark[tree.root*n+i]=chMarkU[tree.root*n+i];
\r
5133 DownPass (tree.root);
\r
5134 FOR (i,tree.nnode-com.ns)
\r
5135 for (j=0,NChara[i]=0; j<n; j++)
\r
5136 if (chMark[(i+com.ns)*n+j]) Chara[i*n+NChara[i]++]=(char)j;
\r
/* PathwayMP: list the reconstructions (pathways) of interior-node states
   for every site pattern; the original note cites Hartigan (1973).
   fout  - output file.
   space - work space passed to InteriorStatesMP().
   nid = tree.nbranch-com.ns+1 is used as the number of interior nodes.  One
   malloc of nid*(n+3) chars is split into PATHWay, NCharaCur and ICharaCur
   (nid chars each) and CharaCur (the rest).  visit lists the interior nodes
   as offsets from com.ns: the root first, then the interior end nodes of
   tree.branches in branch order.
   For each pattern the data column is printed, InteriorStatesMP(1, ...)
   supplies the candidate states, Equivoc flags the nodes with more than
   one candidate, and DownStates() sets the first pathway.  The enumeration
   loop then runs the last node of visit through its candidates, prints each
   pathway, recounts its changes over all branches as nchange0 and reports
   err:PathwayMP when that differs from nchange.  Earlier equivocal nodes
   are advanced in turn, with DownStates() or DownStatesOneNode() refreshing
   the nodes below them.
   NOTE(review): the result of malloc() is not checked in the visible code,
   and no free(PATHWay) is visible.
   NOTE(review): visit, nodeb and Equivoc store node indices or states in
   char; confirm the value range for large trees.
   NOTE(review): this listing carries fused line numbers and lacks several
   lines (braces, comment terminators, break statements, the return);
   restore them from the upstream file before compiling.
*/
5141 int PathwayMP (FILE *fout, double space[])
\r
5143 /* Hartigan, JA. 1973. Minimum mutation fits to a given tree.
\r
5144 Biometrics, 29:53-65.
\r
5146 char *pch=(com.seqtype==0?BASEs:AAs), visit[NS-1];
\r
5147 int n=com.ncode, nid=tree.nbranch-com.ns+1, it, i,j,k, h, npath;
\r
5148 int nchange, nchange0;
\r
5149 char nodeb[NNODE], Equivoc[NS-1];
\r
5151 PATHWay=(char*)malloc(nid*(n+3)*sizeof(char));
\r
5152 NCharaCur=PATHWay+nid; ICharaCur=NCharaCur+nid; CharaCur=ICharaCur+nid;
\r
5154 for (j=0,visit[i=0]=(char)(tree.root-com.ns); j<tree.nbranch; j++)
\r
5155 if (tree.branches[j][1]>=com.ns)
\r
5156 visit[++i]=(char)(tree.branches[j][1]-com.ns);
\r
5158 printf ("\nOrder in nodes: ");
\r
5159 FOR (j, nid) printf ("%4d", visit[j]+1+com.ns); FPN(F0);
\r
5161 for (h=0; h<com.npatt; h++) {
\r
5162 fprintf (fout, "\n%4d%6.0f ", h+1, com.fpatt[h]);
\r
5163 FOR (j, com.ns) fprintf (fout, "%c", pch[(int)com.z[j][h]]);
\r
5164 fprintf (fout, ": ");
\r
5166 FOR (j,com.ns) nodeb[j]=(char)(com.z[j][h]);
\r
5168 InteriorStatesMP (1, h, &nchange, NCharaCur, CharaCur, space);
\r
5169 ICharaCur[j=tree.root-com.ns]=0; PATHWay[j]=CharaCur[j*n+0];
\r
5170 FOR (j,nid) Equivoc[j]=(char)(NCharaCur[j]>1);
\r
5171 DownStates (tree.root);
\r
5173 for (npath=0; ;) {
\r
5174 for (j=0,k=visit[nid-1]; j<NCharaCur[k]; j++) {
\r
5175 PATHWay[k]=CharaCur[k*n+j]; npath++;
\r
5176 FOR (i, nid) fprintf (fout, "%c", pch[(int)PATHWay[i]]);
\r
5177 fprintf (fout, " ");
\r
5179 FOR (i,nid) nodeb[i+com.ns]=PATHWay[i];
\r
5180 for (i=0,nchange0=0; i<tree.nbranch; i++)
\r
5181 nchange0+=(nodeb[tree.branches[i][0]]!=nodeb[tree.branches[i][1]]);
\r
5182 if (nchange0!=nchange)
\r
5183 { puts("\a\nerr:PathwayMP"); fprintf(fout,".%d. ", nchange0);}
\r
5186 for (j=nid-2; j>=0; j--) {
\r
5187 if(Equivoc[k=visit[j]] == 0) continue;
\r
5188 if (ICharaCur[k]+1<NCharaCur[k]) {
\r
5189 PATHWay[k] = CharaCur[k*n + (++ICharaCur[k])];
\r
5190 DownStates (k+com.ns);
\r
5193 else { /* if (next equivocal node is not ancestor) update node k */
\r
5194 for (i=j-1; i>=0; i--) if (Equivoc[(int)visit[i]]) break;
\r
5196 for (it=k+com.ns,i=visit[i]+com.ns; ; it=nodes[it].father)
\r
5197 if (it==tree.root || nodes[it].father==i) break;
\r
5198 if (it==tree.root)
\r
5199 DownStatesOneNode(k+com.ns, nodes[k+com.ns].father);
\r
5205 fprintf (fout, " |%4d (%d)", npath, nchange);
\r
5215 #if(BASEML || CODEML)
\r
/* BootstrapSeq: write nboot (= com.bootstrap) resampled data sets to the file
   seqf, which is opened for writing with gfopen().
   - Stops through error2() when com.readpattern is set.
   - Sampling uses rndu().  One visible path draws sites[h] uniformly from all
     com.ls sites; the other draws, gene by gene, lg[ig] indices from that
     gene's slice of gmark[] (stratified sampling).  The conditions selecting
     between the two are on lines missing from this listing.
   - n31/gap are 3/1 for CODONseq or CODON2AAseq and 1/10 otherwise: each
     sampled site prints n31 characters of com.z, with a blank after every
     gap sites.
   - Side effect: com.lgene[] is zeroed, recounted by incrementing
     com.lgene[com.pose[h]] for each site, then turned into cumulative totals;
     the per-gene counts are kept in lg[].
   NOTE(review): format is initialised to 0 (PAML) and is not changed in the
   visible lines, so the PAUP/nexus branches look inactive -- confirm.
   NOTE(review): gmark is freed only when com.ngene>1; presumably it is
   allocated under the same condition -- confirm against the full source. */
5218 int BootstrapSeq (char* seqf)
\r
5220 /* This is called from within ReadSeq(), right after the sequences are read
\r
5221 and before the data are coded.
\r
5222 jackknife if(lsb<com.ls && com.ngene==1).
\r
5223 gmark[start+19] marks the position of the 19th site in that gene.
\r
5225 int iboot,nboot=com.bootstrap, h, is, ig, lg[NGENE]={0}, j, start;
\r
5226 int lsb=com.ls, n31=1,gap=10, gpos[NGENE];
\r
5227 int *sites=(int*)malloc(com.ls*sizeof(int)), *gmark=NULL;
\r
5228 FILE *fseq=(FILE*)gfopen(seqf, "w");
\r
5229 enum {PAML=0, PAUP};
\r
5230 char *datatype = (com.seqtype==AAseq?"protein":"dna");
\r
5231 char *paupstart="paupstart", *paupblock="paupblock", *paupend="paupend";
\r
5232 int format=0; /* 0: paml-phylip; 1:paup-nexus */
\r
5234 if(com.readpattern) error2("work on bootstrapping pattern data.");
\r
5236 printf("\nGenerating bootstrap samples in file %s\n", seqf);
\r
5237 if(format==PAUP) {
\r
5238 printf("%s, %s, & %s will be appended if existent.\n",
\r
5239 paupstart,paupblock,paupend);
\r
5240 appendfile(fseq, paupstart);
\r
5243 if(com.seqtype==CODONseq||com.seqtype==CODON2AAseq) { n31=3; gap=1; }
\r
5244 if(sites==NULL) error2("oom in BootstrapSeq");
\r
5246 if(lsb<com.ls) error2("jackknife when #gene>1");
\r
5247 if((gmark=(int*)malloc(com.ls*sizeof(int)))==NULL)
\r
5248 error2("oom in BootstrapSeq");
\r
5250 for(ig=0; ig<com.ngene; ig++) com.lgene[ig] = gpos[ig] = 0;
\r
5251 for(h=0; h<com.ls; h++) com.lgene[com.pose[h]]++;
\r
5252 for(j=0; j<com.ngene; j++) lg[j] = com.lgene[j];
\r
5253 for(j=1; j<com.ngene; j++) com.lgene[j] += com.lgene[j-1];
\r
5255 if(noisy && com.ngene>1) {
\r
5256 printf("Bootstrap uses stratefied sampling for %d partitions.", com.ngene);
\r
5257 printf("\nnumber of sites in each partition: ");
\r
5258 for(ig=0; ig<com.ngene; ig++) printf(" %4d", lg[ig]);
\r
5262 for(h=0; h<com.ls; h++) { /* create gmark[] */
\r
5264 start = (ig==0 ? 0 : com.lgene[ig-1]);
\r
5265 gmark[start + gpos[ig]++] = h;
\r
5269 for (iboot=0; iboot<nboot; iboot++,FPN(fseq)) {
\r
5271 for(h=0; h<lsb; h++) sites[h] = (int)(rndu()*com.ls);
\r
5273 for(ig=0; ig<com.ngene; ig++) {
\r
5274 start = (ig==0 ? 0 : com.lgene[ig-1]);
\r
5275 for(h=0; h<lg[ig]; h++)
\r
5276 sites[start+h] = gmark[start+(int)(rndu()*lg[ig])];
\r
5280 /* print out the bootstrap sample */
\r
5281 if(format==PAUP) {
\r
5282 fprintf(fseq,"\n\n[Replicate # %d]\n", iboot+1);
\r
5283 fprintf(fseq,"\nbegin data;\n");
\r
5284 fprintf(fseq," dimensions ntax=%d nchar=%d;\n", com.ns, lsb*n31);
\r
5285 fprintf(fseq," format datatype=%s missing=? gap=-;\n matrix\n", datatype);
\r
5287 for(is=0;is<com.ns;is++,FPN(fseq)) {
\r
5288 fprintf(fseq,"%-20s ", com.spname[is]);
\r
5289 for(h=0; h<lsb; h++) {
\r
5290 for(j=0; j<n31; j++) fprintf(fseq,"%c", com.z[is][sites[h]*n31+j]);
\r
5291 if((h+1)%gap==0) fprintf(fseq," ");
\r
5295 fprintf(fseq, " ;\nend;");
\r
5296 /* site partitions */
\r
5298 fprintf(fseq, "\n\nbegin paup;\n");
\r
5299 for(ig=0; ig<com.ngene; ig++)
\r
5300 fprintf(fseq, " charset partition%-2d = %-4d - %-4d;\n",
\r
5301 ig+1, (ig==0 ? 1 : com.lgene[ig-1]+1), com.lgene[ig]);
\r
5302 fprintf(fseq, "end;\n");
\r
5304 appendfile(fseq, paupblock);
\r
5308 fprintf(fseq,"%6d %6d\n", com.ns, lsb*n31);
\r
5310 fprintf(fseq,"%6d %6d G\nG %d ", com.ns, lsb*n31, com.ngene);
\r
5311 for(ig=0; ig<com.ngene; ig++)
\r
5312 fprintf(fseq," %4d", lg[ig]);
\r
5313 fprintf(fseq,"\n\n");
\r
5315 for(is=0; is<com.ns; is++,FPN(fseq)) {
\r
5316 fprintf(fseq,"%-20s ", com.spname[is]);
\r
5317 for(h=0; h<lsb; h++) {
\r
5318 for(j=0; j<n31; j++)
\r
5319 fprintf(fseq,"%c", com.z[is][sites[h]*n31+j]);
\r
5320 if((h+1)%gap==0) fprintf(fseq," ");
\r
5325 if(noisy && (iboot+1)%10==0) printf("\rdid sample #%d", iboot+1);
\r
5326 } /* for(iboot) */
\r
5327 free(sites); if(com.ngene>1) free(gmark);
\r
/* rell: compare ntree tree topologies from per-pattern log likelihoods.
   Input : flnf starts with three integers (ntree0, ls0, npatt0); error2() is
           called unless ntree0 is -1 or ntree, and ls0/npatt0 match
           com.ls/com.npatt.  For each tree one integer is read, then for each
           pattern two integers and one double (the log likelihood); the rest
           of the line is discarded with fgets().
   Output: a table on fout (li, Dli, SE, pKH, pSH, pRELL) and a summary line
           on the file-scope stream frst1.
   Return: -1 at the two visible read-error exits; other return paths are on
           lines missing from this listing.
   Work space: com.space is grown with realloc() when com.sspace < s and is
   carved into lnf, lnL0, lnL, pRELL, pSH, vdl (doubles) followed by btrees
   (ints).  A separate malloc block holds fpattB[npatt], sitelist[ls] and
   lgeneB[ngene]; lnLmSH[nr] is malloc'ed for the S-H step.
   pKH is 1-CDFNormal(-mdl/vdl[j]); for the ML tree, or when vdl[j] is below
   1e-6, pKH and pSH are reported as -1 and status is set so the N/A legend
   is printed.
   NOTE(review): in the initialiser of nr the inner test (com.ls<10000) is
   only reached when com.ls>=100000, so it can never be true and nr is either
   10000 or 500.  The two thresholds look swapped -- confirm intent.
   NOTE(review): the return value of the header fscanf() is not checked in the
   visible lines, so a short or empty file leaves ntree0/ls0/npatt0 unset.
   NOTE(review): s is printed with %lu; its declaration is not visible here,
   so the format/type match cannot be confirmed from this listing. */
5334 int rell (FILE*flnf, FILE*fout, int ntree)
\r
5336 /* This implements three methods for tree topology comparison. The first
\r
5337 tests the log likelihood difference using a normal approximation
\r
5338 (Kishino and Hasegawa 1989). The second does approximate bootstrap sampling
\r
5339 (the RELL method, Kishino and Hasegawa 1989, 1993). The third is a
\r
5340 modification of the K-H test with a correction for multiple comparison
\r
5341 (Shimodaira and Hasegawa 1999) .
\r
5342 The routine reads input from the file lnf.
\r
5344 fpattB[npatt] stores the counts of site patterns in the bootstrap sample,
\r
5345 with sitelist[ls] listing sites by gene, for stratified sampling.
\r
5347 com.space[ntree*(npatt+nr+5)]:
\r
5348 lnf[ntree*npatt] lnL0[ntree] lnL[ntree*nr] pRELL[ntree] pSH[ntree] vdl[ntree]
\r
5351 char *line, timestr[64];
\r
5352 int nr=(com.ls<100000?10000:(com.ls<10000?5000:500));
\r
5353 int lline=16000, ntree0,ns0=com.ns, ls0,npatt0;
\r
5354 int itree, h,ir,j,k, ig, mltree, nbtree, *btrees, status=0;
\r
5355 int *sitelist, *fpattB, *lgeneB, *psitelist;
\r
5356 double *lnf, *lnL0, *lnL, *pRELL, *lnLmSH, *pSH, *vdl, y, mdl, small=1e-5;
\r
5360 puts( "\nTree comparisons (Kishino & Hasegawa 1989; Shimodaira & Hasegawa 1999)");
\r
5361 fputs("\nTree comparisons (Kishino & Hasegawa 1989; Shimodaira & Hasegawa 1999)\n",fout);
\r
5362 fprintf(fout,"Number of replicates: %d\n", nr);
\r
5364 fscanf(flnf,"%d%d%d", &ntree0, &ls0, & npatt0);
\r
5365 if(ntree0!=-1 && ntree0!=ntree) error2("rell: input data file strange. Check.");
\r
5366 if (ls0!=com.ls || npatt0!=com.npatt)
\r
5367 error2("rell: input data file incorrect.");
\r
5368 s = ntree*(com.npatt+nr+5)*sizeof(double);
\r
5369 if(com.sspace < s) {
\r
5370 if(noisy) printf("resetting space to %lu bytes in rell.\n",s);
\r
5372 if((com.space=(double*)realloc(com.space,com.sspace))==NULL)
\r
5373 error2("oom space");
\r
5375 lnf=com.space; lnL0=lnf+ntree*com.npatt; lnL=lnL0+ntree; pRELL=lnL+ntree*nr;
\r
5376 pSH=pRELL+ntree; vdl=pSH+ntree; btrees=(int*)(vdl+ntree);
\r
5377 fpattB=(int*)malloc((com.npatt+com.ls+com.ngene)*sizeof(int));
\r
5378 if(fpattB==NULL) error2("oom fpattB in rell.");
\r
5379 sitelist=fpattB+com.npatt; lgeneB=sitelist+com.ls;
\r
5381 lline = (com.seqtype==1 ? ns0*8 : ns0) + 100;
\r
5382 lline = max2(16000, lline);
\r
5383 if((line=(char*)malloc((lline+1)*sizeof(char)))==NULL) error2("oom rell");
\r
5385 /* read lnf from file flnf, calculates lnL0[] & find ML tree */
\r
5386 for(itree=0,mltree=0; itree<ntree; itree++) {
\r
5387 printf("\r\tReading lnf for tree # %d", itree+1);
\r
5388 fscanf(flnf, "%d", &j);
\r
5390 { printf("\nerr: lnf, reading tree %d.",itree+1); return(-1); }
\r
5391 for(h=0,lnL0[itree]=0; h<com.npatt; h++) {
\r
5392 fscanf (flnf, "%d%d%lf", &j, &k, &y);
\r
5394 { printf("\nlnf, patt %d.",h+1); return(-1); }
\r
5395 fgets(line,lline,flnf);
\r
5396 lnL0[itree]+=com.fpatt[h]*(lnf[itree*com.npatt+h]=y);
\r
5398 if(itree && lnL0[itree]>lnL0[mltree]) mltree=itree;
\r
5400 printf(", done.\n");
\r
5403 /* calculates SEs (vdl) by sitewise comparison */
\r
5405 printtime(timestr);
\r
5406 printf("\r\tCalculating SEs by sitewise comparison");
\r
5407 FOR(itree,ntree) {
\r
5408 if(itree==mltree) { vdl[itree]=0; continue; }
\r
5409 mdl=(lnL0[itree]-lnL0[mltree])/com.ls;
\r
5410 for(h=0,vdl[itree]=0; h<com.npatt; h++) {
\r
5411 y=lnf[itree*com.npatt+h]-lnf[mltree*com.npatt+h];
\r
5412 vdl[itree]+=com.fpatt[h]*(y-mdl)*(y-mdl);
\r
5414 vdl[itree]=sqrt(vdl[itree]);
\r
5416 printf(", %s\n", printtime(timestr));
\r
5418 /* bootstrap resampling */
\r
5419 for(ig=0; ig<com.ngene; ig++)
\r
5420 lgeneB[ig]=(ig?com.lgene[ig]-com.lgene[ig-1]:com.lgene[ig]);
\r
5421 for(h=0,k=0;h<com.npatt;h++)
\r
5422 FOR(j,(int)com.fpatt[h]) sitelist[k++]=h;
\r
5424 zero(pRELL,ntree); zero(pSH,ntree); zero(lnL,ntree*nr);
\r
5425 for(ir=0; ir<nr; ir++) {
\r
5426 for(h=0; h<com.npatt; h++) fpattB[h]=0;
\r
5427 for(ig=0,psitelist=sitelist; ig<com.ngene; psitelist+=lgeneB[ig++]) {
\r
5428 for(k=0; k<lgeneB[ig]; k++) {
\r
5429 j=(int)(lgeneB[ig]*rndu());
\r
5434 for(h=0; h<com.npatt; h++) {
\r
5436 for(itree=0; itree<ntree; itree++)
\r
5437 lnL[itree*nr+ir] += fpattB[h]*lnf[itree*com.npatt+h];
\r
5440 /* y is the lnL for the best tree from replicate ir. */
\r
5441 for(j=1,nbtree=1,btrees[0]=0,y=lnL[ir]; j<ntree; j++) {
\r
5442 if(fabs(lnL[j*nr+ir]-y)<small)
\r
5443 btrees[nbtree++]=j;
\r
5444 else if (lnL[j*nr+ir]>y)
\r
5445 { nbtree=1; btrees[0]=j; y=lnL[j*nr+ir]; }
\r
5448 for(j=0; j<nbtree; j++)
\r
5449 pRELL[btrees[j]]+=1./(nr*nbtree);
\r
5450 if(nr>100 && (ir+1)%(nr/100)==0)
\r
5451 printf("\r\tRELL Bootstrapping.. replicate: %6d / %d %s",ir+1,nr, printtime(timestr));
\r
5456 if(fabs(1-sum(pRELL,ntree))>1e-6) error2("sum pRELL != 1.");
\r
5458 /* Shimodaira & Hasegawa correction (1999), working on lnL[ntree*nr] */
\r
5459 printf("\nnow doing S-H test");
\r
5460 if((lnLmSH=(double*)malloc(nr*sizeof(double))) == NULL) error2("oom in rell");
\r
5461 for(j=0; j<ntree; j++) /* step 3: centering */
\r
5462 for(ir=0,y=sum(lnL+j*nr,nr)/nr; ir<nr; ir++) lnL[j*nr+ir] -= y;
\r
5463 for(ir=0; ir<nr; ir++) {
\r
5464 for(j=1,lnLmSH[ir]=lnL[ir]; j<ntree; j++)
\r
5465 if(lnL[j*nr+ir]>lnLmSH[ir]) lnLmSH[ir] = lnL[j*nr+ir];
\r
5467 for(itree=0; itree<ntree; itree++) { /* steps 4 & 5 */
\r
5468 for(ir=0; ir<nr; ir++)
\r
5469 if(lnLmSH[ir]-lnL[itree*nr+ir] > lnL0[mltree]-lnL0[itree])
\r
5470 pSH[itree] += 1./nr;
\r
5473 fprintf(fout,"\n%6s %12s %9s %9s%8s%10s%9s\n\n",
\r
5474 "tree","li","Dli"," +- SE","pKH","pSH","pRELL");
\r
5476 mdl=lnL0[j]-lnL0[mltree];
\r
5477 if(j==mltree || fabs(vdl[j])<1e-6) { y=-1; pSH[j]=-1; status=-1; }
\r
5478 else y=1-CDFNormal(-mdl/vdl[j]);
\r
5479 fprintf(fout,"%6d%c%12.3f %9.3f %9.3f%8.3f%10.3f%9.3f\n",
\r
5480 j+1,(j==mltree?'*':' '),lnL0[j],mdl,vdl[j],y,pSH[j],pRELL[j]);
\r
5483 fprintf(frst1,"%3d %12.6f",mltree+1, lnL0[mltree]);
\r
5484 for(j=0;j<ntree;j++) fprintf(frst1," %5.3f",pRELL[j]);
\r
5486 for(j=0;j<ntree;j++) if(j!=mltree) fprintf(frst1,"%9.6f",pSH[j]);
\r
5489 fputs("\npKH: P value for KH normal test (Kishino & Hasegawa 1989)\n",fout);
\r
5490 fputs("pRELL: RELL bootstrap proportions (Kishino & Hasegawa 1989)\n",fout);
\r
5491 fputs("pSH: P value with multiple-comparison correction (MC in table 1 of Shimodaira & Hasegawa 1999)\n",fout);
\r
5492 if(status) fputs("(-1 for P values means N/A)\n",fout);
\r
5505 #ifdef RECONSTRUCTION
\r
5508 void ListAncestSeq(FILE *fout, char *zanc);
\r
/* ListAncestSeq: print the extant and the reconstructed sequences to fout.
   fout : destination stream.
   zanc : ancestral states, one row of com.npatt entries per internal node.
   A header line gives tree.nnode and lst*n31, where lst is com.npatt for
   pattern data and com.ls otherwise, and n31 is 3 for CODONseq/CODON2AAseq
   and 1 otherwise; a trailing "P" is added for pattern data.  The com.ns
   extant sequences (com.z[j]) and then the tree.nnode-com.ns ancestral rows
   are written with print1seq().  For pattern data the counts com.fpatt[] are
   appended, 15 per line. */
5510 void ListAncestSeq(FILE *fout, char *zanc)
\r
5512 /* zanc[nintern*com.npatt] holds ancestral sequences.
\r
5513 Extant sequences are coded if cleandata.
\r
5515 int wname=15, j,h, n31=(com.seqtype==CODONseq||com.seqtype==CODON2AAseq?3:1);
\r
5516 int lst=(com.readpattern?com.npatt:com.ls);
\r
5518 fputs("\n\n\nList of extant and reconstructed sequences\n\n",fout);
\r
5519 if(!com.readpattern) fprintf(fout, "%6d %6d\n\n", tree.nnode, lst*n31);
\r
5520 else fprintf(fout, "%6d %6d P\n\n", tree.nnode, lst*n31);
\r
5521 for(j=0;j<com.ns;j++,FPN(fout)) {
\r
5522 fprintf(fout,"%-*s ", wname,com.spname[j]);
\r
5523 print1seq(fout, com.z[j], lst, com.pose);
\r
5525 for(j=0;j<tree.nnode-com.ns;j++,FPN(fout)) {
\r
5526 fprintf(fout,"node #%-*d ", wname-5,com.ns+j+1);
\r
5527 print1seq(fout, zanc+j*com.npatt, lst, com.pose);
\r
5529 if(com.readpattern) {
\r
5530 for(h=0,FPN(fout); h<com.npatt; h++) {
\r
5531 fprintf(fout," %4.0f", com.fpatt[h]);
\r
5532 if((h+1)%15==0) FPN(fout);
\r
5534 fprintf(fout,"\n\n");
\r
5538 int ProbSitePattern(double x[], double *lnL, double fhsiteAnc[], double ScaleC[]);
\r
5539 int AncestralMarginal(FILE *fout, double x[], double fhsiteAnc[], double Sir[]);
\r
5540 int AncestralJointPPSG2000(FILE *fout, double x[]);
\r
/* ProbSitePattern: fill fhsiteAnc[h] with the probability of each site
   pattern and accumulate the negative log likelihood in *lnL (it is printed
   as - *lnL when noisy).
   x[]         : parameter vector handed to SetParameters(), SetPGene(),
                 SetPSiteClass() and ConditionalPNode().
   fhsiteAnc[] : output, one value per pattern.
   ScaleC[]    : output when com.ncatG>1 and node scaling is on; per pattern,
                 the largest summed scale factor S seen over the site classes.
   com.ncatG<=1: fhsiteAnc[h] is the sum over states i of
                 com.pi[i]*conP[h*ncode+i] at the root; *lnL also subtracts
                 every com.nodeScaleF entry for the pattern times fpatt[h].
   com.ncatG>1 : each class contributes com.freqK[ir]*fh*y, where y rescales
                 the class to the common factor ScaleC[h]; when a class has a
                 larger S the running fhsiteAnc[h] is rescaled instead.
   Calls error2() when com.ncatG>1 and com.method==1.
   Several lines (loop bodies, closing braces, possibly #if guards and the
   return statement) are missing from this listing. */
5543 int ProbSitePattern (double x[], double *lnL, double fhsiteAnc[], double ScaleC[])
\r
5545 /* This calculates probabilities for observing site patterns fhsiteAnc[].
\r
5546 The following notes are for ncatG>1 and method = 0.
\r
5547 The routine calculates the scale factor common to all site classes (ir),
\r
5548 that is, the greatest of the scale factors among the ir classes.
\r
5549 The common scale factors will be used in scaling nodes[].conP for all site
\r
5550 classes for all nodes in PostProbNode(). Small conP for some site classes
\r
5551 will be essentially set to 0, which is fine.
\r
5556 Ziheng Yang, 7 Sept, 2001
\r
5558 int ig, i,k,h, ir;
\r
5559 double fh, S, y=1;
\r
5561 if(com.ncatG>1 && com.method==1) error2("don't need this?");
\r
5562 if (SetParameters(x)) puts ("par err.");
\r
5563 for(h=0; h<com.npatt; h++)
\r
5565 if (com.ncatG<=1) {
\r
5566 for (ig=0,*lnL=0; ig<com.ngene; ig++) {
\r
5567 if(com.Mgene>1) SetPGene(ig, 1, 1, 0, x);
\r
5568 ConditionalPNode (tree.root, ig, x);
\r
5569 for (h=com.posG[ig]; h<com.posG[ig+1]; h++) {
\r
5570 for (i=0; i<com.ncode; i++)
\r
5571 fhsiteAnc[h] += com.pi[i]*nodes[tree.root].conP[h*com.ncode+i];
\r
5572 *lnL -= log(fhsiteAnc[h])*com.fpatt[h];
\r
5573 if(com.NnodeScale)
\r
5574 for(k=0; k<com.NnodeScale; k++)
\r
5575 *lnL -= com.nodeScaleF[k*com.npatt+h]*com.fpatt[h];
\r
5580 for (ig=0; ig<com.ngene; ig++) {
\r
5581 if(com.Mgene>1 || com.nalpha>1)
\r
5582 SetPGene(ig, com.Mgene>1, com.Mgene>1, com.nalpha>1, x);
\r
5583 for (ir=0; ir<com.ncatG; ir++) {
\r
5585 if(com.seqtype==1 && com.NSsites /* && com.model */) IClass=ir;
\r
5587 SetPSiteClass(ir, x);
\r
5588 ConditionalPNode (tree.root, ig, x);
\r
5590 for (h=com.posG[ig]; h<com.posG[ig+1]; h++) {
\r
5591 for (i=0,fh=0; i<com.ncode; i++)
\r
5592 fh += com.pi[i]*nodes[tree.root].conP[h*com.ncode+i];
\r
5594 if(com.NnodeScale) {
\r
5595 for(k=0,S=0; k<com.NnodeScale; k++) S += com.nodeScaleF[k*com.npatt+h];
\r
5597 if(ir==0) ScaleC[h]=S;
\r
5598 else if(S<=ScaleC[h]) y=exp(S-ScaleC[h]);
\r
5599 else /* change of scale factor */
\r
5600 { fhsiteAnc[h] *= exp(ScaleC[h]-S); ScaleC[h]=S; }
\r
5602 fhsiteAnc[h] += com.freqK[ir]*fh*y;
\r
5606 for(h=0, *lnL=0; h<com.npatt; h++)
\r
5607 *lnL -= log(fhsiteAnc[h])*com.fpatt[h];
\r
5608 if(com.NnodeScale)
\r
5609 for(h=0; h<com.npatt; h++)
\r
5610 *lnL -= ScaleC[h]*com.fpatt[h];
\r
5612 if(noisy) printf("\nlnL = %12.6f from ProbSitePattern.\n", - *lnL);
\r
5618 int updateconP(double x[], int inode);
\r
/* PostProbNode: posterior distribution of the state at internal node inode
   for every site pattern, obtained after ReRootTree(inode).
   inode        : node index; (inode-com.ns) selects the row of za[]/pnode[].
   x[]          : parameter vector passed to the likelihood helpers.
   fhsiteAnc[]  : per-pattern probabilities, used as the normalising constant
                  in the first branch.
   ScaleC[]     : common scale factors in the first branch; the same storage
                  is aliased as Sir[] (per-class factors) in the last branch.
   lnL          : accumulator for the negative log likelihood.
   pChar1node[] : output, com.npatt*n posterior probabilities (zeroed first).
   za[], pnode[]: output, best state per pattern and its probability.
   Branches visible in this listing:
     1. com.ncatG>1 && com.method!=1: conP is recomputed for each site class
        and summed with weight com.freqK[ir]*com.pi[i]; error2() is called if
        a pattern's probabilities do not sum to 1 within 1e-5.
     2. com.alpha==0 && com.ncatG<=1: one pass over the root conP after
        updateconP(); normalised by fh.
     3. otherwise (ncatG>1, method 1): per-class conP/scale factors are read
        at class offsets; an existing comment marks one conP read as
        "wrong right now".
   Conditions choosing between some adjacent statements (for example the two
   conP reads in branch 3) are on lines missing from this listing. */
5620 int PostProbNode (int inode, double x[], double fhsiteAnc[], double ScaleC[],
\r
5621 double *lnL, double pChar1node[], char za[], double pnode[])
\r
5623 /* This calculates the full posterior distribution for node inode at each site.
\r
5624 Below are special comments on gamma models and method = 0.
\r
5626 Marginal reconstruction under gamma models, with complications arising from
\r
5627 scaling on large trees (com.NnodeScale) and the use of two iteration algorithms
\r
5631 The algorithm is different depending on method, which makes the code clumsy.
\r
5633 gamma method=0 or 2 (simultaneous updating):
\r
5634 nodes[].conP overlap and get destroyed for different site classes (ir)
\r
5635 The same for scale factors com.nodeScaleF.
\r
5636 fhsite[npatt] and common scale factors ScaleC[npatt] are calculated for all
\r
5637 nodes before this routine is called. The common scale factors are then
\r
5638 used to adjust nodes[].conP before they are summed across ir classes.
\r
5640 gamma method=1 (one branch at a time):
\r
5641 nodes[].conP (and com.nodeScaleF if node scaling is on) are separately
\r
5642 allocated for different site classes (ir), so that all info needed is
\r
5643 available. Use of updateconP() saves computation on large trees.
\r
5644 Scale factor Sir[] is of size ncatG and reused for each h.
\r
5646 int n=com.ncode, i,k,h, ir,it=-1,best, ig;
\r
5647 double fh, y,pbest, *Sir=ScaleC, S;
\r
5650 zero(pChar1node,com.npatt*n);
\r
5652 /* nodes[].conP are reused for different ir, with or without node scaling */
\r
5653 if (com.ncatG>1 && com.method!=1) {
\r
5654 ReRootTree(inode);
\r
5655 for (ig=0; ig<com.ngene; ig++) {
\r
5656 if(com.Mgene>1 || com.nalpha>1)
\r
5657 SetPGene(ig,com.Mgene>1,com.Mgene>1,com.nalpha>1,x);
\r
5658 for (ir=0; ir<com.ncatG; ir++) {
\r
5660 if(com.seqtype==1 && com.NSsites) IClass=ir;
\r
5662 SetPSiteClass(ir, x);
\r
5663 ConditionalPNode (tree.root, ig, x);
\r
5665 for (h=com.posG[ig]; h<com.posG[ig+1]; h++) {
\r
5666 if(!com.NnodeScale) S=1;
\r
5668 for(k=0,S=0; k<com.NnodeScale; k++)
\r
5669 S += com.nodeScaleF[k*com.npatt+h];
\r
5670 S=exp(S-ScaleC[h]);
\r
5672 for (i=0,fh=0; i<n; i++) {
\r
5673 y = com.freqK[ir]*com.pi[i]*nodes[tree.root].conP[h*n+i] * S;
\r
5675 pChar1node[h*n+i] += y ;
\r
5680 for (h=0; h<com.npatt; h++) {
\r
5681 for(i=0,y=0;i<n;i++) y += (pChar1node[h*n+i]/=fhsiteAnc[h]);
\r
5682 if (fabs(1-y)>1e-5)
\r
5683 error2("PostProbNode: sum!=1");
\r
5684 for (i=0,best=-1,pbest=-1; i<n; i++)
\r
5685 if (pChar1node[h*n+i]>pbest) {
\r
5687 pbest=pChar1node[h*n+i];
\r
5689 za[(inode-com.ns)*com.npatt+h] = (char)best;
\r
5690 pnode[(inode-com.ns)*com.npatt+h] = pbest;
\r
5691 *lnL -= log(fhsiteAnc[h])*com.fpatt[h];
\r
5692 if(com.NnodeScale) *lnL -= ScaleC[h]*com.fpatt[h];
\r
5695 else { /* all other cases: (alpha==0 || method==1) */
\r
5696 for(i=0; i<tree.nnode; i++) com.oldconP[i] = 1;
\r
5697 ReRootTree(inode);
\r
5698 updateconP(x,inode);
\r
5699 if (com.alpha==0 && com.ncatG<=1) { /* (alpha==0) (ngene>1 OK) */
\r
5700 for (ig=0; ig<com.ngene; ig++) {
\r
5701 if(com.Mgene==2 || com.Mgene==4)
\r
5702 xtoy(com.piG[ig], com.pi, n);
\r
5703 for (h=com.posG[ig]; h<com.posG[ig+1]; h++) {
\r
5704 for (i=0,fh=0,pbest=0,best=-1; i<n; i++) {
\r
5705 y = com.pi[i]*nodes[tree.root].conP[h*n+i];
\r
5708 { pbest=y; best=i; }
\r
5709 pChar1node[h*n+i] = y;
\r
5711 za[(inode-com.ns)*com.npatt+h] = (char)best;
\r
5712 pnode[(inode-com.ns)*com.npatt+h] = (pbest/=fh);
\r
5713 for (i=0; i<n; i++)
\r
5714 pChar1node[h*n+i] /= fh;
\r
5715 *lnL -= log(fh)*(double)com.fpatt[h];
\r
5716 for(i=0; i<com.NnodeScale; i++)
\r
5717 *lnL -= com.nodeScaleF[i*com.npatt+h]*com.fpatt[h];
\r
5721 else { /* (ncatG>1 && method = 1) This should work for NSsites? */
\r
5722 for (ig=0; ig<com.ngene; ig++) {
\r
5723 if(com.Mgene==2 || com.Mgene==4)
\r
5724 xtoy(com.piG[ig], com.pi, n);
\r
5725 for (h=com.posG[ig]; h<com.posG[ig+1]; h++) {
\r
5726 if(com.NnodeScale)
\r
5727 for(ir=0,it=0; ir<com.ncatG; ir++) { /* Sir[it] is the biggest */
\r
5728 for(k=0,Sir[ir]=0; k<com.NnodeScale; k++)
\r
5729 Sir[ir] += com.nodeScaleF[ir*com.NnodeScale*com.npatt + k*com.npatt+h];
\r
5730 if(Sir[ir]>Sir[it]) it = ir;
\r
5732 for (i=0,fh=0; i<n; i++) {
\r
5733 for(ir=0; ir<com.ncatG; ir++) {
\r
5735 y = nodes[tree.root].conP[ir*(tree.nnode-com.ns)*com.npatt*n+h*n+i];
\r
5737 y = nodes[tree.root].conP[h*n+i]; /* wrong right now */
\r
5738 y *= com.pi[i]*com.freqK[ir];
\r
5739 if(com.NnodeScale) y *= exp(Sir[ir]-Sir[it]);
\r
5741 pChar1node[h*n+i] += y;
\r
5745 for (i=0,best=0; i<n; i++) {
\r
5746 pChar1node[h*n+i] /= fh;
\r
5747 if(i && pChar1node[h*n+best]<pChar1node[h*n+i])
\r
5750 za[(inode-com.ns)*com.npatt+h] = (char)best;
\r
5751 pnode[(inode-com.ns)*com.npatt+h] = pChar1node[h*n+best];
\r
5752 *lnL -= log(fh)*com.fpatt[h];
\r
5753 if(com.NnodeScale) *lnL -= Sir[it]*com.fpatt[h];
\r
5762 void getCodonNode1Site(char codon[], char zanc[], int inode, int site);
\r
/* AncestralMarginal: marginal ancestral reconstruction for every internal
   node, with all reports written to fout.
   fout        : report stream.
   x[]         : parameter vector (SetParameters, PostProbNode).
   fhsiteAnc[] : per-pattern probabilities, passed through to PostProbNode.
   Sir[]       : scale-factor work space, passed through to PostProbNode.
   Storage: pChar1node holds com.npatt*n doubles.  pnode is one malloc block
   of nid*com.npatt doubles followed by the char array zanc.  When coding is
   set, pbestAA is one block of nid*lsc doubles followed by the char array
   bestAA.
   Flow visible in this listing:
     - for each internal node, PostProbNode() reroots the tree and fills
       pChar1node/zanc/pnode; with com.verbose>1 the full distribution is
       printed;
     - for codon data the codon probabilities are summed into pAA[] through
       GeneticCode[com.icode][FROM61[j]]; for coding nucleotide data the three
       position probabilities are multiplied and stop codons are accumulated
       separately, then renormalised by 1-y;
     - com.oldconP[] is reset to 0 and the tree is rerooted at the old root;
     - accuracy: pMAPnode is the fpatt-weighted mean of pnode divided by
       com.ls, pMAPnodeA the product of pnode^fpatt;
     - per-site best states, changes along branches, ListAncestSeq() and
       ChangesSites() output follow.
   NOTE(review): pMAPnode and pMAPnodeA are fixed arrays of NS-1 elements but
   are filled for nid = tree.nnode-com.ns nodes; this relies on nid <= NS-1
   -- confirm.
   NOTE(review): only free(pbestAA) is visible; the release of pChar1node and
   pnode is presumably on lines missing from this listing -- confirm.
   Several #if guards and if/else lines are also missing here, so adjacent
   statements that look sequential may belong to alternative branches. */
5764 int AncestralMarginal (FILE *fout, double x[], double fhsiteAnc[], double Sir[])
\r
5766 /* Ancestral reconstruction for each interior node. This works under both
\r
5767 the one rate and gamma rates models.
\r
5768 pnode[npatt*nid] stores the prob for the best chara at a node and site.
\r
5769 The best character is kept in za[], coded as 0,...,n-1.
\r
5770 The data may be coded (com.cleandata==1) or not (com.cleandata==0).
\r
5771 Call ProbSitePattern() before running this routine.
\r
5772 pMAPnode[NS-1], pMAPnodeA[] stores the MAP probabilities (accuracy)
\r
5773 for a site and for the entire sequence, respectively.
\r
5775 The routine PostProbNode calculates pChar1node[npatt*ncode], which stores
\r
5776 prob for each char at each pattern at each given node inode. The rest of
\r
5777 the routine is to output the results in different ways.
\r
5779 Deals with node scaling to avoid underflows. See above
\r
5780 (Z. Yang, 2 Sept 2001)
\r
5782 char *pch=(com.seqtype==0 ? BASEs : (com.seqtype==2 ? AAs: (com.seqtype==5?BASEs5:BINs)));
\r
5783 char *zanc, str[4]="",codon[2][4]={" "," "}, aa[4]="";
\r
5784 char *sitepatt=(com.readpattern?"pattern":"site");
\r
5785 int n=com.ncode, inode, ic=0,b[3],i,j,k1=-1,k2=-1,c1,c2,k3, lsc=com.ls;
\r
5786 int lst=(com.readpattern?com.npatt:com.ls);
\r
5787 int h,hp,ig, best, oldroot=tree.root;
\r
5788 int nid=tree.nnode-com.ns, nchange;
\r
5789 double lnL=0, fh, y, pbest, *pChar1node, *pnode, p1=-1,p2=-1;
\r
5790 double pMAPnode[NS-1], pMAPnodeA[NS-1], smallp=0.001;
\r
5792 char coding=0, *bestAA=NULL;
\r
5793 double pAA[21], *pbestAA=NULL, ns,na, nst,nat,S,N;
\r
5794 /* bestAA[nid*npatt], pbestAA[nid*npatt]:
\r
5795 To reconstruct aa seqs using codon or nucleotide seqs, universal code */
\r
5797 if(noisy) puts("Marginal reconstruction.");
\r
5799 fprintf (fout,"\n(1) Marginal reconstruction of ancestral sequences\n");
\r
5800 fprintf (fout,"(eqn. 4 in Yang et al. 1995 Genetics 141:1641-1650).\n");
\r
5801 pChar1node = (double*)malloc(com.npatt*n*sizeof(double));
\r
5802 pnode = (double*)malloc((nid*com.npatt+1)*(sizeof(double)+sizeof(char)));
\r
5803 if (pnode==NULL||pChar1node==NULL)
\r
5804 error2("oom pnode");
\r
5805 zanc = (char*)(pnode+nid*com.npatt);
\r
5808 if(com.seqtype==0 && com.ls%3==0 && com.coding) { coding=1; lsc=com.ls/3; }
\r
5810 if(com.seqtype==1) { coding=1; lsc=com.npatt; }
\r
5812 if((pbestAA=(double*)malloc(nid*lsc*2*sizeof(double)))==NULL)
\r
5813 error2("oom pbestAA");
\r
5814 bestAA = (char*)(pbestAA+nid*lsc);
\r
5817 if(SetParameters(x)) puts("par err.");
\r
5819 if(com.verbose>1)
\r
5820 fprintf(fout,"\nProb distribs at nodes, those with p < %.3f not listed\n", smallp);
\r
5822 /* This loop reroots the tree at inode & reconstructs sequence at inode */
\r
5823 for (inode=com.ns; inode<tree.nnode; inode++) {
\r
5825 PostProbNode (inode, x, fhsiteAnc, Sir, &lnL, pChar1node, zanc, pnode);
\r
5826 if(noisy) printf ("\tNode %3d: lnL = %12.6f\n", inode+1, -lnL);
\r
5828 /* print Prob distribution at inode if com.verbose>1 */
\r
5829 if (com.verbose>1) {
\r
5830 fprintf(fout,"\nProb distribution at node %d, by %s\n", inode+1, sitepatt);
\r
5831 fprintf(fout,"\n%7s Freq Data\n\n", sitepatt);
\r
5832 for(h=0;h<lst;h++,FPN(fout)) {
\r
5833 hp = (!com.readpattern ? com.pose[h] : h);
\r
5834 fprintf (fout,"%7d%7.0f ", h+1, com.fpatt[hp]);
\r
5835 print1site(fout, hp);
\r
5836 fputs(": ", fout);
\r
5837 for(j=0; j<n; j++) {
\r
5838 if (com.seqtype!=CODONseq) {
\r
5843 strcpy(str, CODONs[j]);
\r
5844 fprintf(fout,"%s(%5.3f) ", str, pChar1node[hp*n+j]);
\r
5847 } /* if (verbose) */
\r
5850 /* find the best amino acid for coding seqs */
\r
5852 if(com.seqtype==CODONseq)
\r
5853 for(h=0; h<com.npatt; h++) {
\r
5854 for(j=0; j<20; j++) pAA[j]=0;
\r
5855 for(j=0; j<n; j++) {
\r
5856 i = GeneticCode[com.icode][FROM61[j]];
\r
5857 pAA[i] += pChar1node[h*n+j];
\r
5859 /* matout(F0,pAA,1,20); */
\r
5860 for(j=0,best=0,pbest=0; j<20; j++)
\r
5861 if(pAA[j]>pbest) { pbest=pAA[j]; best=j; }
\r
5862 bestAA[(inode-com.ns)*com.npatt+h] = (char)best;
\r
5863 pbestAA[(inode-com.ns)*com.npatt+h] = pbest;
\r
5866 if(com.seqtype==0 && coding) { /* coding seqs analyzed by baseml */
\r
5867 for(h=0; h<lsc; h++) { /* h-th codon */
\r
5868 /* sums up probs for the 20 AAs for each node. Stop codons are
\r
5869 ignored, and so those probs are approximate. */
\r
5870 for(j=0,y=0; j<20; j++) pAA[j]=0;
\r
5871 for(k1=0; k1<4; k1++) for(k2=0; k2<4; k2++) for(k3=0; k3<4; k3++) {
\r
5872 ic = k1*16+k2*4+k3;
\r
5873 b[0] = com.pose[h*3+0]*n+k1;
\r
5874 b[1] = com.pose[h*3+1]*n+k2;
\r
5875 b[2] = com.pose[h*3+2]*n+k3;
\r
5876 fh = pChar1node[b[0]]*pChar1node[b[1]]*pChar1node[b[2]];
\r
5877 if((ic=GeneticCode[com.icode][ic])==-1)
\r
5882 if(fabs(1-y-sum(pAA,20))>1e-6) error2("AncestralMarginal strange?");
\r
5884 for(j=0,best=0,pbest=0; j<20; j++)
\r
5885 if(pAA[j]>pbest) { pbest=pAA[j]; best=j; }
\r
5887 bestAA[(inode-com.ns)*com.ls/3+h] = (char)best;
\r
5888 pbestAA[(inode-com.ns)*com.ls/3+h] = pbest/(1-y);
\r
5891 } /* for (inode), This closes the big loop */
\r
5893 for(i=0; i<tree.nnode; i++)
\r
5894 com.oldconP[i] = 0;
\r
5895 ReRootTree(oldroot);
\r
5897 if(com.seqtype==0 && coding && !com.readpattern) { /* coding seqs analyzed by baseml */
\r
5898 fputs("\nBest amino acids reconstructed from nucleotide model.\n",fout);
\r
5899 fputs("Prob at each node listed by amino acid (codon) site\n",fout);
\r
5900 fputs("(Please ignore if not relevant)\n\n",fout);
\r
5901 for(h=0;h<com.ls/3;h++,FPN(fout)) {
\r
5902 fprintf(fout,"%4d ", h+1);
\r
5903 for(j=0; j<com.ns; j++) {
\r
5904 getCodonNode1Site(codon[0], NULL, j, h);
\r
5905 Codon2AA(codon[0], aa, com.icode, &i);
\r
5906 fprintf(fout," %s(%c)",codon[0],AAs[i]);
\r
5908 fprintf(fout,": ");
\r
5909 for (j=0; j<tree.nnode-com.ns; j++) {
\r
5910 fprintf(fout," %1c (%5.3f)", AAs[bestAA[j*com.ls/3+h]], pbestAA[j*com.ls/3+h]);
\r
5915 /* calculate accuracy measures */
\r
5916 zero(pMAPnode,nid); fillxc(pMAPnodeA, 1., nid);
\r
5917 for (inode=0; inode<tree.nnode-com.ns; inode++) {
\r
5918 for(h=0; h<com.npatt; h++) {
\r
5919 pMAPnode[inode] += com.fpatt[h]*pnode[inode*com.npatt+h]/com.ls;
\r
5920 pMAPnodeA[inode] *= pow(pnode[inode*com.npatt+h], com.fpatt[h]);
\r
5924 fprintf(fout,"\nProb of best state at each node, listed by %s", sitepatt);
\r
5925 if (com.ngene>1) fprintf(fout,"\n\n%7s (g) Freq Data: \n", sitepatt);
\r
5926 else fprintf(fout,"\n\n%7s Freq Data: \n", sitepatt);
\r
5928 for(h=0; h<lst; h++) {
\r
5929 hp = (!com.readpattern ? com.pose[h] : h);
\r
5930 fprintf(fout,"\n%4d ",h+1);
\r
5931 if (com.ngene>1) { /* which gene the site is from */
\r
5932 for(ig=1; ig<com.ngene; ig++)
\r
5933 if(hp<com.posG[ig]) break;
\r
5934 fprintf(fout,"(%d)",ig);
\r
5936 fprintf(fout," %5.0f ", com.fpatt[hp]);
\r
5937 print1site(fout, hp);
\r
5938 fprintf(fout, ": ");
\r
5940 for(j=0; j<nid; j++) {
\r
5941 if (com.seqtype!=CODONseq)
\r
5942 fprintf(fout,"%c(%5.3f) ", pch[(int)zanc[j*com.npatt+hp]],pnode[j*com.npatt+hp]);
\r
5945 ic = zanc[j*com.npatt+hp];
\r
5946 Codon2AA(CODONs[ic], aa, com.icode, &i);
\r
5947 fprintf(fout," %s %1c %5.3f (%1c %5.3f)",
\r
5948 CODONs[ic], AAs[i], pnode[j*com.npatt+hp], AAs[(int)bestAA[j*com.npatt+hp]], pbestAA[j*com.npatt+hp]);
\r
5952 if(noisy && (h+1)%100000==0) printf("\r\tprinting, %d sites done", h+1);
\r
5954 if(noisy && h>=100000) printf("\n");
\r
5956 /* Map changes onto branches
\r
5957 k1 & k2 are the two characters; p1 and p2 are the two probs. */
\r
5959 if(!com.readpattern) {
\r
5960 fputs("\n\nSummary of changes along branches.\n",fout);
\r
5961 fputs("Check root of tree for directions of change.\n",fout);
\r
5962 if(!com.cleandata && com.seqtype==1)
\r
5963 fputs("Counts of n & s are incorrect along tip branches with ambiguity data.\n",fout);
\r
5964 for(j=0; j<tree.nbranch; j++,FPN(fout)) {
\r
5965 inode = tree.branches[j][1];
\r
5967 fprintf(fout,"\nBranch %d:%5d..%-2d",j+1,tree.branches[j][0]+1,inode+1);
\r
5968 if(inode<com.ns) fprintf(fout," (%s) ",com.spname[inode]);
\r
5971 lsc = (com.seqtype==1 ? com.ls : com.ls/3);
\r
5972 for (h=0,nst=nat=0; h<lsc; h++) {
\r
5973 getCodonNode1Site(codon[0], zanc, inode, h);
\r
5974 getCodonNode1Site(codon[1], zanc, tree.branches[j][0], h);
\r
5975 difcodonNG(codon[0], codon[1], &S, &N, &ns,&na, 0, com.icode);
\r
5979 fprintf(fout," (n=%4.1f s=%4.1f)",nat,nst);
\r
5981 fprintf(fout,"\n\n");
\r
5982 for(h=0; h<lst; h++) {
\r
5983 hp = (!com.readpattern ? com.pose[h] : h);
\r
5984 if (com.seqtype!=CODONseq) {
\r
5986 k2 = pch[(int)com.z[inode][hp]];
\r
5988 k2 = pch[(int)zanc[(inode-com.ns)*com.npatt+hp]];
\r
5989 p2 = pnode[(inode-com.ns)*com.npatt+hp];
\r
5991 k1 = pch[ zanc[(tree.branches[j][0]-com.ns)*com.npatt+hp] ];
\r
5992 p1 = pnode[(tree.branches[j][0]-com.ns)*com.npatt+hp];
\r
5996 if(inode<com.ns) {
\r
5997 strcpy(codon[1], CODONs[com.z[inode][hp]]);
\r
5998 k2 = GetAASiteSpecies(inode, hp);
\r
6001 strcpy(codon[1], CODONs[(int)zanc[(inode-com.ns)*com.npatt+hp]]);
\r
6002 k2 = AAs[(int)bestAA[(inode-com.ns)*com.npatt+hp]];
\r
6003 p2 = pbestAA[(inode-com.ns)*com.npatt+hp];
\r
6005 strcpy(codon[0], CODONs[(int)zanc[(tree.branches[j][0]-com.ns)*com.npatt+hp]]);
\r
6006 k1 = AAs[(int)bestAA[(tree.branches[j][0]-com.ns)*com.npatt+hp]];
\r
6007 p1 = pbestAA[(tree.branches[j][0]-com.ns)*com.npatt+hp];
\r
6009 if(strcmp(codon[0],codon[1])) {
\r
6011 fprintf(fout,"\t%4d %s (%c) %.3f -> %s (%c)\n", h+1,codon[0],k1,p1, codon[1],k2);
\r
6013 fprintf(fout,"\t%4d %s (%c) %.3f -> %s (%c) %.3f\n",h+1,codon[0],k1,p1, codon[1],k2,p2);
\r
6018 if(k1==k2) continue;
\r
6019 fprintf(fout,"\t%4d ",h+1);
\r
6022 if(sitelabels) fprintf(fout," %5s ",sitelabels[h]);
\r
6024 if(inode<com.ns) fprintf(fout,"%c %.3f -> %1c\n",k1,p1,k2);
\r
6025 else fprintf(fout,"%c %.3f -> %1c %.3f\n",k1,p1,k2,p2);
\r
6031 ListAncestSeq(fout, zanc);
\r
6032 fprintf(fout,"\n\nOverall accuracy of the %d ancestral sequences:", nid);
\r
6033 matout2(fout,pMAPnode, 1, nid, 9,5); fputs("for a site.\n",fout);
\r
6034 matout2(fout,pMAPnodeA, 1, nid, 9,5); fputs("for the sequence.\n", fout);
\r
6036 /* best amino acid sequences from codonml */
\r
6038 if(com.seqtype==1) {
\r
6039 fputs("\n\nAmino acid sequences inferred by codonml.\n",fout);
\r
6040 if(!com.cleandata)
\r
6041 fputs("Results unreliable for sites with alignment gaps.\n",fout);
\r
6042 for(inode=0; inode<nid; inode++) {
\r
6043 fprintf(fout,"\nNode #%-10d ",com.ns+inode+1);
\r
6044 for(h=0; h<lst; h++) {
\r
6045 hp = (!com.readpattern ? com.pose[h] : h);
\r
6046 fprintf(fout, "%c", AAs[(int)bestAA[inode*com.npatt+hp]]);
\r
6047 if((h+1)%10==0) fputc(' ', fout);
\r
6053 ChangesSites(fout, coding, zanc);
\r
6057 if(coding) free(pbestAA);
\r
/* getCodonNode1Site: copy the codon found at codon site `site` of node inode
   into codon[].
   codon[] : output buffer (callers in this file pass char[4] rows).
   zanc[]  : ancestral states, indexed (inode-com.ns)*com.npatt + pattern.
   inode   : node index.
   site    : codon site; mapped through com.pose[] unless com.readpattern.
   For CODONseq the codon string comes from the CODONs table, indexed either
   by the zanc entry or by com.z[inode][hp].  Otherwise the three bases are
   looked up one by one in BASEs from zanc or com.z at positions
   com.pose[site*3+i].  The test choosing between zanc and com.z is on lines
   missing from this listing (presumably internal node versus tip -- confirm). */
6062 void getCodonNode1Site(char codon[], char zanc[], int inode, int site)
\r
6064 /* this is used to retrieve the codon from a codon sequence for codonml
\r
6065 or coding sequence in baseml, used in ancestral reconstruction
\r
6066 zanc has ancestral sequences
\r
6067 site is codon site
\r
6071 for(i=0; i<3; i++) /* to force crashes */
\r
6073 if(com.seqtype==CODONseq) {
\r
6074 hp = (!com.readpattern ? com.pose[site] : site);
\r
6077 strcpy(codon, CODONs[zanc[(inode-com.ns)*com.npatt+hp]]);
\r
6079 strcpy(codon, CODONs[com.z[inode][hp]]);
\r
6082 else { /* baseml coding reconstruction */
\r
6084 for(i=0; i<3; i++)
\r
6085 codon[i] = BASEs[(int)zanc[(inode-com.ns)*com.npatt+com.pose[site*3+i]]];
\r
6087 for(i=0; i<3; i++) codon[i] = BASEs[ com.z[inode][com.pose[site*3+i]] ];
\r
/* ChangesSites: list and count changes at each site implied by the
   reconstructed ancestral states.
   frst   : main report stream.
   coding : non-zero when nucleotide data are treated as codons; with
            com.seqtype==0 this divides the number of sites ls1 by 3.
   zanc   : ancestral states (see getCodonNode1Site).
   Coding data: for each site every non-root node is compared with its father
   through getCodonNode1Site() and difcodonNG().  S and N are normalised by
   b = S+N and printed (times 3) with Sd and Nd to frst and to the file-scope
   stream frst1.  A tail probability p is then built by summing
   C*pow(q,d)*pow(1-q,Sd+Nd-d) for d = 0..Nd with q = N (printed with "-"),
   or for d = 0..Sd with q = S (printed with "+"); C starts at 1 and is
   multiplied by (Sd+Nd-d)/(d+1) after each term.
   Other data: for each site the father and node characters k1,k2 are printed
   for every non-root node, followed by the count d.
   btotal is accumulated as the sum of nodes[].branch over non-root nodes.
   NOTE(review): b = S+N and Sd+Nd are used as divisors with no zero guard in
   the visible lines; a guard may sit on lines missing from this listing --
   confirm.
   The accumulation of S, N, Sd, Nd and d, and the return statement, are also
   on lines missing from this listing. */
6092 int ChangesSites(FILE*frst, int coding, char *zanc)
\r
6094 /* this lists and counts changes at sites from reconstructed ancestral sequences
\r
6095 com.z[] has the data, and zanc[] has the ancestors
\r
6096 For codon sequences (codonml or baseml with com.coding), synonymous and
\r
6097 nonsynonymous changes are counted separately.
\r
6098 Added in Nov 2000.
\r
6100 char *pch=(com.seqtype==0 ? BASEs : (com.seqtype==2 ? AAs: (com.seqtype==5?BASEs5:BINs)));
\r
6101 char codon[2][4]={" "," "};
\r
6102 int h,hp,inode,k1,k2,d, ls1=(com.readpattern?com.npatt:com.ls);
\r
6103 double S,N,Sd,Nd, S1,N1,Sd1,Nd1, b,btotal=0, p,C;
\r
6105 if(com.seqtype==0 && coding) ls1/=3;
\r
6107 fprintf(frst,"\n\nCounts of changes at sites, listed by %s\n\n",
\r
6108 (com.readpattern?"pattern":"site"));
\r
6109 fprintf(frst1,"\nList of sites with changes according to ancestral reconstruction\n");
\r
6110 fprintf(frst1,"Suzuki-Gojobori (1999) style test\n");
\r
6111 if(!com.cleandata)
\r
6112 fprintf(frst, "(Counts of n & s are incorrect at sites with ambiguity data)\n\n");
\r
6114 for(inode=0; inode<tree.nnode; inode++)
\r
6115 if(inode!=tree.root) btotal += nodes[inode].branch;
\r
6116 for(h=0; h<ls1; h++) {
\r
6117 fprintf(frst,"%4d ",h+1);
\r
6118 for(inode=0,S=N=Sd=Nd=0; inode<tree.nnode; inode++) {
\r
6119 if(inode==tree.root) continue;
\r
6120 b = nodes[inode].branch;
\r
6121 getCodonNode1Site(codon[0], zanc, nodes[inode].father, h);
\r
6122 getCodonNode1Site(codon[1], zanc, inode, h);
\r
6124 difcodonNG(codon[0], codon[1], &S1, &N1, &Sd1, &Nd1, 0, com.icode);
\r
6130 fprintf(frst," %3s.%3s ",codon[0],codon[1]);
\r
6133 b = S+N; S /= b; N /= b;
\r
6134 fprintf(frst,"(S N: %7.3f%7.3f Sd Nd: %6.1f %5.1f)\n", S*3,N*3,Sd,Nd);
\r
6135 fprintf(frst1,"%4d S N: %7.3f%7.3f Sd Nd: %6.1f %5.1f ", h+1,S*3,N*3,Sd,Nd);
\r
6137 if(Nd/(Sd+Nd)<N) {
\r
6138 for(d=0,p=0,C=1; d<=Nd; d++) {
\r
6139 p += C*pow(N,d) * pow(1-N,Sd+Nd-d);
\r
6140 C *= (Sd+Nd-d)/(d+1);
\r
6142 fprintf(frst1," - p =%6.3f %s", p,(p<.01?"**":(p<.05?"*":"")));
\r
6145 for(d=0,p=0,C=1; d<=Sd; d++) {
\r
6146 p += C*pow(S,d)*pow(1-S,Sd+Nd-d);
\r
6147 C *= (Sd+Nd-d)/(d+1);
\r
6149 fprintf(frst1," + p =%6.3f %s", p,(p<.01?"**":(p<.05?"*":"")));
\r
6152 fprintf(frst1,"\n");
\r
6155 else { /* noncoding nucleotide or aa sequences */
\r
6156 fprintf(frst,"\n\nCounts of changes at sites%s\n\n",
\r
6157 (com.readpattern?", listed by pattern":""));
\r
6158 for(h=0; h<ls1; h++) {
\r
6159 hp=(!com.readpattern ? com.pose[h] : h);
\r
6160 fprintf(frst,"%4d ",h+1);
\r
6161 for(inode=0,d=0;inode<tree.nnode;inode++) {
\r
6162 if(inode==tree.root) continue;
\r
6163 k1 = pch[(int) zanc[(nodes[inode].father-com.ns)*com.npatt+hp] ];
\r
6165 k2 = pch[com.z[inode][hp]];
\r
6167 k2 = pch[(int) zanc[(inode-com.ns)*com.npatt+hp] ];
\r
6170 fprintf(frst," %c%c", k1,k2);
\r
6173 fprintf(frst," (%d)\n", d);
\r
6181 #define NBESTANC 4 /* use 1 2 3 or 4 */
\r
6182 int parsimony=0, *nBestScore, *icharNode[NBESTANC], *combIndex;
\r
6183 double *fhsiteAnc, *lnPanc[NBESTANC], *PMatTips, *combScore;
\r
6184 char *charNode[NBESTANC], *ancSeq, *ancState1site;
\r
6186 int largeReconstruction;
\r
6188 void DownPassPPSG2000OneSite (int h, int inode, int inodestate, int ipath);
\r
6189 void PrintAncState1site (char ancState1site[], double prob);
\r
6192 double P0[16]={0, 1, 1.5, 1.5,
\r
6197 double piroot[NCODE]={0};
\r
6199 /* combIndex[] uses two bits for each son to record the path that is taken by
\r
6200 each reconstruction; for 32-bit integers, the maximum number of sons for
\r
6203 lnPanc[3][(tree.nnode-com.ns)*npatt*n] uses the space of com.conP.
\r
6204 It holds the ln(Pr) for the best reconstructions at the subtree down inode
\r
6205 given the state of the father node.
\r
6206 charNode[0,1,2] holds the corresponding state at inode.
\r
6208 int nBestScore[maxnson];
\r
6209 int combIndex[2*n*ncomb];
\r
6210 double *combScore[n*ncomb];
\r
6211 char ancSeq[nintern*npatt], ancState1site[nintern];
\r
6212 int icharNode[NBESTANC][nintern*npatt*n];
\r
6213 char charNode[NBESTANC][nintern*npatt*n];
\r
6216 void UpPassPPSG2000 (int inode, int igene, double x[])
\r
6218 /* The algorithm of PPSG2000, modified. This routine is based on ConditionalPNode().
\r
6219 lnPanc[h*n+i] is the best lnP, given that inode has state i.
\r
6220 charNode[] stores the characters that achieved the best lnP.
\r
6223 int n=com.ncode, it,ibest,i,j,k,h, ison, nson=nodes[inode].nson, *pc;
\r
6224 int pos0=com.posG[igene],pos1=com.posG[igene+1], ichar,jchar;
\r
6225 int ncomb=1,icomb, ipath;
\r
6226 double t, y, psum1site=-1;
\r
6228 if(com.ncode!=4) debug=0;
\r
6230 for(i=0; i<nson; i++)
\r
6231 if(nodes[nodes[inode].sons[i]].nson>0)
\r
6232 UpPassPPSG2000(nodes[inode].sons[i], igene, x);
\r
6233 for(i=0,ncomb=1; i<nson; i++)
\r
6234 ncomb *= (nBestScore[i] = (nodes[nodes[inode].sons[i]].nson>0 ? NBESTANC : 1));
\r
6236 printf("\n\nNode %2d has sons ", inode+1);
\r
6237 for(i=0; i<nson; i++) printf(" %2d", nodes[inode].sons[i]+1);
\r
6238 printf(" ncomb=%2d: ", ncomb);
\r
6239 for(i=0; i<nson; i++) printf(" %2d", nBestScore[i]); FPN(F0);
\r
6242 if(inode!=tree.root) { /* calculate log{P(t)} from father to inode */
\r
6243 t = nodes[inode].branch*_rateSite;
\r
6245 if(com.clock) t *= GetBranchRate(igene,(int)nodes[inode].label,x,NULL);
\r
6246 else t *= com.rgene[igene];
\r
6248 GetPMatBranch(PMat, x, t, inode);
\r
6249 for(j=0; j<n*n; j++)
\r
6250 PMat[j] = (PMat[j]<1e-300 ? 300 : -log(PMat[j]));
\r
6253 for(h=pos0; h<pos1; h++) { /* loop through site patterns */
\r
6255 /* The last round for inode==tree.root, shares some code with other nodes,
\r
6256 and is thus embedded in the same loop. Alternatively this round can be
\r
6257 taken out of the loop with some code duplicated.
\r
6259 for(ichar=0; ichar<(inode!=tree.root?n:1); ichar++) { /* ichar for father */
\r
6260 /* given ichar for the father, what are the best reconstructions at
\r
6261 inode? Look at n*ncomb possibilities, given father state ichar.
\r
6264 if(inode==tree.root) printf("\n\nfather is root\n");
\r
6265 else printf("\n\nichar = %2d %c for father\n", ichar+1,BASEs[ichar]);
\r
6268 for(icomb=0; icomb<n*ncomb; icomb++) {
\r
6269 jchar = icomb/ncomb; /* jchar is for inode */
\r
6270 if(inode==tree.root)
\r
6271 combScore[icomb] = -log(com.pi[jchar]+1e-300);
\r
6273 combScore[icomb] = PMat[ichar*n+jchar];
\r
6275 if(inode==tree.root && parsimony) combScore[icomb] = 0;
\r
6277 if(debug) printf("comb %2d %c", icomb+1,BASEs[jchar]);
\r
6279 for(i=0,it=icomb%ncomb; i<nson; i++) { /* The ibest-th state in ison. */
\r
6280 ison = nodes[inode].sons[i];
\r
6281 ibest = it%nBestScore[i];
\r
6282 it /= nBestScore[i];
\r
6284 if(nodes[ison].nson) /* internal node */
\r
6285 y = lnPanc[ibest][(ison-com.ns)*com.npatt*n+h*n+jchar];
\r
6286 else if (com.cleandata) /* tip clean: PMatTips[] has log{P(t)}. */
\r
6287 y = PMatTips[ ison*n*n + jchar*n + com.z[ison][h] ];
\r
6288 else { /* tip unclean: PMatTips[] has P(t). */
\r
6289 for(k=0,y=0; k<nChara[com.z[ison][h]]; k++)
\r
6290 y += PMatTips[ ison*n*n+jchar*n + CharaMap[com.z[ison][h]][k] ];
\r
6294 combScore[icomb] += y;
\r
6295 if(debug) printf("%*s son %2d #%2d %7.1f\n", (i?10:1),"", ison+1, ibest+1,y);
\r
6297 } /* for(icomb) */
\r
6299 if(debug) { printf("score "); for(i=0;i<n*ncomb; i++) printf(" %4.1f",combScore[i]); FPN(F0); }
\r
6300 indexing(combScore, n*ncomb, combIndex, 0, combIndex+n*ncomb);
\r
6301 if(debug) { printf("index "); for(i=0;i<n*ncomb; i++) printf(" %4d",combIndex[i]); FPN(F0); }
\r
6303 /* print out reconstructions at the site if inode is root. */
\r
6304 if(inode==tree.root) {
\r
6305 fprintf(fanc,"%4d ", h+1);
\r
6306 if(com.ngene>1) fprintf(fanc,"(%d) ", igene+1);
\r
6307 fprintf(fanc," %6.0f ",com.fpatt[h]);
\r
6308 print1site(fanc, h);
\r
6309 fprintf(fanc, ": ");
\r
6311 psum1site=0; /* used if inode is root */
\r
6313 for(j=0; j<(inode!=tree.root ? NBESTANC : n*ncomb); j++) {
\r
6314 jchar = (it=combIndex[j])/ncomb; it%=ncomb;
\r
6316 lnPanc[j][(inode-com.ns)*com.npatt*n+h*n+ichar] = combScore[combIndex[j]];
\r
6317 charNode[j][(inode-com.ns)*com.npatt*n+h*n+ichar] = jchar;
\r
6319 if(debug) printf("\t#%d: %6.1f %c ", j+1, combScore[combIndex[j]], BASEs[jchar]);
\r
6321 for(i=0,ipath=0; i<nson; i++) {
\r
6322 ison=nodes[inode].sons[i];
\r
6323 ibest=it%nBestScore[i];
\r
6324 it/=nBestScore[i];
\r
6325 ipath |= ibest<<(2*i);
\r
6326 if(debug) printf("%2d", ibest+1);
\r
6329 icharNode[j][(inode-com.ns)*com.npatt*n+h*n+ichar]=ipath;
\r
6331 if(debug) printf(" (%o)", ipath);
\r
6333 /* print if inode is root. */
\r
6334 if(inode==tree.root) {
\r
6335 ancState1site[inode-com.ns]=jchar;
\r
6336 if(parsimony) y = combScore[combIndex[j]];
\r
6337 else psum1site += y = exp(-combScore[combIndex[j]]-fhsiteAnc[h]);
\r
6339 for(i=0; i<nson; i++) {
\r
6340 if(nodes[ison=nodes[inode].sons[i]].nson)
\r
6341 DownPassPPSG2000OneSite(h, tree.root, jchar, ipath);
\r
6343 PrintAncState1site(ancState1site, y);
\r
6344 if(j>NBESTANC && y<.001) break;
\r
6347 } /* for(ichar) */
\r
6348 if(inode==tree.root) fprintf(fanc," (total %6.3f)\n", psum1site);
\r
6350 if(largeReconstruction && (h+1)%2000==0)
\r
6351 printf("\r\tUp pass for gene %d node %d sitepatt %d.", igene+1,inode+1,h+1);
\r
6354 if(largeReconstruction)
\r
6355 printf("\r\tUp pass for gene %d node %d.", igene+1,inode+1);
\r
6358 void DownPassPPSG2000OneSite (int h, int inode, int inodestate, int ipath)
\r
6360 /* this puts the state in ancState1site[nintern], using
\r
6361 int icharNode[NBESTANC][nintern*npatt*n],
\r
6362 char charNode[NBESTANC][nintern*npatt*n].
\r
6363 inodestate is the state at inode, and ipath is the path code for inode.
\r
6365 int n=com.ncode, i, ison, ibest, sonstate;
\r
6367 for(i=0; i<nodes[inode].nson; i++) {
\r
6368 ison=nodes[inode].sons[i];
\r
6369 if(nodes[ison].nson>1) {
\r
6370 ibest = (ipath & (3<<(2*i))) >> (2*i);
\r
6371 ancState1site[ison-com.ns] = sonstate =
\r
6372 charNode[ibest][(ison-com.ns)*com.npatt*n+h*n+inodestate];
\r
6373 DownPassPPSG2000OneSite(h, ison, sonstate,
\r
6374 icharNode[ibest][(ison-com.ns)*com.npatt*n+h*n+inodestate]);
\r
6380 void PrintAncState1site (char ancState1site[], double prob)
\r
6384 char *pch=(com.seqtype==0 ? BASEs : (com.seqtype==2 ? AAs: (com.seqtype==5?BASEs5:BINs)));
\r
6386 for(i=0; i<tree.nnode-com.ns; i++) {
\r
6387 if(com.seqtype==1) {
\r
6389 fprintf(fanc,"%s ",getcodon(codon,FROM61[(int)ancState1site[i]]));
\r
6393 fprintf(fanc, "%c", pch[(int)ancState1site[i]]);
\r
6395 fprintf(fanc," (%5.3f) ", prob);
\r
6398 void DownPassPPSG2000 (int inode)
\r
6400 /* this reads out the best chara for inode from charNode[] into ancSeq[].
\r
6405 for(h=0; h<com.npatt; h++) {
\r
6406 if(inode!=tree.root)
\r
6407 c0=ancSeq[(nodes[inode].father-com.ns)*com.npatt+h];
\r
6408 ancSeq[(inode-com.ns)*com.npatt+h]
\r
6409 = charNode[0][(inode-com.ns)*com.npatt*com.ncode+h*com.ncode+c0];
\r
6411 for(i=0; i<nodes[inode].nson; i++)
\r
6412 if(nodes[ison=nodes[inode].sons[i]].nson > 1)
\r
6413 DownPassPPSG2000(ison);
\r
6418 int AncestralJointPPSG2000 (FILE *fout, double x[])
\r
6420 /* Ziheng Yang, 8 June 2000, rewritten on 8 June 2005.
\r
6421 Joint ancestral reconstruction, taking character states for all nodes at a
\r
6422 site as one entity, based on the algorithm of Pupko et al. (2000
\r
6423 Mol. Biol. Evol. 17:890-896).
\r
6425 fhsiteAnc[]: fh[] for each site pattern
\r
6426 nodes[].conP[] are destroyed and restored at the end of the routine.
\r
6427 ancSeq[] stores the ancestral seqs, the best reconstruction.
\r
6429 This outputs results by pattern. I tried to print results by sites, but gave up as
\r
6430 some variables use the same memory (e.g., combIndex) for different site patterns.
\r
6432 char *pch=(com.seqtype==0 ? BASEs : (com.seqtype==2 ? AAs: (com.seqtype==5?BASEs5:BINs)));
\r
6434 int n=com.ncode,nintern=tree.nnode-com.ns, i,j,k,h,hp,igene;
\r
6435 int maxnson=0, maxncomb, lst=(com.readpattern?com.npatt:com.ls);
\r
6436 char *sitepatt=(com.readpattern?"pattern":"site");
\r
6438 size_t sconPold = com.sconP, s;
\r
6440 largeReconstruction = (noisy && (com.ns>300 || com.ls>1000000));
\r
6442 if(noisy) puts("Joint reconstruction.");
\r
6444 for(i=0; i<tree.nnode; i++) maxnson=max2(maxnson,nodes[i].nson);
\r
6445 if(maxnson>16 || NBESTANC>4) /* for int at least 32 bits */
\r
6446 error2("NBESTANC too large or too many sons.");
\r
6447 for(i=0,maxncomb=1; i<maxnson; i++) maxncomb*=NBESTANC;
\r
6448 if((PMatTips=(double*)malloc(com.ns*n*n*sizeof(double)))==NULL)
\r
6449 error2("oom PMatTips");
\r
6450 s = NBESTANC*nintern*(size_t)com.npatt*n*sizeof(double);
\r
6451 if(s > sconPold) {
\r
6453 printf("\n%9lu bytes for conP, adjusted\n", com.sconP);
\r
6454 if((com.conP=(double*)realloc(com.conP,com.sconP))==NULL)
\r
6455 error2("oom conP");
\r
6457 s = NBESTANC*nintern*com.npatt*n;
\r
6458 s = ((s*sizeof(int)+(s+nintern)*sizeof(char)+16)/sizeof(double))*sizeof(double);
\r
6459 if(s > com.sspace) {
\r
6461 printf("\n%9lu bytes for space, adjusted\n",com.sspace);
\r
6462 if((com.space=(double*)realloc(com.space,com.sspace))==NULL) error2("oom space");
\r
6464 for(i=0; i<NBESTANC; i++) {
\r
6465 lnPanc[i]= com.conP+i*nintern*com.npatt*n;
\r
6466 icharNode[i] = (int*)com.space+i*nintern*com.npatt*n;
\r
6467 charNode[i] = (char*)((int*)com.space+NBESTANC*nintern*com.npatt*n)
\r
6468 + i*nintern*com.npatt*n;
\r
6469 ancState1site = charNode[0]+NBESTANC*nintern*com.npatt*n;
\r
6471 if((ancSeq=(char*)malloc(nintern*com.npatt*n*sizeof(char)))==NULL)
\r
6472 error2("oom charNode");
\r
6474 if((combScore=(double*)malloc((3*n*maxncomb+com.ns)*sizeof(double)))==NULL)
\r
6475 error2("oom combScore");
\r
6476 nBestScore = (int*)(combScore+n*maxncomb);
\r
6477 combIndex = nBestScore + com.ns; /* combIndex[2*n*ncomb] contains work space */
\r
6480 fprintf(fout, "\n\n(2) Joint reconstruction of ancestral sequences\n");
\r
6481 fprintf(fout, "(eqn. 2 in Yang et al. 1995 Genetics 141:1641-1650), using ");
\r
6482 fprintf(fout, "the algorithm of Pupko et al. (2000 Mol Biol Evol 17:890-896),\n");
\r
6483 fprintf(fout, "modified to generate sub-optimal reconstructions.\n");
\r
6484 fprintf(fout, "\nReconstruction (prob.), listed by pattern (use the observed data to find the right site).\n");
\r
6485 fprintf(fout, "\nPattern Freq Data:\n\n");
\r
6487 for(igene=0; igene<com.ngene; igene++) {
\r
6488 if(com.Mgene>1) SetPGene(igene,1,1,0,x);
\r
6489 for(i=0; i<com.ns; i++) {
\r
6490 t = nodes[i].branch*_rateSite;
\r
6492 if(com.clock) t *= GetBranchRate(igene,(int)nodes[i].label,x,NULL);
\r
6493 else t *= com.rgene[igene];
\r
6495 GetPMatBranch(PMatTips+i*n*n, x, t, i);
\r
6498 if(com.cleandata) {
\r
6499 for(i=0; i<com.ns*n*n; i++)
\r
6500 PMatTips[i] = (PMatTips[i]<1e-20 ? 300 : -log(PMatTips[i]));
\r
6503 for(i=0; i<com.ns; i++)
\r
6504 xtoy(P0, PMatTips+i*n*n, n*n);
\r
6506 UpPassPPSG2000(tree.root, igene, x); /* this prints into frst as well */
\r
6509 if(largeReconstruction) puts("\n\tDown pass.");
\r
6510 DownPassPPSG2000(tree.root);
\r
6512 ListAncestSeq(fout, ancSeq);
\r
6517 com.sconP = sconPold;
\r
6518 if((com.conP=(double*)realloc(com.conP,com.sconP))==NULL)
\r
6526 int AncestralSeqs (FILE *fout, double x[])
\r
6528 /* Ancestral sequence reconstruction using likelihood (Yang et al. 1995).
\r
6529 Marginal works with constant rate and variable rates among sites.
\r
6530 Joint works only with constant rate among sites (ncatG=1).
\r
6533 double lnL, *ScaleC=NULL; /* collected scale factors */
\r
6536 error2("When Mgene=1, use RateAncestor = 0.");
\r
6537 if (tree.nnode==com.ns)
\r
6538 { puts("\nNo ancestral nodes to reconstruct..\n"); return(0); }
\r
6539 if (noisy) printf ("\nReconstructed ancestral states go into file rst.\n");
\r
6540 fprintf(fout, "\nAncestral reconstruction by %sML.\n",
\r
6541 (com.seqtype==0?"BASE":(com.seqtype==1?"CODON":"AA")));
\r
6542 FPN(fout); OutTreeN(fout,1,1); FPN(fout); FPN(fout);
\r
6543 OutTreeN(fout,0,0); FPN(fout); FPN(fout);
\r
6544 OutTreeB(fout); FPN(fout);
\r
6546 fputs("\ntree with node labels for Rod Page's TreeView\n",fout);
\r
6547 OutTreeN(fout,1,PrNodeNum); FPN(fout);
\r
6549 fprintf (fout, "\nNodes %d to %d are ancestral\n", com.ns+1,tree.nnode);
\r
6550 if((fhsiteAnc=(double*)malloc(com.npatt*sizeof(double)))==NULL)
\r
6551 error2("oom fhsiteAnc");
\r
6552 if(com.NnodeScale && com.ncatG>1)
\r
6553 if((ScaleC=(double*)malloc(max2(com.npatt,com.ncatG) *sizeof(double)))==NULL)
\r
6554 error2("oom ScaleC in AncestralSeqs");
\r
6557 puts("Rates are variable among sites, marginal reconstructions only.");
\r
6558 if(!com.cleandata) fputs("Unreliable at sites with alignment gaps\n", fout);
\r
6560 if(com.ncatG<=1 || com.method!=1)
\r
6561 ProbSitePattern (x, &lnL, fhsiteAnc, ScaleC);
\r
6566 AncestralMarginal(fout, x, fhsiteAnc, ScaleC);
\r
6569 /* fhsiteAnc[] is modified by both Marginal and Joint. */
\r
6570 if(com.ncatG<=1 && tree.nnode>com.ns+1) {
\r
6571 ProbSitePattern (x, &lnL, fhsiteAnc, ScaleC);
\r
6572 for(h=0; h<com.npatt; h++) {
\r
6573 fhsiteAnc[h] = log(fhsiteAnc[h]);
\r
6574 for(k=0; k<com.NnodeScale; k++)
\r
6575 fhsiteAnc[h] += com.nodeScaleF[k*com.npatt+h];
\r
6577 /* AncestralJointPPSG2000 corrupts com.conP[] and fhsiteAnc[].
\r
6579 AncestralJointPPSG2000(fout, x);
\r
6583 if(com.NnodeScale && com.ncatG>1) free(ScaleC);
\r
6592 int SetNodeScale(int inode);
\r
6593 int NodeScale(int inode, int pos0, int pos1);
\r
6595 void InitializeNodeScale(void)
\r
6597 /* This allocates memory to hold scale factors for nodes and also decides on the
\r
6598 nodes for scaling by calling SetNodeScale().
\r
6599 The scaling node is chosen before the iteration by counting the number of
\r
6600 nodes visited in the post-order tree traversal algorithm (see the routine
\r
6602 See Yang (2000 JME 51:423-432) for details.
\r
6603 The memory required is com.NnodeScale*com.npatt*sizeof(double).
\r
6607 if(com.clock>=5) return;
\r
6609 com.NnodeScale = 0;
\r
6610 com.nodeScale = (char*)realloc(com.nodeScale, tree.nnode*sizeof(char));
\r
6611 if(com.nodeScale==NULL) error2("oom");
\r
6612 for(i=0; i<tree.nnode; i++) com.nodeScale[i] = 0;
\r
6613 SetNodeScale(tree.root);
\r
6614 nS = com.NnodeScale*com.npatt;
\r
6615 if(com.conPSiteClass) nS *= com.ncatG;
\r
6616 if(com.NnodeScale) {
\r
6617 if((com.nodeScaleF=(double*)realloc(com.nodeScaleF, nS*sizeof(double)))==NULL)
\r
6618 error2("oom nscale");
\r
6619 for(i=0; i<nS; i++) com.nodeScaleF[i] = 0;
\r
6622 printf("\n%d node(s) used for scaling (Yang 2000 J Mol Evol 51:423-432):\n",com.NnodeScale);
\r
6623 for(i=0; i<tree.nnode; i++)
\r
6624 if(com.nodeScale[i]) printf(" %2d",i+1);
\r
6631 int SetNodeScale (int inode)
\r
6633 /* This marks nodes for applying scaling factors when calculating f[h].
\r
6635 int i,ison, d=0, every;
\r
6637 if(com.seqtype==0) every=100; /* baseml */
\r
6638 else if(com.seqtype==1) every=15; /* codonml */
\r
6639 else every=50; /* aaml */
\r
6641 for(i=0; i<nodes[inode].nson; i++) {
\r
6642 ison = nodes[inode].sons[i];
\r
6643 d += (nodes[ison].nson ? SetNodeScale(ison) : 1);
\r
6645 if(inode!=tree.root && d>every) {
\r
6646 com.nodeScale[inode] = 1;
\r
6648 com.NnodeScale++;
\r
6654 int NodeScale (int inode, int pos0, int pos1)
\r
6656 /* scale to avoid underflow
\r
6658 int h,k,j, n=com.ncode;
\r
6659 double t, smallw=1e-12;
\r
6661 for(j=0,k=0; j<tree.nnode; j++) /* k-th node for scaling */
\r
6662 if(j==inode) break;
\r
6663 else if(com.nodeScale[j]) k++;
\r
6665 for(h=pos0; h<pos1; h++) {
\r
6666 for(j=0,t=0;j<n;j++)
\r
6667 if(nodes[inode].conP[h*n+j]>t)
\r
6668 t = nodes[inode].conP[h*n+j];
\r
6672 nodes[inode].conP[h*n+j]=1; /* both 0 and 1 fine */
\r
6673 com.nodeScaleF[k*com.npatt+h] = -800; /* this is problematic? */
\r
6677 nodes[inode].conP[h*n+j]/=t;
\r
6678 com.nodeScaleF[k*com.npatt+h] = log(t);
\r
6686 static double *dfsites;
\r
6688 int fx_r(double x[], int np);
\r
6691 #if (BASEML || CODEML)
\r
6693 int HessianSKT2004 (double xmle[], double lnLm, double g[], double H[])
\r
6695 /* this calculates the hessian matrix of branch lengths using the approximation
\r
6696 of Seo et al. (2004), especially useful for approximate likelihood calculation
\r
6697 in divergence time estimation.
\r
6698 df[0][i*com.npatt+h] has d log(f_h)/d b_i.
\r
6699 method = 0 uses difference approximation to first derivatives.
\r
6700 method = 1 uses analytical calculation of first derivatives (Yang 2000).
\r
6701 I am under the impression that method = 1 may be useful for very large datasets
\r
6702 with >10M sites, but I have not implemented this method because the analytical
\r
6703 calculation of first derivatives is possible for branch lengths only, and not
\r
6704 available for other parameters. Right now with method = 0, H and the SEs are
\r
6705 calculated for all parameters although the H matrix in rst2 is a subset for
\r
6706 branch lengths only. More thought about what to do. Ziheng's note on 8 March 2010.
\r
6708 int method=0, backforth, h, i, j, lastround0=LASTROUND, nzero=0;
\r
6709 double *x, *lnL[2], *df[2], eh0=Small_Diff*2, eh, small;
\r
6711 if(com.np!=tree.nbranch && method==1)
\r
6712 error2("I think HessianSKT2004 works for branch lengths only");
\r
6713 df[0] = (double*)malloc((com.npatt*2+1)*com.np*sizeof(double));
\r
6714 if(df[0]==NULL) error2("oom space in HessianSKT2004");
\r
6715 df[1] = df[0] + com.npatt*com.np;
\r
6716 x = df[1] + com.npatt*com.np;
\r
6717 lnL[0] = (double*)malloc(com.np*2*sizeof(double));
\r
6718 lnL[1] = lnL[0]+com.np;
\r
6722 for(backforth=0; backforth<2; backforth++) {
\r
6723 for(i=0; i<com.np; i++) {
\r
6724 xtoy(xmle, x, com.np);
\r
6725 eh = eh0*(fabs(xmle[i]) + 1);
\r
6726 if(backforth==0) x[i] = xmle[i] - eh;
\r
6727 else x[i] = xmle[i] + eh;
\r
6730 dfsites = df[backforth] + i*com.npatt;
\r
6731 lnL[backforth][i] = -com.plfun(x, com.np);
\r
6735 for(i=0; i<com.np; i++) {
\r
6736 eh = eh0*(fabs(xmle[i]) + 1);
\r
6737 g[i] = (lnL[1][i] - lnL[0][i])/(eh*2);
\r
6740 printf("\nx gL g H");
\r
6741 matout(F0, xmle, 1, com.np);
\r
6742 matout(F0, g, 1, com.np);
\r
6744 zero(H, com.np*com.np);
\r
6745 for(i=0; i<com.np; i++) {
\r
6746 eh = eh0*(fabs(xmle[i]) + 1);
\r
6747 for(h=0; h<com.npatt; h++)
\r
6748 df[0][i*com.npatt+h] = (df[1][i*com.npatt+h] - df[0][i*com.npatt+h])/(eh*2);
\r
6751 for(i=0; i<com.np; i++) {
\r
6752 for(j=0; j<com.np; j++)
\r
6753 for(h=0; h<com.npatt; h++)
\r
6754 H[i*com.np+j] -= df[0][i*com.npatt+h] * df[0][j*com.npatt+h] * com.fpatt[h];
\r
6757 if(nzero) printf("\nWarning: Hessian matrix may be unreliable for zero branch lengths\n");
\r
6758 LASTROUND = lastround0;
\r
6766 int lfunRates (FILE* fout, double x[], int np)
\r
6768 /* for dG, AdG or similar non-parametric models
\r
6769 This destroys com.fhK[], and in return,
\r
6770 fhK[<npatt] stores rates for conditional mean (re), and
\r
6771 fhK[<2*npatt] stores the most probable rate category number.
\r
6772 fhsite[npatt] stores fh=log(fh).
\r
6774 int ir,il,it, h,hp,j, nscale=1, direction=-1;
\r
6775 int lst=(com.readpattern?com.npatt:com.ls);
\r
6776 double lnL=0,fh,fh1, t, re,mre,vre, b1[NCATG],b2[NCATG],*fhsite;
\r
6778 if (noisy) printf("\nEstimated rates for sites go into file %s\n",ratef);
\r
6779 if (SetParameters(x)) puts ("par err. lfunRates");
\r
6781 fprintf(fout, "\nEstimated rates for sites from %sML.\n",
\r
6782 (com.seqtype==0?"BASE":(com.seqtype==1?"CODON":"AA")));
\r
6783 OutTreeN(fout,1,1); FPN(fout);
\r
6784 fprintf (fout,"\nFrequencies and rates for categories (K=%d)", com.ncatG);
\r
6785 fprintf(fout, "\nrate:"); FOR(j,com.ncatG) fprintf(fout," %8.5f",com.rK[j]);
\r
6786 fprintf(fout, "\nfreq:"); FOR(j,com.ncatG) fprintf(fout," %8.5f",com.freqK[j]);
\r
6790 fprintf(fout,"\nTransition prob matrix over sites");
\r
6791 matout2(fout,com.MK,com.ncatG,com.ncatG,8,4);
\r
6794 if((fhsite=(double*)malloc(com.npatt*sizeof(double)))==NULL) error2("oom fhsite");
\r
6796 if(com.NnodeScale) {
\r
6797 FOR(h,com.npatt) {
\r
6798 for(ir=1,it=0; ir<com.ncatG; ir++)
\r
6799 if(com.fhK[ir*com.npatt+h] > com.fhK[it*com.npatt+h])
\r
6801 t = com.fhK[it*com.npatt+h];
\r
6802 lnL -= com.fpatt[h]*t;
\r
6803 for(ir=0; ir<com.ncatG; ir++)
\r
6804 com.fhK[ir*com.npatt+h] = exp(com.fhK[ir*com.npatt+h] - t);
\r
6807 for(h=0; h<com.npatt; h++) {
\r
6808 for(ir=0,fhsite[h]=0; ir<com.ncatG; ir++)
\r
6809 fhsite[h] += com.freqK[ir]*com.fhK[ir*com.npatt+h];
\r
6812 if (com.rho==0) { /* dG model */
\r
6813 if(com.verbose>1) {
\r
6814 fprintf(fout,"\nPosterior probabilities for site classes, by %s\n\n",
\r
6815 (com.readpattern?"pattern":"site"));
\r
6816 for (h=0; h<lst; h++,FPN(fout)) {
\r
6817 fprintf(fout, " %5d ", h+1);
\r
6818 hp = (!com.readpattern ? com.pose[h] : h);
\r
6819 for (ir=0; ir<com.ncatG; ir++)
\r
6820 fprintf(fout, " %9.4f", com.freqK[ir]*com.fhK[ir*com.npatt+hp]/fhsite[hp]);
\r
6824 fprintf(fout,"\n%7s Freq Data Rate (posterior mean & category)\n\n",
\r
6825 (com.readpattern?"Pattern":"Site"));
\r
6826 for (h=0,mre=vre=0; h<com.npatt; h++) {
\r
6827 for (ir=0,it=0,t=re=0; ir<com.ncatG; ir++) {
\r
6828 fh1 = com.freqK[ir]*com.fhK[ir*com.npatt+h];
\r
6829 if(fh1>t) { t=fh1; it=ir; }
\r
6830 re += fh1*com.rK[ir];
\r
6832 lnL -= com.fpatt[h]*log(fhsite[h]);
\r
6835 mre += com.fpatt[h]*re/com.ls;
\r
6836 vre += com.fpatt[h]*re*re/com.ls;
\r
6838 com.fhK[com.npatt+h] = it+1.;
\r
6841 for(h=0; h<lst; h++) {
\r
6842 hp=(!com.readpattern ? com.pose[h] : h);
\r
6843 fprintf(fout,"%7d %5.0f ",h+1, com.fpatt[hp]);
\r
6844 print1site(fout, hp);
\r
6845 fprintf(fout," %8.3f%6.0f\n", com.fhK[hp], com.fhK[com.npatt+hp]);
\r
6848 else { /* Auto-dGamma model */
\r
6849 fputs("\nSite Freq Data Rates\n\n",fout);
\r
6850 h = (direction==1?com.ls-1:0);
\r
6851 for (il=0,mre=vre=0; il<lst; h-=direction,il++) {
\r
6852 hp=(!com.readpattern ? com.pose[h] : h);
\r
6854 FOR(ir,com.ncatG) b1[ir]=com.fhK[ir*com.npatt+hp];
\r
6856 for (ir=0; ir<com.ncatG; ir++) {
\r
6857 for (j=0,fh=0; j<com.ncatG; j++)
\r
6858 fh+=com.MK[ir*com.ncatG+j]*b1[j];
\r
6859 b2[ir] = fh*com.fhK[ir*com.npatt+hp];
\r
6861 xtoy (b2, b1, com.ncatG);
\r
6863 if ((il+1)%nscale==0)
\r
6864 { fh=sum(b1,com.ncatG); abyx(1/fh,b1,com.ncatG); lnL-=log(fh); }
\r
6866 for (ir=0,it=-1,re=fh1=t=0; ir<com.ncatG; ir++) {
\r
6867 re+=com.freqK[ir]*b1[ir]*com.rK[ir];
\r
6868 fh1+=com.freqK[ir]*b1[ir];
\r
6869 if (b1[ir]>t) {it=ir; t=b1[ir]; }
\r
6873 vre += re*re/com.ls;
\r
6875 fprintf(fout,"%4d %5.0f ",h+1, com.fpatt[hp]);
\r
6876 print1site(fout, hp);
\r
6877 fprintf(fout," %8.3f%6.0f\n", re, it+1.);
\r
6880 for (ir=0,fh=0; ir<com.ncatG; ir++) fh += com.freqK[ir]*b1[ir];
\r
6883 if (noisy) printf ("lnL =%14.6f\n", -lnL);
\r
6884 fprintf (fout,"\nlnL =%14.6f\n", -lnL);
\r
6885 if(com.ngene==1) {
\r
6886 fprintf (fout,"\nmean(r^)=%9.4f var(r^)=%9.4f", mre, vre);
\r
6887 fprintf (fout,"\nAccuracy of rate prediction: corr(r^,r) =%9.4f\n",
\r
6888 sqrt(com.alpha*vre));
\r
6895 double lfunAdG (double x[], int np)
\r
6897 /* Auto-Discrete-Gamma rates for sites
\r
6898 See notes in lfundG().
\r
6900 int nscale=1, h,il, ir, j, FPE=0;
\r
6901 int direction=-1; /* 1: n->1; -1: 1->n */
\r
6902 double lnL=0, b1[NCATG], b2[NCATG], fh;
\r
6906 if(com.NnodeScale)
\r
6907 FOR(h,com.npatt) {
\r
6908 fh=com.fhK[0*com.npatt+h];
\r
6909 lnL-=fh*com.fpatt[h];
\r
6910 for(ir=1,com.fhK[h]=1; ir<com.ncatG; ir++)
\r
6911 com.fhK[ir*com.npatt+h]=exp(com.fhK[ir*com.npatt+h]-fh);
\r
6913 h = (direction==1?com.ls-1:0);
\r
6914 for (il=0; il<com.ls; h-=direction,il++) {
\r
6916 FOR(ir,com.ncatG) b1[ir]=com.fhK[ir*com.npatt+com.pose[h]];
\r
6918 for (ir=0; ir<com.ncatG; ir++) {
\r
6919 for (j=0,fh=0; j<com.ncatG; j++)
\r
6920 fh+=com.MK[ir*com.ncatG+j]*b1[j];
\r
6921 b2[ir]=fh*com.fhK[ir*com.npatt+com.pose[h]];
\r
6923 xtoy(b2,b1,com.ncatG);
\r
6925 if((il+1)%nscale==0) {
\r
6926 fh=sum(b1,com.ncatG);
\r
6929 FPE=1; printf ("h,fh%6d %12.4e\n", h+1,fh);
\r
6935 abyx(1/fh,b1,com.ncatG); lnL-=log(fh);
\r
6938 for (ir=0,fh=0; ir<com.ncatG; ir++) fh+=com.freqK[ir]*b1[ir];
\r
6948 #if (defined(BASEML))
\r
6950 int GetPMatBranch (double Pt[], double x[], double t, int inode)
\r
6952 /* P(t) for branch leading to inode, called by routines ConditionalPNode()
\r
6953 and AncestralSeq() in baseml and codeml. x[] is not used by baseml.
\r
6955 int n=com.ncode, i;
\r
6956 double space[NCODE*NCODE*3] = {0};
\r
6958 if (com.model<=K80)
\r
6959 PMatK80(Pt, t, (com.nhomo==2 ? *nodes[inode].pkappa : com.kappa));
\r
6962 eigenTN93(com.model, *nodes[inode].pkappa, -1, com.pi, &nR, Root, Cijk);
\r
6963 else if (com.nhomo>2 && com.model<=TN93)
\r
6964 eigenTN93(com.model, *nodes[inode].pkappa, *(nodes[inode].pkappa+1), nodes[inode].pi, &nR, Root, Cijk);
\r
6965 else if (com.nhomo>2 && com.model==REV)
\r
6966 eigenQREVbase(NULL, Pt, nodes[inode].pkappa, nodes[inode].pi, &nR, Root, Cijk);
\r
6968 if(com.model<=REV||com.model==REVu)
\r
6971 QUNREST(NULL, Pt, x+com.ntime+com.nrgene, com.pi);
\r
6972 for(i=0; i<n*n; i++) Pt[i] *= t;
\r
6973 matexp (Pt, n, 7, 5, space);
\r
6979 #elif (defined(CODEML))
\r
6981 int GetPMatBranch (double Pt[], double x[], double t, int inode)
\r
6983 /* P(t) for branch leading to inode, called by routines ConditionalPNode()
\r
6984 and AncestralSeq() in baseml and codeml.
\r
6986 Qfactor in branch & site models (model = 2 or 3 and NSsites = 2 or 3):
\r
6987 Qfactor scaling is applied here and not inside eigenQcodon().
\r
6989 int iUVR=0, nUVR=NBTYPE+2, ib = (int)nodes[inode].label, updateUVR=0;
\r
6990 double *pkappa, w, mr=1, Qfactor=1;
\r
6991 double *pomega = com.pomega; /* x+com.ntime+com.nrgene+com.nkappa; */
\r
6993 pkappa = (com.hkyREV||com.codonf==FMutSel?x+com.ntime+com.nrgene:&com.kappa);
\r
6995 if(com.seqtype==CODONseq && com.NSsites && com.model) {
\r
6996 /* branch&site models (both NSsites & model):
\r
6997 Usual likelihood calculation, no need to re-calculate UVRoot.
\r
6998 Only need to point to the right place.
\r
7000 iUVR = Set_UVR_BranchSite (IClass, ib);
\r
7001 Qfactor = Qfactor_NS_branch[ib];
\r
7003 else if (com.seqtype==CODONseq && BayesEB==2 && com.model>1) { /* BEB for A&C */
\r
7004 /* branch&site models (both NSsites & model) BEB calculation:
\r
7005 Need to calculate UVRoot, as w is different. com.pomega points to wbranches[]
\r
7006 in get_grid_para_like_M2M8() or get_grid_para_like_AC().
\r
7008 Qfactor_NS_branch[] is fixed at the MLE:
\r
7009 "we fix the branch lengths at the synonymous sites (i.e., the expected
\r
7010 number of synonymous substitutions per codon) at their MLEs."
\r
7012 w = com.pomega[ib];
\r
7013 eigenQcodon(1,-1,NULL,NULL,NULL,Root,U,V, &mr, pkappa, w, Pt);
\r
7014 Qfactor = Qfactor_NS_branch[ib];
\r
7016 else if (com.seqtype==CODONseq && (com.model==1 ||com.model==2) && com.nbtype<=nUVR) {
\r
7017 /* branch model, also for AAClasses */
\r
7018 iUVR = (int)nodes[inode].label;
\r
7019 U=_UU[iUVR]; V=_VV[iUVR]; Root=_Root[iUVR];
\r
7021 else if (com.seqtype==CODONseq && com.model) {
\r
7023 if(com.aaDist==AAClasses) { /* AAClass model */
\r
7024 com.pomega = PointOmega(x+com.ntime, -1, inode, -1);
\r
7025 eigenQcodon(1,-1,NULL,NULL,NULL,Root,U,V, &mr, pkappa, -1, Pt);
\r
7027 else if(com.nbtype>nUVR) { /* branch models, with more than 8 omega */
\r
7028 eigenQcodon(1,-1,NULL,NULL,NULL,Root,U,V, &mr, pkappa, nodes[inode].omega, Pt);
\r
7032 if (com.seqtype == AAseq && com.model == Poisson)
\r
7033 PMatJC69like(Pt, t, com.ncode);
\r
7036 PMatUVRoot(Pt, t, com.ncode, U, V, Root);
\r
7046 void print_lnf_site (int h, double logfh)
\r
7048 #if(defined BASEML || defined CODEML)
\r
7049 fprintf(flnf, "\n%6d %6.0f %16.10f %16.12f %12.4f ",
\r
7050 h+1, com.fpatt[h], logfh, exp(logfh), com.ls*exp(logfh));
\r
7051 print1site(flnf, h);
\r
7056 double lfundG (double x[], int np)
\r
7058 /* likelihood function for site-class models.
\r
7059 This deals with scaling for nodes to avoid underflow if(com.NnodeScale).
\r
7060 The routine calls fx_r() to calculate com.fhK[], which holds log{f(x|r)}
\r
7061 when scaling or f(x|r) when not. Scaling factors are set and used for each
\r
7062 site class (ir) to calculate log{f(x|r)}. When scaling is used, the routine
\r
7063 converts com.fhK[] into f(x|r), by collecting scaling factors into lnL.
\r
7064 The rest of the calculation then becomes the same and relies on f(x|r).
\r
7065 Check notes in fx_r.
\r
7066 This is also used for NSsites models in codonml.
\r
7067 Note that scaling is used between fx_r() and ConditionalPNode()
\r
7068 When this routine is used under the multiple-gene site-class model, note
\r
7069 that right now it assumes one set of com.freqK[] for the different genes,
\r
7070 which may be an issue.
\r
7072 int h,ir, it, FPE=0;
\r
7073 double lnL=0, fh=0,t;
\r
7078 for(h=0; h<com.npatt; h++) {
\r
7079 if (com.fpatt[h]<=0 && com.print>=0) continue;
\r
7080 if(com.NnodeScale) { /* com.fhK[] has log{f(x|r}. Note the scaling for nodes */
\r
7081 for(ir=1,it=0; ir<com.ncatG; ir++) /* select term for scaling */
\r
7082 if(com.fhK[ir*com.npatt+h] > com.fhK[it*com.npatt+h]) it = ir;
\r
7083 t = com.fhK[it*com.npatt+h];
\r
7084 for(ir=0,fh=0; ir<com.ncatG; ir++)
\r
7085 fh += com.freqK[ir]*exp(com.fhK[ir*com.npatt+h]-t);
\r
7089 for(ir=0,fh=0; ir<com.ncatG;ir++)
\r
7090 fh += com.freqK[ir]*com.fhK[ir*com.npatt+h];
\r
7093 FPE=1; matout(F0,x,1,np);
\r
7094 printf("\nlfundG: h=%4d fhK=%9.6e\ndata: ", h+1, fh);
\r
7095 print1site(F0, h);
\r
7102 lnL -= fh*com.fpatt[h];
\r
7103 if(LASTROUND==2) dfsites[h] = fh;
\r
7104 if (com.print<0) print_lnf_site(h, fh);
\r
7111 int SetPSiteClass(int iclass, double x[])
\r
7113 /* This sets parameters for the iclass-th site class
\r
7114 This is used by ConditionalPNode() and also updateconP in both algorithms
\r
7115 For method=0 and 1.
\r
7117 int k = com.nrgene + !com.fix_kappa;
\r
7118 double *pkappa=NULL, *xcom=x+com.ntime, mr;
\r
7120 _rateSite = com.rK[iclass];
\r
7123 mr = 1/Qfactor_NS;
\r
7124 pkappa = (com.hkyREV||com.codonf==FMutSel ? xcom+com.nrgene : &com.kappa);
\r
7125 if(com.seqtype == CODONseq && com.NSsites) {
\r
7127 if (com.model==0) {
\r
7129 if(com.aaDist<10) com.pomega = xcom + k + com.ncatG - 1 + 2*iclass;
\r
7130 else if(com.aaDist==11) com.pomega = xcom + k + com.ncatG - 1 + 4*iclass;
\r
7131 else if(com.aaDist==12) com.pomega = xcom + k + com.ncatG - 1 + 5*iclass;
\r
7133 eigenQcodon(1,-1,NULL,NULL,NULL,Root,U,V, &mr, pkappa, com.rK[iclass], PMat);
\r
7140 extern int prt, Locus, Ir;
\r
7143 int fx_r (double x[], int np)
\r
7145 /* This calculates f(x|r) if(com.NnodeScale==0) or log{f(x|r)}
\r
7146 if(com.NnodeScale>0), that is, the (log) probability of observing data x
\r
7147 at a site, given the rate r or dN/dS ratio for the site. This is used by
\r
7148 the discrete-gamma models in baseml and codeml as well as the NSsites models
\r
7150 The results are stored in com.fhK[com.ncatG*com.npatt].
\r
7151 This deals with underflows with large trees using global variables
\r
7152 com.nodeScale and com.nodeScaleF[com.NnodeScale*com.npatt].
\r
7154 int h, ir, i,k, ig, FPE=0;
\r
7155 double fh, smallw=1e-12; /* for testing site class with w=0 */
\r
7158 if(SetParameters(x)) puts("\npar err..");
\r
7160 for(ig=0; ig<com.ngene; ig++) { /* alpha may differ over ig */
\r
7161 if(com.Mgene>1 || com.nalpha>1)
\r
7162 SetPGene(ig, com.Mgene>1, com.Mgene>1, com.nalpha>1, x);
\r
7163 for(ir=0; ir<com.ncatG; ir++) {
\r
7164 if(ir && com.conPSiteClass) { /* shift com.nodeScaleF & conP */
\r
7165 if(com.NnodeScale)
\r
7166 com.nodeScaleF += (size_t)com.npatt*com.NnodeScale;
\r
7167 for(i=com.ns; i<tree.nnode; i++)
\r
7168 nodes[i].conP += (tree.nnode-com.ns)*com.ncode*(size_t)com.npatt;
\r
7170 SetPSiteClass(ir,x);
\r
7171 ConditionalPNode(tree.root,ig, x);
\r
7173 for (h=com.posG[ig]; h<com.posG[ig+1]; h++) {
\r
7174 if (com.fpatt[h]<=0 && com.print>=0) continue;
\r
7175 for (i=0,fh=0; i<com.ncode; i++)
\r
7176 fh += com.pi[i]*nodes[tree.root].conP[h*com.ncode+i];
\r
7178 if(fh<-1e-10 /* && !FPE */) { /* note that 0 may be o.k. here */
\r
7179 FPE=1; matout(F0,x,1,np);
\r
7180 printf("\nfx_r: h = %d r = %d fhK = %.5e ", h+1,ir+1,fh);
\r
7181 if(com.seqtype==0||com.seqtype==2) {
\r
7183 print1site(F0, h);
\r
7189 if(!com.NnodeScale)
\r
7190 com.fhK[ir*com.npatt+h] = fh;
\r
7192 for(k=0,com.fhK[ir*com.npatt+h]=log(fh); k<com.NnodeScale; k++)
\r
7193 com.fhK[ir*com.npatt+h] += com.nodeScaleF[k*com.npatt+h];
\r
7197 if(com.conPSiteClass) { /* shift pointers conP back */
\r
7198 if(com.NnodeScale)
\r
7199 com.nodeScaleF -= (com.ncatG-1)*com.NnodeScale*(size_t)com.npatt;
\r
7200 for(i=com.ns; i<tree.nnode; i++)
\r
7201 nodes[i].conP -= (com.ncatG-1)*(tree.nnode-com.ns)*com.ncode*(size_t)com.npatt;
\r
7208 double lfun (double x[], int np)
\r
7210 /* likelihood function for models of one rate for all sites including
\r
7213 int h,i,k, ig, FPE=0;
\r
7217 if(SetParameters(x)) puts ("\npar err..");
\r
7218 for(ig=0; ig<com.ngene; ig++) {
\r
7220 SetPGene(ig,1,1,0,x);
\r
7221 ConditionalPNode (tree.root, ig, x);
\r
7223 for(h=com.posG[ig]; h<com.posG[ig+1]; h++) {
\r
7224 if (com.fpatt[h]<=0 && com.print>=0) continue;
\r
7225 for(i=0,fh=0; i<com.ncode; i++)
\r
7226 fh += com.pi[i]*nodes[tree.root].conP[h*com.ncode+i];
\r
7228 if(fh<-1e-5 && noisy) {
\r
7229 printf("\nfh = %.6f negative\n",fh);
\r
7233 FPE=1; matout(F0,x,1,np);
\r
7234 printf("lfun: h=%4d fh=%9.6e\nData: ", h+1,fh);
\r
7235 print1site(F0, h);
\r
7241 for(k=0; k<com.NnodeScale; k++)
\r
7242 fh += com.nodeScaleF[k*com.npatt+h];
\r
7244 lnL -= fh*com.fpatt[h];
\r
7245 if(LASTROUND==2) dfsites[h] = fh;
\r
7247 print_lnf_site(h,fh);
\r
7256 int print1site (FILE*fout, int h)
\r
7258 /* This prints out one site in the sequence data, com.z[]. It may be the h-th
\r
7259 site in the original data file or the h-th pattern. The data are coded.
\r
7260 naa > 1 if the codon codes for more than one amino acid.
\r
7262 char *pch=(com.seqtype==0 ? BASEs : (com.seqtype==2 ? AAs: (com.seqtype==5?BASEs5:BINs)));
\r
7263 char compatibleAAs[20]="";
\r
7264 int n=com.ncode, i, b, aa=0;
\r
7266 for(i=0; i<com.ns; i++) {
\r
7268 if(com.seqtype==0 || com.seqtype==2)
\r
7269 fprintf(fout,"%c", pch[b]);
\r
7270 #if defined(CODEML)
\r
7271 else if(com.seqtype==1) {
\r
7272 aa = GetAASiteSpecies(i, h);
\r
7273 fprintf(fout, "%s (%c) ", CODONs[b], aa);
\r
7281 #if(defined MINIMIZATION)
\r
7283 /* November, 1999, Minimization branch by branch */
\r
7284 int noisy_minbranches;
\r
7285 double *space_minbranches, *g_minbranches, *varb_minbranches, e_minbranches;
\r
7287 double minbranches(double xcom[], int np);
\r
7288 int lfunt(double t, int a,int b,double x[],double *l, double space[]);
\r
7289 int lfuntdd(double t, int a,int b,double x[], double *l,double*dl,double*ddl,
\r
7291 int lfunt_SiteClass(double t, int a,int b,double x[],double *l,double space[]);
\r
7292 int lfuntdd_SiteClass(double t, int a,int b,double x[],
\r
7293 double *l,double*dl,double*ddl,double space[]);
\r
7295 int minB (FILE*fout, double *lnL,double x[],double xb[][2],double e0, double space[])
\r
7297 /* This calculates lnL for given values of common parameters by optimizing
\r
7298 branch lengths, cycling through them.
\r
7299 Z. Yang, November 1999
\r
7300 This calls minbranches to optimize branch lengths and ming2 to
\r
7301 estimate other parameters.
\r
7302 At the end of the routine, there is a call to lfun to restore nodes[].conP.
\r
7303 Returns variances of branch lengths in space[].
\r
7304 space[] is com.space[]. com.space may be reallocated here, which may be unsafe
\r
7305 as the pointers in the calling routine may not be pointing to the right places.
\r
7307 return value: 0 convergent; -1: not convergent.
\r
7309 int ntime0=com.ntime, fix_blength0=com.fix_blength;
\r
7310 int status=0, i, npcom=com.np-com.ntime;
\r
7312 double *xcom=x+com.ntime, lnL0= *lnL, dl, e=1e-5;
\r
7313 double (*xbcom)[2]=xb+ntime0;
\r
7314 int small_times=0, max_small_times=100, ir,maxr=(npcom?200:1);
\r
7315 double small_improvement=0.001;
\r
7318 if(com.conPSiteClass) {
\r
7319 s = (2*com.ncode*com.ncode+com.ncode*(size_t)com.npatt)*sizeof(double);
\r
7320 if(com.sspace < s) { /* this assumes that space is com.space */
\r
7321 printf("\n%lu bytes in space, %lu bytes needed\n", com.sspace, s);
\r
7322 printf("minB: reallocating memory for working space.\n");
\r
7323 com.space = (double*)realloc(com.space, s);
\r
7324 if(com.space==NULL) error2("oom space");
\r
7328 g_minbranches = com.space;
\r
7329 varb_minbranches = com.space + com.np;
\r
7330 s = (3*com.ncode*com.ncode + (com.conPSiteClass) * 4 *(size_t)com.npatt) *sizeof(double);
\r
7331 if((space_minbranches=(double*)malloc(s))==NULL)
\r
7332 error2("oom minB");
\r
7333 if(com.ntime==0) error2("minB: should not come here");
\r
7335 if(*lnL<=0) *lnL = com.plfun(x,com.np);
\r
7336 e = e_minbranches = (npcom ? 5.0 : e0);
\r
7337 com.ntime = 0; com.fix_blength = 2;
\r
7339 if(com.NSsites==0) com.pomega = xcom+com.nrgene+!com.fix_kappa;
\r
7342 for(ir=0; (npcom==0||com.method) && ir<maxr; ir++) {
\r
7344 if(noisy>2) printf("\n\nRound %da: Paras (%d) (e=%.6g)",ir+1,npcom,e);
\r
7345 ming2(NULL,lnL,com.plfun,NULL,xcom, xbcom, com.space,e,npcom);
\r
7347 FPN(F0); FOR(i,npcom) printf(" %11.6f", xcom[i]);
\r
7348 printf("%8s%s\n", "", printtime(timestr));
\r
7351 noisy_minbranches = noisy;
\r
7353 printf("\nRound %db: Blengths (%d, e=%.6g)\n",ir+1,tree.nbranch,e_minbranches);
\r
7355 *lnL = minbranches(xcom, -1);
\r
7356 for(i=0; i<tree.nnode; i++)
\r
7357 if(i != tree.root)
\r
7358 x[nodes[i].ibranch] = nodes[i].branch;
\r
7359 if(noisy>2) printf("\n%s\n", printtime(timestr));
\r
7361 if((dl=fabs(*lnL-lnL0))<e0 && e<=0.02) break;
\r
7362 if(dl<small_improvement) small_times++;
\r
7363 else small_times=0;
\r
7364 if((small_times>max_small_times && ntime0<200) || (com.method==2&&ir==1)) {
\r
7365 if(noisy && com.method!=2) puts("\nToo slow, switching algorithm.");
\r
7369 if(noisy && small_times>5)
\r
7370 printf("\n%d rounds of small improvement.",small_times);
\r
7372 e/=2; if(dl<1) e/=2;
\r
7373 if(dl<0.5) e = min2(e,1e-3);
\r
7374 else if(dl>10) e = max2(e,0.1);
\r
7375 e_minbranches = max2(e, 1e-6);
\r
7380 fprintf(fout,"%4d %12.5f x ", ir+1,*lnL);
\r
7381 for(i=0; i<com.np; i++)
\r
7382 fprintf(fout, " %8.5f", x[i]);
\r
7383 FPN(fout); fflush(fout);
\r
7386 if (npcom && ir==maxr) status=-1;
\r
7388 if(npcom && status==2) {
\r
7389 noisy_minbranches = 0;
\r
7390 com.ntime = ntime0;
\r
7391 com.fix_blength = fix_blength0;
\r
7392 ming2(NULL,lnL,com.plfun,NULL,x,xb, com.space,e0,com.np);
\r
7393 for(i=0; i<tree.nnode; i++) space[i] = -1;
\r
7396 for(i=0; i<tree.nnode; i++)
\r
7397 if(i!=tree.root) x[nodes[i].ibranch] = nodes[i].branch;
\r
7399 if(noisy>2) printf("\nlnL = %12.6f\n",- *lnL);
\r
7401 com.ntime = ntime0;
\r
7402 com.fix_blength = fix_blength0;
\r
7403 *lnL = com.plfun(x,com.np); /* restore things, for e.g. AncestralSeqs */
\r
7404 if(fabs(*lnL-lnL0) > 1e-5)
\r
7405 printf("%.6f != %.6f lnL error. Something is wrong in minB\n", *lnL, lnL0);
\r
7406 free(space_minbranches);
\r
7408 return (status==-1 ? -1 : 0);
\r
7412 /********************* START: Testing iteration algorithm ******************/
\r
7414 int minB2 (FILE*fout, double *lnL,double x[],double xb[][2],double e0, double space[])
\r
7418 int ntime0=com.ntime, fix_blength0=com.fix_blength;
\r
7419 int status=0, i, npcom=com.np-com.ntime;
\r
7421 double *xcom=x+com.ntime, lnL0= *lnL;
\r
7422 double (*xbcom)[2]=xb+ntime0;
\r
7424 s = (3*com.ncode*com.ncode + (com.conPSiteClass) * 4*(size_t)com.npatt) * sizeof(double);
\r
7425 if((space_minbranches=(double*)malloc(s))==NULL) error2("oom minB2");
\r
7426 if(com.ntime==0 || npcom==0) error2("minB2: should not come here");
\r
7428 noisy_minbranches=0;
\r
7429 /* if(*lnL<=0) *lnL=com.plfun(x,com.np); */
\r
7430 com.ntime=0; com.fix_blength=2;
\r
7432 if(com.NSsites==0) com.pomega=xcom+com.nrgene+!com.fix_kappa;
\r
7435 ming2(NULL, lnL, minbranches, NULL, xcom, xbcom, space, e0, npcom);
\r
7438 com.ntime = ntime0; com.fix_blength = fix_blength0;
\r
7439 for(i=0; i<tree.nnode; i++)
\r
7440 if(i!=tree.root) x[nodes[i].ibranch] = nodes[i].branch;
\r
7441 *lnL = com.plfun(x,com.np); /* restore things, for e.g. AncestralSeqs */
\r
7442 free(space_minbranches);
\r
7444 return (status==-1 ? -1 : 0);
\r
7447 /********************* END: Testing iteration algorithm ******************/
\r
7450 static int times=0;
\r
7453 int updateconP (double x[], int inode)
\r
7455 /* update conP for inode.
\r
7457 Confusing decision about x[] follows. Think about redesign.
\r
7459 (1) Called by PostProbNode for ancestral reconstruction, with com.clock = 0,
\r
7460 1, 2: x[] is passed over and com.ntime is used to get xcom in
\r
7462 (2) Called from minbranches(), with com.clock = 0. xcom[] is passed
\r
7463 over by minbranches and com.ntime=0 is set. So SetPSiteClass()
\r
7464 can still get the correct substitution parameters.
\r
7465 Also look at ConditionalPNode().
\r
7467 Note that com.nodeScaleF and nodes[].conP are shifted if(com.conPSiteClass).
\r
7471 if(com.conPSiteClass==0)
\r
7472 for(ig=0; ig<com.ngene; ig++) {
\r
7473 if(com.Mgene>1 || com.nalpha>1)
\r
7474 SetPGene(ig,com.Mgene>1,com.Mgene>1,com.nalpha>1,x);
\r
7475 /* x[] needed by local clock models and if(com.aaDist==AAClasses).
\r
7476 This is called from PostProbNode
\r
7479 ConditionalPNode(inode, ig, x);
\r
7481 else { /* site-class models */
\r
7482 FOR(ir,com.ncatG) {
\r
7487 if(com.NnodeScale)
\r
7488 com.nodeScaleF += com.NnodeScale*(size_t)com.npatt;
\r
7489 for(i=com.ns; i<tree.nnode; i++)
\r
7490 nodes[i].conP += (tree.nnode-com.ns)*com.ncode*(size_t)com.npatt;
\r
7492 SetPSiteClass(ir, x);
\r
7493 for(ig=0; ig<com.ngene; ig++) {
\r
7494 if(com.Mgene>1 || com.nalpha>1)
\r
7495 SetPGene(ig, com.Mgene>1, com.Mgene>1, com.nalpha>1, x);
\r
7496 if(com.nalpha>1) SetPSiteClass(ir, x);
\r
7497 ConditionalPNode(inode,ig, x);
\r
7501 /* shift positions */
\r
7502 com.nodeScaleF -= (com.ncatG-1)*com.NnodeScale*com.npatt;
\r
7503 for(i=com.ns; i<tree.nnode; i++)
\r
7504 nodes[i].conP -= (com.ncatG-1)*(tree.nnode-com.ns)*com.ncode*(size_t)com.npatt;
\r
7510 double minbranches (double x[], int np)
\r
7512 /* Ziheng, November 1999.
\r
7513 optimizing one branch at a time
\r
7515 for each branch a..b, reroot the tree at b, and
\r
7516 then calculate conditional probability for node a.
\r
7517 For each branch, this routine determines the Newton search direction
\r
7518 p = -dl/ddl. It then shrinks the step length (dividing it by 4 each try) to make sure -lnL is decreased.
\r
7519 When the Newton solution is correct, this strategy will waste one
\r
7520 extra call to lfunt. It does not seem possible to remove calculation of
\r
7521 l (lnL) in lfuntdd().
\r
7522 lfun or lfundG and thus SetParameters are called once beforehand to set up
\r
7523 globals like com.pomega.
\r
7524 This works with NSsites and NSbranch models.
\r
7526 com.oldconP[] marks nodes that need to be updated when the tree is rerooted.
\r
7527 The array is declared in baseml and codeml and used in the following
\r
7528 routines: ReRootTree, minbranches, and ConditionalPNode.
\r
7530 Note: At the end of the routine, nodes[].conP are not updated.
\r
7532 int ib,oldroot=tree.root, a,b;
\r
7533 int icycle, maxcycle=500, icycleb, ncycleb=10, i;
\r
7534 double lnL, lnL0=0, l0,l,dl,ddl=-1, t,t0,t00, p,step=1, small=1e-20,y;
\r
7535 double tb[2]={1e-8,50}, e=e_minbranches, *space=space_minbranches;
\r
7536 double *xcom=x+com.ntime; /* this is incorrect as com.ntime=0 */
\r
7537 double smallddl=0.25/com.ls*(1-0.25/com.ls)/com.ls;
\r
7539 if(com.ntime) error2("ntime should be 0 in minbranches");
\r
7540 lnL0 = l0 = l = lnL = com.plfun(xcom,-1);
\r
7542 if(noisy_minbranches>2) printf("\tlnL0 = %14.6f\n",-lnL0);
\r
7544 for(icycle=0; icycle<maxcycle; icycle++) {
\r
7545 for(ib=0; ib<tree.nbranch; ib++) {
\r
7546 t = t0 = t00 = nodes[tree.branches[ib][1]].branch;
\r
7548 a = tree.branches[ib][0];
\r
7549 b = tree.branches[ib][1];
\r
7550 /* if a is the root, why do we want to reroot the tree at b? Just switch a with b? */
\r
7552 for(i=0; i<tree.nnode; i++)
\r
7557 for(icycleb=0; icycleb<ncycleb; icycleb++) { /* iterating a branch */
\r
7558 if(!com.conPSiteClass)
\r
7559 lfuntdd(t, a, b, xcom, &y, &dl, &ddl, space);
\r
7561 lfuntdd_SiteClass(t, a, b, xcom, &y, &dl, &ddl, space);
\r
7563 p = -dl/fabs(ddl);
\r
7564 /* p = -dl/ddl; newton direction */
\r
7565 if (fabs(p)<small) step = 0;
\r
7566 else if(p<0) step = min2(1, (tb[0]-t0)/p);
\r
7567 else step = min2(1, (tb[1]-t0)/p);
\r
7569 if(icycle==0 && step!=1 && step!=0)
\r
7570 step *= 0.99; /* avoid border */
\r
7571 for (i=0; step>small; i++,step/=4) {
\r
7573 if(!com.conPSiteClass) lfunt(t, a, b, xcom, &l, space);
\r
7574 else lfunt_SiteClass(t, a, b, xcom, &l, space);
\r
7577 if(step<=small) { t=t0; l=l0; break; }
\r
7578 if(fabs(t-t0)<e*fabs(1+t) && fabs(l-l0)<e) break;
\r
7581 nodes[a].branch = t;
\r
7583 g_minbranches[ib] = -dl;
\r
7584 varb_minbranches[ib] = -ddl;
\r
7587 if(noisy_minbranches>2) printf("\tCycle %2d: %14.6f\n",icycle+1, -l);
\r
7588 if(fabs(lnL-lnL0) < e) break;
\r
7590 } /* for (icycle) */
\r
7591 ReRootTree(oldroot); /* did not update conP */
\r
7592 FOR(i,tree.nnode) com.oldconP[i]=0;
\r
7598 int lfunt(double t, int a, int b, double xcom[], double *l, double space[])
\r
7600 /* See notes for lfuntdd and minbranches
\r
7602 int i,j,k, h,ig, n=com.ncode, nroot=n;
\r
7603 int n1 = (com.cleandata&&b<com.ns ? 1 : n), xb, nUVR;
\r
7604 double expt,uexpt=0,multiply;
\r
7605 double *P=space, piqi,pqj, fh, mr=0;
\r
7610 pkappa = (com.hkyREV||com.codonf==FMutSel ? xcom+com.nrgene : &com.kappa);
\r
7611 if (com.seqtype==CODONseq && com.model) {
\r
7612 if((com.model==NSbranchB || com.model==NSbranch2) && com.NSsites==0 && com.nbtype<=nUVR) {
\r
7613 U = _UU[(int)nodes[a].label];
\r
7614 V = _VV[(int)nodes[a].label];
\r
7615 Root = _Root[(int)nodes[a].label];
\r
7618 eigenQcodon(1, -1, NULL, NULL, NULL, Root, U, V, &mr, pkappa, nodes[a].omega, PMat);
\r
7625 eigenTN93(com.model, *nodes[a].pkappa, 1, com.pi, &nR, Root, Cijk);
\r
7630 for (ig=0; ig<com.ngene; ig++) {
\r
7631 if(com.Mgene>1) SetPGene(ig,1,1,0,xcom); /* com.ntime=0 */
\r
7632 for(i=0; i<n*n; i++) P[i] = 0;
\r
7634 for(k=0,expt=1; k<nroot; k++) {
\r
7635 multiply = com.rgene[ig]*Root[k];
\r
7636 if(k) expt = exp(t*multiply);
\r
7638 #if (CODEML) /* uses U & V */
\r
7639 for(i=0; i<n; i++)
\r
7640 for(j=0,uexpt=U[i*n+k]*expt; j<n; j++)
\r
7641 P[i*n+j] += uexpt*V[k*n+j];
\r
7642 #elif (BASEML) /* uses Cijk */
\r
7643 for(i=0; i<n; i++) for(j=0; j<n; j++)
\r
7644 P[i*n+j] += Cijk[i*n*nroot+j*nroot+k]*expt;
\r
7648 for (h=com.posG[ig]; h<com.posG[ig+1]; h++) {
\r
7649 n1 = (b<com.ns ? nChara[com.z[b][h]] : n);
\r
7650 for(i=0,fh=0; i<n1; i++) {
\r
7652 if(b<com.ns) piqi = com.pi[ xb = CharaMap[com.z[b][h]][i] ];
\r
7653 else piqi = com.pi[i] * nodes[b].conP[h*n+i];
\r
7655 for(j=0,pqj=0; j<n; j++)
\r
7656 pqj += P[xb*n+j]*nodes[a].conP[h*n+j];
\r
7659 if(noisy && fh<1e-250)
\r
7660 printf("a bit too small: fh[%d] = %10.6e\n",h,fh);
\r
7661 if(fh<0) fh = -500;
\r
7662 else fh = log(fh);
\r
7664 *l -= fh*com.fpatt[h];
\r
7665 for(i=0; i<com.NnodeScale; i++)
\r
7666 *l -= com.nodeScaleF[i*com.npatt+h]*com.fpatt[h];
\r
7673 int lfuntdd(double t, int a, int b, double xcom[], double *l, double*dl, double*ddl, double space[])
\r
7675 /* Calculates lnL for branch length t for branch b->a.
\r
7676 See notes in minbranches().
\r
7677 Conditional probability updated correctly already.
\r
7681 int i,j,k, h,ig,n=com.ncode, nroot=n;
\r
7682 int n1 = (com.cleandata&&b<com.ns ? 1 : n), xb, nUVR;
\r
7683 double expt, uexpt = 0, multiply;
\r
7684 double *P=space, *dP=P+n*n,*ddP=dP+n*n, piqi,pqj,dpqj,ddpqj, fh, dfh, ddfh;
\r
7685 double *pkappa, mr=0;
\r
7689 pkappa=(com.hkyREV||com.codonf==FMutSel ? xcom+com.nrgene : &com.kappa);
\r
7690 if (com.seqtype==CODONseq && com.model) {
\r
7691 if((com.model==NSbranchB || com.model==NSbranch2) && com.NSsites==0 && com.nbtype<=nUVR) {
\r
7692 U = _UU[(int)nodes[a].label];
\r
7693 V = _VV[(int)nodes[a].label];
\r
7694 Root = _Root[(int)nodes[a].label];
\r
7697 eigenQcodon(1,-1,NULL,NULL,NULL,Root,U,V, &mr, pkappa, nodes[a].omega, PMat);
\r
7704 eigenTN93(com.model, *nodes[a].pkappa, 1, com.pi, &nR, Root, Cijk);
\r
7707 *l = *dl = *ddl = 0;
\r
7708 for(ig=0; ig<com.ngene; ig++) {
\r
7709 if(com.Mgene>1) SetPGene(ig,1,1,0,xcom); /* com.ntime=0 */
\r
7710 for(i=0; i<n*n; i++) P[i] = dP[i] = ddP[i] = 0;
\r
7712 for(k=0,expt=1; k<nroot; k++) {
\r
7713 multiply = com.rgene[ig]*Root[k];
\r
7714 if(k) expt = exp(t*multiply);
\r
7716 #if (CODEML) /* uses U & V */
\r
7717 for(i=0; i<n; i++)
\r
7718 for(j=0,uexpt=U[i*n+k]*expt; j<n; j++) {
\r
7719 P[i*n+j] += uexpt*V[k*n+j];
\r
7721 dP[i*n+j] += uexpt*V[k*n+j]*multiply;
\r
7722 ddP[i*n+j] += uexpt*V[k*n+j]*multiply*multiply;
\r
7725 #elif (BASEML) /* uses Cijk */
\r
7726 for(i=0; i<n; i++) for(j=0; j<n; j++) {
\r
7727 P[i*n+j] += Cijk[i*n*nroot+j*nroot+k]*expt;
\r
7729 dP[i*n+j] += Cijk[i*n*nroot+j*nroot+k]*expt*multiply;
\r
7730 ddP[i*n+j] += Cijk[i*n*nroot+j*nroot+k]*expt*multiply*multiply;
\r
7736 for (h=com.posG[ig]; h<com.posG[ig+1]; h++) {
\r
7737 n1 = (b<com.ns ? nChara[com.z[b][h]] : n);
\r
7738 for(i=0,fh=dfh=ddfh=0; i<n1; i++) {
\r
7740 if(b<com.ns) piqi = com.pi[ xb = CharaMap[com.z[b][h]][i] ];
\r
7741 else piqi = com.pi[i] * nodes[b].conP[h*n+i];
\r
7742 for(j=0,pqj=dpqj=ddpqj=0; j<n; j++) {
\r
7743 pqj += P[xb*n+j] * nodes[a].conP[h*n+j];
\r
7744 dpqj += dP[xb*n+j] * nodes[a].conP[h*n+j];
\r
7745 ddpqj += ddP[xb*n+j] * nodes[a].conP[h*n+j];
\r
7749 ddfh += piqi*ddpqj;
\r
7751 if(noisy && fh<1e-250) {
\r
7752 printf("too small: fh[%d] = %10.6e\n",h,fh);
\r
7755 *l -= log(fh)*com.fpatt[h];
\r
7756 for(i=0; i<com.NnodeScale; i++)
\r
7757 *l -= com.nodeScaleF[i*com.npatt+h]*com.fpatt[h];
\r
7758 *dl -= dfh/fh * com.fpatt[h];
\r
7759 *ddl -= (fh*ddfh - dfh*dfh)/(fh*fh) * com.fpatt[h];
\r
7766 int lfunt_SiteClass(double t, int a, int b, double xcom[], double *l, double space[])
\r
7768 /* see notes in lfuntdd_SiteClass
\r
7769 For branch&site models, look at the notes in GetPMatBranch()
\r
7771 int i,j,k, h,ig,ir,it, n=com.ncode, nroot=n;
\r
7772 int n1=(com.cleandata&&b<com.ns?1:n), xb;
\r
7773 double y,expt,uexpt=0,multiply, piqi,pqj;
\r
7774 double *P=space, *fh=P+n*n;
\r
7775 double *Sh=fh+com.npatt; /* scale factor for each site pattern*/
\r
7776 double *pK=com.fhK; /* proportion for each site class after scaling */
\r
7777 double smallw=1e-12;
\r
7781 eigenTN93(com.model, *nodes[a].pkappa,1,com.pi,&nR,Root,Cijk);
\r
7785 if(com.NnodeScale==0)
\r
7786 for(ir=0; ir<com.ncatG; ir++)
\r
7787 for (h=0; h<com.npatt; h++)
\r
7788 pK[ir*com.npatt+h] = com.freqK[ir];
\r
7790 for(h=0; h<com.npatt; h++) {
\r
7791 for(ir=0,it=0; ir<com.ncatG; ir++) {
\r
7792 for(k=0,y=0; k<com.NnodeScale; k++)
\r
7793 y += com.nodeScaleF[ir*com.NnodeScale*com.npatt + k*com.npatt+h];
\r
7794 if((pK[ir*com.npatt+h]=y) > pK[it*com.npatt+h])
\r
7797 Sh[h] = pK[it*com.npatt+h];
\r
7798 for(ir=0; ir<com.ncatG; ir++)
\r
7799 pK[ir*com.npatt+h] = com.freqK[ir]*exp(pK[ir*com.npatt+h]-Sh[h]);
\r
7803 for(h=0; h<com.npatt; h++) fh[h] = 0;
\r
7804 for(ir=0; ir<com.ncatG; ir++) {
\r
7805 SetPSiteClass(ir, xcom); /* com.ntime=0 */
\r
7807 #if CODEML /* branch b->a */
\r
7808 /* branch&site models */
\r
7809 if(com.seqtype==CODONseq && com.NSsites && com.model)
\r
7810 Set_UVR_BranchSite (ir, (int)nodes[a].label);
\r
7814 for(i=com.ns;i<tree.nnode;i++)
\r
7815 nodes[i].conP += (tree.nnode-com.ns)*n*(size_t)com.npatt;
\r
7817 for (ig=0; ig<com.ngene; ig++) {
\r
7818 if(com.Mgene>1 || com.nalpha>1)
\r
7819 SetPGene(ig,com.Mgene>1,com.Mgene>1,com.nalpha>1,xcom); /* com.ntime=0 */
\r
7820 if(com.nalpha>1) SetPSiteClass(ir, xcom); /* com.ntime=0 */
\r
7822 for(i=0; i<n*n; i++) P[i] = 0;
\r
7823 for(k=0,expt=1; k<nroot; k++) {
\r
7824 multiply = com.rgene[ig]*Root[k]*_rateSite;
\r
7826 if(com.seqtype==1 && com.model>=2)
\r
7827 multiply *= Qfactor_NS_branch[(int)nodes[a].label];
\r
7829 if(k) expt = exp(t*multiply);
\r
7831 #if (CODEML) /* uses U & V */
\r
7832 for(i=0; i<n; i++)
\r
7833 for(j=0,uexpt=U[i*n+k]*expt; j<n; j++)
\r
7834 P[i*n+j] += uexpt*V[k*n+j];
\r
7835 #elif (BASEML) /* uses Cijk */
\r
7836 for(i=0; i<n; i++)
\r
7837 for(j=0; j<n; j++)
\r
7838 P[i*n+j] += Cijk[i*n*nroot+j*nroot+k]*expt;
\r
7840 } /* for (k), look through eigenroots */
\r
7841 for (h=com.posG[ig]; h<com.posG[ig+1]; h++) {
\r
7842 n1 = (b<com.ns ? nChara[com.z[b][h]] : n);
\r
7843 for(i=0; i<n1; i++) {
\r
7845 if(b<com.ns) piqi = pK[ir*com.npatt+h] * com.pi[ xb = CharaMap[com.z[b][h]][i] ];
\r
7846 else piqi = pK[ir*com.npatt+h] * com.pi[i] * nodes[b].conP[h*n+i];
\r
7848 for(j=0,pqj=0; j<n; j++)
\r
7849 pqj += P[xb*n+j]*nodes[a].conP[h*n+j];
\r
7850 fh[h] += piqi*pqj;
\r
7856 for(i=com.ns; i<tree.nnode; i++) /* shift position */
\r
7857 nodes[i].conP -= (com.ncatG-1)*(tree.nnode-com.ns)*n*(size_t)com.npatt;
\r
7858 for(h=0,*l=0; h<com.npatt; h++) {
\r
7860 printf("small (lfunt_SiteClass): fh[%d] = %10.6e\n",h,fh[h]);
\r
7862 *l -= log(fh[h])*com.fpatt[h];
\r
7863 if(com.NnodeScale) *l -= Sh[h]*com.fpatt[h];
\r
7869 int lfuntdd_SiteClass(double t, int a,int b,double xcom[],
\r
7870 double *l,double*dl,double*ddl,double space[])
\r
7872 /* dt and ddt for site-class models, modified from lfuntdd()
\r
7873 nodes[].conP (and com.nodeScaleF if scaling is used) is shifted for ir,
\r
7874 and moved back to the original place at the end of the routine.
\r
7876 At the start of this routine, nodes[].conP has the conditional probabilities
\r
7877 for each node, each site pattern, for each site class (ir).
\r
7878 Scaling: When scaling is used, scale factors
\r
7879 com.nodeScaleF[ir*com.NnodeScale*com.npatt + k*com.npatt+h] for all nodes
\r
7880 are collected into Sh[h], after adjusting for rate classes, since the
\r
7881 sum is taken over ir. Sh[h] and pK[ir*com.npatt+h] together store the
\r
7882 scale factors and proportions for site classes. com.freqK[ir] is not
\r
7883 used in this routine beyond this point.
\r
7884 if(com.Malpha), com.freqK[]=1/com.ncatG and does not change with ig,
\r
7885 and so the collection of Sh for sites at the start of the routine is o.k.
\r
7887 The space for com.fhK[] is used.
\r
7888 space[2*ncode*ncode + 4*npatt]:
\r
7889 dP[ncode*ncode],ddP[ncode*ncode],fh[npatt],dfh[npatt],ddfh[npatt],Sh[npatt]
\r
7890 pK[ncatG*npatt]=com.fhK[]
\r
7892 int i,j,k, h,ig,ir,it, n=com.ncode, nroot=n;
\r
7893 int n1=(com.cleandata&&b<com.ns?1:n), xb;
\r
7894 double y,expt,uexpt=0,multiply, piqi,pqj,dpqj,ddpqj;
\r
7895 double *P=PMat, *dP=space,*ddP=dP+n*n;
\r
7896 double *fh=ddP+n*n, *dfh=fh+com.npatt, *ddfh=dfh+com.npatt;
\r
7897 double *Sh=ddfh+com.npatt; /* scale factor for each site pattern */
\r
7898 double *pK=com.fhK; /* proportion for each site class after scaling */
\r
7899 double smallw=1e-12;
\r
7904 eigenTN93(com.model, *nodes[a].pkappa, 1, com.pi, &nR, Root, Cijk);
\r
7907 if(com.NnodeScale==0)
\r
7908 for(ir=0; ir<com.ncatG; ir++)
\r
7909 for(h=0; h<com.npatt; h++)
\r
7910 pK[ir*com.npatt+h] = com.freqK[ir];
\r
7912 for(h=0; h<com.npatt; h++) {
\r
7913 for(ir=0,it=0; ir<com.ncatG; ir++) {
\r
7914 for(k=0,y=0; k<com.NnodeScale; k++)
\r
7915 y += com.nodeScaleF[ir*com.NnodeScale*com.npatt + k*com.npatt+h];
\r
7916 if((pK[ir*com.npatt+h]=y) > pK[it*com.npatt+h])
\r
7919 Sh[h] = pK[it*com.npatt+h];
\r
7920 for(ir=0; ir<com.ncatG; ir++)
\r
7921 pK[ir*com.npatt+h] = com.freqK[ir] * exp(pK[ir*com.npatt+h]-Sh[h]);
\r
7925 for(h=0; h<com.npatt; h++)
\r
7926 fh[h] = dfh[h] = ddfh[h] = 0;
\r
7927 for(ir=0; ir<com.ncatG; ir++) {
\r
7928 SetPSiteClass(ir, xcom); /* com.ntime=0 */
\r
7930 #if CODEML /* branch b->a */
\r
7931 /* branch&site models */
\r
7932 if(com.seqtype==CODONseq && com.NSsites && com.model)
\r
7933 Set_UVR_BranchSite (ir, (int)nodes[a].label);
\r
7937 for(i=com.ns; i<tree.nnode; i++)
\r
7938 nodes[i].conP += (tree.nnode-com.ns)*n*(size_t)com.npatt;
\r
7940 for (ig=0; ig<com.ngene; ig++) {
\r
7941 if(com.Mgene>1 || com.nalpha>1)
\r
7942 SetPGene(ig,com.Mgene>1,com.Mgene>1,com.nalpha>1,xcom); /* com.ntime=0 */
\r
7943 if(com.nalpha>1) SetPSiteClass(ir, xcom); /* com.ntime=0 */
\r
7945 for(i=0; i<n*n; i++)
\r
7946 P[i] = dP[i] = ddP[i]=0;
\r
7947 for(k=0,expt=1; k<nroot; k++) { /* k loops through eigenroots */
\r
7948 multiply = com.rgene[ig]*Root[k]*_rateSite;
\r
7950 if(com.seqtype==1 && com.model>=2)
\r
7951 multiply *= Qfactor_NS_branch[(int)nodes[a].label];
\r
7953 if(k) expt = exp(t*multiply);
\r
7955 #if (CODEML) /* uses U & V */
\r
7956 for(i=0; i<n; i++)
\r
7957 for(j=0,uexpt=U[i*n+k]*expt; j<n; j++) {
\r
7958 P[i*n+j] += uexpt*V[k*n+j];
\r
7960 dP[i*n+j] += uexpt*V[k*n+j]*multiply;
\r
7961 ddP[i*n+j] += uexpt*V[k*n+j]*multiply*multiply;
\r
7964 #elif (BASEML) /* uses Cijk */
\r
7965 for(i=0; i<n; i++) for(j=0; j<n; j++) {
\r
7966 P[i*n+j] += Cijk[i*n*nroot+j*nroot+k]*expt;
\r
7968 dP[i*n+j] += Cijk[i*n*nroot+j*nroot+k]*expt*multiply;
\r
7969 ddP[i*n+j] += Cijk[i*n*nroot+j*nroot+k]*expt*multiply*multiply;
\r
7975 for (h=com.posG[ig]; h<com.posG[ig+1]; h++) {
\r
7976 n1 = (b<com.ns ? nChara[com.z[b][h]] : n);
\r
7977 for(i=0; i<n1; i++) {
\r
7980 piqi = pK[ir*com.npatt+h] * com.pi[ xb = CharaMap[com.z[b][h]][i] ];
\r
7982 piqi = pK[ir*com.npatt+h] * com.pi[i] * nodes[b].conP[h*n+i];
\r
7984 for(j=0,pqj=dpqj=ddpqj=0; j<n; j++) {
\r
7985 pqj += P[xb*n+j]*nodes[a].conP[h*n+j];
\r
7986 dpqj += dP[xb*n+j]*nodes[a].conP[h*n+j];
\r
7987 ddpqj += ddP[xb*n+j]*nodes[a].conP[h*n+j];
\r
7989 fh[h] += piqi*pqj;
\r
7990 dfh[h] += piqi*dpqj;
\r
7991 ddfh[h] += piqi*ddpqj;
\r
7997 for(i=com.ns; i<tree.nnode; i++)
\r
7998 nodes[i].conP -= (com.ncatG-1)*(tree.nnode-com.ns)*n*(size_t)com.npatt;
\r
7999 for(h=0,*l=*dl=*ddl=0; h<com.npatt; h++) {
\r
8001 printf("small fh[%d] = %10.6e\n",h,fh[h]);
\r
8003 *l -= log(fh[h])*com.fpatt[h];
\r
8004 if(com.NnodeScale) *l -= Sh[h]*com.fpatt[h];
\r
8005 *dl -= dfh[h]/fh[h] * com.fpatt[h];
\r
8006 *ddl -= (fh[h]*ddfh[h] - dfh[h]*dfh[h])/(fh[h]*fh[h]) * com.fpatt[h];
\r
8015 #endif /* #ifdef LFUNCTIONS */
\r
8019 void BranchLengthBD(int rooted, double birth, double death, double sample,
\r
8022 /* Generate random branch lengths (nodes[].branch) using the birth and
\r
8023 death process with species sampling, or the Yule (coalescent?) process
\r
8024 if sample=0, when only parameter mut is used.
\r
8025 Note: older interior nodes have larger node numbers, so root is at
\r
8026 node com.ns*2-2 with time t[ns-2], while the youngest node is at
\r
8027 node com.ns with time t[0]. When rooted=0, the root is removed with
\r
8028 branch lengths adjusted.
\r
8029 This works with the tree generated from RandomLHistory().
\r
8031 int i,j, it, imin,fixt0=1;
\r
8032 double la=birth, mu=death, rho=sample, tmin, r, t[NS-1];
\r
8033 double phi, eml, y;
\r
8035 if (sample==0) /* coalescent model. Check this!!! */
\r
8036 for (i=com.ns,y=0; i>1; i--)
\r
8037 nodes[com.ns*2-i].age=y += -log(rndu())/(i*(i-1.)/2.)*mut/2;
\r
8038 else { /* BD with sampling */
\r
8039 if (fixt0) t[com.ns-2]=1;
\r
8040 if (fabs(la-mu)>1e-6) {
\r
8041 eml = exp(mu-la);
\r
8042 phi = (rho*la*(eml-1)+(mu-la)*eml)/(eml-1);
\r
8043 for (i=0; i<com.ns-1-(fixt0); i++) {
\r
8045 t[i] = log((phi-r*rho*la)/(phi-r*rho*la+r*(la-mu)))/(mu-la);
\r
8049 for (i=0; i<com.ns-1-(fixt0); i++) {
\r
8051 t[i] = r/(1+la*rho*(1-r));
\r
8054 for (i=0; i<com.ns-1-1; i++) {
\r
8055 for (j=i+1,tmin=t[i],imin=i; j<com.ns-1; j++)
\r
8056 if (tmin>t[j]) { tmin=t[j]; imin=j; }
\r
8060 for (i=com.ns; i>1; i--)
\r
8061 nodes[com.ns*2-i].age = t[com.ns-i]*mut;
\r
8063 for(i=0; i<com.ns; i++) nodes[i].age = 0;
\r
8064 for (i=0; i<tree.nnode; i++)
\r
8065 if (i != tree.root)
\r
8066 nodes[i].branch = nodes[nodes[i].father].age - nodes[i].age;
\r
8068 it = nodes[tree.root].sons[2];
\r
8069 nodes[it].branch = 2*nodes[2*com.ns-2].age - nodes[tree.root].age - nodes[it].age;
\r
8076 #ifdef NODESTRUCTURE
\r
8079 int RandomLHistory (int rooted, double space[])
\r
8081 /* random coalescence tree, with each labeled history having equal probability.
\r
8082 Interior nodes are numbered ns, ns+1, ..., 2*ns-2-!rooted.
\r
8084 int ns=com.ns, i, j, it=0, *nodea=(int*)space;
\r
8087 for (i=0; i<2*ns-1-!rooted; i++) ClearNode(i);
\r
8089 for (i=0; i<ns; i++) nodea[i]=i;
\r
8090 for (i=ns,t=0; i>(1+!rooted); i--) {
\r
8091 nodes[it=2*ns-i].nson = 2;
\r
8092 j = (int)(i*rndu());
\r
8093 nodes[nodea[j]].father = it;
\r
8094 nodes[it].sons[0] = nodea[j];
\r
8095 nodea[j] = nodea[i-1];
\r
8096 j = (int)((i-1)*rndu());
\r
8097 nodes[nodea[j]].father = it;
\r
8098 nodes[it].sons[1] = nodea[j];
\r
8100 if (!rooted && i==3) {
\r
8101 nodes[it].nson++;
\r
8102 nodes[nodea[1-j]].father = it;
\r
8103 nodes[it].sons[2] = nodea[1-j];
\r
8107 tree.nnode = ns*2-1-!rooted;
\r
8114 #endif /* NODESTRUCTURE */
\r
8118 /* routines for dating analysis of heterogeneous data */
\r
8119 #if (defined BASEML || defined CODEML || defined MCMCTREE)
\r
8122 #if (defined MCMCTREE)
\r
8124 int ProcessFossilInfo()
\r
8126 /* This processes fossil calibration information that has been read into
\r
8127 nodes[].nodeStr. It uses both sptree and nodes[], before it is destroyed.
\r
8128 This is called before sequence alignments at loci are read.
\r
8130 Possible confusions:
\r
8131 Simple lower and upper bounds can be specified using <, >, or both < and > in
\r
8132 the tree either with or without quotation marks. These are read in ReadTreeN()
\r
8133 and processed in ReadTreeSeqs().
\r
8134 Other distributions such as G, SN, ST must be specified using the format 'G(alpha, beta)',
\r
8135 say, and are processed here. Simple bounds can also be specified using the format
\r
8136 'L(0.5)', 'U(1.0)', or 'B(0.5, 1.0)', in which case they are processed here.
\r
8137 I kept this complexity, (i) to keep the option of using <, >, which is intuitive,
\r
8138 (ii) for ReadTreeN to be able to read other node labels such as #, $, either with
\r
8141 int i,j,k, nfossiltype=7;
\r
8143 double tailL=0.025, tailR=0.025, p_LOWERBOUND=0.1, c_LOWERBOUND=1.0;
\r
8145 for(i=sptree.nspecies; i<tree.nnode; i++) {
\r
8146 if(nodes[i].nodeStr == NULL)
\r
8148 if(sptree.nodes[i].fossil) { /* fossils specified using <, >, already processed. */
\r
8149 free(nodes[i].nodeStr);
\r
8152 for(j=1; j<nfossiltype+1; j++)
\r
8153 if((pch = strstr(nodes[i].nodeStr, fossils[j]))) break;
\r
8154 if(j == nfossiltype+1)
\r
8155 printf("\nunrecognized fossil calibration: %s\n", nodes[i].nodeStr);
\r
8157 sptree.nodes[i].fossil = j;
\r
8158 pch = strchr(nodes[i].nodeStr, '(') + 1;
\r
8162 /* truncated Cauchy default prior L(tL, p, c) */
\r
8163 sptree.nodes[i].pfossil[1] = p_LOWERBOUND;
\r
8164 sptree.nodes[i].pfossil[2] = c_LOWERBOUND;
\r
8165 sptree.nodes[i].pfossil[3] = tailL;
\r
8166 sscanf(pch, "%lf,%lf,%lf,%lf", &sptree.nodes[i].pfossil[0], &sptree.nodes[i].pfossil[1],
\r
8167 &sptree.nodes[i].pfossil[2], &sptree.nodes[i].pfossil[3]);
\r
8170 sptree.nodes[i].pfossil[2] = tailR;
\r
8171 sscanf(pch, "%lf,%lf", &sptree.nodes[i].pfossil[1], &sptree.nodes[i].pfossil[2]);
\r
8174 sptree.nodes[i].pfossil[2] = tailL;
\r
8175 sptree.nodes[i].pfossil[3] = tailR;
\r
8176 sscanf(pch, "%lf,%lf,%lf,%lf", &sptree.nodes[i].pfossil[0], &sptree.nodes[i].pfossil[1],
\r
8177 &sptree.nodes[i].pfossil[2], &sptree.nodes[i].pfossil[3]);
\r
8178 if(sptree.nodes[i].pfossil[0] > sptree.nodes[i].pfossil[1]) {
\r
8179 printf("fossil bounds (%.4f, %.4f)", sptree.nodes[i].pfossil[0], sptree.nodes[i].pfossil[1]);
\r
8180 error2("fossil bounds in tree incorrect");
\r
8184 sscanf(pch, "%lf,%lf", &sptree.nodes[i].pfossil[0], &sptree.nodes[i].pfossil[1]);
\r
8187 sscanf(pch, "%lf,%lf,%lf", &sptree.nodes[i].pfossil[0], &sptree.nodes[i].pfossil[1], &sptree.nodes[i].pfossil[2]);
\r
8190 sscanf(pch, "%lf,%lf,%lf,%lf", &sptree.nodes[i].pfossil[0], &sptree.nodes[i].pfossil[1], &sptree.nodes[i].pfossil[2], &sptree.nodes[i].pfossil[3]);
\r
8193 sscanf(pch, "%lf,%lf,%lf,%lf,%lf,%lf,%lf", &sptree.nodes[i].pfossil[0], &sptree.nodes[i].pfossil[1],
\r
8194 &sptree.nodes[i].pfossil[2], &sptree.nodes[i].pfossil[3], &sptree.nodes[i].pfossil[4],
\r
8195 &sptree.nodes[i].pfossil[5], &sptree.nodes[i].pfossil[6]);
\r
8200 sptree.nodes[i].usefossil = 1;
\r
8201 nodes[i].branch = nodes[i].label = 0;
\r
8202 free(nodes[i].nodeStr);
\r
8211 int GenerateGtree (int locus);
\r
8213 int ReadTreeSeqs (FILE*fout)
\r
8215 /* This reads the combined species tree, the fossil calibration information,
\r
8216 and sequence data at each locus. sptree.nodes[].pfossil[] has tL, tU for
\r
8217 bounds or alpha and beta for the gamma prior.
\r
8219 This routine also processes fossil calibration information specified using
\r
8220 <, >, or both. More complex specifications are stored in nodes[].nodeStr and
\r
8221 processed in ProcessFossilInfo(). See notes in that routine.
\r
8223 This also constructs the gene tree at each locus, by pruning the master
\r
8226 FILE *fseq, *ftree;
\r
8227 int i,j, locus, clean0=com.cleandata;
\r
8228 double tailL=0.025, tailR=0.025, p_LOWERBOUND=0.1, c_LOWERBOUND=1.0;
\r
8230 ftree = gfopen(com.treef,"r");
\r
8232 /* read master species tree and process fossil calibration info */
\r
8233 fscanf(ftree, "%d%d", &sptree.nspecies, &i);
\r
8234 com.ns = sptree.nspecies;
\r
8235 if(com.ns>NS) error2("raise NS?");
\r
8236 /* to read master species names into sptree.nodes[].name */
\r
8237 if(noisy) puts("Reading master tree.");
\r
8238 for(j=0; j<sptree.nspecies; j++)
\r
8239 com.spname[j] = sptree.nodes[j].name;
\r
8242 ReadTreeN(ftree, &i, &j, 1, 1);
\r
8244 for(i=j=0; i<tree.nnode; i++)
\r
8245 if(i!=tree.root && nodes[i].branch>0) j++;
\r
8246 if(j==tree.nbranch)
\r
8247 printf("\aTree with fossil calibrations should not have branch lengths!");
\r
8249 if(com.clock==5 || com.clock==6)
\r
8250 for(i=0; i<tree.nnode; i++) nodes[i].branch = nodes[i].label = 0;
\r
8251 for(i=0; i<tree.nnode; i++)
\r
8252 if(nodes[i].label<0) nodes[i].label = 0; /* change -1 into 0 */
\r
8254 /* OutTreeN(F0,0,0); FPN(F0); */
\r
8255 OutTreeN(F0,1,0); FPN(F0);
\r
8256 /* OutTreeN(F0,1,1); FPN(F0); */
\r
8257 /* copy master tree into sptree */
\r
8258 if(tree.nnode != 2*com.ns-1)
\r
8259 error2("check and think about multificating trees.");
\r
8260 sptree.nnode = tree.nnode; sptree.nbranch = tree.nbranch;
\r
8261 sptree.root = tree.root; sptree.nfossil = 0;
\r
8262 for(i=0; i<sptree.nspecies*2-1; i++) {
\r
8263 sptree.nodes[i].father = nodes[i].father;
\r
8264 sptree.nodes[i].nson = nodes[i].nson;
\r
8265 if(nodes[i].nson!=0 && nodes[i].nson!=2)
\r
8266 error2("master tree has to be binary.");
\r
8267 for(j=0; j<sptree.nodes[i].nson; j++)
\r
8268 sptree.nodes[i].sons[j] = nodes[i].sons[j];
\r
8270 sptree.nodes[i].fossil = nodes[i].fossil;
\r
8271 sptree.nodes[i].age = nodes[i].age;
\r
8272 sptree.nodes[i].pfossil[0] = nodes[i].branch; /* ">": Lower bound */
\r
8273 sptree.nodes[i].pfossil[1] = nodes[i].label; /* "<": Upper bound */
\r
8275 if(nodes[i].branch && nodes[i].label > 0) { /* joint bound: >0.8<1.2 */
\r
8276 if(nodes[i].age == 0) {
\r
8277 sptree.nodes[i].fossil = BOUND_F;
\r
8278 sptree.nodes[i].pfossil[2] = tailL;
\r
8279 sptree.nodes[i].pfossil[3] = tailR;
\r
8282 error2("\nUse 'G(alpha, beta)' to specify the gamma calibration");
\r
8286 else if(nodes[i].branch) { /* lower bound: >0.8 */
\r
8287 sptree.nodes[i].fossil = LOWER_F;
\r
8288 sptree.nfossil++;
\r
8289 /* truncated Cauchy default prior L(tL, p, c) */
\r
8290 sptree.nodes[i].pfossil[1] = p_LOWERBOUND;
\r
8291 sptree.nodes[i].pfossil[2] = c_LOWERBOUND;
\r
8292 sptree.nodes[i].pfossil[3] = tailL;
\r
8294 else if(nodes[i].label > 0) { /* upper bound: <1.2 */
\r
8295 sptree.nodes[i].fossil = UPPER_F;
\r
8296 sptree.nfossil++;
\r
8297 sptree.nodes[i].pfossil[2] = tailR;
\r
8300 if(sptree.nodes[i].fossil)
\r
8301 sptree.nodes[i].usefossil = 1;
\r
8303 nodes[i].branch = nodes[i].label = 0;
\r
8306 #if (defined MCMCTREE)
\r
8307 if(!com.TipDate) ProcessFossilInfo();
\r
8310 /* read sequences at each locus, construct gene tree by pruning sptree */
\r
8311 data.ngene = com.ndata;
\r
8313 fseq = gfopen(com.seqf, "r");
\r
8314 if((gnodes=(struct TREEN**)malloc(sizeof(struct TREEN*)*data.ngene)) == NULL)
\r
8317 printf("\nReading sequence data.. %d loci\n", data.ngene);
\r
8318 for(locus=0; locus<data.ngene; locus++) {
\r
8319 fprintf(fout, "\n\n*** Locus %d ***\n", locus+1);
\r
8320 printf("\n\n*** Locus %d ***\n", locus+1);
\r
8322 com.cleandata=(char)clean0;
\r
8323 for(j=0; j<sptree.nspecies; j++)
\r
8324 com.spname[j] = NULL; /* points to nowhere */
\r
8325 #if (defined CODEML)
\r
8326 if(com.seqtype==1) {
\r
8327 com.icode = data.icode[locus];
\r
8331 ReadSeq(fout, fseq, clean0, locus); /* allocates com.spname[] */
\r
8332 #if (defined CODEML)
\r
8333 if(com.seqtype == 1) {
\r
8334 if(com.sspace < max2(com.ngene+1,com.ns)*(64+12+4)*sizeof(double)) {
\r
8335 com.sspace = max2(com.ngene+1,com.ns)*(64+12+4)*sizeof(double);
\r
8336 if((com.space = (double*)realloc(com.space,com.sspace))==NULL)
\r
8337 error2("oom space for #c");
\r
8339 InitializeCodon(fout,com.space);
\r
8343 data.ns[locus] = com.ns;
\r
8344 data.ls[locus] = com.ls;
\r
8346 if(data.datatype[locus] == MORPHC)
\r
8351 if(com.seqtype==0 || com.seqtype==2)
\r
8352 InitializeBaseAA(fout);
\r
8354 if((com.seqtype==0 || com.seqtype==2) && com.model==0)
\r
8355 PatternWeightJC69like(fout);
\r
8356 xtoy(com.pi, data.pi[locus], com.ncode);
\r
8357 data.cleandata[locus] = (char)com.cleandata;
\r
8358 data.npatt[locus] = com.npatt;
\r
8359 data.fpatt[locus] = com.fpatt; com.fpatt=NULL;
\r
8360 for(i=0; i<com.ns; i++) {
\r
8361 data.z[locus][i] = com.z[i];
\r
8364 printf("%3d patterns, %s\n", com.npatt, (com.cleandata? "clean": "messy"));
\r
8367 GenerateGtree(locus); /* free com.spname[] */
\r
8369 for(i=0,com.cleandata=1; i<data.ngene; i++)
\r
8370 if(data.cleandata[i]==0)
\r
8371 com.cleandata = 0;
\r
8373 fclose(ftree); fclose(fseq);
\r
8374 SetMapAmbiguity();
\r
8377 #if(defined MCMCTREE)
\r
8379 /* com.TipDate_TimeUnit is already initialized, and it won't be changed in GetTipDate() */
\r
8380 GetTipDate(&com.TipDate, &com.TipDate_TimeUnit);
\r
8381 for(i=0; i<sptree.nspecies; i++)
\r
8382 sptree.nodes[i].age = nodes[i].age;
\r
8390 int GenerateGtree (int locus)
\r
8392 /* construct the gene tree at locus by pruning tips in the master species
\r
8393 tree. com.spname[] has the names of species at the current locus (probably read
\r
8394 from the sequence alignment at the locus). They are used by the routine to compare
\r
8395 with sptree.nodes[].name to decide which species to keep for the locus.
\r
8396 See GetSubTreeN() for more details.
\r
8398 int ns=data.ns[locus], i,j, ipop[NS], keep[NS], newnodeNO[2*NS-1];
\r
8400 for(j=0; j<sptree.nspecies; j++) keep[j]=0;
\r
8401 for(i=0;i<ns;i++) {
\r
8402 for(j=0;j<sptree.nspecies;j++)
\r
8403 if(!strcmp(com.spname[i], sptree.nodes[j].name)) break;
\r
8404 if(j==sptree.nspecies) {
\r
8405 printf("species %s not found in master tree\n", com.spname[i]);
\r
8409 printf("\nspecies %s occurs twice in locus %d", com.spname[i], locus+1);
\r
8410 error2("\ngiving up...");
\r
8412 keep[j] = i+1; ipop[i] = j; /* seq i in alignment is species j in master tree. */
\r
8413 free(com.spname[i]);
\r
8416 /* copy master species tree and then prune it. */
\r
8418 GetSubTreeN(keep, newnodeNO);
\r
8421 for(i=0;i<sptree.nnode;i++)
\r
8422 if(newnodeNO[i]!=-1) nodes[newnodeNO[i]].ipop = i;
\r
8423 /* printGtree(0); */
\r
8425 gnodes[locus] = (struct TREEN*)malloc((ns*2-1)*sizeof(struct TREEN));
\r
8426 if(gnodes[locus] == NULL) error2("oom gtree");
\r
8427 memcpy(gnodes[locus], nodes, (ns*2-1)*sizeof(struct TREEN));
\r
8428 data.root[locus]=tree.root;
\r
8434 int printGtree (int printBlength)
\r
8438 for(i=0; i<com.ns; i++)
\r
8439 com.spname[i]=sptree.nodes[nodes[i].ipop].name;
\r
8440 for(i=0;i<tree.nnode;i++)
\r
8442 nodes[i].branch=nodes[nodes[i].father].age-nodes[i].age;
\r
8443 printf("\nns = %d nnode = %d", com.ns, tree.nnode);
\r
8444 printf("\n%7s%7s %8s %7s%7s","father","node","(ipop)","nson:","sons");
\r
8445 for(i=0; i<tree.nnode; i++) {
\r
8446 printf ("\n%7d%7d (%2d) %7d ",
\r
8447 nodes[i].father+1, i+1, nodes[i].ipop+1, nodes[i].nson);
\r
8448 for(j=0; j<nodes[i].nson; j++) printf (" %2d", nodes[i].sons[j]+1);
\r
8450 FPN(F0); OutTreeN(F0,0,0); FPN(F0); OutTreeN(F0,1,0); FPN(F0);
\r
8451 if(printBlength) { OutTreeN(F0,1,1); FPN(F0); }
\r
8456 void copySptree (void)
\r
8458 /* This copies sptree into nodes = nodes_t, for printing or editing
\r
8463 com.ns = sptree.nspecies; tree.root = sptree.root;
\r
8464 tree.nnode = sptree.nnode; tree.nbranch = sptree.nbranch;
\r
8465 for(i=0; i<sptree.nnode; i++) {
\r
8466 /* this is used by mcmctree */
\r
8467 if(i<com.ns) com.spname[i] = sptree.nodes[i].name;
\r
8469 /* The following may be needed by bpp. Check carefully. */
\r
8471 if(i<com.ns) strcpy(com.spname[i], sptree.nodes[i].name);
\r
8473 nodes[i].father =sptree.nodes[i].father;
\r
8474 nodes[i].nson = sptree.nodes[i].nson;
\r
8475 for(j=0;j<nodes[i].nson;j++)
\r
8476 nodes[i].sons[j] = sptree.nodes[i].sons[j];
\r
8477 nodes[i].fossil = sptree.nodes[i].fossil;
\r
8478 nodes[i].age = sptree.nodes[i].age;
\r
8479 if(i != tree.root)
\r
8480 nodes[i].branch = sptree.nodes[nodes[i].father].age - sptree.nodes[i].age;
\r
8484 void printSptree (void)
\r
8488 printf("\n************\nSpecies tree\nns = %d nnode = %d", sptree.nspecies, sptree.nnode);
\r
8489 printf("\n%7s%7s %-8s %12s %12s%16s\n","father","node","name","time","fossil","sons");
\r
8490 for (i=0; i<sptree.nnode; i++) {
\r
8491 printf("%7d%7d %-14s %9.5f",
\r
8492 sptree.nodes[i].father+1, i+1, sptree.nodes[i].name, sptree.nodes[i].age);
\r
8495 if((k = sptree.nodes[i].fossil)) {
\r
8496 printf(" %s ( ", fossils[k]);
\r
8497 for(j=0; j<npfossils[k]; j++) {
\r
8498 printf("%6.4f", sptree.nodes[i].pfossil[j + (k==UPPER_F)]);
\r
8499 printf("%s", (j==npfossils[k]-1 ? " ) " : ", "));
\r
8504 if(sptree.nodes[i].nson)
\r
8505 printf(" (%2d %2d)", sptree.nodes[i].sons[0]+1, sptree.nodes[i].sons[1]+1);
\r
8509 FPN(F0); OutTreeN(F0,0,0); FPN(F0); OutTreeN(F0,1,0); FPN(F0);
\r
8510 OutTreeN(F0,1,1); FPN(F0);
\r
8519 #if (defined BASEML || defined CODEML)
\r
8521 #if (defined CODEML)
\r
8523 int GetMemPUVR(int nc, int nUVR)
\r
8525 /* this gets mem for nUVR sets of matrices
\r
8529 PMat=(double*)malloc((nc*nc+nUVR*nc*nc*2+nUVR*nc)*sizeof(double));
\r
8530 if(PMat==NULL) error2("oom getting P&U&V&Root");
\r
8531 U=_UU[0]=PMat+nc*nc; V=_VV[0]=_UU[0]+nc*nc; Root=_Root[0]=_VV[0]+nc*nc;
\r
8532 for(i=1; i<nUVR; i++) {
\r
8533 _UU[i]=_UU[i-1]+nc*nc*2+nc; _VV[i]=_VV[i-1]+nc*nc*2+nc;
\r
8534 _Root[i]=_Root[i-1]+nc*nc*2+nc;
\r
8539 void FreeMemPUVR(void)
\r
8545 int GetUVRoot_codeml (void)
\r
8547 /* This uses data.daafile[] to set up the eigen matrices U, V, Root for
\r
8548 combined clock analyses of multiple protein data sets (clock = 5 or 6).
\r
8550 int locus, nc=(com.seqtype==1?64:20), nUVR=data.ngene;
\r
8553 if(com.seqtype==1 && (!com.fix_kappa || !com.fix_omega)) nUVR=1;
\r
8554 GetMemPUVR(nc, nUVR);
\r
8556 if(nUVR>6) error2("The maximum number of proteins is set to 6.");
\r
8557 if(com.seqtype==2) {
\r
8558 for(locus=0; locus<data.ngene; locus++) {
\r
8560 strcpy(com.daafile, data.daafile[locus]);
\r
8561 GetDaa(NULL, com.daa);
\r
8562 if(com.model==Empirical_F)
\r
8563 xtoy(data.pi[locus], com.pi, nc);
\r
8564 eigenQaa(NULL, _Root[locus], _UU[locus], _VV[locus], NULL);
\r
8567 else if(com.seqtype==1 && com.fix_kappa & com.fix_omega) {
\r
8568 for(locus=0; locus<data.ngene; locus++) {
\r
8569 if(com.seqtype==1) {
\r
8570 com.icode=data.icode[locus];
\r
8573 com.kappa=data.kappa[locus];
\r
8574 com.omega=data.omega[locus];
\r
8575 xtoy(data.pi[locus], com.pi, com.ncode);
\r
8576 eigenQcodon(1,-1,NULL,NULL,NULL, _Root[locus], _UU[locus], _VV[locus], &mr,
\r
8577 &com.kappa, com.omega, PMat);
\r
8587 int UseLocus (int locus, int copycondP, int setmodel, int setSeqName)
\r
8589 /* This points nodes to the gene tree at locus, gnodes[locus], and sets com.z[]
\r
8590 etc. for likelihood calculation for the locus.
\r
8595 com.ns=data.ns[locus]; com.ls=data.ls[locus];
\r
8596 tree.root=data.root[locus];
\r
8597 tree.nnode=2*com.ns-1; /* assumes binary tree */
\r
8598 tree.nbranch=tree.nnode-1;
\r
8600 nodes=gnodes[locus];
\r
8602 com.cleandata=data.cleandata[locus];
\r
8603 com.npatt=com.posG[1]=data.npatt[locus]; com.posG[0]=0;
\r
8604 com.fpatt=data.fpatt[locus];
\r
8605 for(i=0; i<com.ns; i++) com.z[i] = data.z[locus][i];
\r
8607 /* The following is model-dependent */
\r
8610 com.kappa=data.kappa[locus];
\r
8611 com.omega=data.omega[locus];
\r
8612 com.alpha=data.alpha[locus];
\r
8614 #if(defined CODEML)
\r
8615 if(com.seqtype==1) {
\r
8616 com.icode=data.icode[locus];
\r
8621 #if(defined BASEML)
\r
8622 if(com.seqtype==0 && com.model!=0 && com.model!=1)
\r
8623 xtoy(data.pi[locus], com.pi, com.ncode);
\r
8624 if(com.model<=TN93)
\r
8625 eigenTN93(com.model, com.kappa, com.kappa, com.pi, &nR, Root, Cijk);
\r
8626 else if (com.model==REV)
\r
8627 eigenQREVbase (NULL, PMat, &com.kappa, com.pi, &nR, Root, Cijk);
\r
8629 if((com.seqtype==1 && com.codonf) || (com.seqtype==2 && com.model==3))
\r
8630 xtoy(data.pi[locus], com.pi, com.ncode);
\r
8632 if((com.seqtype==2 && (com.model==2 || com.model==3))
\r
8633 || (com.seqtype==1 && com.fix_kappa && com.fix_omega)) {
\r
8634 Root=_Root[locus]; U=_UU[locus]; V=_VV[locus];
\r
8637 eigenQcodon(1,-1,NULL,NULL,NULL,Root,U,V, &mr, &com.kappa, com.omega,PMat);
\r
8642 DiscreteGamma (com.freqK,com.rK,com.alpha,com.alpha,com.ncatG,DGammaUseMedian);
\r
8644 com.NnodeScale = data.NnodeScale[locus];
\r
8645 com.nodeScale = data.nodeScale[locus];
\r
8646 nS = com.NnodeScale*com.npatt * (com.conPSiteClass ? com.ncatG : 1);
\r
8647 for(i=0; i<nS; i++) com.nodeScaleF[i] = 0;
\r
8650 for(i=0; i<com.ns; i++)
\r
8651 com.spname[i] = sptree.nodes[nodes[i].ipop].name;
\r
8656 void GetMemBC (void)
\r
8658 /* This gets memory for baseml and codeml under local clock models for analysis
\r
8659 of combined data from multiple loci.
\r
8660 com.conP[] is shared across loci.
\r
8661 fhK[] uses shared space for loci.
\r
8663 int j, locus, nc = (com.seqtype==1?64:com.ncode);
\r
8664 size_t maxsizeScale=0, nS, sfhK=0, s1, snode;
\r
8667 for(locus=0,com.sconP=0; locus<data.ngene; locus++) {
\r
8668 snode = nc*data.npatt[locus];
\r
8669 s1 = snode*(data.ns[locus]-1)*sizeof(double);
\r
8670 if(com.alpha) { /* this is for step 1, using method = 1 */
\r
8671 com.conPSiteClass = 1;
\r
8674 if(s1>com.sconP) com.sconP = s1;
\r
8675 if(com.alpha && (size_t)data.npatt[locus]>sfhK)
\r
8676 sfhK = data.npatt[locus];
\r
8679 com.conP = (double*)malloc(com.sconP);
\r
8680 printf("\n%5lu bytes for conP\n", com.sconP);
\r
8681 if(com.conP==NULL)
\r
8682 error2("oom conP");
\r
8684 sfhK *= com.ncatG*sizeof(double);
\r
8685 if((com.fhK=(double*)realloc(com.fhK,sfhK))==NULL) error2("oom");
\r
8688 /* set gnodes[locus][].conP for internal nodes */
\r
8689 for(locus=0; locus<data.ngene; locus++) {
\r
8690 snode = nc*data.npatt[locus];
\r
8691 for(j=data.ns[locus]; j<data.ns[locus]*2-1; j++)
\r
8692 gnodes[locus][j].conP = com.conP + (j-data.ns[locus])*snode;
\r
8694 for(locus=0; locus<data.ngene; locus++) {
\r
8695 if(!data.cleandata[locus]) {
\r
8696 UseLocus(locus, -1, 0, 0);
\r
8700 if(sptree.nspecies>20) {
\r
8701 for(locus=0; locus<data.ngene; locus++) {
\r
8702 UseLocus(locus, -1, 0, 0);
\r
8703 com.NnodeScale = 0;
\r
8704 com.nodeScale = data.nodeScale[locus]=(char*)malloc(tree.nnode*sizeof(char));
\r
8705 if(com.nodeScale==NULL) error2("oom");
\r
8706 for(j=0; j<tree.nnode; j++) com.nodeScale[j] = 0;
\r
8708 SetNodeScale(tree.root);
\r
8710 data.NnodeScale[locus] = com.NnodeScale;
\r
8711 nS = com.NnodeScale*com.npatt;
\r
8712 if(com.conPSiteClass) nS *= com.ncatG;
\r
8713 maxsizeScale = max2(maxsizeScale, nS);
\r
8715 if(com.NnodeScale) {
\r
8716 printf("\n%d node(s) used for scaling at locus %d: \n",com.NnodeScale,locus+1);
\r
8717 FOR(j,tree.nnode) if(com.nodeScale[j]) printf(" %2d",j+1);
\r
8721 if(maxsizeScale) {
\r
8722 if((com.nodeScaleF=(double*)malloc(maxsizeScale*sizeof(double)))==NULL)
\r
8723 error2("oom nscale");
\r
8724 for(j=0; j<(int)maxsizeScale; j++) com.nodeScaleF[j] = 0;
\r
8730 void FreeMemBC (void)
\r
8734 for(locus=0; locus<data.ngene; locus++)
\r
8735 free(gnodes[locus]);
\r
8738 for(locus=0; locus<data.ngene; locus++) {
\r
8739 free(data.fpatt[locus]);
\r
8740 for(j=0;j<data.ns[locus]; j++)
\r
8741 free(data.z[locus][j]);
\r
8746 if(sptree.nspecies>20) {
\r
8747 for(locus=0; locus<data.ngene; locus++)
\r
8748 free(data.nodeScale[locus]);
\r
8749 if(com.nodeScaleF) free(com.nodeScaleF);
\r
8756 double nu_AHRS=0.001, *varb_AHRS;
\r
8759 double funSS_AHRS(double x[], int np);
\r
8762 double lnLfunHeteroData (double x[], int np)
\r
8764 /* This calculates the log likelihood, the log of the probability of the data
\r
8765 given gtree[] for each locus. This is for step 3 of Yang (2004. Acta
\r
8766 Zoologica Sinica 50:645-656)
\r
8767 x[0,1,...s-k] has node ages in the species tree, followed by branch rates
\r
8768 for genes 1, 2, ..., then kappa for genes, then alpha for genes
\r
8771 double lnL=0, lnLt, *pbrate;
\r
8773 /* ??? need more work for codon sequences */
\r
8774 for(locus=0,k=com.ntime-1; locus<data.ngene; locus++)
\r
8775 k+=data.nbrate[locus];
\r
8776 if(!com.fix_kappa) FOR(locus,data.ngene) data.kappa[locus]=x[k++];
\r
8777 if(!com.fix_omega) FOR(locus,data.ngene) data.omega[locus]=x[k++];
\r
8778 if(!com.fix_alpha) FOR(locus,data.ngene) data.alpha[locus]=x[k++];
\r
8780 /* update node ages in species tree */
\r
8783 FOR(i,tree.nnode) sptree.nodes[i].age=nodes[i].age;
\r
8785 for(locus=0,pbrate=x+com.ntime-1; locus<data.ngene; locus++) {
\r
8787 UseLocus(locus, -1, 1, 1);
\r
8788 /* copy node ages to gene tree */
\r
8789 FOR(i,tree.nnode) nodes[i].age=sptree.nodes[nodes[i].ipop].age;
\r
8790 FOR(i,tree.nnode) {
\r
8791 if(i!=tree.root) {
\r
8792 nodes[i].branch = (nodes[nodes[i].father].age-nodes[i].age)
\r
8793 * pbrate[(int)nodes[i].label];
\r
8794 if(nodes[i].branch<-1e-4)
\r
8798 lnL += lnLt = com.plfun(x, -1);
\r
8799 pbrate += data.nbrate[locus];
\r
8805 double funSS_AHRS (double x[], int np)
\r
8807 /* Function to be minimized in the ad hoc rate smoothing procedure:
\r
8809 nodes[].label has node rate.
\r
8810 lnLb is weighted sum of squares using approximate variances for branch lengths.
\r
8812 lnLr is the log of the prior of rates under the geometric Brownian motion
\r
8813 model of rate evolution. There is no need for recursion as the order in
\r
8814 which sptree.nodes are visited is unimportant. The rates are stored in
\r
8816 The root rate is fixed to be the weighted average rate of its two sons,
\r
8817 inversely weighted by the divergence times.
\r
8819 int locus, j,k, root, pa, son0, son1;
\r
8820 double lnLb, lnLr, lnLbi, lnLri; /* lnLb & lnLr are sum of squares for b and r */
\r
8821 double b,be,t, t0,t1, r,rA, w,y, small=1e-20, smallage=AgeLow[sptree.root]*small;
\r
8822 double nu = nu_AHRS, *varb=varb_AHRS;
\r
8824 /* set up node ages in species tree */
\r
8827 for(j=0; j<tree.nnode; j++)
\r
8828 sptree.nodes[j].age = nodes[j].age;
\r
8831 for(locus=0,lnLb=lnLr=0; locus<data.ngene; varb+=com.ns*2-1,locus++) {
\r
8832 UseLocus(locus, -1, 0, 0);
\r
8833 if(data.fix_nu==2) nu = x[np-1];
\r
8834 else if(data.fix_nu==3) nu = x[np-1-(data.ngene-1-locus)];
\r
8837 son0 = nodes[root].sons[0];
\r
8838 son1 = nodes[root].sons[1];
\r
8839 /* copy node ages and rates into gene tree nodes[]. */
\r
8840 for(j=0; j<tree.nnode; j++) { /* age and rates */
\r
8841 nodes[j].age=sptree.nodes[nodes[j].ipop].age;
\r
8843 nodes[j].label = x[k++];
\r
8845 t0 = nodes[root].age-nodes[son0].age;
\r
8846 t1 = nodes[root].age-nodes[son1].age;
\r
8848 error2("small root branch. Think about what to do.");
\r
8849 nodes[root].label = (nodes[son0].label*t1+nodes[son1].label*t0)/(t0+t1);
\r
8851 for(j=0,lnLbi=0; j<tree.nnode; j++) {
\r
8852 if(j==son0 || j==son1) continue;
\r
8853 pa = nodes[j].father;
\r
8855 b = nodes[son0].branch+nodes[son1].branch;
\r
8856 be = (nodes[j].age-nodes[son0].age) * (nodes[root].label+nodes[son0].label)/2
\r
8857 + (nodes[j].age-nodes[son1].age) * (nodes[root].label+nodes[son1].label)/2;
\r
8860 b = nodes[j].branch;
\r
8861 be = (nodes[pa].age-nodes[j].age) * (nodes[pa].label+nodes[j].label)/2;
\r
8865 puts("small variance");
\r
8866 lnLbi -= square(be-b)/(2*w);
\r
8869 for(j=0,lnLri=0; j<tree.nnode; j++) {
\r
8870 if(j==root) continue;
\r
8871 pa = nodes[j].father;
\r
8872 t = nodes[pa].age - nodes[j].age;
\r
8873 t = max2(t,smallage);
\r
8874 r = nodes[j].label;
\r
8875 rA= nodes[pa].label;
\r
8877 if(rA<small || t<small || r<small) puts("small r, rA, or t");
\r
8878 y = log(r/rA)+t*nu/2;
\r
8879 lnLri -= y*y/(2*t*nu) - log(r) - log(2*Pi*t*nu)/2;
\r
8882 if(data.fix_nu>1) lnLri += -nu/nu_AHRS-log(nu); /* exponential prior */
\r
8886 return (lnLb + lnLr);
\r
8890 void SetBranchRates(int inode)
\r
8892 /* this uses node rates to set branch rates, and is used only after the ad hoc
\r
8893 rate smoothing iteration is finished.
\r
8897 nodes[inode].label = (nodes[inode].label + nodes[nodes[inode].father].label)/2;
\r
8899 for(i=0; i<nodes[inode].nson; i++)
\r
8900 SetBranchRates(nodes[inode].sons[i]);
\r
8904 int GetInitialsClock6Step1 (double x[], double xb[][2])
\r
8906 /* This is for clock 6 step 1.
\r
8909 double tb[]={.0001, 999};
\r
8911 com.ntime=k=tree.nbranch;
\r
8912 GetInitialsTimes (x);
\r
8914 com.plfun = (com.alpha==0 ? lfun : lfundG);
\r
8915 com.conPSiteClass = (com.method && com.plfun==lfundG);
\r
8917 /* InitializeNodeScale(); */
\r
8919 if(com.seqtype==0) com.nrate = !com.fix_kappa;
\r
8921 com.np=com.ntime+!com.fix_kappa+!com.fix_alpha;
\r
8922 if(com.seqtype==1 && !com.fix_omega) com.np++;
\r
8924 if(!com.fix_kappa) x[k++]=com.kappa;
\r
8925 if(!com.fix_omega) x[k++]=com.omega;
\r
8926 if(!com.fix_alpha) x[k++]=com.alpha;
\r
8929 for(i=0; i<com.ntime; i++)
\r
8930 { xb[i][0]=tb[0]; xb[i][1]=tb[1]; }
\r
8931 for( ; i<com.np; i++)
\r
8932 { xb[i][0]=.001; xb[i][1]=999; }
\r
8934 if(noisy>3 && com.np<200) {
\r
8935 printf("\nInitials (np=%d)\n", com.np);
\r
8936 for(i=0; i<com.np; i++) printf(" %10.5f", x[i]); FPN(F0);
\r
8937 for(i=0; i<com.np; i++) printf(" %10.5f", xb[i][0]); FPN(F0);
\r
8938 for(i=0; i<com.np; i++) printf(" %10.5f", xb[i][1]); FPN(F0);
\r
8945 int GetInitialsClock56Step3 (double x[])
\r
8947 /* This is for clock 5 or clock 6 step 3
\r
8949 int i, j,k=0, naa=20;
\r
8952 GetInitialsTimes (x);
\r
8954 com.plfun = (com.alpha==0 ? lfun : lfundG);
\r
8955 com.conPSiteClass = (com.method && com.plfun==lfundG);
\r
8957 /* InitializeNodeScale(); */
\r
8959 com.np = com.ntime-1 + (1+!com.fix_kappa+!com.fix_omega+!com.fix_alpha)*data.ngene;
\r
8961 for(i=com.ntime-1;i<com.np;i++) x[i]=.2+rndu();
\r
8962 else if(com.clock==6) {
\r
8963 for(j=0,k=com.ntime-1; j<data.ngene; k+=data.nbrate[j],j++)
\r
8964 com.np += data.nbrate[j]-1;
\r
8965 if(!com.fix_kappa)
\r
8966 for(j=0; j<data.ngene; j++) x[k++]=data.kappa[j];
\r
8967 if(!com.fix_omega)
\r
8968 for(j=0; j<data.ngene; j++) x[k++]=data.omega[j];
\r
8969 if(!com.fix_alpha)
\r
8970 for(j=0; j<data.ngene; j++) x[k++]=data.alpha[j];
\r
8971 for(i=k;i<com.np;i++) x[i]=(.5+rndu())/2;
\r
8977 double GetMeanRate (void)
\r
8979 /* This gets the rough average rate for the locus
\r
8981 int inode, i,j,k, ipop, nleft,nright,marks[NS], sons[2], nfossil;
\r
8985 for(inode=com.ns; inode<tree.nnode; inode++) {
\r
8986 ipop = nodes[inode].ipop;
\r
8987 if(sptree.nodes[ipop].fossil == 0) continue;
\r
8988 sons[0] = nodes[inode].sons[0];
\r
8989 sons[1] = nodes[inode].sons[1];
\r
8990 for(i=0,nleft=nright=0; i<com.ns; i++) {
\r
8991 for(j=i,marks[i]=0; j!=tree.root; j=nodes[j].father) {
\r
8992 if(j==sons[0]) { marks[i]=1; nleft++; break; }
\r
8993 else if (j==sons[1]) { marks[i]=2; nright++; break; }
\r
8996 if(nleft==0 || nright==0) {
\r
8997 puts("this calibration is not in gene tree.");
\r
9002 for(i=0,md=0; i<com.ns; i++) {
\r
9003 for(j=0; j<com.ns; j++) {
\r
9004 if(marks[i]==1 && marks[j]==2) {
\r
9005 for(k=i; k!=inode; k=nodes[k].father)
\r
9006 md+=nodes[k].branch;
\r
9007 for(k=j; k!=inode; k=nodes[k].father)
\r
9008 md+=nodes[k].branch;
\r
9012 md /= (nleft*nright);
\r
9013 mr += md/(sptree.nodes[ipop].age*2);
\r
9016 printf("node age & mr n%-4d %9.5f%9.5f ", inode, sptree.nodes[ipop].age, md);
\r
9017 if(com.ns<100) FOR(i,com.ns) printf("%d",marks[i]);
\r
9023 { printf("need fossils for this locus\n"); exit(-1); }
\r
9029 int AdHocRateSmoothing (FILE*fout, double x[NS*3], double xb[NS*3][2], double space[])
\r
9031 /* ad hoc rate smoothing for likelihood estimation of divergence times.
\r
9032 Step 1: Use JC69 to estimate branch lengths under no-clock model.
\r
9033 Step 2: ad hoc rate smoothing, estimating one set of divergence times
\r
9034 and many sets of branch rates for loci. Rate at root is set to
\r
9035 weighted average of rate at the two sons.
\r
9037 int model0=com.model, ntime0=com.ntime; /* is this useful? */
\r
9038 int fix_kappa0=com.fix_kappa, fix_omega0=com.fix_omega, fix_alpha0=com.fix_alpha;
\r
9039 int ib, son0, son1;
\r
9040 double kappa0=com.kappa, omega0=com.omega, alpha0=com.alpha, t0,t1, *varb;
\r
9041 double f, e=1e-8, pb=0.00001, rb[]={0.001,99}, lnL,lnLsum=0;
\r
9042 double mbrate[20], Rj[20], r,minr,maxr, beta, *pnu=&nu_AHRS,nu, mr[NGENE];
\r
9043 int i,j,k,k0, locus, nbrate[20],maxnbrate=20;
\r
9045 FILE *fBV = gfopen("in.BV","w");
\r
9046 FILE *fdist = gfopen("RateDist.txt","w");
\r
9047 FILE *finStep1 = fopen("in.ClockStep1","r"),
\r
9048 *finStep2 = fopen("in.ClockStep2","r");
\r
9051 for(locus=0,k=0; locus<data.ngene; locus++)
\r
9052 k += 2*data.ns[locus]-1;
\r
9053 if((varb_AHRS=(double*)malloc(k*sizeof(double)))==NULL)
\r
9054 error2("oom AHRS");
\r
9055 for(i=0; i<k;i++) varb_AHRS[i]=-1;
\r
9058 /* Step 1: Estimate branch lengths without clock. */
\r
9059 printf("\nStep 1: Estimate branch lengths under no clock.\n");
\r
9060 fprintf(fout,"\n\nStep 1: Estimate branch lengths under no clock.\n");
\r
9061 com.clock=0; com.method=1;
\r
9063 com.model=0; com.fix_kappa=1; com.kappa=1;
\r
9064 com.fix_alpha=1; com.alpha=0;
\r
9066 for(locus=0; locus<data.ngene; locus++) {
\r
9067 if(!com.fix_kappa) data.kappa[locus]=com.kappa;
\r
9068 if(!com.fix_omega) data.omega[locus]=com.omega;
\r
9069 if(!com.fix_alpha) data.alpha[locus]=com.alpha;
\r
9071 for(locus=0,varb=varb_AHRS; locus<data.ngene; varb+=com.ns*2-1,locus++) {
\r
9072 UseLocus(locus, -1, 1, 1);
\r
9074 fprintf(fout,"\nLocus %d (%d sequences)\n", locus+1, com.ns);
\r
9076 son0 = nodes[tree.root].sons[0];
\r
9077 son1 = nodes[tree.root].sons[1];
\r
9079 GetInitialsClock6Step1 (x, xb);
\r
9082 if(com.ns>30) fprintf(frub, "\n\nLocus %d\n", locus+1);
\r
9084 puts("read MLEs from step 1 from file");
\r
9085 for(i=0; i<com.np; i++)
\r
9086 fscanf(finStep1,"%lf",&x[i]);
\r
9089 j = minB((com.ns>30?frub:NULL), &lnL, x, xb, e, space);
\r
9090 for(j=0; j<com.ns*2-1; j++) {
\r
9091 ib = nodes[j].ibranch;
\r
9092 if(j!=tree.root) varb[j] = (x[ib]>1e-8 ? -1/varb_minbranches[ib] : 999);
\r
9095 matout(F0, x, 1, com.ntime);
\r
9096 matout2(F0, varb, 1, tree.nnode, 10, 7);
\r
9102 if(!com.fix_kappa) data.kappa[locus] = x[com.ntime];
\r
9103 if(!com.fix_omega) data.omega[locus] = x[com.ntime + !com.fix_kappa];
\r
9104 if(!com.fix_alpha) data.alpha[locus] = x[com.ntime + !com.fix_kappa + !com.fix_omega];
\r
9108 t0 = nodes[son0].branch;
\r
9109 t1 = nodes[son1].branch;
\r
9110 varb[tree.root] = varb[t0>t1?son0:son1];
\r
9111 nodes[son0].branch = nodes[son1].branch = (t0+t1)/2; /* arbitrary */
\r
9112 mr[locus] = GetMeanRate();
\r
9114 printf(" Locus %d: %d sequences, %d blengths, lnL = %15.6f mr=%.5f%10s\n",
\r
9115 locus+1, com.ns, com.np-1,-lnL,mr[locus], printtime(timestr));
\r
9116 fprintf(fout,"\nlnL = %.6f\n\n", -lnL);
\r
9117 OutTreeB(fout); FPN(fout);
\r
9118 for(i=0; i<com.np; i++) fprintf(fout," %8.5f",x[i]); FPN(fout);
\r
9119 for(i=0; i<tree.nbranch; i++) fprintf(fout," %8.5f", sqrt(varb[tree.branches[i][1]])); FPN(fout);
\r
9120 FPN(fout); OutTreeN(fout,1,1); FPN(fout); fflush(fout);
\r
9122 fprintf(fBV, "\n\nLocus %d: %d sequences, %d+1 branches\nlnL = %15.6f\n\n",
\r
9123 locus+1, com.ns, tree.nbranch-1, -lnL);
\r
9124 OutTreeB(fBV); FPN(fBV);
\r
9125 for(i=0; i<tree.nbranch; i++) fprintf(fBV," %12.9f",x[i]); FPN(fBV);
\r
9126 for(i=0; i<tree.nbranch; i++) fprintf(fBV," %12.9f", sqrt(varb[tree.branches[i][1]])); FPN(fBV);
\r
9127 FPN(fBV); OutTreeN(fBV,1,1); FPN(fBV); fflush(fBV);
\r
9130 if(data.ngene>1) fprintf(fout,"\nSum of lnL over loci = %15.6f\n", -lnLsum);
\r
9132 /* Step 2: ad hoc rate smoothing to estimate branch rates. */
\r
9133 printf("\nStep 2: Ad hoc rate smoothing to estimate branch rates.\n");
\r
9134 fprintf(fout, "\n\nStep 2: Ad hoc rate smoothing to estimate branch rates.\n");
\r
9135 /* s - 1 - NFossils node ages, (2*s_i - 2) rates for branches at each locus */
\r
9138 GetInitialsTimes (x);
\r
9140 for(locus=0,com.np=com.ntime-1; locus<data.ngene; locus++)
\r
9141 com.np += data.ns[locus]*2-2;
\r
9142 if(data.fix_nu==2) com.np++;
\r
9143 if(data.fix_nu==3) com.np+=data.ngene;
\r
9145 if(com.np>NS*6) error2("change NP for ad hoc rate smoothing.");
\r
9146 for(i=0; i<com.ntime-1; i++)
\r
9147 { xb[i][0]=pb; xb[i][1]=1-pb; }
\r
9148 if(!nodes[tree.root].fossil)
\r
9149 { xb[0][0]=AgeLow[tree.root]*1.0001; xb[0][1]=max2(AgeLow[tree.root]*10,50); }
\r
9150 for( ; i<com.np; i++) { /* for rates */
\r
9151 xb[i][0]=rb[0]; xb[i][1]=rb[1];
\r
9153 for(locus=0,i=com.ntime-1; locus<data.ngene; locus++)
\r
9154 for(j=0; j<data.ns[locus]*2-2; j++)
\r
9155 x[i++]=mr[locus]*(.8+.4*rndu());
\r
9156 for( ; i<com.np; i++) /* nu */
\r
9157 x[i]=0.001+0.1*rndu();
\r
9160 for(i=0; i<com.np; i++)
\r
9161 { printf(" %10.5f", x[i]); if(i==com.ntime-2) FPN(F0); } FPN(F0);
\r
9163 for(i=0; i<com.np; i++) printf(" %10.5f", xb[i][0]); FPN(F0);
\r
9164 for(i=0; i<com.np; i++) printf(" %10.5f", xb[i][1]); FPN(F0);
\r
9168 if(data.fix_nu>1)
\r
9169 pnu = x+com.np-(data.fix_nu==2 ? 1 : data.ngene);
\r
9170 printf(" %d times, %d rates, %d parameters, ", com.ntime-1,k,com.np);
\r
9173 f = funSS_AHRS(x, com.np);
\r
9174 if(noisy>2) printf("\nf0 = %12.6f\n",f );
\r
9177 puts("read MLEs from step 2 from file");
\r
9178 for(i=0; i<com.np; i++) fscanf(finStep2,"%lf",&x[i]);
\r
9179 matout(F0,x,1,com.np);
\r
9182 j = ming2(frub, &f, funSS_AHRS, NULL, x, xb, space, 1e-9, com.np);
\r
9184 /* generate output to in.clockStep2
\r
9185 matout(fout,x,1,com.np);
\r
9189 { puts("\nad hoc rate smoothing iteration may not have converged.\nEnter to continue; Ctrl-C to break.");
\r
9194 fputs("\nEstimated divergence times from ad hoc rate smoothing\n\n",fout);
\r
9196 FOR(i,tree.nnode) nodes[i].branch*=100;
\r
9197 for(i=com.ns; i<tree.nnode; i++)
\r
9198 fprintf(fout, "Node %2d Time %9.5f\n", i+1, nodes[i].age*100);
\r
9199 FPN(fout); OutTreeN(fout,1,1); FPN(fout);
\r
9201 fprintf(fout, "\nEstimated rates from ad hoc rate smoothing\n");
\r
9202 for(locus=0,k=k0=com.ntime-1; locus<data.ngene; k0+=data.nbrate[locus++]) {
\r
9204 UseLocus(locus, -1, 0, 1);
\r
9205 for(i=0; i<tree.nnode; i++)
\r
9206 if(i!=tree.root) nodes[i].label=x[k++];
\r
9207 son0=nodes[tree.root].sons[0]; son1=nodes[tree.root].sons[1];
\r
9208 t0=nodes[tree.root].age-nodes[son0].age;
\r
9209 t1=nodes[tree.root].age-nodes[son1].age;
\r
9210 nodes[tree.root].label = (nodes[son0].label*t1+nodes[son1].label*t0)/(t0+t1);
\r
9211 SetBranchRates(tree.root); /* node rates -> branch rates */
\r
9213 nu = (data.fix_nu==3 ? *(pnu+locus) : *pnu);
\r
9214 fprintf(fout,"\nLocus %d (%d sequences)\n\n", locus+1, com.ns);
\r
9215 fprintf(fout,"nu = %.6g\n", nu);
\r
9217 /* this block can be deleted? */
\r
9218 fprintf(fout, "\nnode \tage \tlength \trate\n");
\r
9219 for(i=0; i<tree.nnode; i++,FPN(fout)) {
\r
9220 fprintf(fout, "%02d\t%.3f", i+1,nodes[i].age);
\r
9222 fprintf(fout, "\t%.5f\t%.5f", nodes[i].branch,nodes[i].label);
\r
9225 fprintf(fout,"\nRates as labels in tree:\n");
\r
9226 OutTreeN(fout,1,PrLabel); FPN(fout); fflush(fout);
\r
9228 if(data.nbrate[locus]>maxnbrate) error2("too many rate classes? Change source.");
\r
9229 for(i=0,minr=1e6,maxr=0; i<tree.nnode; i++)
\r
9230 if(i!=tree.root) {
\r
9233 puts("node label<0?");
\r
9234 minr = min2(minr,r);
\r
9235 maxr = max2(maxr,r);
\r
9238 fprintf(fdist, "\n%6d\n", tree.nnode-1);
\r
9239 for(i=0; i<tree.nnode; i++) {
\r
9240 if(i==tree.root) continue;
\r
9241 fprintf(fdist, "R%-10.7f ", nodes[i].label);
\r
9242 for(j=0; j<i; j++)
\r
9244 fprintf(fdist, " %9.6f", fabs(nodes[i].label-nodes[j].label));
\r
9249 for(j=0; j<data.nbrate[locus]; j++)
\r
9250 Rj[j]=minr+(j+1)*(maxr-minr)/data.nbrate[locus];
\r
9252 beta = pow(1/(data.nbrate[locus]+1.), 1/(data.nbrate[locus]-1.));
\r
9253 beta = 0.25+0.25*log((double)data.nbrate[locus]);
\r
9254 if(beta>1) beta=0.99;
\r
9255 for(j=0; j<data.nbrate[locus]; j++)
\r
9256 Rj[j]=minr+(maxr-minr)*pow(beta, data.nbrate[locus]-1.-j);
\r
9258 printf("\nLocus %d: nu = %.6f, rate range (%.6f, %.6f)\n", locus+1,nu,minr,maxr);
\r
9259 printf("Cutting points:\n");
\r
9260 for(j=0; j<data.nbrate[locus]; j++)
\r
9261 printf(" < %.6f, ", Rj[j]);
\r
9262 printf("\nThe number of rate groups (0 for no change)? ");
\r
9263 /* scanf("%d", &j); */
\r
9266 data.nbrate[locus]=j;
\r
9267 printf("input %d cutting points? ", data.nbrate[locus]-1);
\r
9268 for(j=0,Rj[data.nbrate[locus]-1]=maxr; j<data.nbrate[locus]-1; j++)
\r
9269 scanf("%lf", &Rj[j]);
\r
9272 for(i=0;i<data.nbrate[locus];i++) { mbrate[i]=0; nbrate[i]=0; }
\r
9273 for(i=0; i<tree.nnode; i++) {
\r
9274 if(i==tree.root) continue;
\r
9276 for(j=0; j<data.nbrate[locus]-1; j++)
\r
9277 if(r<Rj[j]) break;
\r
9280 nodes[i].label = j;
\r
9282 nodes[tree.root].label=-1;
\r
9283 for(i=0;i<data.nbrate[locus];i++)
\r
9284 mbrate[i] = (nbrate[i]?mbrate[i]/nbrate[i]:-1);
\r
9286 fprintf(fout,"\nCollapsing rates into groups\nRate range: (%.6f, %.6f)\n", minr,maxr);
\r
9287 /* fprintf(fout,"\nCollapsing rates into groups\nbeta = %.6g Rate range: (%.6f, %.6f)\n", beta, minr,maxr);
\r
9289 for(j=0; j<data.nbrate[locus]; j++)
\r
9290 fprintf(fout,"rate group %d (%2d): <%9.6f, mean %9.6f\n",
\r
9291 j, nbrate[j], Rj[j], mbrate[j]);
\r
9293 FPN(fout); OutTreeN(fout,1,PrLabel); FPN(fout);
\r
9294 fprintf(fout, "\n\nRough rates for branch groups at locus %d\n", locus+1);
\r
9295 for(i=0; i<data.nbrate[locus]; i++)
\r
9296 x[k0+i] = mbrate[i];
\r
9299 printf("\n\n%d times, %d timerates from AHRS:\n", com.ntime-1,k0);
\r
9300 fprintf(fout,"\n\n%d times, %d timerates from AHRS\n", com.ntime-1,k0);
\r
9301 for(i=0; i<k0; i++) {
\r
9302 printf("%12.6f", x[i]);
\r
9303 if(i==com.ntime-2) FPN(F0);
\r
9304 fprintf(fout,"%12.6f", x[i]);
\r
9305 if(i==com.ntime-2) FPN(fout);
\r
9307 FPN(F0); FPN(fout);
\r
9309 for(i=0; i<k0; i++) x[i]*=0.9+0.2*rndu();
\r
9311 com.model=model0; com.clock=6;
\r
9314 com.fix_kappa=fix_kappa0; com.kappa=kappa0;
\r
9315 com.fix_omega=fix_omega0; com.omega=omega0;
\r
9316 com.fix_alpha=fix_alpha0; com.alpha=alpha0;
\r
9319 /* fix parameters: value > 0, precise value unimportant */
\r
9320 if(!fix_kappa0) { com.fix_kappa=1; com.kappa=0.1; }
\r
9321 if(!fix_omega0) { com.fix_omega=1; com.omega=0.1; }
\r
9322 if(!fix_alpha0) { com.fix_alpha=1; com.alpha=0.1; }
\r
9327 printf(" %10s\n", printtime(timestr));
\r
9329 if(finStep1) fclose(finStep1);
\r
9330 if(finStep2) fclose(finStep2);
\r
9336 void DatingHeteroData (FILE* fout)
\r
9338 /* This is for clock and local-clock dating using heterogeneous data from
\r
9339 multiple loci. Some species might be missing at some loci. Thus
\r
9340 gnodes[locus] stores the gene tree at locus. Branch lengths in the gene
\r
9341 tree are constructed using the divergence times in the master species tree,
\r
9342 and the rates for genes and branches.
\r
9344 com.clock = 5: global clock
\r
9348 int i,j,k, s, np, sconP0=0, locus;
\r
9349 double x[NS*6],xb[NS*6][2], lnL,e=1e-7, *var=NULL;
\r
9351 size_t maxnpML, maxnpADRS;
\r
9355 if(com.clock==6) {
\r
9356 printf("nu (1:fix; 2:estimate one for all genes; 3:estimate one for every gene)? ");
\r
9357 scanf("%d", &data.fix_nu);
\r
9358 if(data.fix_nu==1) scanf("%lf", &nu_AHRS);
\r
9361 ReadTreeSeqs(fout);
\r
9363 for(j=0; j<sptree.nnode; j++) {
\r
9364 sptree.nodes[j].pfossil[0] = sptree.nodes[j].pfossil[1] = -1;
\r
9366 for(j=sptree.nspecies, com.ntime=j-1, sptree.nfossil=0; j<sptree.nnode; j++) {
\r
9367 if(sptree.nodes[j].fossil) {
\r
9370 printf("node %2d age fixed at %.3f\n", j, sptree.nodes[j].age);
\r
9374 s = sptree.nspecies;
\r
9375 maxnpML = s-1 + (5+2)*data.ngene;
\r
9376 maxnpADRS = s-1 + (2*s-1)*data.ngene + 2*data.ngene;
\r
9377 com.sspace = max2(com.sspace, spaceming2(maxnpADRS));
\r
9378 com.sspace = max2(com.sspace, maxnpML*(maxnpML+1)*sizeof(double));
\r
9379 if((com.space = (double*)realloc(com.space,com.sspace))==NULL)
\r
9380 error2("oom space");
\r
9382 #if (defined CODEML)
\r
9383 GetUVRoot_codeml ();
\r
9385 if(com.clock==6) {
\r
9386 if(data.fix_nu<=1) {
\r
9387 printf("nu & nbrate? ");
\r
9388 scanf("%lf%d? ", &nu_AHRS, &nbrate);
\r
9390 for(locus=0; locus<data.ngene; locus++)
\r
9391 data.nbrate[locus] = nbrate;
\r
9392 AdHocRateSmoothing(fout, x, xb, com.space);
\r
9394 printf("\nStep 3: ML estimation of times and rates.");
\r
9395 fprintf(fout,"\n\nStep 3: ML estimation of times and rates.\n");
\r
9397 else { /* clock = 5, global clock */
\r
9398 for(locus=0; locus<data.ngene; locus++)
\r
9399 for(i=0,data.nbrate[locus]=1; i<data.ns[locus]*2-1; i++)
\r
9400 gnodes[locus][i].label=0;
\r
9406 GetInitialsClock56Step3(x);
\r
9409 SetxBound (com.np, xb);
\r
9410 lnL = lnLfunHeteroData(x,np);
\r
9413 printf("\nntime & nrate & np:%6d%6d%6d\n",com.ntime-1,com.nrate,com.np);
\r
9414 matout(F0,x,1,np);
\r
9415 printf("\nlnL0 = %12.6f\n",-lnL);
\r
9418 j = ming2(noisy>2?frub:NULL,&lnL,lnLfunHeteroData,NULL,x,xb, com.space,e,np);
\r
9420 if(noisy) printf("Out...\nlnL = %12.6f\n", -lnL);
\r
9423 for(i=0,j=!sptree.nodes[sptree.root].fossil; i<sptree.nnode; i++)
\r
9424 if(i!=sptree.root && sptree.nodes[i].nson && !sptree.nodes[i].fossil)
\r
9425 x[j++]=sptree.nodes[i].age; /* copy node ages into x[] */
\r
9428 if(np>100 || (com.seqtype && np>20)) puts("Calculating SE's");
\r
9430 Hessian (np,x,lnL,com.space,var,lnLfunHeteroData,var+np*np);
\r
9431 matinv(var,np,np,var+np*np);
\r
9435 fprintf(fout,"\n\nTree: "); OutTreeN(fout,0,0);
\r
9436 fprintf(fout,"\nlnL(ntime:%3d np:%3d):%14.6f\n", com.ntime-1,np,-lnL);
\r
9437 OutTreeB(fout); FPN (fout);
\r
9438 for(i=0;i<np;i++) fprintf(fout," %9.5f",x[i]); FPN(fout); fflush(fout);
\r
9441 fprintf(fout,"SEs for parameters:\n");
\r
9442 for(i=0;i<np;i++) fprintf(fout," %9.5f",(var[i*np+i]>0.?sqrt(var[i*np+i]):-1));
\r
9444 if (com.getSE==2) matout2(fout, var, np, np, 15, 10);
\r
9447 fprintf(fout,"\nTree with node ages for TreeView\n");
\r
9448 FOR(i,tree.nnode) nodes[i].branch*=100;
\r
9449 FPN(fout); OutTreeN(fout,1,1); FPN(fout);
\r
9450 FPN(fout); OutTreeN(fout,1,PrNodeNum); FPN(fout);
\r
9451 FPN(fout); OutTreeN(fout,1,PrLabel|PrAge); FPN(fout);
\r
9452 FPN(fout); OutTreeN(fout,1,0); FPN(fout);
\r
9453 OutputTimesRates(fout, x, var);
\r
9455 fprintf(fout,"\nSubstititon rates for genes (per time unit)\n");
\r
9456 for(j=0,k=com.ntime-1; j<data.ngene; j++,FPN(fout)) {
\r
9457 fprintf(fout," Gene %2d: ", j+1);
\r
9458 for(i=0; i<data.nbrate[j]; i++,k++) {
\r
9459 fprintf(fout,"%10.5f", x[k]);
\r
9460 if(com.getSE) fprintf(fout," +- %.5f", sqrt(var[k*np+k]));
\r
9462 if(com.clock==6) fprintf(fout," ");
\r
9464 if(!com.fix_kappa) {
\r
9465 fprintf(fout,"\nkappa for genes\n");
\r
9466 for(j=0; j<data.ngene; j++,k++) {
\r
9467 fprintf(fout,"%10.5f", data.kappa[j]);
\r
9468 if(com.getSE) fprintf(fout," +- %.5f", sqrt(var[k*np+k]));
\r
9471 if(!com.fix_omega) {
\r
9472 fprintf(fout,"\nomega for genes\n");
\r
9473 for(j=0; j<data.ngene; j++,k++) {
\r
9474 fprintf(fout,"%10.5f", data.omega[j]);
\r
9475 if(com.getSE) fprintf(fout," +- %.5f", sqrt(var[k*np+k]));
\r
9478 if(!com.fix_alpha) {
\r
9479 fprintf(fout,"\nalpha for genes\n");
\r
9480 for(j=0; j<data.ngene; j++,k++) {
\r
9481 fprintf(fout,"%10.5f", data.alpha[j]);
\r
9482 if(com.getSE) fprintf(fout," +- %.5f", sqrt(var[k*np+k]));
\r
9487 printf("\nTime used: %s\n", printtime(timestr));
\r