+/* TREESUB.c\r
+ subroutines that operates on trees, inserted into other programs \r
+ such as baseml, basemlg, codeml, and pamp.\r
+*/\r
+\r
+extern char BASEs[], *EquateBASE[], BASEs5[], *EquateBASE5[], AAs[], BINs[], CODONs[][4], nChara[], CharaMap[][64];\r
+\r
+extern int noisy;\r
+\r
+#ifdef BASEML\r
+#define REALSEQUENCE\r
+#define NODESTRUCTURE\r
+#define TREESEARCH\r
+#define LSDISTANCE\r
+#define LFUNCTIONS\r
+#define RECONSTRUCTION\r
+#define MINIMIZATION\r
+#endif\r
+\r
+#ifdef CODEML\r
+#define REALSEQUENCE\r
+#define NODESTRUCTURE\r
+#define TREESEARCH\r
+#define LSDISTANCE\r
+#define LFUNCTIONS\r
+#define RECONSTRUCTION\r
+#define MINIMIZATION\r
+#endif\r
+\r
+#ifdef BASEMLG\r
+#define REALSEQUENCE\r
+#define NODESTRUCTURE\r
+#define LSDISTANCE\r
+#endif\r
+\r
+#ifdef RECONSTRUCTION\r
+#define PARSIMONY\r
+#endif\r
+\r
+#ifdef MCMCTREE\r
+#define REALSEQUENCE\r
+#define NODESTRUCTURE\r
+#define LFUNCTIONS\r
+#endif\r
+\r
+#if(defined CODEML || defined YN00)\r
+double SS, NN, Sd, Nd; /* kostas, # of syn. sites,# of non syn. sites,# of syn. subst.,# of non syn. subst. */\r
+#endif\r
+\r
+\r
+\r
+#ifdef REALSEQUENCE\r
+\r
+int hasbase (char *str)\r
+{\r
+ char *p=str, *eqdel=".-?";\r
+ while (*p) \r
+ if (*p==eqdel[0] || *p==eqdel[1] || *p==eqdel[2] || isalpha(*p++)) \r
+ return(1);\r
+ return(0);\r
+}\r
+\r
+\r
+int GetSeqFileType(FILE *fseq, int *paupseq);\r
+int IdenticalSeqs(void);\r
+void RemoveEmptySequences(void);\r
+\r
+int GetSeqFileType(FILE *fseq, int *format)\r
+{\r
+/* paupstart="begin data" and paupend="matrix" identify nexus file format.\r
+ Modify if necessary.\r
+ format: 0: alignment; 1: fasta; 2: nexus.\r
+\r
+*/\r
+ int lline=1000, ch, aligned;\r
+ char fastastarter='>';\r
+ char line[1000], *paupstart="begin data",*paupend="matrix", *p;\r
+ char *ntax="ntax",*nchar="nchar";\r
+\r
+ while (isspace(ch=fgetc(fseq)))\r
+ ;\r
+ ungetc(ch, fseq);\r
+ if(ch == fastastarter) {\r
+ *format = 1;\r
+ ScanFastaFile(fseq, &com.ns, &com.ls, &aligned);\r
+ if(aligned)\r
+ return(0);\r
+ else \r
+ error2("The seq file appears to be in fasta format, but not aligned?");\r
+ }\r
+ if(fscanf(fseq,"%d%d", &com.ns, &com.ls)==2) {\r
+ *format = 0; return(0);\r
+ }\r
+ *format = 2;\r
+ printf("\nseq file is not paml/phylip format. Trying nexus format.");\r
+\r
+ for ( ; ; ) {\r
+ if(fgets(line,lline,fseq)==NULL) error2("seq err1: EOF");\r
+ strcase(line,0);\r
+ if(strstr(line,paupstart)) break;\r
+ }\r
+ for ( ; ; ) {\r
+ if(fgets(line,lline,fseq)==NULL) error2("seq err2: EOF");\r
+ strcase(line,0);\r
+ if((p=strstr(line,ntax))!=NULL) {\r
+ while (*p != '=') { if(*p==0) error2("seq err"); p++; }\r
+ sscanf(p+1,"%d", &com.ns);\r
+ if((p=strstr(line,nchar))==NULL) error2("expect nchar");\r
+ while (*p != '=') { if(*p==0) error2("expect ="); p++; }\r
+ sscanf(p+1,"%d", &com.ls);\r
+ break;\r
+ } \r
+ }\r
+ /* printf("\nns: %d\tls: %d\n", com.ns, com.ls); */\r
+ for ( ; ; ) {\r
+ if(fgets(line,lline,fseq)==NULL) error2("seq err1: EOF");\r
+ strcase(line,0);\r
+ if (strstr(line,paupend)) break;\r
+ }\r
+ return(0);\r
+}\r
+\r
+int PopupComment(FILE *fseq)\r
+{\r
+ int ch, comment1=']';\r
+ for( ; ; ) {\r
+ ch=fgetc(fseq);\r
+ if(ch==EOF) error2("expecting ]");\r
+ if(ch==comment1) break;\r
+ if(noisy) putchar(ch);\r
+ }\r
+ return(0);\r
+}\r
+\r
+\r
+#if(MCMCTREE)\r
+\r
+int ReadMorphology (FILE *fout, FILE *fin)\r
+{\r
+ int i,j, locus=data.nmorphloci;\r
+ char line[1024], str[64];\r
+\r
+ if((data.zmorph[locus][0] = (double*)malloc((com.ns*2-1)*com.ls*sizeof(double))) == NULL)\r
+ error2("oom zmorph");\r
+ if((data.Rmorph[locus] = (double*)malloc(com.ls*com.ls*sizeof(double))) == NULL)\r
+ error2("oom Rmorph");\r
+\r
+ if((data.nmorphloci = locus+1) > NMORPHLOCI) error2("raise NMORPHLOCI and recompile.");\r
+ for(i=1; i<com.ns*2-1; i++) {\r
+ data.zmorph[locus][i] = data.zmorph[locus][0] + i*com.ls;\r
+ }\r
+ for(i=0; i<com.ns; i++) {\r
+ fscanf(fin, "%s", com.spname[i]);\r
+ printf ("Reading data for species #%2d: %s \r", i+1, com.spname[i]);\r
+ for(j=0; j<com.ls; j++) \r
+ fscanf(fin, "%lf", &data.zmorph[locus][i][j]);\r
+ }\r
+\r
+ for(i=0; i<com.ns; i++) {\r
+ fprintf(fout, "%-10s ", com.spname[i]);\r
+ for(j=0; j<com.ls; j++)\r
+ fprintf(fout, " %8.5f", data.zmorph[locus][i][j]);\r
+ FPN(fout);\r
+ }\r
+\r
+#if(0)\r
+ fscanf(fin, "%s", str);\r
+ fgets(line, 1024, fin);\r
+ i = j = -1;\r
+ if(strstr("Correlation", str)) {\r
+ for(i=0; i<com.ls; i++) {\r
+ for(j=0; j<com.ls; j++) \r
+ if(fscanf(fin, "%lf", &data.Rmorph[locus][i*com.ls+j]) != 1) break;\r
+ if(j<com.ls) break;\r
+ }\r
+ }\r
+ if(i!=com.ls || j!=com.ls) {\r
+ printf("\ndid not find a good R matrix. Setting it to identity matrix I.\n");\r
+ for(i=0; i<com.ls; i++) \r
+ for(j=0; j<com.ls; j++) \r
+ data.Rmorph[locus][i*com.ls+j] = (i==j);\r
+ }\r
+#endif\r
+ return(0);\r
+}\r
+\r
+#endif\r
+\r
+int ReadSeq (FILE *fout, FILE *fseq, int cleandata, int locus)\r
+{\r
+/* read in sequence, translate into protein (CODON2AAseq), and \r
+ This counts ngene but does not initialize lgene[].\r
+ It also codes (transforms) the sequences.\r
+ com.seqtype: 0=nucleotides; 1=codons; 2:AAs; 3:CODON2AAs; 4:BINs\r
+ com.pose[] is used to store gene or site-partition labels.\r
+ ls/3 gene marks for codon sequences.\r
+ char opt_c[]="GIPM";\r
+ G:many genes; I:interlaved format; P:patterns; M:morphological characters\r
+\r
+ Use cleandata=1 to clean up ambiguities. In return, com.cleandata=1 if the \r
+ data are clean or are cleaned, and com.cleandata=0 is the data are unclean. \r
+*/\r
+ char *p,*p1, eq='.', comment0='[', *line;\r
+ int format=0; /* 0: paml/phylip, 1: fasta; 2: paup/nexus */\r
+ int i,j,k, ch, noptline=0, lspname=LSPNAME, miss=0, nb;\r
+ int lline=10000,lt[NS], igroup, Sequential=1, basecoding=0;\r
+ int n31=(com.seqtype==CODONseq||com.seqtype==CODON2AAseq?3:1);\r
+ int gap=(n31==3?3:10), nchar=(com.seqtype==AAseq?20:4);\r
+ int h,b[3]={0};\r
+ char *pch=((com.seqtype<=1||com.seqtype==CODON2AAseq) ? BASEs : (com.seqtype==2 ? AAs: (com.seqtype==5 ? BASEs5 : BINs)));\r
+ char str[4]=" ";\r
+ double lst;\r
+#if(MCMCTREE)\r
+ data.datatype[locus] = com.seqtype;\r
+#endif\r
+ str[0]=0; h=-1; b[0]=-1; /* avoid warning */\r
+ com.readpattern = 0;\r
+ if (com.seqtype==4) error2("seqtype==BINs, check with author");\r
+ if (noisy>=9 && (com.seqtype<=CODONseq||com.seqtype==CODON2AAseq)) {\r
+ puts("\n\nAmbiguity character definition table:\n");\r
+ for(i=0; i<(int)strlen(BASEs); i++) {\r
+ nb = strlen(EquateBASE[i]);\r
+ printf("%c (%d): ", BASEs[i], nb);\r
+ for(j=0; j<nb; j++) printf("%c ", EquateBASE[i][j]);\r
+ FPN(F0);\r
+ }\r
+ }\r
+ GetSeqFileType(fseq, &format);\r
+\r
+ if (com.ns>NS) error2("too many sequences.. raise NS?");\r
+ if (com.ls%n31!=0) {\r
+ printf ("\n%d nucleotides, not a multiple of 3!", com.ls); exit(-1);\r
+ }\r
+ if (noisy) printf ("\nns = %d \tls = %d\n", com.ns, com.ls);\r
+\r
+ for(j=0; j<com.ns; j++) {\r
+ if(com.spname[j]) free(com.spname[j]);\r
+ com.spname[j] = (char*)malloc((lspname+1)*sizeof(char));\r
+ for(i=0; i<lspname+1; i++) com.spname[j][i]=0;\r
+ if((com.z[j] = (unsigned char*)realloc(com.z[j],com.ls*sizeof(unsigned char))) == NULL)\r
+ error2("oom z");\r
+ }\r
+ com.rgene[0] = 1; com.ngene = 1; \r
+ lline = max2(lline, com.ls/n31*(n31+1)+lspname+50);\r
+ if((line=(char*)malloc(lline*sizeof(char))) == NULL) error2("oom line");\r
+\r
+ /* first line */\r
+ if (format == 0) {\r
+ if(!fgets(line,lline,fseq)) error2("ReadSeq: first line");\r
+ com.readpattern = (strchr(line, 'P') || strchr(line, 'p'));\r
+#if(MCMCTREE)\r
+ if(strchr(line, 'M') || strchr(line, 'm')) data.datatype[locus] = MORPHC;\r
+#endif\r
+ }\r
+#if(MCMCTREE)\r
+ if(data.datatype[locus] == MORPHC) { /* morhpological data */\r
+ ReadMorphology(fout, fseq);\r
+ return(0);\r
+ }\r
+ else\r
+#endif\r
+ if(!com.readpattern) {\r
+ if((com.pose=(int*)realloc(com.pose, com.ls/n31*sizeof(int)))==NULL)\r
+ error2("oom pose");\r
+ for(j=0; j<com.ls/n31; j++) com.pose[j]=0; /* gene #1, default */\r
+ }\r
+ else {\r
+ if(com.pose) free(com.pose); \r
+ com.pose = NULL;\r
+ }\r
+ if(format) goto readseq;\r
+\r
+ for (j=0; j<lline && line[j] && line[j]!='\n'; j++) {\r
+ if (!isalnum(line[j])) continue;\r
+ line[j]=(char)toupper(line[j]);\r
+ switch (line[j]) {\r
+ case 'G': noptline++; break;\r
+ case 'C': basecoding=1; break;\r
+ case 'S': Sequential=1; break;\r
+ case 'I': Sequential=0; break;\r
+ case 'P': break; /* already dealt with. */\r
+ default : \r
+ printf ("\nBad option '%c' in first line of seqfile\n", line[j]);\r
+ exit (-1);\r
+ }\r
+ }\r
+ if (strchr(line,'C')) { /* protein-coding DNA sequences */\r
+ if(com.seqtype==2) error2("option C?");\r
+ if(com.seqtype==0) {\r
+ if (com.ls%3!=0 || noptline<1) error2("option C?");\r
+ com.ngene=3; \r
+ for(i=0;i<3;i++) com.lgene[i]=com.ls/3;\r
+#if(defined(BASEML) || defined(BASEMLG))\r
+ com.coding=1;\r
+ if(com.readpattern) \r
+ error2("partterns for coding sequences (G C P) not implemented.");\r
+ else \r
+ for (i=0;i<com.ls;i++) com.pose[i]=(char)(i%3);\r
+ \r
+#endif\r
+ }\r
+ noptline--;\r
+ }\r
+\r
+ /* option lines */\r
+ for(j=0; j<noptline; j++) {\r
+ for(ch=0; ; ) {\r
+ ch = (char)fgetc(fseq);\r
+ if(ch == comment0) \r
+ PopupComment(fseq);\r
+ if(isalnum(ch)) break;\r
+ }\r
+\r
+ ch = (char)toupper(ch);\r
+ switch (ch) {\r
+ case ('G') :\r
+ if(basecoding) error2("Error in sequence data file: incorrect option format, use GC?\n");\r
+ if (fscanf(fseq,"%d",&com.ngene)!=1) error2("expecting #gene here..");\r
+ if (com.ngene>NGENE) error2("raise NGENE?");\r
+\r
+ fgets(line,lline,fseq);\r
+ if (!blankline(line)) { /* #sites in genes on the 2nd line */\r
+ for (i=0,p=line; i<com.ngene; i++) {\r
+ while (*p && !isalnum(*p)) p++;\r
+ if (sscanf(p,"%d",&com.lgene[i])!=1) break;\r
+ while (*p && isalnum(*p)) p++;\r
+ }\r
+ /* if ngene is large and some lgene is on the next line */\r
+ for (; i<com.ngene; i++)\r
+ if (fscanf(fseq,"%d", &com.lgene[i])!=1) error2("EOF at lgene");\r
+\r
+ for(i=0,k=0; i<com.ngene; i++) \r
+ k += com.lgene[i];\r
+ if(k!=com.ls/n31) {\r
+ matIout(F0, com.lgene, 1, com.ngene);\r
+ printf("\n%6d != %d", com.ls/n31, k);\r
+ puts("\nOption G: total length over genes is not correct");\r
+ if(com.seqtype==1) {\r
+ puts("Note: gene length is in number of codons.");\r
+ }\r
+ puts("Sequence length in number of nucleotides.");\r
+ exit(-1);\r
+ }\r
+ if(!com.readpattern)\r
+ for(i=0,k=0; i<com.ngene; k+=com.lgene[i],i++)\r
+ for(j=0; j<com.lgene[i]; j++)\r
+ com.pose[k+j] = i;\r
+\r
+ }\r
+ else { /* site marks on later line(s) */\r
+ if(com.readpattern) \r
+ error2("option PG: use number of patterns in each gene and not site marks");\r
+ for(k=0; k<com.ls/n31; ) {\r
+ if (com.ngene>9) fscanf(fseq,"%d", &ch);\r
+ else {\r
+ do ch=fgetc(fseq); while (!isdigit(ch));\r
+ ch=ch-(int)'1'+1; /* assumes 1,2,...,9 are consecutive */\r
+ }\r
+ if (ch<1 || ch>com.ngene)\r
+ { printf("\ngene mark %d at %d?\n", ch, k+1); exit (-1); }\r
+ com.pose[k++]=ch-1;\r
+ }\r
+ if(!fgets(line,lline,fseq)) error2("sequence file, gene marks");\r
+ }\r
+ break;\r
+ default :\r
+ printf ("Bad option '%c' in option lines in seqfile\n", line[0]);\r
+ exit (-1);\r
+ }\r
+ }\r
+\r
+ readseq:\r
+ /* read sequence */\r
+ if (Sequential) { /* sequential */\r
+ if (noisy) printf ("Reading sequences, sequential format..\n");\r
+ for (j=0; j<com.ns; j++) {\r
+ lspname = LSPNAME;\r
+ for (i=0; i<2*lspname; i++) line[i]='\0';\r
+ if (!fgets (line, lline, fseq)) error2("EOF?");\r
+ if (blankline(line)) {\r
+ if (PopEmptyLines (fseq, lline, line))\r
+ { printf("error in sequence data file: empty line (seq %d)\n",j+1); exit(-1); }\r
+ }\r
+ p = line+(line[0]=='=' || line[0]=='>') ;\r
+ while(isspace(*p)) p++;\r
+ if ((ch=strstr(p," ")-p)<lspname && ch>0) lspname=ch;\r
+ strncpy (com.spname[j], p, lspname);\r
+ k = strlen(com.spname[j]);\r
+ p += (k<lspname?k:lspname);\r
+\r
+ for (; k>0; k--) /* trim spaces */\r
+ if (!isgraph(com.spname[j][k])) com.spname[j][k]=0;\r
+ else break;\r
+\r
+ if (noisy>=2) printf ("Reading seq #%2d: %s \r", j+1, com.spname[j]);\r
+ for (k=0; k<com.ls; p++) {\r
+ while (*p=='\n' || *p=='\0') {\r
+ p=fgets(line, lline, fseq);\r
+ if(p==NULL)\r
+ { printf("\nEOF at site %d, seq %d\n", k+1,j+1); exit(-1); }\r
+ }\r
+ *p = (char)toupper(*p);\r
+ if((com.seqtype==BASEseq || com.seqtype==CODONseq) && *p=='U') \r
+ *p = 'T';\r
+ p1 = strchr(pch, *p);\r
+ if (p1 && p1-pch>=nchar) \r
+ miss = 1;\r
+ if (*p==eq) {\r
+ if (j==0) error2("Error in sequence data file: . in 1st seq.?");\r
+ com.z[j][k] = com.z[0][k]; k++;\r
+ }\r
+ else if (p1) \r
+ com.z[j][k++] = *p;\r
+ else if (isalpha(*p)) {\r
+ printf("\nError in sequence data file: %c at %d seq %d.\n",*p,k+1,j+1); \r
+ puts("Make sure to separate the sequence from its name by 2 or more spaces.");\r
+ exit(0); \r
+ }\r
+ else if (*p == (char)EOF) error2("EOF?");\r
+ } /* for(k) */\r
+ if(strchr(p,'\n')==NULL) /* pop up line return */\r
+ while((ch=fgetc(fseq))!='\n' && ch!=EOF) ;\r
+ } /* for (j,com.ns) */\r
+ }\r
+ else { /* interlaved */\r
+ if (noisy) printf ("Reading sequences, interlaved format..\n");\r
+ FOR (j, com.ns) lt[j]=0; /* temporary seq length */\r
+ for (igroup=0; ; igroup++) {\r
+ /*\r
+ printf ("\nreading block %d ", igroup+1); matIout(F0,lt,1,com.ns);*/\r
+\r
+ FOR (j, com.ns) if (lt[j]<com.ls) break;\r
+ if (j==com.ns) break;\r
+ FOR (j,com.ns) {\r
+ if (!fgets(line,lline,fseq)) {\r
+ printf("\nerr reading site %d, seq %d group %d\nsites read in each seq:",\r
+ lt[j]+1,j+1,igroup+1);\r
+ error2("EOF?");\r
+ }\r
+ if (!hasbase(line)) {\r
+ if (j) {\r
+ printf ("\n%d, seq %d group %d", lt[j]+1, j+1, igroup+1);\r
+ error2("empty line.");\r
+ }\r
+ else \r
+ if (PopEmptyLines(fseq,lline,line)==-1) {\r
+ printf ("\n%d, seq %d group %d", lt[j]+1, j+1, igroup+1);\r
+ error2("EOF?");\r
+ }\r
+ }\r
+ p=line;\r
+ if (igroup==0) {\r
+ lspname = LSPNAME;\r
+ while(isspace(*p)) p++;\r
+ if ((ch=strstr(p," ")-p)<lspname && ch>0)\r
+ lspname = ch;\r
+ strncpy (com.spname[j], p, lspname);\r
+ k = strlen(com.spname[j]);\r
+ p += (k<lspname?k:lspname);\r
+\r
+ for (; k>0; k--) /* trim spaces */\r
+ if (!isgraph(com.spname[j][k]))\r
+ com.spname[j][k]=0;\r
+ else\r
+ break;\r
+ if(noisy>=2) printf("Reading seq #%2d: %s \r",j+1,com.spname[j]);\r
+ }\r
+ for (; *p && *p!='\n'; p++) {\r
+ if (lt[j]==com.ls) break;\r
+ *p = (char)toupper(*p);\r
+ if((com.seqtype==BASEseq || com.seqtype==CODONseq) && *p=='U') \r
+ *p = 'T';\r
+ p1 = strchr(pch, *p);\r
+ if (p1 && p1-pch>=nchar) \r
+ miss = 1;\r
+ if (*p == eq) {\r
+ if (j == 0) {\r
+ printf("err: . in 1st seq, group %d.\n",igroup);\r
+ exit (-1);\r
+ }\r
+ com.z[j][lt[j]] = com.z[0][lt[j]];\r
+ lt[j]++;\r
+ }\r
+ else if (p1)\r
+ com.z[j][lt[j]++]=*p;\r
+ else if (isalpha(*p)) {\r
+ printf("\nerr: unrecognised character %c at %d seq %d block %d.",\r
+ *p,lt[j]+1,j+1,igroup+1);\r
+ exit(-1);\r
+ }\r
+ else if (*p==(char)EOF) error2("EOF");\r
+ } /* for (*p) */\r
+ } /* for (j,com.ns) */\r
+\r
+ if(noisy>2) {\r
+ printf("\nblock %3d:", igroup+1);\r
+ for(j=0;j<com.ns;j++) printf(" %6d",lt[j]);\r
+ }\r
+\r
+ } /* for (igroup) */\r
+ }\r
+ free(line);\r
+\r
+#ifdef CODEML\r
+ /* mask stop codons as ???. */\r
+ if(com.seqtype==1 && MarkStopCodons())\r
+ miss=1;\r
+#endif\r
+\r
+ if(!miss)\r
+ com.cleandata = 1;\r
+ else if (cleandata) { /* forced removal of ambiguity characters */\r
+ if(noisy>2) puts("\nSites with gaps or missing data are removed.");\r
+ if(fout) {\r
+ fprintf(fout,"\nBefore deleting alignment gaps\n");\r
+ fprintf(fout, " %6d %6d\n", com.ns, com.ls);\r
+ printsma(fout,com.spname,com.z,com.ns,com.ls,com.ls,gap,com.seqtype,0,0,NULL);\r
+ }\r
+ RemoveIndel ();\r
+ if(fout) fprintf(fout,"\nAfter deleting gaps. %d sites\n",com.ls);\r
+ }\r
+\r
+ if(fout && !com.readpattern) {/* verbose=1, listing sequences again */\r
+ fprintf(fout, " %6d %6d\n", com.ns, com.ls);\r
+ printsma(fout,com.spname,com.z,com.ns,com.ls,com.ls,gap,com.seqtype,0,0,NULL);\r
+ }\r
+\r
+ if(n31==3) com.ls/=n31;\r
+\r
+ /* IdenticalSeqs(); */\r
+\r
+#ifdef CODEML\r
+ if(com.seqtype==1 && com.verbose) Get4foldSites();\r
+\r
+ if(com.seqtype==CODON2AAseq) {\r
+ if (noisy>2) puts("\nTranslating into AA sequences\n");\r
+ for(j=0; j<com.ns; j++) {\r
+ if (noisy>2) printf("Translating sequence %d\n",j+1);\r
+ DNA2protein(com.z[j], com.z[j], com.ls,com.icode);\r
+ }\r
+ com.seqtype=AAseq;\r
+\r
+ if(fout) {\r
+ fputs("\nTranslated AA Sequences\n",fout);\r
+ fprintf(fout,"%4d %6d",com.ns,com.ls);\r
+ printsma(fout,com.spname,com.z,com.ns,com.ls,com.ls,10,com.seqtype,0,0,NULL);\r
+ }\r
+ }\r
+#endif\r
+\r
+#if (defined CODEML || defined BASEML)\r
+ if(com.ngene==1 && com.Mgene==1) com.Mgene=0;\r
+ if(com.ngene>1 && com.Mgene==1 && com.verbose) printSeqsMgenes ();\r
+\r
+ if(com.bootstrap) { BootstrapSeq("boot.txt"); exit(0); }\r
+#endif\r
+\r
+\r
+#if (defined CODEML)\r
+ /* list sites with 2 types of serine codons: TC? and TCY. 19 March 2014, Ziheng. */\r
+ {\r
+ char codon[4]="";\r
+ int nbox0, nbox1;\r
+ for(h=0; h<com.ls; h++) {\r
+ for(i=0,nbox0=nbox1=0; i<com.ns; i++) {\r
+ codon[0]=com.z[i][h*3+0]; codon[1]=com.z[i][h*3+1]; codon[2]=com.z[i][h*3+2];\r
+ if(codon[0]=='T' && codon[1]=='C') nbox0++;\r
+ else if(codon[0]=='A' && codon[1]=='G' && (codon[2]=='T' || codon[2]=='C')) nbox1++;\r
+ }\r
+ if(nbox0 && nbox1 && nbox0+nbox1==com.ns) {\r
+ printf("\ncodon %7d: ", h+1);\r
+ for(i=0; i<com.ns; i++)\r
+ printf("%c%c%c ", com.z[i][h*3+0], com.z[i][h*3+1], com.z[i][h*3+2]);\r
+ }\r
+ }\r
+ }\r
+#endif\r
+\r
+\r
+\r
+ if(noisy>=2) printf ("\nSequences read..\n");\r
+ if(com.ls==0) {\r
+ puts("no sites. Got nothing to do");\r
+ return(1);\r
+ }\r
+\r
+#if (defined MCMCTREE)\r
+ /* Check and remove empty sequences. */\r
+\r
+ if(com.cleandata==0)\r
+ RemoveEmptySequences();\r
+\r
+#endif\r
+\r
+ if(!com.readpattern) \r
+ PatternWeight();\r
+ else { /* read pattern counts */\r
+ com.npatt = com.ls;\r
+ if((com.fpatt=(double*)realloc(com.fpatt, com.npatt*sizeof(double))) == NULL)\r
+ error2("oom fpatt");\r
+ for(h=0,lst=0; h<com.npatt; h++) {\r
+ fscanf(fseq, "%lf", &com.fpatt[h]);\r
+ lst += com.fpatt[h];\r
+ if(com.fpatt[h]<0 || com.fpatt[h]>1e6)\r
+ printf("fpatth[%d] = %.6g\n", h+1, com.fpatt[h]);\r
+ }\r
+ if(lst>1.00001) { \r
+ com.ls = (int)lst;\r
+ if(noisy) printf("\n%d site patterns read, %d sites\n", com.npatt, com.ls);\r
+ }\r
+ if(com.ngene==1) { \r
+ com.lgene[0] = com.ls; \r
+ com.posG[0] = 0; \r
+ com.posG[1] = com.npatt; \r
+ }\r
+ else {\r
+ for(j=0,com.posG[0]=0; j<com.ngene; j++)\r
+ com.posG[j+1] = com.posG[j] + com.lgene[j];\r
+\r
+ for(j=0; j<com.ngene; j++) {\r
+ com.lgene[j] = (j==0 ? 0 : com.lgene[j-1]);\r
+ for(h=com.posG[j]; h<com.posG[j+1]; h++)\r
+ com.lgene[j] += (int)com.fpatt[h];\r
+ }\r
+ }\r
+ }\r
+\r
+ EncodeSeqs();\r
+\r
+ if(fout) {\r
+ fprintf(fout,"\nPrinting out site pattern counts\n\n");\r
+ printPatterns(fout);\r
+ }\r
+\r
+ return (0);\r
+}\r
+\r
+\r
+#if(defined CODEML)\r
+\r
+int MarkStopCodons(void)\r
+{\r
+/* this converts the whole column into ??? if there is a stop codon in one sequence.\r
+ Data in com.z[] are just read in and not encoded yet.\r
+*/\r
+ int i,j,h,k, NColumnEdited=0;\r
+ char codon[4]="", stops[6][4]={"","",""}, nstops=0;\r
+\r
+ if(com.seqtype!=1) error2("should not be here");\r
+\r
+ for(i=0; i<64; i++) \r
+ if(GeneticCode[com.icode][i]==-1) \r
+ getcodon(stops[nstops++], i);\r
+\r
+ for(h=0; h<com.ls/3; h++) {\r
+ for(i=0; i<com.ns; i++) {\r
+ codon[0] = com.z[i][h*3+0];\r
+ codon[1] = com.z[i][h*3+1];\r
+ codon[2] = com.z[i][h*3+2];\r
+ for(j=0; j<nstops; j++) \r
+ if(strcmp(codon, stops[j])==0) {\r
+ printf("stop codon %s in seq. # %3d (%s)\r", codon, i+1, com.spname[i]);\r
+ break;\r
+ }\r
+ if(j<nstops) break;\r
+ }\r
+ if(i<com.ns) {\r
+ for(i=0; i<com.ns; i++) \r
+ com.z[i][h*3+0] = com.z[i][h*3+1] = com.z[i][h*3+2] = '?';\r
+ NColumnEdited++;\r
+ }\r
+ }\r
+ if(NColumnEdited) {\r
+ printf("\n%2d columns are converted into ??? because of stop codons\nPress Enter to continue", NColumnEdited);\r
+ getchar();\r
+ }\r
+ return(NColumnEdited);\r
+}\r
+\r
+#endif\r
+\r
+\r
+void RemoveEmptySequences(void)\r
+{\r
+/* this removes empty sequences (? or - only) and adjust com.ns\r
+*/\r
+ int j,h, nsnew;\r
+ char emptyseq[NS];\r
+\r
+ for(j=0; j<com.ns; j++) {\r
+ emptyseq[j] = 1;\r
+ for(h=0; h<com.ls*(com.seqtype==1?3:1); h++)\r
+ if(com.z[j][h] != '?' && com.z[j][h] != '-') {\r
+ emptyseq[j] = 0;\r
+ break;\r
+ }\r
+ }\r
+ for(j=0,nsnew=0; j<com.ns; j++) {\r
+ if(emptyseq[j]) {\r
+ printf("seq #%3d: %-30s is removed\n", j+1, com.spname[j]);\r
+ free(com.z[j]);\r
+ free(com.spname[j]);\r
+ continue;\r
+ }\r
+ com.z[nsnew] = com.z[j];\r
+ com.spname[nsnew] = com.spname[j];\r
+ nsnew ++;\r
+ }\r
+ for(j=nsnew; j<com.ns; j++) {\r
+ com.z[j] = NULL; \r
+ com.spname[j] = NULL;\r
+ }\r
+ com.ns = nsnew;\r
+}\r
+\r
+\r
+int printPatterns(FILE *fout)\r
+{\r
+ int j,h, n31 = (com.seqtype==CODONseq||com.seqtype==CODON2AAseq ? 3 : 1);\r
+ int gap=(n31==3?3:10), n=(com.seqtype==AAseq?20:4);\r
+\r
+ fprintf(fout,"\n%10d %10d P", com.ns, com.npatt*n31);\r
+ if(com.ngene>1) {\r
+ fprintf (fout," G\nG %d ", com.ngene);\r
+ for(j=0; j<com.ngene; j++)\r
+ fprintf(fout,"%7d", com.posG[j+1]-com.posG[j]);\r
+ }\r
+ FPN(fout);\r
+\r
+ if(com.seqtype==1 && com.cleandata) {\r
+ ; /* nothing is printed out for yn00, as the coding is different. */\r
+#if(defined CODEML || defined YN00)\r
+ printsmaCodon (fout, com.z, com.ns, com.npatt, com.npatt, 1);\r
+#endif\r
+ }\r
+ else\r
+ printsma(fout,com.spname,com.z,com.ns, com.npatt,com.npatt, gap, com.seqtype, 1, 0, NULL);\r
+ if(com.ls>1.0001) {\r
+ fprintf(fout, "\n");\r
+ for(h=0; h<com.npatt; h++) {\r
+ fprintf(fout," %4.0f", com.fpatt[h]);\r
+ if((h+1)%15 == 0) FPN(fout);\r
+ }\r
+ fprintf(fout, "\n\n");\r
+ }\r
+ return(0);\r
+}\r
+\r
+\r
+\r
+void EncodeSeqs (void)\r
+{\r
+/* This encodes sequences and set up com.TipMap[][], called after sites are collapsed \r
+ into patterns.\r
+*/\r
+ int n=com.ncode, nA, is,h, i, j, k,ic, indel=0, ch, b[3];\r
+ char *pch = ((com.seqtype==0||com.seqtype==1) ? BASEs : (com.seqtype==2 ? AAs : (com.seqtype==5 ? BASEs5: BINs)));\r
+ unsigned char c[4]="", str[4]=" ";\r
+\r
+ if(com.seqtype != 1) {\r
+ for(is=0; is<com.ns; is++) {\r
+ for (h=0; h<com.npatt; h++) {\r
+ ch = com.z[is][h];\r
+ com.z[is][h] = (char)(k = strchr(pch, ch) - pch);\r
+ if(k<0) {\r
+ printf("strange character %c in seq %d site %d\n", ch, is+1, h+1);\r
+ exit(-1);\r
+ }\r
+ }\r
+ }\r
+ }\r
+#if (defined CODEML || defined YN00)\r
+ else if(com.seqtype==1) {\r
+ /* collect all observed codons into CODONs, with a maximum of 256 distinct codons. */\r
+ memset(&CODONs[0][0], 0, 256*4*sizeof(char));\r
+ for(nA=0; nA<n; nA++) {\r
+ ic=FROM61[nA]; b[0]=ic/16; b[1]=(ic/4)%4; b[2]=ic%4;\r
+ for(i=0; i<3; i++) CODONs[nA][i] = BASEs[b[i]];\r
+ }\r
+ for(j=0,nA=n; j<com.ns; j++) {\r
+ for(h=0; h<com.npatt; h++) {\r
+ for(k=0; k<3; k++) {\r
+ c[k] = com.z[j][h*3+k]; \r
+ b[k] = strchr(BASEs,c[k]) - BASEs;\r
+ if(b[k]<0) printf("strange nucleotide %c in seq %d\n", c[k], j+1);\r
+ }\r
+ if(b[0]<4 && b[1]<4 && b[2]<4) {\r
+ k = FROM64[b[0]*16 + b[1]*4 + b[2]];\r
+ if(k<0) {\r
+ printf("\nstop codon %s in seq #%2d: %s\n", c, j+1, com.spname[j]);\r
+ printf("\ncodons in other sequences are\n");\r
+ for(i=0; i<com.ns; i++) {\r
+ for(k=0; k<3; k++) c[k] = com.z[i][h*3+k]; \r
+ printf("seq #%2d %-30s %s\n", i+1, com.spname[i], c);\r
+ }\r
+ exit(-1);\r
+ }\r
+ }\r
+ else { /* an ambiguous codon */\r
+ for(k=n; k<nA; k++) \r
+ if(strcmp(CODONs[k], c) == 0) break;\r
+ }\r
+ if(k==nA) {\r
+ if(++nA>256) \r
+ error2("too many ambiguity codons in the data. Contact author");\r
+ strcpy(CODONs[nA-1], c);\r
+ }\r
+ com.z[j][h] = (unsigned char)k;\r
+ }\r
+ com.z[j] = (unsigned char*)realloc(com.z[j], com.npatt);\r
+ }\r
+ if(nA>n) {\r
+ printf("%d ambiguous codons are seen in the data:\n", nA - n);\r
+ for(k=n; k<nA; k++) printf("%4s", CODONs[k]);\r
+ printf("\n");\r
+ }\r
+ }\r
+#endif\r
+}\r
+\r
+\r
+void SetMapAmbiguity (void)\r
+{\r
+/* This sets up CharaMap, the map from the ambiguity characters to resolved characters.\r
+*/\r
+ int n=com.ncode, i,j, i0,i1,i2, nb[3], ib[3][4], ic;\r
+ char *pch = (com.seqtype==0 ? BASEs : (com.seqtype==2 ? AAs : (com.seqtype==5 ? BASEs5: BINs)));\r
+ char *pbases = (com.seqtype==0 ? BASEs : (com.seqtype==5 ? BASEs5: NULL));\r
+ char **pEquateBASE = (com.seqtype==0 ? EquateBASE : (com.seqtype==5 ? EquateBASE5 : NULL));\r
+ char debug=0;\r
+\r
+ for(j=0; j<n; j++) { /* basic characters, coded according to the definition in pch. */\r
+ nChara[j] = (char)1;\r
+ CharaMap[j][0] = (char)j;\r
+ }\r
+\r
+ if(com.seqtype != 1) {\r
+ for(j=n,pch+=n; *pch; j++,pch++) {\r
+ if(com.seqtype==0 || com.seqtype==5) { /* ambiguities are allowed for those 2 types */\r
+ nChara[j] = (char)strlen(pEquateBASE[j]);\r
+ for(i=0; i<nChara[j]; i++)\r
+ CharaMap[j][i] = (char)(strchr(pbases, pEquateBASE[j][i]) - pbases);\r
+ }\r
+ else { /* for non-nucleotide characters, ambiguity characters must be ? or -. */\r
+ nChara[j] = (char)n;\r
+ for(i=0; i<n; i++)\r
+ CharaMap[j][i] = (char)i;\r
+ }\r
+ if(debug) {\r
+ printf("character %c (%d): ", pbases[j], nChara[j]);\r
+ for(i=0; i<nChara[j]; i++)\r
+ printf("%c", pbases[CharaMap[j][i]]);\r
+ printf("\n");\r
+ }\r
+ }\r
+ }\r
+#ifdef CODEML\r
+ else {\r
+ for(j=n; j<256 && CODONs[j][0]; j++) {\r
+ nChara[j] = (char)0;\r
+ for(i=0; i<3; i++)\r
+ NucListall(CODONs[j][i], &nb[i], ib[i]);\r
+ for(i0=0; i0<nb[0]; i0++) {\r
+ for(i1=0; i1<nb[1]; i1++) \r
+ for(i2=0; i2<nb[2]; i2++) {\r
+ ic = ib[0][i0]*16+ib[1][i1]*4+ib[2][i2];\r
+ if(GeneticCode[com.icode][ic] != -1) \r
+ CharaMap[j][nChara[j]++] = FROM64[ic];\r
+ }\r
+ }\r
+ if(nChara[j]==0) {\r
+ printf("\ncodon %s is stop codon", CODONs[j]);\r
+ exit(-1);\r
+ }\r
+ }\r
+ }\r
+#endif\r
+}\r
+\r
+\r
+int IdenticalSeqs(void)\r
+{\r
+/* This checks for identical sequences and create a data set of unique \r
+ sequences. The file name is <SeqDataFile.unique. This is casually \r
+ written and need more testing.\r
+ The routine is called right after the sequence data are read.\r
+ For codon sequences, com.ls has the number of codons, which are NOT\r
+ coded.\r
+*/\r
+ char tmpf[96], keep[NS];\r
+ FILE *ftmp;\r
+ int is,js,h, same,nkept=com.ns;\r
+ int ls1=com.ls*(com.seqtype==CODONseq||com.seqtype==CODON2AAseq?3:1);\r
+\r
+ puts("\nIdenticalSeqs: not tested\a");\r
+ for(is=0; is<com.ns; is++) \r
+ keep[is] = 1;\r
+ for(is=0; is<com.ns; is++) { \r
+ if(!keep[is]) continue;\r
+ for(js=0; js<is; js++) {\r
+ for(h=0,same=1; h<ls1; h++)\r
+ if(com.z[is][h] != com.z[js][h]) break;\r
+ if(h == ls1) {\r
+ printf("Seqs. %3d & %3d (%s & %s) are identical!\n",\r
+ js+1,is+1,com.spname[js],com.spname[is]);\r
+ keep[is] = 0;\r
+ }\r
+ }\r
+ }\r
+ for(is=0; is<com.ns; is++) \r
+ if(!keep[is]) nkept--;\r
+ if(nkept<com.ns) {\r
+ strcpy(tmpf, com.seqf);\r
+ strcat(tmpf, ".unique");\r
+ if((ftmp=fopen(tmpf,"w"))==NULL) error2("IdenticalSeqs: file error");\r
+ printSeqs(ftmp, NULL, keep, 1);\r
+ fclose(ftmp);\r
+ printf("\nUnique sequences collected in %s.\n", tmpf);\r
+ }\r
+ return(0);\r
+}\r
+\r
+\r
+void AllPatterns (FILE* fout)\r
+{\r
+/* This prints out an alignment containting all possible site patterns, and then exits.\r
+ This alignment may be useful to generate a dataset of infinitely long sequences, \r
+ summarized in the site pattern probabilities.\r
+ Because the PatternWeight() function changes the order of patterns, this routine \r
+ prints out the alignment as one of patterns, with lots of 1's below it, to avoid \r
+ baseml or codeml calling that routine to collaps sites. \r
+ You then replace those 1'with the calculated pattern probabilities for further \r
+ analysis.\r
+*/\r
+ int j, h, it, ic;\r
+ char codon[4]=" ";\r
+ int n31=(com.seqtype==CODONseq||com.seqtype==CODON2AAseq?3:1);\r
+ int gap=(n31==3?3:10);\r
+\r
+ com.ns = 3;\r
+ for(j=0,com.npatt=1; j<com.ns; j++) com.npatt*=com.ncode;\r
+ printf ("%3d species, %d site patterns\n", com.ns, com.npatt);\r
+ com.cleandata=1;\r
+ for(j=0; j<com.ns; j++) {\r
+ com.spname[j] = (char*)realloc(com.spname[j], 11*sizeof(char));\r
+ sprintf(com.spname[j], "%c ", 'a'+j);\r
+ }\r
+ for(j=0; j<com.ns; j++) \r
+ if((com.z[j]=(unsigned char*) malloc(com.npatt*sizeof(char))) == NULL)\r
+ error2("oom in AllPatterns");\r
+ for (h=0; h<com.npatt; h++) {\r
+ for (j=0,it=h; j<com.ns; j++) {\r
+ ic = it%com.ncode;\r
+ it /= com.ncode;\r
+ com.z[com.ns-1-j][h] = (char)ic;\r
+ }\r
+ }\r
+ com.ls = com.npatt;\r
+\r
+ fprintf(fout, " %6d %6d P\n", com.ns, com.ls*n31);\r
+ if(com.seqtype==1) {\r
+#if(defined CODEML || defined YN00)\r
+ printsmaCodon (fout, com.z, com.ns, com.ls, com.ls, 0);\r
+#endif\r
+ }\r
+ else\r
+ printsma(fout,com.spname,com.z,com.ns, com.ls, com.ls, gap, com.seqtype, 1, 0, NULL);\r
+\r
+ for(h=0; h<com.npatt; h++) {\r
+ fprintf(fout, " 1");\r
+ if((h+1)%40==0) FPN(fout);\r
+ }\r
+ FPN(fout);\r
+ exit(0);\r
+}\r
+\r
+\r
+int PatternWeight (void)\r
+{\r
+/* This collaps sites into patterns, for nucleotide, amino acid, or codon sequences.\r
+ This relies on \0 being the end of the string so that sequences should not be \r
+ encoded before this routine is called.\r
+ com.pose[i] has labels for genes as input and maps sites to patterns in return.\r
+ com.fpatt, a vector of doubles, wastes space as site pattern counts are integers.\r
+ Sequences z[ns*ls] are copied into patterns zt[ls*lpatt], and bsearch is used \r
+ twice to avoid excessive copying, to count npatt first & to generate fpatt etc.\r
+*/\r
+ int maxnpatt=com.ls, h, ip,l,u, j, k, same, ig, *poset;\r
+ int gap = (com.seqtype==CODONseq ? 3 : 10);\r
+ int n31 = (com.seqtype==CODONseq ? 3 : 1);\r
+ int lpatt=com.ns*n31+1; /* extra 0 used for easy debugging, can be voided */\r
+ int *p2s; /* point patterns to sites in zt */\r
+ char *zt, *p, timestr[36];\r
+ double nc = (com.seqtype == 1 ? 64 : com.ncode) + !com.cleandata+1;\r
+ int debug=0;\r
+ char DS[]="DS";\r
+\r
+ /* (A) \r
+ Collect and sort patterns. Get com.npatt, com.lgene, com.posG.\r
+ Move sequences com.z[ns][ls] into sites zt[ls*lpatt]. \r
+ Use p2s to map patterns to sites in zt to avoid copying.\r
+ */\r
+ if(noisy) printf("Counting site patterns.. %s\n", printtime(timestr));\r
+\r
+ if((com.seqtype==1 && com.ns<5) || (com.seqtype!=1 && com.ns<7))\r
+ maxnpatt = (int)(pow(nc, (double)com.ns) + 0.5) * com.ngene;\r
+ if(maxnpatt>com.ls) maxnpatt = com.ls;\r
+ p2s = (int*)malloc(maxnpatt*sizeof(int));\r
+ zt = (char*)malloc(com.ls*lpatt*sizeof(char));\r
+ if(p2s==NULL || zt==NULL) error2("oom p2s or zt");\r
+ memset(zt, 0, com.ls*lpatt*sizeof(char));\r
+ for(j=0; j<com.ns; j++) \r
+ for(h=0; h<com.ls; h++) \r
+ for(k=0; k<n31; k++)\r
+ zt[h*lpatt+j*n31+k] = com.z[j][h*n31+k];\r
+\r
+ for(j=0; j<com.ns; j++) free(com.z[j]); \r
+\r
+ for(ig=0; ig<com.ngene; ig++) com.lgene[ig] = 0;\r
+ for(ig=0,com.npatt=0; ig<com.ngene; ig++) {\r
+ com.posG[ig] = l = u = ip = com.npatt; \r
+ for(h=0; h<com.ls; h++) {\r
+ if(com.pose[h] != ig) continue;\r
+ if(debug) printf("\nh %3d %s", h, zt+h*lpatt);\r
+\r
+ /* bsearch in existing patterns. Knuth 1998 Vol3 Ed2 p.410 \r
+ ip is the loc for match or insertion. [l,u] is the search interval.\r
+ */\r
+ same = 0;\r
+ if(com.lgene[ig]++ != 0) { /* not 1st pattern? */\r
+ for(l=com.posG[ig], u=com.npatt-1; ; ) {\r
+ if(u<l) break;\r
+ ip = (l+u)/2;\r
+ k = strcmp(zt+h*lpatt, zt+p2s[ip]*lpatt);\r
+ if(k<0) u = ip - 1;\r
+ else if(k>0) l = ip + 1;\r
+ else { same = 1; break; }\r
+ }\r
+ }\r
+ if(!same) {\r
+ if(com.npatt>maxnpatt) \r
+ error2("npatt > maxnpatt");\r
+ if(l > ip) ip++; /* last comparison in bsearch had k > 0. */\r
+ /* Insert new pattern at ip. This is the expensive step. */\r
+\r
+ if(ip<com.npatt)\r
+ memmove(p2s+ip+1, p2s+ip, (com.npatt-ip)*sizeof(int));\r
+\r
+ /*\r
+ for(j=com.npatt; j>ip; j--) \r
+ p2s[j] = p2s[j-1];\r
+ */\r
+ p2s[ip] = h;\r
+ com.npatt ++;\r
+ }\r
+\r
+ if(debug) {\r
+ printf(": %3d (%c ilu %3d%3d%3d) ", com.npatt, DS[same], ip, l, u);\r
+ for(j=0; j<com.npatt; j++)\r
+ printf(" %s", zt+p2s[j]*lpatt);\r
+ }\r
+ if(noisy && ((h+1)%10000==0 || h+1==com.ls))\r
+ printf("\r%12d patterns at %8d / %8d sites (%.1f%%), %s", \r
+ com.npatt, h+1, com.ls, (h+1.)*100/com.ls, printtime(timestr));\r
+\r
+ } /* for (h) */\r
+ } /* for (ig) */\r
+ if(noisy) FPN(F0);\r
+\r
+ /* (B) count pattern frequencies and collect pose[] */\r
+ com.posG[com.ngene] = com.npatt;\r
+ for(j=0; j<com.ngene; j++) \r
+ if(com.lgene[j]==0) \r
+ error2("some gene labels are missing");\r
+ for(j=1; j<com.ngene; j++) \r
+ com.lgene[j] += com.lgene[j-1];\r
+\r
+ com.fpatt = (double*)realloc(com.fpatt, com.npatt*sizeof(double));\r
+ poset = (int*)malloc(com.ls*sizeof(int));\r
+ if(com.fpatt==NULL || poset==NULL) error2("oom poset");\r
+ for(ip=0; ip<com.npatt; ip++) com.fpatt[ip] = 0;\r
+\r
+ for(ig=0; ig<com.ngene; ig++) {\r
+ for(h=0; h<com.ls; h++) {\r
+ if(com.pose[h] != ig) continue;\r
+ for(same=0, l=com.posG[ig], u=com.posG[ig+1]-1; ; ) {\r
+ if(u<l) break;\r
+ ip = (l+u)/2;\r
+ k = strcmp(zt+h*lpatt, zt+p2s[ip]*lpatt);\r
+ if(k<0) u = ip - 1;\r
+ else if(k>0) l = ip + 1;\r
+ else { same = 1; break; }\r
+ }\r
+ if(!same)\r
+ error2("ghost pattern?");\r
+ com.fpatt[ip]++;\r
+ poset[h] = ip;\r
+ } /* for (h) */\r
+ } /* for (ig) */\r
+\r
+ if(com.seqtype==CODONseq && com.ngene==3 &&com.lgene[0]==com.ls/3) {\r
+ puts("\nCheck option G in data file? (Enter)\n");\r
+ }\r
+\r
+ for(j=0; j<com.ns; j++) {\r
+ com.z[j] = (unsigned char*)malloc(com.npatt*n31*sizeof(char));\r
+ for(ip=0,p=com.z[j]; ip<com.npatt; ip++) \r
+ for(k=0; k<n31; k++)\r
+ *p++ = zt[p2s[ip]*lpatt + j*n31 + k];\r
+ }\r
+ memcpy(com.pose, poset, com.ls*sizeof(int));\r
+ free(poset); free(p2s); free(zt);\r
+\r
+ return (0);\r
+}\r
+\r
+\r
+void AddFreqSeqGene(int js,int ig,double pi0[],double pi[]);\r
+\r
+\r
+void Chi2FreqHomo(double f[], int ns, int nc, double X2G[2])\r
+{\r
+/* This calculates a chi-square like statistic for testing that the base \r
+ or amino acid frequencies are identical among sequences.\r
+ f[ns*nc] where ns is #sequences (rows) and nc is #states (columns).\r
+*/\r
+ int i, j;\r
+ double mf[64]={0}, small=1e-50;\r
+\r
+ X2G[0]=X2G[1]=0;\r
+ for(i=0; i<ns; i++) \r
+ for(j=0; j<nc; j++) \r
+ mf[j]+=f[i*nc+j]/ns;\r
+\r
+ for(i=0; i<ns; i++) {\r
+ for(j=0; j<nc; j++) {\r
+ if(mf[j]>small) {\r
+ X2G[0] += square(f[i*nc+j]-mf[j])/mf[j];\r
+ if(f[i*nc+j])\r
+ X2G[1] += 2*f[i*nc+j]*log(f[i*nc+j]/mf[j]);\r
+ }\r
+ }\r
+ }\r
+}\r
+\r
+int InitializeBaseAA (FILE *fout)\r
+{\r
+/* Count site patterns (com.fpatt) and calculate base or amino acid frequencies\r
+ in genes and species. This works on raw (uncoded) data. \r
+ Ambiguity characters in sequences are resolved by iteration. \r
+ For frequencies in each species, they are resolved within that sequence.\r
+ For average base frequencies among species, they are resolved over all \r
+ species.\r
+\r
+ This routine is called by baseml and aaml. codonml uses another\r
+ routine InitializeCodon()\r
+*/\r
+ char *pch = (com.seqtype==0 ? BASEs : (com.seqtype==2 ? AAs : (com.seqtype==5 ? BASEs5: BINs)));\r
+ char indel[]="-?";\r
+ int wname=30, h,js,k, ig, nconstp, n=com.ncode;\r
+ int irf, nrf=20;\r
+ double pi0[20], t,lmax=0, X2G[2], *pisg; /* freq for species & gene, for X2 & G */\r
+\r
+ if(noisy) printf("Counting frequencies..");\r
+ if(fout) fprintf(fout,"\nFrequencies..");\r
+ if((pisg=(double*)malloc(com.ns*n*sizeof(double))) == NULL)\r
+ error2("oom pisg");\r
+ for(h=0,nconstp=0; h<com.npatt; h++) {\r
+ for (js=1; js<com.ns; js++)\r
+ if(com.z[js][h] != com.z[0][h]) break;\r
+ if (js==com.ns && com.z[0][h]!=indel[0] && com.z[0][h]!=indel[1])\r
+ nconstp += (int)com.fpatt[h];\r
+ }\r
+ for (ig=0,zero(com.pi,n); ig<com.ngene; ig++) {\r
+ if (com.ngene>1)\r
+ fprintf (fout,"\n\nGene %2d (len %4d)", ig+1, com.lgene[ig]-(ig==0?0:com.lgene[ig-1]));\r
+ fprintf(fout,"\n%*s", wname, "");\r
+ for(k=0; k<n; k++) fprintf(fout,"%7c", pch[k]);\r
+\r
+ /* The following block calculates freqs in each species for each gene. \r
+ Ambiguities are resolved in each species. com.pi and com.piG are \r
+ used for output only, and are not be used later with missing data.\r
+ */\r
+ zero(com.piG[ig], n);\r
+ zero(pisg, com.ns*n);\r
+ for(js=0; js<com.ns; js++) {\r
+ fillxc(pi0, 1.0/n, n);\r
+ for(irf=0; irf<nrf; irf++) {\r
+ zero(com.pi, n);\r
+ AddFreqSeqGene(js, ig, pi0, com.pi);\r
+ t = sum(com.pi, n);\r
+ if(t<1e-10) {\r
+ printf("Some sequences are empty.\n");\r
+ fillxc(com.pi, 1.0/n, n);\r
+ }\r
+ else \r
+ abyx(1/t, com.pi, n);\r
+ if(com.cleandata || com.cleandata || (t=distance(com.pi,pi0,n))<1e-8)\r
+ break;\r
+ xtoy(com.pi, pi0, n);\r
+ } /* for(irf) */\r
+ fprintf(fout,"\n%-*s", wname, com.spname[js]);\r
+ for(k=0; k<n; k++) fprintf(fout, "%8.5f", com.pi[k]);\r
+ if(com.ncode==4 && com.ngene==1) fprintf(fout, " GC = %5.3f", com.pi[1]+com.pi[3]);\r
+ for(k=0; k<n; k++) com.piG[ig][k] += com.pi[k]/com.ns;\r
+ xtoy(com.pi, pisg+js*n, n);\r
+ } /* for(js,ns) */\r
+ if(com.ngene>1) {\r
+ fprintf(fout,"\n\n%-*s", wname, "Mean");\r
+ for(k=0; k<n; k++) fprintf(fout, "%7.4f", com.piG[ig][k]);\r
+ }\r
+\r
+ Chi2FreqHomo(pisg, com.ns, n, X2G);\r
+\r
+ fprintf(fout,"\n\nHomogeneity statistic: X2 = %.5f G = %.5f ",X2G[0], X2G[1]);\r
+\r
+ /* fprintf(frst1,"\t%.5f", X2G[1]); */\r
+\r
+ } /* for(ig) */\r
+ if(noisy) printf("\n");\r
+\r
+ /* If there are missing data, the following block calculates freqs \r
+ in each gene (com.piG[]), as well as com.pi[] for the entire sequence. \r
+ Ambiguities are resolved over entire data sets across species (within \r
+ each gene for com.piG[]). These are used in ML calculation later.\r
+ */\r
+ if(com.cleandata) {\r
+ for (ig=0,zero(com.pi,n); ig<com.ngene; ig++) {\r
+ t = (ig==0 ? com.lgene[0] : com.lgene[ig]-com.lgene[ig-1])/(double)com.ls;\r
+ for(k=0; k<n; k++) com.pi[k] += com.piG[ig][k]*t;\r
+ }\r
+ }\r
+ else {\r
+ for (ig=0; ig<com.ngene; ig++) { \r
+ xtoy(com.piG[ig], pi0, n);\r
+ for(irf=0; irf<nrf; irf++) { /* com.piG[] */\r
+ zero(com.piG[ig], n);\r
+ for(js=0; js<com.ns; js++)\r
+ AddFreqSeqGene(js, ig, pi0, com.piG[ig]);\r
+ t = sum(com.piG[ig], n);\r
+ if(t<1e-10) \r
+ puts("empty sequences?");\r
+ abyx(1/t, com.piG[ig], n);\r
+ if(distance(com.piG[ig], pi0, n)<1e-8) break;\r
+ xtoy(com.piG[ig], pi0, n);\r
+ } /* for(irf) */\r
+ } /* for(ig) */\r
+ zero(pi0, n);\r
+ for(k=0; k<n; k++) for(ig=0; ig<com.ngene; ig++) \r
+ pi0[k] += com.piG[ig][k]/com.ngene;\r
+ for(irf=0; irf<nrf; irf++) { /* com.pi[] */\r
+ zero(com.pi,n);\r
+ for(ig=0; ig<com.ngene; ig++) for(js=0; js<com.ns; js++)\r
+ AddFreqSeqGene(js, ig, pi0, com.pi);\r
+ abyx(1/sum(com.pi,n), com.pi, n);\r
+ if(distance(com.pi, pi0, n)<1e-8) break;\r
+ xtoy(com.pi, pi0, n);\r
+ } /* for(ig) */\r
+ }\r
+ fprintf (fout, "\n\n%-*s", wname, "Average");\r
+ for(k=0; k<n; k++) fprintf(fout,"%8.5f", com.pi[k]);\r
+ if(!com.cleandata) fputs("\n(Ambiguity characters are used to calculate freqs.)\n",fout);\r
+\r
+ fprintf (fout,"\n\n# constant sites: %6d (%.2f%%)",\r
+ nconstp, (double)nconstp*100./com.ls);\r
+\r
+ if (com.model==0 || (com.seqtype==BASEseq && com.model==1)) {\r
+ fillxc(com.pi, 1./n, n);\r
+ for(ig=0; ig<com.ngene; ig++)\r
+ xtoy(com.pi, com.piG[ig], n);\r
+ }\r
+ if (com.seqtype==BASEseq && com.model==5) { /* T92 model */\r
+ com.pi[0] = com.pi[2] = (com.pi[0] + com.pi[2])/2;\r
+ com.pi[1] = com.pi[3] = (com.pi[1] + com.pi[3])/2;\r
+ for(ig=0; ig<com.ngene; ig++) {\r
+ com.piG[ig][0] = com.piG[ig][2] = (com.piG[ig][0] + com.piG[ig][2])/2;\r
+ com.piG[ig][1] = com.piG[ig][3] = (com.piG[ig][1] + com.piG[ig][3])/2;\r
+ }\r
+ }\r
+\r
+ /* this is used only for REV & REVu in baseml and model==3 in aaml */\r
+ if(com.seqtype==AAseq) {\r
+ for (k=0,t=0; k<n; k++) t += (com.pi[k]>0);\r
+ if (t<=4)\r
+ puts("\n\a\t\tAre these a.a. sequences?");\r
+ }\r
+ if(com.cleandata && com.ngene==1) {\r
+ for(h=0,lmax=-(double)com.ls*log((double)com.ls); h<com.npatt; h++)\r
+ if(com.fpatt[h]>1) lmax += com.fpatt[h]*log((double)com.fpatt[h]);\r
+ }\r
+ if(fout) {\r
+ if(lmax) fprintf(fout, "\nln Lmax (unconstrained) = %.6f\n", lmax);\r
+ fflush(fout);\r
+ }\r
+\r
+ free(pisg);\r
+ return(0);\r
+}\r
+\r
+\r
+void AddFreqSeqGene(int js, int ig, double pi0[], double pi[])\r
+{\r
+/* This adds the character counts in sequence js in gene ig to pi, \r
+ using pi0, by resolving ambiguities. The data are coded. com.cleandata==1 or 0.\r
+ This is for nucleotide and amino acid sequences only.\r
+*/\r
+ char *pch = (com.seqtype==0 ? BASEs : (com.seqtype==2 ? AAs : (com.seqtype==5 ? BASEs5: BINs)));\r
+ int k, h, b, n=com.ncode;\r
+ double t;\r
+\r
+ if(com.cleandata) {\r
+ for(h=com.posG[ig]; h<com.posG[ig+1]; h++) \r
+ pi[com.z[js][h]] += com.fpatt[h];\r
+ }\r
+ else {\r
+ for(h=com.posG[ig]; h<com.posG[ig+1]; h++) {\r
+ b = com.z[js][h];\r
+ if(b<n)\r
+ pi[b] += com.fpatt[h];\r
+ else {\r
+ /*\r
+ if(com.seqtype==BASEseq) {\r
+ NucListall(BASEs[b], &nb, ib);\r
+ for(k=0,t=0; k<nb; k++) t += pi0[ib[k]];\r
+ for(k=0; k<nb; k++) \r
+ pi[ib[k]] += pi0[ib[k]]/t * com.fpatt[h];\r
+ }\r
+ */\r
+ if(com.seqtype==BASEseq) {\r
+ for(k=0,t=0; k<nChara[b]; k++) \r
+ t += pi0[CharaMap[b][k]];\r
+ for(k=0; k<nChara[b]; k++) \r
+ pi[CharaMap[b][k]] += pi0[CharaMap[b][k]]/t * com.fpatt[h];\r
+ }\r
+ else if(com.seqtype==AAseq) /* unrecognized AAs are treated as "?". */\r
+ for(k=0; k<n; k++) pi[k] += pi0[k]*com.fpatt[h];\r
+ }\r
+ }\r
+ }\r
+}\r
+\r
+\r
+int RemoveIndel(void)\r
+{\r
+/* Remove ambiguity characters and indels in the untranformed sequences, \r
+ Changing com.ls and com.pose[] (site marks for multiple genes).\r
+ For codonml, com.ls is still 3*#codons\r
+ Called at the end of ReadSeq, when com.pose[] are still site marks.\r
+ All characters in com.z[][] not found in the character string pch are\r
+ considered ambiguity characters and are removed.\r
+*/\r
+ int n=com.ncode, h,k, j,js,lnew,nindel, n31=1;\r
+ char b, *miss; /* miss[h]=1 if site (codon) h is missing, 0 otherwise */\r
+ char *pch=((com.seqtype<=1||com.seqtype==CODON2AAseq)?BASEs:(com.seqtype==2?AAs: (com.seqtype==5?BASEs5:BINs)));\r
+\r
+ if(com.seqtype==CODONseq || com.seqtype==CODON2AAseq) {\r
+ n31=3; n=4;\r
+ }\r
+\r
+ if (com.ls%n31) error2("ls in RemoveIndel.");\r
+ if((miss=(char*)malloc(com.ls/n31 *sizeof(char)))==NULL)\r
+ error2("oom miss");\r
+ for(h=0; h<com.ls/n31; h++)\r
+ miss[h] = 0;\r
+ for (js=0; js<com.ns; js++) {\r
+ for (h=0,nindel=0; h<com.ls/n31; h++) {\r
+ for (k=0; k<n31; k++) {\r
+ b = (char)toupper(com.z[js][h*n31+k]);\r
+ for(j=0; j<n; j++) \r
+ if(b==pch[j]) break;\r
+ if(j==n) {\r
+ miss[h]=1; nindel++; \r
+ }\r
+ }\r
+ }\r
+ if (noisy>2 && nindel) \r
+ printf("\n%6d ambiguity characters in seq. %d", nindel,js+1);\r
+ }\r
+ if(noisy>2) {\r
+ for(h=0,k=0; h<com.ls/n31; h++) if(miss[h]) k++;\r
+ printf("\n%d sites are removed. ", k);\r
+ if(k<1000)\r
+ for(h=0; h<com.ls/n31; h++) if(miss[h]) printf(" %2d", h+1);\r
+ }\r
+\r
+ for (h=0,lnew=0; h<com.ls/n31; h++) {\r
+ if(miss[h]) continue;\r
+ for (js=0; js<com.ns; js++) {\r
+ for (k=0; k<n31; k++)\r
+ com.z[js][lnew*n31+k]=com.z[js][h*n31+k];\r
+ }\r
+ com.pose[lnew]=com.pose[h];\r
+ lnew++;\r
+ }\r
+ com.ls=lnew*n31;\r
+ free(miss);\r
+ return (0);\r
+}\r
+\r
+\r
+\r
+int MPInformSites (void)\r
+{\r
+/* Outputs parsimony informative and noninformative sites into \r
+ two files named MPinf.seq and MPninf.seq\r
+ Uses transformed sequences. \r
+ Not used for a long time. Does not work if com.pose is NULL. \r
+*/\r
+ char *imark;\r
+ char *pch=(com.seqtype==0 ? BASEs : (com.seqtype==2 ? AAs: (com.seqtype==5?BASEs5:BINs)));\r
+ int h, i, markb[NS], inf, lsinf;\r
+ FILE *finf, *fninf;\r
+\r
+puts("\nMPInformSites: missing data not dealt with yet?\n");\r
+\r
+ finf=fopen("MPinf.seq","w");\r
+ fninf=fopen("MPninf.seq","w");\r
+ if (finf==NULL || fninf==NULL) error2("MPInformSites: file creation error");\r
+\r
+ puts ("\nSorting parsimony-informative sites: MPinf.seq & MPninf.seq");\r
+ if ((imark=(char*)malloc(com.ls*sizeof(char)))==NULL) error2("oom imark");\r
+ for (h=0,lsinf=0; h<com.ls; h++) {\r
+ for (i=0; i<com.ns; i++) markb[i]=0;\r
+ for (i=0; i<com.ns; i++) markb[(int)com.z[i][com.pose[h]]]++;\r
+\r
+ for (i=0,inf=0; i<com.ncode; i++) if (markb[i]>=2) inf++;\r
+ if (inf>=2) { imark[h]=1; lsinf++; }\r
+ else imark[h]=0;\r
+ }\r
+ fprintf (finf, "%6d%6d\n", com.ns, lsinf);\r
+ fprintf (fninf, "%6d%6d\n", com.ns, com.ls-lsinf);\r
+ for (i=0; i<com.ns; i++) {\r
+ fprintf (finf, "\n%s\n", com.spname[i]);\r
+ fprintf (fninf, "\n%s\n", com.spname[i]);\r
+ for (h=0; h<com.ls; h++)\r
+ fprintf ((imark[h]?finf:fninf), "%c", pch[(int)com.z[i][com.pose[h]]]);\r
+ FPN (finf); FPN(fninf);\r
+ }\r
+ free (imark);\r
+ fclose(finf); fclose(fninf);\r
+ return (0);\r
+}\r
+\r
+\r
+int PatternWeightJC69like (FILE *fout)\r
+{\r
+/* This collaps site patterns further for JC69-like models, called after\r
+ PatternWeight(). This is used for JC and poisson amino acid models. \r
+ The routine could be merged into PatternWeight(), which should lead to \r
+ faster computation, but this is not done because right now \r
+ InitializeBaseAA() prints out base or amino acid frequencies after \r
+ PatternWeight() and before this routine. \r
+ \r
+ If the data have no ambiguities (com.cleanddata=1), the routine recodes \r
+ the data, for example, changing data at a site 1120 (CCAT) into 0012 \r
+ (TTCA) before checking against old patterns already found. If the data \r
+ contain ambiguities, they are not encoded. In that case, for every \r
+ site, the routine changes ? or N into - first. It then checks whether there \r
+ are any other ambibiguities and will recode if and only if there are not \r
+ any other ambiguities. For example, a site with data CC?T will be \r
+ changed into CC-T first and then recoded into TT-C and checked against \r
+ old patterns found. A site with data CCRT will not be recoded. In theory \r
+ such sites may be packed as well, but perhaps the effort is not worthwhile. \r
+ The routine checks data like CCRT against old patterns already found, \r
+\r
+ If com.pose is not NULL, the routine also updates com.pose. This allows \r
+ the program to work if com.readpattern==1.\r
+*/\r
+ char zh[NS], b, gap;\r
+ char *pch=(com.seqtype==0 ? BASEs : (com.seqtype==2 ? AAs: (com.seqtype==5?BASEs5:BINs)));\r
+ int npatt0=com.npatt, h, ht, j,k, same=0, ig, recode;\r
+\r
+ if(com.seqtype==1) \r
+ error2("PatternWeightJC69like does not work for codon seqs");\r
+ if(noisy) printf("Counting site patterns again, for JC69.\n");\r
+ gap = (char) (strchr(pch, (int)'-') - pch);\r
+ for (h=0,com.npatt=0,ig=-1; h<npatt0; h++) {\r
+ if (ig<com.ngene-1 && h==com.posG[ig+1])\r
+ com.posG[++ig] = com.npatt; \r
+\r
+ if(com.cleandata) { /* clean data, always recode */\r
+ zh[0] = b = 0; \r
+ b++;\r
+ for (j=1; j<com.ns; j++) {\r
+ for(k=0; k<j; k++) \r
+ if (com.z[j][h]==com.z[k][h]) break;\r
+ zh[j] = (k<j ? zh[k] : b++);\r
+ }\r
+ }\r
+ else { /* recode only if there are no non-gap ambiguity characters */\r
+ for(j=0; j<com.ns; j++)\r
+ zh[j] = com.z[j][h];\r
+\r
+ /* After this loop, recode = 0 or 1 decides whether to recode. */\r
+ for (j=0,recode=1; j<com.ns; j++) {\r
+ if (zh[j] < com.ncode) \r
+ continue;\r
+ if (nChara[zh[j]] == com.ncode) {\r
+ zh[j] = gap;\r
+ continue;\r
+ }\r
+ recode = 0; \r
+ break;\r
+ }\r
+ if(recode) {\r
+ b = 0;\r
+ if(zh[0] != gap) \r
+ zh[0] = b++;\r
+ for (j=1; j<com.ns; j++) {\r
+ if(zh[j] != gap) {\r
+ for(k=0; k<j; k++)\r
+ if (zh[j] == com.z[k][h]) break;\r
+ if(k<j) zh[j] = zh[k];\r
+ else zh[j] = b++;\r
+ }\r
+ }\r
+ }\r
+ }\r
+\r
+ for (ht=com.posG[ig],same=0; ht<com.npatt; ht++) {\r
+ for (j=0,same=1; j<com.ns; j++)\r
+ if (zh[j]!=com.z[j][ht]) {\r
+ same = 0; break; \r
+ }\r
+ if (same) break; \r
+ }\r
+ if (same)\r
+ com.fpatt[ht] += com.fpatt[h];\r
+ else {\r
+ for(j=0; j<com.ns; j++) com.z[j][com.npatt] = zh[j];\r
+ com.fpatt[com.npatt++] = com.fpatt[h];\r
+ }\r
+ if(com.pose) \r
+ for(k=0; k<com.ls; k++) \r
+ if(com.pose[k]==h) com.pose[k] = ht;\r
+ } /* for (h) */\r
+ com.posG[com.ngene] = com.npatt;\r
+ if (noisy) printf ("new no. site patterns:%7d\n", com.npatt);\r
+\r
+ if(fout) {\r
+ fprintf(fout, "\n\nPrinting out site pattern counts\n");\r
+ printPatterns(fout);\r
+ }\r
+ return (0);\r
+}\r
+\r
+int Site2Pattern (FILE *fout)\r
+{\r
+ int h;\r
+ fprintf(fout,"\n\nMapping site to pattern (i.e. site %d has pattern %d):\n",\r
+ com.ls-1, com.pose[com.ls-2]+1);\r
+ FOR (h, com.ls) {\r
+ fprintf (fout, "%6d", com.pose[h]+1);\r
+ if ((h+1)%10==0) FPN (fout);\r
+ }\r
+ FPN (fout);\r
+ return (0);\r
+}\r
+\r
+\r
+#endif\r
+\r
+\r
+\r
+int print1seq (FILE*fout, char *z, int ls, int pose[])\r
+{\r
+/* This prints out one sequence, and the sequences are encoded. \r
+ z[] contains patterns if (pose!=NULL)\r
+ This uses com.seqtype.\r
+*/\r
+ int h, hp, gap=10;\r
+ char *pch=(com.seqtype==0 ? BASEs : (com.seqtype==2 ? AAs: (com.seqtype==5?BASEs5:BINs)));\r
+ char str[4]="";\r
+ int nb = (com.seqtype==CODONseq?3:1);\r
+\r
+ for(h=0; h<ls; h++) {\r
+ hp = (pose ? pose[h] : h);\r
+ if(com.seqtype != CODONseq) {\r
+ fprintf(fout, "%c", pch[(int)z[hp]]);\r
+ if((h+1)%gap==0) fputc(' ', fout);\r
+ }\r
+ else\r
+ fprintf(fout, "%s ", CODONs[z[hp]]);\r
+ }\r
+ return(0);\r
+}\r
+\r
+void printSeqs (FILE *fout, int *pose, char keep[], int format)\r
+{\r
+/* Print sequences into fout, using paml (format=0 or 1) or paup (format=2) \r
+ formats.\r
+ Use pose=NULL if called before site patterns are collapsed. \r
+ keep[] marks the sequences to be printed. Use NULL for keep if all sequences \r
+ are to be printed.\r
+ Sequences may (com.cleandata==1) and may not (com.cleandata=0) be coded.\r
+ com.z[] has site patterns if pose!=NULL.\r
+ This uses com.seqtype, and com.ls is the number of codons for codon seqs.\r
+ See notes in print1seq()\r
+\r
+ format = 0,1: PAML sites or patterns\r
+ 2: PAUP Nexus format.\r
+\r
+ This is used by evolver. Check and merge with printsma().\r
+\r
+*/\r
+ int h, j, ls1, n31=(com.seqtype==1?3:1), nskept=com.ns, wname=30;\r
+ char *dt=(com.seqtype==AAseq?"protein":"dna");\r
+\r
+ ls1 = (format==1 ? com.npatt : com.ls);\r
+ if(keep) \r
+ for(j=0; j<com.ns; j++) nskept -= !keep[j];\r
+ if(format==0 || format==1)\r
+ fprintf(fout, "\n\n%6d %7d %s\n\n", nskept, ls1*n31, (format==1?" P":""));\r
+ else if(format==2) { /* NEXUS format */\r
+ fprintf(fout,"\nbegin data;\n");\r
+ fprintf(fout," dimensions ntax=%d nchar=%d;\n", nskept, ls1*n31);\r
+ fprintf(fout," format datatype=%s missing=? gap=-;\n matrix\n",dt);\r
+ }\r
+\r
+ for(j=0; j<com.ns; j++,FPN(fout)) {\r
+ if(keep && !keep[j]) continue;\r
+ fprintf(fout,"%s%-*s ", (format==2?" ":""), wname, com.spname[j]);\r
+ print1seq(fout, com.z[j], (format==1?com.npatt:com.ls), pose);\r
+ }\r
+ if(format==2) fprintf(fout, " ;\nend;");\r
+ else if (format==1) {\r
+ for(h=0,FPN(fout); h<com.npatt; h++) {\r
+ /* fprintf(fout," %12.8f", com.fpatt[h]/(double)com.ls); */\r
+ fprintf(fout," %4.0f", com.fpatt[h]);\r
+ if((h+1)%15==0) FPN(fout);\r
+ }\r
+ }\r
+\r
+ fprintf(fout,"\n\n");\r
+ fflush(fout);\r
+}\r
+\r
+#define gammap(x,alpha) (alpha*(1-pow(x,-1.0/alpha)))\r
+/* DistanceREV () used to be here, moved to pamp. \r
+*/\r
+\r
+#if (defined BASEML || defined BASEMLG || defined MCMCTREE || defined PROBTREE || defined YULETREE) \r
+\r
+double SeqDivergence (double x[], int model, double alpha, double *kappa)\r
+{\r
+/* alpha=0 if no gamma \r
+ return -1 if in error.\r
+ Check DistanceF84() if variances are wanted.\r
+*/\r
+ int i,j;\r
+ double p[4], Y,R, a1,a2,b, P1,P2,Q,fd,tc,ag, GC;\r
+ double small=1e-10/com.ls,largek=999, larged=9;\r
+\r
+ if (testXMat(x)) {\r
+ matout(F0, x, 4, 4);\r
+ printf("\nfrequency matrix error, setting distance to large d");\r
+ return(larged);\r
+ }\r
+ for (i=0,fd=1,zero(p,4); i<4; i++) {\r
+ fd -= x[i*4+i];\r
+ FOR (j,4) { p[i]+=x[i*4+j]/2; p[j]+=x[i*4+j]/2; }\r
+ }\r
+ P1 = x[0*4+1]+x[1*4+0];\r
+ P2 = x[2*4+3]+x[3*4+2];\r
+ Q = x[0*4+2]+x[0*4+3]+x[1*4+2]+x[1*4+3]+ x[2*4+0]+x[2*4+1]+x[3*4+0]+x[3*4+1];\r
+ if(fd<small) \r
+ return(0);\r
+ if(P1<small) P1=0; \r
+ if(P2<small) P2=0; \r
+ if(Q<small) Q=0;\r
+ Y=p[0]+p[1]; R=p[2]+p[3]; tc=p[0]*p[1]; ag=p[2]*p[3];\r
+\r
+ switch (model) {\r
+ case (JC69):\r
+ FOR (i,4) p[i]=.25;\r
+ case (F81):\r
+ for (i=0,b=0; i<4; i++) b += p[i]*(1-p[i]);\r
+ if (1-fd/b<=0) return (larged);\r
+\r
+ if (alpha<=0) return (-b*log (1-fd/b));\r
+ else return (-b*gammap(1-fd/b,alpha));\r
+ case (K80) :\r
+/*\r
+ printf("\nP Q = %.6f %.6f\n", P1+P2,Q);\r
+ printf("\nP1 P2 Q = %.6f %.6f %.6f\n", P1,P2,Q);\r
+*/\r
+ a1=1-2*(P1+P2)-Q; b=1-2*Q;\r
+/* if (a1<=0 || b<=0) return (-1); */\r
+ if (a1<=0 || b<=0) return (larged);\r
+ if (alpha<=0) { a1=-log(a1); b=-log(b); }\r
+ else { a1=-gammap(a1,alpha); b=-gammap(b,alpha); }\r
+ a1=.5*a1-.25*b; b=.25*b;\r
+ if(b>small) *kappa = a1/b; else *kappa=largek;\r
+ return (a1+2*b);\r
+ case (F84):\r
+ if(Y<small || R<small)\r
+ error2("Y or R = 0.");\r
+\r
+ a1=(2*(tc+ag)+2*(tc*R/Y+ag*Y/R)*(1-Q/(2*Y*R)) -P1-P2) / (2*tc/Y+2*ag/R);\r
+ b = 1 - Q/(2*Y*R);\r
+/* if (a1<=0 || b<=0) return (-1); */\r
+ if (a1<=0 || b<=0) return (larged);\r
+ if (alpha<=0) { a1=-log(a1); b=-log(b); }\r
+ else { a1=-gammap(a1,alpha); b=-gammap(b,alpha); }\r
+ a1=.5*a1; b=.5*b;\r
+ *kappa = a1/b-1;\r
+ *kappa = max2(*kappa, -.5);\r
+ return 4*b*(tc*(1+ *kappa/Y)+ag*(1+ *kappa/R)+Y*R);\r
+ case (HKY85): /* HKY85, from Rzhetsky & Nei (1995 MBE 12, 131-51) */\r
+ if(Y<small || R<small)\r
+ error2("Y or R = 0.");\r
+\r
+ *kappa = largek;\r
+ a1=1-Y*P1/(2*tc)-Q/(2*Y);\r
+ a2=1-R*P2/(2*ag)-Q/(2*R);\r
+ b=1-Q/(2*Y*R);\r
+ if (a1<=0 || a2<=0 || b<=0) return (larged);\r
+ if (alpha<=0) { a1=-log(a1); a2=-log(a2); b=-log(b); }\r
+ else { a1=-gammap(a1,alpha); a2=-gammap(a2,alpha); b=-gammap(b,alpha);}\r
+ a1 = -R/Y*b + a1/Y;\r
+ a2 = -Y/R*b + a2/R;\r
+ if (b>0) *kappa = min2((a1+a2)/(2*b), largek);\r
+ return 2*(p[0]*p[1] + p[2]*p[3])*(a1+a2)/2 + 2*Y*R*b;\r
+ case (T92):\r
+ *kappa = largek;\r
+ GC=p[1]+p[3];\r
+ a1 = 1 - Q - (P1+P2)/(2*GC*(1-GC)); b=1-2*Q;\r
+ if (a1<=0 || b<=0) return (larged);\r
+ if (alpha<=0) { a1=-log(a1); b=-log(b); }\r
+ else { a1=-gammap(a1,alpha); b=-gammap(b,alpha);}\r
+ if(Q>0) *kappa = 2*a1/b-1;\r
+ return 2*GC*(1-GC)*a1 + (1-2*GC*(1-GC))/2*b;\r
+ case (TN93): /* TN93 */\r
+ if(Y<small || R<small)\r
+ error2("Y or R = 0.");\r
+ a1=1-Y*P1/(2*tc)-Q/(2*Y); \r
+ a2=1-R*P2/(2*ag)-Q/(2*R);\r
+ b=1-Q/(2*Y*R);\r
+/* if (a1<=0 || a2<=0 || b<=0) return (-1); */\r
+ if (a1<=0 || a2<=0 || b<=0) return (larged);\r
+ if (alpha<=0) { a1=-log(a1); a2=-log(a2); b=-log(b); }\r
+ else { a1=-gammap(a1,alpha); a2=-gammap(a2,alpha); b=-gammap(b,alpha);}\r
+ a1=.5/Y*(a1-R*b); a2=.5/R*(a2-Y*b); b=.5*b;\r
+ *kappa = largek;\r
+/*\r
+ printf("\nk1&k2 = %.6f %.6f\n", a1/b,a2/b);\r
+*/\r
+ if (b>0) *kappa = min2((a1+a2)/(2*b), largek);\r
+ return 4*p[0]*p[1]*a1 + 4*p[2]*p[3]*a2 + 4*Y*R*b;\r
+ }\r
+ return (-1);\r
+}\r
+\r
+\r
+double DistanceIJ (int is, int js, int model, double alpha, double *kappa)\r
+{\r
+/* Distance between sequences is and js.\r
+ See DistanceMatNuc() for more details.\r
+*/\r
+ char b0,b1;\r
+ int h, n=4, missing=0;\r
+ double x[16], sumx, larged=9;\r
+\r
+ zero(x, 16);\r
+ if(com.cleandata && com.seqtype==0) {\r
+ for (h=0; h<com.npatt; h++)\r
+ x[com.z[is][h]*n+com.z[js][h]] += com.fpatt[h];\r
+ }\r
+ else {\r
+ for (h=0; h<com.npatt; h++) {\r
+ b0 = com.z[is][h];\r
+ b1 = com.z[js][h];\r
+ if(b0<n && b1<n)\r
+ x[b0*n+b1] += com.fpatt[h];\r
+ else\r
+ missing=1;\r
+ }\r
+ }\r
+ sumx = sum(x,16);\r
+\r
+ if(sumx<=0) return(larged); /* questionable??? */\r
+ abyx(1./sum(x,16),x,16);\r
+ return SeqDivergence(x, model, alpha, kappa);\r
+}\r
+\r
+\r
+#if (defined LSDISTANCE && defined REALSEQUENCE)\r
+\r
+extern double *SeqDistance;\r
+\r
+int DistanceMatNuc (FILE *fout, FILE*f2base, int model, double alpha)\r
+{\r
+/* This calculates pairwise distances. The data may be clean and coded \r
+ (com.cleandata==1) or not. In the latter case, ambiguity sites are not \r
+ used (pairwise deletion). Site patterns are used.\r
+*/\r
+ int is,js, status=0;\r
+ double kappat=0, t,bigD=9;\r
+ \r
+ if(f2base) fprintf(f2base,"%6d\n", com.ns);\r
+ if(model>=REV) model=TN93; /* TN93 here */\r
+ if(fout) {\r
+ fprintf(fout,"\nDistances:%5s", models[model]);\r
+ if (model!=JC69 && model!=F81) fprintf (fout, " (kappa) ");\r
+ fprintf(fout," (alpha set at %.2f)\n", alpha);\r
+ fprintf(fout,"This matrix is not used in later m.l. analysis.\n");\r
+ if(!com.cleandata) fprintf(fout, "\n(Pairwise deletion.)");\r
+ }\r
+ for(is=0; is<com.ns; is++) {\r
+ if(fout) fprintf(fout,"\n%-15s ", com.spname[is]);\r
+ if(f2base) fprintf(f2base,"%-15s ", com.spname[is]);\r
+ for(js=0; js<is; js++) {\r
+ t = DistanceIJ(is, js, model, alpha, &kappat);\r
+ if(t<0) { t=bigD; status=-1; }\r
+ SeqDistance[is*(is-1)/2+js] = t;\r
+ if(f2base) fprintf(f2base," %7.4f", t);\r
+ if(fout) fprintf(fout,"%8.4f", t);\r
+ if(fout && (model==K80 || model>=F84))\r
+ fprintf(fout,"(%7.4f)", kappat);\r
+ }\r
+ if(f2base) FPN(f2base);\r
+ }\r
+ if(fout) FPN(fout);\r
+ if(status) puts("\ndistance formula sometimes inapplicable..");\r
+ return(status);\r
+}\r
+\r
+\r
+\r
+#endif\r
+\r
+\r
+#ifdef BASEMLG\r
+extern int CijkIs0[];\r
+#endif\r
+\r
+extern int nR;\r
+extern double Cijk[], Root[];\r
+\r
+int QTN93 (int model, double Q[], double kappa1, double kappa2, double pi[])\r
+{\r
+ int i,j;\r
+ double T=pi[0],C=pi[1],A=pi[2],G=pi[3],Y=T+C,R=A+G, scalefactor;\r
+\r
+ if (model==JC69 || model==F81) kappa1=kappa2=com.kappa=1; \r
+ else if (com.model<TN93) kappa2=kappa1;\r
+ if(model==F84) { kappa2=1+kappa1/R; kappa1=1+kappa1/Y; }\r
+ scalefactor = 1/(2*T*C*kappa1+2*A*G*kappa2 + 2*Y*R);\r
+\r
+ for(i=0; i<4; i++) for(j=0; j<4; j++) Q[i*4+j] = (i==j ? 0 : 1);\r
+ Q[0*4+1] = Q[1*4+0] = kappa1;\r
+ Q[2*4+3] = Q[3*4+2] = kappa2;\r
+ for(i=0; i<4; i++) for(j=0; j<4; j++) Q[i*4+j] *= pi[j]*scalefactor;\r
+ for(i=0; i<4; i++) { Q[i*4+i] = 0; Q[i*4+i] = -sum(Q+i*4, 4); }\r
+\r
+ return (0);\r
+}\r
+\r
+int RootTN93 (int model, double kappa1, double kappa2, double pi[], \r
+ double *scalefactor, double Root[])\r
+{\r
+ double T=pi[0],C=pi[1],A=pi[2],G=pi[3],Y=T+C,R=A+G;\r
+\r
+ if (model==JC69 || model==F81) kappa1=kappa2=com.kappa=1; \r
+ else if (com.model<TN93) kappa2=kappa1;\r
+ if(model==F84) { kappa2=1+kappa1/R; kappa1=1+kappa1/Y; }\r
+\r
+ *scalefactor = 1/(2*T*C*kappa1+2*A*G*kappa2 + 2*Y*R);\r
+\r
+ Root[0] = 0;\r
+ Root[1] = - (*scalefactor);\r
+ Root[2] = -(Y+R*kappa2) * (*scalefactor);\r
+ Root[3] = -(Y*kappa1+R) * (*scalefactor);\r
+ return (0);\r
+}\r
+\r
+\r
+int eigenTN93 (int model, double kappa1, double kappa2, double pi[],\r
+ int *nR, double Root[], double Cijk[])\r
+{\r
+/* initialize Cijk[] & Root[], which are the only part to be changed\r
+ for a new substitution model\r
+ for JC69, K80, F81, F84, HKY85, TN93\r
+ Root: real Root divided by v, the number of nucleotide substitutions.\r
+*/\r
+ int i,j,k, nr;\r
+ double scalefactor, U[16],V[16], t;\r
+ double T=pi[0],C=pi[1],A=pi[2],G=pi[3],Y=T+C,R=A+G;\r
+\r
+ if (model==JC69 || model==F81) kappa1=kappa2=com.kappa=1; \r
+ else if (com.model<TN93) kappa2=kappa1;\r
+ RootTN93(model, kappa1, kappa2, pi, &scalefactor, Root);\r
+\r
+ *nR = nr = 2 + (model==K80||model>=F84) + (model>=HKY85);\r
+ U[0*4+0]=U[1*4+0]=U[2*4+0]=U[3*4+0]=1;\r
+ U[0*4+1]=U[1*4+1]=1/Y; U[2*4+1]=U[3*4+1]=-1/R;\r
+ U[0*4+2]=U[1*4+2]=0; U[2*4+2]=G/R; U[3*4+2]=-A/R;\r
+ U[2*4+3]=U[3*4+3]=0; U[0*4+3]=C/Y; U[1*4+3]=-T/Y;\r
+\r
+ xtoy (pi, V, 4);\r
+ V[1*4+0]=R*T; V[1*4+1]=R*C;\r
+ V[1*4+2]=-Y*A; V[1*4+3]=-Y*G;\r
+ V[2*4+0]=V[2*4+1]=0; V[2*4+2]=1; V[2*4+3]=-1;\r
+ V[3*4+0]=1; V[3*4+1]=-1; V[3*4+2]=V[3*4+3]=0;\r
+\r
+ for(i=0; i<4; i++) for(j=0; j<4; j++) {\r
+ Cijk[i*4*nr+j*nr+0]=U[i*4+0]*V[0*4+j];\r
+ switch (model) {\r
+ case JC69:\r
+ case F81:\r
+ for (k=1,t=0; k<4; k++) t += U[i*4+k]*V[k*4+j];\r
+ Cijk[i*4*nr+j*nr+1] = t;\r
+ break;\r
+ case K80:\r
+ case F84:\r
+ Cijk[i*4*nr+j*nr+1]=U[i*4+1]*V[1*4+j];\r
+ for (k=2,t=0; k<4; k++) t += U[i*4+k]*V[k*4+j];\r
+ Cijk[i*4*nr+j*nr+2]=t;\r
+ break;\r
+ case HKY85: case T92: case TN93:\r
+ for (k=1; k<4; k++) Cijk[i*4*nr+j*nr+k] = U[i*4+k]*V[k*4+j];\r
+ break;\r
+ default:\r
+ error2("model in eigenTN93");\r
+ }\r
+ }\r
+#ifdef BASEMLG\r
+ FOR (i,64) CijkIs0[i] = (Cijk[i]==0);\r
+#endif\r
+ return(0);\r
+}\r
+\r
+\r
+#endif\r
+\r
+\r
+\r
+#if (defined(CODEML) || defined(YN00))\r
+\r
+int printfcode (FILE *fout, double fb61[], double space[])\r
+{\r
+/* space[64*2]\r
+*/\r
+ int i, n=Nsensecodon;\r
+\r
+ fprintf (fout, "\nCodon freq., x 10000\n");\r
+ zero (space, 64);\r
+ for(i=0; i<n; i++) space[FROM61[i]] = fb61[i]*10000;\r
+ printcu(fout, space, com.icode);\r
+ return(0);\r
+}\r
+\r
+\r
+int printsmaCodon (FILE *fout, unsigned char * z[],int ns,int ls,int lline,int simple)\r
+{\r
+/* print, in blocks, multiple aligned and transformed codon sequences.\r
+ indels removed.\r
+ This is needed as codons are coded 0,1, 2, ..., 60, and \r
+ printsma won't work.\r
+*/\r
+ int ig, ngroup, lt, il,is, i,b, lspname=30;\r
+ char equal='.',*pz, c0[4],c[4];\r
+\r
+ if(ls==0) return(1);\r
+ ngroup = (ls-1)/lline + 1;\r
+ for (ig=0,FPN(fout); ig<ngroup; ig++) {\r
+ /* fprintf (fout,"%-8d\n", ig*lline+1); */\r
+ for (is=0; is<ns; is++) {\r
+ fprintf(fout,"%-*s ", lspname, com.spname[is]);\r
+ lt=0; \r
+ for(il=ig*lline,pz=z[is]+il; lt<lline && il<ls; il++,lt++,pz++) {\r
+ b = *pz; \r
+ b = FROM61[b]; \r
+ c[0] = (char)(b/16); \r
+ c[1] = (char)((b%16)/4);\r
+ c[2] = (char)(b%4);\r
+ c[3] = 0;\r
+ for(i=0; i<3; i++)\r
+ c[i] = BASEs[(int)c[i]];\r
+ if (is && simple) {\r
+ b = z[0][il];\r
+ b = FROM61[b];\r
+ c0[0]=(char)(b/16); c0[1]=(char)((b%16)/4); c0[2]=(char)(b%4);\r
+ for(i=0; i<3; i++)\r
+ if (c[i]==BASEs[(int)c0[i]]) c[i]=equal;\r
+ }\r
+ fprintf(fout,"%3s ", c);\r
+ }\r
+ FPN (fout);\r
+ }\r
+ }\r
+ return (0);\r
+}\r
+\r
+\r
+int setmark_61_64 (void)\r
+{\r
+/* This sets two matrices FROM61[], and FROM64[], which translate between two \r
+ codings of codons. In one coding, codons go from 0, 1, ..., 63 while in \r
+ the other codons range from 0, 1, ..., 61 with the three stop codons removed.\r
+ FROM61[] translates from the 61-state coding to the 64-state coding, while \r
+ FROM64[] translates from the 64-state coding to the 61-state coding.\r
+\r
+ This routine also sets up FourFold[4][4], which defines the 4-fold codon\r
+ boxes.\r
+*/\r
+ int i,j,k, *code=GeneticCode[com.icode];\r
+ int c[3],aa0,aa1, by[3]={16,4,1};\r
+ double nSilent, nStop, nRepl;\r
+\r
+ Nsensecodon=0;\r
+ for (i=0; i<64; i++) {\r
+ if (code[i]==-1) FROM64[i]=-1; \r
+ else { FROM61[Nsensecodon]=i; FROM64[i]=Nsensecodon++; }\r
+ }\r
+ com.ncode=Nsensecodon;\r
+\r
+ for(i=0; i<4; i++) for(j=0; j<4; j++) {\r
+ k=i*16+j*4;\r
+ FourFold[i][j] = (code[k]==code[k+1] && code[k]==code[k+2] && code[k]==code[k+3]);\r
+ }\r
+\r
+ for (i=0,nSilent=nStop=nRepl=0; i<64; i++) {\r
+ c[0]=i/16; c[1]=(i/4)%4; c[2]=i%4;\r
+ if((aa0=code[i])==-1) continue;\r
+ for(j=0; j<3; j++) for(k=0; k<3; k++) {\r
+ aa1 = code[i + ((c[j]+k+1)%4 - c[j])*by[j]];\r
+ if(aa1==-1) nStop++;\r
+ else if(aa0==aa1) nSilent++;\r
+ else nRepl++;\r
+ }\r
+ }\r
+/*\r
+ printf("\ncode Stop Silent Replace\n");\r
+ printf("%3d (%d) %6.0f%6.0f%6.0f %12.6f%12.6f\n", \r
+ com.icode, 64-com.ncode, nStop,nSilent,nRepl,nStop*3/(com.ncode*9),nSilent*3/(com.ncode*9));\r
+*/\r
+ return (0);\r
+}\r
+\r
+int DistanceMatNG86 (FILE *fout, FILE*fds, FILE*fdn, FILE*ft, double alpha)\r
+{\r
+/* Estimation of dS and dN by the method of Nei & Gojobori (1986)\r
+ This works with both coded (com.cleandata==1) and uncoded data.\r
+ In the latter case (com.cleandata==0), the method does pairwise delection.\r
+\r
+ alpha for gamma rates is used for dN only.\r
+*/\r
+ char *codon[2];\r
+ int is,js, i,k,h, wname=20, status=0, ndiff,nsd[4];\r
+ int nb[3],ib[3][4], missing;\r
+ double ns,na, nst,nat, S,N, St,Nt, dS,dN,dN_dS,y, bigD=3, lst;\r
+ double SEds, SEdn, p;\r
+\r
+ if(fout) { \r
+ fputs("\n\n\nNei & Gojobori 1986. dN/dS (dN, dS)",fout);\r
+ if(com.cleandata==0) fputs("\n(Pairwise deletion)",fout);\r
+ fputs("\n(Note: This matrix is not used in later ML. analysis.\n",fout);\r
+ fputs("Use runmode = -2 for ML pairwise comparison.)\n",fout);\r
+ }\r
+\r
+ if(fds) {\r
+ fprintf(fds,"%6d\n",com.ns);\r
+ fprintf(fdn,"%6d\n",com.ns); \r
+ fprintf(ft,"%6d\n",com.ns);\r
+ }\r
+ if(noisy>1 && com.ns>10) puts("NG distances for seqs.:");\r
+ for(is=0; is<com.ns; is++) {\r
+ if(fout) \r
+ fprintf(fout,"\n%-*s", wname,com.spname[is]);\r
+ if(fds) {\r
+ fprintf(fds, "%-*s ",wname,com.spname[is]);\r
+ fprintf(fdn, "%-*s ",wname,com.spname[is]);\r
+ fprintf(ft, "%-*s ",wname,com.spname[is]);\r
+ }\r
+ for(js=0; js<is; js++) {\r
+ for(k=0; k<4; k++) nsd[k] = 0;\r
+ for (h=0,lst=0,nst=nat=S=N=0; h<com.npatt; h++) {\r
+ if(com.z[is][h]>=com.ncode || com.z[js][h]>=com.ncode) \r
+ continue;\r
+ codon[0] = CODONs[com.z[is][h]];\r
+ codon[1] = CODONs[com.z[js][h]];\r
+ lst += com.fpatt[h];\r
+ ndiff = difcodonNG(codon[0], codon[1], &St, &Nt, &ns, &na, 0, com.icode);\r
+ nsd[ndiff] += (int)com.fpatt[h];\r
+ S += St*com.fpatt[h];\r
+ N += Nt*com.fpatt[h];\r
+ nst += ns*com.fpatt[h];\r
+ nat += na*com.fpatt[h];\r
+ } /* for(h) */\r
+ if(S<=0 || N<=0)\r
+ y=0;\r
+ else { /* rescale for stop codons */\r
+ y = lst*3./(S+N);\r
+ S *= y;\r
+ N *= y;\r
+ }\r
+ if(noisy>=9)\r
+ printf("\n%3d %3d:Sites %7.1f +%7.1f =%7.1f\tDiffs %7.1f +%7.1f =%7.1f",\r
+ is+1,js+1,S,N,S+N,nst,nat, nst+nat);\r
+\r
+ dS = (S<=0 ? 0 : 1-4./3*nst/S);\r
+ dN = (N<=0 ? 0 : 1-4./3*nat/N);\r
+ if(noisy>=9 && (dS<=0 || dN<=0))\r
+ { puts("\nNG86 unusable."); status=-1;}\r
+ if(dS==1) dS = 0;\r
+ else dS = (dS<=0 ? -1 : 3./4*(-log(dS)));\r
+ if(dN==1) dN = 0;\r
+ else dN = (dN<=0 ? -1 : 3./4*(alpha==0?-log(dN):alpha*(pow(dN,-1/alpha)-1)));\r
+\r
+ dN_dS = (dS>0 && dN>0 ? dN/dS : -1);\r
+ if(fout) fprintf(fout,"%7.4f (%5.4f %5.4f)", dN_dS, dN, dS);\r
+\r
+ if(N>0 && dN<0) dN = bigD; \r
+ if(S>0&&dS<0) dS = bigD;\r
+\r
+#ifdef CODEML\r
+ SeqDistance[is*(is-1)/2+js] = (S<=0||N<=0 ? 0 : (S*dS+N*dN)*3/(S+N));\r
+#endif\r
+\r
+ if(fds) {\r
+ fprintf(fds," %7.4f", dS);\r
+ fprintf(fdn," %7.4f", dN);\r
+ fprintf(ft," %7.4f", (S*dS+N*dN)*3/(S+N));\r
+ }\r
+ if(alpha==0 && dS<bigD) { p=nst/S; SEds=sqrt(9*p*(1-p)/(square(3-4*p)*S)); }\r
+ if(alpha==0 && dN<bigD) { p=nat/N; SEdn=sqrt(9*p*(1-p)/(square(3-4*p)*N)); }\r
+ } /* for(js) */\r
+ if(fds) {\r
+ FPN(fds); FPN(fdn); FPN(ft);\r
+ }\r
+ if(noisy>1 && com.ns>10) printf(" %3d", is+1);\r
+ } /* for(is) */\r
+ FPN(F0); \r
+ if(fout) FPN(fout);\r
+ if(status) fprintf (fout, "NOTE: -1 means that NG86 is inapplicable.\n");\r
+\r
+ SS=S, NN=N, Sd=nst, Nd=nat; /* kostas */\r
+\r
+ return (0);\r
+}\r
+\r
+\r
+#endif\r
+\r
+\r
+\r
+#ifdef BASEML\r
+\r
+int eigenQREVbase (FILE* fout, double Q[NCODE*NCODE], double kappa[], double pi[], int *nR, double Root[], double Cijk[])\r
+{\r
+/* pi[] is constant.\r
+ This returns the Q matrix in Q.\r
+*/\r
+ int n=com.ncode, i,j,k;\r
+ int nr = (com.ngene>1 && com.Mgene>=3 ? com.nrate/com.ngene : com.nrate);\r
+ double Q0[NCODE*NCODE], U[NCODE*NCODE], V[NCODE*NCODE], mr, space_pisqrt[NCODE*NCODE];\r
+\r
+ NPMatUVRoot=0;\r
+ *nR=n;\r
+ zero(Q, n*n);\r
+ if(com.model==REV) {\r
+ if(n!=4) error2("ncode != 4 for REV");\r
+ Q[3*n+2] = Q[2*n+3] = 1; /* r_AG = r_GA = 1. */\r
+ for(i=0,k=0; i<n-1; i++) for (j=i+1; j<n; j++)\r
+ if(i*n+j != 2*n+3)\r
+ Q[i*n+j] = Q[j*n+i] = kappa[k++];\r
+ }\r
+ else /* (model==REVu) */\r
+ for(i=0; i<n-1; i++) for(j=i+1; j<n; j++)\r
+ Q[i*n+j]=Q[j*n+i] = (StepMatrix[i*n+j] ? kappa[StepMatrix[i*n+j]-1] : 1);\r
+\r
+ for(i=0; i<n; i++) for(j=0; j<n; j++)\r
+ Q[i*n+j] *= pi[j];\r
+\r
+ for (i=0,mr=0; i<n; i++) {\r
+ Q[i*n+i] = 0; \r
+ Q[i*n+i] = -sum(Q+i*n, n);\r
+ mr -= pi[i]*Q[i*n+i]; \r
+ }\r
+ abyx(1/mr, Q, n*n);\r
+\r
+ if (fout) {\r
+ mr = 2*pi[0]*Q[0*n+1] + 2*pi[2]*Q[2*n+3];\r
+ if(com.nhomo==0) {\r
+ fprintf(fout, "\nRate parameters: ");\r
+ for(j=0; j<nr; j++) \r
+ fprintf(fout, " %8.5f", kappa[j]);\r
+ fprintf(fout, "\nBase frequencies: ");\r
+ for(j=0; j<n; j++) \r
+ fprintf(fout," %8.5f", pi[j]);\r
+ }\r
+ fprintf (fout, "\nRate matrix Q, Average Ts/Tv =%9.4f", mr/(1-mr));\r
+ matout (fout, Q, n, n);\r
+ }\r
+ else {\r
+ xtoy (Q, Q0, n*n);\r
+ eigenQREV(Q0, pi, n, Root, U, V, space_pisqrt);\r
+ for(i=0; i<n; i++) for(j=0; j<n; j++) for(k=0; k<n; k++) \r
+ Cijk[i*n*n+j*n+k] = U[i*n+k]*V[k*n+j];\r
+ }\r
+ return (0);\r
+}\r
+\r
+\r
+int QUNREST (FILE *fout, double Q[], double rate[], double pi[])\r
+{\r
+/* This constructs the rate matrix Q for the unrestricted model.\r
+ pi[] is changed in the routine.\r
+*/\r
+ int n=com.ncode, i,j,k;\r
+ double mr, ts, space[20];\r
+\r
+ if(com.model==UNREST) {\r
+ if(n!=4) error2("ncode != 4 for UNREST");\r
+ for (i=0,k=0,Q[14]=1; i<n; i++) for(j=0; j<n; j++) \r
+ if (i!=j && i*n+j != 14) Q[i*n+j] = rate[k++];\r
+ }\r
+ else /* (model==UNRESTu) */\r
+ for(i=0; i<n; i++) for(j=0; j<n; j++) \r
+ if(i != j) \r
+ Q[i*n+j] = (StepMatrix[i*n+j] ? rate[StepMatrix[i*n+j]-1] : 1);\r
+\r
+ for(i=0; i<n; i++) {\r
+ Q[i*n+i] = 0; \r
+ Q[i*n+i] = -sum(Q+i*n, n); \r
+ }\r
+\r
+ /* get pi */\r
+ QtoPi(Q, com.pi, n, space);\r
+\r
+ for (i=0,mr=0; i<n; i++) mr -= pi[i]*Q[i*n+i];\r
+ for (i=0; i<n*n; i++) Q[i] /= mr;\r
+\r
+ if (fout) {\r
+ ts = pi[0]*Q[0*n+1] + pi[1]*Q[1*n+0] + pi[2]*Q[2*n+3] + pi[3]*Q[3*n+2];\r
+\r
+ fprintf(fout, "Rate parameters: ");\r
+ for(j=0; j<com.nrate; j++) fprintf(fout, " %8.5f", rate[j]);\r
+ fprintf(fout, "\nBase frequencies: ");\r
+ for(j=0; j<n; j++) fprintf(fout," %8.5f", pi[j]);\r
+ if(n==4)\r
+ fprintf (fout,"\nrate matrix Q, Average Ts/Tv (similar to kappa/2) =%9.4f", ts/(1-ts));\r
+ else \r
+ fprintf (fout,"\nrate matrix Q");\r
+ matout (fout, Q, n, n);\r
+ }\r
+ return (0);\r
+}\r
+\r
+#endif\r
+\r
+\r
+#ifdef LSDISTANCE\r
+\r
+double *SeqDistance=NULL; \r
+int *ancestor=NULL;\r
+\r
+int SetAncestor()\r
+{\r
+/* This finds the most recent common ancestor of species is and js.\r
+*/\r
+ int is, js, it, a1, a2;\r
+\r
+ for(is=0; is<com.ns; is++) for(js=0; js<is; js++) {\r
+ it = is*(is-1)/2+js;\r
+ ancestor[it] = -1;\r
+ for (a1=is; a1!=-1; a1=nodes[a1].father) {\r
+ for (a2=js; a2!=-1; a2=nodes[a2].father)\r
+ if (a1==a2) { ancestor[it] = a1; break; }\r
+ if (ancestor[it] != -1) break;\r
+ }\r
+ if (ancestor[it] == -1) error2("no ancestor");\r
+ }\r
+ return(0);\r
+}\r
+\r
+int fun_LS (double x[], double diff[], int np, int npair);\r
+\r
+int fun_LS (double x[], double diff[], int np, int npair)\r
+{\r
+ int i,j, aa, it=-np;\r
+ double dexp;\r
+\r
+ if (SetBranch(x) && noisy>2) puts ("branch len.");\r
+ if (npair != com.ns*(com.ns-1)/2) error2("# seq pairs err.");\r
+ for(i=0; i<com.ns; i++) for(j=0; j<i; j++) {\r
+ it = i*(i-1)/2+j;\r
+ for (aa=i,dexp=0; aa!=ancestor[it]; aa=nodes[aa].father)\r
+ dexp += nodes[aa].branch;\r
+ for (aa=j; aa!=ancestor[it]; aa=nodes[aa].father)\r
+ dexp += nodes[aa].branch;\r
+ diff[it] = SeqDistance[it] - dexp;\r
+\r
+ if(fabs(diff[it])>1000) {\r
+ printf("\ndistances very different: diff = %12.6f ", diff[it]);\r
+ }\r
+\r
+ }\r
+ return(0);\r
+}\r
+\r
+int LSDistance (double *ss,double x[],int (*testx)(double x[],int np))\r
+{\r
+/* get Least Squares estimates of branch lengths for a given tree topology\r
+ This uses nls2, a general least squares algorithm for nonlinear programming \r
+ to estimate branch lengths, and it thus inefficient.\r
+*/\r
+ int i;\r
+\r
+ if ((*testx)(x, com.ntime)) {\r
+ matout (F0, x, 1, com.ntime);\r
+ puts ("initial err in LSDistance()");\r
+ }\r
+ SetAncestor();\r
+ i = nls2((com.ntime>20&&noisy>=3?F0:NULL),\r
+ ss,x,com.ntime,fun_LS,NULL,testx,com.ns*(com.ns-1)/2,1e-6);\r
+\r
+ return (i);\r
+}\r
+\r
+double PairDistanceML(int is, int js)\r
+{\r
+/* This calculates the ML distance between is and js, the sum of ML branch \r
+ lengths along the path between is and js.\r
+ LSdistance() has to be called once to set ancestor before calling this \r
+ routine.\r
+*/\r
+ int it, a;\r
+ double dij=0;\r
+\r
+ if(is==js) return(0);\r
+ if(is<js) { it=is; is=js; js=it; }\r
+\r
+ it = is*(is-1)/2 + js;\r
+ for (a=is; a!=ancestor[it]; a=nodes[a].father)\r
+ dij += nodes[a].branch;\r
+ for (a=js; a!=ancestor[it]; a=nodes[a].father)\r
+ dij += nodes[a].branch;\r
+ return(dij);\r
+}\r
+\r
+\r
+int GroupDistances()\r
+{\r
+/* This calculates average group distances by summing over the ML \r
+ branch lengths */\r
+ int newancestor=0, i,j, ig,jg;\r
+/* int ngroup=2, Ningroup[10], group[200]={1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\r
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\r
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\r
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\r
+1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\r
+2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\r
+2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\r
+2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\r
+2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}; */ /* dloop for HC200.paup */\r
+ int ngroup=10, Ningroup[10], group[115]={\r
+ 10, 9, 9, 9, 9, 9, 9, 9, 9, 10, \r
+ 9, 9, 3, 3, 1, 1, 1, 1, 1, 1, \r
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\r
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \r
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \r
+ 1, 2, 2, 2, 2, 2, 2, 4, 4, 4, \r
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, \r
+ 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, \r
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, \r
+ 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, \r
+ 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, \r
+ 8, 8, 8, 8, 8}; /* dloop data for Anne Yoder, ns=115 */\r
+ double dgroup, npairused;\r
+\r
+/* ngroup=2; for(j=0;j<com.ns; j++) group[j]=1+(group[j]>2); */\r
+\r
+ for(j=0;j<ngroup;j++) Ningroup[j]=0;\r
+ for(j=0;j<com.ns; j++) Ningroup[group[j]-1]++;\r
+ printf("\n# sequences in group:");\r
+ matIout(F0,Ningroup,1,ngroup);\r
+ if(ancestor==NULL) {\r
+ newancestor=1;\r
+ ancestor=(int*)realloc(ancestor, com.ns*(com.ns-1)/2*sizeof(int));\r
+ if(ancestor==NULL) error2("oom ancestor");\r
+ }\r
+ SetAncestor();\r
+\r
+ for(ig=0; ig<ngroup; ig++) {\r
+ printf("\ngroup %2d",ig+1);\r
+ for(jg=0; jg<ig+1; jg++) {\r
+ dgroup=0; npairused=0;\r
+ for(i=0;i<com.ns;i++) for(j=0;j<com.ns;j++) {\r
+ if(i!=j && group[i]==ig+1 && group[j]==jg+1) {\r
+ dgroup += PairDistanceML(i, j);\r
+ npairused++;\r
+ }\r
+ }\r
+ dgroup/=npairused;\r
+ printf("%9.4f", dgroup);\r
+\r
+ /* printf("%6.1f", dgroup/0.2604*5); */ /* 0.2604, 0.5611 */\r
+ }\r
+ }\r
+ if(newancestor==1) free(ancestor);\r
+ return(0);\r
+}\r
+\r
+#endif \r
+\r
+#ifdef NODESTRUCTURE\r
+\r
+void BranchToNode (void)\r
+{\r
+/* tree.root need to be specified before calling this\r
+*/\r
+ int i, from, to;\r
+ \r
+ tree.nnode=tree.nbranch+1;\r
+ for(i=0; i<tree.nnode; i++)\r
+ { nodes[i].father=nodes[i].ibranch=-1; nodes[i].nson=0; }\r
+ for (i=0; i<tree.nbranch; i++) {\r
+ from=tree.branches[i][0];\r
+ to =tree.branches[i][1];\r
+ nodes[from].sons[nodes[from].nson++]=to;\r
+ nodes[to].father=from;\r
+ nodes[to].ibranch=i;\r
+ }\r
+ /* nodes[tree.root].branch=0; this breaks method=1 */\r
+}\r
+\r
+void NodeToBranchSub (int inode);\r
+\r
+void NodeToBranchSub (int inode)\r
+{\r
+ int i, ison;\r
+\r
+ for(i=0; i<nodes[inode].nson; i++) {\r
+ tree.branches[tree.nbranch][0] = inode;\r
+ tree.branches[tree.nbranch][1] = ison = nodes[inode].sons[i];\r
+ nodes[ison].ibranch = tree.nbranch++;\r
+ if(nodes[ison].nson>0) NodeToBranchSub(ison);\r
+ }\r
+}\r
+\r
+void NodeToBranch (void)\r
+{\r
+ tree.nbranch=0;\r
+ NodeToBranchSub (tree.root);\r
+ if(tree.nnode != tree.nbranch+1)\r
+ error2("nnode != nbranch + 1?");\r
+}\r
+\r
+\r
+void ClearNode (int inode)\r
+{\r
+/* a source of confusion. Try not to use this routine.\r
+*/\r
+ nodes[inode].father = nodes[inode].ibranch = -1;\r
+ nodes[inode].nson = 0;\r
+ nodes[inode].branch = nodes[inode].age = 0;\r
+ /* nodes[inode].label = -1; */\r
+ /* nodes[inode].branch = 0; clear node structure only, not branch lengths */\r
+ /* for(i=0; i<com.ns; i++) nodes[inode].sons[i]=-1; */\r
+}\r
+\r
+int ReadTreeB (FILE *ftree, int popline)\r
+{\r
+ char line[255];\r
+ int nodemark[NS*2-1]={0}; /* 0: absent; 1: father only (root); 2: son */\r
+ int i,j, state=0, YoungAncestor=0;\r
+\r
+ if(com.clock) {\r
+ puts("\nbranch representation of tree might not work with clock model.");\r
+ getchar();\r
+ }\r
+\r
+ fscanf (ftree, "%d", &tree.nbranch);\r
+ for(j=0; j<tree.nbranch; j++) {\r
+ for(i=0; i<2; i++) {\r
+ if (fscanf (ftree, "%d", & tree.branches[j][i]) != 1) state=-1;\r
+ tree.branches[j][i]--;\r
+ if(tree.branches[j][i]<0 || tree.branches[j][i]>com.ns*2-1) \r
+ error2("ReadTreeB: node numbers out of range");\r
+ }\r
+ nodemark[tree.branches[j][1]]=2;\r
+ if(nodemark[tree.branches[j][0]]!=2) nodemark[tree.branches[j][0]]=1;\r
+ if (tree.branches[j][0]<com.ns) YoungAncestor=1;\r
+\r
+ printf ("\nBranch #%3d: %3d -> %3d",j+1,tree.branches[j][0]+1,tree.branches[j][1]+1);\r
+\r
+ }\r
+ if(popline) fgets(line, 254, ftree);\r
+ for(i=0,tree.root=-1; i<tree.nbranch; i++) \r
+ if(nodemark[tree.branches[i][0]]!=2) tree.root=tree.branches[i][0];\r
+ if(tree.root==-1) error2("root err");\r
+ for(i=0; i<com.ns; i++)\r
+ if(nodemark[i]==0) {\r
+ matIout(F0,nodemark,1,com.ns);\r
+ error2("branch specification of tree");\r
+ }\r
+\r
+ if(YoungAncestor) {\r
+ puts("\nAncestors in the data? Take care.");\r
+ if(!com.cleandata) {\r
+ puts("This kind of tree does not work with unclean data.");\r
+ getchar();\r
+ }\r
+ }\r
+\r
+/*\r
+ com.ntime = com.clock ? (tree.nbranch+1)-com.ns+(tree.root<com.ns)\r
+ : tree.nbranch;\r
+*/\r
+\r
+ BranchToNode ();\r
+ return (state);\r
+}\r
+\r
+\r
+int OutTreeB (FILE *fout)\r
+{\r
+ int j;\r
+ char *fmt[]={" %3d..%-3d", " %2d..%-2d"};\r
+ FOR (j, tree.nbranch)\r
+ fprintf(fout, fmt[0], tree.branches[j][0]+1,tree.branches[j][1]+1);\r
+ return (0);\r
+}\r
+\r
+int GetTreeFileType(FILE *ftree, int *ntree, int *pauptree, int shortform);\r
+\r
+int GetTreeFileType(FILE *ftree, int *ntree, int *pauptree, int shortform)\r
+{\r
+/* paupstart="begin trees" and paupend="translate" identify paup tree files.\r
+ paupch=";" will be the last character before the list of trees.\r
+ Modify if necessary.\r
+*/\r
+ int i,k, lline=32000, ch=0, paupch=';';\r
+ char line[32000];\r
+ char *paupstart="begin tree", *paupend="translate";\r
+\r
+ *pauptree=0;\r
+ k=fscanf(ftree,"%d%d",&i,ntree);\r
+ if(k==2) {\r
+ if(i==com.ns) return(0); /* old paml style */\r
+ else error2("Number of sequences different in tree and seq files.");\r
+ }\r
+ else if(k==1) { *ntree=i; return(0); } /* phylip & molphy style */\r
+ while(ch!='(' && !isalnum(ch) && ch!=EOF) ch=fgetc(ftree); /* treeview style */\r
+ if(ch=='(') { *ntree=-1; ungetc(ch,ftree); return(0); }\r
+\r
+ puts("\n# seqs in tree file does not match. Read as the nexus format.");\r
+ for ( ; ; ) {\r
+ if(fgets(line,lline,ftree)==NULL) error2("tree err1: EOF");\r
+ strcase(line,0);\r
+ if (strstr(line,paupstart)) { *pauptree=1; *ntree=-1; break; }\r
+ }\r
+ if(shortform) return(0);\r
+ for ( ; ; ) {\r
+ if(fgets(line,lline,ftree)==NULL) error2("tree err2: EOF");\r
+ strcase(line,0);\r
+ if (strstr(line,paupend)) break;\r
+ }\r
+ for ( ; ; ) {\r
+ if((ch=fgetc(ftree))==EOF) error2("tree err3: EOF");\r
+ if (ch==paupch) break;\r
+ }\r
+ if(fgets(line,lline,ftree)==NULL) error2("tree err4: EOF");\r
+\r
+ return(0);\r
+}\r
+\r
+int PopPaupTreeRubbish(FILE *ftree);\r
+int PopPaupTreeRubbish(FILE *ftree)\r
+{\r
+/* This reads out the string in front of the tree in the nexus format, \r
+ typically "tree PAUP_1 = [&U]" with "[&U]" optional\r
+*/\r
+ int ch;\r
+\r
+ for (; ;) {\r
+ ch=fgetc(ftree);\r
+ if(ch=='(')\r
+ { ungetc(ch,ftree); return(0); }\r
+ else if(ch==EOF || ch=='/') \r
+ return(-1);\r
+ }\r
+ return(0);\r
+}\r
+\r
+\r
+static int *CladeLabel = NULL;\r
+\r
+void DownTreeCladeLabel (int inode, int cLabel)\r
+{\r
+/* This goes down the tree to change $ labels (stored in CladeLabel[]) into\r
+ # labels (stored in nodes[].label). To deal with nested clade labels,\r
+ branches within a clade are labeled by negative numbers initially, and \r
+ converted to positive labels at the end of the algorithm.\r
+\r
+ nodes[].label and CladeLabel[] are initialized to -1 before this routine \r
+ is called.\r
+*/\r
+ int i, label;\r
+\r
+ label = cLabel;\r
+ if(CladeLabel[inode] != -1) \r
+ label = CladeLabel[inode];\r
+ if(inode != tree.root && nodes[inode].label == -1) \r
+ nodes[inode].label = label;\r
+ for(i=0; i<nodes[inode].nson; i++)\r
+ DownTreeCladeLabel(nodes[inode].sons[i], label);\r
+}\r
+\r
+int IsNameNumber(char line[])\r
+{\r
+/* returns 0 if line has species number; 1 if it has name.\r
+ It uses com.ns.\r
+*/\r
+ int isname=1, alldigits=1, n;\r
+ char *p=line;\r
+\r
+ while(*p)\r
+ if(!isdigit(*p++)) { alldigits=0; break; }\r
+ if(alldigits) {\r
+ n = atoi(line);\r
+ if(n>=1 && n<=com.ns) isname = 0;\r
+ }\r
+ return(isname);\r
+}\r
+\r
+\r
+\r
+int ReadTreeN (FILE *ftree, int *haslength, int *haslabel, int copyname, int popline)\r
+{\r
+/* Read a tree from ftree, using the parenthesis node representation of trees.\r
+ Branch lengths are read in nodes[].branch, and branch (node) labels \r
+ (integers) are preceeded by # and read in nodes[].label. If the clade label\r
+ $ is used, the label is read into CladeLabel[] first and then moved into\r
+ nodes[].label in the routine DownTreeCladeLabel().\r
+\r
+ This assumes that com.ns is known.\r
+ Species names are considered case-sensitive, with trailing spaces ignored.\r
+\r
+ copyname = 0: species numbers and names are both accepted, but names have \r
+ to match the names in com.spname[], which are from the \r
+ sequence data file. Used by baseml and codeml, for example.\r
+ 1: species names are copied into com.spname[], but species \r
+ numbers are accepted. Used by evolver for simulation, \r
+ in which case no species names were read before.\r
+ 2: the tree must have species names, which are copied into com.spname[].\r
+ Note that com.ns is assumed known. To remove this restrition, \r
+ one has to consider the space for nodes[], CladeLabel, starting \r
+ node number etc.\r
+\r
+ isname = 0: species number; 1: species name;\r
+\r
+ Ziheng note (18/12/2011): I have changed the code so that sequence number is not used \r
+ anymore. isname = 1 always.\r
+*/\r
+ int cnode, cfather=-1; /* current node and father */\r
+ int inodeb=0; /* node number that will have the next branch length */\r
+ int cladeLabels=0, i,j,k, level=0, isname, ch=' ', icurspecies=0;\r
+ char check[NS], delimiters[]="(),:#$=@><;", quote[]="\"\'";\r
+ int lline=32000;\r
+ char line[32000], *pch;\r
+\r
+ if(com.ns<=0) error2("you should specify # seqs in the tree file.");\r
+\r
+ if((CladeLabel=(int*)malloc((com.ns*2-1)*sizeof(int)))==NULL) \r
+ error2("oom trying to get space for cladelabel");\r
+ for(i=0; i<2*com.ns-1; i++) \r
+ CladeLabel[i] = -1;\r
+\r
+ /* initialization */\r
+ for(i=0; i<com.ns; i++) check[i]=0;\r
+ *haslength = 0; *haslabel = 0;\r
+ tree.nnode = com.ns; tree.nbranch = 0;\r
+ for(i=0; i<2*com.ns-1; i++) {\r
+ nodes[i].father = nodes[i].ibranch = -1;\r
+ nodes[i].nson = 0; nodes[i].label = -1; nodes[i].branch = 0;\r
+ nodes[i].age = 0; /* TipDate models set this for each tree later. */\r
+#if (defined(BASEML) || defined(CODEML))\r
+ nodes[i].fossil = 0;\r
+#endif\r
+ }\r
+ while(isspace(ch))\r
+ ch=fgetc(ftree); /* skip spaces */\r
+ ungetc(ch,ftree);\r
+ if (isdigit(ch))\r
+ { ReadTreeB(ftree,popline); return(0); }\r
+\r
+ if(PopPaupTreeRubbish(ftree) == -1) return(-1);\r
+\r
+ for ( ; ; ) {\r
+ ch = fgetc (ftree);\r
+ if (ch==EOF) return(-1);\r
+ else if (ch == ';') {\r
+ if(level!=0) error2("; in treefile");\r
+ else break;\r
+ }\r
+ else if (ch==',') ;\r
+ else if (!isgraph(ch))\r
+ continue;\r
+ else if (ch == '(') { /* left ( */\r
+ level++;\r
+ cnode=tree.nnode++;\r
+ if(tree.nnode>2*com.ns-1)\r
+ error2("check #seqs and tree: perhaps too many '('?");\r
+ if (cfather >= 0) {\r
+ if(nodes[cfather].nson >= MAXNSONS) {\r
+ printf("there are at least %d daughter nodes, raise MAXNSONS?", nodes[cfather].nson);\r
+ exit(-1);\r
+ }\r
+ nodes[cfather].sons[nodes[cfather].nson++] = cnode;\r
+ nodes[cnode].father = cfather;\r
+ tree.branches[tree.nbranch][0] = cfather;\r
+ tree.branches[tree.nbranch][1] = cnode;\r
+ nodes[cnode].ibranch = tree.nbranch++;\r
+ }\r
+ else\r
+ tree.root = cnode;\r
+ cfather = cnode;\r
+ }\r
+ /* treating : and > in the same way is risky. */\r
+ else if (ch==')') {\r
+ level--; inodeb=cfather; cfather=nodes[cfather].father; \r
+ }\r
+ else if (ch==':'||ch=='>') { \r
+ if(ch==':') *haslength=1;\r
+ fscanf(ftree, "%lf", &nodes[inodeb].branch); \r
+ }\r
+ else if (ch==quote[0] || ch==quote[1]) {\r
+ for (k=0; ; k++) { /* read notes into line[] */\r
+ line[k] = (char)fgetc(ftree);\r
+ if((int)line[k] == EOF)\r
+ error2("EOF when reading node label");\r
+ if(line[k] == quote[0] || line[k] == quote[1])\r
+ break;\r
+ }\r
+ line[k++] = '\0';\r
+ nodes[inodeb].nodeStr = (char*)malloc(k*sizeof(char));\r
+ if (nodes[inodeb].nodeStr == NULL) error2("oom nodeStr");\r
+ strcpy(nodes[inodeb].nodeStr, line);\r
+ if((pch = strchr(line,'#')) || (pch = strchr(line,'<'))) {\r
+ *haslabel=1; sscanf(pch+1, "%lf", &nodes[inodeb].label); \r
+ }\r
+ if((pch = strchr(line,'>'))) {\r
+ sscanf(pch+1, "%lf", &nodes[inodeb].branch); \r
+ }\r
+ if((pch = strchr(line,'$'))) {\r
+ *haslabel=1; sscanf(pch+1, "%d", &CladeLabel[inodeb]);\r
+ }\r
+ if((pch = strchr(line,'=')) || (pch = strchr(line,'@'))) {\r
+ sscanf(pch+1, "%lf", &nodes[inodeb].age);\r
+#if (defined(BASEML) || defined(CODEML))\r
+ if(com.clock) nodes[inodeb].fossil = 1;\r
+#endif\r
+#if (defined(CODEML))\r
+ nodes[inodeb].omega = 0;\r
+#endif\r
+ }\r
+ }\r
+ else if (ch=='#' || ch=='<') { *haslabel=1; fscanf(ftree, "%lf", &nodes[inodeb].label); }\r
+ else if (ch=='$') { *haslabel=1; fscanf(ftree, "%d", &CladeLabel[inodeb]); }\r
+ else if (ch=='@' || ch=='=') { \r
+ fscanf(ftree,"%lf", &nodes[inodeb].age);\r
+#if (defined(BASEML) || defined(CODEML))\r
+ if(com.clock) nodes[inodeb].fossil = 1;\r
+#endif\r
+#if (defined(CODEML))\r
+ nodes[inodeb].omega = 0;\r
+#endif\r
+ }\r
+ else { /* read species name or number */\r
+ if(level<=0) \r
+ error2("expecting ; in the tree file");\r
+ line[0]=(char)ch; line[1]=(char)fgetc(ftree);\r
+/* if(line[1]==(char)EOF) error2("eof in tree file"); */\r
+\r
+ for (i=1; i<lline; ) { /* read species name into line[] until delimiter */\r
+ if ((strchr(delimiters,line[i]) && line[i]!='@') \r
+ || line[i]==(char)EOF || line[i]=='\n')\r
+ { ungetc(line[i],ftree); line[i]=0; break; }\r
+ line[++i]=(char)fgetc(ftree);\r
+ }\r
+ for(j=i-1;j>0;j--) /* trim spaces*/\r
+ if(isgraph(line[j])) break; else line[j]=0;\r
+\r
+ if(FullSeqNames)\r
+ isname = 1; /* numbers are part of names. */\r
+ else\r
+ isname = IsNameNumber(line);\r
+\r
+ if (isname==0) { /* number */\r
+ if(copyname==2) error2("Use names in tree.");\r
+ sscanf(line, "%d", &cnode);\r
+ cnode--;\r
+ }\r
+ else { /* name */\r
+ if(!copyname) {\r
+ for(i=0; i<com.ns; i++) if (!strcmp(line,com.spname[i])) break;\r
+ if((cnode=i)==com.ns) \r
+ { printf("\nSpecies %s?\n", line); exit(-1); }\r
+ }\r
+ else {\r
+ if(icurspecies>com.ns-1) {\r
+ error2("error in tree: too many species in tree");\r
+ }\r
+ strcpy(com.spname[cnode=icurspecies++], line);\r
+ }\r
+ }\r
+ nodes[cnode].father=cfather;\r
+ if(nodes[cfather].nson>=MAXNSONS)\r
+ error2("too many daughter nodes, raise MAXNSONS");\r
+\r
+ nodes[cfather].sons[nodes[cfather].nson++] = cnode;\r
+ tree.branches[tree.nbranch][0] = cfather;\r
+ tree.branches[tree.nbranch][1] = cnode;\r
+ nodes[cnode].ibranch = tree.nbranch++;\r
+ inodeb = cnode;\r
+ check[cnode]++;\r
+ }\r
+ }\r
+\r
+ if (popline) \r
+ fgets(line, lline, ftree);\r
+ for(i=0; i<com.ns; i++) {\r
+ if(check[i]>1) {\r
+ printf("\nSeq #%d occurs more than once in the tree\n",i+1); exit(-1); \r
+ }\r
+ else if(check[i]<1) {\r
+ printf("\nSeq #%d (%s) is missing in the tree\n", i+1, com.spname[i]);\r
+ exit(-1); \r
+ }\r
+ }\r
+ if(tree.nbranch>2*com.ns-2) { \r
+ printf("nbranch %d", tree.nbranch); puts("too many branches in tree?");\r
+ }\r
+ if (tree.nnode != tree.nbranch+1) {\r
+ printf ("\nnnode%6d != nbranch%6d + 1\n", tree.nnode, tree.nbranch);\r
+ exit(-1);\r
+ }\r
+\r
+/* check that it is o.k. to comment out this line\r
+ com.ntime = com.clock ? (tree.nbranch+1)-com.ns+(tree.root<com.ns)\r
+ : tree.nbranch;\r
+*/\r
+\r
+\r
+ /* check and convert clade labels $ */\r
+#if(defined(BASEML) || defined(CODEML))\r
+#if(defined(BASEML))\r
+ if(com.seqtype==0 && com.nhomo==5) cladeLabels = 1;\r
+#endif\r
+ if(com.clock>1 || (com.seqtype==1 && com.model>=2)) cladeLabels = 1;\r
+ if(cladeLabels) {\r
+ for(i=0,j=0; i<tree.nnode; i++) {\r
+ if(CladeLabel[i] != -1) j++;\r
+ }\r
+ if(j) { /* j is number of clade labels */\r
+ DownTreeCladeLabel(tree.root, 0);\r
+ }\r
+\r
+ /*** Somehow some labels are still -1 after this, so I changed this. Needs checking. ***/\r
+ for(i=0; i<tree.nnode; i++) \r
+ if(i!=tree.root && nodes[i].label==-1) nodes[i].label = 0;\r
+\r
+ /* OutTreeN(F0,1,PrBranch|PrNodeNum); FPN(F0); */\r
+ /* FPN(F0); OutTreeN(F0,1,PrLabel); FPN(F0); */\r
+\r
+ for(i=0,com.nbtype=0; i<tree.nnode; i++) { \r
+ if(i == tree.root) continue;\r
+ j = (int)nodes[i].label;\r
+ if(j+1 > com.nbtype) com.nbtype = j+1;\r
+ if(j<0 || j>tree.nbranch-1) \r
+ error2("branch label in the tree (note labels start from 0 and are consecutive)");\r
+ }\r
+ if (com.nbtype<=1)\r
+ error2("need branch labels in the tree for the model.");\r
+ else {\r
+ printf("\n%d branch types are in tree. Stop if wrong.", com.nbtype);\r
+ }\r
+\r
+#if(defined(CODEML))\r
+ if(com.seqtype==1 && com.NSsites==2 && com.model==3 && com.nbtype>NBTYPE) \r
+ error2("nbtype too large. Raise NBTYPE");\r
+ else if(com.seqtype==1 && com.NSsites && com.model==2 && com.nbtype!=2)\r
+ error2("only two branch types are allowed for branch models.");\r
+#endif\r
+\r
+ }\r
+#endif\r
+\r
+ free(CladeLabel);\r
+ return (0);\r
+}\r
+\r
+\r
+\r
+int OutSubTreeN (FILE *fout, int inode, int spnames, int printopt, char *labelfmt);\r
+\r
+int OutSubTreeN (FILE *fout, int inode, int spnames, int printopt, char *labelfmt)\r
+{\r
+ int i, dad = nodes[inode].father, nsib = (inode==tree.root ? 0 : nodes[dad].nson);\r
+\r
+ if(inode != tree.root && inode == nodes[dad].sons[0])\r
+ fputc ('(', fout);\r
+\r
+ for(i=0; i<nodes[inode].nson; i++)\r
+ OutSubTreeN(fout, nodes[inode].sons[i], spnames, printopt, labelfmt);\r
+\r
+ if(nodes[inode].nson==0) { /* inode is tip */\r
+ if(spnames) {\r
+ if(printopt & PrNodeNum) fprintf(fout, "%d_", inode+1);\r
+ fprintf(fout, "%s", com.spname[inode]);\r
+ }\r
+ else \r
+ fprintf(fout, "%d", inode+1);\r
+ }\r
+ if((printopt & PrNodeNum) && nodes[inode].nson) \r
+ fprintf(fout," %d ", inode+1);\r
+ if((printopt & PrLabel) && nodes[inode].label>0)\r
+ fprintf(fout, labelfmt, nodes[inode].label);\r
+ if((printopt & PrAge) && nodes[inode].age) \r
+ fprintf(fout, " @%.6f", nodes[inode].age);\r
+\r
+/* Add branch labels to be read by Rod Page's TreeView. */\r
+#if (defined CODEML)\r
+ if((printopt & PrOmega) && inode != tree.root)\r
+ fprintf(fout, " #%.4f ", nodes[inode].omega);\r
+#elif (defined (EVOLVER) || defined (MCMCTREE))\r
+ if((printopt & PrLabel) && nodes[inode].nodeStr && nodes[inode].nodeStr[0])\r
+ fprintf(fout, " %s", nodes[inode].nodeStr);\r
+#endif\r
+ \r
+ if((printopt & PrBranch) && (inode!=tree.root || nodes[inode].branch>0))\r
+ fprintf(fout, ": %.6f", nodes[inode].branch);\r
+ /*\r
+ if((printopt & PrBranch) && nodes[inode].age>0) // print node ages instead of branch lengths \r
+ fprintf(fout, ": %.6f", nodes[inode].age);\r
+ */\r
+\r
+ if(nsib == 0) /* root */\r
+ fputc(';', fout);\r
+ else if (inode == nodes[dad].sons[nsib-1]) /* last sib */\r
+ fputc(')', fout);\r
+ else /* not last sib */\r
+ fprintf(fout, ", ");\r
+\r
+ return (0);\r
+}\r
+\r
+\r
+int OutTreeN (FILE *fout, int spnames, int printopt)\r
+{\r
+/* print the current tree.\r
+ Can the block of print statements be moved inside the recursive function?\r
+*/\r
+ int i, intlabel=1;\r
+ char* labelfmt[2]={"#%.6f", "#%.0f"};\r
+\r
+ if(printopt & PrLabel) {\r
+ for(i=0; i<tree.nnode; i++) \r
+ if(nodes[i].label-(int)nodes[i].label != 0) intlabel=0;\r
+ }\r
+\r
+ OutSubTreeN(fout, tree.root, spnames, printopt, labelfmt[intlabel]);\r
+\r
+ return(0);\r
+}\r
+\r
+\r
+int DeRoot (void)\r
+{\r
+/* This cnages the bifurcation at the root into a trifurcation, but setting one of \r
+ the sons to be the new root. The new root is the first son that is not a tip. \r
+ tree.nnode is updated, but the routine does not re-number the nodes, so the new\r
+ node labels do not go from ns, ns + 1, ..., as they normally should.\r
+*/\r
+ int i, ison, sib, root = tree.root;\r
+\r
+ if(nodes[root].nson!=2) error2("in DeRoot?");\r
+\r
+ ison = nodes[root].sons[i = 0];\r
+ if(nodes[ison].nson==0)\r
+ ison = nodes[root].sons[i = 1];\r
+ sib = nodes[root].sons[1 - i];\r
+ nodes[sib].branch += nodes[ison].branch;\r
+ nodes[sib].father = tree.root = ison;\r
+ nodes[tree.root].father = -1;\r
+ nodes[tree.root].sons[nodes[tree.root].nson++] = sib; /* sib added as the last child of the new root */\r
+ nodes[tree.root].branch = 0;\r
+ tree.nnode --; /* added 2007/4/9 */\r
+ return(0);\r
+}\r
+\r
+int PruneSubTreeN (int inode, int keep[])\r
+{\r
+/* This prunes tips from the tree, using keep[com.ns]. Removed nodes in the \r
+ big tree has nodes[].father=-1 and nodes[].nson=0.\r
+ Do not change nodes[inode].nson and nodes[inode].sons[] until after the \r
+ node's descendent nodes are all processed. So when a son is deleted, \r
+ only the father node's nson is changed, but not \r
+*/\r
+ int i,j, ison, father=nodes[inode].father, nson0=nodes[inode].nson;\r
+\r
+ nodes[inode].label = 0;\r
+ for(i=0; i<nson0; i++)\r
+ PruneSubTreeN(nodes[inode].sons[i], keep);\r
+\r
+ /* remove inode because of no descendents. \r
+ Note this does not touch the father node */\r
+ if(inode<com.ns && keep[inode]==0)\r
+ nodes[inode].father = -1;\r
+ else if(inode>=com.ns) {\r
+ for(i=0,nodes[inode].nson=0; i<nson0; i++) {\r
+ ison = nodes[inode].sons[i];\r
+ if(nodes[ison].father!=-1) \r
+ nodes[inode].sons[ nodes[inode].nson++ ] = nodes[inode].sons[i];\r
+ }\r
+ if(nodes[inode].nson == 0)\r
+ nodes[inode].father = -1;\r
+ }\r
+\r
+ /* remove inode if it has a single descendent ison */\r
+ if(inode>=com.ns && nodes[inode].nson==1 && inode!=tree.root) {\r
+ ison = nodes[inode].sons[0];\r
+ nodes[ison].father = father;\r
+ nodes[ison].branch += nodes[inode].branch;\r
+ nodes[ison].label ++; /* records # deleted nodes for branch ison */\r
+ for(j=0; j<nodes[father].nson; j++) {\r
+ if(nodes[father].sons[j]==inode)\r
+ { nodes[father].sons[j] = ison; break; }\r
+ }\r
+ nodes[inode].nson = 0;\r
+ nodes[inode].father = -1;\r
+ }\r
+ else if(nodes[inode].nson==1 && inode==tree.root) { /* move down root if root has 1 descendent */\r
+ nodes[inode].father = -1;\r
+ nodes[inode].nson = 0;\r
+ ison = nodes[tree.root].sons[0];\r
+ tree.root = ison;\r
+ nodes[tree.root].father = -1;\r
+ nodes[tree.root].branch = 0;\r
+ }\r
+\r
+ /*\r
+ printf("\nVisiting inode %d\n", inode);\r
+ FOR(i, tree.nnode) printf(" %2d", i); FPN(F0);\r
+ FOR(i, tree.nnode) printf(" %2.0f", nodes[i].label); FPN(F0);\r
+ */\r
+ return(0);\r
+}\r
+\r
+\r
+int GetSubTreeN (int keep[], int space[])\r
+{\r
+/* This removes some tips to generate the subtree. Branch lengths are \r
+ preserved by summing them up when some nodes are removed. \r
+ The algorithm use post-order tree traversal to remove tips and nodes. It \r
+ then switches to the branch representation to renumber nodes.\r
+ space[] can be NULL. If not, it returns newnodeNO[], which holds the \r
+ new node numbers; for exmaple, newnodeNO[12]=5 means that old node 12 now \r
+ becomes node 5.\r
+\r
+ The routine does not change com.ns or com.spname[], which have to be updated \r
+ outside.\r
+\r
+ CHANGE OF ROOT happens if the root in the old tree had >=3 sons, but has 2 \r
+ sons in the new tree and if (!com.clock). In that case, the tree is derooted.\r
+\r
+ This routine does not work if a current seq is ancestral to some others \r
+ and if that sequence is removed. (***check this comment ***)\r
+ \r
+ Different formats for keep[] are used. Suppose the current tree is for \r
+ nine species: a b c d e f g h i.\r
+ \r
+ (A) keep[]={1,0,1,1,1,0,0,1,0} means that a c d e h are kept in the tree. \r
+ The old tip numbers are not changed, so that OutTreeN(?,1,?) gives the \r
+ correct species names or OutTreeN(?,0,?) gives the old species numbers.\r
+\r
+ (B) keep[]={1,0,2,3,4,0,0,5,0} means that a c d e h are kept in the tree, and \r
+ they are renumbered 0 1 2 3 4 and all the internal nodes are renumbered \r
+ as well to be consecutive. Note that the positive numbers have to be \r
+ consecutive natural numbers.\r
+\r
+ keep[]={5,0,2,1,4,0,0,3,0} means that a c d e h are kept in the tree. \r
+ However, the order of the sequences are changed to d c h e a, so that the \r
+ numbers are now 0 1 2 3 4 for d c h e a. This is useful when the subtree \r
+ is extracted from a big tree for a subset of the sequence data, while the \r
+ species are odered d c h e a in the sequence data file.\r
+ This option can be used to renumber the tips in the complete tree.\r
+*/\r
+ int nsnew, i,j,k, nnode0=tree.nnode, sumnumber=0, newnodeNO[2*NS-1], ison, sib;\r
+ int unrooted = (nodes[tree.root].nson>=3); /* com.clock is not checked here! */\r
+ double *branch0;\r
+ int debug=0;\r
+\r
+ if(debug) { FOR(i,com.ns) printf("%-30s %2d\n", com.spname[i], keep[i]); }\r
+ for(i=0,nsnew=0; i<com.ns; i++)\r
+ if(keep[i]) { nsnew++; sumnumber+=keep[i]; }\r
+ if(nsnew<2) return(-1);\r
+\r
+ /* mark removed nodes in the big tree by father=-1 && nson=0.\r
+ nodes[].label records the number of nodes collapsed.\r
+ */\r
+ PruneSubTreeN(tree.root, keep);\r
+ /* If unrooted tree has a bifurcation at the new root, collapse root. */\r
+ if (unrooted && nodes[tree.root].nson==2) {\r
+ ison = nodes[tree.root].sons[i = 0];\r
+ if(nodes[ison].nson==0)\r
+ ison = nodes[tree.root].sons[i = 1];\r
+ sib = nodes[tree.root].sons[1 - i];\r
+\r
+ nodes[sib].branch += nodes[ison].branch;\r
+ nodes[sib].label += nodes[ison].label + 2;\r
+ nodes[sib].father = tree.root = ison;\r
+ nodes[tree.root].father = -1;\r
+ nodes[tree.root].sons[nodes[tree.root].nson++] = sib; /* sib added as the last child of the new root */\r
+ nodes[tree.root].branch = 0;\r
+ }\r
+ if(debug) printtree(1);\r
+\r
+ for(i=0,k=1; i<tree.nnode; i++) if(nodes[i].father!=-1) k++;\r
+ tree.nnode = k;\r
+ NodeToBranch();\r
+\r
+ /* to renumber the nodes */\r
+ if(sumnumber>nsnew) {\r
+ if(sumnumber != nsnew*(nsnew+1)/2)\r
+ error2("keep[] not right in GetSubTreeN");\r
+\r
+ if((branch0=(double*)malloc(nnode0*sizeof(double)))==NULL) error2("oom#");\r
+ FOR(i,nnode0) branch0[i] = nodes[i].branch;\r
+ FOR(i,nnode0) newnodeNO[i] = -1;\r
+ FOR(i,com.ns) if(keep[i]) newnodeNO[i] = keep[i]-1;\r
+\r
+ newnodeNO[tree.root] = k = nsnew;\r
+ tree.root = k++;\r
+ for( ; i<nnode0; i++) {\r
+ if(nodes[i].father==-1) continue;\r
+ for(j=0; j<tree.nbranch; j++)\r
+ if(i==tree.branches[j][1]) break;\r
+ if(j==tree.nbranch)\r
+ error2("strange here");\r
+ newnodeNO[i] = k++;\r
+ }\r
+ for(j=0; j<tree.nbranch; j++) FOR(i,2)\r
+ tree.branches[j][i] = newnodeNO[tree.branches[j][i]];\r
+ BranchToNode();\r
+ for(i=0; i<nnode0; i++) {\r
+ if(newnodeNO[i]>-1)\r
+ nodes[newnodeNO[i]].branch = branch0[i];\r
+ }\r
+ free(branch0);\r
+ }\r
+\r
+ if(space) memmove(space, newnodeNO, (com.ns*2-1)*sizeof(int));\r
+ return (0);\r
+}\r
+\r
+\r
+void printtree (int timebranches)\r
+{\r
+ int i,j;\r
+\r
+ printf("\nns = %d nnode = %d", com.ns, tree.nnode);\r
+ printf("\n%7s%7s", "father","node");\r
+ if(timebranches) printf("%10s%10s%10s", "age", "branch", "label");\r
+ printf(" %7s%7s", "nson:","sons");\r
+ FOR (i, tree.nnode) {\r
+ printf ("\n%7d%7d", nodes[i].father, i);\r
+ if(timebranches)\r
+ printf(" %9.6f %9.6f %9.0f", nodes[i].age, nodes[i].branch,nodes[i].label);\r
+\r
+ printf ("%7d: ", nodes[i].nson);\r
+ FOR(j,nodes[i].nson) printf(" %2d", nodes[i].sons[j]);\r
+ }\r
+ FPN(F0); \r
+ OutTreeN(F0,0,0); FPN(F0); \r
+ OutTreeN(F0,1,0); FPN(F0); \r
+ OutTreeN(F0,1,1); FPN(F0); \r
+}\r
+\r
+\r
+void PointconPnodes (void)\r
+{\r
+/* This points the nodes[com.ns+inode].conP to the right space in com.conP.\r
+ The space is different depending on com.cleandata (0 or 1)\r
+ This routine updates internal nodes com.conP only. \r
+ End nodes (com.conP0) are updated in InitConditionalPNode().\r
+*/\r
+ int nintern=0, i;\r
+\r
+ for(i=0; i<tree.nbranch+1; i++)\r
+ if(nodes[i].nson>0) /* more thinking */\r
+ nodes[i].conP = com.conP + com.ncode*com.npatt*nintern ++;\r
+}\r
+\r
+\r
+int SetxInitials (int np, double x[], double xb[][2])\r
+{\r
+/* This forces initial values into the boundary of the space\r
+*/\r
+ int i;\r
+\r
+ for (i=com.ntime; i<np; i++) {\r
+ if (x[i]<xb[i][0]*1.005) x[i]=xb[i][0]*1.05;\r
+ if (x[i]>xb[i][1]/1.005) x[i]=xb[i][1]/1.05;\r
+ }\r
+ for (i=0; i<com.np; i++) {\r
+ if (x[i]<xb[i][0]) x[i]=xb[i][0]*1.2;\r
+ if (x[i]>xb[i][1]) x[i]=xb[i][1]*.8;\r
+ }\r
+ return(0);\r
+}\r
+\r
+\r
+#if(defined(BASEML) || defined(CODEML) || defined(MCMCTREE))\r
+\r
+int GetTipDate (double *TipDate, double *TipDate_TimeUnit)\r
+{\r
+/* This scans sequence names to collect the sampling dates. The last field of \r
+ the sequence name is assumed to contain the date.\r
+ Divergence times are rescaled by using TipDate_TimeUnit.\r
+*/\r
+ int i, j, indate, ndates=0;\r
+ double young=-1, old=-1;\r
+ char *p;\r
+\r
+ *TipDate = 0;\r
+ for(i=0,ndates=0; i<com.ns; i++) {\r
+ nodes[i].age = 0;\r
+ j = strlen(com.spname[i]);\r
+ for(indate=0,p=com.spname[i]+j-1; j>=0; j--,p--) {\r
+ if(isdigit(*p) || *p=='.') indate=1;\r
+ else if(indate) \r
+ break;\r
+ }\r
+ sscanf(p+1, "%lf", &nodes[i].age);\r
+ if(nodes[i].age<=0)\r
+ error2("Tip date <= 0");\r
+ else \r
+ ndates++;\r
+\r
+ if(i==0)\r
+ young = old = nodes[i].age;\r
+ else {\r
+ old = min2(old, nodes[i].age);\r
+ young = max2(young, nodes[i].age);\r
+ }\r
+ }\r
+ if(ndates==0) {\r
+ if(*TipDate_TimeUnit == -1) *TipDate_TimeUnit = 1;\r
+ return(0);\r
+ }\r
+ else if (ndates!=com.ns) {\r
+ printf("TipDate model requires date for each sequence.");\r
+ }\r
+\r
+ /* TipDate models */\r
+ if(ndates != com.ns) \r
+ error2("TipDate model: each sequence must have a date");\r
+ *TipDate = young;\r
+ if(*TipDate_TimeUnit <= 0) \r
+ *TipDate_TimeUnit = (young - old)*2.5;\r
+ if(young - old < 1e-30)\r
+ error2("TipDate: all sequences are of the same age?");\r
+ for(i=0; i<tree.nnode; i++) {\r
+ if(i<com.ns || nodes[i].fossil) {\r
+ nodes[i].age = (young - nodes[i].age) / *TipDate_TimeUnit;\r
+ if(nodes[i].age<1e-20) nodes[i].age = 0;\r
+ }\r
+ }\r
+\r
+ if(noisy) printf("\nTipDate model\nDate range: (%.2f, %.2f) => (0, %.2f). TimeUnit = %.2f.\n",\r
+ young, old, (young-old)/ *TipDate_TimeUnit, *TipDate_TimeUnit);\r
+\r
+ return(0);\r
+}\r
+\r
+#endif\r
+\r
+\r
+#if(defined(BASEML) || defined(CODEML))\r
+\r
+double *AgeLow=NULL;\r
+int NFossils=0, AbsoluteRate=0;\r
+/* TipDate models: \r
+ MutationRate = mut/TipDate_TimeUnit; \r
+ age = age*TipDate_TimeUnit \r
+*/\r
+\r
+void SetAge(int inode, double x[]);\r
+void GetAgeLow (int inode);\r
+/* number of internal node times, usd to deal with known ancestors. Broken? */\r
+static int innode_time=0; \r
+\r
+/* Ziheng Yang, 25 January 2003\r
+ The following routines deal with clock and local clock models, including \r
+ Andrew Rambaut's TipDate models (Rambaut 2000 Bioinformatics 16:395-399;\r
+ Yoder & Yang 2000 Mol Biol Evol 17:1081-1090; Yang & Yoder 2003 Syst Biol).\r
+ The tree is rooted. The routine SetAge assumes that ancestral nodes are\r
+ arranged in the increasing order and so works only if the input tree uses \r
+ the parenthesis notation and not the branch notation. The option of known \r
+ ancestors is probably broken.\r
+\r
+ The flag AbsoluteRate=1 if(TipDate || NFossils). This could be removed\r
+ as the flags TipDate and NFossils are sufficient.\r
+ clock = 1: global clock, deals with TipDate with no or many fossils, \r
+ ignores branch rates (#) in tree if any.\r
+ = 2: local clock models, as above, but requires branch rates # \r
+ in tree.\r
+ = 3: as 2, but requires Mgene and option G in sequence file.\r
+\r
+ Order of variables in x[]: divergence times, rates for branches, rgene, ...\r
+ In the following ngene=4, com.nbtype=3, with r_ij to be the rate \r
+ of gene i and branch class j.\r
+\r
+ clock=1 or 2:\r
+ [times, r00(if absolute) r01 r02 rgene1 rgene2 rgene3]\r
+ NOTE: rgene[] has relative rates\r
+ clock=3:\r
+ [times, r00(if absolute) r01 r02 r11 r12 r21 r22 r31 r32 rgene1 rgene2 rgene3]\r
+ NOTE: rgene1=r10, rgene2=r20, rgene3=r30\r
+\r
+ If(nodes[tree.root].fossil==0) x[0] has absolute time for the root. \r
+ Otherwise x[0] has proportional ages.\r
+*/\r
+\r
+\r
+double GetBranchRate(int igene, int ibrate, double x[], int *ix)\r
+{\r
+/* This finds the right branch rate in x[]. The rate is absolute if AbsoluteRate.\r
+ ibrate=0,1,..., indicates the branch rate class.\r
+ This routine is used in the likeihood calculation and in formatting output.\r
+ ix (k) has the position in x[] for the branch rate if the rate is a parameter.\r
+ and is -1 if the rate is not a parameter in the ML iteration. This is \r
+ for printing SEs.\r
+*/\r
+ int nage=tree.nnode-com.ns-NFossils, k=nage+AbsoluteRate;\r
+ double rate00=(AbsoluteRate?x[nage]:1), brate=rate00;\r
+\r
+ if(igene==0 && ibrate==0)\r
+ k = (AbsoluteRate?nage:-1);\r
+ else if(com.clock==GlobalClock) {\r
+ brate = x[k=com.ntime+igene-1]; /* igene>0, rgene[] has absolute rates */\r
+ }\r
+ else if(com.clock==LocalClock) { /* rgene[] has relative rates */\r
+ if(igene==0 && ibrate) { brate = x[k+=ibrate-1]; }\r
+ else if(igene && ibrate==0){ brate = rate00*x[com.ntime+igene-1]; k=-1; }\r
+ else if(igene && ibrate) { brate = x[k+ibrate-1]*x[com.ntime+igene-1]; k=-1; }\r
+ }\r
+ else if(com.clock==ClockCombined) {\r
+ if(ibrate==0 && igene) brate = x[k=com.ntime+igene-1];\r
+ else brate = x[k+=ibrate-1+igene*(com.nbtype-1)]; /* ibrate>0 */\r
+ }\r
+\r
+ if(ix) *ix=k;\r
+ return(brate);\r
+}\r
+\r
+\r
+void SetAge (int inode, double x[])\r
+{\r
+/* This is called from SetBranch(), to set up age for nodes under clock \r
+ models (clock=1,2,3).\r
+ if(TipDate||NFossil), that is, if(AbsoluteRate), this routine sets up \r
+ times (nodes[].age) and then SetBranch() sets up branch lengths by\r
+ multiplying times with rate:\r
+ [].age[i] = AgeLow[i] + ([father].age - AgeLow[i])*x[i]\r
+ \r
+ The routine assumes that times are arranged in the order of node numbers, \r
+ and should work if parenthesis notation of tree is used in the tree file, \r
+ but not if the branch notation is used.\r
+*/\r
+ int i,ison;\r
+\r
+ FOR (i,nodes[inode].nson) {\r
+ ison=nodes[inode].sons[i];\r
+ if(nodes[ison].nson) {\r
+ if(AbsoluteRate) {\r
+ if(!nodes[ison].fossil)\r
+ nodes[ison].age = AgeLow[ison]\r
+ +(nodes[inode].age - AgeLow[ison])*x[innode_time++];\r
+ }\r
+ else \r
+ nodes[ison].age = nodes[inode].age*x[innode_time++];\r
+ SetAge(ison, x);\r
+ }\r
+ }\r
+}\r
+\r
+void GetAgeLow (int inode)\r
+{\r
+/* This sets AgeLow[], the minimum age of each node. It moves down the tree to \r
+ scan [].age, which has tip dates and fossil dates. It is needed if(AbsoluteRate)\r
+ and is called by GetInitialsTimes().\r
+*/\r
+ int i,ison;\r
+ double tlow=0;\r
+\r
+ for(i=0; i<nodes[inode].nson; i++) {\r
+ ison = nodes[inode].sons[i];\r
+ if(nodes[ison].nson)\r
+ GetAgeLow(ison);\r
+ tlow = max2(tlow, nodes[ison].age);\r
+ }\r
+ if(nodes[inode].fossil) {\r
+ if(nodes[inode].age < tlow) \r
+ error2("age in tree is in conflict.");\r
+ AgeLow[inode] = nodes[inode].age;\r
+ }\r
+ else\r
+ AgeLow[inode] = nodes[inode].age = tlow;\r
+}\r
+\r
+\r
+\r
+int SetBranch (double x[])\r
+{\r
+/* if(AbsoluteRate), mutation rate is not multiplied here, but during the \r
+ likelihood calculation. It is copied into com.rgene[0].\r
+*/\r
+ int i, status=0;\r
+ double small=-1e-5;\r
+\r
+ if(com.clock==0) {\r
+ for(i=0; i<tree.nnode; i++) {\r
+ if(i!=tree.root) \r
+ if((nodes[i].branch=x[nodes[i].ibranch])<small) status = -1;\r
+ }\r
+ return(status);\r
+ }\r
+ innode_time = 0;\r
+ if(!LASTROUND) { /* transformed variables (proportions) are used */\r
+ if(!nodes[tree.root].fossil) /* note order of times in x[] */\r
+ nodes[tree.root].age = x[innode_time++];\r
+ SetAge(tree.root, x);\r
+ }\r
+ else { /* times are used */\r
+ for(i=com.ns; i<tree.nnode; i++) \r
+ if(!nodes[i].fossil) nodes[i].age = x[innode_time++];\r
+ }\r
+\r
+ for(i=0; i<tree.nnode; i++) { /* [].age to [].branch */\r
+ if(i==tree.root) continue;\r
+ nodes[i].branch = nodes[nodes[i].father].age-nodes[i].age;\r
+ if(nodes[i].branch<small)\r
+ status = -1;\r
+ }\r
+ return(status);\r
+}\r
+\r
+\r
+int GetInitialsTimes (double x[])\r
+{\r
+/* this counts com.ntime and initializes x[] under clock and local clock models,\r
+ including TipDate and ClockCombined models. See above for notes.\r
+ Under local clock models, com.ntime includes both times and rates for \r
+ lineages.\r
+ A recursive algorithm is used to specify initials if(TipDate||NFossil).\r
+*/\r
+ int i,j,k;\r
+ double maxage, t;\r
+\r
+ /* no clock */\r
+ if(com.fix_blength==2)\r
+ { com.ntime=0; com.method=0; return(0); }\r
+ else if(com.clock==0) {\r
+ com.ntime = tree.nbranch;\r
+ if(com.fix_blength==1) return(0);\r
+ for(i=0; i<com.ntime; i++) \r
+ x[i] = rndu()*0.1+0.01;\r
+\r
+ if(com.fix_blength==0 && com.clock<5 && ancestor && com.ntime<100)\r
+ LSDistance (&t, x, testx);\r
+\r
+ return(0);\r
+ }\r
+ \r
+ /* clock models: check branch rate labels and fossil dates first */\r
+ if(com.clock<5) {\r
+ com.nbtype=1;\r
+ if(com.clock==1) \r
+ for(i=0; i<tree.nnode; i++) nodes[i].label=0;\r
+ else {\r
+ for(i=0; i<tree.nnode; i++) {\r
+ if(i!=tree.root && (j=(int)nodes[i].label+1)>com.nbtype) {\r
+ com.nbtype = j;\r
+ if(j<0 || j>tree.nbranch-1) error2("branch label in the tree.");\r
+ }\r
+ }\r
+ for(j=0; j<com.nbtype; j++) {\r
+ for(i=0; i<tree.nnode; i++) \r
+ if(i!=tree.root && j==(int)nodes[i].label) break;\r
+ if(i==tree.nnode)\r
+ printf("\nNot all branch labels (0, ..., %d) are found on tree?", com.nbtype-1);\r
+ }\r
+ if(noisy) printf("\nfound %d branch rates in tree.\n", com.nbtype);\r
+ if(com.nbtype<=1) error2("use clock = 1 or add branch rate labels in tree");\r
+\r
+ for(i=0; i<tree.nbranch; i++) \r
+ printf("%3.0f",nodes[tree.branches[i][1]].label); FPN(F0);\r
+ }\r
+ }\r
+ for(i=0,NFossils=0,maxage=0; i<tree.nnode; i++) {\r
+ if(nodes[i].nson && nodes[i].fossil) {\r
+ NFossils ++;\r
+ maxage = max2(maxage,nodes[i].age);\r
+ }\r
+ }\r
+ if(NFossils && maxage>10) \r
+ error2("Change time unit so that fossil dates fall in (0.00001, 10).");\r
+\r
+ if(com.TipDate)\r
+ GetTipDate(&com.TipDate, &com.TipDate_TimeUnit);\r
+\r
+ AbsoluteRate = (com.TipDate || NFossils);\r
+ if(com.clock>=5 && AbsoluteRate==0) \r
+ error2("needs fossil calibrations");\r
+\r
+ com.ntime = AbsoluteRate + (tree.nnode-com.ns-NFossils) + (com.nbtype-1);\r
+ if(com.clock == ClockCombined)\r
+ com.ntime += (com.ngene-1)*(com.nbtype-1);\r
+ com.ntime += (tree.root<com.ns); /* root is a known sequence. Broken? */\r
+\r
+ /* DANGER! AgeLow is not freed in the program. Fix this? */\r
+ k=0;\r
+ if(AbsoluteRate) {\r
+ AgeLow = (double*)realloc(AgeLow, tree.nnode*sizeof(double));\r
+ GetAgeLow(tree.root);\r
+ }\r
+ if(!nodes[tree.root].fossil)\r
+ x[k++] = (AbsoluteRate?nodes[tree.root].age*(1.2+rndu()) : rndu()*.5+.1); /* root age */\r
+ for(; k<tree.nnode-com.ns-NFossils; k++) /* relative times */\r
+ x[k] = 0.4+.5*rndu();\r
+ if(com.clock!=6) /* branch rates */\r
+ for( ; k<com.ntime; k++)\r
+ x[k] = 0.1*(.5+rndu());\r
+ else\r
+ for(j=0,k=com.ntime-1; j<data.ngene; j++,k++) \r
+ x[k] = 0.1*(.5+rndu());\r
+ return(0);\r
+}\r
+\r
+int OutputTimesRates (FILE *fout, double x[], double var[])\r
+{\r
+/* SetBranch() has been called before calling this, so that [].age is up \r
+ to date.\r
+*/\r
+ int i,j,k=AbsoluteRate+tree.nnode-com.ns-NFossils, jeffnode;\r
+ double scale=(com.TipDate ? com.TipDate_TimeUnit : 1);\r
+\r
+ /* rates */\r
+ if(AbsoluteRate && com.clock<5) {\r
+ fputs("\nSubstitution rate is per time unit\n", fout);\r
+ if(com.nbtype>1) fprintf(fout,"Rates for branch groups\n");\r
+ for(i=0; i<com.ngene; i++,FPN(fout)) {\r
+ if(com.ngene>1) fprintf(fout,"Gene %2d: ", i+1);\r
+ for(j=0; j<com.nbtype; j++) {\r
+ fprintf(fout,"%12.6f", GetBranchRate(i,j,x,&k));\r
+ if(i==0 && j==0 && !AbsoluteRate) continue;\r
+ if((com.clock!=LocalClock||com.ngene==1) && com.getSE) {\r
+ if(k==-1) error2("we are in trouble. k should not be -1 here.");\r
+ fprintf(fout," +- %8.6f", sqrt(var[k*com.np+k]));\r
+ }\r
+ }\r
+ }\r
+ }\r
+ else\r
+ if(com.clock==2) {\r
+ fprintf (fout,"rates for branches: 1");\r
+ for(k=tree.nnode-com.ns; k<com.ntime; k++) fprintf(fout," %8.5f",x[k]);\r
+ }\r
+\r
+\r
+ /* times */\r
+ if(AbsoluteRate) {\r
+ fputs("\nNodes and Times\n",fout);\r
+ fputs("(JeffNode is for Thorne's multidivtime. ML analysis uses ingroup data only.)\n\n",fout);\r
+ }\r
+ if(com.TipDate) { /* DANGER! SE not printed if(TipDate && NFossil). */\r
+ for(i=0,k=0; i<tree.nnode; i++,FPN(fout)) {\r
+ jeffnode=(i<com.ns?i:tree.nnode-1+com.ns-i);\r
+ fprintf(fout,"Node %3d (Jeffnode %3d) Time %7.2f ",i+1, jeffnode, \r
+ com.TipDate - nodes[i].age*scale);\r
+ if(com.getSE && i>=com.ns && !nodes[i].fossil) {\r
+ fprintf(fout," +- %6.2f", sqrt(var[k*com.np+k])*scale);\r
+ k++;\r
+ }\r
+ }\r
+ }\r
+ else if(AbsoluteRate) {\r
+ for(i=com.ns,k=0; i<tree.nnode; i++,FPN(fout)) {\r
+ jeffnode=tree.nnode-1+com.ns-i;\r
+ fprintf(fout,"Node %3d (Jeffnode %3d) Time %9.5f", i+1, tree.nnode-1+com.ns-i, \r
+ nodes[i].age);\r
+ if(com.getSE && i>=com.ns && !nodes[i].fossil) {\r
+ fprintf(fout," +- %7.5f", sqrt(var[k*com.np+k]));\r
+ if(fabs(nodes[i].age-x[k])>1e-5) error2("node order wrong.");\r
+ k++;\r
+ }\r
+ }\r
+ }\r
+\r
+ return(0);\r
+}\r
+\r
+int SetxBoundTimes (double xb[][2])\r
+{\r
+/* This sets bounds for times (or branch lengths) and branch rates\r
+*/ \r
+ int i=-1,j,k;\r
+ double tb[]={4e-6,50}, rateb[]={1e-4,99}, pb[]={.000001,.999999};\r
+\r
+ if(com.clock==0) {\r
+ for(i=0;i<com.ntime;i++) {\r
+ xb[i][0] = tb[0];\r
+ xb[i][1] = tb[1];\r
+ }\r
+ }\r
+ else {\r
+ k=0; xb[0][0]=tb[0]; xb[0][1]=tb[1];\r
+ if(!nodes[tree.root].fossil) {\r
+ if(AbsoluteRate) xb[0][0]=AgeLow[tree.root];\r
+ k=1;\r
+ }\r
+ for( ; k<tree.nnode-com.ns-NFossils; k++) /* proportional ages */\r
+ { xb[k][0]=pb[0]; xb[k][1]=pb[1]; }\r
+ for(; k<com.ntime; k++) /* rate and branch rates */\r
+ FOR(j,2) xb[k][j]=rateb[j];\r
+ }\r
+ return(0);\r
+}\r
+\r
+#endif\r
+\r
+\r
+#if(defined(BASEML) || defined(BASEMLG) || defined(CODEML))\r
+\r
+\r
+int readx(double x[], int *fromfile)\r
+{\r
+/* this reads parameters from file, used as initial values\r
+ if(runmode>0), this reads common substitution parameters only into x[], which \r
+ should be copied into another place before heuristic tree search. This is broken\r
+ right now. Ziheng, 9 July 2003.\r
+ fromfile=0: if nothing read from file, 1: read from file, -1:fix parameters\r
+*/\r
+ static int times=0;\r
+ int i, npin;\r
+ double *xin;\r
+\r
+ times++; *fromfile=0;\r
+ if(finitials==NULL || (com.runmode>0 && times>1)) return(0);\r
+ if(com.runmode<=0) { npin=com.np; xin=x; }\r
+ else { npin=com.np-com.ntime; xin=x+com.ntime; }\r
+\r
+ if(npin<=0) return(0);\r
+ if(com.runmode>0&&com.seqtype==1&&com.model) error2("option or in.codeml");\r
+ printf("\nReading initials/paras from file (np=%d). Stop if wrong.\n",npin);\r
+ fscanf(finitials,"%lf",&xin[i=0]);\r
+ *fromfile=1;\r
+ if(xin[0]==-1) { *fromfile=-1; LASTROUND=1; }\r
+ else i++;\r
+ for( ; i<npin; i++) \r
+ if(fscanf(finitials, "%lf", &xin[i])!=1) break;\r
+ if(i<npin)\r
+ { printf("err at #%d. Edit or remove it.\n",i+1); exit(-1); }\r
+ if(com.runmode>0) {\r
+ matout(F0,xin,1,npin);\r
+ puts("Those are fixed for tree search. Stop if wrong.");\r
+ }\r
+ return(0);\r
+}\r
+\r
+#endif\r
+\r
+#if(defined(BASEML) || defined(CODEML))\r
+\r
+int CollapsNode (int inode, double x[]) \r
+{\r
+/* Merge inode to its father. Update the first com.ntime elments of\r
+ x[] only if (x!=NULL), by using either x[] if clock=1 or\r
+ nodes[].branch if clock=0. So when clock=0, the routine works\r
+ properly only if SetBranch() is called before this routine, which\r
+ is true if m.l. or l.s. has been used to estimate branch lengths.\r
+*/\r
+ int i,j, ifather, ibranch, ison;\r
+\r
+ if (inode==tree.root || inode<com.ns) error2("err CollapsNode");\r
+ ibranch=nodes[inode].ibranch; ifather=nodes[inode].father; \r
+ for (i=0; i<nodes[inode].nson; i++) {\r
+ ison=nodes[inode].sons[i];\r
+ tree.branches[nodes[ison].ibranch][0]=ifather;\r
+ }\r
+ for (i=ibranch+1; i<tree.nbranch; i++) \r
+ for (j=0; j<2; j++) tree.branches[i-1][j]=tree.branches[i][j];\r
+ tree.nbranch--; com.ntime--;\r
+ for (i=0; i<tree.nbranch; i++) for (j=0; j<2; j++) \r
+ if (tree.branches[i][j]>inode) tree.branches[i][j]--;\r
+ BranchToNode();\r
+\r
+ if (x) {\r
+ if (com.clock) \r
+ for (i=inode+1; i<tree.nnode+1; i++) x[i-1-com.ns]=x[i-com.ns];\r
+ else {\r
+ for (i=ibranch+1; i<tree.nbranch+1; i++) x[i-1]=x[i];\r
+ SetBranch (x);\r
+ }\r
+ }\r
+ return (0);\r
+}\r
+\r
+#endif\r
+\r
+\r
+#if(defined(BPP) || defined(EVOLVER))\r
+\r
+void Tree2PartitionDescentTree (int inode, char split[])\r
+{\r
+ int i, ison;\r
+\r
+ for(i=0; i<nodes[inode].nson; i++) {\r
+ ison = nodes[inode].sons[i];\r
+ if(ison<com.ns)\r
+ split[ison] = '1';\r
+ else \r
+ Tree2PartitionDescentTree(ison, split);\r
+ }\r
+}\r
+\r
+void Tree2Partition (char splits[])\r
+{\r
+/* This generates branch partitions in splits[nib*(com.ns+1)]. \r
+ splits[0,...,ns-1] is the first split, splits[ns,...,2*ns-1] the second, and so on.\r
+ For large trees, the algorithm is inefficient.\r
+ The root node has 2 sons if the tree is rooted and >=3 sons if the tree \r
+ is unrooted. For unrooted tree, the mark for the first species is set to 0.\r
+ For rooted trees, the mark for the first species can be either 0 or 1.\r
+*/\r
+ int unrooted = (nodes[tree.root].nson>=3);\r
+ int s=com.ns, lsplit=s+1, nsplit=tree.nnode-s-1, i, j, k;\r
+ char *split;\r
+\r
+ if(s<=2) return ;\r
+ memset(splits, 0, nsplit*lsplit*sizeof(char));\r
+ for(i=com.ns,k=0; i<tree.nnode; i++) {\r
+ if(i==tree.root) continue;\r
+ split = splits+k*lsplit;\r
+ for(j=0; j<s; j++) split[j] = '0';\r
+ Tree2PartitionDescentTree(i, split);\r
+ /* If unrooted tree, set first species to 0 if tree is unrooted */\r
+ if(unrooted && split[0]=='1')\r
+ for(j=0; j<s; j++) split[j] = '0' + '1' - split[j];\r
+ k++;\r
+ }\r
+}\r
+\r
+int Partition2Tree (char splits[], int lsplit, int ns, int nsplit, double label[])\r
+{\r
+/* This generates the tree in nodes[] using splits or branch partitions.\r
+ It constructs pptable[(2*s-1)*(2*s-1)] to generate the tree. \r
+ Split i corresponds to node ns+i, while the root is ns + nsplit.\r
+ label[] has labels for splits and become labels for nodes on the tree.\r
+ This works even if ns=1 and ns=2.\r
+*/\r
+ int i,j,k, s21=ns*2-1, a, minndesc, ndesc[NS]={0}; /* clade size */\r
+ char debug=0, *p, *pptable;\r
+\r
+ if(nsplit>ns-2) error2("too many splits for ns");\r
+ if(nsplit<0) nsplit=0;\r
+ if((pptable=(char*)malloc((s21*s21+1)*sizeof(char))) == NULL)\r
+ error2("oom in Partition2Tree");\r
+ memset(pptable, 0, s21*s21*sizeof(char));\r
+\r
+ /* initialize tree */\r
+ tree.nnode = ns+nsplit+1;\r
+ tree.root = ns+nsplit;\r
+ tree.nbranch = 0;\r
+ for(i=0; i<tree.nnode; i++) {\r
+ nodes[i].father = nodes[i].ibranch = -1;\r
+ nodes[i].nson = 0; nodes[i].label = -1; nodes[i].branch = nodes[i].age = 0;\r
+ }\r
+\r
+ /* set up pptable */\r
+ for(i=0,p=splits,ndesc[tree.root-ns]=ns; i<nsplit; i++, p+=lsplit) {\r
+ for(j=0; j<ns; j++) {\r
+ if(p[j] == '1') { /* clade (split) i includes tip j */\r
+ pptable[j*s21 + ns+i] = 1;\r
+ ndesc[i]++;\r
+ }\r
+ }\r
+ }\r
+ for(i=0; i<tree.nnode-1; i++) pptable[i*s21+tree.root] = 1;\r
+ for(i=0; i<nsplit; i++) {\r
+ for(j=0; j<i; j++) {\r
+ if(pptable[(ns+i)*s21+ns+j] || pptable[(ns+j)*s21+ns+i] || ndesc[i] == ndesc[j])\r
+ continue;\r
+ for(k=0; k<ns; k++)\r
+ if(pptable[k*s21+ns+i]==1 && pptable[k*s21+ns+j]==1) break;\r
+ if(k<ns) { /* i and j are ancestral to k, and are ancestral to each other. */\r
+ if(ndesc[i] < ndesc[j]) pptable[(ns+i)*s21+ns+j] = 1;\r
+ else pptable[(ns+j)*s21+ns+i] = 1;\r
+ }\r
+ }\r
+ }\r
+ if(debug) {\r
+ printf("\npptable\n");\r
+ for(i=0; i<s21; i++,FPN(F0))\r
+ for(j=0; j<s21; j++)\r
+ printf(" %1d", (int)pptable[i*s21+j]);\r
+ printf("ndesc: ");\r
+ for(i=0; i<nsplit; i++) printf(" %2d", ndesc[i]); FPN(F0);\r
+ }\r
+\r
+ /* generate tree nodes and labels. For each nonroot node, youngest ancestor is dad. */\r
+ for(i=0; i<tree.nnode-1; i++) {\r
+ minndesc=ns+1; a=-1;\r
+ for(j=ns; j<tree.nnode; j++) {\r
+ if(pptable[i*s21+j]==1 && minndesc>ndesc[j-ns]) \r
+ { minndesc = ndesc[j-ns]; a=j; }\r
+ }\r
+ if(a<0)\r
+ error2("jgl");\r
+ nodes[i].father = a;\r
+ nodes[a].sons[nodes[a].nson++] = i;\r
+ if(a!=tree.root && label) nodes[a].label = label[a-ns];\r
+ }\r
+ if(debug) {\r
+ printtree(1);\r
+ OutTreeN(F0,1,0); FPN(F0);\r
+ }\r
+ free(pptable);\r
+ return(0);\r
+}\r
+\r
+\r
+int GetNSfromTreeFile(FILE *ftree, int *ns, int *ntree)\r
+{\r
+/* This gets the sequence names from the tree file.\r
+*/\r
+ char separators[]="(,):#";\r
+ int inname=0, k, c;\r
+ double y;\r
+\r
+ *ns = *ntree = -1;\r
+ k = fscanf(ftree, "%d%d", ns, ntree);\r
+ if(k==1) { *ntree = *ns; *ns = -1; }\r
+ else if(k==0) {\r
+ *ns = 0;\r
+ while((c = fgetc(ftree)) != ';') {\r
+ if(c==EOF) return(-1);\r
+ if(strchr(separators, c)) {\r
+ if (c == ':') fscanf(ftree, "%lf", &y);\r
+ else if(c == '#') fscanf(ftree, "%lf", &y);\r
+ if(inname) { inname=0; (*ns)++; }\r
+ }\r
+ else if(isgraph(c))\r
+ inname = 1;\r
+ }\r
+ rewind(ftree);\r
+ }\r
+ return(0);\r
+}\r
+\r
+void CladeSupport (FILE *fout, char treef[], int getSnames, char mastertreef[], int pick1tree)\r
+{\r
+/* This reads all bootstrap or Bayesian trees from treef to identify the best trees \r
+ (trees with the highest posterior probabilities), and to construct the majority-rule \r
+ consensus tree. The set of the best trees constitute the 95% or 99% credibility set \r
+ of trees.\r
+ A tree (ptree) is represented by its splits, ordered lexicographically, and concatenated.\r
+ It can also read a master tree file and goes through master trees and attach support \r
+ values on splits on each master tree.\r
+ split1 if for one tree, and split50 is for the majority-rule consensus tree.\r
+*/\r
+ int i,j,k, i1, ntreeM,ntree, itreeM, sizetree, found, lline=1024;\r
+ int *index, *indexspace, maxnsplits, nsplits=0, s, nsplit1, nsplit50, lsplit, same;\r
+ int maxnptree, nptree=0, sizeptree;\r
+ char *split1, *splits=NULL, *splitM=NULL, *split50=NULL, *pM, *ptree, *ptrees=NULL, *pc;\r
+ double *countsplits=NULL, *countptree=NULL, *Psplit50, *Psame, y, cdf;\r
+ char pick1treef[32]="pick1tree.tre", line[1024];\r
+ struct TREEN *nodes_t;\r
+ FILE *ft, *fM=NULL, *f1tree=NULL;\r
+ int debug=0;\r
+\r
+ /* Count trees and splits */\r
+ printf("\nRead tree sample, count trees & splits \n");\r
+ ft = gfopen(treef, "r");\r
+\r
+ if(getSnames) /* species ordered as in first tree in file */\r
+ GetNSfromTreeFile(ft, &com.ns, &k);\r
+ if(com.ns<3) error2("need >=3 species to justify this much effort.");\r
+ s=com.ns; lsplit=s+1; maxnsplits=maxnptree=s; sizeptree=(s-2)*lsplit;\r
+ if((split1=(char*)malloc(3*(s-2) * lsplit * sizeof(char))) == NULL)\r
+ error2("oom splits");\r
+ ptree = split1 + (s-2)*lsplit;\r
+ split50 = ptree + (s-2)*lsplit;\r
+ memset(split1, 0, 3*(s-2) * lsplit * sizeof(char));\r
+ if((Psplit50=(double*)malloc(s*sizeof(double))) == NULL)\r
+ error2("oom Psplit50");\r
+\r
+ sizetree=(s*2-1)*sizeof(struct TREEN);\r
+ if((nodes=(struct TREEN*)malloc(sizetree*2))==NULL) error2("oom");\r
+ for(i=0; i<s*2-1; i++) nodes[i].nodeStr=NULL;\r
+ nodes_t = nodes + s*2-1;\r
+\r
+ for(ntree=0; ; ntree++) {\r
+ if(nptree+s >= maxnptree) {\r
+ maxnptree = (int)(maxnptree*(ntree<1000 ? 2 : 1.2));\r
+ ptrees = (char *)realloc(ptrees, maxnptree*sizeptree);\r
+ countptree = (double*)realloc(countptree, maxnptree*sizeof(double));\r
+ if(ptrees==NULL || countptree==NULL) error2("oom ptrees || countptree");\r
+ memset(ptrees+nptree*sizeptree, 0, (maxnptree-nptree)*sizeptree);\r
+ memset(countptree+nptree, 0, (maxnptree-nptree)*sizeof(double));\r
+ }\r
+ if(nsplits+s >= maxnsplits) {\r
+ maxnsplits *= 2;\r
+ splits = (char*)realloc(splits, maxnsplits * lsplit * sizeof(char));\r
+ countsplits = (double*)realloc(countsplits, maxnsplits*sizeof(double));\r
+ if(splits==NULL || countsplits==NULL) error2("oom splits realloc");\r
+ }\r
+\r
+ /* if (getSnames), copy species/sequence names from first tree in file. */\r
+ if(ReadTreeN(ft, &i, &j, (getSnames && ntree==0), 1)) break;\r
+ if(debug || (ntree+1)%5000==0) {\r
+ printf("\rtree %5d ", ntree+1);\r
+ if(s<15) OutTreeN(F0, 1, 0);\r
+ }\r
+ Tree2Partition(split1);\r
+ nsplit1 = tree.nnode - s - 1;\r
+\r
+ /* Process the tree */ \r
+ qsort(split1, nsplit1, lsplit, (int(*)(const void *, const void *))strcmp);\r
+ if(debug)\r
+ { for(i=0; i<nsplit1; i++) printf(" %s", split1+i*lsplit); printf("\n"); }\r
+ for(i=0,pc=ptree; i<nsplit1; i++) \r
+ for(j=0; j<s; j++) *pc++ = split1[i*lsplit+j];\r
+ j = binarysearch(ptree, ptrees, nptree, sizeptree, (int(*)(const void *, const void *))strcmp, &found);\r
+ if(found)\r
+ countptree[j]++;\r
+ else {\r
+ if(j<nptree) {\r
+ memmove(ptrees+(j+1)*sizeptree, ptrees+j*sizeptree, (nptree-j)*sizeptree);\r
+ memmove(countptree+j+1, countptree+j, (nptree-j)*sizeof(double));\r
+ }\r
+ memmove(ptrees+j*sizeptree, ptree, sizeptree);\r
+ nptree++;\r
+ countptree[j]=1;\r
+ }\r
+\r
+ /* Process the splits in the tree */\r
+ for(i=0; i<nsplit1; i++) { /* going through splits in current tree */\r
+ j = binarysearch(split1+i*lsplit, splits, nsplits, lsplit, (int(*)(const void *, const void *))strcmp, &found);\r
+ if(found) /* found */\r
+ countsplits[j]++;\r
+ else {\r
+ if(j<nsplits) { /* check the size of the moved block here */\r
+ memmove(splits+(j+1)*lsplit, splits+j*lsplit, (nsplits-j)*lsplit*sizeof(char));\r
+ memmove(countsplits+(j+1), countsplits+j, (nsplits-j)*sizeof(double));\r
+ }\r
+ memcpy(splits+j*lsplit, split1+i*lsplit, lsplit*sizeof(char));\r
+ nsplits++;\r
+ countsplits[j]=1;\r
+ }\r
+ }\r
+ if(debug) {\r
+ printf("%4d splits: ", nsplits);\r
+ for(k=0; k<nsplits; k++) printf(" %s (%.0f)", splits+k*lsplit, countsplits[k]);\r
+ FPN(F0);\r
+ }\r
+ }\r
+ printf("\n%6d trees read, %d distinct trees.\n", ntree, nptree);\r
+\r
+ k = max2(nsplits, nptree);\r
+ if((index=(int*)malloc(k*2*sizeof(int)))==NULL) error2("oom index");\r
+ indexspace = index+k;\r
+\r
+ printf("\nSpecies in order:\n");\r
+ for(i=0; i<s; i++) printf("%2d. %s\n", i+1, com.spname[i]);\r
+ printf("\n(A) Best trees in the sample (%d distinct trees in all)\n", nptree);\r
+ fprintf(fout, "\n\nSpecies in order:\n");\r
+ for(i=0; i<s; i++) fprintf(fout, "%2d. %s\n", i+1, com.spname[i]);\r
+ fprintf(fout, "\n(A) Best trees in the sample (%d distinct trees in all)\n", nptree);\r
+\r
+ indexing(countptree, nptree, index, 1, indexspace);\r
+\r
+ for(k=0,cdf=0; k<nptree; k++) {\r
+ j = index[k]; y=countptree[j];\r
+ for(i=0,pc=split1; i<nsplit1; i++,*pc++='\0') for(i1=0; i1<s; i1++)\r
+ *pc++ = ptrees[j*sizeptree + i*s + i1];\r
+ Partition2Tree(split1, lsplit, s, nsplit1, NULL);\r
+ printf(" %6.0f %8.5f %8.5f ", y, y/ntree, (cdf+=y/ntree));\r
+ OutTreeN(F0, 1, 0); \r
+ /* for(i=0; i<nsplit1; i++) printf(" %s", split1+i*lsplit); */\r
+ printf("\n");\r
+\r
+ fprintf(fout, " %6.0f %8.5f %8.5f ", y, y/ntree, cdf);\r
+ OutTreeN(fout, 1, 0); \r
+ /* for(i=0; i<nsplit1; i++) fprintf(fout, " %s", split1+i*lsplit); */ \r
+ fprintf(fout, "\n");\r
+\r
+ if(cdf > 0.999) break;\r
+ }\r
+\r
+ printf("\n(B) Best splits in the sample of trees (%d splits in all)\n", nsplits);\r
+ indexing(countsplits, nsplits, index, 1, indexspace);\r
+ for(k=0; k<nsplits; k++) {\r
+ j = index[k]; y=countsplits[j];\r
+ printf(" %6.0f %9.5f %s\n", y, y/ntree, splits+j*lsplit);\r
+ if(y/ntree < 0.001) break;\r
+ }\r
+ fprintf(fout, "\n(B) Best splits in the sample of trees (%d splits in all)\n", nsplits);\r
+ for(k=0; k<nsplits; k++) {\r
+ j = index[k]; y=countsplits[j];\r
+ fprintf(fout, " %6.0f %9.5f %s\n", y, y/ntree, splits+j*lsplit);\r
+ if(y/ntree < 0.001) break;\r
+ }\r
+\r
+ /* Majority-rule consensus tree */\r
+ for(k=0,nsplit50=0; k<nsplits; k++)\r
+ if(countsplits[k]/ntree >= 0.5) nsplit50++;\r
+ for(k=0,nsplit50=0; k<nsplits; k++) {\r
+ if(countsplits[k]/ntree > 0.5) {\r
+ memmove(split50+nsplit50*lsplit, splits+k*lsplit, lsplit);\r
+ Psplit50[nsplit50 ++] = countsplits[k]/ntree;\r
+ }\r
+ }\r
+ Partition2Tree(split50, lsplit, s, nsplit50, Psplit50);\r
+ printf("\n(C) Majority-rule consensus tree\n");\r
+ OutTreeN(F0, 1, PrLabel); FPN(F0);\r
+ fprintf(fout, "\n(C) Majority-rule consensus tree\n");\r
+ OutTreeN(fout, 1, PrLabel); FPN(fout);\r
+\r
+ if(mastertreef) fM = fopen(mastertreef, "r");\r
+ if(fM==NULL) goto CleanUp;\r
+\r
+ fscanf(fM, "%d%d", &i, &ntreeM);\r
+ if(i!=s || ntreeM<1) error2("<ns> <ntree> on the first line in master tree.");\r
+\r
+ /* Probabilities of trees in the master tree file */\r
+ splitM = (char*)malloc(ntreeM * (s-2)*lsplit * sizeof(char));\r
+ Psame = (double*)malloc(ntreeM * sizeof(double));\r
+ if(splitM==NULL || Psame==NULL) error2("oom splitM");\r
+ zero(Psame, ntreeM);\r
+ if(pick1tree>=1 && pick1tree<=ntreeM && (f1tree=(FILE*)fopen(pick1treef,"w"))==NULL)\r
+ error2("oom");\r
+ for(itreeM=0,pM=splitM; itreeM<ntreeM; itreeM++,pM+=(s-2)*lsplit) {\r
+ if(ReadTreeN(fM, &i, &j, 0, 1)) break;\r
+ if(tree.nnode<s*2-2 || tree.nnode>s*2-1) error2("Master trees have to be binary");\r
+ Tree2Partition(pM);\r
+ qsort(pM, tree.nnode-s-1, lsplit, (int(*)(const void *, const void *))strcmp);\r
+ if(debug) {\r
+ printf("\nMaster tree %2d: ", itreeM+1);\r
+ OutTreeN(F0, 1, 0);\r
+ for(i=0; i<tree.nnode-s-1; i++) printf(" %s", pM+i*lsplit);\r
+ }\r
+ }\r
+ /* read the tree sample again */\r
+ rewind(ft);\r
+ for(ntree=0; ; ntree++) {\r
+ if(ReadTreeN(ft, &i, &j, 0, 0)) break;\r
+ fgets(line, lline, ft);\r
+ Tree2Partition(split1);\r
+ for(itreeM=0,pM=splitM; itreeM<ntreeM; itreeM++,pM+=(s-2)*lsplit) {\r
+ for(i=0,same=1; i<tree.nnode-s-1; i++) {\r
+ if(bsearch(split1+i*lsplit, pM, tree.nnode-s-1, lsplit, (int(*)(const void *, const void *))strcmp) == NULL) \r
+ { same=0; break; }\r
+ }\r
+ if(same) {\r
+ Psame[itreeM] ++;\r
+ if(pick1tree-1==itreeM) {\r
+ OutTreeN(f1tree, 1, 1); fprintf(f1tree, "%s", line);\r
+ }\r
+ }\r
+ }\r
+ }\r
+\r
+ printf("\n(D) Probabilities of trees in the master tree file\n");\r
+ fprintf(fout, "\n(D) Probabilities of trees in the master tree file\n");\r
+ /* read the master trees another round just for printing. */\r
+ rewind(fM);\r
+ fscanf(fM, "%d%d", &s, &ntreeM);\r
+ for(itreeM=0; itreeM<ntreeM; itreeM++) {\r
+ if(ReadTreeN(fM, &i, &j, 0, 1)) break;\r
+ Tree2Partition(split1);\r
+ for(i=s,k=0; i<tree.nnode; i++) {\r
+ if(i==tree.root) continue;\r
+ j = binarysearch(split1+k*lsplit, splits, nsplits, lsplit, (int(*)(const void *, const void *))strcmp, &found);\r
+ if(found) nodes[i].label = countsplits[j]/ntree;\r
+ k++;\r
+ }\r
+ printf(" P = %6.4f ", Psame[itreeM]/ntree);\r
+ OutTreeN(F0, 1, PrLabel); FPN(F0);\r
+ \r
+ fprintf(fout, " P = %6.4f ", Psame[itreeM]/ntree);\r
+ OutTreeN(fout, 1, PrLabel); FPN(fout);\r
+\r
+ }\r
+ if(pick1tree) printf("\ntree #%d collected into %s\n", pick1tree, pick1treef);\r
+ \r
+CleanUp:\r
+ if(fM) { \r
+ free(splitM); free(Psame); \r
+ fclose(fM); if(f1tree) fclose(f1tree);\r
+ }\r
+ free(split1); free(splits); free(countsplits); free(Psplit50); \r
+ free(ptrees); free(countptree); free(index);\r
+ free(nodes);\r
+ fclose(ft);\r
+ exit(0);\r
+}\r
+\r
+#endif\r
+\r
+\r
+int NSameBranch (char partition1[],char partition2[], int nib1,int nib2, int IBsame[])\r
+{\r
+/* counts the number of correct (identical) bipartitions.\r
+ nib1 and nib2 are the numbers of interior branches in the two trees\r
+ correctIB[0,...,(correctbranch-1)] lists the correct interior branches, \r
+ that is, interior branches in tree 1 that is also in tree 2.\r
+ IBsame[i]=1 if interior branch i is correct.\r
+*/\r
+ int i,j,k=0, nsamebranch;\r
+\r
+#if(1)\r
+ for (i=0,nsamebranch=0; i<nib1; i++)\r
+ for(j=0,IBsame[i]=0; j<nib2; j++) {\r
+ if(strcmp(partition1+i*(com.ns+1), partition2+j*(com.ns+1)) == 0) {\r
+ nsamebranch++; IBsame[i]=1; break; \r
+ }\r
+ }\r
+#else\r
+ for (i=0,nsamebranch=0; i<nib1; i++)\r
+ for(j=0,IBsame[i]=0; j<nib2; j++) {\r
+ for (k=0;k<com.ns;k++)\r
+ if(partition1[i*(com.ns+1)+k] != partition2[j*(com.ns+1)+k]) break;\r
+ if (k==com.ns) {\r
+ nsamebranch++; IBsame[i]=1; break; \r
+ }\r
+ }\r
+#endif\r
+ return (nsamebranch);\r
+}\r
+\r
+\r
+int AddSpecies (int is, int ib)\r
+{\r
+/* Add species (is) to tree at branch ib. The tree currently has \r
+ is+1-1 species. Interior node numbers are increased by 2 to make \r
+ room for the new nodes.\r
+ if(com.clock && ib==tree.nbranch), the new species is added as an\r
+ outgroup to the rooted tree.\r
+*/\r
+ int i,j, it;\r
+\r
+ if(ib>tree.nbranch+1 || (ib==tree.nbranch && !com.clock)) return(-1);\r
+\r
+ if(ib==tree.nbranch && com.clock) { \r
+ FOR(i,tree.nbranch) FOR(j,2)\r
+ if (tree.branches[i][j]>=is) tree.branches[i][j]+=2;\r
+ it=tree.root; if(tree.root>=is) it+=2;\r
+ FOR(i,2) tree.branches[tree.nbranch+i][0]=tree.root=is+1;\r
+ tree.branches[tree.nbranch++][1]=it;\r
+ tree.branches[tree.nbranch++][1]=is;\r
+ }\r
+ else {\r
+ FOR(i,tree.nbranch) FOR(j,2)\r
+ if (tree.branches[i][j]>=is) tree.branches[i][j]+=2;\r
+ it=tree.branches[ib][1];\r
+ tree.branches[ib][1]=is+1;\r
+ tree.branches[tree.nbranch][0]=is+1;\r
+ tree.branches[tree.nbranch++][1]=it;\r
+ tree.branches[tree.nbranch][0]=is+1;\r
+ tree.branches[tree.nbranch++][1]=is;\r
+ if (tree.root>=is) tree.root+=2;\r
+ }\r
+ BranchToNode ();\r
+ return (0);\r
+}\r
+\r
+\r
+#ifdef TREESEARCH\r
+\r
+static struct TREE\r
+ {struct TREEB tree; struct TREEN nodes[2*NS-1]; double x[NP]; } \r
+ treebest, treestar;\r
+/*\r
+static struct TREE \r
+ {struct TREEB tree; struct TREEN nodes[2*NS-1];} treestar;\r
+*/\r
+\r
+int Perturbation(FILE* fout, int initialMP, double space[]);\r
+\r
+int Perturbation(FILE* fout, int initialMP, double space[])\r
+{\r
+/* heuristic tree search by the NNI tree perturbation algorithm. \r
+ Some trees are evaluated multiple times as no trees are kept.\r
+ This needs more work.\r
+*/\r
+ int step=0, ntree=0, nmove=0, improve=0, ineighb, i,j;\r
+ int sizetree=(2*com.ns-1)*sizeof(struct TREEN);\r
+ double *x=treestar.x;\r
+ FILE *ftree;\r
+\r
+ if(com.clock) error2("\n\aerr: pertubation does not work with a clock yet.\n");\r
+ if(initialMP&&!com.cleandata)\r
+ error2("\ncannot get initial parsimony tree for gapped data yet.");\r
+\r
+ fprintf(fout, "\n\nHeuristic tree search by NNI perturbation\n");\r
+ if (initialMP) {\r
+ if (noisy) printf("\nInitial tree from stepwise addition with MP:\n");\r
+ fprintf(fout, "\nInitial tree from stepwise addition with MP:\n");\r
+ StepwiseAdditionMP (space);\r
+ }\r
+ else {\r
+ if (noisy) printf ("\nInitial tree read from file %s:\n", com.treef);\r
+ fprintf(fout, "\nInitial tree read from file.\n");\r
+ if ((ftree=fopen (com.treef,"r"))==NULL) error2("treefile not exist?");\r
+ fscanf (ftree, "%d%d", &i, &ntree);\r
+ if (i!=com.ns) error2("ns in the tree file");\r
+ if(ReadTreeN(ftree, &i, &j, 0, 1)) error2("err tree..");\r
+ fclose(ftree);\r
+ }\r
+ if (noisy) { FPN (F0); OutTreeN(F0,0,0); FPN(F0); }\r
+ tree.lnL=TreeScore(x, space);\r
+ if (noisy) { OutTreeN(F0,0,1); printf("\n lnL = %.4f\n",-tree.lnL); }\r
+ OutTreeN(fout,1,1); fprintf(fout, "\n lnL = %.4f\n",-tree.lnL);\r
+ if (com.np>com.ntime) {\r
+ fprintf(fout, "\tparameters:"); \r
+ for(i=com.ntime; i<com.np; i++) fprintf(fout, "%9.5f", x[i]);\r
+ FPN(fout);\r
+ }\r
+ fflush(fout);\r
+ treebest.tree=tree; memcpy(treebest.nodes, nodes, sizetree);\r
+\r
+ for (step=0; ; step++) {\r
+ for (ineighb=0,improve=0; ineighb<(tree.nbranch-com.ns)*2; ineighb++) {\r
+ tree=treebest.tree; memcpy (nodes, treebest.nodes, sizetree);\r
+ NeighborNNI (ineighb);\r
+ if(noisy) {\r
+ printf("\nTrying tree # %d (%d move[s]) \n", ++ntree,nmove);\r
+ OutTreeN(F0,0,0); FPN(F0);\r
+ }\r
+ tree.lnL=TreeScore(x, space);\r
+ if (noisy) { OutTreeN(F0,1,1); printf("\n lnL = %.4f\n",-tree.lnL);}\r
+ if (noisy && com.np>com.ntime) {\r
+ printf("\tparameters:"); \r
+ for(i=com.ntime; i<com.np; i++) printf("%9.5f", x[i]);\r
+ FPN(F0);\r
+ }\r
+ if (tree.lnL<=treebest.tree.lnL) {\r
+ treebest.tree=tree; memcpy (treebest.nodes, nodes, sizetree);\r
+ improve=1; nmove++;\r
+ if (noisy) printf(" moving to this tree\n");\r
+ if (fout) {\r
+ fprintf(fout, "\nA better tree:\n");\r
+ OutTreeN(fout,0,0); FPN(fout); OutTreeN(fout,1,1); FPN(fout); \r
+ fprintf(fout, "\nlnL = %.4f\n", tree.lnL);\r
+ if (com.np>com.ntime) {\r
+ fprintf(fout,"\tparameters:"); \r
+ for(i=com.ntime; i<com.np; i++) fprintf(fout,"%9.5f", x[i]);\r
+ FPN(fout);\r
+ }\r
+ fflush(fout);\r
+ }\r
+ }\r
+ }\r
+ if (!improve) break;\r
+ }\r
+ tree=treebest.tree; memcpy (nodes, treebest.nodes, sizetree);\r
+ if (noisy) {\r
+ printf("\n\nBest tree found:\n");\r
+ OutTreeN(F0,0,0); FPN(F0); OutTreeN(F0,1,1); FPN(F0); \r
+ printf("\nlnL = %.4f\n", tree.lnL);\r
+ }\r
+ if (fout) {\r
+ fprintf(fout, "\n\nBest tree found:\n");\r
+ OutTreeN(fout,0,0); FPN(fout); OutTreeN(fout,1,1); FPN(fout); \r
+ fprintf(fout, "\nlnL = %.4f\n", tree.lnL);\r
+ }\r
+ return (0);\r
+}\r
+\r
+\r
+static int *_U0, *_step0, _mnnode;\r
+/* up pass characters and changes for the star tree: each of size npatt*nnode*/\r
+\r
+int StepwiseAdditionMP (double space[])\r
+{\r
+/* tree search by species addition.\r
+*/\r
+ char *z0[NS];\r
+ int ns0=com.ns, is, i,j,h, tiestep=0,tie,bestbranch=0;\r
+ int sizetree=(2*com.ns-1)*sizeof(struct TREEN);\r
+ double bestscore=0,score;\r
+\r
+ _mnnode=com.ns*2-1;\r
+ _U0=(int*)malloc(com.npatt*_mnnode*sizeof(int));\r
+ _step0=(int*)malloc(com.npatt*_mnnode*sizeof(int));\r
+ if (noisy>2) \r
+ printf("\n%9ld bytes for MP (U0 & N0)\n", 2*com.npatt*_mnnode*sizeof(int));\r
+ if (_U0==NULL || _step0==NULL) error2("oom U0&step0");\r
+\r
+ FOR (i,ns0) z0[i]=com.z[i];\r
+ tree.nbranch=tree.root=com.ns=3;\r
+ FOR (i, tree.nbranch) { tree.branches[i][0]=com.ns; tree.branches[i][1]=i; }\r
+ BranchToNode ();\r
+ FOR (h, com.npatt)\r
+ FOR (i,com.ns)\r
+ { _U0[h*_mnnode+i]=1<<(com.z[i][h]-1); _step0[h*_mnnode+i]=0; }\r
+ for (is=com.ns,tie=0; is<ns0; is++) {\r
+ treestar.tree=tree; memcpy (treestar.nodes, nodes, sizetree);\r
+\r
+ for (j=0; j<treestar.tree.nbranch; j++,com.ns--) {\r
+ tree=treestar.tree; memcpy (nodes, treestar.nodes, sizetree);\r
+ com.ns++;\r
+ AddSpecies (is, j);\r
+ score=MPScoreStepwiseAddition(is, space, 0);\r
+/*\r
+OutTreeN(F0, 0, 0); \r
+printf(" Add sp %d (ns=%d) at branch %d, score %.0f\n", is+1,com.ns,j+1,score);\r
+*/\r
+ if (j && score==bestscore) tiestep=1;\r
+ if (j==0 || score<bestscore || (score==bestscore&&rndu()<.1)) {\r
+ tiestep=0;\r
+ bestscore=score; bestbranch=j;\r
+ }\r
+ }\r
+ tie+=tiestep;\r
+ tree=treestar.tree; memcpy (nodes, treestar.nodes, sizetree);\r
+ com.ns=is+1;\r
+ AddSpecies (is, bestbranch);\r
+ score=MPScoreStepwiseAddition(is, space, 1);\r
+\r
+ if (noisy)\r
+ { printf("\r Added %d [%5.0f steps]",is+1,-bestscore); fflush(F0);}\r
+ }\r
+ if (noisy>2) printf(" %d stages with ties, ", tie);\r
+ tree.lnL=bestscore;\r
+ free(_U0); free(_step0);\r
+ return (0);\r
+}\r
+\r
+double MPScoreStepwiseAddition (int is, double space[], int save)\r
+{\r
+/* this changes only the part of the tree affected by the newly added \r
+ species is.\r
+ save=1 for the best tree, so that _U0 & _step0 are updated\r
+*/\r
+ int *U,*N,U3[3], h,ist, i,father,son2,*pU0=_U0,*pN0=_step0;\r
+ double score;\r
+\r
+ U=(int*)space; N=U+_mnnode;\r
+ for (h=0,score=0; h<com.npatt; h++,pU0+=_mnnode,pN0+=_mnnode) {\r
+ FOR (i, tree.nnode) { U[i]=pU0[i-2*(i>=is)]; N[i]=pN0[i-2*(i>=is)]; }\r
+ U[is]=1<<(com.z[is][h]-1); N[is]=0;\r
+ for (ist=is; (father=nodes[ist].father)!=tree.root; ist=father) {\r
+ if ((son2=nodes[father].sons[0])==ist) son2=nodes[father].sons[1];\r
+ N[father]=N[ist]+N[son2];\r
+ if ((U[father]=U[ist]&U[son2])==0)\r
+ { U[father]=U[ist]|U[son2]; N[father]++; }\r
+ }\r
+ FOR (i,3) U3[i]=U[nodes[tree.root].sons[i]];\r
+ N[tree.root]=2;\r
+ if (U3[0]&U3[1]&U3[2]) N[tree.root]=0;\r
+ else if (U3[0]&U3[1] || U3[1]&U3[2] || U3[0]&U3[2]) N[tree.root]=1;\r
+ FOR(i,3) N[tree.root]+=N[nodes[tree.root].sons[i]];\r
+\r
+ if (save) {\r
+ memcpy (pU0, U, tree.nnode*sizeof(int));\r
+ memcpy (pN0, N, tree.nnode*sizeof(int));\r
+ }\r
+ score+=N[tree.root]*com.fpatt[h];\r
+ }\r
+ return (score);\r
+}\r
+\r
+\r
+double TreeScore(double x[], double space[])\r
+{\r
+ static int fromfile=0;\r
+ int i;\r
+ double xb[NP][2], e=1e-9, lnL=0;\r
+\r
+ if(com.clock==2) error2("local clock in TreeScore");\r
+ com.ntime = com.clock ? tree.nnode-com.ns : tree.nbranch;\r
+\r
+ GetInitials(x, &i); /* this shoulbe be improved??? */\r
+ if(i) fromfile=1;\r
+ PointconPnodes();\r
+\r
+ if(com.method==0 || !fromfile) SetxBound(com.np, xb);\r
+\r
+ if(fromfile) {\r
+ lnL = com.plfun(x,com.np);\r
+ com.np = com.ntime;\r
+ }\r
+ NFunCall=0;\r
+ if(com.method==0 || com.ntime==0)\r
+ ming2(NULL,&lnL,com.plfun,NULL,x,xb, space,e,com.np);\r
+ else\r
+ minB(NULL, &lnL, x, xb, e, space);\r
+\r
+ return(lnL);\r
+}\r
+\r
+\r
+int StepwiseAddition (FILE* fout, double space[])\r
+{\r
+/* heuristic tree search by species addition. Species are added in the order \r
+ of occurrence in the data.\r
+ Try to get good initial values.\r
+*/\r
+ char *z0[NS], *spname0[NS];\r
+ int ns0=com.ns, is, i,j, bestbranch=0, randadd=0, order[NS];\r
+ int sizetree=(2*com.ns-1)*sizeof(struct TREEN);\r
+ double bestscore=0,score, *x=treestar.x;\r
+\r
+ if(com.ns>50) printf("if this crashes, increase com.sspace?");\r
+\r
+ if(com.ns<3) error2("2 sequences, no need for tree search");\r
+ if (noisy) printf("\n\nHeuristic tree search by stepwise addition\n");\r
+ if (fout) fprintf(fout, "\n\nHeuristic tree search by stepwise addition\n");\r
+ FOR (i,ns0) { z0[i]=com.z[i]; spname0[i]=com.spname[i]; }\r
+ tree.nbranch=tree.root=com.ns=(com.clock?2:3); \r
+\r
+ FOR(i,ns0) order[i]=i;\r
+ if(randadd) {\r
+ FOR(i,ns0)\r
+ { j=(int)(ns0*rndu()); is=order[i]; order[i]=order[j]; order[j]=is; }\r
+ if(noisy) FOR(i,ns0) printf(" %d", order[i]+1);\r
+ if(fout) { \r
+ fputs("\nOrder of species addition:\n",fout); \r
+ FOR(i,ns0)fprintf(fout,"%3d %-s\n", order[i]+1,com.spname[order[i]]);\r
+ }\r
+ for(i=0; i<ns0; i++) { \r
+ com.z[i]=z0[order[i]]; \r
+ com.spname[i]=spname0[order[i]]; \r
+ }\r
+ }\r
+\r
+ for(i=0; i<tree.nbranch; i++) {\r
+ tree.branches[i][0]=com.ns; tree.branches[i][1]=i; \r
+ }\r
+ BranchToNode ();\r
+ for (is=com.ns; is<ns0; is++) { /* add the is_th species */\r
+ treestar.tree=tree; memcpy (treestar.nodes, nodes, sizetree);\r
+\r
+ for (j=0; j<treestar.tree.nbranch+(com.clock>0); j++,com.ns--) { \r
+ tree=treestar.tree; memcpy(nodes, treestar.nodes, sizetree);\r
+ com.ns++;\r
+ AddSpecies(is,j);\r
+ score=TreeScore(x, space);\r
+ if (noisy>1)\r
+ { printf("\n "); OutTreeN(F0, 0, 0); printf("%12.3f",-score); }\r
+\r
+ if (j==0 || score<bestscore || (score==bestscore&&rndu()<.2)) {\r
+ treebest.tree=tree; memcpy(treebest.nodes, nodes, sizetree);\r
+ xtoy (x, treebest.x, com.np);\r
+ bestscore=score; bestbranch=j;\r
+ }\r
+ }\r
+ tree=treebest.tree; memcpy(nodes,treebest.nodes, sizetree);\r
+ xtoy (treebest.x, x, com.np);\r
+ com.ns=is+1;\r
+\r
+ if (noisy) {\r
+ printf("\n\nAdded sp. %d, %s [%.3f]\n",is+1,com.spname[is],-bestscore);\r
+ OutTreeN(F0,0,0); FPN(F0); OutTreeN(F0,1,0); FPN(F0);\r
+ if (com.np>com.ntime) {\r
+ printf("\tparameters:"); \r
+ for(i=com.ntime; i<com.np; i++) printf("%9.5f", x[i]);\r
+ FPN(F0);\r
+ }\r
+ }\r
+ if (fout) {\r
+ fprintf(fout,"\n\nAdded sp. %d, %s [%.3f]\n",\r
+ is+1, com.spname[is], -bestscore);\r
+ OutTreeN(fout,0,0); FPN(fout);\r
+ OutTreeN(fout,1,1); FPN(fout);\r
+ if (com.np>com.ntime) {\r
+ fprintf(fout, "\tparameters:"); \r
+ for(i=com.ntime; i<com.np; i++) fprintf(fout, "%9.5f", x[i]);\r
+ FPN(fout);\r
+ }\r
+ fflush(fout);\r
+ }\r
+ }\r
+ tree.lnL=bestscore;\r
+\r
+ return (0);\r
+}\r
+\r
+\r
+int DecompTree (int inode, int ison1, int ison2);\r
+#define hdID(i,j) (max2(i,j)*(max2(i,j)-1)/2+min2(i,j))\r
+\r
+int StarDecomposition (FILE *fout, double space[])\r
+{\r
+/* automatic tree search by star decomposition, nhomo<=1\r
+ returns (0,1,2,3) for the 4s problem.\r
+*/\r
+ int status=0,stage=0, i,j, itree,ntree=0,ntreet,best=0,improve=1,collaps=0;\r
+ int inode, nson=0, ison1,ison2, son1, son2;\r
+ int sizetree=(2*com.ns-1)*sizeof(struct TREEN);\r
+ double x[NP];\r
+ FILE *ftree, *fsum=frst;\r
+\r
+ if (com.runmode==1) { /* read the star-like tree from tree file */\r
+ if ((ftree=fopen (com.treef,"r"))==NULL)\r
+ error2("no treefile");\r
+ fscanf (ftree, "%d%d", &i, &ntree);\r
+ if (ReadTreeN(ftree, &i, &j, 0, 1)) error2("err tree file");\r
+ fclose (ftree);\r
+ }\r
+ else { /* construct the star tree of ns species */\r
+ tree.nnode = (tree.nbranch=tree.root=com.ns)+1;\r
+ for (i=0; i<tree.nbranch; i++)\r
+ { tree.branches[i][0]=com.ns; tree.branches[i][1]=i; }\r
+ com.ntime = com.clock?1:tree.nbranch;\r
+ BranchToNode ();\r
+ }\r
+ if (noisy) { printf("\n\nstage 0: "); OutTreeN(F0,0,0); }\r
+ if (fsum) { fprintf(fsum,"\n\nstage 0: "); OutTreeN(fsum,0,0); }\r
+ if (fout) { fprintf(fout,"\n\nstage 0: "); OutTreeN(fout,0,0); }\r
+\r
+ tree.lnL=TreeScore(x,space);\r
+\r
+ if (noisy) printf("\nlnL:%14.6f%6d", -tree.lnL, NFunCall);\r
+ if (fsum) fprintf(fsum,"\nlnL:%14.6f%6d", -tree.lnL, NFunCall);\r
+ if (fout) {\r
+ fprintf(fout,"\nlnL(ntime:%3d np:%3d):%14.6f\n",\r
+ com.ntime, com.np, -tree.lnL);\r
+ OutTreeB (fout); FPN(fout);\r
+ FOR (i, com.np) fprintf (fout,"%9.5f", x[i]); FPN (fout);\r
+ }\r
+ treebest.tree=tree; memcpy(treebest.nodes,nodes,sizetree);\r
+ FOR (i,com.np) treebest.x[i]=x[i];\r
+ for (ntree=0,stage=1; ; stage++) {\r
+ for (inode=treebest.tree.nnode-1; inode>=0; inode--) {\r
+ nson=treebest.nodes[inode].nson;\r
+ if (nson>3) break;\r
+ if (com.clock) { if (nson>2) break; }\r
+ else if (nson>2+(inode==treebest.tree.root)) break;\r
+ }\r
+ if (inode==-1 || /*stage>com.ns-3+com.clock ||*/ !improve) { /* end */\r
+ tree=treebest.tree; memcpy (nodes, treebest.nodes, sizetree);\r
+\r
+ if (noisy) {\r
+ printf("\n\nbest tree: "); OutTreeN(F0,0,0);\r
+ printf(" lnL:%14.6f\n", -tree.lnL);\r
+ }\r
+ if (fsum) {\r
+ fprintf(fsum, "\n\nbest tree: "); OutTreeN(fsum,0,0);\r
+ fprintf(fsum, " lnL:%14.6f\n", -tree.lnL);\r
+ }\r
+ if (fout) {\r
+ fprintf(fout, "\n\nbest tree: "); OutTreeN(fout,0,0);\r
+ fprintf(fout, " lnL:%14.6f\n", -tree.lnL);\r
+ OutTreeN(fout,1,1); FPN(fout);\r
+ }\r
+ break;\r
+ }\r
+ treestar=treebest; memcpy(nodes,treestar.nodes,sizetree);\r
+\r
+ if (collaps && stage) { \r
+ printf ("\ncollapsing nodes\n");\r
+ OutTreeN(F0, 1, 1); FPN(F0);\r
+\r
+ tree=treestar.tree; memcpy(nodes, treestar.nodes, sizetree);\r
+ for (i=com.ns,j=0; i<tree.nnode; i++)\r
+ if (i!=tree.root && nodes[i].branch<1e-7) \r
+ { CollapsNode (i, treestar.x); j++; }\r
+ treestar.tree=tree; memcpy(treestar.nodes, nodes, sizetree);\r
+\r
+ if (j) { \r
+ fprintf (fout, "\n%d node(s) collapsed\n", j);\r
+ OutTreeN(fout, 1, 1); FPN(fout);\r
+ }\r
+ if (noisy) {\r
+ printf ("\n%d node(s) collapsed\n", j);\r
+ OutTreeN(F0, 1, 1); FPN(F0);\r
+/* if (j) getchar (); */\r
+ }\r
+ }\r
+\r
+ ntreet = nson*(nson-1)/2;\r
+ if (!com.clock && inode==treestar.tree.root && nson==4) ntreet=3;\r
+ com.ntime++; com.np++;\r
+\r
+ if (noisy) {\r
+ printf ("\n\nstage %d:%6d trees, ntime:%3d np:%3d\nstar tree: ",\r
+ stage, ntreet, com.ntime, com.np);\r
+ OutTreeN(F0, 0, 0);\r
+ printf (" lnL:%10.3f\n", -treestar.tree.lnL);\r
+ }\r
+ if (fsum) {\r
+ fprintf (fsum, "\n\nstage %d:%6d trees, ntime:%3d np:%3d\nstar tree: ",\r
+ stage, ntreet, com.ntime, com.np);\r
+ OutTreeN(fsum, 0, 0);\r
+ fprintf (fsum, " lnL:%10.6f\n", -treestar.tree.lnL);\r
+ }\r
+ if (fout) {\r
+ fprintf (fout,"\n\nstage %d:%6d trees\nstar tree: ", stage, ntreet);\r
+ OutTreeN(fout, 0, 0);\r
+ fprintf (fout, " lnL:%14.6f\n", -treestar.tree.lnL);\r
+ OutTreeN(fout, 1, 1); FPN (fout);\r
+ }\r
+\r
+ for (ison1=0,itree=improve=0; ison1<nson; ison1++)\r
+ for (ison2=ison1+1; ison2<nson&&itree<ntreet; ison2++,itree++,ntree++) {\r
+ DecompTree (inode, ison1, ison2);\r
+ son1=nodes[tree.nnode-1].sons[0];\r
+ son2=nodes[tree.nnode-1].sons[1];\r
+\r
+ for(i=com.np-1; i>0; i--) x[i]=treestar.x[i-1];\r
+ if (!com.clock)\r
+ for (i=0; i<tree.nbranch; i++)\r
+ x[i]=max2(nodes[tree.branches[i][1]].branch*0.99, 0.0001);\r
+ else\r
+ for (i=1,x[0]=max2(x[0],.01); i<com.ntime; i++) x[i]=.5;\r
+\r
+ if (noisy) {\r
+ printf("\nS=%d:%3d/%d T=%4d ", stage,itree+1,ntreet,ntree+1);\r
+ OutTreeN(F0, 0, 0);\r
+ }\r
+ if (fsum) {\r
+ fprintf(fsum, "\nS=%d:%3d/%d T=%4d ", stage,itree+1,ntreet,ntree+1);\r
+ OutTreeN(fsum, 0, 0);\r
+ }\r
+ if (fout) {\r
+ fprintf(fout,"\nS=%d:%4d/%4d T=%4d ",stage,itree+1,ntreet,ntree+1);\r
+ OutTreeN(fout, 0, 0);\r
+ }\r
+ tree.lnL=TreeScore(x, space);\r
+\r
+ if (tree.lnL<treebest.tree.lnL) {\r
+ treebest.tree=tree; memcpy (treebest.nodes, nodes, sizetree);\r
+ FOR(i,com.np) treebest.x[i]=x[i];\r
+ best=itree+1; improve=1;\r
+ }\r
+ if (noisy) \r
+ printf("%6d%2c %+8.6f", NFunCall,(status?'?':'X'),treestar.tree.lnL-tree.lnL);\r
+ if (fsum) {\r
+ fprintf(fsum, "%6d%2c", NFunCall, (status?'?':'X'));\r
+ for (i=com.ntime; i<com.np; i++) fprintf(fsum, "%7.3f", x[i]);\r
+ fprintf(fsum, " %+8.6f", treestar.tree.lnL-tree.lnL);\r
+ fflush(fsum);\r
+ }\r
+ if (fout) {\r
+ fprintf(fout,"\nlnL(ntime:%3d np:%3d):%14.6f\n",\r
+ com.ntime, com.np, -tree.lnL);\r
+ OutTreeB (fout); FPN(fout);\r
+ FOR (i,com.np) fprintf(fout,"%9.5f", x[i]); \r
+ FPN(fout); fflush(fout);\r
+ }\r
+ } /* for (itree) */\r
+ son1=treebest.nodes[tree.nnode-1].sons[0];\r
+ son2=treebest.nodes[tree.nnode-1].sons[1];\r
+ } /* for (stage) */\r
+\r
+ if (com.ns<=4 && !improve && best) error2("strange");\r
+\r
+ if (com.ns<=4) return (best);\r
+ else return (0);\r
+}\r
+\r
+int DecompTree (int inode, int ison1, int ison2)\r
+{\r
+/* decompose treestar at NODE inode into tree and nodes[]\r
+*/\r
+ int i, son1, son2;\r
+ int sizetree=(2*com.ns-1)*sizeof(struct TREEN);\r
+ double bt, fmid=0.001, fclock=0.0001;\r
+\r
+ tree=treestar.tree; memcpy (nodes, treestar.nodes, sizetree);\r
+ for (i=0,bt=0; i<tree.nnode; i++)\r
+ if (i!=tree.root) bt+=nodes[i].branch/tree.nbranch;\r
+\r
+ nodes[tree.nnode].nson=2;\r
+ nodes[tree.nnode].sons[0]=son1=nodes[inode].sons[ison1];\r
+ nodes[tree.nnode].sons[1]=son2=nodes[inode].sons[ison2];\r
+ nodes[tree.nnode].father=inode;\r
+ nodes[son1].father=nodes[son2].father=tree.nnode;\r
+\r
+ nodes[inode].sons[ison1]=tree.nnode;\r
+ for (i=ison2; i<nodes[inode].nson; i++)\r
+ nodes[inode].sons[i]=nodes[inode].sons[i+1];\r
+ nodes[inode].nson--;\r
+\r
+ tree.nnode++;\r
+ NodeToBranch();\r
+ if (!com.clock)\r
+ nodes[tree.nnode-1].branch=bt*fmid;\r
+ else\r
+ nodes[tree.nnode-1].age=nodes[inode].age*(1-fclock);\r
+\r
+ return(0);\r
+}\r
+\r
+\r
+#ifdef REALSEQUENCE\r
+\r
+\r
+int MultipleGenes (FILE* fout, FILE*fpair[], double space[])\r
+{\r
+/* This does the separate analysis of multiple-gene data.\r
+ Note that com.pose[] is not correct and so RateAncestor = 0 should be set\r
+ in baseml and codeml.\r
+*/\r
+ int ig=0, j, ngene0, npatt0, lgene0[NGENE], posG0[NGENE+1];\r
+ int nb = ((com.seqtype==1 && !com.cleandata) ? 3 : 1);\r
+ \r
+ if(com.ndata>1) error2("multiple data sets & multiple genes?");\r
+\r
+ ngene0=com.ngene; npatt0=com.npatt;\r
+ for(ig=0; ig<ngene0; ig++) lgene0[ig]=com.lgene[ig];\r
+ for(ig=0; ig<ngene0+1; ig++) posG0[ig]=com.posG[ig];\r
+\r
+ ig=0;\r
+/*\r
+ printf("\nStart from gene (1-%d)? ", com.ngene);\r
+ scanf("%d", &ig); \r
+ ig--;\r
+*/\r
+\r
+ for ( ; ig<ngene0; ig++) {\r
+\r
+ com.ngene=1;\r
+ com.ls=com.lgene[0]= ig==0?lgene0[0]:lgene0[ig]-lgene0[ig-1];\r
+ com.npatt = ig==ngene0-1 ? npatt0-posG0[ig] : posG0[ig+1]-posG0[ig];\r
+ com.posG[0]=0; com.posG[1]=com.npatt;\r
+ FOR (j,com.ns) com.z[j]+=posG0[ig]*nb; com.fpatt+=posG0[ig];\r
+ xtoy (com.piG[ig], com.pi, com.ncode);\r
+\r
+ printf ("\n\nGene %2d ls:%4d npatt:%4d\n",ig+1,com.ls,com.npatt);\r
+ fprintf(fout,"\nGene %2d ls:%4d npatt:%4d\n",ig+1,com.ls,com.npatt);\r
+ fprintf(frst,"\nGene %2d ls:%4d npatt:%4d\n",ig+1,com.ls,com.npatt);\r
+ fprintf(frst1,"%d\t%d\t%d",ig+1,com.ls,com.npatt);\r
+\r
+#ifdef CODEML\r
+ if(com.seqtype==CODONseq) {\r
+ DistanceMatNG86(fout,fpair[0],fpair[1],fpair[2],0);\r
+ if(com.codonf>=F1x4MG) com.pf3x4 = com.f3x4[ig];\r
+ }\r
+#else\r
+ if(com.fix_alpha)\r
+ DistanceMatNuc(fout,fpair[0],com.model,com.alpha);\r
+#endif\r
+\r
+ if (com.runmode==0) Forestry(fout);\r
+#ifdef CODEML\r
+ else if (com.runmode==-2) {\r
+ if(com.seqtype==CODONseq) PairwiseCodon(fout,fpair[3],fpair[4],fpair[5],space);\r
+ else PairwiseAA(fout,fpair[0]);\r
+ }\r
+#endif\r
+ else StepwiseAddition(fout, space);\r
+\r
+ for(j=0; j<com.ns; j++) com.z[j] -= posG0[ig]*nb;\r
+ com.fpatt -= posG0[ig];\r
+ FPN(frst1);\r
+ }\r
+ com.ngene = ngene0;\r
+ com.npatt = npatt0;\r
+ com.ls = lgene0[ngene0-1];\r
+ for(ig=0; ig<ngene0; ig++)\r
+ com.lgene[ig] = lgene0[ig];\r
+ for(ig=0; ig<ngene0+1; ig++)\r
+ com.posG[ig] = posG0[ig];\r
+ return (0);\r
+}\r
+\r
+void printSeqsMgenes (void)\r
+{\r
+/* separate sites from different partitions (genes) into different files.\r
+ called before sequences are coded.\r
+ Note that this is called before PatternWeight and so posec or posei is used\r
+ and com.pose is not yet allocated.\r
+ In case of codons, com.ls is the number of codons.\r
+*/\r
+ FILE *fseq;\r
+ char seqf[20];\r
+ int ig, lg, i,j,h;\r
+ int n31=(com.seqtype==CODONseq||com.seqtype==CODON2AAseq?3:1);\r
+\r
+ puts("Separating sites in genes into different files.\n");\r
+ for (ig=0, FPN(F0); ig<com.ngene; ig++) {\r
+ for (h=0,lg=0; h<com.ls; h++)\r
+ if(com.pose[h]==ig)\r
+ lg++;\r
+ sprintf(seqf, "Gene%d.seq", ig+1);\r
+ if((fseq=fopen(seqf,"w"))==NULL) error2("file creation err.");\r
+ printf("%d sites in gene %d go to file %s\n", lg, ig+1,seqf);\r
+\r
+ fprintf (fseq, "%8d%8d\n", com.ns, lg*n31);\r
+ for (j=0; j<com.ns; j++) {\r
+\r
+ /* fprintf(fseq,"*\n>\n%s\n", com.spname[j]); */\r
+ fprintf(fseq,"%-20s ", com.spname[j]);\r
+ if (n31==1) { /* nucleotide or aa sequences */\r
+ FOR (h,com.ls)\r
+ if(com.pose[h]==ig)\r
+ fprintf(fseq, "%c", com.z[j][h]);\r
+ }\r
+ else { /* codon sequences */\r
+ FOR (h,com.ls)\r
+ if(com.pose[h]==ig) {\r
+ FOR (i,3) fprintf(fseq,"%c", com.z[j][h*3+i]);\r
+ fputc(' ',fseq);\r
+ }\r
+ }\r
+ FPN(fseq);\r
+ }\r
+ fclose (fseq);\r
+ }\r
+ return ;\r
+}\r
+\r
+void printSeqsMgenes2 (void)\r
+{\r
+/* This print sites from certain genes into one file.\r
+ called before sequences are coded.\r
+ In case of codons, com.ls is the number of codons.\r
+*/\r
+ FILE *fseq;\r
+ char seqf[20]="newseqs";\r
+ int ig, lg, i,j,h;\r
+ int n31=(com.seqtype==CODONseq||com.seqtype==CODON2AAseq?3:1);\r
+ \r
+ int ngenekept=0;\r
+ char *genenames[44]={"atpa", "atpb", "atpe", "atpf", "atph", "petb", "petg", "psaa",\r
+"psab", "psac", "psaj", "psba", "psbb", "psbc", "psbd", "psbe",\r
+"psbf", "psbh", "psbi", "psbj", "psbk", "psbl", "psbn", "psbt",\r
+"rl14", "rl16", "rl2", "rl20", "rl36", "rpob", "rpoc", "rpod", "rs11",\r
+"rs12", "rs14", "rs18", "rs19", "rs2", "rs3", "rs4", "rs7", "rs8",\r
+"ycf4", "ycf9"};\r
+ int wantgene[44]={0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\r
+ 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \r
+ 0, 0, 0, 0};\r
+/*\r
+for(ig=0,lg=0; ig<com.ngene; ig++) wantgene[ig]=!wantgene[ig];\r
+*/\r
+\r
+ if(com.ngene!=44) error2("ngene!=44");\r
+ FOR(h,com.ls) { \r
+ printf("%3d",com.pose[h]); \r
+ if((h+1)%20==0) FPN(F0); if((h+1)%500==0) getchar();\r
+ }\r
+ matIout(F0,com.lgene,1,com.ngene);\r
+ matIout(F0,wantgene,1,com.ngene);\r
+\r
+ for(ig=0,lg=0; ig<com.ngene; ig++) \r
+ if(wantgene[ig]) { ngenekept++; lg+=com.lgene[ig]; }\r
+\r
+ if((fseq=fopen(seqf,"w"))==NULL) error2("file creation err.");\r
+ fprintf(fseq,"%4d %4d G\nG %d ", com.ns, lg*n31, ngenekept);\r
+ FOR(ig,com.ngene) if(wantgene[ig]) fprintf(fseq," %3d", com.lgene[ig]);\r
+ FPN(fseq);\r
+\r
+ for (j=0; j<com.ns; FPN(fseq),j++) {\r
+ fprintf(fseq,"%-20s ", com.spname[j]);\r
+ if (n31==1) { /* nucleotide or aa sequences */\r
+ FOR (h,com.ls) \r
+ if(wantgene[ig=com.pose[h]]) fprintf(fseq,"%c",com.z[j][h]);\r
+ }\r
+ else { /* codon sequences */\r
+ FOR (h,com.ls)\r
+ if (wantgene[ig=com.pose[h]]) {\r
+ FOR (i,3) fprintf(fseq,"%c", com.z[j][h*3+i]);\r
+ fputc(' ', fseq);\r
+ }\r
+ }\r
+ }\r
+ FPN(fseq); \r
+ FOR(ig,com.ngene) if(wantgene[ig]) fprintf(fseq," %s", genenames[ig]);\r
+ FPN(fseq);\r
+ fclose (fseq);\r
+\r
+ exit(0);\r
+}\r
+\r
+#endif /* ifdef REALSEQUENCE */\r
+#endif /* ifdef TREESEARCH */\r
+#endif /* ifdef NODESTRUCTURE */\r
+\r
+\r
+\r
+#ifdef PARSIMONY\r
+\r
+void UpPassScoreOnly (int inode);\r
+void UpPassScoreOnlyB (int inode);\r
+\r
+static int *Nsteps, *chUB; /* MM */\r
+static char *Kspace, *chU, *NchU; \r
+/* Elements of chU are character states (there are NchU of them). This \r
+ representation is used to speed up calculation for large trees.\r
+ Bit operations on chUB are performed for binary trees\r
+*/\r
+\r
+void UpPassScoreOnly (int inode)\r
+{\r
+/* => VU, VL, & MM, theorem 2 */\r
+ int ison, i, j;\r
+ char *K=Kspace, maxK; /* chMark (VV) not used in up pass */\r
+\r
+ FOR (i,nodes[inode].nson)\r
+ if (nodes[nodes[inode].sons[i]].nson>0)\r
+ UpPassScoreOnly (nodes[inode].sons[i]);\r
+\r
+ FOR (i,com.ncode) K[i]=0;\r
+ FOR (i,nodes[inode].nson) \r
+ for (j=0,ison=nodes[inode].sons[i]; j<NchU[ison]; j++)\r
+ K[(int)chU[ison*com.ncode+j]]++;\r
+ for (i=0,maxK=0; i<com.ncode; i++) if (K[i]>maxK) maxK=K[i];\r
+ for (i=0,NchU[inode]=0; i<com.ncode; i++)\r
+ if (K[i]==maxK) chU[inode*com.ncode+NchU[inode]++]=(char)i;\r
+ Nsteps[inode]=nodes[inode].nson-maxK;\r
+ FOR (i, nodes[inode].nson) Nsteps[inode]+=Nsteps[nodes[inode].sons[i]];\r
+}\r
+\r
+void UpPassScoreOnlyB (int inode)\r
+{\r
+/* uses bit operation, for binary trees only \r
+*/\r
+ int ison1,ison2, i, change=0;\r
+\r
+ FOR (i,nodes[inode].nson)\r
+ if (nodes[nodes[inode].sons[i]].nson>0)\r
+ UpPassScoreOnlyB (nodes[inode].sons[i]);\r
+\r
+ ison1=nodes[inode].sons[0]; ison2=nodes[inode].sons[1];\r
+ if ((chUB[inode]=(chUB[ison1] & chUB[ison2]))==0)\r
+ { chUB[inode]=(chUB[ison1] | chUB[ison2]); change=1; }\r
+ Nsteps[inode]=change+Nsteps[ison1]+Nsteps[ison2];\r
+}\r
+\r
+\r
+double MPScore (double space[])\r
+{\r
+/* calculates MP score for a given tree using Hartigan's (1973) algorithm.\r
+ sizeof(space) = nnode*sizeof(int)+(nnode+2)*ncode*sizeof(char).\r
+ Uses Nsteps[nnode], chU[nnode*ncode], NchU[nnode].\r
+ if(BitOperation), bit operations are used on binary trees.\r
+*/\r
+ int h,i, BitOperation,U[3],change;\r
+ double score;\r
+\r
+ Nsteps=(int*)space;\r
+ BitOperation=(tree.nnode==2*com.ns-1 - (nodes[tree.root].nson==3));\r
+ BitOperation=(BitOperation&&com.ncode<32);\r
+ if (BitOperation) chUB=Nsteps+tree.nnode;\r
+ else {\r
+ chU=(char*)(Nsteps+tree.nnode);\r
+ NchU=chU+tree.nnode*com.ncode; Kspace=NchU+tree.nnode;\r
+ }\r
+ for (h=0,score=0; h<com.npatt; h++) {\r
+ FOR (i,tree.nnode) Nsteps[i]=0;\r
+ if (BitOperation) { \r
+ FOR (i,com.ns) chUB[i]=1<<(com.z[i][h]);\r
+ UpPassScoreOnlyB (tree.root);\r
+ if (nodes[tree.root].nson>2) {\r
+ FOR (i,3) U[i]=chUB[nodes[tree.root].sons[i]];\r
+ change=2;\r
+ if (U[0]&U[1]&U[2]) change=0;\r
+ else if (U[0]&U[1] || U[1]&U[2] || U[0]&U[2]) change=1;\r
+ for (i=0,Nsteps[tree.root]=change; i<3; i++) \r
+ Nsteps[tree.root]+=Nsteps[nodes[tree.root].sons[i]];\r
+ }\r
+ }\r
+ else { /* polytomies, use characters */\r
+ FOR(i,com.ns)\r
+ {chU[i*com.ncode]=(char)(com.z[i][h]); NchU[i]=(char)1; }\r
+ for (i=com.ns; i<tree.nnode; i++) NchU[i]=0;\r
+ UpPassScoreOnly (tree.root);\r
+ }\r
+ score+=Nsteps[tree.root]*com.fpatt[h];\r
+/*\r
+printf("\nh %3d: ", h+1);\r
+FOR(i,com.ns) printf("%2d ", com.z[i][h]);\r
+printf(" %6d ", Nsteps[tree.root]);\r
+if((h+1)%10==0) exit(1);\r
+*/\r
+ }\r
+\r
+ return (score);\r
+}\r
+\r
+double RemoveMPNinfSites (double *nsiteNinf)\r
+{\r
+/* Removes parsimony-noninformative sites and return the number of changes \r
+ at those sites.\r
+ Changes .z[], .fpatt[], .npatt, etc.\r
+*/\r
+ int h,j, it, npatt0=com.npatt, markb[NCODE], gt2;\r
+ double MPScoreNinf;\r
+\r
+ for (h=0,com.npatt=0,MPScoreNinf=0,*nsiteNinf=0; h<npatt0; h++) {\r
+ FOR (j, com.ncode) markb[j]=0;\r
+ FOR (j, com.ns) markb[(int)com.z[j][h]]++;\r
+ for (j=0,it=gt2=0; j<com.ncode; j++)\r
+ if (markb[j]>=2) { it++; gt2=1; }\r
+ if (it<2) { /* non-informative */\r
+ *nsiteNinf+=com.fpatt[h];\r
+ FOR (j,com.ncode) if(markb[j]==1) MPScoreNinf+=com.fpatt[h];\r
+ if (!gt2) MPScoreNinf-=com.fpatt[h];\r
+ }\r
+ else {\r
+ FOR (j, com.ns) com.z[j][com.npatt]=com.z[j][h];\r
+ com.fpatt[com.npatt++]=com.fpatt[h];\r
+ }\r
+ }\r
+ return (MPScoreNinf);\r
+}\r
+\r
+#endif\r
+\r
+\r
+#ifdef RECONSTRUCTION\r
+\r
+static char *chMark, *chMarkU, *chMarkL; /* VV, VU, VL */\r
+/* chMark, chMarkU, chMarkL (VV, VU, VL) have elements 0 or 1, marking\r
+ whether the character state is present in the set */\r
+static char *PATHWay, *NCharaCur, *ICharaCur, *CharaCur;\r
+/* PATHWay, NCharaCur, ICharaCur, CharaCur are for the current \r
+ reconstruction. \r
+*/\r
+\r
+int UpPass (int inode);\r
+int DownPass (int inode);\r
+\r
+int UpPass (int inode)\r
+{\r
+/* => VU, VL, & MM, theorem 2 */\r
+ int n=com.ncode, i, j;\r
+ char *K=chMark, maxK; /* chMark (VV) not used in up pass */\r
+\r
+ FOR (i,nodes[inode].nson)\r
+ if (nodes[nodes[inode].sons[i]].nson>0) UpPass (nodes[inode].sons[i]);\r
+\r
+ FOR (i, n) K[i]=0;\r
+ FOR (i,nodes[inode].nson) \r
+ FOR (j, n) if(chMarkU[nodes[inode].sons[i]*n+j]) K[j]++;\r
+ for (i=0,maxK=0; i<n; i++) if (K[i]>maxK) maxK=K[i];\r
+ for (i=0; i<n; i++) {\r
+ if (K[i]==maxK) chMarkU[inode*n+i]=1; \r
+ else if (K[i]==maxK-1) chMarkL[inode*n+i]=1;\r
+ }\r
+ Nsteps[inode]=nodes[inode].nson-maxK;\r
+ FOR (i, nodes[inode].nson) Nsteps[inode]+=Nsteps[nodes[inode].sons[i]];\r
+ return (0);\r
+}\r
+\r
+int DownPass (int inode)\r
+{\r
+/* VU, VL => VV, theorem 3 */\r
+ int n=com.ncode, i, j, ison;\r
+\r
+ FOR (i,nodes[inode].nson) {\r
+ ison=nodes[inode].sons[i];\r
+ FOR (j,n) if (chMark[inode*n+j]>chMarkU[ison*n+j]) break;\r
+ if (j==n) \r
+ FOR (j,n) chMark[ison*n+j]=chMark[inode*n+j];\r
+ else \r
+ FOR (j,n)\r
+ chMark[ison*n+j] = \r
+ (char)(chMarkU[ison*n+j]||(chMark[inode*n+j]&&chMarkL[ison*n+j]));\r
+ }\r
+ FOR (i,nodes[inode].nson)\r
+ if (nodes[nodes[inode].sons[i]].nson>0) DownPass (nodes[inode].sons[i]);\r
+ return (0);\r
+}\r
+\r
+\r
+int DownStates (int inode)\r
+{\r
+/* VU, VL => NCharaCur, CharaCur, theorem 4 */\r
+ int i;\r
+\r
+ FOR (i,nodes[inode].nson) \r
+ if (nodes[inode].sons[i]>=com.ns) \r
+ DownStatesOneNode (nodes[inode].sons[i], inode);\r
+ return (0);\r
+}\r
+\r
+int DownStatesOneNode (int ison, int father)\r
+{\r
+/* States down inode, given father */\r
+ char chi=PATHWay[father-com.ns];\r
+ int n=com.ncode, j, in;\r
+\r
+ if((in=ison-com.ns)<0) return (0);\r
+ if (chMarkU[ison*n+chi]) {\r
+ NCharaCur[in]=1; CharaCur[in*n+0]=chi;\r
+ }\r
+ else if (chMarkL[ison*n+chi]) {\r
+ for (j=0,NCharaCur[in]=0; j<n; j++) \r
+ if (chMarkU[ison*n+j] || j==chi) CharaCur[in*n+NCharaCur[in]++]=(char)j;\r
+ }\r
+ else {\r
+ for (j=0,NCharaCur[in]=0; j<n; j++) \r
+ if (chMarkU[ison*n+j]) CharaCur[in*n+NCharaCur[in]++]=(char)j;\r
+ }\r
+ PATHWay[in]=CharaCur[in*n+(ICharaCur[in]=0)];\r
+ FOR (j, nodes[ison].nson) if (nodes[ison].sons[j]>=com.ns) break;\r
+ if (j<nodes[ison].nson) DownStates (ison);\r
+\r
+ return (0);\r
+}\r
+\r
+int InteriorStatesMP (int job, int h, int *nchange, char NChara[NS-1], \r
+ char Chara[(NS-1)*NCODE], double space[]);\r
+\r
+int InteriorStatesMP (int job, int h, int *nchange, char NChara[NS-1], \r
+ char Chara[(NS-1)*NCODE], double space[])\r
+{\r
+/* sizeof(space) = nnode*sizeof(int)+3*nnode*ncode*sizeof(char)\r
+ job: 0=# of changes; 1:equivocal states\r
+*/\r
+ int n=com.ncode, i,j;\r
+\r
+ Nsteps=(int*)space; chMark=(char*)(Nsteps+tree.nnode);\r
+ chMarkU=chMark+tree.nnode*n; chMarkL=chMarkU+tree.nnode*n;\r
+ FOR (i,tree.nnode) Nsteps[i]=0;\r
+ FOR (i,3*n*tree.nnode) chMark[i]=0;\r
+ FOR (i,com.ns) chMark[i*n+com.z[i][h]]=chMarkU[i*n+com.z[i][h]]=1;\r
+ UpPass (tree.root);\r
+ *nchange=Nsteps[tree.root];\r
+ if (job==0) return (0);\r
+ FOR (i,n) chMark[tree.root*n+i]=chMarkU[tree.root*n+i];\r
+ DownPass (tree.root);\r
+ FOR (i,tree.nnode-com.ns) \r
+ for (j=0,NChara[i]=0; j<n; j++) \r
+ if (chMark[(i+com.ns)*n+j]) Chara[i*n+NChara[i]++]=(char)j;\r
+ return (0);\r
+}\r
+\r
+\r
+int PathwayMP (FILE *fout, double space[])\r
+{\r
+/* Hartigan, JA. 1973. Minimum mutation fits to a given tree. \r
+ Biometrics, 29:53-65.\r
+*/\r
+ char *pch=(com.seqtype==0?BASEs:AAs), visit[NS-1];\r
+ int n=com.ncode, nid=tree.nbranch-com.ns+1, it, i,j,k, h, npath;\r
+ int nchange, nchange0;\r
+ char nodeb[NNODE], Equivoc[NS-1];\r
+\r
+ PATHWay=(char*)malloc(nid*(n+3)*sizeof(char));\r
+ NCharaCur=PATHWay+nid; ICharaCur=NCharaCur+nid; CharaCur=ICharaCur+nid;\r
+\r
+ for (j=0,visit[i=0]=(char)(tree.root-com.ns); j<tree.nbranch; j++) \r
+ if (tree.branches[j][1]>=com.ns) \r
+ visit[++i]=(char)(tree.branches[j][1]-com.ns);\r
+/*\r
+ printf ("\nOrder in nodes: ");\r
+ FOR (j, nid) printf ("%4d", visit[j]+1+com.ns); FPN(F0);\r
+*/\r
+ for (h=0; h<com.npatt; h++) {\r
+ fprintf (fout, "\n%4d%6.0f ", h+1, com.fpatt[h]);\r
+ FOR (j, com.ns) fprintf (fout, "%c", pch[(int)com.z[j][h]]);\r
+ fprintf (fout, ": ");\r
+\r
+ FOR (j,com.ns) nodeb[j]=(char)(com.z[j][h]);\r
+\r
+ InteriorStatesMP (1, h, &nchange, NCharaCur, CharaCur, space); \r
+ ICharaCur[j=tree.root-com.ns]=0; PATHWay[j]=CharaCur[j*n+0];\r
+ FOR (j,nid) Equivoc[j]=(char)(NCharaCur[j]>1);\r
+ DownStates (tree.root);\r
+\r
+ for (npath=0; ;) {\r
+ for (j=0,k=visit[nid-1]; j<NCharaCur[k]; j++) {\r
+ PATHWay[k]=CharaCur[k*n+j]; npath++; \r
+ FOR (i, nid) fprintf (fout, "%c", pch[(int)PATHWay[i]]);\r
+ fprintf (fout, " ");\r
+\r
+ FOR (i,nid) nodeb[i+com.ns]=PATHWay[i];\r
+ for (i=0,nchange0=0; i<tree.nbranch; i++) \r
+ nchange0+=(nodeb[tree.branches[i][0]]!=nodeb[tree.branches[i][1]]);\r
+ if (nchange0!=nchange) \r
+ { puts("\a\nerr:PathwayMP"); fprintf(fout,".%d. ", nchange0);}\r
+\r
+ }\r
+ for (j=nid-2; j>=0; j--) {\r
+ if(Equivoc[k=visit[j]] == 0) continue;\r
+ if (ICharaCur[k]+1<NCharaCur[k]) {\r
+ PATHWay[k] = CharaCur[k*n + (++ICharaCur[k])];\r
+ DownStates (k+com.ns);\r
+ break;\r
+ }\r
+ else { /* if (next equivocal node is not ancestor) update node k */\r
+ for (i=j-1; i>=0; i--) if (Equivoc[(int)visit[i]]) break;\r
+ if (i>=0) { \r
+ for (it=k+com.ns,i=visit[i]+com.ns; ; it=nodes[it].father)\r
+ if (it==tree.root || nodes[it].father==i) break;\r
+ if (it==tree.root)\r
+ DownStatesOneNode(k+com.ns, nodes[k+com.ns].father);\r
+ }\r
+ }\r
+ }\r
+ if (j<0) break;\r
+ }\r
+ fprintf (fout, " |%4d (%d)", npath, nchange);\r
+ } /* for (h) */\r
+ free (PATHWay);\r
+ return (0);\r
+}\r
+\r
+#endif\r
+\r
+\r
+\r
+#if(BASEML || CODEML)\r
+\r
+\r
+int BootstrapSeq (char* seqf)\r
+{\r
+/* This is called from within ReadSeq(), right after the sequences are read \r
+ and before the data are coded.\r
+ jackknife if(lsb<com.ls && com.ngene==1).\r
+ gmark[start+19] marks the position of the 19th site in that gene.\r
+*/\r
+ int iboot,nboot=com.bootstrap, h, is, ig, lg[NGENE]={0}, j, start;\r
+ int lsb=com.ls, n31=1,gap=10, gpos[NGENE];\r
+ int *sites=(int*)malloc(com.ls*sizeof(int)), *gmark=NULL;\r
+ FILE *fseq=(FILE*)gfopen(seqf, "w");\r
+ enum {PAML=0, PAUP};\r
+ char *datatype = (com.seqtype==AAseq?"protein":"dna");\r
+ char *paupstart="paupstart", *paupblock="paupblock", *paupend="paupend";\r
+ int format=0; /* 0: paml-phylip; 1:paup-nexus */\r
+\r
+ if(com.readpattern) error2("work on bootstrapping pattern data.");\r
+\r
+ printf("\nGenerating bootstrap samples in file %s\n", seqf);\r
+ if(format==PAUP) {\r
+ printf("%s, %s, & %s will be appended if existent.\n",\r
+ paupstart,paupblock,paupend);\r
+ appendfile(fseq, paupstart);\r
+ }\r
+\r
+ if(com.seqtype==CODONseq||com.seqtype==CODON2AAseq) { n31=3; gap=1; }\r
+ if(sites==NULL) error2("oom in BootstrapSeq");\r
+ if(com.ngene>1) {\r
+ if(lsb<com.ls) error2("jackknife when #gene>1");\r
+ if((gmark=(int*)malloc(com.ls*sizeof(int)))==NULL) \r
+ error2("oom in BootstrapSeq");\r
+\r
+ for(ig=0; ig<com.ngene; ig++) com.lgene[ig] = gpos[ig] = 0;\r
+ for(h=0; h<com.ls; h++) com.lgene[com.pose[h]]++;\r
+ for(j=0; j<com.ngene; j++) lg[j] = com.lgene[j];\r
+ for(j=1; j<com.ngene; j++) com.lgene[j] += com.lgene[j-1];\r
+\r
+ if(noisy && com.ngene>1) {\r
+ printf("Bootstrap uses stratefied sampling for %d partitions.", com.ngene);\r
+ printf("\nnumber of sites in each partition: ");\r
+ for(ig=0; ig<com.ngene; ig++) printf(" %4d", lg[ig]);\r
+ FPN(F0);\r
+ }\r
+\r
+ for(h=0; h<com.ls; h++) { /* create gmark[] */\r
+ ig = com.pose[h];\r
+ start = (ig==0 ? 0 : com.lgene[ig-1]);\r
+ gmark[start + gpos[ig]++] = h;\r
+ }\r
+ }\r
+\r
+ for (iboot=0; iboot<nboot; iboot++,FPN(fseq)) {\r
+ if(com.ngene<=1)\r
+ for(h=0; h<lsb; h++) sites[h] = (int)(rndu()*com.ls);\r
+ else {\r
+ for(ig=0; ig<com.ngene; ig++) {\r
+ start = (ig==0 ? 0 : com.lgene[ig-1]);\r
+ for(h=0; h<lg[ig]; h++)\r
+ sites[start+h] = gmark[start+(int)(rndu()*lg[ig])];\r
+ }\r
+ }\r
+\r
+ /* print out the bootstrap sample */\r
+ if(format==PAUP) {\r
+ fprintf(fseq,"\n\n[Replicate # %d]\n", iboot+1);\r
+ fprintf(fseq,"\nbegin data;\n");\r
+ fprintf(fseq," dimensions ntax=%d nchar=%d;\n", com.ns, lsb*n31);\r
+ fprintf(fseq," format datatype=%s missing=? gap=-;\n matrix\n", datatype);\r
+\r
+ for(is=0;is<com.ns;is++,FPN(fseq)) {\r
+ fprintf(fseq,"%-20s ", com.spname[is]);\r
+ for(h=0; h<lsb; h++) {\r
+ for(j=0; j<n31; j++) fprintf(fseq,"%c", com.z[is][sites[h]*n31+j]);\r
+ if((h+1)%gap==0) fprintf(fseq," ");\r
+ }\r
+ }\r
+\r
+ fprintf(fseq, " ;\nend;");\r
+ /* site partitions */\r
+ if(com.ngene>1) {\r
+ fprintf(fseq, "\n\nbegin paup;\n");\r
+ for(ig=0; ig<com.ngene; ig++)\r
+ fprintf(fseq, " charset partition%-2d = %-4d - %-4d;\n", \r
+ ig+1, (ig==0 ? 1 : com.lgene[ig-1]+1), com.lgene[ig]);\r
+ fprintf(fseq, "end;\n");\r
+ }\r
+ appendfile(fseq, paupblock);\r
+ }\r
+ else {\r
+ if(com.ngene==1) \r
+ fprintf(fseq,"%6d %6d\n", com.ns, lsb*n31);\r
+ else {\r
+ fprintf(fseq,"%6d %6d G\nG %d ", com.ns, lsb*n31, com.ngene);\r
+ for(ig=0; ig<com.ngene; ig++)\r
+ fprintf(fseq," %4d", lg[ig]);\r
+ fprintf(fseq,"\n\n");\r
+ }\r
+ for(is=0; is<com.ns; is++,FPN(fseq)) {\r
+ fprintf(fseq,"%-20s ", com.spname[is]);\r
+ for(h=0; h<lsb; h++) {\r
+ for(j=0; j<n31; j++)\r
+ fprintf(fseq,"%c", com.z[is][sites[h]*n31+j]);\r
+ if((h+1)%gap==0) fprintf(fseq," ");\r
+ }\r
+ }\r
+ }\r
+\r
+ if(noisy && (iboot+1)%10==0) printf("\rdid sample #%d", iboot+1);\r
+ } /* for(iboot) */\r
+ free(sites); if(com.ngene>1) free(gmark);\r
+ fclose(fseq);\r
+ return(0);\r
+}\r
+\r
+\r
+\r
+int rell (FILE*flnf, FILE*fout, int ntree)\r
+{\r
+/* This implements three methods for tree topology comparison. The first \r
+ tests the log likelihood difference using a normal approximation \r
+ (Kishino and Hasegawa 1989). The second does approximate bootstrap sampling\r
+ (the RELL method, Kishino and Hasegawa 1989, 1993). The third is a \r
+ modification of the K-H test with a correction for multiple comparison \r
+ (Shimodaira and Hasegawa 1999) .\r
+ The routine reads input from the file lnf.\r
+\r
+ fpattB[npatt] stores the counts of site patterns in the bootstrap sample, \r
+ with sitelist[ls] listing sites by gene, for stratefied sampling. \r
+ \r
+ com.space[ntree*(npatt+nr+5)]: \r
+ lnf[ntree*npatt] lnL0[ntree] lnL[ntree*nr] pRELL[ntree] pSH[ntree] vdl[ntree]\r
+ btrees[ntree]\r
+*/\r
+ char *line, timestr[64];\r
+ int nr=(com.ls<100000?10000:(com.ls<10000?5000:500));\r
+ int lline=16000, ntree0,ns0=com.ns, ls0,npatt0;\r
+ int itree, h,ir,j,k, ig, mltree, nbtree, *btrees, status=0;\r
+ int *sitelist, *fpattB, *lgeneB, *psitelist;\r
+ double *lnf, *lnL0, *lnL, *pRELL, *lnLmSH, *pSH, *vdl, y, mdl, small=1e-5;\r
+ size_t s;\r
+\r
+ fflush(fout);\r
+ puts( "\nTree comparisons (Kishino & Hasegawa 1989; Shimodaira & Hasegawa 1999)");\r
+ fputs("\nTree comparisons (Kishino & Hasegawa 1989; Shimodaira & Hasegawa 1999)\n",fout);\r
+ fprintf(fout,"Number of replicates: %d\n", nr);\r
+\r
+ fscanf(flnf,"%d%d%d", &ntree0, &ls0, & npatt0);\r
+ if(ntree0!=-1 && ntree0!=ntree) error2("rell: input data file strange. Check.");\r
+ if (ls0!=com.ls || npatt0!=com.npatt)\r
+ error2("rell: input data file incorrect.");\r
+ s = ntree*(com.npatt+nr+5)*sizeof(double);\r
+ if(com.sspace < s) {\r
+ if(noisy) printf("resetting space to %lu bytes in rell.\n",s);\r
+ com.sspace = s;\r
+ if((com.space=(double*)realloc(com.space,com.sspace))==NULL)\r
+ error2("oom space");\r
+ }\r
+ lnf=com.space; lnL0=lnf+ntree*com.npatt; lnL=lnL0+ntree; pRELL=lnL+ntree*nr;\r
+ pSH=pRELL+ntree; vdl=pSH+ntree; btrees=(int*)(vdl+ntree);\r
+ fpattB=(int*)malloc((com.npatt+com.ls+com.ngene)*sizeof(int));\r
+ if(fpattB==NULL) error2("oom fpattB in rell.");\r
+ sitelist=fpattB+com.npatt; lgeneB=sitelist+com.ls;\r
+\r
+ lline = (com.seqtype==1 ? ns0*8 : ns0) + 100;\r
+ lline = max2(16000, lline);\r
+ if((line=(char*)malloc((lline+1)*sizeof(char)))==NULL) error2("oom rell");\r
+\r
+ /* read lnf from file flnf, calculates lnL0[] & find ML tree */\r
+ for(itree=0,mltree=0; itree<ntree; itree++) {\r
+ printf("\r\tReading lnf for tree # %d", itree+1);\r
+ fscanf(flnf, "%d", &j);\r
+ if(j != itree+1) \r
+ { printf("\nerr: lnf, reading tree %d.",itree+1); return(-1); }\r
+ for(h=0,lnL0[itree]=0; h<com.npatt; h++) {\r
+ fscanf (flnf, "%d%d%lf", &j, &k, &y);\r
+ if(j!=h+1)\r
+ { printf("\nlnf, patt %d.",h+1); return(-1); }\r
+ fgets(line,lline,flnf);\r
+ lnL0[itree]+=com.fpatt[h]*(lnf[itree*com.npatt+h]=y);\r
+ }\r
+ if(itree && lnL0[itree]>lnL0[mltree]) mltree=itree;\r
+ }\r
+ printf(", done.\n");\r
+ free(line);\r
+\r
+ /* calculates SEs (vdl) by sitewise comparison */\r
+\r
+ printtime(timestr);\r
+ printf("\r\tCalculating SEs by sitewise comparison");\r
+ FOR(itree,ntree) {\r
+ if(itree==mltree) { vdl[itree]=0; continue; }\r
+ mdl=(lnL0[itree]-lnL0[mltree])/com.ls;\r
+ for(h=0,vdl[itree]=0; h<com.npatt; h++) {\r
+ y=lnf[itree*com.npatt+h]-lnf[mltree*com.npatt+h];\r
+ vdl[itree]+=com.fpatt[h]*(y-mdl)*(y-mdl);\r
+ }\r
+ vdl[itree]=sqrt(vdl[itree]);\r
+ }\r
+ printf(", %s\n", printtime(timestr));\r
+\r
+ /* bootstrap resampling */\r
+ for(ig=0; ig<com.ngene; ig++)\r
+ lgeneB[ig]=(ig?com.lgene[ig]-com.lgene[ig-1]:com.lgene[ig]);\r
+ for(h=0,k=0;h<com.npatt;h++) \r
+ FOR(j,(int)com.fpatt[h]) sitelist[k++]=h;\r
+\r
+ zero(pRELL,ntree); zero(pSH,ntree); zero(lnL,ntree*nr);\r
+ for(ir=0; ir<nr; ir++) {\r
+ for(h=0; h<com.npatt; h++) fpattB[h]=0;\r
+ for(ig=0,psitelist=sitelist; ig<com.ngene; psitelist+=lgeneB[ig++]) {\r
+ for(k=0; k<lgeneB[ig]; k++) {\r
+ j=(int)(lgeneB[ig]*rndu());\r
+ h=psitelist[j];\r
+ fpattB[h]++;\r
+ }\r
+ }\r
+ for(h=0; h<com.npatt; h++) {\r
+ if(fpattB[h])\r
+ for(itree=0; itree<ntree; itree++) \r
+ lnL[itree*nr+ir] += fpattB[h]*lnf[itree*com.npatt+h];\r
+ }\r
+ \r
+ /* y is the lnL for the best tree from replicate ir. */\r
+ for(j=1,nbtree=1,btrees[0]=0,y=lnL[ir]; j<ntree; j++) {\r
+ if(fabs(lnL[j*nr+ir]-y)<small) \r
+ btrees[nbtree++]=j;\r
+ else if (lnL[j*nr+ir]>y)\r
+ { nbtree=1; btrees[0]=j; y=lnL[j*nr+ir]; }\r
+ }\r
+\r
+ for(j=0; j<nbtree; j++) \r
+ pRELL[btrees[j]]+=1./(nr*nbtree);\r
+ if(nr>100 && (ir+1)%(nr/100)==0) \r
+ printf("\r\tRELL Bootstrapping.. replicate: %6d / %d %s",ir+1,nr, printtime(timestr));\r
+\r
+ }\r
+ free(fpattB);\r
+\r
+ if(fabs(1-sum(pRELL,ntree))>1e-6) error2("sum pRELL != 1.");\r
+\r
+ /* Shimodaira & Hasegawa correction (1999), working on lnL[ntree*nr] */\r
+ printf("\nnow doing S-H test");\r
+ if((lnLmSH=(double*)malloc(nr*sizeof(double))) == NULL) error2("oom in rell");\r
+ for(j=0; j<ntree; j++) /* step 3: centering */\r
+ for(ir=0,y=sum(lnL+j*nr,nr)/nr; ir<nr; ir++) lnL[j*nr+ir] -= y;\r
+ for(ir=0; ir<nr; ir++) {\r
+ for(j=1,lnLmSH[ir]=lnL[ir]; j<ntree; j++) \r
+ if(lnL[j*nr+ir]>lnLmSH[ir]) lnLmSH[ir] = lnL[j*nr+ir];\r
+ }\r
+ for(itree=0; itree<ntree; itree++) { /* steps 4 & 5 */\r
+ for(ir=0; ir<nr; ir++)\r
+ if(lnLmSH[ir]-lnL[itree*nr+ir] > lnL0[mltree]-lnL0[itree]) \r
+ pSH[itree] += 1./nr;\r
+ }\r
+\r
+ fprintf(fout,"\n%6s %12s %9s %9s%8s%10s%9s\n\n",\r
+ "tree","li","Dli"," +- SE","pKH","pSH","pRELL");\r
+ FOR(j,ntree) {\r
+ mdl=lnL0[j]-lnL0[mltree]; \r
+ if(j==mltree || fabs(vdl[j])<1e-6) { y=-1; pSH[j]=-1; status=-1; }\r
+ else y=1-CDFNormal(-mdl/vdl[j]);\r
+ fprintf(fout,"%6d%c%12.3f %9.3f %9.3f%8.3f%10.3f%9.3f\n",\r
+ j+1,(j==mltree?'*':' '),lnL0[j],mdl,vdl[j],y,pSH[j],pRELL[j]);\r
+ }\r
+\r
+fprintf(frst1,"%3d %12.6f",mltree+1, lnL0[mltree]);\r
+for(j=0;j<ntree;j++) fprintf(frst1," %5.3f",pRELL[j]);\r
+/*\r
+for(j=0;j<ntree;j++) if(j!=mltree) fprintf(frst1,"%9.6f",pSH[j]);\r
+*/\r
+\r
+ fputs("\npKH: P value for KH normal test (Kishino & Hasegawa 1989)\n",fout);\r
+ fputs("pRELL: RELL bootstrap proportions (Kishino & Hasegawa 1989)\n",fout);\r
+ fputs("pSH: P value with multiple-comparison correction (MC in table 1 of Shimodaira & Hasegawa 1999)\n",fout);\r
+ if(status) fputs("(-1 for P values means N/A)\n",fout);\r
+\r
+ FPN(F0);\r
+ free(lnLmSH);\r
+ return(0);\r
+}\r
+\r
+#endif\r
+\r
+\r
+\r
+\r
+#ifdef LFUNCTIONS\r
+#ifdef RECONSTRUCTION\r
+\r
+\r
+void ListAncestSeq(FILE *fout, char *zanc);\r
+\r
+void ListAncestSeq(FILE *fout, char *zanc)\r
+{\r
+/* zanc[nintern*com.npatt] holds ancestral sequences.\r
+ Extant sequences are coded if cleandata.\r
+*/\r
+ int wname=15, j,h, n31=(com.seqtype==CODONseq||com.seqtype==CODON2AAseq?3:1);\r
+ int lst=(com.readpattern?com.npatt:com.ls);\r
+\r
+ fputs("\n\n\nList of extant and reconstructed sequences\n\n",fout);\r
+ if(!com.readpattern) fprintf(fout, "%6d %6d\n\n", tree.nnode, lst*n31);\r
+ else fprintf(fout, "%6d %6d P\n\n", tree.nnode, lst*n31);\r
+ for(j=0;j<com.ns;j++,FPN(fout)) {\r
+ fprintf(fout,"%-*s ", wname,com.spname[j]);\r
+ print1seq(fout, com.z[j], lst, com.pose);\r
+ }\r
+ for(j=0;j<tree.nnode-com.ns;j++,FPN(fout)) {\r
+ fprintf(fout,"node #%-*d ", wname-5,com.ns+j+1);\r
+ print1seq(fout, zanc+j*com.npatt, lst, com.pose);\r
+ }\r
+ if(com.readpattern) {\r
+ for(h=0,FPN(fout); h<com.npatt; h++) {\r
+ fprintf(fout," %4.0f", com.fpatt[h]);\r
+ if((h+1)%15==0) FPN(fout);\r
+ }\r
+ fprintf(fout,"\n\n");\r
+ }\r
+}\r
+\r
+int ProbSitePattern(double x[], double *lnL, double fhsiteAnc[], double ScaleC[]);\r
+int AncestralMarginal(FILE *fout, double x[], double fhsiteAnc[], double Sir[]);\r
+int AncestralJointPPSG2000(FILE *fout, double x[]);\r
+\r
+\r
+int ProbSitePattern (double x[], double *lnL, double fhsiteAnc[], double ScaleC[])\r
+{\r
+/* This calculates probabilities for observing site patterns fhsite[]. \r
+ The following notes are for ncatG>1 and method = 0. \r
+ The routine calculates the scale factor common to all site classes (ir), \r
+ that is, the greatest of the scale factors among the ir classes. \r
+ The common scale factors will be used in scaling nodes[].conP for all site \r
+ classes for all nodes in PostProbNode(). Small conP for some site classes \r
+ will be essentially set to 0, which is fine.\r
+\r
+ fhsite[npatt]\r
+ ScaleSite[npatt]\r
+\r
+ Ziheng Yang, 7 Sept, 2001\r
+*/\r
+ int ig, i,k,h, ir;\r
+ double fh, S, y=1;\r
+\r
+ if(com.ncatG>1 && com.method==1) error2("don't need this?");\r
+ if (SetParameters(x)) puts ("par err.");\r
+ for(h=0; h<com.npatt; h++)\r
+ fhsiteAnc[h] = 0;\r
+ if (com.ncatG<=1) {\r
+ for (ig=0,*lnL=0; ig<com.ngene; ig++) {\r
+ if(com.Mgene>1) SetPGene(ig, 1, 1, 0, x);\r
+ ConditionalPNode (tree.root, ig, x);\r
+ for (h=com.posG[ig]; h<com.posG[ig+1]; h++) {\r
+ for (i=0; i<com.ncode; i++) \r
+ fhsiteAnc[h] += com.pi[i]*nodes[tree.root].conP[h*com.ncode+i];\r
+ *lnL -= log(fhsiteAnc[h])*com.fpatt[h];\r
+ if(com.NnodeScale) \r
+ for(k=0; k<com.NnodeScale; k++) \r
+ *lnL -= com.nodeScaleF[k*com.npatt+h]*com.fpatt[h];\r
+ }\r
+ }\r
+ }\r
+ else {\r
+ for (ig=0; ig<com.ngene; ig++) {\r
+ if(com.Mgene>1 || com.nalpha>1)\r
+ SetPGene(ig, com.Mgene>1, com.Mgene>1, com.nalpha>1, x);\r
+ for (ir=0; ir<com.ncatG; ir++) {\r
+#ifdef CODEML\r
+ if(com.seqtype==1 && com.NSsites /* && com.model */) IClass=ir;\r
+#endif\r
+ SetPSiteClass(ir, x);\r
+ ConditionalPNode (tree.root, ig, x);\r
+\r
+ for (h=com.posG[ig]; h<com.posG[ig+1]; h++) {\r
+ for (i=0,fh=0; i<com.ncode; i++)\r
+ fh += com.pi[i]*nodes[tree.root].conP[h*com.ncode+i];\r
+ \r
+ if(com.NnodeScale) {\r
+ for(k=0,S=0; k<com.NnodeScale; k++) S += com.nodeScaleF[k*com.npatt+h];\r
+ y=1;\r
+ if(ir==0) ScaleC[h]=S;\r
+ else if(S<=ScaleC[h]) y=exp(S-ScaleC[h]);\r
+ else /* change of scale factor */\r
+ { fhsiteAnc[h] *= exp(ScaleC[h]-S); ScaleC[h]=S; }\r
+ }\r
+ fhsiteAnc[h] += com.freqK[ir]*fh*y;\r
+ }\r
+ }\r
+ }\r
+ for(h=0, *lnL=0; h<com.npatt; h++)\r
+ *lnL -= log(fhsiteAnc[h])*com.fpatt[h];\r
+ if(com.NnodeScale) \r
+ for(h=0; h<com.npatt; h++)\r
+ *lnL -= ScaleC[h]*com.fpatt[h];\r
+ }\r
+ if(noisy) printf("\nlnL = %12.6f from ProbSitePattern.\n", - *lnL);\r
+\r
+ return (0);\r
+}\r
+\r
+\r
+int updateconP(double x[], int inode);\r
+\r
+int PostProbNode (int inode, double x[], double fhsiteAnc[], double ScaleC[],\r
+ double *lnL, double pChar1node[], char za[], double pnode[])\r
+{\r
+/* This calculates the full posterior distribution for node inode at each site.\r
+ Below are special comments on gamma models and method = 0.\r
+\r
+ Marginal reconstruction under gamma models, with complications arising from \r
+ scaling on large trees (com.NnodeScale) and the use of two iteration algorithms \r
+ (method).\r
+ Z. Yang Sept 2001\r
+ \r
+ The algorithm is different depending on method, which makes the code clumsy.\r
+\r
+ gamma method=0 or 2 (simultaneous updating):\r
+ nodes[].conP overlap and get destroyed for different site classes (ir)\r
+ The same for scale factors com.nodeScaleF. \r
+ fhsite[npatt] and common scale factors ScaleC[npatt] are calculated for all \r
+ nodes before this routine is called. The common scale factors are then \r
+ used to adjust nodes[].conP before they are summed across ir classes.\r
+\r
+ gamma method=1 (one branch at a time):\r
+ nodes[].conP (and com.nodeScaleF if node scaling is on) are separately \r
+ allocated for different site classes (ir), so that all info needed is\r
+ available. Use of updateconP() saves computation on large trees.\r
+ Scale factor Sir[] is of size ncatG and reused for each h.\r
+*/\r
+ int n=com.ncode, i,k,h, ir,it=-1,best, ig;\r
+ double fh, y,pbest, *Sir=ScaleC, S;\r
+\r
+ *lnL=0;\r
+ zero(pChar1node,com.npatt*n);\r
+\r
+ /* nodes[].conP are reused for different ir, with or without node scaling */\r
+ if (com.ncatG>1 && com.method!=1) {\r
+ ReRootTree(inode);\r
+ for (ig=0; ig<com.ngene; ig++) {\r
+ if(com.Mgene>1 || com.nalpha>1)\r
+ SetPGene(ig,com.Mgene>1,com.Mgene>1,com.nalpha>1,x);\r
+ for (ir=0; ir<com.ncatG; ir++) {\r
+#ifdef CODEML\r
+ if(com.seqtype==1 && com.NSsites) IClass=ir;\r
+#endif\r
+ SetPSiteClass(ir, x);\r
+ ConditionalPNode (tree.root, ig, x);\r
+\r
+ for (h=com.posG[ig]; h<com.posG[ig+1]; h++) {\r
+ if(!com.NnodeScale) S=1;\r
+ else {\r
+ for(k=0,S=0; k<com.NnodeScale; k++) \r
+ S += com.nodeScaleF[k*com.npatt+h];\r
+ S=exp(S-ScaleC[h]);\r
+ }\r
+ for (i=0,fh=0; i<n; i++) {\r
+ y = com.freqK[ir]*com.pi[i]*nodes[tree.root].conP[h*n+i] * S;\r
+ fh += y;\r
+ pChar1node[h*n+i] += y ;\r
+ }\r
+ }\r
+ }\r
+ }\r
+ for (h=0; h<com.npatt; h++) {\r
+ for(i=0,y=0;i<n;i++) y += (pChar1node[h*n+i]/=fhsiteAnc[h]);\r
+ if (fabs(1-y)>1e-5) \r
+ error2("PostProbNode: sum!=1");\r
+ for (i=0,best=-1,pbest=-1; i<n; i++)\r
+ if (pChar1node[h*n+i]>pbest) {\r
+ best=i;\r
+ pbest=pChar1node[h*n+i]; \r
+ }\r
+ za[(inode-com.ns)*com.npatt+h] = (char)best;\r
+ pnode[(inode-com.ns)*com.npatt+h] = pbest;\r
+ *lnL -= log(fhsiteAnc[h])*com.fpatt[h];\r
+ if(com.NnodeScale) *lnL -= ScaleC[h]*com.fpatt[h];\r
+ }\r
+ }\r
+ else { /* all other cases: (alpha==0 || method==1) */\r
+ for(i=0; i<tree.nnode; i++) com.oldconP[i] = 1;\r
+ ReRootTree(inode);\r
+ updateconP(x,inode);\r
+ if (com.alpha==0 && com.ncatG<=1) { /* (alpha==0) (ngene>1 OK) */\r
+ for (ig=0; ig<com.ngene; ig++) {\r
+ if(com.Mgene==2 || com.Mgene==4)\r
+ xtoy(com.piG[ig], com.pi, n);\r
+ for (h=com.posG[ig]; h<com.posG[ig+1]; h++) {\r
+ for (i=0,fh=0,pbest=0,best=-1; i<n; i++) {\r
+ y = com.pi[i]*nodes[tree.root].conP[h*n+i];\r
+ fh += y;\r
+ if (y>pbest)\r
+ { pbest=y; best=i; }\r
+ pChar1node[h*n+i] = y;\r
+ }\r
+ za[(inode-com.ns)*com.npatt+h] = (char)best;\r
+ pnode[(inode-com.ns)*com.npatt+h] = (pbest/=fh);\r
+ for (i=0; i<n; i++)\r
+ pChar1node[h*n+i] /= fh;\r
+ *lnL -= log(fh)*(double)com.fpatt[h];\r
+ for(i=0; i<com.NnodeScale; i++)\r
+ *lnL -= com.nodeScaleF[i*com.npatt+h]*com.fpatt[h];\r
+ }\r
+ }\r
+ }\r
+ else { /* (ncatG>1 && method = 1) This should work for NSsites? */\r
+ for (ig=0; ig<com.ngene; ig++) {\r
+ if(com.Mgene==2 || com.Mgene==4)\r
+ xtoy(com.piG[ig], com.pi, n);\r
+ for (h=com.posG[ig]; h<com.posG[ig+1]; h++) {\r
+ if(com.NnodeScale)\r
+ for(ir=0,it=0; ir<com.ncatG; ir++) { /* Sir[it] is the biggest */\r
+ for(k=0,Sir[ir]=0; k<com.NnodeScale; k++)\r
+ Sir[ir] += com.nodeScaleF[ir*com.NnodeScale*com.npatt + k*com.npatt+h];\r
+ if(Sir[ir]>Sir[it]) it = ir;\r
+ }\r
+ for (i=0,fh=0; i<n; i++) {\r
+ for(ir=0; ir<com.ncatG; ir++) {\r
+ if(com.method==1)\r
+ y = nodes[tree.root].conP[ir*(tree.nnode-com.ns)*com.npatt*n+h*n+i];\r
+ else\r
+ y = nodes[tree.root].conP[h*n+i]; /* wrong right now */\r
+ y *= com.pi[i]*com.freqK[ir];\r
+ if(com.NnodeScale) y *= exp(Sir[ir]-Sir[it]);\r
+ \r
+ pChar1node[h*n+i] += y;\r
+ fh += y;\r
+ }\r
+ }\r
+ for (i=0,best=0; i<n; i++) {\r
+ pChar1node[h*n+i] /= fh;\r
+ if(i && pChar1node[h*n+best]<pChar1node[h*n+i])\r
+ best = i;\r
+ }\r
+ za[(inode-com.ns)*com.npatt+h] = (char)best;\r
+ pnode[(inode-com.ns)*com.npatt+h] = pChar1node[h*n+best];\r
+ *lnL -= log(fh)*com.fpatt[h];\r
+ if(com.NnodeScale) *lnL -= Sir[it]*com.fpatt[h];\r
+ }\r
+ }\r
+ }\r
+ }\r
+ return(0);\r
+}\r
+\r
+\r
+void getCodonNode1Site(char codon[], char zanc[], int inode, int site);\r
+\r
+int AncestralMarginal (FILE *fout, double x[], double fhsiteAnc[], double Sir[])\r
+{\r
+/* Ancestral reconstruction for each interior node. This works under both \r
+ the one rate and gamma rates models.\r
+ pnode[npatt*nid] stores the prob for the best chara at a node and site.\r
+ The best character is kept in za[], coded as 0,...,n-1.\r
+ The data may be coded (com.cleandata==1) or not (com.cleandata==0).\r
+ Call ProbSitePatt() before running this routine.\r
+ pMAPnode[NS-1], pMAPnodeA[] stores the MAP probabilities (accuracy)\r
+ for a site and for the entire sequence, respectively.\r
+ \r
+ The routine PostProbNode calculates pChar1node[npatt*ncode], which stores \r
+ prob for each char at each pattern at each given node inode. The rest of \r
+ the routine is to output the results in different ways.\r
+\r
+ Deals with node scaling to avoid underflows. See above \r
+ (Z. Yang, 2 Sept 2001)\r
+*/\r
+ char *pch=(com.seqtype==0 ? BASEs : (com.seqtype==2 ? AAs: (com.seqtype==5?BASEs5:BINs)));\r
+ char *zanc, str[4]="",codon[2][4]={" "," "}, aa[4]="";\r
+ char *sitepatt=(com.readpattern?"pattern":"site");\r
+ int n=com.ncode, inode, ic=0,b[3],i,j,k1=-1,k2=-1,c1,c2,k3, lsc=com.ls;\r
+ int lst=(com.readpattern?com.npatt:com.ls);\r
+ int h,hp,ig, best, oldroot=tree.root;\r
+ int nid=tree.nnode-com.ns, nchange;\r
+ double lnL=0, fh, y, pbest, *pChar1node, *pnode, p1=-1,p2=-1;\r
+ double pMAPnode[NS-1], pMAPnodeA[NS-1], smallp=0.001;\r
+\r
+ char coding=0, *bestAA=NULL;\r
+ double pAA[21], *pbestAA=NULL, ns,na, nst,nat,S,N;\r
+ /* bestAA[nid*npatt], pbestAA[nid*npatt]: \r
+ To reconstruct aa seqs using codon or nucleotide seqs, universal code */\r
+\r
+ if(noisy) puts("Marginal reconstruction.");\r
+\r
+ fprintf (fout,"\n(1) Marginal reconstruction of ancestral sequences\n");\r
+ fprintf (fout,"(eqn. 4 in Yang et al. 1995 Genetics 141:1641-1650).\n");\r
+ pChar1node = (double*)malloc(com.npatt*n*sizeof(double));\r
+ pnode = (double*)malloc((nid*com.npatt+1)*(sizeof(double)+sizeof(char)));\r
+ if (pnode==NULL||pChar1node==NULL) \r
+ error2("oom pnode");\r
+ zanc = (char*)(pnode+nid*com.npatt);\r
+\r
+#ifdef BASEML\r
+ if(com.seqtype==0 && com.ls%3==0 && com.coding) { coding=1; lsc=com.ls/3; }\r
+#endif\r
+ if(com.seqtype==1) { coding=1; lsc=com.npatt; }\r
+ if(coding==1) {\r
+ if((pbestAA=(double*)malloc(nid*lsc*2*sizeof(double)))==NULL) \r
+ error2("oom pbestAA");\r
+ bestAA = (char*)(pbestAA+nid*lsc);\r
+ }\r
+\r
+ if(SetParameters(x)) puts("par err."); \r
+\r
+ if(com.verbose>1) \r
+ fprintf(fout,"\nProb distribs at nodes, those with p < %.3f not listed\n", smallp);\r
+\r
+ /* This loop reroots the tree at inode & reconstructs sequence at inode */\r
+ for (inode=com.ns; inode<tree.nnode; inode++) {\r
+\r
+ PostProbNode (inode, x, fhsiteAnc, Sir, &lnL, pChar1node, zanc, pnode);\r
+ if(noisy) printf ("\tNode %3d: lnL = %12.6f\n", inode+1, -lnL);\r
+\r
+ /* print Prob distribution at inode if com.verbose>1 */\r
+ if (com.verbose>1) {\r
+ fprintf(fout,"\nProb distribution at node %d, by %s\n", inode+1, sitepatt);\r
+ fprintf(fout,"\n%7s Freq Data\n\n", sitepatt);\r
+ for(h=0;h<lst;h++,FPN(fout)) {\r
+ hp = (!com.readpattern ? com.pose[h] : h);\r
+ fprintf (fout,"%7d%7.0f ", h+1, com.fpatt[hp]);\r
+ print1site(fout, hp);\r
+ fputs(": ", fout);\r
+ for(j=0; j<n; j++) {\r
+ if (com.seqtype!=CODONseq) { \r
+ str[0] = pch[j];\r
+ str[1] = 0;\r
+ }\r
+ else\r
+ strcpy(str, CODONs[j]);\r
+ fprintf(fout,"%s(%5.3f) ", str, pChar1node[hp*n+j]);\r
+ }\r
+ }\r
+ } /* if (verbose) */\r
+\r
+\r
+ /* find the best amino acid for coding seqs */\r
+#ifdef CODEML\r
+ if(com.seqtype==CODONseq)\r
+ for(h=0; h<com.npatt; h++) {\r
+ for(j=0; j<20; j++) pAA[j]=0; \r
+ for(j=0; j<n; j++) {\r
+ i = GeneticCode[com.icode][FROM61[j]];\r
+ pAA[i] += pChar1node[h*n+j];\r
+ }\r
+ /* matout(F0,pAA,1,20); */\r
+ for(j=0,best=0,pbest=0; j<20; j++) \r
+ if(pAA[j]>pbest) { pbest=pAA[j]; best=j; }\r
+ bestAA[(inode-com.ns)*com.npatt+h] = (char)best;\r
+ pbestAA[(inode-com.ns)*com.npatt+h] = pbest;\r
+ }\r
+#endif\r
+ if(com.seqtype==0 && coding) { /* coding seqs analyzed by baseml */\r
+ for(h=0; h<lsc; h++) { /* h-th codon */\r
+ /* sums up probs for the 20 AAs for each node. Stop codons are \r
+ ignored, and so those probs are approxiamte. */\r
+ for(j=0,y=0; j<20; j++) pAA[j]=0;\r
+ for(k1=0; k1<4; k1++) for(k2=0; k2<4; k2++) for(k3=0; k3<4; k3++) {\r
+ ic = k1*16+k2*4+k3;\r
+ b[0] = com.pose[h*3+0]*n+k1; \r
+ b[1] = com.pose[h*3+1]*n+k2; \r
+ b[2] = com.pose[h*3+2]*n+k3;\r
+ fh = pChar1node[b[0]]*pChar1node[b[1]]*pChar1node[b[2]];\r
+ if((ic=GeneticCode[com.icode][ic])==-1) \r
+ y += fh;\r
+ else\r
+ pAA[ic] += fh;\r
+ }\r
+ if(fabs(1-y-sum(pAA,20))>1e-6) error2("AncestralMarginal strange?");\r
+\r
+ for(j=0,best=0,pbest=0; j<20; j++) \r
+ if(pAA[j]>pbest) { pbest=pAA[j]; best=j; }\r
+\r
+ bestAA[(inode-com.ns)*com.ls/3+h] = (char)best;\r
+ pbestAA[(inode-com.ns)*com.ls/3+h] = pbest/(1-y);\r
+ }\r
+ }\r
+ } /* for (inode), This closes the big loop */\r
+\r
+ for(i=0; i<tree.nnode; i++)\r
+ com.oldconP[i] = 0;\r
+ ReRootTree(oldroot);\r
+\r
+ if(com.seqtype==0 && coding && !com.readpattern) { /* coding seqs analyzed by baseml */\r
+ fputs("\nBest amino acids reconstructed from nucleotide model.\n",fout);\r
+ fputs("Prob at each node listed by amino acid (codon) site\n",fout);\r
+ fputs("(Please ignore if not relevant)\n\n",fout);\r
+ for(h=0;h<com.ls/3;h++,FPN(fout)) {\r
+ fprintf(fout,"%4d ", h+1);\r
+ for(j=0; j<com.ns; j++) {\r
+ getCodonNode1Site(codon[0], NULL, j, h);\r
+ Codon2AA(codon[0], aa, com.icode, &i);\r
+ fprintf(fout," %s(%c)",codon[0],AAs[i]);\r
+ }\r
+ fprintf(fout,": ");\r
+ for (j=0; j<tree.nnode-com.ns; j++) {\r
+ fprintf(fout," %1c (%5.3f)", AAs[bestAA[j*com.ls/3+h]], pbestAA[j*com.ls/3+h]);\r
+ }\r
+ }\r
+ }\r
+\r
+ /* calculate accuracy measures */\r
+ zero(pMAPnode,nid); fillxc(pMAPnodeA, 1., nid);\r
+ for (inode=0; inode<tree.nnode-com.ns; inode++) {\r
+ for(h=0; h<com.npatt; h++) {\r
+ pMAPnode[inode] += com.fpatt[h]*pnode[inode*com.npatt+h]/com.ls;\r
+ pMAPnodeA[inode] *= pow(pnode[inode*com.npatt+h], com.fpatt[h]);\r
+ }\r
+ }\r
+\r
+ fprintf(fout,"\nProb of best state at each node, listed by %s", sitepatt);\r
+ if (com.ngene>1) fprintf(fout,"\n\n%7s (g) Freq Data: \n", sitepatt);\r
+ else fprintf(fout,"\n\n%7s Freq Data: \n", sitepatt);\r
+\r
+ for(h=0; h<lst; h++) {\r
+ hp = (!com.readpattern ? com.pose[h] : h);\r
+ fprintf(fout,"\n%4d ",h+1);\r
+ if (com.ngene>1) { /* which gene the site is from */\r
+ for(ig=1; ig<com.ngene; ig++) \r
+ if(hp<com.posG[ig]) break;\r
+ fprintf(fout,"(%d)",ig);\r
+ }\r
+ fprintf(fout," %5.0f ", com.fpatt[hp]);\r
+ print1site(fout, hp);\r
+ fprintf(fout, ": ");\r
+\r
+ for(j=0; j<nid; j++) {\r
+ if (com.seqtype!=CODONseq)\r
+ fprintf(fout,"%c(%5.3f) ", pch[(int)zanc[j*com.npatt+hp]],pnode[j*com.npatt+hp]);\r
+#ifdef CODEML\r
+ else {\r
+ ic = zanc[j*com.npatt+hp];\r
+ Codon2AA(CODONs[ic], aa, com.icode, &i);\r
+ fprintf(fout," %s %1c %5.3f (%1c %5.3f)",\r
+ CODONs[ic], AAs[i], pnode[j*com.npatt+hp], AAs[(int)bestAA[j*com.npatt+hp]], pbestAA[j*com.npatt+hp]);\r
+ }\r
+#endif\r
+ }\r
+ if(noisy && (h+1)%100000==0) printf("\r\tprinting, %d sites done", h+1);\r
+ }\r
+ if(noisy && h>=100000) printf("\n");\r
+\r
+ /* Map changes onto branches \r
+ k1 & k2 are the two characters; p1 and p2 are the two probs. */\r
+\r
+ if(!com.readpattern) {\r
+ fputs("\n\nSummary of changes along branches.\n",fout);\r
+ fputs("Check root of tree for directions of change.\n",fout);\r
+ if(!com.cleandata && com.seqtype==1) \r
+ fputs("Counts of n & s are incorrect along tip branches with ambiguity data.\n",fout);\r
+ for(j=0; j<tree.nbranch; j++,FPN(fout)) {\r
+ inode = tree.branches[j][1]; \r
+ nchange = 0;\r
+ fprintf(fout,"\nBranch %d:%5d..%-2d",j+1,tree.branches[j][0]+1,inode+1);\r
+ if(inode<com.ns) fprintf(fout," (%s) ",com.spname[inode]);\r
+\r
+ if(coding) {\r
+ lsc = (com.seqtype==1 ? com.ls : com.ls/3);\r
+ for (h=0,nst=nat=0; h<lsc; h++) {\r
+ getCodonNode1Site(codon[0], zanc, inode, h);\r
+ getCodonNode1Site(codon[1], zanc, tree.branches[j][0], h);\r
+ difcodonNG(codon[0], codon[1], &S, &N, &ns,&na, 0, com.icode);\r
+ nst += ns;\r
+ nat += na;\r
+ }\r
+ fprintf(fout," (n=%4.1f s=%4.1f)",nat,nst);\r
+ }\r
+ fprintf(fout,"\n\n");\r
+ for(h=0; h<lst; h++) {\r
+ hp = (!com.readpattern ? com.pose[h] : h);\r
+ if (com.seqtype!=CODONseq) {\r
+ if(inode<com.ns)\r
+ k2 = pch[(int)com.z[inode][hp]];\r
+ else {\r
+ k2 = pch[(int)zanc[(inode-com.ns)*com.npatt+hp]]; \r
+ p2 = pnode[(inode-com.ns)*com.npatt+hp];\r
+ }\r
+ k1 = pch[ zanc[(tree.branches[j][0]-com.ns)*com.npatt+hp] ];\r
+ p1 = pnode[(tree.branches[j][0]-com.ns)*com.npatt+hp];\r
+ }\r
+#ifdef CODEML\r
+ else {\r
+ if(inode<com.ns) {\r
+ strcpy(codon[1], CODONs[com.z[inode][hp]]);\r
+ k2 = GetAASiteSpecies(inode, hp);\r
+ }\r
+ else {\r
+ strcpy(codon[1], CODONs[(int)zanc[(inode-com.ns)*com.npatt+hp]]);\r
+ k2 = AAs[(int)bestAA[(inode-com.ns)*com.npatt+hp]];\r
+ p2 = pbestAA[(inode-com.ns)*com.npatt+hp];\r
+ }\r
+ strcpy(codon[0], CODONs[(int)zanc[(tree.branches[j][0]-com.ns)*com.npatt+hp]]);\r
+ k1 = AAs[(int)bestAA[(tree.branches[j][0]-com.ns)*com.npatt+hp]];\r
+ p1 = pbestAA[(tree.branches[j][0]-com.ns)*com.npatt+hp];\r
+\r
+ if(strcmp(codon[0],codon[1])) {\r
+ if(inode<com.ns) \r
+ fprintf(fout,"\t%4d %s (%c) %.3f -> %s (%c)\n", h+1,codon[0],k1,p1, codon[1],k2);\r
+ else\r
+ fprintf(fout,"\t%4d %s (%c) %.3f -> %s (%c) %.3f\n",h+1,codon[0],k1,p1, codon[1],k2,p2);\r
+ }\r
+ k1 = k2 = 0;\r
+ }\r
+#endif\r
+ if(k1==k2) continue;\r
+ fprintf(fout,"\t%4d ",h+1);\r
+\r
+#ifdef SITELABELS\r
+ if(sitelabels) fprintf(fout," %5s ",sitelabels[h]);\r
+#endif\r
+ if(inode<com.ns) fprintf(fout,"%c %.3f -> %1c\n",k1,p1,k2);\r
+ else fprintf(fout,"%c %.3f -> %1c %.3f\n",k1,p1,k2,p2);\r
+ nchange++;\r
+ }\r
+ }\r
+ }\r
+\r
+ ListAncestSeq(fout, zanc);\r
+ fprintf(fout,"\n\nOverall accuracy of the %d ancestral sequences:", nid);\r
+ matout2(fout,pMAPnode, 1, nid, 9,5); fputs("for a site.\n",fout);\r
+ matout2(fout,pMAPnodeA, 1, nid, 9,5); fputs("for the sequence.\n", fout);\r
+\r
+ /* best amino acid sequences from codonml */\r
+#ifdef CODEML\r
+ if(com.seqtype==1) {\r
+ fputs("\n\nAmino acid sequences inferred by codonml.\n",fout);\r
+ if(!com.cleandata) \r
+ fputs("Results unreliable for sites with alignment gaps.\n",fout);\r
+ for(inode=0; inode<nid; inode++) {\r
+ fprintf(fout,"\nNode #%-10d ",com.ns+inode+1);\r
+ for(h=0; h<lst; h++) {\r
+ hp = (!com.readpattern ? com.pose[h] : h);\r
+ fprintf(fout, "%c", AAs[(int)bestAA[inode*com.npatt+hp]]);\r
+ if((h+1)%10==0) fputc(' ', fout);\r
+ }\r
+ }\r
+ FPN(fout);\r
+ }\r
+#endif\r
+ ChangesSites(fout, coding, zanc);\r
+\r
+ free(pnode);\r
+ free(pChar1node);\r
+ if(coding) free(pbestAA);\r
+ return (0);\r
+}\r
+\r
+\r
+void getCodonNode1Site(char codon[], char zanc[], int inode, int site)\r
+{\r
+/* this is used to retrive the codon from a codon sequence for codonml \r
+ or coding sequence in baseml, used in ancestral reconstruction\r
+ zanc has ancestral sequences\r
+ site is codon site\r
+*/\r
+ int i, hp;\r
+\r
+ for(i=0; i<3; i++) /* to force crashes */\r
+ codon[i]=-1;\r
+ if(com.seqtype==CODONseq) {\r
+ hp = (!com.readpattern ? com.pose[site] : site);\r
+#ifdef CODEML\r
+ if(inode>=com.ns)\r
+ strcpy(codon, CODONs[zanc[(inode-com.ns)*com.npatt+hp]]);\r
+ else\r
+ strcpy(codon, CODONs[com.z[inode][hp]]);\r
+#endif\r
+ }\r
+ else { /* baseml coding reconstruction */\r
+ if(inode>=com.ns)\r
+ for(i=0; i<3; i++)\r
+ codon[i] = BASEs[(int)zanc[(inode-com.ns)*com.npatt+com.pose[site*3+i]]];\r
+ else\r
+ for(i=0; i<3; i++) codon[i] = BASEs[ com.z[inode][com.pose[site*3+i]] ];\r
+ }\r
+\r
+}\r
+\r
+int ChangesSites(FILE*frst, int coding, char *zanc)\r
+{\r
+/* this lists and counts changes at sites from reconstructed ancestral sequences\r
+ com.z[] has the data, and zanc[] has the ancestors\r
+ For codon sequences (codonml or baseml with com.coding), synonymous and \r
+ nonsynonymous changes are counted separately.\r
+ Added in Nov 2000.\r
+*/\r
+ char *pch=(com.seqtype==0 ? BASEs : (com.seqtype==2 ? AAs: (com.seqtype==5?BASEs5:BINs)));\r
+ char codon[2][4]={" "," "};\r
+ int h,hp,inode,k1,k2,d, ls1=(com.readpattern?com.npatt:com.ls);\r
+ double S,N,Sd,Nd, S1,N1,Sd1,Nd1, b,btotal=0, p,C;\r
+\r
+ if(com.seqtype==0 && coding) ls1/=3;\r
+ if(coding) {\r
+ fprintf(frst,"\n\nCounts of changes at sites, listed by %s\n\n", \r
+ (com.readpattern?"pattern":"site"));\r
+ fprintf(frst1,"\nList of sites with changes according to ancestral reconstruction\n");\r
+ fprintf(frst1,"Suzuki-Gojobori (1999) style test\n");\r
+ if(!com.cleandata)\r
+ fprintf(frst, "(Counts of n & s are incorrect at sites with ambiguity data)\n\n");\r
+\r
+ for(inode=0; inode<tree.nnode; inode++) \r
+ if(inode!=tree.root) btotal += nodes[inode].branch;\r
+ for(h=0; h<ls1; h++) {\r
+ fprintf(frst,"%4d ",h+1);\r
+ for(inode=0,S=N=Sd=Nd=0; inode<tree.nnode; inode++) {\r
+ if(inode==tree.root) continue;\r
+ b = nodes[inode].branch;\r
+ getCodonNode1Site(codon[0], zanc, nodes[inode].father, h);\r
+ getCodonNode1Site(codon[1], zanc, inode, h);\r
+\r
+ difcodonNG(codon[0], codon[1], &S1, &N1, &Sd1, &Nd1, 0, com.icode);\r
+ S += S1*b/btotal;\r
+ N += N1*b/btotal;\r
+ if(Sd1 || Nd1) {\r
+ Sd += Sd1;\r
+ Nd += Nd1;\r
+ fprintf(frst," %3s.%3s ",codon[0],codon[1]);\r
+ }\r
+ }\r
+ b = S+N; S /= b; N /= b;\r
+ fprintf(frst,"(S N: %7.3f%7.3f Sd Nd: %6.1f %5.1f)\n", S*3,N*3,Sd,Nd);\r
+ fprintf(frst1,"%4d S N: %7.3f%7.3f Sd Nd: %6.1f %5.1f ", h+1,S*3,N*3,Sd,Nd);\r
+ if(Sd+Nd) {\r
+ if(Nd/(Sd+Nd)<N) {\r
+ for(d=0,p=0,C=1; d<=Nd; d++) {\r
+ p += C*pow(N,d) * pow(1-N,Sd+Nd-d);\r
+ C *= (Sd+Nd-d)/(d+1);\r
+ }\r
+ fprintf(frst1," - p =%6.3f %s", p,(p<.01?"**":(p<.05?"*":"")));\r
+ }\r
+ else {\r
+ for(d=0,p=0,C=1; d<=Sd; d++) {\r
+ p += C*pow(S,d)*pow(1-S,Sd+Nd-d);\r
+ C *= (Sd+Nd-d)/(d+1);\r
+ }\r
+ fprintf(frst1," + p =%6.3f %s", p,(p<.01?"**":(p<.05?"*":"")));\r
+ }\r
+ }\r
+ fprintf(frst1,"\n");\r
+ }\r
+ }\r
+ else { /* noncoding nucleotide or aa sequences */\r
+ fprintf(frst,"\n\nCounts of changes at sites%s\n\n",\r
+ (com.readpattern?", listed by pattern":""));\r
+ for(h=0; h<ls1; h++) {\r
+ hp=(!com.readpattern ? com.pose[h] : h);\r
+ fprintf(frst,"%4d ",h+1);\r
+ for(inode=0,d=0;inode<tree.nnode;inode++) {\r
+ if(inode==tree.root) continue;\r
+ k1 = pch[(int) zanc[(nodes[inode].father-com.ns)*com.npatt+hp] ];\r
+ if(inode<com.ns)\r
+ k2 = pch[com.z[inode][hp]];\r
+ else \r
+ k2 = pch[(int) zanc[(inode-com.ns)*com.npatt+hp] ];\r
+ if(k1!=k2) {\r
+ d++;\r
+ fprintf(frst," %c%c", k1,k2);\r
+ }\r
+ }\r
+ fprintf(frst," (%d)\n", d);\r
+ }\r
+ }\r
+ return(0);\r
+}\r
+\r
+\r
+\r
+#define NBESTANC 4 /* use 1 2 3 or 4 */\r
+int parsimony=0, *nBestScore, *icharNode[NBESTANC], *combIndex;\r
+double *fhsiteAnc, *lnPanc[NBESTANC], *PMatTips, *combScore;\r
+char *charNode[NBESTANC], *ancSeq, *ancState1site;\r
+FILE *fanc;\r
+int largeReconstruction;\r
+\r
+void DownPassPPSG2000OneSite (int h, int inode, int inodestate, int ipath);\r
+void PrintAncState1site (char ancState1site[], double prob);\r
+\r
+\r
+double P0[16]={0, 1, 1.5, 1.5, \r
+ 1, 0, 1.5, 1.5, \r
+ 1.5, 1.5, 0, 1, \r
+ 1.5, 1.5, 1, 0};\r
+\r
+double piroot[NCODE]={0};\r
+\r
+/* combIndex[] uses two bits for each son to record the path that is taken by \r
+ each reconstruction; for 32-bit integers, the maximum number of sons for \r
+ each node is 16.\r
+\r
+ lnPanc[3][(tree.nnode-com.ns)*npatt*n] uses the space of com.conP. \r
+ It holds the ln(Pr) for the best reconstructions at the subtree down inode \r
+ given the state of the father node. \r
+ charNode[0,1,2] holds the corresponding state at inode. \r
+ \r
+ int nBestScore[maxnson];\r
+ int combIndex[2*n*ncomb]; \r
+ double *combScore[n*ncomb];\r
+ char ancSeq[nintern*npatt], ancState1site[nintern]; \r
+ int icharNode[NBESTANC][nintern*npatt*n];\r
+ char charNode[NBESTANC][nintern*npatt*n];\r
+*/\r
+\r
+void UpPassPPSG2000 (int inode, int igene, double x[])\r
+{\r
+/* The algorithm of PPSG2000, modified. This routine is based on ConditionalPNode(). \r
+ lnPanc[h*n+i] is the best lnP, given that inode has state i. \r
+ charNode[] stores the characters that achieved the best lnP.\r
+*/\r
+ int debug=0;\r
+ int n=com.ncode, it,ibest,i,j,k,h, ison, nson=nodes[inode].nson, *pc;\r
+ int pos0=com.posG[igene],pos1=com.posG[igene+1], ichar,jchar;\r
+ int ncomb=1,icomb, ipath;\r
+ double t, y, psum1site=-1;\r
+\r
+ if(com.ncode!=4) debug=0; \r
+\r
+ for(i=0; i<nson; i++)\r
+ if(nodes[nodes[inode].sons[i]].nson>0)\r
+ UpPassPPSG2000(nodes[inode].sons[i], igene, x);\r
+ for(i=0,ncomb=1; i<nson; i++)\r
+ ncomb *= (nBestScore[i] = (nodes[nodes[inode].sons[i]].nson>0 ? NBESTANC : 1));\r
+ if(debug) {\r
+ printf("\n\nNode %2d has sons ", inode+1);\r
+ for(i=0; i<nson; i++) printf(" %2d", nodes[inode].sons[i]+1);\r
+ printf(" ncomb=%2d: ", ncomb);\r
+ for(i=0; i<nson; i++) printf(" %2d", nBestScore[i]); FPN(F0);\r
+ }\r
+\r
+ if(inode!=tree.root) { /* calculate log{P(t)} from father to inode */\r
+ t = nodes[inode].branch*_rateSite;\r
+ if(com.clock<5) {\r
+ if(com.clock) t *= GetBranchRate(igene,(int)nodes[inode].label,x,NULL);\r
+ else t *= com.rgene[igene];\r
+ }\r
+ GetPMatBranch(PMat, x, t, inode);\r
+ for(j=0; j<n*n; j++)\r
+ PMat[j] = (PMat[j]<1e-300 ? 300 : -log(PMat[j]));\r
+ }\r
+\r
+ for(h=pos0; h<pos1; h++) { /* loop through site patterns */\r
+ if(h) debug=0;\r
+ /* The last round for inode==tree.root, shares some code with other nodes, \r
+ and is thus embedded in the same loop. Alternatively this round can be \r
+ taken out of the loop with some code duplicated.\r
+ */\r
+ for(ichar=0; ichar<(inode!=tree.root?n:1); ichar++) { /* ichar for father */\r
+ /* given ichar for the father, what are the best reconstructions at \r
+ inode? Look at n*ncomb possibilities, given father state ichar.\r
+ */\r
+ if(debug) {\r
+ if(inode==tree.root) printf("\n\nfather is root\n");\r
+ else printf("\n\nichar = %2d %c for father\n", ichar+1,BASEs[ichar]);\r
+ }\r
+\r
+ for(icomb=0; icomb<n*ncomb; icomb++) {\r
+ jchar = icomb/ncomb; /* jchar is for inode */\r
+ if(inode==tree.root) \r
+ combScore[icomb] = -log(com.pi[jchar]+1e-300);\r
+ else\r
+ combScore[icomb] = PMat[ichar*n+jchar];\r
+\r
+ if(inode==tree.root && parsimony) combScore[icomb] = 0;\r
+\r
+ if(debug) printf("comb %2d %c", icomb+1,BASEs[jchar]);\r
+\r
+ for(i=0,it=icomb%ncomb; i<nson; i++) { /* The ibest-th state in ison. */\r
+ ison = nodes[inode].sons[i];\r
+ ibest = it%nBestScore[i];\r
+ it /= nBestScore[i];\r
+\r
+ if(nodes[ison].nson) /* internal node */\r
+ y = lnPanc[ibest][(ison-com.ns)*com.npatt*n+h*n+jchar];\r
+ else if (com.cleandata) /* tip clean: PMatTips[] has log{P(t)}. */\r
+ y = PMatTips[ ison*n*n + jchar*n + com.z[ison][h] ];\r
+ else { /* tip unclean: PMatTips[] has P(t). */\r
+ for(k=0,y=0; k<nChara[com.z[ison][h]]; k++)\r
+ y += PMatTips[ ison*n*n+jchar*n + CharaMap[com.z[ison][h]][k] ];\r
+ y = -log(y);\r
+ }\r
+\r
+ combScore[icomb] += y;\r
+ if(debug) printf("%*s son %2d #%2d %7.1f\n", (i?10:1),"", ison+1, ibest+1,y);\r
+ }\r
+ } /* for(icomb) */\r
+\r
+ if(debug) { printf("score "); for(i=0;i<n*ncomb; i++) printf(" %4.1f",combScore[i]); FPN(F0); }\r
+ indexing(combScore, n*ncomb, combIndex, 0, combIndex+n*ncomb);\r
+ if(debug) { printf("index "); for(i=0;i<n*ncomb; i++) printf(" %4d",combIndex[i]); FPN(F0); }\r
+\r
+ /* print out reconstructions at the site if inode is root. */\r
+ if(inode==tree.root) {\r
+ fprintf(fanc,"%4d ", h+1);\r
+ if(com.ngene>1) fprintf(fanc,"(%d) ", igene+1);\r
+ fprintf(fanc," %6.0f ",com.fpatt[h]);\r
+ print1site(fanc, h); \r
+ fprintf(fanc, ": ");\r
+ }\r
+ psum1site=0; /* used if inode is root */\r
+\r
+ for(j=0; j<(inode!=tree.root ? NBESTANC : n*ncomb); j++) {\r
+ jchar = (it=combIndex[j])/ncomb; it%=ncomb;\r
+ if(j<NBESTANC) {\r
+ lnPanc[j][(inode-com.ns)*com.npatt*n+h*n+ichar] = combScore[combIndex[j]];\r
+ charNode[j][(inode-com.ns)*com.npatt*n+h*n+ichar] = jchar;\r
+ }\r
+ if(debug) printf("\t#%d: %6.1f %c ", j+1, combScore[combIndex[j]], BASEs[jchar]);\r
+\r
+ for(i=0,ipath=0; i<nson; i++) {\r
+ ison=nodes[inode].sons[i]; \r
+ ibest=it%nBestScore[i];\r
+ it/=nBestScore[i];\r
+ ipath |= ibest<<(2*i);\r
+ if(debug) printf("%2d", ibest+1);\r
+ }\r
+ if(j<NBESTANC) \r
+ icharNode[j][(inode-com.ns)*com.npatt*n+h*n+ichar]=ipath;\r
+\r
+ if(debug) printf(" (%o)", ipath);\r
+ \r
+ /* print if inode is root. */\r
+ if(inode==tree.root) {\r
+ ancState1site[inode-com.ns]=jchar;\r
+ if(parsimony) y = combScore[combIndex[j]];\r
+ else psum1site += y = exp(-combScore[combIndex[j]]-fhsiteAnc[h]);\r
+\r
+ for(i=0; i<nson; i++) {\r
+ if(nodes[ison=nodes[inode].sons[i]].nson)\r
+ DownPassPPSG2000OneSite(h, tree.root, jchar, ipath);\r
+ }\r
+ PrintAncState1site(ancState1site, y);\r
+ if(j>NBESTANC && y<.001) break;\r
+ }\r
+ } /* for(j) */\r
+ } /* for(ichar) */\r
+ if(inode==tree.root) fprintf(fanc," (total %6.3f)\n", psum1site);\r
+\r
+ if(largeReconstruction && (h+1)%2000==0)\r
+ printf("\r\tUp pass for gene %d node %d sitepatt %d.", igene+1,inode+1,h+1);\r
+\r
+ } /* for(h) */\r
+ if(largeReconstruction)\r
+ printf("\r\tUp pass for gene %d node %d.", igene+1,inode+1);\r
+}\r
+\r
+void DownPassPPSG2000OneSite (int h, int inode, int inodestate, int ipath)\r
+{\r
+/* this puts the state in ancState1site[nintern], using \r
+ int icharNode[NBESTANC][nintern*npatt*n],\r
+ char charNode[NBESTANC][nintern*npatt*n].\r
+ jchar is the state at inode, and ipath is the ipath code for inode.\r
+*/\r
+ int n=com.ncode, i, ison, ibest, sonstate;\r
+\r
+ for(i=0; i<nodes[inode].nson; i++) {\r
+ ison=nodes[inode].sons[i];\r
+ if(nodes[ison].nson>1) {\r
+ ibest = (ipath & (3<<(2*i))) >> (2*i);\r
+ ancState1site[ison-com.ns] = sonstate =\r
+ charNode[ibest][(ison-com.ns)*com.npatt*n+h*n+inodestate];\r
+ DownPassPPSG2000OneSite(h, ison, sonstate, \r
+ icharNode[ibest][(ison-com.ns)*com.npatt*n+h*n+inodestate]);\r
+ }\r
+ }\r
+}\r
+\r
+\r
+void PrintAncState1site (char ancState1site[], double prob)\r
+{\r
+ int i;\r
+ char codon[4]="";\r
+ char *pch=(com.seqtype==0 ? BASEs : (com.seqtype==2 ? AAs: (com.seqtype==5?BASEs5:BINs)));\r
+\r
+ for(i=0; i<tree.nnode-com.ns; i++) {\r
+ if(com.seqtype==1) {\r
+#ifdef CODEML\r
+ fprintf(fanc,"%s ",getcodon(codon,FROM61[(int)ancState1site[i]]));\r
+#endif \r
+ }\r
+ else\r
+ fprintf(fanc, "%c", pch[(int)ancState1site[i]]);\r
+ }\r
+ fprintf(fanc," (%5.3f) ", prob);\r
+}\r
+\r
+void DownPassPPSG2000 (int inode)\r
+{\r
+/* this reads out the best chara for inode from charNode[] into ancSeq[].\r
+*/\r
+ int i,ison, h;\r
+ char c0=0;\r
+\r
+ for(h=0; h<com.npatt; h++) {\r
+ if(inode!=tree.root) \r
+ c0=ancSeq[(nodes[inode].father-com.ns)*com.npatt+h];\r
+ ancSeq[(inode-com.ns)*com.npatt+h]\r
+ = charNode[0][(inode-com.ns)*com.npatt*com.ncode+h*com.ncode+c0];\r
+ }\r
+ for(i=0; i<nodes[inode].nson; i++)\r
+ if(nodes[ison=nodes[inode].sons[i]].nson > 1)\r
+ DownPassPPSG2000(ison);\r
+}\r
+\r
+\r
+\r
+int AncestralJointPPSG2000 (FILE *fout, double x[])\r
+{\r
+/* Ziheng Yang, 8 June 2000, rewritten on 8 June 2005.\r
+ Joint ancestral reconstruction, taking character states for all nodes at a \r
+ site as one entity, based on the algorithm of Pupko et al. (2000 \r
+ Mol. Biol. Evol. 17:890-896).\r
+\r
+ fhsiteAns[]: fh[] for each site pattern\r
+ nodes[].conP[] are destroyed and restored at the end of the routine.\r
+ ancSeq[] stores the ancestral seqs, the best reconstruction.\r
+\r
+ This outputs results by pattern. I tried to print results by sites, but gave up as \r
+ some variables use the same memory (e.g., combIndex) for different site patterns.\r
+*/\r
+ char *pch=(com.seqtype==0 ? BASEs : (com.seqtype==2 ? AAs: (com.seqtype==5?BASEs5:BINs)));\r
+ char codon[4]="";\r
+ int n=com.ncode,nintern=tree.nnode-com.ns, i,j,k,h,hp,igene;\r
+ int maxnson=0, maxncomb, lst=(com.readpattern?com.npatt:com.ls);\r
+ char *sitepatt=(com.readpattern?"pattern":"site");\r
+ double t;\r
+ size_t sconPold = com.sconP, s;\r
+\r
+ largeReconstruction = (noisy && (com.ns>300 || com.ls>1000000));\r
+\r
+ if(noisy) puts("Joint reconstruction.");\r
+\r
+ for(i=0; i<tree.nnode; i++) maxnson=max2(maxnson,nodes[i].nson);\r
+ if(maxnson>16 || NBESTANC>4) /* for int at least 32 bits */\r
+ error2("NBESTANC too large or too many sons.");\r
+ for(i=0,maxncomb=1; i<maxnson; i++) maxncomb*=NBESTANC;\r
+ if((PMatTips=(double*)malloc(com.ns*n*n*sizeof(double)))==NULL) \r
+ error2("oom PMatTips");\r
+ s = NBESTANC*nintern*(size_t)com.npatt*n*sizeof(double);\r
+ if(s > sconPold) {\r
+ com.sconP = s;\r
+ printf("\n%9lu bytes for conP, adjusted\n", com.sconP);\r
+ if((com.conP=(double*)realloc(com.conP,com.sconP))==NULL)\r
+ error2("oom conP");\r
+ }\r
+ s = NBESTANC*nintern*com.npatt*n;\r
+ s = ((s*sizeof(int)+(s+nintern)*sizeof(char)+16)/sizeof(double))*sizeof(double);\r
+ if(s > com.sspace) {\r
+ com.sspace=s;\r
+ printf("\n%9lu bytes for space, adjusted\n",com.sspace);\r
+ if((com.space=(double*)realloc(com.space,com.sspace))==NULL) error2("oom space");\r
+ }\r
+ for(i=0; i<NBESTANC; i++) {\r
+ lnPanc[i]= com.conP+i*nintern*com.npatt*n;\r
+ icharNode[i] = (int*)com.space+i*nintern*com.npatt*n;\r
+ charNode[i] = (char*)((int*)com.space+NBESTANC*nintern*com.npatt*n)\r
+ + i*nintern*com.npatt*n;\r
+ ancState1site = charNode[0]+NBESTANC*nintern*com.npatt*n;\r
+ }\r
+ if((ancSeq=(char*)malloc(nintern*com.npatt*n*sizeof(char)))==NULL)\r
+ error2("oom charNode");\r
+\r
+ if((combScore=(double*)malloc((3*n*maxncomb+com.ns)*sizeof(double)))==NULL)\r
+ error2("oom combScore");\r
+ nBestScore = (int*)(combScore+n*maxncomb);\r
+ combIndex = nBestScore + com.ns; /* combIndex[2*n*ncomb] contains work space */\r
+\r
+ fanc = fout;\r
+ fprintf(fout, "\n\n(2) Joint reconstruction of ancestral sequences\n");\r
+ fprintf(fout, "(eqn. 2 in Yang et al. 1995 Genetics 141:1641-1650), using ");\r
+ fprintf(fout, "the algorithm of Pupko et al. (2000 Mol Biol Evol 17:890-896),\n");\r
+ fprintf(fout, "modified to generate sub-optimal reconstructions.\n");\r
+ fprintf(fout, "\nReconstruction (prob.), listed by pattern (use the observed data to find the right site).\n");\r
+ fprintf(fout, "\nPattern Freq Data:\n\n"); \r
+\r
+ for(igene=0; igene<com.ngene; igene++) {\r
+ if(com.Mgene>1) SetPGene(igene,1,1,0,x);\r
+ for(i=0; i<com.ns; i++) {\r
+ t = nodes[i].branch*_rateSite;\r
+ if(com.clock<5) {\r
+ if(com.clock) t *= GetBranchRate(igene,(int)nodes[i].label,x,NULL);\r
+ else t *= com.rgene[igene];\r
+ }\r
+ GetPMatBranch(PMatTips+i*n*n, x, t, i);\r
+ }\r
+\r
+ if(com.cleandata) {\r
+ for(i=0; i<com.ns*n*n; i++)\r
+ PMatTips[i] = (PMatTips[i]<1e-20 ? 300 : -log(PMatTips[i]));\r
+ }\r
+ if(parsimony) \r
+ for(i=0; i<com.ns; i++)\r
+ xtoy(P0, PMatTips+i*n*n, n*n);\r
+\r
+ UpPassPPSG2000(tree.root, igene, x); /* this prints into frst as well */\r
+ }\r
+\r
+ if(largeReconstruction) puts("\n\tDown pass.");\r
+ DownPassPPSG2000(tree.root);\r
+\r
+ ListAncestSeq(fout, ancSeq);\r
+\r
+ free(ancSeq);\r
+ free(PMatTips);\r
+ free(combScore);\r
+ com.sconP = sconPold;\r
+ if((com.conP=(double*)realloc(com.conP,com.sconP))==NULL)\r
+ error2("conP");\r
+ PointconPnodes();\r
+ return (0);\r
+}\r
+\r
+\r
+\r
+int AncestralSeqs (FILE *fout, double x[])\r
+{\r
+/* Ancestral sequence reconstruction using likelihood (Yang et al. 1995).\r
+ Marginal works with constant rate and variable rates among sites.\r
+ Joint works only with constant rate among sites (ncatG=1).\r
+*/\r
+ int h, k, i;\r
+ double lnL, *ScaleC=NULL; /* collected scale factors */\r
+\r
+ if(com.Mgene==1)\r
+ error2("When Mgene=1, use RateAncestor = 0.");\r
+ if (tree.nnode==com.ns) \r
+ { puts("\nNo ancestral nodes to reconstruct..\n"); return(0); }\r
+ if (noisy) printf ("\nReconstructed ancestral states go into file rst.\n");\r
+ fprintf(fout, "\nAncestral reconstruction by %sML.\n",\r
+ (com.seqtype==0?"BASE":(com.seqtype==1?"CODON":"AA")));\r
+ FPN(fout); OutTreeN(fout,1,1); FPN(fout); FPN(fout);\r
+ OutTreeN(fout,0,0); FPN(fout); FPN(fout);\r
+ OutTreeB(fout); FPN(fout);\r
+\r
+ fputs("\ntree with node labels for Rod Page's TreeView\n",fout);\r
+ OutTreeN(fout,1,PrNodeNum); FPN(fout);\r
+\r
+ fprintf (fout, "\nNodes %d to %d are ancestral\n", com.ns+1,tree.nnode);\r
+ if((fhsiteAnc=(double*)malloc(com.npatt*sizeof(double)))==NULL)\r
+ error2("oom fhsiteAnc");\r
+ if(com.NnodeScale && com.ncatG>1)\r
+ if((ScaleC=(double*)malloc(max2(com.npatt,com.ncatG) *sizeof(double)))==NULL) \r
+ error2("oom ScaleC in AncestralSeqs");\r
+\r
+ if(com.alpha)\r
+ puts("Rates are variable among sites, marginal reconstructions only.");\r
+ if(!com.cleandata) fputs("Unreliable at sites with alignment gaps\n", fout);\r
+\r
+ if(com.ncatG<=1 || com.method!=1)\r
+ ProbSitePattern (x, &lnL, fhsiteAnc, ScaleC);\r
+\r
+#ifdef BASEML\r
+ if(com.nhomo<=1)\r
+#endif\r
+ AncestralMarginal(fout, x, fhsiteAnc, ScaleC);\r
+ \r
+ fflush(fout);\r
+ /* fhsiteAnc[] is modified by both Marginal and Joint. */\r
+ if(com.ncatG<=1 && tree.nnode>com.ns+1) {\r
+ ProbSitePattern (x, &lnL, fhsiteAnc, ScaleC);\r
+ for(h=0; h<com.npatt; h++) {\r
+ fhsiteAnc[h] = log(fhsiteAnc[h]);\r
+ for(k=0; k<com.NnodeScale; k++) \r
+ fhsiteAnc[h] += com.nodeScaleF[k*com.npatt+h];\r
+ }\r
+ /* AncestralJointPPSG2000 corrupts com.conP[] and fhsiteAnc[]. \r
+ */\r
+ AncestralJointPPSG2000(fout, x);\r
+ }\r
+ FPN(fout);\r
+ free(fhsiteAnc);\r
+ if(com.NnodeScale && com.ncatG>1) free(ScaleC);\r
+\r
+ return (0);\r
+}\r
+\r
+\r
+#endif\r
+\r
+\r
+int SetNodeScale(int inode);\r
+int NodeScale(int inode, int pos0, int pos1);\r
+\r
+void InitializeNodeScale(void)\r
+{\r
+/* This allocates memory to hold scale factors for nodes and also decide on the \r
+ nodes for scaling by calling SetNodeScale(). \r
+ The scaling node is chosen before the iteration by counting the number of \r
+ nodes visited in the post-order tree travesal algorithm (see the routine \r
+ SetNodeScale).\r
+ See Yang (2000 JME 51:423-432) for details.\r
+ The memory required is com.NnodeScale*com.npatt*sizeof(double).\r
+*/\r
+ int i, nS;\r
+\r
+ if(com.clock>=5) return;\r
+\r
+ com.NnodeScale = 0;\r
+ com.nodeScale = (char*)realloc(com.nodeScale, tree.nnode*sizeof(char));\r
+ if(com.nodeScale==NULL) error2("oom");\r
+ for(i=0; i<tree.nnode; i++) com.nodeScale[i] = 0;\r
+ SetNodeScale(tree.root);\r
+ nS = com.NnodeScale*com.npatt;\r
+ if(com.conPSiteClass) nS *= com.ncatG;\r
+ if(com.NnodeScale) {\r
+ if((com.nodeScaleF=(double*)realloc(com.nodeScaleF, nS*sizeof(double)))==NULL)\r
+ error2("oom nscale");\r
+ for(i=0; i<nS; i++) com.nodeScaleF[i] = 0;\r
+\r
+ if(noisy) {\r
+ printf("\n%d node(s) used for scaling (Yang 2000 J Mol Evol 51:423-432):\n",com.NnodeScale);\r
+ for(i=0; i<tree.nnode; i++)\r
+ if(com.nodeScale[i]) printf(" %2d",i+1);\r
+ FPN(F0);\r
+ }\r
+ }\r
+}\r
+\r
+\r
+int SetNodeScale (int inode)\r
+{\r
+/* This marks nodes for applying scaling factors when calculating f[h].\r
+*/\r
+ int i,ison, d=0, every;\r
+\r
+ if(com.seqtype==0) every=100; /* baseml */\r
+ else if(com.seqtype==1) every=15; /* codonml */\r
+ else every=50; /* aaml */\r
+\r
+ for(i=0; i<nodes[inode].nson; i++) {\r
+ ison = nodes[inode].sons[i];\r
+ d += (nodes[ison].nson ? SetNodeScale(ison) : 1);\r
+ }\r
+ if(inode!=tree.root && d>every) {\r
+ com.nodeScale[inode] = 1;\r
+ d = 1;\r
+ com.NnodeScale++; \r
+ }\r
+ return(d);\r
+}\r
+\r
+\r
+int NodeScale (int inode, int pos0, int pos1)\r
+{\r
+/* scale to avoid underflow\r
+*/\r
+ int h,k,j, n=com.ncode;\r
+ double t, smallw=1e-12;\r
+\r
+ for(j=0,k=0; j<tree.nnode; j++) /* k-th node for scaling */\r
+ if(j==inode) break;\r
+ else if(com.nodeScale[j]) k++;\r
+\r
+ for(h=pos0; h<pos1; h++) {\r
+ for(j=0,t=0;j<n;j++)\r
+ if(nodes[inode].conP[h*n+j]>t)\r
+ t = nodes[inode].conP[h*n+j];\r
+\r
+ if(t<1e-300) {\r
+ for(j=0;j<n;j++)\r
+ nodes[inode].conP[h*n+j]=1; /* both 0 and 1 fine */\r
+ com.nodeScaleF[k*com.npatt+h] = -800; /* this is problematic? */\r
+ }\r
+ else { \r
+ for(j=0;j<n;j++)\r
+ nodes[inode].conP[h*n+j]/=t;\r
+ com.nodeScaleF[k*com.npatt+h] = log(t);\r
+ }\r
+ }\r
+ return(0);\r
+}\r
+\r
+\r
+\r
+static double *dfsites;\r
+\r
+int fx_r(double x[], int np);\r
+\r
+\r
+#if (BASEML || CODEML)\r
+\r
+int HessianSKT2004 (double xmle[], double lnLm, double g[], double H[])\r
+{\r
+/* this calculates the hessian matrix of branch lengths using the approximation \r
+ of Seo et al. (2004), especially useful for approximate likelihood calcualtion \r
+ in divergence time estimation.\r
+ df[0][i*com.npatt+h] has d log(f_h)/d b_i.\r
+ method = 0 uses difference approximation to first derivatives.\r
+ method = 1 uses analytical calculation of first derivatives (Yang 2000). \r
+ I am under the impression that method = 1 may be useful for very large datasets \r
+ with >10M sites, but I have not implemented this method because the analytical \r
+ calculation of first derivatives is possible for branch lengths only, and not \r
+ available for other parameters. Right now with method = 0, H and the SEs are \r
+ calculated for all parameters although the H matrix in rst2 is a subset for \r
+ branch lengths only. More thought about what to do. Ziheng's note on 8 March 2010.\r
+*/\r
+ int method=0, backforth, h, i, j, lastround0=LASTROUND, nzero=0;\r
+ double *x, *lnL[2], *df[2], eh0=Small_Diff*2, eh, small;\r
+\r
+ if(com.np!=tree.nbranch && method==1)\r
+ error2("I think HessianSKT2004 works for branch lengths only");\r
+ df[0] = (double*)malloc((com.npatt*2+1)*com.np*sizeof(double));\r
+ if(df[0]==NULL) error2("oom space in HessianSKT2004");\r
+ df[1] = df[0] + com.npatt*com.np;\r
+ x = df[1] + com.npatt*com.np;\r
+ lnL[0] = (double*)malloc(com.np*2*sizeof(double));\r
+ lnL[1] = lnL[0]+com.np;\r
+\r
+ LASTROUND = 2;\r
+\r
+ for(backforth=0; backforth<2; backforth++) {\r
+ for(i=0; i<com.np; i++) {\r
+ xtoy(xmle, x, com.np);\r
+ eh = eh0*(fabs(xmle[i]) + 1);\r
+ if(backforth==0) x[i] = xmle[i] - eh;\r
+ else x[i] = xmle[i] + eh;\r
+ if(x[i] <= 4e-6)\r
+ nzero ++;\r
+ dfsites = df[backforth] + i*com.npatt;\r
+ lnL[backforth][i] = -com.plfun(x, com.np);\r
+ }\r
+ }\r
+\r
+ for(i=0; i<com.np; i++) {\r
+ eh = eh0*(fabs(xmle[i]) + 1); \r
+ g[i] = (lnL[1][i] - lnL[0][i])/(eh*2);\r
+ }\r
+ /*\r
+ printf("\nx gL g H");\r
+ matout(F0, xmle, 1, com.np);\r
+ matout(F0, g, 1, com.np);\r
+ */\r
+ zero(H, com.np*com.np);\r
+ for(i=0; i<com.np; i++) {\r
+ eh = eh0*(fabs(xmle[i]) + 1);\r
+ for(h=0; h<com.npatt; h++)\r
+ df[0][i*com.npatt+h] = (df[1][i*com.npatt+h] - df[0][i*com.npatt+h])/(eh*2);\r
+ }\r
+\r
+ for(i=0; i<com.np; i++) {\r
+ for(j=0; j<com.np; j++)\r
+ for(h=0; h<com.npatt; h++)\r
+ H[i*com.np+j] -= df[0][i*com.npatt+h] * df[0][j*com.npatt+h] * com.fpatt[h];\r
+ }\r
+\r
+ if(nzero) printf("\nWarning: Hessian matrix may be unreliable for zero branch lengths\n");\r
+ LASTROUND = lastround0;\r
+ free(df[0]);\r
+ free(lnL[0]);\r
+ return(0);\r
+}\r
+\r
+\r
+\r
+int lfunRates (FILE* fout, double x[], int np)\r
+{\r
+/* for dG, AdG or similar non-parametric models\r
+ This distroys com.fhK[], and in return,\r
+ fhK[<npatt] stores rates for conditional mean (re), and \r
+ fhK[<2*npatt] stores the most probable rate category number.\r
+ fhsite[npatt] stores fh=log(fh).\r
+*/\r
+ int ir,il,it, h,hp,j, nscale=1, direction=-1;\r
+ int lst=(com.readpattern?com.npatt:com.ls);\r
+ double lnL=0,fh,fh1, t, re,mre,vre, b1[NCATG],b2[NCATG],*fhsite;\r
+\r
+ if (noisy) printf("\nEstimated rates for sites go into file %s\n",ratef);\r
+ if (SetParameters(x)) puts ("par err. lfunRates");\r
+\r
+ fprintf(fout, "\nEstimated rates for sites from %sML.\n",\r
+ (com.seqtype==0?"BASE":(com.seqtype==1?"CODON":"AA")));\r
+ OutTreeN(fout,1,1); FPN(fout);\r
+ fprintf (fout,"\nFrequencies and rates for categories (K=%d)", com.ncatG);\r
+ fprintf(fout, "\nrate:"); FOR(j,com.ncatG) fprintf(fout," %8.5f",com.rK[j]);\r
+ fprintf(fout, "\nfreq:"); FOR(j,com.ncatG) fprintf(fout," %8.5f",com.freqK[j]);\r
+ FPN(fout);\r
+\r
+ if (com.rho) {\r
+ fprintf(fout,"\nTransition prob matrix over sites");\r
+ matout2(fout,com.MK,com.ncatG,com.ncatG,8,4);\r
+ }\r
+\r
+ if((fhsite=(double*)malloc(com.npatt*sizeof(double)))==NULL) error2("oom fhsite");\r
+ fx_r(x, np);\r
+ if(com.NnodeScale) {\r
+ FOR(h,com.npatt) {\r
+ for(ir=1,it=0; ir<com.ncatG; ir++)\r
+ if(com.fhK[ir*com.npatt+h] > com.fhK[it*com.npatt+h])\r
+ it = ir;\r
+ t = com.fhK[it*com.npatt+h];\r
+ lnL -= com.fpatt[h]*t;\r
+ for(ir=0; ir<com.ncatG; ir++)\r
+ com.fhK[ir*com.npatt+h] = exp(com.fhK[ir*com.npatt+h] - t);\r
+ }\r
+ }\r
+ for(h=0; h<com.npatt; h++) {\r
+ for(ir=0,fhsite[h]=0; ir<com.ncatG; ir++)\r
+ fhsite[h] += com.freqK[ir]*com.fhK[ir*com.npatt+h];\r
+ }\r
+\r
+ if (com.rho==0) { /* dG model */\r
+ if(com.verbose>1) {\r
+ fprintf(fout,"\nPosterior probabilities for site classes, by %s\n\n",\r
+ (com.readpattern?"pattern":"site"));\r
+ for (h=0; h<lst; h++,FPN(fout)) {\r
+ fprintf(fout, " %5d ", h+1);\r
+ hp = (!com.readpattern ? com.pose[h] : h);\r
+ for (ir=0; ir<com.ncatG; ir++)\r
+ fprintf(fout, " %9.4f", com.freqK[ir]*com.fhK[ir*com.npatt+hp]/fhsite[hp]);\r
+ }\r
+ }\r
+\r
+ fprintf(fout,"\n%7s Freq Data Rate (posterior mean & category)\n\n", \r
+ (com.readpattern?"Pattern":"Site"));\r
+ for (h=0,mre=vre=0; h<com.npatt; h++) {\r
+ for (ir=0,it=0,t=re=0; ir<com.ncatG; ir++) {\r
+ fh1 = com.freqK[ir]*com.fhK[ir*com.npatt+h];\r
+ if(fh1>t) { t=fh1; it=ir; }\r
+ re += fh1*com.rK[ir];\r
+ }\r
+ lnL -= com.fpatt[h]*log(fhsite[h]);\r
+\r
+ re /= fhsite[h];\r
+ mre += com.fpatt[h]*re/com.ls;\r
+ vre += com.fpatt[h]*re*re/com.ls;\r
+ com.fhK[h] = re;\r
+ com.fhK[com.npatt+h] = it+1.;\r
+ }\r
+ vre-=mre*mre;\r
+ for(h=0; h<lst; h++) {\r
+ hp=(!com.readpattern ? com.pose[h] : h);\r
+ fprintf(fout,"%7d %5.0f ",h+1, com.fpatt[hp]);\r
+ print1site(fout, hp);\r
+ fprintf(fout," %8.3f%6.0f\n", com.fhK[hp], com.fhK[com.npatt+hp]);\r
+ }\r
+ }\r
+ else { /* Auto-dGamma model */\r
+ fputs("\nSite Freq Data Rates\n\n",fout);\r
+ h = (direction==1?com.ls-1:0);\r
+ for (il=0,mre=vre=0; il<lst; h-=direction,il++) {\r
+ hp=(!com.readpattern ? com.pose[h] : h);\r
+ if (il==0)\r
+ FOR(ir,com.ncatG) b1[ir]=com.fhK[ir*com.npatt+hp];\r
+ else {\r
+ for (ir=0; ir<com.ncatG; ir++) {\r
+ for (j=0,fh=0; j<com.ncatG; j++)\r
+ fh+=com.MK[ir*com.ncatG+j]*b1[j];\r
+ b2[ir] = fh*com.fhK[ir*com.npatt+hp];\r
+ }\r
+ xtoy (b2, b1, com.ncatG);\r
+ }\r
+ if ((il+1)%nscale==0)\r
+ { fh=sum(b1,com.ncatG); abyx(1/fh,b1,com.ncatG); lnL-=log(fh); }\r
+\r
+ for (ir=0,it=-1,re=fh1=t=0; ir<com.ncatG; ir++) {\r
+ re+=com.freqK[ir]*b1[ir]*com.rK[ir];\r
+ fh1+=com.freqK[ir]*b1[ir];\r
+ if (b1[ir]>t) {it=ir; t=b1[ir]; }\r
+ }\r
+ re /= fh1;\r
+ mre += re/com.ls;\r
+ vre += re*re/com.ls;\r
+\r
+ fprintf(fout,"%4d %5.0f ",h+1, com.fpatt[hp]);\r
+ print1site(fout, hp);\r
+ fprintf(fout," %8.3f%6.0f\n", re, it+1.);\r
+ } /* for(il) */\r
+ vre -= mre*mre;\r
+ for (ir=0,fh=0; ir<com.ncatG; ir++) fh += com.freqK[ir]*b1[ir];\r
+ lnL -= log(fh);\r
+ }\r
+ if (noisy) printf ("lnL =%14.6f\n", -lnL);\r
+ fprintf (fout,"\nlnL =%14.6f\n", -lnL);\r
+ if(com.ngene==1) {\r
+ fprintf (fout,"\nmean(r^)=%9.4f var(r^)=%9.4f", mre, vre);\r
+ fprintf (fout,"\nAccuracy of rate prediction: corr(r^,r) =%9.4f\n", \r
+ sqrt(com.alpha*vre));\r
+ }\r
+ free(fhsite);\r
+ return (0);\r
+}\r
+\r
+\r
+double lfunAdG (double x[], int np)\r
+{\r
+/* Auto-Discrete-Gamma rates for sites\r
+ See notes in lfundG().\r
+*/\r
+ int nscale=1, h,il, ir, j, FPE=0;\r
+ int direction=-1; /* 1: n->1; -1: 1->n */\r
+ double lnL=0, b1[NCATG], b2[NCATG], fh;\r
+\r
+ NFunCall++;\r
+ fx_r(x, np);\r
+ if(com.NnodeScale)\r
+ FOR(h,com.npatt) {\r
+ fh=com.fhK[0*com.npatt+h];\r
+ lnL-=fh*com.fpatt[h];\r
+ for(ir=1,com.fhK[h]=1; ir<com.ncatG; ir++) \r
+ com.fhK[ir*com.npatt+h]=exp(com.fhK[ir*com.npatt+h]-fh);\r
+ }\r
+ h = (direction==1?com.ls-1:0);\r
+ for (il=0; il<com.ls; h-=direction,il++) {\r
+ if (il==0)\r
+ FOR(ir,com.ncatG) b1[ir]=com.fhK[ir*com.npatt+com.pose[h]];\r
+ else {\r
+ for (ir=0; ir<com.ncatG; ir++) {\r
+ for (j=0,fh=0; j<com.ncatG; j++)\r
+ fh+=com.MK[ir*com.ncatG+j]*b1[j];\r
+ b2[ir]=fh*com.fhK[ir*com.npatt+com.pose[h]];\r
+ }\r
+ xtoy(b2,b1,com.ncatG);\r
+ }\r
+ if((il+1)%nscale==0) {\r
+ fh=sum(b1,com.ncatG);\r
+ if(fh<1e-90) {\r
+ if(!FPE) {\r
+ FPE=1; printf ("h,fh%6d %12.4e\n", h+1,fh);\r
+ print1site(F0,h);\r
+ FPN(F0);\r
+ }\r
+ fh=1e-300;\r
+ }\r
+ abyx(1/fh,b1,com.ncatG); lnL-=log(fh);\r
+ }\r
+ }\r
+ for (ir=0,fh=0; ir<com.ncatG; ir++) fh+=com.freqK[ir]*b1[ir];\r
+ lnL-=log(fh);\r
+ return (lnL);\r
+}\r
+\r
+#endif\r
+\r
+\r
+\r
+\r
+#if (defined(BASEML))\r
+\r
+int GetPMatBranch (double Pt[], double x[], double t, int inode)\r
+{\r
+/* P(t) for branch leading to inode, called by routines ConditionalPNode()\r
+ and AncestralSeq() in baseml and codeml. x[] is not used by baseml.\r
+*/\r
+ int n=com.ncode, i;\r
+ double space[NCODE*NCODE*3] = {0};\r
+\r
+ if (com.model<=K80)\r
+ PMatK80(Pt, t, (com.nhomo==2 ? *nodes[inode].pkappa : com.kappa));\r
+ else {\r
+ if (com.nhomo==2)\r
+ eigenTN93(com.model, *nodes[inode].pkappa, -1, com.pi, &nR, Root, Cijk);\r
+ else if (com.nhomo>2 && com.model<=TN93)\r
+ eigenTN93(com.model, *nodes[inode].pkappa, *(nodes[inode].pkappa+1), nodes[inode].pi, &nR, Root, Cijk);\r
+ else if (com.nhomo>2 && com.model==REV)\r
+ eigenQREVbase(NULL, Pt, nodes[inode].pkappa, nodes[inode].pi, &nR, Root, Cijk);\r
+\r
+ if(com.model<=REV||com.model==REVu) \r
+ PMatCijk(Pt, t);\r
+ else {\r
+ QUNREST(NULL, Pt, x+com.ntime+com.nrgene, com.pi);\r
+ for(i=0; i<n*n; i++) Pt[i] *= t;\r
+ matexp (Pt, n, 7, 5, space);\r
+ }\r
+ }\r
+ return(0);\r
+}\r
+\r
+#elif (defined(CODEML))\r
+\r
+int GetPMatBranch (double Pt[], double x[], double t, int inode)\r
+{\r
+/* P(t) for branch leading to inode, called by routines ConditionalPNode()\r
+ and AncestralSeq() in baseml and codeml.\r
+\r
+ Qfactor in branch & site models (model = 2 or 3 and NSsites = 2 or 3):\r
+ Qfactor scaling is applied here and not inside eigenQcodon().\r
+*/\r
+ int iUVR=0, nUVR=NBTYPE+2, ib = (int)nodes[inode].label, updateUVR=0;\r
+ double *pkappa, w, mr=1, Qfactor=1;\r
+ double *pomega = com.pomega; /* x+com.ntime+com.nrgene+com.nkappa; */\r
+\r
+ pkappa = (com.hkyREV||com.codonf==FMutSel?x+com.ntime+com.nrgene:&com.kappa);\r
+\r
+ if(com.seqtype==CODONseq && com.NSsites && com.model) {\r
+ /* branch&site models (both NSsites & model):\r
+ Usual likelihood calculation, no need to re-calculate UVRoot. \r
+ Only need to point to the right place.\r
+ */\r
+ iUVR = Set_UVR_BranchSite (IClass, ib);\r
+ Qfactor = Qfactor_NS_branch[ib];\r
+ }\r
+ else if (com.seqtype==CODONseq && BayesEB==2 && com.model>1) { /* BEB for A&C */\r
+ /* branch&site models (both NSsites & model) BEB calculation:\r
+ Need to calculate UVRoot, as w is different. com.pomega points to wbranches[]\r
+ in get_grid_para_like_M2M8() or get_grid_para_like_AC().\r
+\r
+ Qfactor_NS_branch[] is fixed at the MLE: \r
+ "we fix the branch lengths at the synonymous sites (i.e., the expected \r
+ number of synonymous substitutions per codon) at their MLEs."\r
+ */\r
+ w = com.pomega[ib];\r
+ eigenQcodon(1,-1,NULL,NULL,NULL,Root,U,V, &mr, pkappa, w, Pt);\r
+ Qfactor = Qfactor_NS_branch[ib];\r
+ }\r
+ else if (com.seqtype==CODONseq && (com.model==1 ||com.model==2) && com.nbtype<=nUVR) { \r
+ /* branch model, also for AAClasses */\r
+ iUVR = (int)nodes[inode].label;\r
+ U=_UU[iUVR]; V=_VV[iUVR]; Root=_Root[iUVR];\r
+ }\r
+ else if (com.seqtype==CODONseq && com.model) {\r
+ mr = 0;\r
+ if(com.aaDist==AAClasses) { /* AAClass model */\r
+ com.pomega = PointOmega(x+com.ntime, -1, inode, -1);\r
+ eigenQcodon(1,-1,NULL,NULL,NULL,Root,U,V, &mr, pkappa, -1, Pt);\r
+ }\r
+ else if(com.nbtype>nUVR) { /* branch models, with more than 8 omega */\r
+ eigenQcodon(1,-1,NULL,NULL,NULL,Root,U,V, &mr, pkappa, nodes[inode].omega, Pt);\r
+ }\r
+ }\r
+\r
+ if (com.seqtype == AAseq && com.model == Poisson)\r
+ PMatJC69like(Pt, t, com.ncode);\r
+ else {\r
+ t *= Qfactor;\r
+ PMatUVRoot(Pt, t, com.ncode, U, V, Root);\r
+ }\r
+\r
+ return(0);\r
+}\r
+\r
+#endif\r
+\r
+\r
+\r
+void print_lnf_site (int h, double logfh)\r
+{\r
+#if(defined BASEML || defined CODEML)\r
+ fprintf(flnf, "\n%6d %6.0f %16.10f %16.12f %12.4f ",\r
+ h+1, com.fpatt[h], logfh, exp(logfh), com.ls*exp(logfh));\r
+ print1site(flnf, h);\r
+\r
+#endif\r
+}\r
+\r
+double lfundG (double x[], int np)\r
+{\r
+/* likelihood function for site-class models.\r
+ This deals with scaling for nodes to avoid underflow if(com.NnodeScale).\r
+ The routine calls fx_r() to calculate com.fhK[], which holds log{f(x|r)} \r
+ when scaling or f(x|r) when not. Scaling factors are set and used for each \r
+ site class (ir) to calculate log(f(x|r). When scaling is used, the routine \r
+ converts com.fhK[] into f(x|r), by collecting scaling factors into lnL. \r
+ The rest of the calculation then becomes the same and relies on f(x|r). \r
+ Check notes in fx_r.\r
+ This is also used for NSsites models in codonml. \r
+ Note that scaling is used between fx_r() and ConditionalPNode()\r
+ When this routine is used under the multiple-gene site-class model, note \r
+ that right now it assumes one set of com.freqK[] for the different genes, \r
+ which may be an issue.\r
+*/\r
+ int h,ir, it, FPE=0;\r
+ double lnL=0, fh=0,t;\r
+\r
+ NFunCall++;\r
+ fx_r(x,np);\r
+\r
+ for(h=0; h<com.npatt; h++) {\r
+ if (com.fpatt[h]<=0 && com.print>=0) continue;\r
+ if(com.NnodeScale) { /* com.fhK[] has log{f(x|r}. Note the scaling for nodes */\r
+ for(ir=1,it=0; ir<com.ncatG; ir++) /* select term for scaling */\r
+ if(com.fhK[ir*com.npatt+h] > com.fhK[it*com.npatt+h]) it = ir;\r
+ t = com.fhK[it*com.npatt+h];\r
+ for(ir=0,fh=0; ir<com.ncatG; ir++)\r
+ fh += com.freqK[ir]*exp(com.fhK[ir*com.npatt+h]-t);\r
+ fh = t + log(fh);\r
+ }\r
+ else {\r
+ for(ir=0,fh=0; ir<com.ncatG;ir++) \r
+ fh += com.freqK[ir]*com.fhK[ir*com.npatt+h];\r
+ if(fh<=0) {\r
+ if(!FPE) {\r
+ FPE=1; matout(F0,x,1,np);\r
+ printf("\nlfundG: h=%4d fhK=%9.6e\ndata: ", h+1, fh);\r
+ print1site(F0, h);\r
+ FPN(F0);\r
+ }\r
+ fh = 1e-300;\r
+ }\r
+ fh = log(fh);\r
+ }\r
+ lnL -= fh*com.fpatt[h];\r
+ if(LASTROUND==2) dfsites[h] = fh;\r
+ if (com.print<0) print_lnf_site(h, fh);\r
+ }\r
+\r
+ return(lnL);\r
+}\r
+\r
+\r
+int SetPSiteClass(int iclass, double x[])\r
+{\r
+/* This sets parameters for the iclass-th site class\r
+ This is used by ConditionalPNode() and also updateconP in both algorithms\r
+ For method=0 and 1.\r
+*/\r
+ int k = com.nrgene + !com.fix_kappa;\r
+ double *pkappa=NULL, *xcom=x+com.ntime, mr;\r
+\r
+ _rateSite = com.rK[iclass];\r
+#if CODEML\r
+ IClass = iclass;\r
+ mr = 1/Qfactor_NS;\r
+ pkappa = (com.hkyREV||com.codonf==FMutSel ? xcom+com.nrgene : &com.kappa);\r
+ if(com.seqtype == CODONseq && com.NSsites) {\r
+ _rateSite = 1;\r
+ if (com.model==0) {\r
+ if(com.aaDist) {\r
+ if(com.aaDist<10) com.pomega = xcom + k + com.ncatG - 1 + 2*iclass;\r
+ else if(com.aaDist==11) com.pomega = xcom + k + com.ncatG - 1 + 4*iclass;\r
+ else if(com.aaDist==12) com.pomega = xcom + k + com.ncatG - 1 + 5*iclass;\r
+ }\r
+ eigenQcodon(1,-1,NULL,NULL,NULL,Root,U,V, &mr, pkappa, com.rK[iclass], PMat);\r
+ }\r
+ }\r
+#endif\r
+ return (0);\r
+}\r
+\r
+extern int prt, Locus, Ir;\r
+\r
+\r
+int fx_r (double x[], int np)\r
+{\r
+/* This calculates f(x|r) if(com.NnodeScale==0) or log{f(x|r)} \r
+ if(com.NnodeScale>0), that is, the (log) probability of observing data x \r
+ at a site, given the rate r or dN/dS ratio for the site. This is used by \r
+ the discrete-gamma models in baseml and codeml as well as the NSsites models \r
+ in codeml. \r
+ The results are stored in com.fhK[com.ncatG*com.npatt].\r
+ This deals with underflows with large trees using global variables \r
+ com.nodeScale and com.nodeScaleF[com.NnodeScale*com.npatt].\r
+*/\r
+ int h, ir, i,k, ig, FPE=0;\r
+ double fh, smallw=1e-12; /* for testing site class with w=0 */\r
+\r
+ if(!BayesEB)\r
+ if(SetParameters(x)) puts("\npar err..");\r
+\r
+ for(ig=0; ig<com.ngene; ig++) { /* alpha may differ over ig */\r
+ if(com.Mgene>1 || com.nalpha>1)\r
+ SetPGene(ig, com.Mgene>1, com.Mgene>1, com.nalpha>1, x);\r
+ for(ir=0; ir<com.ncatG; ir++) {\r
+ if(ir && com.conPSiteClass) { /* shift com.nodeScaleF & conP */\r
+ if(com.NnodeScale) \r
+ com.nodeScaleF += (size_t)com.npatt*com.NnodeScale;\r
+ for(i=com.ns; i<tree.nnode; i++)\r
+ nodes[i].conP += (tree.nnode-com.ns)*com.ncode*(size_t)com.npatt;\r
+ }\r
+ SetPSiteClass(ir,x);\r
+ ConditionalPNode(tree.root,ig, x);\r
+\r
+ for (h=com.posG[ig]; h<com.posG[ig+1]; h++) {\r
+ if (com.fpatt[h]<=0 && com.print>=0) continue;\r
+ for (i=0,fh=0; i<com.ncode; i++)\r
+ fh += com.pi[i]*nodes[tree.root].conP[h*com.ncode+i];\r
+ if (fh<=0) {\r
+ if(fh<-1e-10 /* && !FPE */) { /* note that 0 may be o.k. here */\r
+ FPE=1; matout(F0,x,1,np);\r
+ printf("\nfx_r: h = %d r = %d fhK = %.5e ", h+1,ir+1,fh);\r
+ if(com.seqtype==0||com.seqtype==2) {\r
+ printf("Data: ");\r
+ print1site(F0, h);\r
+ FPN(F0);\r
+ }\r
+ }\r
+ fh = 1e-300;\r
+ }\r
+ if(!com.NnodeScale)\r
+ com.fhK[ir*com.npatt+h] = fh;\r
+ else\r
+ for(k=0,com.fhK[ir*com.npatt+h]=log(fh); k<com.NnodeScale; k++)\r
+ com.fhK[ir*com.npatt+h] += com.nodeScaleF[k*com.npatt+h];\r
+ } /* for (h) */\r
+ } /* for (ir) */\r
+\r
+ if(com.conPSiteClass) { /* shift pointers conP back */\r
+ if(com.NnodeScale) \r
+ com.nodeScaleF -= (com.ncatG-1)*com.NnodeScale*(size_t)com.npatt;\r
+ for(i=com.ns; i<tree.nnode; i++)\r
+ nodes[i].conP -= (com.ncatG-1)*(tree.nnode-com.ns)*com.ncode*(size_t)com.npatt;\r
+ }\r
+ } /* for(ig) */\r
+ return(0);\r
+}\r
+\r
+\r
+double lfun (double x[], int np)\r
+{\r
+/* likelihood function for models of one rate for all sites including \r
+ Mgene models.\r
+*/\r
+ int h,i,k, ig, FPE=0;\r
+ double lnL=0, fh;\r
+\r
+ NFunCall++;\r
+ if(SetParameters(x)) puts ("\npar err..");\r
+ for(ig=0; ig<com.ngene; ig++) {\r
+ if(com.Mgene>1) \r
+ SetPGene(ig,1,1,0,x);\r
+ ConditionalPNode (tree.root, ig, x);\r
+\r
+ for(h=com.posG[ig]; h<com.posG[ig+1]; h++) {\r
+ if (com.fpatt[h]<=0 && com.print>=0) continue;\r
+ for(i=0,fh=0; i<com.ncode; i++)\r
+ fh += com.pi[i]*nodes[tree.root].conP[h*com.ncode+i];\r
+ if(fh<=0) {\r
+ if(fh<-1e-5 && noisy) {\r
+ printf("\nfh = %.6f negative\n",fh);\r
+ exit(-1);\r
+ }\r
+ if(!FPE) {\r
+ FPE=1; matout(F0,x,1,np);\r
+ printf("lfun: h=%4d fh=%9.6e\nData: ", h+1,fh);\r
+ print1site(F0, h);\r
+ FPN(F0);\r
+ }\r
+ fh = 1e-80;\r
+ }\r
+ fh = log(fh);\r
+ for(k=0; k<com.NnodeScale; k++)\r
+ fh += com.nodeScaleF[k*com.npatt+h];\r
+\r
+ lnL -= fh*com.fpatt[h];\r
+ if(LASTROUND==2) dfsites[h] = fh;\r
+ if (com.print<0)\r
+ print_lnf_site(h,fh);\r
+ }\r
+ }\r
+ return (lnL);\r
+}\r
+\r
+\r
+\r
+\r
+int print1site (FILE*fout, int h)\r
+{\r
+/* This print out one site in the sequence data, com.z[]. It may be the h-th \r
+ site in the original data file or the h-th pattern. The data are coded.\r
+ naa > 1 if the codon codes for more than one amino acid.\r
+*/\r
+ char *pch=(com.seqtype==0 ? BASEs : (com.seqtype==2 ? AAs: (com.seqtype==5?BASEs5:BINs)));\r
+ char compatibleAAs[20]="";\r
+ int n=com.ncode, i, b, aa=0;\r
+\r
+ for(i=0; i<com.ns; i++) {\r
+ b = com.z[i][h];\r
+ if(com.seqtype==0 || com.seqtype==2) \r
+ fprintf(fout,"%c", pch[b]);\r
+#if defined(CODEML)\r
+ else if(com.seqtype==1) {\r
+ aa = GetAASiteSpecies(i, h);\r
+ fprintf(fout, "%s (%c) ", CODONs[b], aa);\r
+ }\r
+#endif\r
+ }\r
+ return(0);\r
+}\r
+ \r
+\r
+#if(defined MINIMIZATION)\r
+\r
+/* November, 1999, Minimization branch by branch */\r
+int noisy_minbranches;\r
+double *space_minbranches, *g_minbranches, *varb_minbranches, e_minbranches;\r
+\r
+double minbranches(double xcom[], int np);\r
+int lfunt(double t, int a,int b,double x[],double *l, double space[]);\r
+int lfuntdd(double t, int a,int b,double x[], double *l,double*dl,double*ddl,\r
+ double space[]);\r
+int lfunt_SiteClass(double t, int a,int b,double x[],double *l,double space[]);\r
+int lfuntdd_SiteClass(double t, int a,int b,double x[],\r
+ double *l,double*dl,double*ddl,double space[]);\r
+\r
+int minB (FILE*fout, double *lnL,double x[],double xb[][2],double e0, double space[])\r
+{\r
+/* This calculates lnL for given values of common parameters by optimizing \r
+ branch lengths, cycling through them.\r
+ Z. Yang, November 1999\r
+ This calls minbranches to optimize branch lengths and ming2 to \r
+ estimate other paramters.\r
+ At the end of the routine, there is a call to lfun to restore nodes[].conP.\r
+ Returns variances of branch lengths in space[].\r
+ space[] is com.space[]. com.space may be reallocated here, which may be unsafe \r
+ as the pointers in the calling routine may not be pointing to the right places.\r
+\r
+ return value: 0 convergent; -1: not convergent.\r
+*/\r
+ int ntime0=com.ntime, fix_blength0=com.fix_blength;\r
+ int status=0, i, npcom=com.np-com.ntime;\r
+ size_t s;\r
+ double *xcom=x+com.ntime, lnL0= *lnL, dl, e=1e-5;\r
+ double (*xbcom)[2]=xb+ntime0;\r
+ int small_times=0, max_small_times=100, ir,maxr=(npcom?200:1);\r
+ double small_improvement=0.001;\r
+ char timestr[64];\r
+\r
+ if(com.conPSiteClass) {\r
+ s = (2*com.ncode*com.ncode+com.ncode*(size_t)com.npatt)*sizeof(double);\r
+ if(com.sspace < s) { /* this assumes that space is com.space */\r
+ printf("\n%lu bytes in space, %lu bytes needed\n", com.sspace, s);\r
+ printf("minB: reallocating memory for working space.\n");\r
+ com.space = (double*)realloc(com.space, s);\r
+ if(com.space==NULL) error2("oom space");\r
+ com.sspace = s;\r
+ }\r
+ }\r
+ g_minbranches = com.space;\r
+ varb_minbranches = com.space + com.np;\r
+ s = (3*com.ncode*com.ncode + (com.conPSiteClass) * 4 *(size_t)com.npatt) *sizeof(double);\r
+ if((space_minbranches=(double*)malloc(s))==NULL) \r
+ error2("oom minB");\r
+ if(com.ntime==0) error2("minB: should not come here");\r
+\r
+ if(*lnL<=0) *lnL = com.plfun(x,com.np);\r
+ e = e_minbranches = (npcom ? 5.0 : e0);\r
+ com.ntime = 0; com.fix_blength = 2;\r
+#if(CODEML)\r
+ if(com.NSsites==0) com.pomega = xcom+com.nrgene+!com.fix_kappa;\r
+#endif\r
+\r
+ for(ir=0; (npcom==0||com.method) && ir<maxr; ir++) {\r
+ if(npcom) {\r
+ if(noisy>2) printf("\n\nRound %da: Paras (%d) (e=%.6g)",ir+1,npcom,e);\r
+ ming2(NULL,lnL,com.plfun,NULL,xcom, xbcom, com.space,e,npcom);\r
+ if(noisy>2) {\r
+ FPN(F0); FOR(i,npcom) printf(" %11.6f", xcom[i]);\r
+ printf("%8s%s\n", "", printtime(timestr));\r
+ }\r
+ }\r
+ noisy_minbranches = noisy;\r
+ if(noisy>2)\r
+ printf("\nRound %db: Blengths (%d, e=%.6g)\n",ir+1,tree.nbranch,e_minbranches);\r
+\r
+ *lnL = minbranches(xcom, -1);\r
+ for(i=0; i<tree.nnode; i++) \r
+ if(i != tree.root) \r
+ x[nodes[i].ibranch] = nodes[i].branch;\r
+ if(noisy>2) printf("\n%s\n", printtime(timestr));\r
+\r
+ if((dl=fabs(*lnL-lnL0))<e0 && e<=0.02) break;\r
+ if(dl<small_improvement) small_times++;\r
+ else small_times=0;\r
+ if((small_times>max_small_times && ntime0<200) || (com.method==2&&ir==1)) {\r
+ if(noisy && com.method!=2) puts("\nToo slow, switching algorithm.");\r
+ status=2;\r
+ break;\r
+ }\r
+ if(noisy && small_times>5) \r
+ printf("\n%d rounds of small improvement.",small_times);\r
+\r
+ e/=2; if(dl<1) e/=2;\r
+ if(dl<0.5) e = min2(e,1e-3); \r
+ else if(dl>10) e = max2(e,0.1); \r
+ e_minbranches = max2(e, 1e-6);\r
+ e = max2(e,1e-6);\r
+\r
+ lnL0= *lnL;\r
+ if(fout) {\r
+ fprintf(fout,"%4d %12.5f x ", ir+1,*lnL);\r
+ for(i=0; i<com.np; i++)\r
+ fprintf(fout, " %8.5f", x[i]);\r
+ FPN(fout); fflush(fout);\r
+ }\r
+ }\r
+ if (npcom && ir==maxr) status=-1;\r
+\r
+ if(npcom && status==2) {\r
+ noisy_minbranches = 0;\r
+ com.ntime = ntime0; \r
+ com.fix_blength = fix_blength0;\r
+ ming2(NULL,lnL,com.plfun,NULL,x,xb, com.space,e0,com.np);\r
+ for(i=0; i<tree.nnode; i++) space[i] = -1;\r
+ }\r
+\r
+ for(i=0; i<tree.nnode; i++)\r
+ if(i!=tree.root) x[nodes[i].ibranch] = nodes[i].branch;\r
+\r
+ if(noisy>2) printf("\nlnL = %12.6f\n",- *lnL);\r
+\r
+ com.ntime = ntime0; \r
+ com.fix_blength = fix_blength0;\r
+ *lnL = com.plfun(x,com.np); /* restore things, for e.g. AncestralSeqs */\r
+ if(fabs(*lnL-lnL0) > 1e-5) \r
+ printf("%.6f != %.6f lnL error. Something is wrong in minB\n", *lnL, lnL0);\r
+ free(space_minbranches);\r
+\r
+ return (status==-1 ? -1 : 0);\r
+}\r
+\r
+\r
+/********************* START: Testing iteration algorithm ******************/\r
+\r
+int minB2 (FILE*fout, double *lnL,double x[],double xb[][2],double e0, double space[])\r
+{\r
+/* \r
+*/\r
+ int ntime0=com.ntime, fix_blength0=com.fix_blength;\r
+ int status=0, i, npcom=com.np-com.ntime;\r
+ size_t s;\r
+ double *xcom=x+com.ntime, lnL0= *lnL;\r
+ double (*xbcom)[2]=xb+ntime0;\r
+\r
+ s = (3*com.ncode*com.ncode + (com.conPSiteClass) * 4*(size_t)com.npatt) * sizeof(double);\r
+ if((space_minbranches=(double*)malloc(s))==NULL) error2("oom minB2");\r
+ if(com.ntime==0 || npcom==0) error2("minB2: should not come here");\r
+\r
+ noisy_minbranches=0;\r
+ /* if(*lnL<=0) *lnL=com.plfun(x,com.np); */\r
+ com.ntime=0; com.fix_blength=2;\r
+#if(CODEML)\r
+ if(com.NSsites==0) com.pomega=xcom+com.nrgene+!com.fix_kappa;\r
+#endif\r
+\r
+ ming2(NULL, lnL, minbranches, NULL, xcom, xbcom, space, e0, npcom);\r
+\r
+\r
+ com.ntime = ntime0; com.fix_blength = fix_blength0;\r
+ for(i=0; i<tree.nnode; i++) \r
+ if(i!=tree.root) x[nodes[i].ibranch] = nodes[i].branch;\r
+ *lnL = com.plfun(x,com.np); /* restore things, for e.g. AncestralSeqs */\r
+ free(space_minbranches);\r
+\r
+ return (status==-1 ? -1 : 0);\r
+}\r
+\r
+/********************* END: Testing iteration algorithm ******************/\r
+\r
+\r
+static int times=0;\r
+\r
+\r
+int updateconP (double x[], int inode)\r
+{\r
+/* update conP for inode. \r
+\r
+ Confusing decision about x[] follows. Think about redesign.\r
+\r
+ (1) Called by PostProbNode for ancestral reconstruction, with com.clock = 0, \r
+ 1, 2: x[] is passed over and com.ntime is used to get xcom in \r
+ SetPSiteClass()\r
+ (2) Called from minbranches(), with com.clock = 0. xcom[] is passed \r
+ over by minbranches and com.ntime=0 is set. So SetPSiteClass()\r
+ can still get the correct substitution parameters. \r
+ Also look at ConditionalPNode().\r
+ \r
+ Note that com.nodeScaleF and nodes[].conP are shifted if(com.conPSiteClass).\r
+*/\r
+ int ig,i,ir;\r
+\r
+ if(com.conPSiteClass==0)\r
+ for(ig=0; ig<com.ngene; ig++) {\r
+ if(com.Mgene>1 || com.nalpha>1)\r
+ SetPGene(ig,com.Mgene>1,com.Mgene>1,com.nalpha>1,x);\r
+ /* x[] needed by local clock models and if(com.aaDist==AAClasses).\r
+ This is called from PostProbNode\r
+ */\r
+ \r
+ ConditionalPNode(inode, ig, x);\r
+ }\r
+ else { /* site-class models */\r
+ FOR(ir,com.ncatG) {\r
+#ifdef CODEML\r
+ IClass = ir;\r
+#endif\r
+ if(ir) {\r
+ if(com.NnodeScale)\r
+ com.nodeScaleF += com.NnodeScale*(size_t)com.npatt;\r
+ for(i=com.ns; i<tree.nnode; i++)\r
+ nodes[i].conP += (tree.nnode-com.ns)*com.ncode*(size_t)com.npatt;\r
+ }\r
+ SetPSiteClass(ir, x);\r
+ for(ig=0; ig<com.ngene; ig++) {\r
+ if(com.Mgene>1 || com.nalpha>1)\r
+ SetPGene(ig, com.Mgene>1, com.Mgene>1, com.nalpha>1, x);\r
+ if(com.nalpha>1) SetPSiteClass(ir, x);\r
+ ConditionalPNode(inode,ig, x);\r
+ }\r
+ }\r
+\r
+ /* shift positions */\r
+ com.nodeScaleF -= (com.ncatG-1)*com.NnodeScale*com.npatt;\r
+ for(i=com.ns; i<tree.nnode; i++)\r
+ nodes[i].conP -= (com.ncatG-1)*(tree.nnode-com.ns)*com.ncode*(size_t)com.npatt;\r
+ }\r
+ return(0);\r
+}\r
+\r
+\r
+double minbranches (double x[], int np)\r
+{\r
+/* Ziheng, November 1999.\r
+ optimizing one branch at a time\r
+ \r
+ for each branch a..b, reroot the tree at b, and \r
+ then calculate conditional probability for node a.\r
+ For each branch, this routine determines the Newton search direction \r
+ p = -dl/dll. It then halves the steplength to make sure -lnL is decreased.\r
+ When the Newton solution is correct, this strategy will waste one \r
+ extra call to lfunt. It does not seem possible to remove calculation of \r
+ l (lnL) in lfuntddl().\r
+ lfun or lfundG and thus SetParameters are called once beforehand to set up \r
+ globals like com.pomega.\r
+ This works with NSsites and NSbranch models.\r
+ \r
+ com.oldconP[] marks nodes that need to be updated when the tree is rerooted. \r
+ The array is declared in baseml and codeml and used in the following \r
+ routines: ReRootTree, minbranches, and ConditionalPNode.\r
+\r
+ Note: At the end of the routine, nodes[].conP are not updated.\r
+*/\r
+ int ib,oldroot=tree.root, a,b;\r
+ int icycle, maxcycle=500, icycleb, ncycleb=10, i;\r
+ double lnL, lnL0=0, l0,l,dl,ddl=-1, t,t0,t00, p,step=1, small=1e-20,y;\r
+ double tb[2]={1e-8,50}, e=e_minbranches, *space=space_minbranches;\r
+ double *xcom=x+com.ntime; /* this is incorrect as com.ntime=0 */\r
+ double smallddl=0.25/com.ls*(1-0.25/com.ls)/com.ls;\r
+\r
+ if(com.ntime) error2("ntime should be 0 in minbranches");\r
+ lnL0 = l0 = l = lnL = com.plfun(xcom,-1);\r
+\r
+ if(noisy_minbranches>2) printf("\tlnL0 = %14.6f\n",-lnL0);\r
+\r
+ for(icycle=0; icycle<maxcycle; icycle++) {\r
+ for(ib=0; ib<tree.nbranch; ib++) {\r
+ t = t0 = t00 = nodes[tree.branches[ib][1]].branch; \r
+ l0 = l;\r
+ a = tree.branches[ib][0];\r
+ b = tree.branches[ib][1];\r
+ /* if a is the root, why do we want to reroot the tree at b? Just switch a with b? */\r
+\r
+ for(i=0; i<tree.nnode; i++)\r
+ com.oldconP[i]=1;\r
+ ReRootTree(b);\r
+ updateconP(x, a);\r
+\r
+ for(icycleb=0; icycleb<ncycleb; icycleb++) { /* iterating a branch */\r
+ if(!com.conPSiteClass)\r
+ lfuntdd(t, a, b, xcom, &y, &dl, &ddl, space);\r
+ else\r
+ lfuntdd_SiteClass(t, a, b, xcom, &y, &dl, &ddl, space);\r
+\r
+ p = -dl/fabs(ddl);\r
+ /* p = -dl/ddl; newton direction */\r
+ if (fabs(p)<small) step = 0;\r
+ else if(p<0) step = min2(1, (tb[0]-t0)/p);\r
+ else step = min2(1, (tb[1]-t0)/p);\r
+\r
+ if(icycle==0 && step!=1 && step!=0)\r
+ step *= 0.99; /* avoid border */\r
+ for (i=0; step>small; i++,step/=4) {\r
+ t = t0 + step*p;\r
+ if(!com.conPSiteClass) lfunt(t, a, b, xcom, &l, space);\r
+ else lfunt_SiteClass(t, a, b, xcom, &l, space);\r
+ if(l<l0) break;\r
+ }\r
+ if(step<=small) { t=t0; l=l0; break; }\r
+ if(fabs(t-t0)<e*fabs(1+t) && fabs(l-l0)<e) break;\r
+ t0=t; l0=l;\r
+ }\r
+ nodes[a].branch = t;\r
+\r
+ g_minbranches[ib] = -dl;\r
+ varb_minbranches[ib] = -ddl;\r
+ } /* for (ib) */\r
+ lnL = l;\r
+ if(noisy_minbranches>2) printf("\tCycle %2d: %14.6f\n",icycle+1, -l);\r
+ if(fabs(lnL-lnL0) < e) break;\r
+ lnL0 = lnL;\r
+ } /* for (icycle) */\r
+ ReRootTree(oldroot); /* did not update conP */\r
+ FOR(i,tree.nnode) com.oldconP[i]=0;\r
+ return(lnL);\r
+}\r
+\r
+\r
+\r
+int lfunt(double t, int a, int b, double xcom[], double *l, double space[])\r
+{\r
+/* See notes for lfunt_dd and minbranches\r
+*/\r
+ int i,j,k, h,ig, n=com.ncode, nroot=n;\r
+ int n1 = (com.cleandata&&b<com.ns ? 1 : n), xb, nUVR;\r
+ double expt,uexpt=0,multiply;\r
+ double *P=space, piqi,pqj, fh, mr=0;\r
+ double *pkappa;\r
+\r
+#if (CODEML)\r
+ nUVR = NBTYPE+2;\r
+ pkappa = (com.hkyREV||com.codonf==FMutSel ? xcom+com.nrgene : &com.kappa);\r
+ if (com.seqtype==CODONseq && com.model) {\r
+ if((com.model==NSbranchB || com.model==NSbranch2) && com.NSsites==0 && com.nbtype<=nUVR) {\r
+ U = _UU[(int)nodes[a].label]; \r
+ V = _VV[(int)nodes[a].label]; \r
+ Root = _Root[(int)nodes[a].label]; \r
+ }\r
+ else {\r
+ eigenQcodon(1, -1, NULL, NULL, NULL, Root, U, V, &mr, pkappa, nodes[a].omega, PMat);\r
+ }\r
+ }\r
+#endif\r
+\r
+#if (BASEML)\r
+ if (com.nhomo==2)\r
+ eigenTN93(com.model, *nodes[a].pkappa, 1, com.pi, &nR, Root, Cijk);\r
+ nroot = nR;\r
+#endif\r
+\r
+ *l = 0;\r
+ for (ig=0; ig<com.ngene; ig++) {\r
+ if(com.Mgene>1) SetPGene(ig,1,1,0,xcom); /* com.ntime=0 */\r
+ for(i=0; i<n*n; i++) P[i] = 0;\r
+\r
+ for(k=0,expt=1; k<nroot; k++) {\r
+ multiply = com.rgene[ig]*Root[k];\r
+ if(k) expt = exp(t*multiply);\r
+\r
+#if (CODEML) /* uses U & V */\r
+ for(i=0; i<n; i++)\r
+ for(j=0,uexpt=U[i*n+k]*expt; j<n; j++)\r
+ P[i*n+j] += uexpt*V[k*n+j];\r
+#elif (BASEML) /* uses Cijk */\r
+ for(i=0; i<n; i++) for(j=0; j<n; j++)\r
+ P[i*n+j] += Cijk[i*n*nroot+j*nroot+k]*expt;\r
+#endif\r
+ }\r
+\r
+ for (h=com.posG[ig]; h<com.posG[ig+1]; h++) {\r
+ n1 = (b<com.ns ? nChara[com.z[b][h]] : n);\r
+ for(i=0,fh=0; i<n1; i++) {\r
+ xb = i;\r
+ if(b<com.ns) piqi = com.pi[ xb = CharaMap[com.z[b][h]][i] ];\r
+ else piqi = com.pi[i] * nodes[b].conP[h*n+i];\r
+\r
+ for(j=0,pqj=0; j<n; j++)\r
+ pqj += P[xb*n+j]*nodes[a].conP[h*n+j];\r
+ fh += piqi*pqj;\r
+ }\r
+ if(noisy && fh<1e-250)\r
+ printf("a bit too small: fh[%d] = %10.6e\n",h,fh);\r
+ if(fh<0) fh = -500;\r
+ else fh = log(fh);\r
+\r
+ *l -= fh*com.fpatt[h];\r
+ for(i=0; i<com.NnodeScale; i++)\r
+ *l -= com.nodeScaleF[i*com.npatt+h]*com.fpatt[h];\r
+ }\r
+ }\r
+ return(0);\r
+}\r
+\r
+\r
+int lfuntdd(double t, int a, int b, double xcom[], double *l, double*dl, double*ddl, double space[])\r
+{\r
+/* Calculates lnL for branch length t for branch b->a.\r
+ See notes in minbranches().\r
+ Conditional probability updated correctly already.\r
+\r
+ i for b, j for a?\r
+*/\r
+ int i,j,k, h,ig,n=com.ncode, nroot=n;\r
+ int n1 = (com.cleandata&&b<com.ns ? 1 : n), xb, nUVR;\r
+ double expt, uexpt = 0, multiply;\r
+ double *P=space, *dP=P+n*n,*ddP=dP+n*n, piqi,pqj,dpqj,ddpqj, fh, dfh, ddfh;\r
+ double *pkappa, mr=0;\r
+\r
+#if(CODEML)\r
+ nUVR = NBTYPE+2;\r
+ pkappa=(com.hkyREV||com.codonf==FMutSel ? xcom+com.nrgene : &com.kappa);\r
+ if (com.seqtype==CODONseq && com.model) {\r
+ if((com.model==NSbranchB || com.model==NSbranch2) && com.NSsites==0 && com.nbtype<=nUVR) {\r
+ U = _UU[(int)nodes[a].label]; \r
+ V = _VV[(int)nodes[a].label]; \r
+ Root = _Root[(int)nodes[a].label]; \r
+ }\r
+ else {\r
+ eigenQcodon(1,-1,NULL,NULL,NULL,Root,U,V, &mr, pkappa, nodes[a].omega, PMat);\r
+ }\r
+ }\r
+#endif\r
+\r
+#if(BASEML)\r
+ if (com.nhomo==2)\r
+ eigenTN93(com.model, *nodes[a].pkappa, 1, com.pi, &nR, Root, Cijk);\r
+ nroot=nR;\r
+#endif\r
+ *l = *dl = *ddl = 0;\r
+ for(ig=0; ig<com.ngene; ig++) {\r
+ if(com.Mgene>1) SetPGene(ig,1,1,0,xcom); /* com.ntime=0 */\r
+ for(i=0; i<n*n; i++) P[i] = dP[i] = ddP[i] = 0;\r
+\r
+ for(k=0,expt=1; k<nroot; k++) {\r
+ multiply = com.rgene[ig]*Root[k];\r
+ if(k) expt = exp(t*multiply);\r
+\r
+#if (CODEML) /* uses U & V */\r
+ for(i=0; i<n; i++) \r
+ for(j=0,uexpt=U[i*n+k]*expt; j<n; j++) {\r
+ P[i*n+j] += uexpt*V[k*n+j];\r
+ if(k) {\r
+ dP[i*n+j] += uexpt*V[k*n+j]*multiply;\r
+ ddP[i*n+j] += uexpt*V[k*n+j]*multiply*multiply;\r
+ }\r
+ }\r
+#elif (BASEML) /* uses Cijk */\r
+ for(i=0; i<n; i++) for(j=0; j<n; j++) {\r
+ P[i*n+j] += Cijk[i*n*nroot+j*nroot+k]*expt;\r
+ if(k) {\r
+ dP[i*n+j] += Cijk[i*n*nroot+j*nroot+k]*expt*multiply;\r
+ ddP[i*n+j] += Cijk[i*n*nroot+j*nroot+k]*expt*multiply*multiply;\r
+ }\r
+ }\r
+#endif\r
+ }\r
+\r
+ for (h=com.posG[ig]; h<com.posG[ig+1]; h++) {\r
+ n1 = (b<com.ns ? nChara[com.z[b][h]] : n);\r
+ for(i=0,fh=dfh=ddfh=0; i<n1; i++) {\r
+ xb = i;\r
+ if(b<com.ns) piqi = com.pi[ xb = CharaMap[com.z[b][h]][i] ];\r
+ else piqi = com.pi[i] * nodes[b].conP[h*n+i];\r
+ for(j=0,pqj=dpqj=ddpqj=0; j<n; j++) {\r
+ pqj += P[xb*n+j] * nodes[a].conP[h*n+j];\r
+ dpqj += dP[xb*n+j] * nodes[a].conP[h*n+j];\r
+ ddpqj += ddP[xb*n+j] * nodes[a].conP[h*n+j];\r
+ }\r
+ fh += piqi*pqj;\r
+ dfh += piqi*dpqj;\r
+ ddfh += piqi*ddpqj;\r
+ }\r
+ if(noisy && fh<1e-250) {\r
+ printf("too small: fh[%d] = %10.6e\n",h,fh);\r
+ OutTreeN(F0,0,1);\r
+ }\r
+ *l -= log(fh)*com.fpatt[h];\r
+ for(i=0; i<com.NnodeScale; i++)\r
+ *l -= com.nodeScaleF[i*com.npatt+h]*com.fpatt[h];\r
+ *dl -= dfh/fh * com.fpatt[h];\r
+ *ddl -= (fh*ddfh - dfh*dfh)/(fh*fh) * com.fpatt[h];\r
+ }\r
+ } /* for(ig) */\r
+ return(0);\r
+}\r
+\r
+\r
+int lfunt_SiteClass(double t, int a, int b, double xcom[], double *l, double space[])\r
+{\r
+/* see notes in lfuntdd_SiteClass\r
+ For branch&site models, look at the notes in GetPMatBranch()\r
+*/\r
+ int i,j,k, h,ig,ir,it, n=com.ncode, nroot=n;\r
+ int n1=(com.cleandata&&b<com.ns?1:n), xb;\r
+ double y,expt,uexpt=0,multiply, piqi,pqj;\r
+ double *P=space, *fh=P+n*n;\r
+ double *Sh=fh+com.npatt; /* scale factor for each site pattern*/\r
+ double *pK=com.fhK; /* proportion for each site class after scaling */\r
+ double smallw=1e-12; \r
+\r
+#if (BASEML)\r
+ if (com.nhomo==2)\r
+ eigenTN93(com.model, *nodes[a].pkappa,1,com.pi,&nR,Root,Cijk);\r
+ nroot=nR;\r
+#endif\r
+\r
+ if(com.NnodeScale==0) \r
+ for(ir=0; ir<com.ncatG; ir++) \r
+ for (h=0; h<com.npatt; h++) \r
+ pK[ir*com.npatt+h] = com.freqK[ir];\r
+ else {\r
+ for(h=0; h<com.npatt; h++) {\r
+ for(ir=0,it=0; ir<com.ncatG; ir++) {\r
+ for(k=0,y=0; k<com.NnodeScale; k++)\r
+ y += com.nodeScaleF[ir*com.NnodeScale*com.npatt + k*com.npatt+h];\r
+ if((pK[ir*com.npatt+h]=y) > pK[it*com.npatt+h])\r
+ it = ir;\r
+ }\r
+ Sh[h] = pK[it*com.npatt+h];\r
+ for(ir=0; ir<com.ncatG; ir++)\r
+ pK[ir*com.npatt+h] = com.freqK[ir]*exp(pK[ir*com.npatt+h]-Sh[h]);\r
+ }\r
+ }\r
+\r
+ for(h=0; h<com.npatt; h++) fh[h] = 0;\r
+ for(ir=0; ir<com.ncatG; ir++) {\r
+ SetPSiteClass(ir, xcom); /* com.ntime=0 */\r
+\r
+#if CODEML /* branch b->a */\r
+ /* branch&site models */\r
+ if(com.seqtype==CODONseq && com.NSsites && com.model)\r
+ Set_UVR_BranchSite (ir, (int)nodes[a].label);\r
+#endif\r
+\r
+ if(ir) {\r
+ for(i=com.ns;i<tree.nnode;i++)\r
+ nodes[i].conP += (tree.nnode-com.ns)*n*(size_t)com.npatt;\r
+ }\r
+ for (ig=0; ig<com.ngene; ig++) {\r
+ if(com.Mgene>1 || com.nalpha>1)\r
+ SetPGene(ig,com.Mgene>1,com.Mgene>1,com.nalpha>1,xcom); /* com.ntime=0 */\r
+ if(com.nalpha>1) SetPSiteClass(ir, xcom); /* com.ntime=0 */\r
+\r
+ for(i=0; i<n*n; i++) P[i] = 0;\r
+ for(k=0,expt=1; k<nroot; k++) {\r
+ multiply = com.rgene[ig]*Root[k]*_rateSite;\r
+#if (CODEML)\r
+ if(com.seqtype==1 && com.model>=2) \r
+ multiply *= Qfactor_NS_branch[(int)nodes[a].label];\r
+#endif\r
+ if(k) expt = exp(t*multiply);\r
+\r
+#if (CODEML) /* uses U & V */\r
+ for(i=0; i<n; i++) \r
+ for(j=0,uexpt=U[i*n+k]*expt; j<n; j++)\r
+ P[i*n+j] += uexpt*V[k*n+j];\r
+#elif (BASEML) /* uses Cijk */\r
+ for(i=0; i<n; i++) \r
+ for(j=0; j<n; j++) \r
+ P[i*n+j] += Cijk[i*n*nroot+j*nroot+k]*expt;\r
+#endif\r
+ } /* for (k), look through eigenroots */\r
+ for (h=com.posG[ig]; h<com.posG[ig+1]; h++) {\r
+ n1 = (b<com.ns ? nChara[com.z[b][h]] : n);\r
+ for(i=0; i<n1; i++) {\r
+ xb = i;\r
+ if(b<com.ns) piqi = pK[ir*com.npatt+h] * com.pi[ xb = CharaMap[com.z[b][h]][i] ];\r
+ else piqi = pK[ir*com.npatt+h] * com.pi[i] * nodes[b].conP[h*n+i];\r
+\r
+ for(j=0,pqj=0; j<n; j++)\r
+ pqj += P[xb*n+j]*nodes[a].conP[h*n+j];\r
+ fh[h] += piqi*pqj;\r
+ }\r
+ } /* for (h) */\r
+ } /* for (ig) */\r
+ } /* for(ir) */\r
+\r
+ for(i=com.ns; i<tree.nnode; i++) /* shift position */\r
+ nodes[i].conP -= (com.ncatG-1)*(tree.nnode-com.ns)*n*(size_t)com.npatt;\r
+ for(h=0,*l=0; h<com.npatt; h++) {\r
+ if(fh[h]<1e-250) \r
+ printf("small (lfunt_SiteClass): fh[%d] = %10.6e\n",h,fh[h]);\r
+\r
+ *l -= log(fh[h])*com.fpatt[h];\r
+ if(com.NnodeScale) *l -= Sh[h]*com.fpatt[h];\r
+ }\r
+ return(0);\r
+}\r
+\r
+\r
+int lfuntdd_SiteClass(double t, int a,int b,double xcom[],\r
+ double *l,double*dl,double*ddl,double space[])\r
+{\r
+/* dt and ddt for site-class models, modified from lfuntdd()\r
+ nodes[].conP (and com.nodeScaleF if scaling is used) is shifted for ir, \r
+ and moved back to the rootal place at the end of the routine.\r
+\r
+ At the start of this routine, nodes[].conP has the conditional probabilties \r
+ for each node, each site pattern, for each site class (ir). \r
+ Scaling: When scaling is used, scale factors \r
+ com.nodeScaleF[ir*com.NnodeScale*com.npatt + k*com.npatt+h] for all nodes \r
+ are collected into Sh[h], after adjusting for rate classes, since the \r
+ sum is taken over ir. Sh[h] and pK[ir*com.npatt+h] together store the \r
+ scale factors and proportions for site classes. com.freqK[ir] is not \r
+ used in this routine beyond this point.\r
+ if(com.Malpha), com.freqK[]=1/com.ncatG and does not change with ig, \r
+ and so the collection of Sh for sites at the start of the routine is o.k.\r
+\r
+ The space for com.fhK[] is used.\r
+ space[2*ncode*ncode + 4*npatt]:\r
+ dP[ncode*ncode],ddP[ncode*ncode],fh[npatt],dfh[npatt],ddfh[npatt],Sh[npatt]\r
+ pK[ncatG*npatt]=com.fhK[]\r
+*/\r
+ int i,j,k, h,ig,ir,it, n=com.ncode, nroot=n;\r
+ int n1=(com.cleandata&&b<com.ns?1:n), xb;\r
+ double y,expt,uexpt=0,multiply, piqi,pqj,dpqj,ddpqj;\r
+ double *P=PMat, *dP=space,*ddP=dP+n*n;\r
+ double *fh=ddP+n*n, *dfh=fh+com.npatt, *ddfh=dfh+com.npatt;\r
+ double *Sh=ddfh+com.npatt; /* scale factor for each site pattern */\r
+ double *pK=com.fhK; /* proportion for each site class after scaling */\r
+ double smallw=1e-12; \r
+ size_t s;\r
+\r
+#if (BASEML)\r
+ if (com.nhomo==2)\r
+ eigenTN93(com.model, *nodes[a].pkappa, 1, com.pi, &nR, Root, Cijk);\r
+ nroot=nR;\r
+#endif\r
+ if(com.NnodeScale==0)\r
+ for(ir=0; ir<com.ncatG; ir++)\r
+ for(h=0; h<com.npatt; h++) \r
+ pK[ir*com.npatt+h] = com.freqK[ir];\r
+ else {\r
+ for(h=0; h<com.npatt; h++) {\r
+ for(ir=0,it=0; ir<com.ncatG; ir++) {\r
+ for(k=0,y=0; k<com.NnodeScale; k++)\r
+ y += com.nodeScaleF[ir*com.NnodeScale*com.npatt + k*com.npatt+h];\r
+ if((pK[ir*com.npatt+h]=y) > pK[it*com.npatt+h]) \r
+ it = ir;\r
+ }\r
+ Sh[h] = pK[it*com.npatt+h];\r
+ for(ir=0; ir<com.ncatG; ir++)\r
+ pK[ir*com.npatt+h] = com.freqK[ir] * exp(pK[ir*com.npatt+h]-Sh[h]);\r
+ }\r
+ }\r
+\r
+ for(h=0; h<com.npatt; h++)\r
+ fh[h] = dfh[h] = ddfh[h] = 0;\r
+ for(ir=0; ir<com.ncatG; ir++) {\r
+ SetPSiteClass(ir, xcom); /* com.ntime=0 */\r
+\r
+#if CODEML /* branch b->a */\r
+ /* branch&site models */\r
+ if(com.seqtype==CODONseq && com.NSsites && com.model)\r
+ Set_UVR_BranchSite (ir, (int)nodes[a].label);\r
+#endif\r
+\r
+ if(ir) {\r
+ for(i=com.ns; i<tree.nnode; i++)\r
+ nodes[i].conP += (tree.nnode-com.ns)*n*(size_t)com.npatt;\r
+ }\r
+ for (ig=0; ig<com.ngene; ig++) {\r
+ if(com.Mgene>1 || com.nalpha>1)\r
+ SetPGene(ig,com.Mgene>1,com.Mgene>1,com.nalpha>1,xcom); /* com.ntime=0 */\r
+ if(com.nalpha>1) SetPSiteClass(ir, xcom); /* com.ntime=0 */\r
+\r
+ for(i=0; i<n*n; i++) \r
+ P[i] = dP[i] = ddP[i]=0;\r
+ for(k=0,expt=1; k<nroot; k++) { /* k loops through eigenroots */\r
+ multiply = com.rgene[ig]*Root[k]*_rateSite;\r
+#if (CODEML)\r
+ if(com.seqtype==1 && com.model>=2) \r
+ multiply *= Qfactor_NS_branch[(int)nodes[a].label];\r
+#endif\r
+ if(k) expt = exp(t*multiply);\r
+\r
+#if (CODEML) /* uses U & V */\r
+ for(i=0; i<n; i++) \r
+ for(j=0,uexpt=U[i*n+k]*expt; j<n; j++) {\r
+ P[i*n+j] += uexpt*V[k*n+j];\r
+ if(k) {\r
+ dP[i*n+j] += uexpt*V[k*n+j]*multiply;\r
+ ddP[i*n+j] += uexpt*V[k*n+j]*multiply*multiply;\r
+ }\r
+ }\r
+#elif (BASEML) /* uses Cijk */\r
+ for(i=0; i<n; i++) for(j=0; j<n; j++) {\r
+ P[i*n+j] += Cijk[i*n*nroot+j*nroot+k]*expt;\r
+ if(k) {\r
+ dP[i*n+j] += Cijk[i*n*nroot+j*nroot+k]*expt*multiply;\r
+ ddP[i*n+j] += Cijk[i*n*nroot+j*nroot+k]*expt*multiply*multiply;\r
+ }\r
+ }\r
+#endif\r
+ }\r
+\r
+ for (h=com.posG[ig]; h<com.posG[ig+1]; h++) {\r
+ n1 = (b<com.ns ? nChara[com.z[b][h]] : n);\r
+ for(i=0; i<n1; i++) {\r
+ xb = i;\r
+ if(b<com.ns)\r
+ piqi = pK[ir*com.npatt+h] * com.pi[ xb = CharaMap[com.z[b][h]][i] ];\r
+ else\r
+ piqi = pK[ir*com.npatt+h] * com.pi[i] * nodes[b].conP[h*n+i];\r
+\r
+ for(j=0,pqj=dpqj=ddpqj=0; j<n; j++) {\r
+ pqj += P[xb*n+j]*nodes[a].conP[h*n+j];\r
+ dpqj += dP[xb*n+j]*nodes[a].conP[h*n+j];\r
+ ddpqj += ddP[xb*n+j]*nodes[a].conP[h*n+j];\r
+ }\r
+ fh[h] += piqi*pqj;\r
+ dfh[h] += piqi*dpqj;\r
+ ddfh[h] += piqi*ddpqj;\r
+ }\r
+ } /* for (h) */\r
+ } /* for (ig) */\r
+ } /* for(ir) */\r
+\r
+ for(i=com.ns; i<tree.nnode; i++)\r
+ nodes[i].conP -= (com.ncatG-1)*(tree.nnode-com.ns)*n*(size_t)com.npatt;\r
+ for(h=0,*l=*dl=*ddl=0; h<com.npatt; h++) {\r
+ if(fh[h]<1e-250) \r
+ printf("small fh[%d] = %10.6e\n",h,fh[h]);\r
+\r
+ *l -= log(fh[h])*com.fpatt[h];\r
+ if(com.NnodeScale) *l -= Sh[h]*com.fpatt[h];\r
+ *dl -= dfh[h]/fh[h] * com.fpatt[h];\r
+ *ddl -= (fh[h]*ddfh[h] - dfh[h]*dfh[h])/(fh[h]*fh[h]) * com.fpatt[h];\r
+ }\r
+\r
+ return(0);\r
+}\r
+\r
+#endif\r
+\r
+\r
+#endif /* #ifdef LFUNCTIONS */\r
+\r
+#ifdef BIRTHDEATH\r
+\r
+void BranchLengthBD(int rooted, double birth, double death, double sample, \r
+ double mut)\r
+{\r
+/* Generate random branch lengths (nodes[].branch) using the birth and\r
+ death process with species sampling, or the Yule (coalescent?) process\r
+ if sample=0, when only parameter mut is used.\r
+ Note: older interior nodes have larger node numbers, so root is at\r
+ node com.ns*2-2 with time t[ns-2], while the youngest node is at \r
+ node com.ns with time t[0]. When unrooted=0, the root is removed with\r
+ branch lengths adjusted.\r
+ This works with the tree generated from RandomLHistory().\r
+*/\r
+ int i,j, it, imin,fixt0=1;\r
+ double la=birth, mu=death, rho=sample, tmin, r, t[NS-1];\r
+ double phi, eml, y;\r
+\r
+ if (sample==0) /* coalescent model. Check this!!! */\r
+ for (i=com.ns,y=0; i>1; i--) \r
+ nodes[com.ns*2-i].age=y += -log(rndu())/(i*(i-1.)/2.)*mut/2;\r
+ else { /* BD with sampling */\r
+ if (fixt0) t[com.ns-2]=1;\r
+ if (fabs(la-mu)>1e-6) {\r
+ eml = exp(mu-la); \r
+ phi = (rho*la*(eml-1)+(mu-la)*eml)/(eml-1);\r
+ for (i=0; i<com.ns-1-(fixt0); i++) {\r
+ r = rndu();\r
+ t[i] = log((phi-r*rho*la)/(phi-r*rho*la+r*(la-mu)))/(mu-la);\r
+ }\r
+ }\r
+ else \r
+ for (i=0; i<com.ns-1-(fixt0); i++) {\r
+ r = rndu();\r
+ t[i] = r/(1+la*rho*(1-r)); \r
+ }\r
+ /* bubble sort */\r
+ for (i=0; i<com.ns-1-1; i++) {\r
+ for (j=i+1,tmin=t[i],imin=i; j<com.ns-1; j++) \r
+ if (tmin>t[j]) { tmin=t[j]; imin=j; }\r
+ t[imin] = t[i];\r
+ t[i] = tmin;\r
+ }\r
+ for (i=com.ns; i>1; i--)\r
+ nodes[com.ns*2-i].age = t[com.ns-i]*mut;\r
+ }\r
+ for(i=0; i<com.ns; i++) nodes[i].age = 0;\r
+ for (i=0; i<tree.nnode; i++) \r
+ if (i != tree.root) \r
+ nodes[i].branch = nodes[nodes[i].father].age - nodes[i].age;\r
+ if (!rooted) {\r
+ it = nodes[tree.root].sons[2];\r
+ nodes[it].branch = 2*nodes[2*com.ns-2].age - nodes[tree.root].age - nodes[it].age;\r
+ }\r
+}\r
+\r
+#endif\r
+\r
+\r
+#ifdef NODESTRUCTURE\r
+#ifdef EVOLVER\r
+\r
+int RandomLHistory (int rooted, double space[])\r
+{\r
+/* random coalescence tree, with each labeled history having equal probability.\r
+ interior nodes are numbered ns, ns+1, ..., 2*ns-1-!rooted\r
+*/\r
+ int ns=com.ns, i, j, it=0, *nodea=(int*)space;\r
+ double t;\r
+\r
+ for (i=0; i<2*ns-1-!rooted; i++) ClearNode(i);\r
+\r
+ for (i=0; i<ns; i++) nodea[i]=i;\r
+ for (i=ns,t=0; i>(1+!rooted); i--) {\r
+ nodes[it=2*ns-i].nson = 2;\r
+ j = (int)(i*rndu()); \r
+ nodes[nodea[j]].father = it;\r
+ nodes[it].sons[0] = nodea[j];\r
+ nodea[j] = nodea[i-1];\r
+ j = (int)((i-1)*rndu()); \r
+ nodes[nodea[j]].father = it;\r
+ nodes[it].sons[1] = nodea[j];\r
+ nodea[j] = it;\r
+ if (!rooted && i==3) {\r
+ nodes[it].nson++; \r
+ nodes[nodea[1-j]].father = it;\r
+ nodes[it].sons[2] = nodea[1-j];\r
+ }\r
+ }\r
+ tree.root = it;\r
+ tree.nnode = ns*2-1-!rooted;\r
+ NodeToBranch();\r
+ return (0);\r
+}\r
+\r
+#endif\r
+\r
+#endif /* NODESTRUCTURE */\r
+\r
+\r
+\r
+/* routines for dating analysis of heterogeneous data */\r
+#if (defined BASEML || defined CODEML || defined MCMCTREE)\r
+\r
+\r
+#if (defined MCMCTREE)\r
+\r
+int ProcessFossilInfo()\r
+{\r
+/* This processes fossil calibration information that has been read into \r
+ nodes[].nodeStr. It uses both sptree and nodes[], before it is destroyed. \r
+ This is called before sequence alignments at loci are read.\r
+\r
+ Possible confusions: \r
+ Simple lower and upper bounds can be specified using <, >, or both < and > in \r
+ the tree either with or without quotation marks. These are read in ReadTreeN() \r
+ and processed in ReadTreeSeqs(). \r
+ Other distributions such as G, SN, ST must be specified using the format 'G(alpha, beta)',\r
+ say, and are processed here. Simple bounds can also be specified using the format \r
+ 'L(0.5)', 'U(1.0)', or 'B(0.5, 1.0)', in which case they are processed here. \r
+ I kept this complexity, (i) to keep the option of using <, >, which is intuitive, \r
+ (ii) for ReadTreeN to be able to read other node labels such as #, $, either with\r
+ or without ' '.\r
+*/\r
+ int i,j,k, nfossiltype=7;\r
+ char *pch;\r
+ double tailL=0.025, tailR=0.025, p_LOWERBOUND=0.1, c_LOWERBOUND=1.0;\r
+\r
+ for(i=sptree.nspecies; i<tree.nnode; i++) {\r
+ if(nodes[i].nodeStr == NULL) \r
+ continue;\r
+ if(sptree.nodes[i].fossil) { /* fossila specified using <, >, already processed. */\r
+ free(nodes[i].nodeStr);\r
+ continue;\r
+ }\r
+ for(j=1; j<nfossiltype+1; j++)\r
+ if((pch = strstr(nodes[i].nodeStr, fossils[j]))) break;\r
+ if(j == nfossiltype+1) \r
+ printf("\nunrecognized fossil calibration: %s\n", nodes[i].nodeStr);\r
+\r
+ sptree.nodes[i].fossil = j;\r
+ pch = strchr(nodes[i].nodeStr, '(') + 1;\r
+\r
+ switch(j) {\r
+ case (LOWER_F): \r
+ /* truncated Cauchy default prior L(tL, p, c) */\r
+ sptree.nodes[i].pfossil[1] = p_LOWERBOUND;\r
+ sptree.nodes[i].pfossil[2] = c_LOWERBOUND;\r
+ sptree.nodes[i].pfossil[3] = tailL;\r
+ sscanf(pch, "%lf,%lf,%lf,%lf", &sptree.nodes[i].pfossil[0], &sptree.nodes[i].pfossil[1],\r
+ &sptree.nodes[i].pfossil[2], &sptree.nodes[i].pfossil[3]);\r
+ break;\r
+ case (UPPER_F): \r
+ sptree.nodes[i].pfossil[2] = tailR;\r
+ sscanf(pch, "%lf,%lf", &sptree.nodes[i].pfossil[1], &sptree.nodes[i].pfossil[2]);\r
+ break;\r
+ case (BOUND_F): \r
+ sptree.nodes[i].pfossil[2] = tailL;\r
+ sptree.nodes[i].pfossil[3] = tailR;\r
+ sscanf(pch, "%lf,%lf,%lf,%lf", &sptree.nodes[i].pfossil[0], &sptree.nodes[i].pfossil[1],\r
+ &sptree.nodes[i].pfossil[2], &sptree.nodes[i].pfossil[3]);\r
+ if(sptree.nodes[i].pfossil[0] > sptree.nodes[i].pfossil[1]) { \r
+ printf("fossil bounds (%.4f, %.4f)", sptree.nodes[i].pfossil[0], sptree.nodes[i].pfossil[1]);\r
+ error2("fossil bounds in tree incorrect");\r
+ }\r
+ break;\r
+ case (GAMMA_F): \r
+ sscanf(pch, "%lf,%lf", &sptree.nodes[i].pfossil[0], &sptree.nodes[i].pfossil[1]);\r
+ break;\r
+ case (SKEWN_F):\r
+ sscanf(pch, "%lf,%lf,%lf", &sptree.nodes[i].pfossil[0], &sptree.nodes[i].pfossil[1], &sptree.nodes[i].pfossil[2]);\r
+ break;\r
+ case (SKEWT_F): \r
+ sscanf(pch, "%lf,%lf,%lf,%lf", &sptree.nodes[i].pfossil[0], &sptree.nodes[i].pfossil[1], &sptree.nodes[i].pfossil[2], &sptree.nodes[i].pfossil[3]);\r
+ break;\r
+ case (S2N_F): \r
+ sscanf(pch, "%lf,%lf,%lf,%lf,%lf,%lf,%lf", &sptree.nodes[i].pfossil[0], &sptree.nodes[i].pfossil[1],\r
+ &sptree.nodes[i].pfossil[2], &sptree.nodes[i].pfossil[3], &sptree.nodes[i].pfossil[4], \r
+ &sptree.nodes[i].pfossil[5], &sptree.nodes[i].pfossil[6]);\r
+ break;\r
+ }\r
+\r
+ sptree.nfossil++;\r
+ sptree.nodes[i].usefossil = 1;\r
+ nodes[i].branch = nodes[i].label = 0;\r
+ free(nodes[i].nodeStr);\r
+ }\r
+\r
+ return(0);\r
+}\r
+\r
+#endif\r
+\r
+\r
+int GenerateGtree (int locus);\r
+\r
+int ReadTreeSeqs (FILE*fout)\r
+{\r
+/* This reads the combined species tree, the fossil calibration information, \r
+ and sequence data at each locus. sptree.nodes[].pfossil[] has tL, tU for \r
+ bounds or alpha and beta for the gamma prior. \r
+\r
+ This routine also processes fossil calibration information specified using \r
+ <, >, or both. More complex specifications are stored in nodes[].nodeStr and \r
+ processed in ProcessFossilInfo(). See notes in that routine.\r
+\r
+ This also constructs the gene tree at each locus, by pruning the master \r
+ species tree..\r
+*/\r
+ FILE *fseq, *ftree;\r
+ int i,j, locus, clean0=com.cleandata;\r
+ double tailL=0.025, tailR=0.025, p_LOWERBOUND=0.1, c_LOWERBOUND=1.0;\r
+\r
+ ftree = gfopen(com.treef,"r");\r
+\r
+ /* read master species tree and process fossil calibration info */\r
+ fscanf(ftree, "%d%d", &sptree.nspecies, &i);\r
+ com.ns = sptree.nspecies;\r
+ if(com.ns>NS) error2("raise NS?");\r
+ /* to read master species names into sptree.nodes[].name */\r
+ if(noisy) puts("Reading master tree.");\r
+ for(j=0; j<sptree.nspecies; j++) \r
+ com.spname[j] = sptree.nodes[j].name;\r
+ nodes = nodes_t;\r
+\r
+ ReadTreeN(ftree, &i, &j, 1, 1);\r
+ if(i) {\r
+ for(i=j=0; i<tree.nnode; i++)\r
+ if(i!=tree.root && nodes[i].branch>0) j++;\r
+ if(j==tree.nbranch) \r
+ printf("\aTree with fossil calibrations should not have branch lengths!");\r
+ }\r
+ if(com.clock==5 || com.clock==6)\r
+ for(i=0; i<tree.nnode; i++) nodes[i].branch = nodes[i].label = 0;\r
+ for(i=0; i<tree.nnode; i++) \r
+ if(nodes[i].label<0) nodes[i].label = 0; /* change -1 into 0 */\r
+\r
+ /* OutTreeN(F0,0,0); FPN(F0); */\r
+ OutTreeN(F0,1,0); FPN(F0);\r
+ /* OutTreeN(F0,1,1); FPN(F0); */\r
+ /* copy master tree into sptree */\r
+ if(tree.nnode != 2*com.ns-1) \r
+ error2("check and think about multificating trees.");\r
+ sptree.nnode = tree.nnode; sptree.nbranch = tree.nbranch; \r
+ sptree.root = tree.root; sptree.nfossil = 0;\r
+ for(i=0; i<sptree.nspecies*2-1; i++) {\r
+ sptree.nodes[i].father = nodes[i].father;\r
+ sptree.nodes[i].nson = nodes[i].nson;\r
+ if(nodes[i].nson!=0 && nodes[i].nson!=2) \r
+ error2("master tree has to be binary.");\r
+ for(j=0; j<sptree.nodes[i].nson; j++) \r
+ sptree.nodes[i].sons[j] = nodes[i].sons[j];\r
+\r
+ sptree.nodes[i].fossil = nodes[i].fossil;\r
+ sptree.nodes[i].age = nodes[i].age;\r
+ sptree.nodes[i].pfossil[0] = nodes[i].branch; /* ">": Lower bound */\r
+ sptree.nodes[i].pfossil[1] = nodes[i].label; /* "<": Upper bound */\r
+\r
+ if(nodes[i].branch && nodes[i].label > 0) { /* joint bound: >0.8<1.2 */\r
+ if(nodes[i].age == 0) {\r
+ sptree.nodes[i].fossil = BOUND_F;\r
+ sptree.nodes[i].pfossil[2] = tailL;\r
+ sptree.nodes[i].pfossil[3] = tailR;\r
+ }\r
+ else {\r
+ error2("\nUse 'G(alpha, beta)' to specify the gamma calibration");\r
+ }\r
+ sptree.nfossil++;\r
+ }\r
+ else if(nodes[i].branch) { /* lower bound: >0.8 */\r
+ sptree.nodes[i].fossil = LOWER_F;\r
+ sptree.nfossil++; \r
+ /* truncated Cauchy default prior L(tL, p, c) */\r
+ sptree.nodes[i].pfossil[1] = p_LOWERBOUND;\r
+ sptree.nodes[i].pfossil[2] = c_LOWERBOUND;\r
+ sptree.nodes[i].pfossil[3] = tailL;\r
+ }\r
+ else if(nodes[i].label > 0) { /* upper bound: <1.2 */\r
+ sptree.nodes[i].fossil = UPPER_F; \r
+ sptree.nfossil++; \r
+ sptree.nodes[i].pfossil[2] = tailR;\r
+ }\r
+\r
+ if(sptree.nodes[i].fossil)\r
+ sptree.nodes[i].usefossil = 1;\r
+\r
+ nodes[i].branch = nodes[i].label = 0;\r
+ }\r
+\r
+#if (defined MCMCTREE)\r
+ if(!com.TipDate) ProcessFossilInfo();\r
+#endif\r
+\r
+ /* read sequences at each locus, construct gene tree by pruning sptree */\r
+ data.ngene = com.ndata;\r
+ com.ndata=1;\r
+ fseq = gfopen(com.seqf, "r");\r
+ if((gnodes=(struct TREEN**)malloc(sizeof(struct TREEN*)*data.ngene)) == NULL) \r
+ error2("oom");\r
+\r
+ printf("\nReading sequence data.. %d loci\n", data.ngene);\r
+ for(locus=0; locus<data.ngene; locus++) {\r
+ fprintf(fout, "\n\n*** Locus %d ***\n", locus+1);\r
+ printf("\n\n*** Locus %d ***\n", locus+1);\r
+\r
+ com.cleandata=(char)clean0;\r
+ for(j=0; j<sptree.nspecies; j++)\r
+ com.spname[j] = NULL; /* points to nowhere */\r
+#if (defined CODEML)\r
+ if(com.seqtype==1) {\r
+ com.icode = data.icode[locus];\r
+ setmark_61_64();\r
+ }\r
+#endif\r
+ ReadSeq(fout, fseq, clean0, locus); /* allocates com.spname[] */\r
+#if (defined CODEML)\r
+ if(com.seqtype == 1) {\r
+ if(com.sspace < max2(com.ngene+1,com.ns)*(64+12+4)*sizeof(double)) {\r
+ com.sspace = max2(com.ngene+1,com.ns)*(64+12+4)*sizeof(double);\r
+ if((com.space = (double*)realloc(com.space,com.sspace))==NULL)\r
+ error2("oom space for #c");\r
+ }\r
+ InitializeCodon(fout,com.space);\r
+ }\r
+#endif\r
+\r
+ data.ns[locus] = com.ns;\r
+ data.ls[locus] = com.ls;\r
+#if(MCMCTREE)\r
+ if(data.datatype[locus] == MORPHC) \r
+ ;\r
+ else \r
+#endif\r
+ {\r
+ if(com.seqtype==0 || com.seqtype==2)\r
+ InitializeBaseAA(fout);\r
+ fflush(fout);\r
+ if((com.seqtype==0 || com.seqtype==2) && com.model==0)\r
+ PatternWeightJC69like(fout);\r
+ xtoy(com.pi, data.pi[locus], com.ncode);\r
+ data.cleandata[locus] = (char)com.cleandata;\r
+ data.npatt[locus] = com.npatt;\r
+ data.fpatt[locus] = com.fpatt; com.fpatt=NULL;\r
+ for(i=0; i<com.ns; i++) { \r
+ data.z[locus][i] = com.z[i];\r
+ com.z[i] = NULL; \r
+ }\r
+ printf("%3d patterns, %s\n", com.npatt, (com.cleandata? "clean": "messy"));\r
+ }\r
+\r
+ GenerateGtree(locus); /* free com.spname[] */\r
+ }\r
+ for(i=0,com.cleandata=1; i<data.ngene; i++) \r
+ if(data.cleandata[i]==0) \r
+ com.cleandata = 0;\r
+\r
+ fclose(ftree); fclose(fseq);\r
+ SetMapAmbiguity();\r
+\r
+\r
+#if(defined MCMCTREE)\r
+ if(com.TipDate) {\r
+ /* com.TipDate_TimeUnit is already initialized, and it won't be changed in GetTipDate() */\r
+ GetTipDate(&com.TipDate, &com.TipDate_TimeUnit);\r
+ for(i=0; i<sptree.nspecies; i++)\r
+ sptree.nodes[i].age = nodes[i].age;\r
+ }\r
+#endif\r
+\r
+ return(0);\r
+}\r
+\r
+\r
+int GenerateGtree (int locus)\r
+{\r
+/* construct the gene tree at locus by pruning tips in the master species \r
+ tree. com.spname[] have names of species at the current locus (probably read \r
+ from the sequence alignment at the locus). They are used by the routine to compare \r
+ with sptree.nodes[].name to decide which species to keep for the locus. \r
+ See GetSubTreeN() for more details.\r
+*/\r
+ int ns=data.ns[locus], i,j, ipop[NS], keep[NS], newnodeNO[2*NS-1];\r
+\r
+ for(j=0; j<sptree.nspecies; j++) keep[j]=0;\r
+ for(i=0;i<ns;i++) {\r
+ for(j=0;j<sptree.nspecies;j++)\r
+ if(!strcmp(com.spname[i], sptree.nodes[j].name)) break;\r
+ if(j==sptree.nspecies) {\r
+ printf("species %s not found in master tree\n", com.spname[i]);\r
+ exit(-1);\r
+ }\r
+ if(keep[j]) {\r
+ printf("\nspecies %s occurs twice in locus %d", com.spname[i], locus+1);\r
+ error2("\ngiving up...");\r
+ }\r
+ keep[j] = i+1; ipop[i] = j; /* seq j in alignment is species i in master tree. */\r
+ free(com.spname[i]);\r
+ }\r
+\r
+ /* copy master species tree and then prune it. */\r
+ copySptree();\r
+ GetSubTreeN(keep, newnodeNO);\r
+ com.ns=ns;\r
+\r
+ for(i=0;i<sptree.nnode;i++) \r
+ if(newnodeNO[i]!=-1) nodes[newnodeNO[i]].ipop = i;\r
+ /* printGtree(0); */\r
+\r
+ gnodes[locus] = (struct TREEN*)malloc((ns*2-1)*sizeof(struct TREEN));\r
+ if(gnodes[locus] == NULL) error2("oom gtree");\r
+ memcpy(gnodes[locus], nodes, (ns*2-1)*sizeof(struct TREEN));\r
+ data.root[locus]=tree.root;\r
+\r
+ return(0);\r
+}\r
+\r
+\r
+int printGtree (int printBlength)\r
+{\r
+ int i,j;\r
+\r
+ for(i=0; i<com.ns; i++) \r
+ com.spname[i]=sptree.nodes[nodes[i].ipop].name;\r
+ for(i=0;i<tree.nnode;i++) \r
+ if(i!=tree.root) \r
+ nodes[i].branch=nodes[nodes[i].father].age-nodes[i].age;\r
+ printf("\nns = %d nnode = %d", com.ns, tree.nnode);\r
+ printf("\n%7s%7s %8s %7s%7s","father","node","(ipop)","nson:","sons");\r
+ for(i=0; i<tree.nnode; i++) {\r
+ printf ("\n%7d%7d (%2d) %7d ",\r
+ nodes[i].father+1, i+1, nodes[i].ipop+1, nodes[i].nson);\r
+ for(j=0; j<nodes[i].nson; j++) printf (" %2d", nodes[i].sons[j]+1);\r
+ }\r
+ FPN(F0); OutTreeN(F0,0,0); FPN(F0); OutTreeN(F0,1,0); FPN(F0); \r
+ if(printBlength) { OutTreeN(F0,1,1); FPN(F0); }\r
+ return(0);\r
+}\r
+\r
+\r
+void copySptree (void)\r
+{\r
+/* This copies sptree into nodes = nodes_t, for printing or editing\r
+*/\r
+ int i,j;\r
+\r
+ nodes = nodes_t;\r
+ com.ns = sptree.nspecies; tree.root = sptree.root;\r
+ tree.nnode = sptree.nnode; tree.nbranch = sptree.nbranch; \r
+ for(i=0; i<sptree.nnode; i++) {\r
+ /* this is used by mcmctree */\r
+ if(i<com.ns) com.spname[i] = sptree.nodes[i].name;\r
+ \r
+ /* The following may be needed by bpp. Check carefully. */\r
+ /*\r
+ if(i<com.ns) strcpy(com.spname[i], sptree.nodes[i].name);\r
+ */\r
+ nodes[i].father =sptree.nodes[i].father;\r
+ nodes[i].nson = sptree.nodes[i].nson;\r
+ for(j=0;j<nodes[i].nson;j++) \r
+ nodes[i].sons[j] = sptree.nodes[i].sons[j];\r
+ nodes[i].fossil = sptree.nodes[i].fossil;\r
+ nodes[i].age = sptree.nodes[i].age;\r
+ if(i != tree.root) \r
+ nodes[i].branch = sptree.nodes[nodes[i].father].age - sptree.nodes[i].age;\r
+ }\r
+}\r
+\r
+void printSptree (void)\r
+{\r
+ int i, j, k;\r
+\r
+ printf("\n************\nSpecies tree\nns = %d nnode = %d", sptree.nspecies, sptree.nnode);\r
+ printf("\n%7s%7s %-8s %12s %12s%16s\n","father","node","name","time","fossil","sons");\r
+ for (i=0; i<sptree.nnode; i++) {\r
+ printf("%7d%7d %-14s %9.5f", \r
+ sptree.nodes[i].father+1, i+1, sptree.nodes[i].name, sptree.nodes[i].age);\r
+\r
+#ifdef MCMCTREE\r
+ if((k = sptree.nodes[i].fossil)) {\r
+ printf(" %s ( ", fossils[k]);\r
+ for(j=0; j<npfossils[k]; j++) {\r
+ printf("%6.4f", sptree.nodes[i].pfossil[j + (k==UPPER_F)]);\r
+ printf("%s", (j==npfossils[k]-1 ? " ) " : ", "));\r
+ }\r
+ }\r
+#endif\r
+\r
+ if(sptree.nodes[i].nson)\r
+ printf(" (%2d %2d)", sptree.nodes[i].sons[0]+1, sptree.nodes[i].sons[1]+1);\r
+ printf("\n");\r
+ }\r
+ copySptree();\r
+ FPN(F0); OutTreeN(F0,0,0); FPN(F0); OutTreeN(F0,1,0); FPN(F0); \r
+ OutTreeN(F0,1,1); FPN(F0);\r
+}\r
+\r
+\r
+#endif\r
+\r
+\r
+\r
+\r
+#if (defined BASEML || defined CODEML)\r
+\r
+#if (defined CODEML)\r
+\r
+int GetMemPUVR(int nc, int nUVR)\r
+{\r
+/* this gets mem for nUVR sets of matrices\r
+*/\r
+ int i;\r
+\r
+ PMat=(double*)malloc((nc*nc+nUVR*nc*nc*2+nUVR*nc)*sizeof(double));\r
+ if(PMat==NULL) error2("oom getting P&U&V&Root");\r
+ U=_UU[0]=PMat+nc*nc; V=_VV[0]=_UU[0]+nc*nc; Root=_Root[0]=_VV[0]+nc*nc;\r
+ for(i=1; i<nUVR; i++) {\r
+ _UU[i]=_UU[i-1]+nc*nc*2+nc; _VV[i]=_VV[i-1]+nc*nc*2+nc; \r
+ _Root[i]=_Root[i-1]+nc*nc*2+nc;\r
+ }\r
+ return(0);\r
+}\r
+\r
+void FreeMemPUVR(void)\r
+{ \r
+ free(PMat); \r
+}\r
+\r
+\r
+int GetUVRoot_codeml (void)\r
+{\r
+/* This uses data.daafile[] to set up the eigen matrices U, V, Root for \r
+ combined clock analyses of multiple protein data sets (clock = 5 or 6).\r
+*/\r
+ int locus, nc=(com.seqtype==1?64:20), nUVR=data.ngene;\r
+ double mr=0;\r
+\r
+ if(com.seqtype==1 && (!com.fix_kappa || !com.fix_omega)) nUVR=1;\r
+ GetMemPUVR(nc, nUVR);\r
+\r
+ if(nUVR>6) error2("The maximum number of proteins is set to 6.");\r
+ if(com.seqtype==2) {\r
+ for(locus=0; locus<data.ngene; locus++) {\r
+ if(data.ngene>1) \r
+ strcpy(com.daafile, data.daafile[locus]);\r
+ GetDaa(NULL, com.daa);\r
+ if(com.model==Empirical_F) \r
+ xtoy(data.pi[locus], com.pi, nc);\r
+ eigenQaa(NULL, _Root[locus], _UU[locus], _VV[locus], NULL);\r
+ }\r
+ }\r
+ else if(com.seqtype==1 && com.fix_kappa & com.fix_omega) {\r
+ for(locus=0; locus<data.ngene; locus++) {\r
+ if(com.seqtype==1) {\r
+ com.icode=data.icode[locus];\r
+ setmark_61_64 ();\r
+ }\r
+ com.kappa=data.kappa[locus];\r
+ com.omega=data.omega[locus];\r
+ xtoy(data.pi[locus], com.pi, com.ncode);\r
+ eigenQcodon(1,-1,NULL,NULL,NULL, _Root[locus], _UU[locus], _VV[locus], &mr,\r
+ &com.kappa, com.omega, PMat);\r
+ }\r
+ }\r
+ return(0);\r
+}\r
+\r
+\r
+#endif\r
+\r
+\r
+int UseLocus (int locus, int copycondP, int setmodel, int setSeqName)\r
+{\r
+/* This point nodes to the gene tree at locus gnodes[locus] and set com.z[] \r
+ etc. for likelihood calculation for the locus. \r
+*/\r
+ int i, nS;\r
+ double mr=0;\r
+\r
+ com.ns=data.ns[locus]; com.ls=data.ls[locus];\r
+ tree.root=data.root[locus];\r
+ tree.nnode=2*com.ns-1; /* assumes binary tree */\r
+ tree.nbranch=tree.nnode-1;\r
+\r
+ nodes=gnodes[locus];\r
+\r
+ com.cleandata=data.cleandata[locus];\r
+ com.npatt=com.posG[1]=data.npatt[locus]; com.posG[0]=0;\r
+ com.fpatt=data.fpatt[locus];\r
+ for(i=0; i<com.ns; i++) com.z[i] = data.z[locus][i];\r
+\r
+ /* The following is model-dependent */\r
+ if(setmodel) {\r
+\r
+ com.kappa=data.kappa[locus];\r
+ com.omega=data.omega[locus];\r
+ com.alpha=data.alpha[locus];\r
+\r
+#if(defined CODEML)\r
+ if(com.seqtype==1) {\r
+ com.icode=data.icode[locus];\r
+ setmark_61_64 ();\r
+ }\r
+#endif\r
+\r
+#if(defined BASEML)\r
+ if(com.seqtype==0 && com.model!=0 && com.model!=1)\r
+ xtoy(data.pi[locus], com.pi, com.ncode);\r
+ if(com.model<=TN93)\r
+ eigenTN93(com.model, com.kappa, com.kappa, com.pi, &nR, Root, Cijk);\r
+ else if (com.model==REV)\r
+ eigenQREVbase (NULL, PMat, &com.kappa, com.pi, &nR, Root, Cijk);\r
+#else\r
+ if((com.seqtype==1 && com.codonf) || (com.seqtype==2 && com.model==3))\r
+ xtoy(data.pi[locus], com.pi, com.ncode);\r
+\r
+ if((com.seqtype==2 && (com.model==2 || com.model==3))\r
+ || (com.seqtype==1 && com.fix_kappa && com.fix_omega)) {\r
+ Root=_Root[locus]; U=_UU[locus]; V=_VV[locus];\r
+ }\r
+ else {\r
+ eigenQcodon(1,-1,NULL,NULL,NULL,Root,U,V, &mr, &com.kappa, com.omega,PMat);\r
+ }\r
+\r
+#endif\r
+ if(com.alpha)\r
+ DiscreteGamma (com.freqK,com.rK,com.alpha,com.alpha,com.ncatG,DGammaUseMedian);\r
+\r
+ com.NnodeScale = data.NnodeScale[locus];\r
+ com.nodeScale = data.nodeScale[locus];\r
+ nS = com.NnodeScale*com.npatt * (com.conPSiteClass ? com.ncatG : 1);\r
+ for(i=0; i<nS; i++) com.nodeScaleF[i] = 0;\r
+ }\r
+ if(setSeqName)\r
+ for(i=0; i<com.ns; i++)\r
+ com.spname[i] = sptree.nodes[nodes[i].ipop].name;\r
+ return(0);\r
+}\r
+\r
+\r
+void GetMemBC (void)\r
+{\r
+/* This gets memory for baseml and codeml under local clock models for analysis \r
+ of combined data from multiple loci.\r
+ com.conP[] is shared across loci.\r
+ fhK[] uses shared space for loci.\r
+*/\r
+ int j, locus, nc = (com.seqtype==1?64:com.ncode);\r
+ size_t maxsizeScale=0, nS, sfhK=0, s1, snode;\r
+ double *p;\r
+\r
+ for(locus=0,com.sconP=0; locus<data.ngene; locus++) {\r
+ snode = nc*data.npatt[locus];\r
+ s1 = snode*(data.ns[locus]-1)*sizeof(double);\r
+ if(com.alpha) { /* this is for step 1, using method = 1 */\r
+ com.conPSiteClass = 1;\r
+ s1 *= com.ncatG;\r
+ }\r
+ if(s1>com.sconP) com.sconP = s1;\r
+ if(com.alpha && (size_t)data.npatt[locus]>sfhK) \r
+ sfhK = data.npatt[locus];\r
+ }\r
+\r
+ com.conP = (double*)malloc(com.sconP);\r
+ printf("\n%5lu bytes for conP\n", com.sconP); \r
+ if(com.conP==NULL)\r
+ error2("oom conP");\r
+ if (com.alpha) {\r
+ sfhK *= com.ncatG*sizeof(double);\r
+ if((com.fhK=(double*)realloc(com.fhK,sfhK))==NULL) error2("oom");\r
+ }\r
+\r
+ /* set gnodes[locus][].conP for internal nodes */\r
+ for(locus=0; locus<data.ngene; locus++) {\r
+ snode = nc*data.npatt[locus];\r
+ for(j=data.ns[locus]; j<data.ns[locus]*2-1; j++)\r
+ gnodes[locus][j].conP = com.conP + (j-data.ns[locus])*snode;\r
+ }\r
+ for(locus=0; locus<data.ngene; locus++) {\r
+ if(!data.cleandata[locus]) {\r
+ UseLocus(locus, -1, 0, 0);\r
+ }\r
+ }\r
+\r
+ if(sptree.nspecies>20) {\r
+ for(locus=0; locus<data.ngene; locus++) {\r
+ UseLocus(locus, -1, 0, 0);\r
+ com.NnodeScale = 0;\r
+ com.nodeScale = data.nodeScale[locus]=(char*)malloc(tree.nnode*sizeof(char));\r
+ if(com.nodeScale==NULL) error2("oom");\r
+ for(j=0; j<tree.nnode; j++) com.nodeScale[j] = 0;\r
+\r
+ SetNodeScale(tree.root);\r
+\r
+ data.NnodeScale[locus] = com.NnodeScale;\r
+ nS = com.NnodeScale*com.npatt;\r
+ if(com.conPSiteClass) nS *= com.ncatG;\r
+ maxsizeScale = max2(maxsizeScale, nS);\r
+\r
+ if(com.NnodeScale) {\r
+ printf("\n%d node(s) used for scaling at locus %d: \n",com.NnodeScale,locus+1);\r
+ FOR(j,tree.nnode) if(com.nodeScale[j]) printf(" %2d",j+1);\r
+ FPN(F0);\r
+ }\r
+ }\r
+ if(maxsizeScale) {\r
+ if((com.nodeScaleF=(double*)malloc(maxsizeScale*sizeof(double)))==NULL)\r
+ error2("oom nscale");\r
+ for(j=0; j<(int)maxsizeScale; j++) com.nodeScaleF[j] = 0;\r
+ }\r
+ }\r
+\r
+}\r
+\r
+void FreeMemBC (void)\r
+{\r
+ int locus, j;\r
+\r
+ for(locus=0; locus<data.ngene; locus++)\r
+ free(gnodes[locus]);\r
+ free(gnodes);\r
+ free(com.conP);\r
+ for(locus=0; locus<data.ngene; locus++) {\r
+ free(data.fpatt[locus]);\r
+ for(j=0;j<data.ns[locus]; j++)\r
+ free(data.z[locus][j]);\r
+ }\r
+ if(com.alpha)\r
+ free(com.fhK);\r
+\r
+ if(sptree.nspecies>20) {\r
+ for(locus=0; locus<data.ngene; locus++)\r
+ free(data.nodeScale[locus]);\r
+ if(com.nodeScaleF) free(com.nodeScaleF);\r
+ }\r
+}\r
+\r
+\r
+\r
+\r
+double nu_AHRS=0.001, *varb_AHRS;\r
+\r
+\r
+double funSS_AHRS(double x[], int np);\r
+\r
+\r
+double lnLfunHeteroData (double x[], int np)\r
+{\r
+/* This calculates the log likelihood, the log of the probability of the data \r
+ given gtree[] for each locus. This is for step 3 of Yang (2004. Acta \r
+ Zoologica Sinica 50:645-656)\r
+ x[0,1,...s-k] has node ages in the species tree, followed by branch rates \r
+ for genes 1, 2, ..., then kappa for genes, then alpha for genes\r
+*/\r
+ int i,k, locus;\r
+ double lnL=0, lnLt, *pbrate;\r
+\r
+ /* ??? need more work for codon sequences */\r
+ for(locus=0,k=com.ntime-1; locus<data.ngene; locus++) \r
+ k+=data.nbrate[locus];\r
+ if(!com.fix_kappa) FOR(locus,data.ngene) data.kappa[locus]=x[k++];\r
+ if(!com.fix_omega) FOR(locus,data.ngene) data.omega[locus]=x[k++];\r
+ if(!com.fix_alpha) FOR(locus,data.ngene) data.alpha[locus]=x[k++];\r
+\r
+ /* update node ages in species tree */\r
+ copySptree();\r
+ SetBranch(x);\r
+ FOR(i,tree.nnode) sptree.nodes[i].age=nodes[i].age;\r
+\r
+ for(locus=0,pbrate=x+com.ntime-1; locus<data.ngene; locus++) {\r
+\r
+ UseLocus(locus, -1, 1, 1);\r
+ /* copy node ages to gene tree */\r
+ FOR(i,tree.nnode) nodes[i].age=sptree.nodes[nodes[i].ipop].age;\r
+ FOR(i,tree.nnode) {\r
+ if(i!=tree.root) {\r
+ nodes[i].branch = (nodes[nodes[i].father].age-nodes[i].age) \r
+ * pbrate[(int)nodes[i].label];\r
+ if(nodes[i].branch<-1e-4)\r
+ puts("b<0");\r
+ }\r
+ }\r
+ lnL += lnLt = com.plfun(x, -1);\r
+ pbrate += data.nbrate[locus];\r
+ }\r
+ return(lnL);\r
+}\r
+\r
+\r
+double funSS_AHRS (double x[], int np)\r
+{\r
+/* Function to be minimized in the ad hoc rate smoothing procedure: \r
+ lnLb + lnLr\r
+ nodes[].label has node rate.\r
+ lnLb is weighted sum of squares using approximate variances for branch lengths.\r
+\r
+ lnLr is the log of the prior of rates under the geometric Brownian motion \r
+ model of rate evolution. There is no need for recursion as the order at \r
+ which sptree.nodes are visited is unimportant. The rates are stored in \r
+ gnodes[].label.\r
+ The root rate is fixed to be the weighted average rate of its two sons, \r
+ inversely weighted by the divergence times.\r
+*/\r
+ int locus, j,k, root, pa, son0, son1;\r
+ double lnLb, lnLr, lnLbi, lnLri; /* lnLb & lnLr are sum of squares for b and r */\r
+ double b,be,t, t0,t1, r,rA, w,y, small=1e-20, smallage=AgeLow[sptree.root]*small;\r
+ double nu = nu_AHRS, *varb=varb_AHRS;\r
+\r
+ /* set up node ages in species tree */\r
+ copySptree();\r
+ SetBranch(x);\r
+ for(j=0; j<tree.nnode; j++)\r
+ sptree.nodes[j].age = nodes[j].age;\r
+\r
+ k=com.ntime-1;\r
+ for(locus=0,lnLb=lnLr=0; locus<data.ngene; varb+=com.ns*2-1,locus++) {\r
+ UseLocus(locus, -1, 0, 0);\r
+ if(data.fix_nu==2) nu = x[np-1];\r
+ else if(data.fix_nu==3) nu = x[np-1-(data.ngene-1-locus)];\r
+\r
+ root = tree.root;\r
+ son0 = nodes[root].sons[0];\r
+ son1 = nodes[root].sons[1];\r
+ /* copy node ages and rates into gene tree nodes[]. */\r
+ for(j=0; j<tree.nnode; j++) { /* age and rates */\r
+ nodes[j].age=sptree.nodes[nodes[j].ipop].age;\r
+ if(j!=root)\r
+ nodes[j].label = x[k++];\r
+ }\r
+ t0 = nodes[root].age-nodes[son0].age;\r
+ t1 = nodes[root].age-nodes[son1].age;\r
+ if(t0+t1 < 1e-7)\r
+ error2("small root branch. Think about what to do.");\r
+ nodes[root].label = (nodes[son0].label*t1+nodes[son1].label*t0)/(t0+t1);\r
+\r
+ for(j=0,lnLbi=0; j<tree.nnode; j++) {\r
+ if(j==son0 || j==son1) continue;\r
+ pa = nodes[j].father;\r
+ if(j==root) {\r
+ b = nodes[son0].branch+nodes[son1].branch;\r
+ be = (nodes[j].age-nodes[son0].age) * (nodes[root].label+nodes[son0].label)/2\r
+ + (nodes[j].age-nodes[son1].age) * (nodes[root].label+nodes[son1].label)/2;\r
+ }\r
+ else {\r
+ b = nodes[j].branch;\r
+ be = (nodes[pa].age-nodes[j].age) * (nodes[pa].label+nodes[j].label)/2;\r
+ }\r
+ w = varb[j];\r
+ if(w<small) \r
+ puts("small variance");\r
+ lnLbi -= square(be-b)/(2*w);\r
+ }\r
+\r
+ for(j=0,lnLri=0; j<tree.nnode; j++) {\r
+ if(j==root) continue;\r
+ pa = nodes[j].father;\r
+ t = nodes[pa].age - nodes[j].age;\r
+ t = max2(t,smallage);\r
+ r = nodes[j].label;\r
+ rA= nodes[pa].label;\r
+\r
+ if(rA<small || t<small || r<small) puts("small r, rA, or t");\r
+ y = log(r/rA)+t*nu/2;\r
+ lnLri -= y*y/(2*t*nu) - log(r) - log(2*Pi*t*nu)/2;\r
+ }\r
+\r
+ if(data.fix_nu>1) lnLri += -nu/nu_AHRS-log(nu); /* exponential prior */\r
+ lnLb -= lnLbi;\r
+ lnLr -= lnLri;\r
+ }\r
+ return (lnLb + lnLr);\r
+}\r
+\r
+\r
+void SetBranchRates(int inode)\r
+{\r
+/* this uses node rates to set branch rates, and is used only after the ad hoc \r
+ rate smoothing iteration is finished.\r
+*/\r
+ int i;\r
+ if(inode<com.ns) \r
+ nodes[inode].label = (nodes[inode].label + nodes[nodes[inode].father].label)/2;\r
+ else\r
+ for(i=0; i<nodes[inode].nson; i++) \r
+ SetBranchRates(nodes[inode].sons[i]);\r
+}\r
+\r
+\r
+int GetInitialsClock6Step1 (double x[], double xb[][2])\r
+{\r
+/* This is for clock 6 step 1.\r
+*/\r
+ int i,k;\r
+ double tb[]={.0001, 999};\r
+\r
+ com.ntime=k=tree.nbranch;\r
+ GetInitialsTimes (x);\r
+\r
+ com.plfun = (com.alpha==0 ? lfun : lfundG);\r
+ com.conPSiteClass = (com.method && com.plfun==lfundG);\r
+\r
+/* InitializeNodeScale(); */\r
+\r
+ if(com.seqtype==0) com.nrate = !com.fix_kappa;\r
+\r
+ com.np=com.ntime+!com.fix_kappa+!com.fix_alpha;\r
+ if(com.seqtype==1 && !com.fix_omega) com.np++;\r
+\r
+ if(!com.fix_kappa) x[k++]=com.kappa;\r
+ if(!com.fix_omega) x[k++]=com.omega;\r
+ if(!com.fix_alpha) x[k++]=com.alpha;\r
+ NodeToBranch ();\r
+ \r
+ for(i=0; i<com.ntime; i++) \r
+ { xb[i][0]=tb[0]; xb[i][1]=tb[1]; }\r
+ for( ; i<com.np; i++) \r
+ { xb[i][0]=.001; xb[i][1]=999; }\r
+\r
+ if(noisy>3 && com.np<200) {\r
+ printf("\nInitials (np=%d)\n", com.np);\r
+ for(i=0; i<com.np; i++) printf(" %10.5f", x[i]); FPN(F0);\r
+ for(i=0; i<com.np; i++) printf(" %10.5f", xb[i][0]); FPN(F0);\r
+ for(i=0; i<com.np; i++) printf(" %10.5f", xb[i][1]); FPN(F0);\r
+ }\r
+ return (0);\r
+}\r
+\r
+\r
+\r
+int GetInitialsClock56Step3 (double x[])\r
+{\r
+/* This is for clock 5 or clock 6 step 3\r
+*/\r
+ int i, j,k=0, naa=20;\r
+\r
+ if(com.clock==5)\r
+ GetInitialsTimes (x);\r
+\r
+ com.plfun = (com.alpha==0 ? lfun : lfundG);\r
+ com.conPSiteClass = (com.method && com.plfun==lfundG);\r
+\r
+/* InitializeNodeScale(); */\r
+\r
+ com.np = com.ntime-1 + (1+!com.fix_kappa+!com.fix_omega+!com.fix_alpha)*data.ngene;\r
+ if(com.clock==5) \r
+ for(i=com.ntime-1;i<com.np;i++) x[i]=.2+rndu();\r
+ else if(com.clock==6) {\r
+ for(j=0,k=com.ntime-1; j<data.ngene; k+=data.nbrate[j],j++) \r
+ com.np += data.nbrate[j]-1;\r
+ if(!com.fix_kappa)\r
+ for(j=0; j<data.ngene; j++) x[k++]=data.kappa[j];\r
+ if(!com.fix_omega) \r
+ for(j=0; j<data.ngene; j++) x[k++]=data.omega[j];\r
+ if(!com.fix_alpha) \r
+ for(j=0; j<data.ngene; j++) x[k++]=data.alpha[j];\r
+ for(i=k;i<com.np;i++) x[i]=(.5+rndu())/2;\r
+ }\r
+ return (0);\r
+}\r
+\r
+\r
+double GetMeanRate (void)\r
+{\r
+/* This gets the rough average rate for the locus \r
+*/\r
+ int inode, i,j,k, ipop, nleft,nright,marks[NS], sons[2], nfossil;\r
+ double mr, md;\r
+\r
+ mr=0; nfossil=0;\r
+ for(inode=com.ns; inode<tree.nnode; inode++) {\r
+ ipop = nodes[inode].ipop; \r
+ if(sptree.nodes[ipop].fossil == 0) continue;\r
+ sons[0] = nodes[inode].sons[0];\r
+ sons[1] = nodes[inode].sons[1];\r
+ for(i=0,nleft=nright=0; i<com.ns; i++) {\r
+ for(j=i,marks[i]=0; j!=tree.root; j=nodes[j].father) {\r
+ if(j==sons[0]) { marks[i]=1; nleft++; break; }\r
+ else if (j==sons[1]) { marks[i]=2; nright++; break; }\r
+ }\r
+ }\r
+ if(nleft==0 || nright==0) {\r
+ puts("this calibration is not in gene tree.");\r
+ continue;\r
+ }\r
+ nfossil++;\r
+\r
+ for(i=0,md=0; i<com.ns; i++) {\r
+ for(j=0; j<com.ns; j++) {\r
+ if(marks[i]==1 && marks[j]==2) {\r
+ for(k=i; k!=inode; k=nodes[k].father)\r
+ md+=nodes[k].branch;\r
+ for(k=j; k!=inode; k=nodes[k].father)\r
+ md+=nodes[k].branch;\r
+ }\r
+ }\r
+ }\r
+ md /= (nleft*nright);\r
+ mr += md/(sptree.nodes[ipop].age*2);\r
+\r
+ /*\r
+ printf("node age & mr n%-4d %9.5f%9.5f ", inode, sptree.nodes[ipop].age, md);\r
+ if(com.ns<100) FOR(i,com.ns) printf("%d",marks[i]); \r
+ FPN(F0);\r
+ */\r
+ }\r
+ mr /= nfossil;\r
+ if(nfossil==0) \r
+ { printf("need fossils for this locus\n"); exit(-1); }\r
+\r
+ return(mr);\r
+}\r
+\r
+\r
+int AdHocRateSmoothing (FILE*fout, double x[NS*3], double xb[NS*3][2], double space[])\r
+{\r
+/* ad hoc rate smoothing for likelihood estimation of divergence times.\r
+ Step 1: Use JC69 to estimate branch lengths under no-clock model.\r
+ Step 2: ad hoc rate smoothing, estimating one set of divergence times\r
+ and many sets of branch rates for loci. Rate at root is set to \r
+ weighted average of rate at the two sons.\r
+*/\r
+ int model0=com.model, ntime0=com.ntime; /* is this useful? */\r
+ int fix_kappa0=com.fix_kappa, fix_omega0=com.fix_omega, fix_alpha0=com.fix_alpha;\r
+ int ib, son0, son1;\r
+ double kappa0=com.kappa, omega0=com.omega, alpha0=com.alpha, t0,t1, *varb;\r
+ double f, e=1e-8, pb=0.00001, rb[]={0.001,99}, lnL,lnLsum=0;\r
+ double mbrate[20], Rj[20], r,minr,maxr, beta, *pnu=&nu_AHRS,nu, mr[NGENE];\r
+ int i,j,k,k0, locus, nbrate[20],maxnbrate=20;\r
+ char timestr[32];\r
+ FILE *fBV = gfopen("in.BV","w");\r
+ FILE *fdist = gfopen("RateDist.txt","w");\r
+ FILE *finStep1 = fopen("in.ClockStep1","r"),\r
+ *finStep2 = fopen("in.ClockStep2","r");\r
+\r
+ noisy=4;\r
+ for(locus=0,k=0; locus<data.ngene; locus++)\r
+ k += 2*data.ns[locus]-1;\r
+ if((varb_AHRS=(double*)malloc(k*sizeof(double)))==NULL) \r
+ error2("oom AHRS");\r
+ for(i=0; i<k;i++) varb_AHRS[i]=-1;\r
+\r
+\r
+ /* Step 1: Estimate branch lengths without clock. */\r
+ printf("\nStep 1: Estimate branch lengths under no clock.\n");\r
+ fprintf(fout,"\n\nStep 1: Estimate branch lengths under no clock.\n");\r
+ com.clock=0; com.method=1;\r
+/*\r
+com.model=0; com.fix_kappa=1; com.kappa=1; \r
+com.fix_alpha=1; com.alpha=0;\r
+*/\r
+ for(locus=0; locus<data.ngene; locus++) {\r
+ if(!com.fix_kappa) data.kappa[locus]=com.kappa;\r
+ if(!com.fix_omega) data.omega[locus]=com.omega;\r
+ if(!com.fix_alpha) data.alpha[locus]=com.alpha;\r
+ }\r
+ for(locus=0,varb=varb_AHRS; locus<data.ngene; varb+=com.ns*2-1,locus++) {\r
+ UseLocus(locus, -1, 1, 1);\r
+\r
+ fprintf(fout,"\nLocus %d (%d sequences)\n", locus+1, com.ns);\r
+\r
+ son0 = nodes[tree.root].sons[0]; \r
+ son1 = nodes[tree.root].sons[1];\r
+\r
+ GetInitialsClock6Step1 (x, xb);\r
+\r
+ lnL=0;\r
+ if(com.ns>30) fprintf(frub, "\n\nLocus %d\n", locus+1);\r
+ if(finStep1) {\r
+ puts("read MLEs from step 1 from file");\r
+ for(i=0; i<com.np; i++) \r
+ fscanf(finStep1,"%lf",&x[i]);\r
+ }\r
+ else {\r
+ j = minB((com.ns>30?frub:NULL), &lnL, x, xb, e, space);\r
+ for(j=0; j<com.ns*2-1; j++) {\r
+ ib = nodes[j].ibranch;\r
+ if(j!=tree.root) varb[j] = (x[ib]>1e-8 ? -1/varb_minbranches[ib] : 999);\r
+ }\r
+/*\r
+matout(F0, x, 1, com.ntime);\r
+matout2(F0, varb, 1, tree.nnode, 10, 7);\r
+fout = stdout;\r
+exit(0);\r
+*/\r
+ }\r
+\r
+ if(!com.fix_kappa) data.kappa[locus] = x[com.ntime];\r
+ if(!com.fix_omega) data.omega[locus] = x[com.ntime + !com.fix_kappa];\r
+ if(!com.fix_alpha) data.alpha[locus] = x[com.ntime + !com.fix_kappa + !com.fix_omega];\r
+\r
+ lnLsum += lnL;\r
+\r
+ t0 = nodes[son0].branch; \r
+ t1 = nodes[son1].branch;\r
+ varb[tree.root] = varb[t0>t1?son0:son1];\r
+ nodes[son0].branch = nodes[son1].branch = (t0+t1)/2; /* arbitrary */\r
+ mr[locus] = GetMeanRate();\r
+\r
+ printf(" Locus %d: %d sequences, %d blengths, lnL = %15.6f mr=%.5f%10s\n", \r
+ locus+1, com.ns, com.np-1,-lnL,mr[locus], printtime(timestr));\r
+ fprintf(fout,"\nlnL = %.6f\n\n", -lnL);\r
+ OutTreeB(fout); FPN(fout);\r
+ for(i=0; i<com.np; i++) fprintf(fout," %8.5f",x[i]); FPN(fout);\r
+ for(i=0; i<tree.nbranch; i++) fprintf(fout," %8.5f", sqrt(varb[tree.branches[i][1]])); FPN(fout);\r
+ FPN(fout); OutTreeN(fout,1,1); FPN(fout); fflush(fout);\r
+\r
+ fprintf(fBV, "\n\nLocus %d: %d sequences, %d+1 branches\nlnL = %15.6f\n\n", \r
+ locus+1, com.ns, tree.nbranch-1, -lnL);\r
+ OutTreeB(fBV); FPN(fBV);\r
+ for(i=0; i<tree.nbranch; i++) fprintf(fBV," %12.9f",x[i]); FPN(fBV);\r
+ for(i=0; i<tree.nbranch; i++) fprintf(fBV," %12.9f", sqrt(varb[tree.branches[i][1]])); FPN(fBV);\r
+ FPN(fBV); OutTreeN(fBV,1,1); FPN(fBV); fflush(fBV);\r
+ }\r
+ fclose(fBV);\r
+ if(data.ngene>1) fprintf(fout,"\nSum of lnL over loci = %15.6f\n", -lnLsum);\r
+\r
+ /* Step 2: ad hoc rate smoothing to estimate branch rates. */\r
+ printf("\nStep 2: Ad hoc rate smoothing to estimate branch rates.\n");\r
+ fprintf(fout, "\n\nStep 2: Ad hoc rate smoothing to estimate branch rates.\n");\r
+ /* s - 1 - NFossils node ages, (2*s_i - 2) rates for branches at each locus */\r
+ com.clock = 1;\r
+ copySptree();\r
+ GetInitialsTimes (x);\r
+\r
+ for(locus=0,com.np=com.ntime-1; locus<data.ngene; locus++) \r
+ com.np += data.ns[locus]*2-2;\r
+ if(data.fix_nu==2) com.np++;\r
+ if(data.fix_nu==3) com.np+=data.ngene;\r
+\r
+ if(com.np>NS*6) error2("change NP for ad hoc rate smoothing.");\r
+ for(i=0; i<com.ntime-1; i++)\r
+ { xb[i][0]=pb; xb[i][1]=1-pb; }\r
+ if(!nodes[tree.root].fossil) \r
+ { xb[0][0]=AgeLow[tree.root]*1.0001; xb[0][1]=max2(AgeLow[tree.root]*10,50); }\r
+ for( ; i<com.np; i++) { /* for rates */\r
+ xb[i][0]=rb[0]; xb[i][1]=rb[1];\r
+ }\r
+ for(locus=0,i=com.ntime-1; locus<data.ngene; locus++) \r
+ for(j=0; j<data.ns[locus]*2-2; j++) \r
+ x[i++]=mr[locus]*(.8+.4*rndu());\r
+ for( ; i<com.np; i++) /* nu */\r
+ x[i]=0.001+0.1*rndu();\r
+\r
+ if(noisy>3) {\r
+ for(i=0; i<com.np; i++) \r
+ { printf(" %10.5f", x[i]); if(i==com.ntime-2) FPN(F0); } FPN(F0);\r
+ if(com.np<200) {\r
+ for(i=0; i<com.np; i++) printf(" %10.5f", xb[i][0]); FPN(F0);\r
+ for(i=0; i<com.np; i++) printf(" %10.5f", xb[i][1]); FPN(F0);\r
+ }\r
+ }\r
+\r
+ if(data.fix_nu>1) \r
+ pnu = x+com.np-(data.fix_nu==2 ? 1 : data.ngene);\r
+ printf(" %d times, %d rates, %d parameters, ", com.ntime-1,k,com.np);\r
+\r
+ noisy=3;\r
+ f = funSS_AHRS(x, com.np);\r
+ if(noisy>2) printf("\nf0 = %12.6f\n",f );\r
+\r
+ if(finStep2) {\r
+ puts("read MLEs from step 2 from file");\r
+ for(i=0; i<com.np; i++) fscanf(finStep2,"%lf",&x[i]);\r
+ matout(F0,x,1,com.np);\r
+ }\r
+ else {\r
+ j = ming2(frub, &f, funSS_AHRS, NULL, x, xb, space, 1e-9, com.np);\r
+\r
+ /* generate output to in.clockStep2\r
+ matout(fout,x,1,com.np);\r
+ */\r
+\r
+ if(j==-1) \r
+ { puts("\nad hoc rate smoothing iteration may not have converged.\nEnter to continue; Ctrl-C to break."); \r
+ getchar(); }\r
+ }\r
+ free(varb_AHRS);\r
+\r
+ fputs("\nEstimated divergence times from ad hoc rate smoothing\n\n",fout);\r
+ copySptree();\r
+ FOR(i,tree.nnode) nodes[i].branch*=100;\r
+ for(i=com.ns; i<tree.nnode; i++)\r
+ fprintf(fout, "Node %2d Time %9.5f\n", i+1, nodes[i].age*100);\r
+ FPN(fout); OutTreeN(fout,1,1); FPN(fout);\r
+\r
+ fprintf(fout, "\nEstimated rates from ad hoc rate smoothing\n");\r
+ for(locus=0,k=k0=com.ntime-1; locus<data.ngene; k0+=data.nbrate[locus++]) {\r
+\r
+ UseLocus(locus, -1, 0, 1);\r
+ for(i=0; i<tree.nnode; i++)\r
+ if(i!=tree.root) nodes[i].label=x[k++];\r
+ son0=nodes[tree.root].sons[0]; son1=nodes[tree.root].sons[1];\r
+ t0=nodes[tree.root].age-nodes[son0].age; \r
+ t1=nodes[tree.root].age-nodes[son1].age; \r
+ nodes[tree.root].label = (nodes[son0].label*t1+nodes[son1].label*t0)/(t0+t1);\r
+ SetBranchRates(tree.root); /* node rates -> branch rates */\r
+\r
+ nu = (data.fix_nu==3 ? *(pnu+locus) : *pnu);\r
+ fprintf(fout,"\nLocus %d (%d sequences)\n\n", locus+1, com.ns);\r
+ fprintf(fout,"nu = %.6g\n", nu);\r
+\r
+ /* this block can be deleted? */\r
+ fprintf(fout, "\nnode \tage \tlength \trate\n");\r
+ for(i=0; i<tree.nnode; i++,FPN(fout)) {\r
+ fprintf(fout, "%02d\t%.3f", i+1,nodes[i].age);\r
+ if(i!=tree.root) \r
+ fprintf(fout, "\t%.5f\t%.5f", nodes[i].branch,nodes[i].label);\r
+ }\r
+\r
+ fprintf(fout,"\nRates as labels in tree:\n"); \r
+ OutTreeN(fout,1,PrLabel); FPN(fout); fflush(fout);\r
+\r
+ if(data.nbrate[locus]>maxnbrate) error2("too many rate classes? Change source.");\r
+ for(i=0,minr=1e6,maxr=0; i<tree.nnode; i++)\r
+ if(i!=tree.root) {\r
+ r=nodes[i].label;\r
+ if(r<0 && noisy) \r
+ puts("node label<0?");\r
+ minr = min2(minr,r);\r
+ maxr = max2(maxr,r);\r
+ }\r
+\r
+ fprintf(fdist, "\n%6d\n", tree.nnode-1);\r
+ for(i=0; i<tree.nnode; i++) {\r
+ if(i==tree.root) continue;\r
+ fprintf(fdist, "R%-10.7f ", nodes[i].label);\r
+ for(j=0; j<i; j++)\r
+ if(j!=tree.root)\r
+ fprintf(fdist, " %9.6f", fabs(nodes[i].label-nodes[j].label));\r
+ FPN(fdist);\r
+ }\r
+ fflush(fdist);\r
+/*\r
+ for(j=0; j<data.nbrate[locus]; j++)\r
+ Rj[j]=minr+(j+1)*(maxr-minr)/data.nbrate[locus];\r
+*/\r
+ beta = pow(1/(data.nbrate[locus]+1.), 1/(data.nbrate[locus]-1.));\r
+ beta = 0.25+0.25*log((double)data.nbrate[locus]);\r
+ if(beta>1) beta=0.99;\r
+ for(j=0; j<data.nbrate[locus]; j++)\r
+ Rj[j]=minr+(maxr-minr)*pow(beta, data.nbrate[locus]-1.-j);\r
+\r
+printf("\nLocus %d: nu = %.6f, rate range (%.6f, %.6f)\n", locus+1,nu,minr,maxr);\r
+printf("Cutting points:\n");\r
+for(j=0; j<data.nbrate[locus]; j++)\r
+ printf(" < %.6f, ", Rj[j]);\r
+printf("\nThe number of rate groups (0 for no change)? ");\r
+/* scanf("%d", &j); */\r
+j=0;\r
+if(j) {\r
+ data.nbrate[locus]=j;\r
+ printf("input %d cutting points? ", data.nbrate[locus]-1);\r
+ for(j=0,Rj[data.nbrate[locus]-1]=maxr; j<data.nbrate[locus]-1; j++)\r
+ scanf("%lf", &Rj[j]);\r
+}\r
+\r
+ for(i=0;i<data.nbrate[locus];i++) { mbrate[i]=0; nbrate[i]=0; }\r
+ for(i=0; i<tree.nnode; i++) {\r
+ if(i==tree.root) continue;\r
+ r=nodes[i].label;\r
+ for(j=0; j<data.nbrate[locus]-1; j++)\r
+ if(r<Rj[j]) break;\r
+ mbrate[j] += r;\r
+ nbrate[j] ++;\r
+ nodes[i].label = j;\r
+ }\r
+ nodes[tree.root].label=-1;\r
+ for(i=0;i<data.nbrate[locus];i++) \r
+ mbrate[i] = (nbrate[i]?mbrate[i]/nbrate[i]:-1);\r
+\r
+ fprintf(fout,"\nCollapsing rates into groups\nRate range: (%.6f, %.6f)\n", minr,maxr);\r
+/* fprintf(fout,"\nCollapsing rates into groups\nbeta = %.6g Rate range: (%.6f, %.6f)\n", beta, minr,maxr);\r
+*/\r
+ for(j=0; j<data.nbrate[locus]; j++)\r
+ fprintf(fout,"rate group %d (%2d): <%9.6f, mean %9.6f\n", \r
+ j, nbrate[j], Rj[j], mbrate[j]);\r
+\r
+ FPN(fout); OutTreeN(fout,1,PrLabel); FPN(fout);\r
+ fprintf(fout, "\n\nRough rates for branch groups at locus %d\n", locus+1);\r
+ for(i=0; i<data.nbrate[locus]; i++)\r
+ x[k0+i] = mbrate[i];\r
+ }\r
+\r
+printf("\n\n%d times, %d timerates from AHRS:\n", com.ntime-1,k0);\r
+fprintf(fout,"\n\n%d times, %d timerates from AHRS\n", com.ntime-1,k0);\r
+for(i=0; i<k0; i++) {\r
+ printf("%12.6f", x[i]);\r
+ if(i==com.ntime-2) FPN(F0);\r
+ fprintf(fout,"%12.6f", x[i]);\r
+ if(i==com.ntime-2) FPN(fout);\r
+}\r
+FPN(F0); FPN(fout);\r
+\r
+ for(i=0; i<k0; i++) x[i]*=0.9+0.2*rndu(); \r
+ \r
+ com.model=model0; com.clock=6; \r
+\r
+\r
+ com.fix_kappa=fix_kappa0; com.kappa=kappa0;\r
+ com.fix_omega=fix_omega0; com.omega=omega0;\r
+ com.fix_alpha=fix_alpha0; com.alpha=alpha0;\r
+\r
+#if 0\r
+ /* fix parameters: value > 0, precise value unimportant */\r
+ if(!fix_kappa0) { com.fix_kappa=1; com.kappa=0.1; }\r
+ if(!fix_omega0) { com.fix_omega=1; com.omega=0.1; }\r
+ if(!fix_alpha0) { com.fix_alpha=1; com.alpha=0.1; }\r
+#endif\r
+\r
+ fclose(fdist);\r
+ fflush(fout);\r
+ printf(" %10s\n", printtime(timestr));\r
+\r
+ if(finStep1) fclose(finStep1);\r
+ if(finStep2) fclose(finStep2);\r
+\r
+ return(0);\r
+}\r
+\r
+\r
+void DatingHeteroData (FILE* fout)\r
+{\r
+/* This is for clock and local-clock dating using heterogeneous data from \r
+ multiple loci. Some species might be missing at some loci. Thus \r
+ gnodes[locus] stores the gene tree at locus. Branch lengths in the gene \r
+ tree are constructed using the divergence times in the master species tree, \r
+ and the rates for genes and branches. \r
+\r
+ com.clock = 5: global clock\r
+ 6: local clock\r
+*/\r
+ char timestr[64];\r
+ int i,j,k, s, np, sconP0=0, locus;\r
+ double x[NS*6],xb[NS*6][2], lnL,e=1e-7, *var=NULL;\r
+ int nbrate=4;\r
+ size_t maxnpML, maxnpADRS;\r
+\r
+ data.fix_nu=3;\r
+/*\r
+if(com.clock==6) {\r
+ printf("nu (1:fix; 2:estimate one for all genes; 3:estimate one for every gene)? ");\r
+ scanf("%d", &data.fix_nu);\r
+ if(data.fix_nu==1) scanf("%lf", &nu_AHRS);\r
+}\r
+*/\r
+ ReadTreeSeqs(fout);\r
+ com.nbtype=1;\r
+ for(j=0; j<sptree.nnode; j++) {\r
+ sptree.nodes[j].pfossil[0] = sptree.nodes[j].pfossil[1] = -1;\r
+ }\r
+ for(j=sptree.nspecies, com.ntime=j-1, sptree.nfossil=0; j<sptree.nnode; j++) {\r
+ if(sptree.nodes[j].fossil) {\r
+ com.ntime--;\r
+ sptree.nfossil++;\r
+ printf("node %2d age fixed at %.3f\n", j, sptree.nodes[j].age);\r
+ }\r
+ }\r
+ GetMemBC();\r
+ s = sptree.nspecies;\r
+ maxnpML = s-1 + (5+2)*data.ngene;\r
+ maxnpADRS = s-1 + (2*s-1)*data.ngene + 2*data.ngene;\r
+ com.sspace = max2(com.sspace, spaceming2(maxnpADRS));\r
+ com.sspace = max2(com.sspace, maxnpML*(maxnpML+1)*sizeof(double));\r
+ if((com.space = (double*)realloc(com.space,com.sspace))==NULL) \r
+ error2("oom space");\r
+\r
+#if (defined CODEML)\r
+ GetUVRoot_codeml ();\r
+#endif\r
+ if(com.clock==6) {\r
+ if(data.fix_nu<=1) {\r
+ printf("nu & nbrate? ");\r
+ scanf("%lf%d? ", &nu_AHRS, &nbrate);\r
+ }\r
+ for(locus=0; locus<data.ngene; locus++) \r
+ data.nbrate[locus] = nbrate;\r
+ AdHocRateSmoothing(fout, x, xb, com.space);\r
+\r
+ printf("\nStep 3: ML estimation of times and rates.");\r
+ fprintf(fout,"\n\nStep 3: ML estimation of times and rates.\n");\r
+ }\r
+ else { /* clock = 5, global clock */\r
+ for(locus=0; locus<data.ngene; locus++) \r
+ for(i=0,data.nbrate[locus]=1; i<data.ns[locus]*2-1; i++)\r
+ gnodes[locus][i].label=0;\r
+ }\r
+\r
+ noisy=3;\r
+\r
+ copySptree();\r
+ GetInitialsClock56Step3(x);\r
+ np=com.np;\r
+\r
+ SetxBound (com.np, xb);\r
+ lnL = lnLfunHeteroData(x,np);\r
+\r
+ if(noisy) {\r
+ printf("\nntime & nrate & np:%6d%6d%6d\n",com.ntime-1,com.nrate,com.np);\r
+ matout(F0,x,1,np);\r
+ printf("\nlnL0 = %12.6f\n",-lnL);\r
+ }\r
+\r
+ j = ming2(noisy>2?frub:NULL,&lnL,lnLfunHeteroData,NULL,x,xb, com.space,e,np);\r
+\r
+ if(noisy) printf("Out...\nlnL = %12.6f\n", -lnL);\r
+ \r
+ LASTROUND=1;\r
+ for(i=0,j=!sptree.nodes[sptree.root].fossil; i<sptree.nnode; i++) \r
+ if(i!=sptree.root && sptree.nodes[i].nson && !sptree.nodes[i].fossil) \r
+ x[j++]=sptree.nodes[i].age; /* copy node ages into x[] */\r
+\r
+ if (com.getSE) {\r
+ if(np>100 || (com.seqtype && np>20)) puts("Calculating SE's");\r
+ var=com.space+np;\r
+ Hessian (np,x,lnL,com.space,var,lnLfunHeteroData,var+np*np);\r
+ matinv(var,np,np,var+np*np);\r
+ }\r
+ copySptree();\r
+ SetBranch(x);\r
+ fprintf(fout,"\n\nTree: "); OutTreeN(fout,0,0);\r
+ fprintf(fout,"\nlnL(ntime:%3d np:%3d):%14.6f\n", com.ntime-1,np,-lnL);\r
+ OutTreeB(fout); FPN (fout);\r
+ for(i=0;i<np;i++) fprintf(fout," %9.5f",x[i]); FPN(fout); fflush(fout);\r
+\r
+ if(com.getSE) {\r
+ fprintf(fout,"SEs for parameters:\n");\r
+ for(i=0;i<np;i++) fprintf(fout," %9.5f",(var[i*np+i]>0.?sqrt(var[i*np+i]):-1));\r
+ FPN(fout);\r
+ if (com.getSE==2) matout2(fout, var, np, np, 15, 10);\r
+ }\r
+\r
+ fprintf(fout,"\nTree with node ages for TreeView\n");\r
+ FOR(i,tree.nnode) nodes[i].branch*=100;\r
+ FPN(fout); OutTreeN(fout,1,1); FPN(fout);\r
+ FPN(fout); OutTreeN(fout,1,PrNodeNum); FPN(fout);\r
+ FPN(fout); OutTreeN(fout,1,PrLabel|PrAge); FPN(fout);\r
+ FPN(fout); OutTreeN(fout,1,0); FPN(fout);\r
+ OutputTimesRates(fout, x, var);\r
+\r
+ fprintf(fout,"\nSubstititon rates for genes (per time unit)\n");\r
+ for(j=0,k=com.ntime-1; j<data.ngene; j++,FPN(fout)) {\r
+ fprintf(fout," Gene %2d: ", j+1);\r
+ for(i=0; i<data.nbrate[j]; i++,k++) {\r
+ fprintf(fout,"%10.5f", x[k]);\r
+ if(com.getSE) fprintf(fout," +- %.5f", sqrt(var[k*np+k]));\r
+ }\r
+ if(com.clock==6) fprintf(fout," ");\r
+ }\r
+ if(!com.fix_kappa) {\r
+ fprintf(fout,"\nkappa for genes\n");\r
+ for(j=0; j<data.ngene; j++,k++) {\r
+ fprintf(fout,"%10.5f", data.kappa[j]);\r
+ if(com.getSE) fprintf(fout," +- %.5f", sqrt(var[k*np+k]));\r
+ }\r
+ }\r
+ if(!com.fix_omega) {\r
+ fprintf(fout,"\nomega for genes\n");\r
+ for(j=0; j<data.ngene; j++,k++) {\r
+ fprintf(fout,"%10.5f", data.omega[j]);\r
+ if(com.getSE) fprintf(fout," +- %.5f", sqrt(var[k*np+k]));\r
+ }\r
+ }\r
+ if(!com.fix_alpha) {\r
+ fprintf(fout,"\nalpha for genes\n");\r
+ for(j=0; j<data.ngene; j++,k++) {\r
+ fprintf(fout,"%10.5f", data.alpha[j]);\r
+ if(com.getSE) fprintf(fout," +- %.5f", sqrt(var[k*np+k]));\r
+ }\r
+ }\r
+ FPN(fout);\r
+ FreeMemBC();\r
+ printf("\nTime used: %s\n", printtime(timestr));\r
+ exit(0);\r
+}\r
+\r
+#endif\r