src/treesub.c

   1 /* TREESUB.c\r
   2    subroutines that operates on trees, inserted into other programs \r
   3    such as baseml, basemlg, codeml, and pamp.\r
   4 */\r
   5 \r
   6 extern char BASEs[], *EquateBASE[], BASEs5[], *EquateBASE5[], AAs[], BINs[], CODONs[][4], nChara[], CharaMap[][64];\r
   7 \r
   8 extern int noisy;\r
   9 \r
  10 #ifdef  BASEML\r
  11 #define REALSEQUENCE\r
  12 #define NODESTRUCTURE\r
  13 #define TREESEARCH\r
  14 #define LSDISTANCE\r
  15 #define LFUNCTIONS\r
  16 #define RECONSTRUCTION\r
  17 #define MINIMIZATION\r
  18 #endif\r
  19 \r
  20 #ifdef  CODEML\r
  21 #define REALSEQUENCE\r
  22 #define NODESTRUCTURE\r
  23 #define TREESEARCH\r
  24 #define LSDISTANCE\r
  25 #define LFUNCTIONS\r
  26 #define RECONSTRUCTION\r
  27 #define MINIMIZATION\r
  28 #endif\r
  29 \r
  30 #ifdef  BASEMLG\r
  31 #define REALSEQUENCE\r
  32 #define NODESTRUCTURE\r
  33 #define LSDISTANCE\r
  34 #endif\r
  35 \r
  36 #ifdef  RECONSTRUCTION\r
  37 #define PARSIMONY\r
  38 #endif\r
  39 \r
  40 #ifdef  MCMCTREE\r
  41 #define REALSEQUENCE\r
  42 #define NODESTRUCTURE\r
  43 #define LFUNCTIONS\r
  44 #endif\r
  45 \r
  46 #if(defined CODEML || defined YN00)\r
  47 double SS, NN, Sd, Nd;  /* kostas, # of syn. sites,# of non syn. sites,# of syn. subst.,# of non syn. subst. */\r
  48 #endif\r
  49 \r
  50 \r
  51 \r
  52 #ifdef REALSEQUENCE\r
  53 \r
  54 int hasbase (char *str)\r
  55 {\r
  56    char *p=str, *eqdel=".-?";\r
  57    while (*p) \r
  58       if (*p==eqdel[0] || *p==eqdel[1] || *p==eqdel[2] || isalpha(*p++)) \r
  59          return(1);\r
  60    return(0);\r
  61 }\r
  62 \r
  63 \r
  64 int GetSeqFileType(FILE *fseq, int *paupseq);\r
  65 int IdenticalSeqs(void);\r
  66 void RemoveEmptySequences(void);\r
  67 \r
  68 int GetSeqFileType(FILE *fseq, int *format)\r
  69 {\r
  70 /* paupstart="begin data" and paupend="matrix" identify nexus file format.\r
  71    Modify if necessary.\r
  72    format: 0: alignment; 1: fasta; 2: nexus.\r
  73 \r
  74 */\r
  75    int  lline=1000, ch, aligned;\r
  76    char fastastarter='>';\r
  77    char line[1000], *paupstart="begin data",*paupend="matrix", *p;\r
  78    char *ntax="ntax",*nchar="nchar";\r
  79 \r
  80    while (isspace(ch=fgetc(fseq)))\r
  81           ;\r
  82    ungetc(ch, fseq);\r
  83    if(ch == fastastarter) {\r
  84       *format = 1;\r
  85       ScanFastaFile(fseq, &com.ns, &com.ls, &aligned);\r
  86       if(aligned)\r
  87          return(0);\r
  88       else \r
  89          error2("The seq file appears to be in fasta format, but not aligned?");\r
  90    }\r
  91    if(fscanf(fseq,"%d%d", &com.ns, &com.ls)==2) {\r
  92       *format = 0; return(0);\r
  93    }\r
  94    *format = 2;\r
  95    printf("\nseq file is not paml/phylip format.  Trying nexus format.");\r
  96 \r
  97    for ( ; ; ) {\r
  98       if(fgets(line,lline,fseq)==NULL) error2("seq err1: EOF");\r
  99       strcase(line,0);\r
 100       if(strstr(line,paupstart)) break;\r
 101    }\r
 102    for ( ; ; ) {\r
 103       if(fgets(line,lline,fseq)==NULL) error2("seq err2: EOF");\r
 104       strcase(line,0);\r
 105       if((p=strstr(line,ntax))!=NULL) {\r
 106          while (*p != '=') { if(*p==0) error2("seq err"); p++; }\r
 107          sscanf(p+1,"%d", &com.ns);\r
 108          if((p=strstr(line,nchar))==NULL) error2("expect nchar");\r
 109          while (*p != '=') { if(*p==0) error2("expect ="); p++; }\r
 110          sscanf(p+1,"%d", &com.ls);\r
 111          break;\r
 112       } \r
 113    }\r
 114    /* printf("\nns: %d\tls: %d\n", com.ns, com.ls);  */\r
 115    for ( ; ; ) {\r
 116       if(fgets(line,lline,fseq)==NULL) error2("seq err1: EOF");\r
 117       strcase(line,0);\r
 118       if (strstr(line,paupend)) break;\r
 119    }\r
 120    return(0);\r
 121 }\r
 122 \r
 123 int PopupComment(FILE *fseq)\r
 124 {\r
 125    int ch, comment1=']';\r
 126    for( ; ; ) {\r
 127       ch=fgetc(fseq);\r
 128       if(ch==EOF) error2("expecting ]");\r
 129       if(ch==comment1) break;\r
 130       if(noisy) putchar(ch);\r
 131    }\r
 132    return(0);\r
 133 }\r
 134 \r
 135 \r
 136 #if(MCMCTREE)\r
 137 \r
 138 int ReadMorphology (FILE *fout, FILE *fin)\r
 139 {\r
 140    int i,j, locus=data.nmorphloci;\r
 141    char line[1024], str[64];\r
 142 \r
 143    if((data.zmorph[locus][0] = (double*)malloc((com.ns*2-1)*com.ls*sizeof(double))) == NULL)\r
 144       error2("oom zmorph");\r
 145    if((data.Rmorph[locus] = (double*)malloc(com.ls*com.ls*sizeof(double))) == NULL)\r
 146       error2("oom Rmorph");\r
 147 \r
 148    if((data.nmorphloci = locus+1) > NMORPHLOCI) error2("raise NMORPHLOCI and recompile.");\r
 149    for(i=1; i<com.ns*2-1; i++) {\r
 150       data.zmorph[locus][i] = data.zmorph[locus][0] + i*com.ls;\r
 151    }\r
 152    for(i=0; i<com.ns; i++) {\r
 153       fscanf(fin, "%s", com.spname[i]);\r
 154       printf ("Reading data for species #%2d: %s     \r", i+1, com.spname[i]);\r
 155       for(j=0; j<com.ls; j++) \r
 156          fscanf(fin, "%lf", &data.zmorph[locus][i][j]);\r
 157    }\r
 158 \r
 159    for(i=0; i<com.ns; i++) {\r
 160       fprintf(fout, "%-10s ", com.spname[i]);\r
 161       for(j=0; j<com.ls; j++)\r
 162          fprintf(fout, " %8.5f", data.zmorph[locus][i][j]);\r
 163       FPN(fout);\r
 164    }\r
 165 \r
 166 #if(0)\r
 167    fscanf(fin, "%s", str);\r
 168    fgets(line, 1024, fin);\r
 169    i = j = -1;\r
 170    if(strstr("Correlation", str)) {\r
 171       for(i=0; i<com.ls; i++) {\r
 172          for(j=0; j<com.ls; j++) \r
 173             if(fscanf(fin, "%lf", &data.Rmorph[locus][i*com.ls+j]) != 1) break;\r
 174          if(j<com.ls) break;\r
 175       }\r
 176    }\r
 177    if(i!=com.ls || j!=com.ls) {\r
 178       printf("\ndid not find a good R matrix.  Setting it to identity matrix I.\n");\r
 179       for(i=0; i<com.ls; i++) \r
 180          for(j=0; j<com.ls; j++) \r
 181             data.Rmorph[locus][i*com.ls+j] = (i==j);\r
 182    }\r
 183 #endif\r
 184    return(0);\r
 185 }\r
 186 \r
 187 #endif\r
 188 \r
 189 int ReadSeq (FILE *fout, FILE *fseq, int cleandata, int locus)\r
 190 {\r
 191 /* read in sequence, translate into protein (CODON2AAseq), and \r
 192    This counts ngene but does not initialize lgene[].\r
 193    It also codes (transforms) the sequences.\r
 194    com.seqtype: 0=nucleotides; 1=codons; 2:AAs; 3:CODON2AAs; 4:BINs\r
 195    com.pose[] is used to store gene or site-partition labels.\r
 196    ls/3 gene marks for codon sequences.\r
 197    char opt_c[]="GIPM";\r
 198       G:many genes;  I:interlaved format;  P:patterns;  M:morphological characters\r
 199 \r
 200    Use cleandata=1 to clean up ambiguities.  In return, com.cleandata=1 if the \r
 201    data are clean or are cleaned, and com.cleandata=0 is the data are unclean. \r
 202 */\r
 203    char *p,*p1, eq='.', comment0='[', *line;\r
 204    int format=0;  /* 0: paml/phylip, 1: fasta; 2: paup/nexus */\r
 205    int i,j,k, ch, noptline=0, lspname=LSPNAME, miss=0, nb;\r
 206    int lline=10000,lt[NS], igroup, Sequential=1, basecoding=0;\r
 207    int n31=(com.seqtype==CODONseq||com.seqtype==CODON2AAseq?3:1);\r
 208    int gap=(n31==3?3:10), nchar=(com.seqtype==AAseq?20:4);\r
 209    int h,b[3]={0};\r
 210    char *pch=((com.seqtype<=1||com.seqtype==CODON2AAseq) ? BASEs : (com.seqtype==2 ? AAs: (com.seqtype==5 ? BASEs5 : BINs)));\r
 211    char str[4]="   ";\r
 212    double lst;\r
 213 #if(MCMCTREE)\r
 214    data.datatype[locus] = com.seqtype;\r
 215 #endif\r
 216    str[0]=0; h=-1; b[0]=-1; /* avoid warning */\r
 217    com.readpattern = 0;\r
 218    if (com.seqtype==4) error2("seqtype==BINs, check with author");\r
 219    if (noisy>=9 && (com.seqtype<=CODONseq||com.seqtype==CODON2AAseq)) {\r
 220       puts("\n\nAmbiguity character definition table:\n");\r
 221       for(i=0; i<(int)strlen(BASEs); i++) {\r
 222          nb = strlen(EquateBASE[i]);\r
 223          printf("%c (%d): ", BASEs[i], nb);\r
 224          for(j=0; j<nb; j++)  printf("%c ", EquateBASE[i][j]);\r
 225          FPN(F0);\r
 226       }\r
 227    }\r
 228    GetSeqFileType(fseq, &format);\r
 229 \r
 230    if (com.ns>NS) error2("too many sequences.. raise NS?");\r
 231    if (com.ls%n31!=0) {\r
 232       printf ("\n%d nucleotides, not a multiple of 3!", com.ls); exit(-1);\r
 233    }\r
 234    if (noisy) printf ("\nns = %d  \tls = %d\n", com.ns, com.ls);\r
 235 \r
 236    for(j=0; j<com.ns; j++) {\r
 237       if(com.spname[j]) free(com.spname[j]);\r
 238       com.spname[j] = (char*)malloc((lspname+1)*sizeof(char));\r
 239       for(i=0; i<lspname+1; i++) com.spname[j][i]=0;\r
 240       if((com.z[j] = (unsigned char*)realloc(com.z[j],com.ls*sizeof(unsigned char))) == NULL)\r
 241          error2("oom z");\r
 242    }\r
 243    com.rgene[0] = 1;   com.ngene = 1;  \r
 244    lline = max2(lline, com.ls/n31*(n31+1)+lspname+50);\r
 245    if((line=(char*)malloc(lline*sizeof(char))) == NULL) error2("oom line");\r
 246 \r
 247    /* first line */\r
 248    if (format == 0) {\r
 249       if(!fgets(line,lline,fseq)) error2("ReadSeq: first line");\r
 250       com.readpattern = (strchr(line, 'P') || strchr(line, 'p'));\r
 251 #if(MCMCTREE)\r
 252       if(strchr(line, 'M') || strchr(line, 'm'))  data.datatype[locus] = MORPHC;\r
 253 #endif\r
 254    }\r
 255 #if(MCMCTREE)\r
 256    if(data.datatype[locus] == MORPHC) { /* morhpological data */\r
 257       ReadMorphology(fout, fseq);\r
 258       return(0);\r
 259    }\r
 260    else\r
 261 #endif\r
 262       if(!com.readpattern) {\r
 263          if((com.pose=(int*)realloc(com.pose, com.ls/n31*sizeof(int)))==NULL)\r
 264             error2("oom pose");\r
 265          for(j=0; j<com.ls/n31; j++) com.pose[j]=0;      /* gene #1, default */\r
 266       }\r
 267       else {\r
 268          if(com.pose) free(com.pose);  \r
 269          com.pose = NULL;\r
 270       }\r
 271    if(format) goto readseq;\r
 272 \r
 273    for (j=0; j<lline && line[j] && line[j]!='\n'; j++) {\r
 274       if (!isalnum(line[j])) continue;\r
 275       line[j]=(char)toupper(line[j]);\r
 276       switch (line[j]) {\r
 277          case 'G': noptline++;   break;\r
 278          case 'C': basecoding=1; break;\r
 279          case 'S': Sequential=1; break;\r
 280          case 'I': Sequential=0; break;\r
 281          case 'P':               break;  /* already dealt with. */\r
 282          default : \r
 283             printf ("\nBad option '%c' in first line of seqfile\n", line[j]);\r
 284             exit (-1);\r
 285       }\r
 286    }\r
 287    if (strchr(line,'C')) {   /* protein-coding DNA sequences */\r
 288       if(com.seqtype==2) error2("option C?");\r
 289       if(com.seqtype==0) {\r
 290          if (com.ls%3!=0 || noptline<1)  error2("option C?");\r
 291          com.ngene=3; \r
 292          for(i=0;i<3;i++) com.lgene[i]=com.ls/3;\r
 293 #if(defined(BASEML) || defined(BASEMLG))\r
 294          com.coding=1;\r
 295          if(com.readpattern) \r
 296             error2("partterns for coding sequences (G C P) not implemented.");\r
 297          else \r
 298             for (i=0;i<com.ls;i++) com.pose[i]=(char)(i%3);\r
 299          \r
 300 #endif\r
 301       }\r
 302       noptline--;\r
 303    }\r
 304 \r
 305    /* option lines */\r
 306    for(j=0; j<noptline; j++) {\r
 307       for(ch=0; ; ) {\r
 308          ch = (char)fgetc(fseq);\r
 309          if(ch == comment0) \r
 310             PopupComment(fseq);\r
 311          if(isalnum(ch)) break;\r
 312       }\r
 313 \r
 314       ch = (char)toupper(ch);\r
 315       switch (ch) {\r
 316       case ('G') :\r
 317          if(basecoding) error2("Error in sequence data file: incorrect option format, use GC?\n");\r
 318          if (fscanf(fseq,"%d",&com.ngene)!=1) error2("expecting #gene here..");\r
 319          if (com.ngene>NGENE) error2("raise NGENE?");\r
 320 \r
 321          fgets(line,lline,fseq);\r
 322          if (!blankline(line)) {    /* #sites in genes on the 2nd line */\r
 323             for (i=0,p=line; i<com.ngene; i++) {\r
 324                while (*p && !isalnum(*p)) p++;\r
 325                if (sscanf(p,"%d",&com.lgene[i])!=1) break;\r
 326                while (*p && isalnum(*p)) p++;\r
 327             }\r
 328             /* if ngene is large and some lgene is on the next line */\r
 329             for (; i<com.ngene; i++)\r
 330                if (fscanf(fseq,"%d", &com.lgene[i])!=1) error2("EOF at lgene");\r
 331 \r
 332             for(i=0,k=0; i<com.ngene; i++) \r
 333                k += com.lgene[i];\r
 334             if(k!=com.ls/n31) {\r
 335                matIout(F0, com.lgene, 1, com.ngene);\r
 336                printf("\n%6d != %d", com.ls/n31, k);\r
 337                puts("\nOption G: total length over genes is not correct");\r
 338                if(com.seqtype==1) {\r
 339                   puts("Note: gene length is in number of codons.");\r
 340                }\r
 341                puts("Sequence length in number of nucleotides.");\r
 342                exit(-1);\r
 343             }\r
 344             if(!com.readpattern)\r
 345                for(i=0,k=0; i<com.ngene; k+=com.lgene[i],i++)\r
 346                   for(j=0; j<com.lgene[i]; j++)\r
 347                      com.pose[k+j] = i;\r
 348 \r
 349          }\r
 350          else {                   /* site marks on later line(s)  */\r
 351             if(com.readpattern) \r
 352                error2("option PG: use number of patterns in each gene and not site marks");\r
 353             for(k=0; k<com.ls/n31; ) {\r
 354                if (com.ngene>9)  fscanf(fseq,"%d", &ch);\r
 355                else {\r
 356                   do ch=fgetc(fseq); while (!isdigit(ch));\r
 357                   ch=ch-(int)'1'+1;  /* assumes 1,2,...,9 are consecutive */\r
 358                }\r
 359                if (ch<1 || ch>com.ngene)\r
 360                   { printf("\ngene mark %d at %d?\n", ch, k+1);  exit (-1); }\r
 361                com.pose[k++]=ch-1;\r
 362             }\r
 363             if(!fgets(line,lline,fseq)) error2("sequence file, gene marks");\r
 364          }\r
 365          break;\r
 366       default :\r
 367          printf ("Bad option '%c' in option lines in seqfile\n", line[0]);\r
 368          exit (-1);\r
 369       }\r
 370    }\r
 371 \r
 372    readseq:\r
 373    /* read sequence */\r
 374    if (Sequential)  {    /* sequential */\r
 375       if (noisy) printf ("Reading sequences, sequential format..\n");\r
 376       for (j=0; j<com.ns; j++) {\r
 377          lspname = LSPNAME;\r
 378          for (i=0; i<2*lspname; i++) line[i]='\0';\r
 379          if (!fgets (line, lline, fseq)) error2("EOF?");\r
 380          if (blankline(line)) {\r
 381             if (PopEmptyLines (fseq, lline, line))\r
 382                { printf("error in sequence data file: empty line (seq %d)\n",j+1); exit(-1); }\r
 383          }\r
 384          p = line+(line[0]=='=' || line[0]=='>') ;\r
 385          while(isspace(*p)) p++;\r
 386          if ((ch=strstr(p,"  ")-p)<lspname && ch>0) lspname=ch;\r
 387          strncpy (com.spname[j], p, lspname);\r
 388          k = strlen(com.spname[j]);\r
 389          p += (k<lspname?k:lspname);\r
 390 \r
 391          for (; k>0; k--) /* trim spaces */\r
 392             if (!isgraph(com.spname[j][k]))   com.spname[j][k]=0;\r
 393             else    break;\r
 394 \r
 395          if (noisy>=2) printf ("Reading seq #%2d: %s     \r", j+1, com.spname[j]);\r
 396          for (k=0; k<com.ls; p++) {\r
 397             while (*p=='\n' || *p=='\0') {\r
 398                p=fgets(line, lline, fseq);\r
 399                if(p==NULL)\r
 400                   { printf("\nEOF at site %d, seq %d\n", k+1,j+1); exit(-1); }\r
 401             }\r
 402             *p = (char)toupper(*p);\r
 403             if((com.seqtype==BASEseq || com.seqtype==CODONseq) && *p=='U') \r
 404                *p = 'T';\r
 405             p1 = strchr(pch, *p);\r
 406             if (p1 && p1-pch>=nchar)  \r
 407                miss = 1;\r
 408             if (*p==eq) {\r
 409                if (j==0) error2("Error in sequence data file: . in 1st seq.?");\r
 410                com.z[j][k] = com.z[0][k];  k++;\r
 411             }\r
 412             else if (p1) \r
 413                com.z[j][k++] = *p;\r
 414             else if (isalpha(*p)) {\r
 415                printf("\nError in sequence data file: %c at %d seq %d.\n",*p,k+1,j+1); \r
 416                puts("Make sure to separate the sequence from its name by 2 or more spaces.");\r
 417                exit(0); \r
 418             }\r
 419             else if (*p == (char)EOF) error2("EOF?");\r
 420          }           /* for(k) */\r
 421          if(strchr(p,'\n')==NULL) /* pop up line return */\r
 422             while((ch=fgetc(fseq))!='\n' && ch!=EOF) ;\r
 423       }   /* for (j,com.ns) */\r
 424    }\r
 425    else { /* interlaved */\r
 426       if (noisy) printf ("Reading sequences, interlaved format..\n");\r
 427       FOR (j, com.ns) lt[j]=0;  /* temporary seq length */\r
 428       for (igroup=0; ; igroup++) {\r
 429          /*\r
 430          printf ("\nreading block %d ", igroup+1);  matIout(F0,lt,1,com.ns);*/\r
 431 \r
 432          FOR (j, com.ns) if (lt[j]<com.ls) break;\r
 433          if (j==com.ns) break;\r
 434          FOR (j,com.ns) {\r
 435             if (!fgets(line,lline,fseq)) {\r
 436                printf("\nerr reading site %d, seq %d group %d\nsites read in each seq:",\r
 437                   lt[j]+1,j+1,igroup+1);\r
 438                error2("EOF?");\r
 439             }\r
 440             if (!hasbase(line)) {\r
 441                if (j) {\r
 442                   printf ("\n%d, seq %d group %d", lt[j]+1, j+1, igroup+1);\r
 443                   error2("empty line.");\r
 444                }\r
 445                else \r
 446                   if (PopEmptyLines(fseq,lline,line)==-1) {\r
 447                      printf ("\n%d, seq %d group %d", lt[j]+1, j+1, igroup+1);\r
 448                      error2("EOF?");\r
 449                   }\r
 450             }\r
 451             p=line;\r
 452             if (igroup==0) {\r
 453                lspname = LSPNAME;\r
 454                while(isspace(*p)) p++;\r
 455                if ((ch=strstr(p,"  ")-p)<lspname && ch>0)\r
 456                   lspname = ch;\r
 457                strncpy (com.spname[j], p, lspname);\r
 458                k = strlen(com.spname[j]);\r
 459                p += (k<lspname?k:lspname);\r
 460 \r
 461                for (; k>0; k--)   /* trim spaces */\r
 462                   if (!isgraph(com.spname[j][k]))\r
 463                      com.spname[j][k]=0;\r
 464                   else\r
 465                      break;\r
 466                if(noisy>=2) printf("Reading seq #%2d: %s     \r",j+1,com.spname[j]);\r
 467             }\r
 468             for (; *p && *p!='\n'; p++) {\r
 469                if (lt[j]==com.ls) break;\r
 470                *p = (char)toupper(*p);\r
 471                if((com.seqtype==BASEseq || com.seqtype==CODONseq) && *p=='U') \r
 472                   *p = 'T';\r
 473                p1 = strchr(pch, *p);\r
 474                if (p1 && p1-pch>=nchar) \r
 475                   miss = 1;\r
 476                if (*p == eq) {\r
 477                   if (j == 0) {\r
 478                      printf("err: . in 1st seq, group %d.\n",igroup);\r
 479                      exit (-1);\r
 480                   }\r
 481                   com.z[j][lt[j]] = com.z[0][lt[j]];\r
 482                   lt[j]++;\r
 483                }\r
 484                else if (p1)\r
 485                   com.z[j][lt[j]++]=*p;\r
 486                else if (isalpha(*p)) {\r
 487                   printf("\nerr: unrecognised character %c at %d seq %d block %d.",\r
 488                           *p,lt[j]+1,j+1,igroup+1);\r
 489                   exit(-1);\r
 490                }\r
 491                else if (*p==(char)EOF) error2("EOF");\r
 492             }         /* for (*p) */\r
 493          }            /* for (j,com.ns) */\r
 494 \r
 495          if(noisy>2) {\r
 496             printf("\nblock %3d:", igroup+1);\r
 497             for(j=0;j<com.ns;j++) printf(" %6d",lt[j]);\r
 498          }\r
 499 \r
 500       }               /* for (igroup) */\r
 501    }\r
 502    free(line);\r
 503 \r
 504 #ifdef CODEML\r
 505    /* mask stop codons as ???.  */\r
 506    if(com.seqtype==1 && MarkStopCodons())\r
 507       miss=1;\r
 508 #endif\r
 509 \r
 510    if(!miss)\r
 511       com.cleandata = 1;\r
 512    else if (cleandata) {  /* forced removal of ambiguity characters */\r
 513       if(noisy>2)  puts("\nSites with gaps or missing data are removed.");\r
 514       if(fout) {\r
 515          fprintf(fout,"\nBefore deleting alignment gaps\n");\r
 516          fprintf(fout, " %6d %6d\n", com.ns, com.ls);\r
 517          printsma(fout,com.spname,com.z,com.ns,com.ls,com.ls,gap,com.seqtype,0,0,NULL);\r
 518       }\r
 519       RemoveIndel ();\r
 520       if(fout) fprintf(fout,"\nAfter deleting gaps. %d sites\n",com.ls);\r
 521    }\r
 522 \r
 523    if(fout && !com.readpattern) {/* verbose=1, listing sequences again */\r
 524       fprintf(fout, " %6d %6d\n", com.ns, com.ls);\r
 525       printsma(fout,com.spname,com.z,com.ns,com.ls,com.ls,gap,com.seqtype,0,0,NULL);\r
 526    }\r
 527 \r
 528    if(n31==3) com.ls/=n31;\r
 529 \r
 530    /* IdenticalSeqs(); */\r
 531 \r
 532 #ifdef CODEML\r
 533    if(com.seqtype==1 && com.verbose) Get4foldSites();\r
 534 \r
 535    if(com.seqtype==CODON2AAseq) {\r
 536       if (noisy>2) puts("\nTranslating into AA sequences\n");\r
 537       for(j=0; j<com.ns; j++) {\r
 538          if (noisy>2) printf("Translating sequence %d\n",j+1);\r
 539          DNA2protein(com.z[j], com.z[j], com.ls,com.icode);\r
 540       }\r
 541       com.seqtype=AAseq;\r
 542 \r
 543       if(fout) {\r
 544          fputs("\nTranslated AA Sequences\n",fout);\r
 545          fprintf(fout,"%4d  %6d",com.ns,com.ls);\r
 546          printsma(fout,com.spname,com.z,com.ns,com.ls,com.ls,10,com.seqtype,0,0,NULL);\r
 547       }\r
 548    }\r
 549 #endif\r
 550 \r
 551 #if (defined CODEML || defined BASEML)\r
 552    if(com.ngene==1 && com.Mgene==1) com.Mgene=0;\r
 553    if(com.ngene>1 && com.Mgene==1 && com.verbose)  printSeqsMgenes ();\r
 554 \r
 555    if(com.bootstrap) { BootstrapSeq("boot.txt");  exit(0); }\r
 556 #endif\r
 557 \r
 558 \r
 559 #if (defined CODEML)\r
 560    /* list sites with 2 types of serine codons: TC? and TCY.  19 March 2014, Ziheng. */\r
 561    {\r
 562       char codon[4]="";\r
 563       int nbox0, nbox1;\r
 564       for(h=0; h<com.ls; h++) {\r
 565          for(i=0,nbox0=nbox1=0; i<com.ns; i++) {\r
 566             codon[0]=com.z[i][h*3+0]; codon[1]=com.z[i][h*3+1]; codon[2]=com.z[i][h*3+2];\r
 567             if(codon[0]=='T' && codon[1]=='C') nbox0++;\r
 568             else if(codon[0]=='A' && codon[1]=='G' && (codon[2]=='T' || codon[2]=='C')) nbox1++;\r
 569          }\r
 570          if(nbox0 && nbox1 && nbox0+nbox1==com.ns) {\r
 571             printf("\ncodon %7d: ", h+1);\r
 572             for(i=0; i<com.ns; i++)\r
 573                printf("%c%c%c ", com.z[i][h*3+0], com.z[i][h*3+1], com.z[i][h*3+2]);\r
 574          }\r
 575       }\r
 576    }\r
 577 #endif\r
 578 \r
 579 \r
 580 \r
 581    if(noisy>=2) printf ("\nSequences read..\n");\r
 582    if(com.ls==0) {\r
 583       puts("no sites. Got nothing to do");\r
 584       return(1);\r
 585    }\r
 586 \r
 587 #if (defined MCMCTREE)\r
 588    /* Check and remove empty sequences.  */\r
 589 \r
 590    if(com.cleandata==0)\r
 591       RemoveEmptySequences();\r
 592 \r
 593 #endif\r
 594 \r
 595    if(!com.readpattern) \r
 596       PatternWeight();\r
 597    else {  /*  read pattern counts */\r
 598       com.npatt = com.ls;\r
 599       if((com.fpatt=(double*)realloc(com.fpatt, com.npatt*sizeof(double))) == NULL)\r
 600          error2("oom fpatt");\r
 601       for(h=0,lst=0; h<com.npatt; h++) {\r
 602          fscanf(fseq, "%lf", &com.fpatt[h]);\r
 603          lst += com.fpatt[h];\r
 604          if(com.fpatt[h]<0 || com.fpatt[h]>1e6)\r
 605             printf("fpatth[%d] = %.6g\n", h+1, com.fpatt[h]);\r
 606       }\r
 607       if(lst>1.00001) { \r
 608          com.ls = (int)lst;\r
 609          if(noisy) printf("\n%d site patterns read, %d sites\n", com.npatt, com.ls);\r
 610       }\r
 611       if(com.ngene==1) { \r
 612          com.lgene[0] = com.ls; \r
 613          com.posG[0] = 0; \r
 614          com.posG[1] = com.npatt; \r
 615       }\r
 616       else {\r
 617          for(j=0,com.posG[0]=0; j<com.ngene; j++)\r
 618             com.posG[j+1] = com.posG[j] + com.lgene[j];\r
 619 \r
 620          for(j=0; j<com.ngene; j++) {\r
 621             com.lgene[j] = (j==0 ? 0 : com.lgene[j-1]);\r
 622             for(h=com.posG[j]; h<com.posG[j+1]; h++)\r
 623                com.lgene[j] += (int)com.fpatt[h];\r
 624          }\r
 625       }\r
 626    }\r
 627 \r
 628    EncodeSeqs();\r
 629 \r
 630    if(fout) {\r
 631       fprintf(fout,"\nPrinting out site pattern counts\n\n");\r
 632       printPatterns(fout);\r
 633    }\r
 634 \r
 635    return (0);\r
 636 }\r
 637 \r
 638 \r
 639 #if(defined CODEML)\r
 640 \r
 641 int MarkStopCodons(void)\r
 642 {\r
 643 /* this converts the whole column into ??? if there is a stop codon in one sequence.\r
 644    Data in com.z[] are just read in and not encoded yet.\r
 645 */\r
 646    int i,j,h,k, NColumnEdited=0;\r
 647    char codon[4]="", stops[6][4]={"","",""}, nstops=0;\r
 648 \r
 649    if(com.seqtype!=1) error2("should not be here");\r
 650 \r
 651    for(i=0; i<64; i++) \r
 652       if(GeneticCode[com.icode][i]==-1) \r
 653          getcodon(stops[nstops++], i);\r
 654 \r
 655    for(h=0; h<com.ls/3; h++) {\r
 656       for(i=0; i<com.ns; i++) {\r
 657          codon[0] = com.z[i][h*3+0];\r
 658          codon[1] = com.z[i][h*3+1];\r
 659          codon[2] = com.z[i][h*3+2];\r
 660          for(j=0; j<nstops; j++) \r
 661             if(strcmp(codon, stops[j])==0) {\r
 662                printf("stop codon %s in seq. # %3d (%s)\r", codon, i+1, com.spname[i]);\r
 663                break;\r
 664             }\r
 665          if(j<nstops) break;\r
 666       }\r
 667       if(i<com.ns) {\r
 668          for(i=0; i<com.ns; i++) \r
 669             com.z[i][h*3+0] = com.z[i][h*3+1] = com.z[i][h*3+2] = '?';\r
 670          NColumnEdited++;\r
 671       }\r
 672    }\r
 673    if(NColumnEdited) {\r
 674       printf("\n%2d columns are converted into ??? because of stop codons\nPress Enter to continue", NColumnEdited);\r
 675       getchar();\r
 676    }\r
 677    return(NColumnEdited);\r
 678 }\r
 679 \r
 680 #endif\r
 681 \r
 682 \r
 683 void RemoveEmptySequences(void)\r
 684 {\r
 685 /* this removes empty sequences (? or - only) and adjust com.ns\r
 686 */\r
 687    int j,h, nsnew;\r
 688    char emptyseq[NS];\r
 689 \r
 690    for(j=0; j<com.ns; j++) {\r
 691       emptyseq[j] = 1;\r
 692       for(h=0; h<com.ls*(com.seqtype==1?3:1); h++)\r
 693          if(com.z[j][h] != '?' && com.z[j][h] != '-') {\r
 694             emptyseq[j] = 0;\r
 695             break;\r
 696          }\r
 697    }\r
 698    for(j=0,nsnew=0; j<com.ns; j++) {\r
 699       if(emptyseq[j]) {\r
 700          printf("seq #%3d: %-30s is removed\n", j+1, com.spname[j]);\r
 701          free(com.z[j]);\r
 702          free(com.spname[j]);\r
 703          continue;\r
 704       }\r
 705       com.z[nsnew] = com.z[j];\r
 706       com.spname[nsnew] = com.spname[j];\r
 707       nsnew ++;\r
 708    }\r
 709    for(j=nsnew; j<com.ns; j++) {\r
 710       com.z[j] = NULL;      \r
 711       com.spname[j] = NULL;\r
 712    }\r
 713    com.ns = nsnew;\r
 714 }\r
 715 \r
 716 \r
 717 int printPatterns(FILE *fout)\r
 718 {\r
 719    int j,h, n31 = (com.seqtype==CODONseq||com.seqtype==CODON2AAseq ? 3 : 1);\r
 720    int gap=(n31==3?3:10), n=(com.seqtype==AAseq?20:4);\r
 721 \r
 722    fprintf(fout,"\n%10d %10d  P", com.ns, com.npatt*n31);\r
 723    if(com.ngene>1) {\r
 724       fprintf (fout," G\nG %d  ", com.ngene);\r
 725       for(j=0; j<com.ngene; j++)\r
 726          fprintf(fout,"%7d", com.posG[j+1]-com.posG[j]);\r
 727    }\r
 728    FPN(fout);\r
 729 \r
 730    if(com.seqtype==1 && com.cleandata) {\r
 731       ; /* nothing is printed out for yn00, as the coding is different. */\r
 732 #if(defined CODEML || defined YN00)\r
 733       printsmaCodon (fout, com.z, com.ns, com.npatt, com.npatt, 1);\r
 734 #endif\r
 735    }\r
 736    else\r
 737       printsma(fout,com.spname,com.z,com.ns, com.npatt,com.npatt, gap, com.seqtype, 1, 0, NULL);\r
 738     if(com.ls>1.0001) {\r
 739        fprintf(fout, "\n");\r
 740        for(h=0; h<com.npatt; h++) {\r
 741           fprintf(fout," %4.0f", com.fpatt[h]);\r
 742           if((h+1)%15 == 0) FPN(fout);\r
 743        }\r
 744        fprintf(fout, "\n\n");\r
 745    }\r
 746    return(0);\r
 747 }\r
 748 \r
 749 \r
 750 \r
 751 void EncodeSeqs (void)\r
 752 {\r
 753 /* This encodes sequences and set up com.TipMap[][], called after sites are collapsed \r
 754    into patterns.\r
 755 */\r
 756    int n=com.ncode, nA, is,h, i, j, k,ic, indel=0, ch, b[3];\r
 757    char *pch = ((com.seqtype==0||com.seqtype==1) ? BASEs : (com.seqtype==2 ? AAs : (com.seqtype==5 ? BASEs5: BINs)));\r
 758    unsigned char c[4]="", str[4]="   ";\r
 759 \r
 760    if(com.seqtype != 1) {\r
 761       for(is=0; is<com.ns; is++) {\r
 762          for (h=0; h<com.npatt; h++) {\r
 763             ch = com.z[is][h];\r
 764             com.z[is][h] = (char)(k = strchr(pch, ch) - pch);\r
 765             if(k<0) {\r
 766                printf("strange character %c in seq %d site %d\n", ch, is+1, h+1);\r
 767                exit(-1);\r
 768             }\r
 769          }\r
 770       }\r
 771    }\r
 772 #if (defined CODEML || defined YN00)\r
 773    else if(com.seqtype==1) {\r
 774       /* collect all observed codons into CODONs, with a maximum of 256 distinct codons. */\r
 775       memset(&CODONs[0][0], 0, 256*4*sizeof(char));\r
 776       for(nA=0; nA<n; nA++) {\r
 777          ic=FROM61[nA]; b[0]=ic/16; b[1]=(ic/4)%4; b[2]=ic%4;\r
 778          for(i=0; i<3; i++) CODONs[nA][i] = BASEs[b[i]];\r
 779       }\r
 780       for(j=0,nA=n; j<com.ns; j++) {\r
 781          for(h=0; h<com.npatt; h++) {\r
 782             for(k=0; k<3; k++) {\r
 783                c[k] = com.z[j][h*3+k]; \r
 784                b[k] = strchr(BASEs,c[k]) - BASEs;\r
 785                if(b[k]<0) printf("strange nucleotide %c in seq %d\n", c[k], j+1);\r
 786             }\r
 787             if(b[0]<4 && b[1]<4 && b[2]<4) {\r
 788                k = FROM64[b[0]*16 + b[1]*4 + b[2]];\r
 789                if(k<0) {\r
 790                   printf("\nstop codon %s in seq #%2d: %s\n", c, j+1, com.spname[j]);\r
 791                   printf("\ncodons in other sequences are\n");\r
 792                   for(i=0; i<com.ns; i++) {\r
 793                      for(k=0; k<3; k++) c[k] = com.z[i][h*3+k]; \r
 794                      printf("seq #%2d %-30s %s\n", i+1, com.spname[i], c);\r
 795                   }\r
 796                   exit(-1);\r
 797                }\r
 798             }\r
 799             else {  /* an ambiguous codon */\r
 800                for(k=n; k<nA; k++) \r
 801                   if(strcmp(CODONs[k], c) == 0) break;\r
 802             }\r
 803             if(k==nA) {\r
 804                if(++nA>256) \r
 805                   error2("too many ambiguity codons in the data.  Contact author");\r
 806                strcpy(CODONs[nA-1], c);\r
 807             }\r
 808             com.z[j][h] = (unsigned char)k;\r
 809          }\r
 810          com.z[j] = (unsigned char*)realloc(com.z[j], com.npatt);\r
 811       }\r
 812       if(nA>n) {\r
 813          printf("%d ambiguous codons are seen in the data:\n", nA - n);\r
 814          for(k=n; k<nA; k++)  printf("%4s", CODONs[k]);\r
 815          printf("\n");\r
 816       }\r
 817    }\r
 818 #endif\r
 819 }\r
 820 \r
 821 \r
 822 void SetMapAmbiguity (void)\r
 823 {\r
 824 /* This sets up CharaMap, the map from the ambiguity characters to resolved characters.\r
 825 */\r
 826    int n=com.ncode, i,j, i0,i1,i2, nb[3], ib[3][4], ic;\r
 827    char *pch = (com.seqtype==0 ? BASEs : (com.seqtype==2 ? AAs : (com.seqtype==5 ? BASEs5: BINs)));\r
 828    char *pbases = (com.seqtype==0 ? BASEs : (com.seqtype==5 ? BASEs5: NULL));\r
 829    char **pEquateBASE = (com.seqtype==0 ? EquateBASE : (com.seqtype==5 ? EquateBASE5 : NULL));\r
 830    char debug=0;\r
 831 \r
 832    for(j=0; j<n; j++) {  /* basic characters, coded according to the definition in pch. */\r
 833       nChara[j] = (char)1;\r
 834       CharaMap[j][0] = (char)j;\r
 835    }\r
 836 \r
 837    if(com.seqtype != 1) {\r
 838       for(j=n,pch+=n; *pch; j++,pch++) {\r
 839          if(com.seqtype==0 || com.seqtype==5) {  /* ambiguities are allowed for those 2 types */\r
 840             nChara[j] = (char)strlen(pEquateBASE[j]);\r
 841             for(i=0; i<nChara[j]; i++)\r
 842                CharaMap[j][i] = (char)(strchr(pbases, pEquateBASE[j][i]) - pbases);\r
 843          }\r
 844          else {  /* for non-nucleotide characters, ambiguity characters must be ? or -. */\r
 845             nChara[j] = (char)n;\r
 846             for(i=0; i<n; i++)\r
 847                CharaMap[j][i] = (char)i;\r
 848          }\r
 849          if(debug) {\r
 850             printf("character %c (%d): ", pbases[j], nChara[j]);\r
 851             for(i=0; i<nChara[j]; i++)\r
 852                printf("%c", pbases[CharaMap[j][i]]);\r
 853             printf("\n");\r
 854          }\r
 855       }\r
 856    }\r
 857 #ifdef CODEML\r
 858    else {\r
 859       for(j=n; j<256 && CODONs[j][0]; j++) {\r
 860          nChara[j] = (char)0;\r
 861          for(i=0; i<3; i++)\r
 862             NucListall(CODONs[j][i], &nb[i], ib[i]);\r
 863          for(i0=0; i0<nb[0]; i0++) {\r
 864             for(i1=0; i1<nb[1]; i1++) \r
 865                for(i2=0; i2<nb[2]; i2++) {\r
 866                   ic = ib[0][i0]*16+ib[1][i1]*4+ib[2][i2];\r
 867                   if(GeneticCode[com.icode][ic] != -1) \r
 868                      CharaMap[j][nChara[j]++] = FROM64[ic];\r
 869                }\r
 870          }\r
 871          if(nChara[j]==0) {\r
 872             printf("\ncodon %s is stop codon", CODONs[j]);\r
 873             exit(-1);\r
 874          }\r
 875       }\r
 876    }\r
 877 #endif\r
 878 }\r
 879 \r
 880 \r
 881 int IdenticalSeqs(void)\r
 882 {\r
 883 /* This checks for identical sequences and create a data set of unique \r
 884    sequences.  The file name is <SeqDataFile.unique.  This is casually \r
 885    written and need more testing.\r
 886    The routine is called right after the sequence data are read.\r
 887    For codon sequences, com.ls has the number of codons, which are NOT\r
 888    coded.\r
 889 */\r
 890    char tmpf[96], keep[NS];\r
 891    FILE *ftmp;\r
 892    int is,js,h, same,nkept=com.ns;\r
 893    int ls1=com.ls*(com.seqtype==CODONseq||com.seqtype==CODON2AAseq?3:1);\r
 894 \r
 895    puts("\nIdenticalSeqs: not tested\a");\r
 896    for(is=0; is<com.ns; is++) \r
 897       keep[is] = 1;\r
 898    for(is=0; is<com.ns; is++) { \r
 899       if(!keep[is]) continue;\r
 900       for(js=0; js<is; js++) {\r
 901          for(h=0,same=1; h<ls1; h++)\r
 902             if(com.z[is][h] != com.z[js][h]) break;\r
 903          if(h == ls1) {\r
 904             printf("Seqs. %3d & %3d (%s & %s) are identical!\n",\r
 905                js+1,is+1,com.spname[js],com.spname[is]);\r
 906             keep[is] = 0;\r
 907          }\r
 908       }\r
 909    }\r
 910    for(is=0; is<com.ns; is++) \r
 911       if(!keep[is]) nkept--;\r
 912    if(nkept<com.ns) {\r
 913       strcpy(tmpf, com.seqf);\r
 914       strcat(tmpf, ".unique");\r
 915       if((ftmp=fopen(tmpf,"w"))==NULL) error2("IdenticalSeqs: file error");\r
 916       printSeqs(ftmp, NULL, keep, 1);\r
 917       fclose(ftmp);\r
 918       printf("\nUnique sequences collected in %s.\n", tmpf);\r
 919    }\r
 920    return(0);\r
 921 }\r
 922 \r
 923 \r
 924 void AllPatterns (FILE* fout)\r
 925 {\r
 926 /* This prints out an alignment containting all possible site patterns, and then exits.\r
 927    This alignment may be useful to generate a dataset of infinitely long sequences, \r
 928    summarized in the site pattern probabilities.\r
 929    Because the PatternWeight() function changes the order of patterns, this routine \r
 930    prints out the alignment as one of patterns, with lots of 1's below it, to avoid \r
 931    baseml or codeml calling that routine to collaps sites.  \r
 932    You then replace those 1'with the calculated pattern probabilities for further \r
 933    analysis.\r
 934 */\r
 935    int j, h, it, ic;\r
 936    char codon[4]="   ";\r
 937    int n31=(com.seqtype==CODONseq||com.seqtype==CODON2AAseq?3:1);\r
 938    int gap=(n31==3?3:10);\r
 939 \r
 940    com.ns = 3;\r
 941    for(j=0,com.npatt=1; j<com.ns; j++) com.npatt*=com.ncode;\r
 942    printf ("%3d species, %d site patterns\n", com.ns, com.npatt);\r
 943    com.cleandata=1;\r
 944    for(j=0; j<com.ns; j++) {\r
 945       com.spname[j] = (char*)realloc(com.spname[j], 11*sizeof(char));\r
 946       sprintf(com.spname[j], "%c ", 'a'+j);\r
 947    }\r
 948    for(j=0; j<com.ns; j++) \r
 949       if((com.z[j]=(unsigned char*) malloc(com.npatt*sizeof(char))) == NULL)\r
 950          error2("oom in AllPatterns");\r
 951    for (h=0; h<com.npatt; h++) {\r
 952       for (j=0,it=h; j<com.ns; j++) {\r
 953          ic = it%com.ncode;\r
 954          it /= com.ncode;\r
 955          com.z[com.ns-1-j][h] = (char)ic;\r
 956       }\r
 957    }\r
 958    com.ls = com.npatt;\r
 959 \r
 960    fprintf(fout, " %6d %6d  P\n", com.ns, com.ls*n31);\r
 961    if(com.seqtype==1) {\r
 962 #if(defined CODEML || defined YN00)\r
 963       printsmaCodon (fout, com.z, com.ns, com.ls, com.ls, 0);\r
 964 #endif\r
 965    }\r
 966    else\r
 967       printsma(fout,com.spname,com.z,com.ns, com.ls, com.ls, gap, com.seqtype, 1, 0, NULL);\r
 968 \r
 969    for(h=0; h<com.npatt; h++) {\r
 970       fprintf(fout, " 1");\r
 971       if((h+1)%40==0) FPN(fout);\r
 972    }\r
 973    FPN(fout);\r
 974    exit(0);\r
 975 }\r
 976 \r
 977 \r
 978 int PatternWeight (void)\r
 979 {\r
 980 /* This collaps sites into patterns, for nucleotide, amino acid, or codon sequences.\r
 981    This relies on \0 being the end of the string so that sequences should not be \r
 982    encoded before this routine is called.\r
 983    com.pose[i] has labels for genes as input and maps sites to patterns in return.\r
 984    com.fpatt, a vector of doubles, wastes space as site pattern counts are integers.\r
 985    Sequences z[ns*ls] are copied into patterns zt[ls*lpatt], and bsearch is used \r
 986    twice to avoid excessive copying, to count npatt first & to generate fpatt etc.\r
 987 */\r
 988    int maxnpatt=com.ls, h, ip,l,u, j, k, same, ig, *poset;\r
 989    int gap = (com.seqtype==CODONseq ? 3 : 10);\r
 990    int n31 = (com.seqtype==CODONseq ? 3 : 1);\r
 991    int lpatt=com.ns*n31+1;   /* extra 0 used for easy debugging, can be voided */\r
 992    int *p2s;  /* point patterns to sites in zt */\r
 993    char *zt, *p, timestr[36];\r
 994    double nc = (com.seqtype == 1 ? 64 : com.ncode) + !com.cleandata+1;\r
 995    int debug=0;\r
 996    char DS[]="DS";\r
 997 \r
 998    /* (A) \r
 999       Collect and sort patterns.  Get com.npatt, com.lgene, com.posG.\r
1000       Move sequences com.z[ns][ls] into sites zt[ls*lpatt].  \r
1001       Use p2s to map patterns to sites in zt to avoid copying.\r
1002    */\r
1003    if(noisy) printf("Counting site patterns.. %s\n", printtime(timestr));\r
1004 \r
1005    if((com.seqtype==1 && com.ns<5) || (com.seqtype!=1 && com.ns<7))\r
1006       maxnpatt = (int)(pow(nc, (double)com.ns) + 0.5) * com.ngene;\r
1007    if(maxnpatt>com.ls) maxnpatt = com.ls;\r
1008    p2s  = (int*)malloc(maxnpatt*sizeof(int));\r
1009    zt = (char*)malloc(com.ls*lpatt*sizeof(char));\r
1010    if(p2s==NULL || zt==NULL)  error2("oom p2s or zt");\r
1011    memset(zt, 0, com.ls*lpatt*sizeof(char));\r
1012    for(j=0; j<com.ns; j++) \r
1013       for(h=0; h<com.ls; h++) \r
1014          for(k=0; k<n31; k++)\r
1015             zt[h*lpatt+j*n31+k] = com.z[j][h*n31+k];\r
1016 \r
1017    for(j=0; j<com.ns; j++) free(com.z[j]); \r
1018 \r
1019    for(ig=0; ig<com.ngene; ig++) com.lgene[ig] = 0;\r
1020    for(ig=0,com.npatt=0; ig<com.ngene; ig++) {\r
1021       com.posG[ig] = l = u = ip = com.npatt;      \r
1022       for(h=0; h<com.ls; h++) {\r
1023          if(com.pose[h] != ig) continue;\r
1024          if(debug) printf("\nh %3d %s", h, zt+h*lpatt);\r
1025 \r
1026          /* bsearch in existing patterns.  Knuth 1998 Vol3 Ed2 p.410 \r
1027             ip is the loc for match or insertion.  [l,u] is the search interval.\r
1028          */\r
1029          same = 0;\r
1030          if(com.lgene[ig]++ != 0) {  /* not 1st pattern? */\r
1031             for(l=com.posG[ig], u=com.npatt-1; ; ) {\r
1032                if(u<l) break;\r
1033                ip = (l+u)/2;\r
1034                k = strcmp(zt+h*lpatt, zt+p2s[ip]*lpatt);\r
1035                if(k<0)        u = ip - 1;\r
1036                else if(k>0)   l = ip + 1;\r
1037                else         { same = 1;  break; }\r
1038             }\r
1039          }\r
1040          if(!same) {\r
1041             if(com.npatt>maxnpatt) \r
1042                error2("npatt > maxnpatt");\r
1043             if(l > ip) ip++;        /* last comparison in bsearch had k > 0. */\r
1044             /* Insert new pattern at ip.  This is the expensive step. */\r
1045 \r
1046             if(ip<com.npatt)\r
1047                memmove(p2s+ip+1, p2s+ip, (com.npatt-ip)*sizeof(int));\r
1048 \r
1049             /*\r
1050             for(j=com.npatt; j>ip; j--) \r
1051                p2s[j] = p2s[j-1];\r
1052             */\r
1053             p2s[ip] = h;\r
1054             com.npatt ++;\r
1055          }\r
1056 \r
1057          if(debug) {\r
1058             printf(": %3d (%c ilu %3d%3d%3d) ", com.npatt, DS[same], ip, l, u);\r
1059             for(j=0; j<com.npatt; j++)\r
1060                printf(" %s", zt+p2s[j]*lpatt);\r
1061          }\r
1062          if(noisy && ((h+1)%10000==0 || h+1==com.ls))\r
1063             printf("\r%12d patterns at %8d / %8d sites (%.1f%%), %s", \r
1064                com.npatt, h+1, com.ls, (h+1.)*100/com.ls, printtime(timestr));\r
1065 \r
1066       }     /* for (h)  */\r
1067    }        /* for (ig) */\r
1068    if(noisy) FPN(F0);\r
1069 \r
1070    /* (B) count pattern frequencies and collect pose[] */\r
1071    com.posG[com.ngene] = com.npatt;\r
1072    for(j=0; j<com.ngene; j++) \r
1073       if(com.lgene[j]==0) \r
1074          error2("some gene labels are missing");\r
1075    for(j=1; j<com.ngene; j++) \r
1076       com.lgene[j] += com.lgene[j-1];\r
1077 \r
1078    com.fpatt = (double*)realloc(com.fpatt, com.npatt*sizeof(double));\r
1079    poset = (int*)malloc(com.ls*sizeof(int));\r
1080    if(com.fpatt==NULL || poset==NULL) error2("oom poset");\r
1081    for(ip=0; ip<com.npatt; ip++) com.fpatt[ip] = 0;\r
1082 \r
1083    for(ig=0; ig<com.ngene; ig++) {\r
1084       for(h=0; h<com.ls; h++) {\r
1085          if(com.pose[h] != ig) continue;\r
1086          for(same=0, l=com.posG[ig], u=com.posG[ig+1]-1; ; ) {\r
1087             if(u<l) break;\r
1088             ip = (l+u)/2;\r
1089             k = strcmp(zt+h*lpatt, zt+p2s[ip]*lpatt);\r
1090             if(k<0)        u = ip - 1;\r
1091             else if(k>0)   l = ip + 1;\r
1092             else         { same = 1;  break; }\r
1093          }\r
1094          if(!same)\r
1095             error2("ghost pattern?");\r
1096          com.fpatt[ip]++;\r
1097          poset[h] = ip;\r
1098       }     /* for (h)  */\r
1099    }        /* for (ig) */\r
1100 \r
1101    if(com.seqtype==CODONseq && com.ngene==3 &&com.lgene[0]==com.ls/3) {\r
1102       puts("\nCheck option G in data file? (Enter)\n");\r
1103    }\r
1104 \r
1105    for(j=0; j<com.ns; j++) {\r
1106       com.z[j] = (unsigned char*)malloc(com.npatt*n31*sizeof(char));\r
1107       for(ip=0,p=com.z[j]; ip<com.npatt; ip++) \r
1108          for(k=0; k<n31; k++)\r
1109             *p++ = zt[p2s[ip]*lpatt + j*n31 + k];\r
1110    }\r
1111    memcpy(com.pose, poset, com.ls*sizeof(int));\r
1112    free(poset);  free(p2s);  free(zt);\r
1113 \r
1114    return (0);\r
1115 }\r
1116 \r
1117 \r
1118 void AddFreqSeqGene(int js,int ig,double pi0[],double pi[]);\r
1119 \r
1120 \r
1121 void Chi2FreqHomo(double f[], int ns, int nc, double X2G[2])\r
1122 {\r
1123 /* This calculates a chi-square like statistic for testing that the base \r
1124    or amino acid frequencies are identical among sequences.\r
1125    f[ns*nc] where ns is #sequences (rows) and nc is #states (columns).\r
1126 */\r
1127    int i, j;\r
1128    double mf[64]={0}, small=1e-50;\r
1129 \r
1130    X2G[0]=X2G[1]=0;\r
1131    for(i=0; i<ns; i++) \r
1132       for(j=0; j<nc; j++) \r
1133          mf[j]+=f[i*nc+j]/ns;\r
1134 \r
1135    for(i=0; i<ns; i++) {\r
1136       for(j=0; j<nc; j++) {\r
1137          if(mf[j]>small) {\r
1138             X2G[0] += square(f[i*nc+j]-mf[j])/mf[j];\r
1139             if(f[i*nc+j])\r
1140                X2G[1] += 2*f[i*nc+j]*log(f[i*nc+j]/mf[j]);\r
1141          }\r
1142       }\r
1143    }\r
1144 }\r
1145 \r
1146 int InitializeBaseAA (FILE *fout)\r
1147 {\r
1148 /* Count site patterns (com.fpatt) and calculate base or amino acid frequencies\r
1149    in genes and species.  This works on raw (uncoded) data.  \r
1150    Ambiguity characters in sequences are resolved by iteration. \r
1151    For frequencies in each species, they are resolved within that sequence.\r
1152    For average base frequencies among species, they are resolved over all \r
1153    species.\r
1154 \r
1155    This routine is called by baseml and aaml.  codonml uses another\r
1156    routine InitializeCodon()\r
1157 */\r
1158    char *pch = (com.seqtype==0 ? BASEs : (com.seqtype==2 ? AAs : (com.seqtype==5 ? BASEs5: BINs)));\r
1159    char indel[]="-?";\r
1160    int wname=30, h,js,k, ig, nconstp, n=com.ncode;\r
1161    int irf, nrf=20;\r
1162    double pi0[20], t,lmax=0, X2G[2], *pisg;  /* freq for species & gene, for X2 & G */\r
1163 \r
1164    if(noisy) printf("Counting frequencies..");\r
1165    if(fout)  fprintf(fout,"\nFrequencies..");\r
1166    if((pisg=(double*)malloc(com.ns*n*sizeof(double))) == NULL)\r
1167       error2("oom pisg");\r
1168    for(h=0,nconstp=0; h<com.npatt; h++) {\r
1169       for (js=1; js<com.ns; js++)\r
1170          if(com.z[js][h] != com.z[0][h])  break;\r
1171       if (js==com.ns && com.z[0][h]!=indel[0] && com.z[0][h]!=indel[1])\r
1172          nconstp += (int)com.fpatt[h];\r
1173    }\r
1174    for (ig=0,zero(com.pi,n); ig<com.ngene; ig++) {\r
1175       if (com.ngene>1)\r
1176          fprintf (fout,"\n\nGene %2d (len %4d)", ig+1, com.lgene[ig]-(ig==0?0:com.lgene[ig-1]));\r
1177       fprintf(fout,"\n%*s", wname, "");\r
1178       for(k=0; k<n; k++) fprintf(fout,"%7c", pch[k]);\r
1179 \r
1180       /* The following block calculates freqs in each species for each gene.  \r
1181          Ambiguities are resolved in each species.  com.pi and com.piG are \r
1182          used for output only, and are not be used later with missing data.\r
1183       */\r
1184       zero(com.piG[ig], n);\r
1185       zero(pisg, com.ns*n);\r
1186       for(js=0; js<com.ns; js++) {\r
1187          fillxc(pi0, 1.0/n, n);\r
1188          for(irf=0; irf<nrf; irf++) {\r
1189             zero(com.pi, n);\r
1190             AddFreqSeqGene(js, ig, pi0, com.pi);\r
1191             t = sum(com.pi, n);\r
1192             if(t<1e-10) {\r
1193                printf("Some sequences are empty.\n");\r
1194                fillxc(com.pi, 1.0/n, n);\r
1195             }\r
1196             else \r
1197                abyx(1/t, com.pi, n);\r
1198             if(com.cleandata || com.cleandata || (t=distance(com.pi,pi0,n))<1e-8)\r
1199                break;\r
1200             xtoy(com.pi, pi0, n);\r
1201          }   /* for(irf) */\r
1202          fprintf(fout,"\n%-*s", wname, com.spname[js]);\r
1203          for(k=0; k<n; k++) fprintf(fout, "%8.5f", com.pi[k]);\r
1204          if(com.ncode==4 && com.ngene==1) fprintf(fout, " GC = %5.3f", com.pi[1]+com.pi[3]);\r
1205          for(k=0; k<n; k++) com.piG[ig][k] += com.pi[k]/com.ns;\r
1206          xtoy(com.pi, pisg+js*n, n);\r
1207       }    /* for(js,ns) */\r
1208       if(com.ngene>1) {\r
1209          fprintf(fout,"\n\n%-*s", wname, "Mean");\r
1210          for(k=0; k<n; k++) fprintf(fout, "%7.4f", com.piG[ig][k]);\r
1211       }\r
1212 \r
1213       Chi2FreqHomo(pisg, com.ns, n, X2G);\r
1214 \r
1215       fprintf(fout,"\n\nHomogeneity statistic: X2 = %.5f G = %.5f ",X2G[0], X2G[1]);\r
1216 \r
1217       /* fprintf(frst1,"\t%.5f", X2G[1]); */\r
1218 \r
1219    }  /* for(ig) */\r
1220    if(noisy) printf("\n");\r
1221 \r
1222    /* If there are missing data, the following block calculates freqs \r
1223       in each gene (com.piG[]), as well as com.pi[] for the entire sequence.  \r
1224       Ambiguities are resolved over entire data sets across species (within \r
1225       each gene for com.piG[]).  These are used in ML calculation later.\r
1226    */\r
1227    if(com.cleandata) {\r
1228       for (ig=0,zero(com.pi,n); ig<com.ngene; ig++) {\r
1229          t = (ig==0 ? com.lgene[0] : com.lgene[ig]-com.lgene[ig-1])/(double)com.ls;\r
1230          for(k=0; k<n; k++)  com.pi[k] += com.piG[ig][k]*t;\r
1231       }\r
1232    }\r
1233    else {\r
1234       for (ig=0; ig<com.ngene; ig++) { \r
1235          xtoy(com.piG[ig], pi0, n);\r
1236          for(irf=0; irf<nrf; irf++) {  /* com.piG[] */\r
1237             zero(com.piG[ig], n);\r
1238             for(js=0; js<com.ns; js++)\r
1239                AddFreqSeqGene(js, ig, pi0, com.piG[ig]);\r
1240             t = sum(com.piG[ig], n);\r
1241             if(t<1e-10) \r
1242                puts("empty sequences?");\r
1243             abyx(1/t, com.piG[ig], n);\r
1244             if(distance(com.piG[ig], pi0, n)<1e-8) break;\r
1245             xtoy(com.piG[ig], pi0, n);\r
1246          }         /* for(irf) */\r
1247       }            /* for(ig) */\r
1248       zero(pi0, n);\r
1249       for(k=0; k<n; k++) for(ig=0; ig<com.ngene; ig++) \r
1250          pi0[k] += com.piG[ig][k]/com.ngene;\r
1251       for(irf=0; irf<nrf; irf++) {  /* com.pi[] */\r
1252          zero(com.pi,n);\r
1253          for(ig=0; ig<com.ngene; ig++)  for(js=0; js<com.ns; js++)\r
1254             AddFreqSeqGene(js, ig, pi0, com.pi);\r
1255          abyx(1/sum(com.pi,n), com.pi, n);\r
1256          if(distance(com.pi, pi0, n)<1e-8) break;\r
1257          xtoy(com.pi, pi0, n);\r
1258       }            /* for(ig) */\r
1259    }\r
1260    fprintf (fout, "\n\n%-*s", wname, "Average");\r
1261    for(k=0; k<n; k++) fprintf(fout,"%8.5f", com.pi[k]);\r
1262    if(!com.cleandata) fputs("\n(Ambiguity characters are used to calculate freqs.)\n",fout);\r
1263 \r
1264    fprintf (fout,"\n\n# constant sites: %6d (%.2f%%)",\r
1265             nconstp, (double)nconstp*100./com.ls);\r
1266 \r
1267    if (com.model==0 || (com.seqtype==BASEseq && com.model==1)) {\r
1268       fillxc(com.pi, 1./n, n);\r
1269       for(ig=0; ig<com.ngene; ig++)\r
1270          xtoy(com.pi, com.piG[ig], n);\r
1271    }\r
1272    if (com.seqtype==BASEseq && com.model==5) { /* T92 model */\r
1273       com.pi[0] = com.pi[2] = (com.pi[0] + com.pi[2])/2;\r
1274       com.pi[1] = com.pi[3] = (com.pi[1] + com.pi[3])/2;\r
1275       for(ig=0; ig<com.ngene; ig++) {\r
1276          com.piG[ig][0] = com.piG[ig][2] = (com.piG[ig][0] + com.piG[ig][2])/2;\r
1277          com.piG[ig][1] = com.piG[ig][3] = (com.piG[ig][1] + com.piG[ig][3])/2;\r
1278       }\r
1279    }\r
1280 \r
1281    /* this is used only for REV & REVu in baseml and model==3 in aaml */\r
1282    if(com.seqtype==AAseq) {\r
1283       for (k=0,t=0; k<n; k++)  t += (com.pi[k]>0);\r
1284       if (t<=4)\r
1285          puts("\n\a\t\tAre these a.a. sequences?");\r
1286    }\r
1287    if(com.cleandata && com.ngene==1) {\r
1288       for(h=0,lmax=-(double)com.ls*log((double)com.ls); h<com.npatt; h++)\r
1289          if(com.fpatt[h]>1) lmax += com.fpatt[h]*log((double)com.fpatt[h]);\r
1290    }\r
1291    if(fout) {\r
1292       if(lmax) fprintf(fout, "\nln Lmax (unconstrained) = %.6f\n", lmax);\r
1293       fflush(fout);\r
1294    }\r
1295 \r
1296    free(pisg);\r
1297    return(0);\r
1298 }\r
1299 \r
1300 \r
1301 void AddFreqSeqGene(int js, int ig, double pi0[], double pi[])\r
1302 {\r
1303 /* This adds the character counts in sequence js in gene ig to pi, \r
1304    using pi0, by resolving ambiguities.  The data are coded.  com.cleandata==1 or 0.\r
1305    This is for nucleotide and amino acid sequences only.\r
1306 */\r
1307    char *pch = (com.seqtype==0 ? BASEs : (com.seqtype==2 ? AAs : (com.seqtype==5 ? BASEs5: BINs)));\r
1308    int k, h, b, n=com.ncode;\r
1309    double t;\r
1310 \r
1311    if(com.cleandata) {\r
1312       for(h=com.posG[ig]; h<com.posG[ig+1]; h++) \r
1313          pi[com.z[js][h]] += com.fpatt[h];\r
1314    }\r
1315    else {\r
1316       for(h=com.posG[ig]; h<com.posG[ig+1]; h++) {\r
1317          b = com.z[js][h];\r
1318          if(b<n)\r
1319             pi[b] += com.fpatt[h];\r
1320          else {\r
1321             /*\r
1322             if(com.seqtype==BASEseq) {\r
1323                NucListall(BASEs[b], &nb, ib);\r
1324                for(k=0,t=0; k<nb; k++) t += pi0[ib[k]];\r
1325                for(k=0; k<nb; k++) \r
1326                   pi[ib[k]] += pi0[ib[k]]/t * com.fpatt[h];\r
1327             }\r
1328             */\r
1329             if(com.seqtype==BASEseq) {\r
1330                for(k=0,t=0; k<nChara[b]; k++) \r
1331                   t += pi0[CharaMap[b][k]];\r
1332                for(k=0; k<nChara[b]; k++) \r
1333                   pi[CharaMap[b][k]] += pi0[CharaMap[b][k]]/t * com.fpatt[h];\r
1334             }\r
1335             else if(com.seqtype==AAseq)  /* unrecognized AAs are treated as "?". */\r
1336                for(k=0; k<n; k++) pi[k] += pi0[k]*com.fpatt[h];\r
1337          }\r
1338       }\r
1339    }\r
1340 }\r
1341 \r
1342 \r
1343 int RemoveIndel(void)\r
1344 {\r
1345 /* Remove ambiguity characters and indels in the untranformed sequences, \r
1346    Changing com.ls and com.pose[] (site marks for multiple genes).\r
1347    For codonml, com.ls is still 3*#codons\r
1348    Called at the end of ReadSeq, when com.pose[] are still site marks.\r
1349    All characters in com.z[][] not found in the character string pch are\r
1350    considered ambiguity characters and are removed.\r
1351 */\r
1352    int  n=com.ncode, h,k, j,js,lnew,nindel, n31=1;\r
1353    char b, *miss;  /* miss[h]=1 if site (codon) h is missing, 0 otherwise */\r
1354    char *pch=((com.seqtype<=1||com.seqtype==CODON2AAseq)?BASEs:(com.seqtype==2?AAs: (com.seqtype==5?BASEs5:BINs)));\r
1355 \r
1356    if(com.seqtype==CODONseq || com.seqtype==CODON2AAseq) {\r
1357       n31=3; n=4;\r
1358    }\r
1359 \r
1360    if (com.ls%n31) error2("ls in RemoveIndel.");\r
1361    if((miss=(char*)malloc(com.ls/n31 *sizeof(char)))==NULL)\r
1362       error2("oom miss");\r
1363    for(h=0; h<com.ls/n31; h++)\r
1364       miss[h] = 0;\r
1365    for (js=0; js<com.ns; js++) {\r
1366       for (h=0,nindel=0; h<com.ls/n31; h++) {\r
1367          for (k=0; k<n31; k++) {\r
1368             b = (char)toupper(com.z[js][h*n31+k]);\r
1369             for(j=0; j<n; j++) \r
1370                if(b==pch[j]) break;\r
1371             if(j==n) {\r
1372                miss[h]=1; nindel++; \r
1373             }\r
1374          }\r
1375       }\r
1376       if (noisy>2 && nindel) \r
1377          printf("\n%6d ambiguity characters in seq. %d", nindel,js+1);\r
1378    }\r
1379    if(noisy>2) {\r
1380       for(h=0,k=0; h<com.ls/n31; h++)  if(miss[h]) k++;\r
1381       printf("\n%d sites are removed. ", k);\r
1382       if(k<1000)\r
1383          for(h=0; h<com.ls/n31; h++)  if(miss[h]) printf(" %2d", h+1);\r
1384    }\r
1385 \r
1386    for (h=0,lnew=0; h<com.ls/n31; h++)  {\r
1387       if(miss[h]) continue;\r
1388       for (js=0; js<com.ns; js++) {\r
1389          for (k=0; k<n31; k++)\r
1390             com.z[js][lnew*n31+k]=com.z[js][h*n31+k];\r
1391       }\r
1392       com.pose[lnew]=com.pose[h];\r
1393       lnew++;\r
1394    }\r
1395    com.ls=lnew*n31;\r
1396    free(miss);\r
1397    return (0);\r
1398 }\r
1399 \r
1400 \r
1401 \r
1402 int MPInformSites (void)\r
1403 {\r
1404 /* Outputs parsimony informative and noninformative sites into \r
1405    two files named MPinf.seq and MPninf.seq\r
1406    Uses transformed sequences.  \r
1407    Not used for a long time.  Does not work if com.pose is NULL.  \r
1408 */\r
1409    char *imark;\r
1410    char *pch=(com.seqtype==0 ? BASEs : (com.seqtype==2 ? AAs: (com.seqtype==5?BASEs5:BINs)));\r
1411    int h, i, markb[NS], inf, lsinf;\r
1412    FILE *finf, *fninf;\r
1413 \r
1414 puts("\nMPInformSites: missing data not dealt with yet?\n");\r
1415 \r
1416    finf=fopen("MPinf.seq","w");\r
1417    fninf=fopen("MPninf.seq","w");\r
1418    if (finf==NULL || fninf==NULL) error2("MPInformSites: file creation error");\r
1419 \r
1420    puts ("\nSorting parsimony-informative sites: MPinf.seq & MPninf.seq");\r
1421    if ((imark=(char*)malloc(com.ls*sizeof(char)))==NULL) error2("oom imark");\r
1422    for (h=0,lsinf=0; h<com.ls; h++) {\r
1423       for (i=0; i<com.ns; i++) markb[i]=0;\r
1424       for (i=0; i<com.ns; i++) markb[(int)com.z[i][com.pose[h]]]++;\r
1425 \r
1426       for (i=0,inf=0; i<com.ncode; i++)  if (markb[i]>=2)  inf++;\r
1427       if (inf>=2) { imark[h]=1; lsinf++; }\r
1428       else imark[h]=0;\r
1429    }\r
1430    fprintf (finf, "%6d%6d\n", com.ns, lsinf);\r
1431    fprintf (fninf, "%6d%6d\n", com.ns, com.ls-lsinf);\r
1432    for (i=0; i<com.ns; i++) {\r
1433       fprintf (finf, "\n%s\n", com.spname[i]);\r
1434       fprintf (fninf, "\n%s\n", com.spname[i]);\r
1435       for (h=0; h<com.ls; h++)\r
1436          fprintf ((imark[h]?finf:fninf), "%c", pch[(int)com.z[i][com.pose[h]]]);\r
1437       FPN (finf); FPN(fninf);\r
1438    }\r
1439    free (imark);\r
1440    fclose(finf);  fclose(fninf);\r
1441    return (0);\r
1442 }\r
1443 \r
1444 \r
1445 int PatternWeightJC69like (FILE *fout)\r
1446 {\r
1447 /* This collaps site patterns further for JC69-like models, called after\r
1448    PatternWeight().  This is used for JC and poisson amino acid models. \r
1449    The routine could be merged into PatternWeight(), which should lead to \r
1450    faster computation, but this is not done because right now \r
1451    InitializeBaseAA() prints out base or amino acid frequencies after \r
1452    PatternWeight() and before this routine.  \r
1453    \r
1454    If the data have no ambiguities (com.cleanddata=1), the routine recodes \r
1455    the data, for example, changing data at a site 1120 (CCAT) into 0012 \r
1456    (TTCA) before checking against old patterns already found.  If the data \r
1457    contain ambiguities, they are not encoded.  In that case, for every \r
1458    site, the routine changes ? or N into - first.  It then checks whether there \r
1459    are any other ambibiguities and will recode if and only if there are not \r
1460    any other ambiguities.  For example, a site with data CC?T will be \r
1461    changed into CC-T first and then recoded into TT-C and checked against \r
1462    old patterns found.  A site with data CCRT will not be recoded.  In theory \r
1463    such sites may be packed as well, but perhaps the effort is not worthwhile.  \r
1464    The routine checks data like CCRT against old patterns already found, \r
1465 \r
1466    If com.pose is not NULL, the routine also updates com.pose.  This allows \r
1467    the program to work if com.readpattern==1.\r
1468 */\r
1469    char zh[NS], b, gap;\r
1470    char *pch=(com.seqtype==0 ? BASEs : (com.seqtype==2 ? AAs: (com.seqtype==5?BASEs5:BINs)));\r
1471    int npatt0=com.npatt, h, ht, j,k, same=0, ig, recode;\r
1472 \r
1473    if(com.seqtype==1) \r
1474       error2("PatternWeightJC69like does not work for codon seqs");\r
1475    if(noisy) printf("Counting site patterns again, for JC69.\n");\r
1476    gap = (char) (strchr(pch, (int)'-') - pch);\r
1477    for (h=0,com.npatt=0,ig=-1; h<npatt0; h++) {\r
1478       if (ig<com.ngene-1 && h==com.posG[ig+1])\r
1479          com.posG[++ig] = com.npatt; \r
1480 \r
1481       if(com.cleandata) { /* clean data, always recode */\r
1482          zh[0] = b = 0; \r
1483          b++;\r
1484          for (j=1; j<com.ns; j++) {\r
1485             for(k=0; k<j; k++) \r
1486                if (com.z[j][h]==com.z[k][h]) break;\r
1487             zh[j] = (k<j ? zh[k] : b++);\r
1488          }\r
1489       }\r
1490       else { /* recode only if there are no non-gap ambiguity characters */\r
1491          for(j=0; j<com.ns; j++)\r
1492             zh[j] = com.z[j][h];\r
1493 \r
1494          /* After this loop, recode = 0 or 1 decides whether to recode. */\r
1495          for (j=0,recode=1; j<com.ns; j++) {\r
1496             if (zh[j] < com.ncode) \r
1497                continue;\r
1498             if (nChara[zh[j]] == com.ncode) {\r
1499                zh[j] = gap;\r
1500                continue;\r
1501             }\r
1502             recode = 0; \r
1503             break;\r
1504          }\r
1505          if(recode) {\r
1506             b = 0;\r
1507             if(zh[0] != gap) \r
1508                zh[0] = b++;\r
1509             for (j=1; j<com.ns; j++) {\r
1510                if(zh[j] != gap) {\r
1511                   for(k=0; k<j; k++)\r
1512                      if (zh[j] == com.z[k][h]) break;\r
1513                   if(k<j) zh[j] = zh[k];\r
1514                   else    zh[j] = b++;\r
1515                }\r
1516             }\r
1517          }\r
1518       }\r
1519 \r
1520       for (ht=com.posG[ig],same=0; ht<com.npatt; ht++) {\r
1521          for (j=0,same=1; j<com.ns; j++)\r
1522             if (zh[j]!=com.z[j][ht]) {\r
1523                same = 0;  break; \r
1524             }\r
1525          if (same) break; \r
1526       }\r
1527       if (same)\r
1528          com.fpatt[ht] += com.fpatt[h];\r
1529       else {\r
1530          for(j=0; j<com.ns; j++) com.z[j][com.npatt] = zh[j];\r
1531          com.fpatt[com.npatt++] = com.fpatt[h];\r
1532       }\r
1533       if(com.pose) \r
1534          for(k=0; k<com.ls; k++) \r
1535             if(com.pose[k]==h) com.pose[k] = ht;\r
1536    }     /* for (h)   */\r
1537    com.posG[com.ngene] = com.npatt;\r
1538    if (noisy) printf ("new no. site patterns:%7d\n", com.npatt);\r
1539 \r
1540    if(fout) {\r
1541       fprintf(fout, "\n\nPrinting out site pattern counts\n");\r
1542       printPatterns(fout);\r
1543    }\r
1544    return (0);\r
1545 }\r
1546 \r
1547 int Site2Pattern (FILE *fout)\r
1548 {\r
1549    int h;\r
1550    fprintf(fout,"\n\nMapping site to pattern (i.e. site %d has pattern %d):\n",\r
1551       com.ls-1, com.pose[com.ls-2]+1);\r
1552    FOR (h, com.ls) {\r
1553       fprintf (fout, "%6d", com.pose[h]+1);\r
1554       if ((h+1)%10==0) FPN (fout);\r
1555    }\r
1556    FPN (fout);\r
1557    return (0);\r
1558 }\r
1559 \r
1560 \r
1561 #endif\r
1562 \r
1563 \r
1564 \r
1565 int print1seq (FILE*fout, char *z, int ls, int pose[])\r
1566 {\r
1567 /* This prints out one sequence, and the sequences are encoded.  \r
1568    z[] contains patterns if (pose!=NULL)\r
1569    This uses com.seqtype.\r
1570 */\r
1571    int h, hp, gap=10;\r
1572    char *pch=(com.seqtype==0 ? BASEs : (com.seqtype==2 ? AAs: (com.seqtype==5?BASEs5:BINs)));\r
1573    char str[4]="";\r
1574    int nb = (com.seqtype==CODONseq?3:1);\r
1575 \r
1576    for(h=0; h<ls; h++) {\r
1577       hp = (pose ? pose[h] : h);\r
1578       if(com.seqtype != CODONseq) {\r
1579          fprintf(fout, "%c", pch[(int)z[hp]]);\r
1580          if((h+1)%gap==0) fputc(' ', fout);\r
1581       }\r
1582       else\r
1583          fprintf(fout, "%s ", CODONs[z[hp]]);\r
1584    }\r
1585    return(0);\r
1586 }\r
1587 \r
1588 void printSeqs (FILE *fout, int *pose, char keep[], int format)\r
1589 {\r
1590 /* Print sequences into fout, using paml (format=0 or 1) or paup (format=2) \r
1591    formats.\r
1592    Use pose=NULL if called before site patterns are collapsed.  \r
1593    keep[] marks the sequences to be printed.  Use NULL for keep if all sequences \r
1594    are to be printed.\r
1595    Sequences may (com.cleandata==1) and may not (com.cleandata=0) be coded.\r
1596    com.z[] has site patterns if pose!=NULL.\r
1597    This uses com.seqtype, and com.ls is the number of codons for codon seqs.\r
1598    See notes in print1seq()\r
1599 \r
1600    format = 0,1: PAML sites or patterns\r
1601             2:   PAUP Nexus format.\r
1602 \r
1603    This is used by evolver.  Check and merge with printsma().\r
1604 \r
1605 */\r
1606    int h, j, ls1, n31=(com.seqtype==1?3:1), nskept=com.ns, wname=30;\r
1607    char *dt=(com.seqtype==AAseq?"protein":"dna");\r
1608 \r
1609    ls1 = (format==1 ? com.npatt : com.ls);\r
1610    if(keep) \r
1611       for(j=0; j<com.ns; j++) nskept -= !keep[j];\r
1612    if(format==0 || format==1)\r
1613       fprintf(fout, "\n\n%6d %7d %s\n\n", nskept, ls1*n31, (format==1?" P":""));\r
1614    else if(format==2) {  /* NEXUS format */\r
1615       fprintf(fout,"\nbegin data;\n");\r
1616       fprintf(fout,"   dimensions ntax=%d nchar=%d;\n", nskept, ls1*n31);\r
1617       fprintf(fout,"   format datatype=%s missing=? gap=-;\n   matrix\n",dt);\r
1618    }\r
1619 \r
1620    for(j=0; j<com.ns; j++,FPN(fout)) {\r
1621       if(keep && !keep[j]) continue;\r
1622       fprintf(fout,"%s%-*s  ", (format==2?"      ":""), wname, com.spname[j]);\r
1623       print1seq(fout, com.z[j], (format==1?com.npatt:com.ls), pose);\r
1624    }\r
1625    if(format==2) fprintf(fout, "   ;\nend;");\r
1626    else if (format==1) {\r
1627        for(h=0,FPN(fout); h<com.npatt; h++) {\r
1628          /* fprintf(fout," %12.8f", com.fpatt[h]/(double)com.ls); */\r
1629          fprintf(fout," %4.0f", com.fpatt[h]);\r
1630          if((h+1)%15==0) FPN(fout);\r
1631       }\r
1632    }\r
1633 \r
1634    fprintf(fout,"\n\n");\r
1635    fflush(fout);\r
1636 }\r
1637 \r
1638 #define gammap(x,alpha) (alpha*(1-pow(x,-1.0/alpha)))\r
1639 /* DistanceREV () used to be here, moved to pamp. \r
1640 */\r
1641 \r
1642 #if (defined BASEML || defined BASEMLG || defined MCMCTREE || defined PROBTREE || defined YULETREE) \r
1643 \r
1644 double SeqDivergence (double x[], int model, double alpha, double *kappa)\r
1645 {\r
1646 /* alpha=0 if no gamma \r
1647    return -1 if in error.\r
1648    Check DistanceF84() if variances are wanted.\r
1649 */\r
1650    int i,j;\r
1651    double p[4], Y,R, a1,a2,b, P1,P2,Q,fd,tc,ag, GC;\r
1652    double small=1e-10/com.ls,largek=999, larged=9;\r
1653 \r
1654    if (testXMat(x)) {\r
1655       matout(F0, x, 4, 4);\r
1656       printf("\nfrequency matrix error, setting distance to large d");\r
1657       return(larged);\r
1658    }\r
1659    for (i=0,fd=1,zero(p,4); i<4; i++) {\r
1660       fd -= x[i*4+i];\r
1661       FOR (j,4) { p[i]+=x[i*4+j]/2;  p[j]+=x[i*4+j]/2; }\r
1662    }\r
1663    P1 = x[0*4+1]+x[1*4+0];\r
1664    P2 = x[2*4+3]+x[3*4+2];\r
1665    Q = x[0*4+2]+x[0*4+3]+x[1*4+2]+x[1*4+3]+ x[2*4+0]+x[2*4+1]+x[3*4+0]+x[3*4+1];\r
1666    if(fd<small) \r
1667       return(0);\r
1668    if(P1<small) P1=0; \r
1669    if(P2<small) P2=0; \r
1670    if(Q<small) Q=0;\r
1671    Y=p[0]+p[1];    R=p[2]+p[3];  tc=p[0]*p[1]; ag=p[2]*p[3];\r
1672 \r
1673    switch (model) {\r
1674    case (JC69):\r
1675       FOR (i,4) p[i]=.25;\r
1676    case (F81):\r
1677       for (i=0,b=0; i<4; i++)  b += p[i]*(1-p[i]);\r
1678       if (1-fd/b<=0) return (larged);\r
1679 \r
1680       if (alpha<=0) return (-b*log (1-fd/b));\r
1681       else return  (-b*gammap(1-fd/b,alpha));\r
1682    case (K80) :\r
1683 /*\r
1684       printf("\nP Q = %.6f %.6f\n", P1+P2,Q);\r
1685       printf("\nP1 P2 Q = %.6f %.6f %.6f\n", P1,P2,Q);\r
1686 */\r
1687       a1=1-2*(P1+P2)-Q;   b=1-2*Q;\r
1688 /*      if (a1<=0 || b<=0) return (-1); */\r
1689       if (a1<=0 || b<=0) return (larged);\r
1690       if (alpha<=0)  { a1=-log(a1);  b=-log(b); }\r
1691       else          { a1=-gammap(a1,alpha); b=-gammap(b,alpha); }\r
1692       a1=.5*a1-.25*b;  b=.25*b;\r
1693       if(b>small) *kappa = a1/b; else *kappa=largek;\r
1694       return (a1+2*b);\r
1695    case (F84):\r
1696       if(Y<small || R<small)\r
1697          error2("Y or R = 0.");\r
1698 \r
1699       a1=(2*(tc+ag)+2*(tc*R/Y+ag*Y/R)*(1-Q/(2*Y*R)) -P1-P2) / (2*tc/Y+2*ag/R);\r
1700       b = 1 - Q/(2*Y*R);\r
1701 /*      if (a1<=0 || b<=0) return (-1); */\r
1702       if (a1<=0 || b<=0) return (larged);\r
1703       if (alpha<=0) { a1=-log(a1); b=-log(b); }\r
1704       else          { a1=-gammap(a1,alpha); b=-gammap(b,alpha); }\r
1705       a1=.5*a1;  b=.5*b;\r
1706       *kappa = a1/b-1;\r
1707       *kappa = max2(*kappa, -.5);\r
1708       return  4*b*(tc*(1+ *kappa/Y)+ag*(1+ *kappa/R)+Y*R);\r
1709    case (HKY85):         /* HKY85, from Rzhetsky & Nei (1995 MBE 12, 131-51) */\r
1710       if(Y<small || R<small)\r
1711          error2("Y or R = 0.");\r
1712 \r
1713       *kappa = largek;\r
1714       a1=1-Y*P1/(2*tc)-Q/(2*Y);\r
1715       a2=1-R*P2/(2*ag)-Q/(2*R);\r
1716       b=1-Q/(2*Y*R);\r
1717       if (a1<=0 || a2<=0 || b<=0) return (larged);\r
1718       if (alpha<=0) { a1=-log(a1); a2=-log(a2); b=-log(b); }\r
1719       else   { a1=-gammap(a1,alpha); a2=-gammap(a2,alpha); b=-gammap(b,alpha);}\r
1720       a1 = -R/Y*b + a1/Y;\r
1721       a2 = -Y/R*b + a2/R;\r
1722       if (b>0) *kappa = min2((a1+a2)/(2*b), largek);\r
1723       return 2*(p[0]*p[1] + p[2]*p[3])*(a1+a2)/2 + 2*Y*R*b;\r
1724    case (T92):\r
1725       *kappa = largek;\r
1726       GC=p[1]+p[3];\r
1727       a1 = 1 - Q - (P1+P2)/(2*GC*(1-GC));   b=1-2*Q;\r
1728       if (a1<=0 || b<=0) return (larged);\r
1729       if (alpha<=0) { a1=-log(a1); b=-log(b); }\r
1730       else   { a1=-gammap(a1,alpha); b=-gammap(b,alpha);}\r
1731       if(Q>0) *kappa = 2*a1/b-1;\r
1732       return 2*GC*(1-GC)*a1 + (1-2*GC*(1-GC))/2*b;\r
1733    case (TN93):         /* TN93  */\r
1734       if(Y<small || R<small)\r
1735          error2("Y or R = 0.");\r
1736       a1=1-Y*P1/(2*tc)-Q/(2*Y);  \r
1737       a2=1-R*P2/(2*ag)-Q/(2*R);\r
1738       b=1-Q/(2*Y*R);\r
1739 /*      if (a1<=0 || a2<=0 || b<=0) return (-1); */\r
1740       if (a1<=0 || a2<=0 || b<=0) return (larged);\r
1741       if (alpha<=0) { a1=-log(a1); a2=-log(a2); b=-log(b); }\r
1742       else   { a1=-gammap(a1,alpha); a2=-gammap(a2,alpha); b=-gammap(b,alpha);}\r
1743       a1=.5/Y*(a1-R*b);  a2=.5/R*(a2-Y*b);  b=.5*b;\r
1744       *kappa = largek;\r
1745 /*\r
1746       printf("\nk1&k2 = %.6f %.6f\n", a1/b,a2/b);\r
1747 */\r
1748       if (b>0) *kappa = min2((a1+a2)/(2*b), largek);\r
1749       return 4*p[0]*p[1]*a1 + 4*p[2]*p[3]*a2 + 4*Y*R*b;\r
1750    }\r
1751    return (-1);\r
1752 }\r
1753 \r
1754 \r
1755 double DistanceIJ (int is, int js, int model, double alpha, double *kappa)\r
1756 {\r
1757 /* Distance between sequences is and js.\r
1758    See DistanceMatNuc() for more details.\r
1759 */\r
1760    char b0,b1;\r
1761    int h, n=4, missing=0;\r
1762    double x[16], sumx, larged=9;\r
1763 \r
1764    zero(x, 16);\r
1765    if(com.cleandata && com.seqtype==0) {\r
1766       for (h=0; h<com.npatt; h++)\r
1767          x[com.z[is][h]*n+com.z[js][h]] += com.fpatt[h];\r
1768    }\r
1769    else {\r
1770       for (h=0; h<com.npatt; h++) {\r
1771          b0 = com.z[is][h];\r
1772          b1 = com.z[js][h];\r
1773          if(b0<n && b1<n)\r
1774             x[b0*n+b1] += com.fpatt[h];\r
1775          else\r
1776             missing=1;\r
1777       }\r
1778    }\r
1779    sumx = sum(x,16);\r
1780 \r
1781    if(sumx<=0) return(larged);    /* questionable??? */\r
1782    abyx(1./sum(x,16),x,16);\r
1783    return SeqDivergence(x, model, alpha, kappa);\r
1784 }\r
1785 \r
1786 \r
1787 #if (defined LSDISTANCE && defined REALSEQUENCE)\r
1788 \r
1789 extern double *SeqDistance;\r
1790 \r
1791 int DistanceMatNuc (FILE *fout, FILE*f2base, int model, double alpha)\r
1792 {\r
1793 /* This calculates pairwise distances.  The data may be clean and coded \r
1794    (com.cleandata==1) or not.  In the latter case, ambiguity sites are not \r
1795    used (pairwise deletion).  Site patterns are used.\r
1796 */\r
1797    int is,js, status=0;\r
1798    double kappat=0, t,bigD=9;\r
1799    \r
1800    if(f2base) fprintf(f2base,"%6d\n", com.ns);\r
1801    if(model>=REV) model=TN93; /* TN93 here */\r
1802    if(fout) {\r
1803       fprintf(fout,"\nDistances:%5s", models[model]);\r
1804       if (model!=JC69 && model!=F81) fprintf (fout, " (kappa) ");\r
1805       fprintf(fout," (alpha set at %.2f)\n", alpha);\r
1806       fprintf(fout,"This matrix is not used in later m.l. analysis.\n");\r
1807       if(!com.cleandata) fprintf(fout, "\n(Pairwise deletion.)");\r
1808    }\r
1809    for(is=0; is<com.ns; is++) {\r
1810       if(fout) fprintf(fout,"\n%-15s  ", com.spname[is]);\r
1811       if(f2base) fprintf(f2base,"%-15s   ", com.spname[is]);\r
1812       for(js=0; js<is; js++) {\r
1813          t = DistanceIJ(is, js, model, alpha, &kappat);\r
1814          if(t<0) { t=bigD; status=-1; }\r
1815          SeqDistance[is*(is-1)/2+js] = t;\r
1816          if(f2base) fprintf(f2base," %7.4f", t);\r
1817          if(fout) fprintf(fout,"%8.4f", t);\r
1818          if(fout && (model==K80 || model>=F84))\r
1819             fprintf(fout,"(%7.4f)", kappat);\r
1820        }\r
1821        if(f2base) FPN(f2base);\r
1822    }\r
1823    if(fout) FPN(fout);\r
1824    if(status) puts("\ndistance formula sometimes inapplicable..");\r
1825    return(status);\r
1826 }\r
1827 \r
1828 \r
1829 \r
1830 #endif\r
1831 \r
1832 \r
1833 #ifdef BASEMLG\r
1834 extern int CijkIs0[];\r
1835 #endif\r
1836 \r
1837 extern int nR;\r
1838 extern double Cijk[], Root[];\r
1839 \r
1840 int QTN93 (int model, double Q[], double kappa1, double kappa2, double pi[])\r
1841 {\r
1842    int i,j;\r
1843    double T=pi[0],C=pi[1],A=pi[2],G=pi[3],Y=T+C,R=A+G, scalefactor;\r
1844 \r
1845    if (model==JC69 || model==F81) kappa1=kappa2=com.kappa=1; \r
1846    else if (com.model<TN93)       kappa2=kappa1;\r
1847    if(model==F84) { kappa2=1+kappa1/R; kappa1=1+kappa1/Y; }\r
1848    scalefactor = 1/(2*T*C*kappa1+2*A*G*kappa2 + 2*Y*R);\r
1849 \r
1850    for(i=0; i<4; i++) for(j=0; j<4; j++) Q[i*4+j] = (i==j ? 0 : 1);\r
1851    Q[0*4+1] = Q[1*4+0] = kappa1;\r
1852    Q[2*4+3] = Q[3*4+2] = kappa2;\r
1853    for(i=0; i<4; i++) for(j=0; j<4; j++) Q[i*4+j] *= pi[j]*scalefactor;\r
1854    for(i=0; i<4; i++) { Q[i*4+i] = 0;  Q[i*4+i] = -sum(Q+i*4, 4); }\r
1855 \r
1856    return (0);\r
1857 }\r
1858 \r
1859 int RootTN93 (int model, double kappa1, double kappa2, double pi[], \r
1860     double *scalefactor, double Root[])\r
1861 {\r
1862    double T=pi[0],C=pi[1],A=pi[2],G=pi[3],Y=T+C,R=A+G;\r
1863 \r
1864    if (model==JC69 || model==F81) kappa1=kappa2=com.kappa=1; \r
1865    else if (com.model<TN93)       kappa2=kappa1;\r
1866    if(model==F84) { kappa2=1+kappa1/R; kappa1=1+kappa1/Y; }\r
1867 \r
1868    *scalefactor = 1/(2*T*C*kappa1+2*A*G*kappa2 + 2*Y*R);\r
1869 \r
1870    Root[0] = 0;\r
1871    Root[1] = - (*scalefactor);\r
1872    Root[2] = -(Y+R*kappa2) * (*scalefactor);\r
1873    Root[3] = -(Y*kappa1+R) * (*scalefactor);\r
1874    return (0);\r
1875 }\r
1876 \r
1877 \r
1878 int eigenTN93 (int model, double kappa1, double kappa2, double pi[],\r
1879     int *nR, double Root[], double Cijk[])\r
1880 {\r
1881 /* initialize Cijk[] & Root[], which are the only part to be changed\r
1882    for a new substitution model\r
1883    for JC69, K80, F81, F84, HKY85, TN93\r
1884    Root: real Root divided by v, the number of nucleotide substitutions.\r
1885 */\r
1886    int i,j,k, nr;\r
1887    double scalefactor, U[16],V[16], t;\r
1888    double T=pi[0],C=pi[1],A=pi[2],G=pi[3],Y=T+C,R=A+G;\r
1889 \r
1890    if (model==JC69 || model==F81) kappa1=kappa2=com.kappa=1; \r
1891    else if (com.model<TN93)       kappa2=kappa1;\r
1892    RootTN93(model, kappa1, kappa2, pi, &scalefactor, Root);\r
1893 \r
1894    *nR = nr = 2 + (model==K80||model>=F84) + (model>=HKY85);\r
1895    U[0*4+0]=U[1*4+0]=U[2*4+0]=U[3*4+0]=1;\r
1896    U[0*4+1]=U[1*4+1]=1/Y;   U[2*4+1]=U[3*4+1]=-1/R;\r
1897    U[0*4+2]=U[1*4+2]=0;  U[2*4+2]=G/R;  U[3*4+2]=-A/R;\r
1898    U[2*4+3]=U[3*4+3]=0;  U[0*4+3]=C/Y;  U[1*4+3]=-T/Y;\r
1899 \r
1900    xtoy (pi, V, 4);\r
1901    V[1*4+0]=R*T;   V[1*4+1]=R*C;\r
1902    V[1*4+2]=-Y*A;  V[1*4+3]=-Y*G;\r
1903    V[2*4+0]=V[2*4+1]=0;  V[2*4+2]=1;   V[2*4+3]=-1;\r
1904    V[3*4+0]=1;  V[3*4+1]=-1;   V[3*4+2]=V[3*4+3]=0;\r
1905 \r
1906    for(i=0; i<4; i++) for(j=0; j<4; j++) {\r
1907       Cijk[i*4*nr+j*nr+0]=U[i*4+0]*V[0*4+j];\r
1908       switch (model) {\r
1909       case JC69:\r
1910       case F81:\r
1911          for (k=1,t=0; k<4; k++) t += U[i*4+k]*V[k*4+j];\r
1912          Cijk[i*4*nr+j*nr+1] = t;\r
1913          break;\r
1914       case K80:\r
1915       case F84:\r
1916          Cijk[i*4*nr+j*nr+1]=U[i*4+1]*V[1*4+j];\r
1917          for (k=2,t=0; k<4; k++) t += U[i*4+k]*V[k*4+j];\r
1918          Cijk[i*4*nr+j*nr+2]=t;\r
1919          break;\r
1920       case HKY85:   case T92:   case TN93:\r
1921          for (k=1; k<4; k++)  Cijk[i*4*nr+j*nr+k] = U[i*4+k]*V[k*4+j];\r
1922          break;\r
1923       default:\r
1924          error2("model in eigenTN93");\r
1925       }\r
1926    }\r
1927 #ifdef BASEMLG\r
1928    FOR (i,64) CijkIs0[i] = (Cijk[i]==0);\r
1929 #endif\r
1930    return(0);\r
1931 }\r
1932 \r
1933 \r
1934 #endif\r
1935 \r
1936 \r
1937 \r
1938 #if (defined(CODEML) || defined(YN00))\r
1939 \r
1940 int printfcode (FILE *fout, double fb61[], double space[])\r
1941 {\r
1942 /* space[64*2]\r
1943 */\r
1944    int i, n=Nsensecodon;\r
1945 \r
1946    fprintf (fout, "\nCodon freq.,  x 10000\n");\r
1947    zero (space, 64);\r
1948    for(i=0; i<n; i++) space[FROM61[i]] = fb61[i]*10000;\r
1949    printcu(fout, space, com.icode);\r
1950    return(0);\r
1951 }\r
1952 \r
1953 \r
1954 int printsmaCodon (FILE *fout, unsigned char * z[],int ns,int ls,int lline,int simple)\r
1955 {\r
1956 /* print, in blocks, multiple aligned and transformed codon sequences.\r
1957    indels removed.\r
1958    This is needed as codons are coded 0,1, 2, ..., 60, and \r
1959    printsma won't work.\r
1960 */\r
1961    int ig, ngroup, lt, il,is, i,b, lspname=30;\r
1962    char equal='.',*pz, c0[4],c[4];\r
1963 \r
1964    if(ls==0) return(1);\r
1965    ngroup = (ls-1)/lline + 1;\r
1966    for (ig=0,FPN(fout); ig<ngroup; ig++)  {\r
1967       /* fprintf (fout,"%-8d\n", ig*lline+1); */\r
1968       for (is=0; is<ns; is++) {\r
1969          fprintf(fout,"%-*s  ", lspname, com.spname[is]);\r
1970          lt=0; \r
1971          for(il=ig*lline,pz=z[is]+il; lt<lline && il<ls; il++,lt++,pz++) {\r
1972             b = *pz;  \r
1973             b = FROM61[b]; \r
1974             c[0] = (char)(b/16); \r
1975             c[1] = (char)((b%16)/4);\r
1976             c[2] = (char)(b%4);\r
1977             c[3] = 0;\r
1978             for(i=0; i<3; i++)\r
1979                c[i] = BASEs[(int)c[i]];\r
1980             if (is && simple)  {\r
1981                b = z[0][il];\r
1982                b = FROM61[b];\r
1983                c0[0]=(char)(b/16); c0[1]=(char)((b%16)/4); c0[2]=(char)(b%4);\r
1984                for(i=0; i<3; i++)\r
1985                   if (c[i]==BASEs[(int)c0[i]]) c[i]=equal;\r
1986             }\r
1987             fprintf(fout,"%3s ", c);\r
1988          }\r
1989          FPN (fout);\r
1990       }\r
1991    }\r
1992    return (0);\r
1993 }\r
1994 \r
1995 \r
1996 int setmark_61_64 (void)\r
1997 {\r
1998 /* This sets two matrices FROM61[], and FROM64[], which translate between two \r
1999    codings of codons.  In one coding, codons go from 0, 1, ..., 63 while in \r
2000    the other codons range from 0, 1, ..., 61 with the three stop codons removed.\r
2001    FROM61[] translates from the 61-state coding to the 64-state coding, while \r
2002    FROM64[] translates from the 64-state coding to the 61-state coding.\r
2003 \r
2004    This routine also sets up FourFold[4][4], which defines the 4-fold codon\r
2005    boxes.\r
2006 */\r
2007    int i,j,k, *code=GeneticCode[com.icode];\r
2008    int c[3],aa0,aa1, by[3]={16,4,1};\r
2009    double nSilent, nStop, nRepl;\r
2010 \r
2011    Nsensecodon=0;\r
2012    for (i=0; i<64; i++) {\r
2013       if (code[i]==-1)  FROM64[i]=-1; \r
2014       else            { FROM61[Nsensecodon]=i; FROM64[i]=Nsensecodon++; }\r
2015    }\r
2016    com.ncode=Nsensecodon;\r
2017 \r
2018    for(i=0; i<4; i++) for(j=0; j<4; j++) {\r
2019       k=i*16+j*4;\r
2020       FourFold[i][j] = (code[k]==code[k+1] && code[k]==code[k+2] && code[k]==code[k+3]);\r
2021    }\r
2022 \r
2023    for (i=0,nSilent=nStop=nRepl=0; i<64; i++) {\r
2024       c[0]=i/16; c[1]=(i/4)%4; c[2]=i%4;\r
2025       if((aa0=code[i])==-1) continue;\r
2026       for(j=0; j<3; j++) for(k=0; k<3; k++) {\r
2027          aa1 = code[i + ((c[j]+k+1)%4 - c[j])*by[j]];\r
2028          if(aa1==-1)        nStop++;\r
2029          else if(aa0==aa1)  nSilent++;\r
2030          else               nRepl++;\r
2031       }\r
2032    }\r
2033 /*\r
2034    printf("\ncode Stop Silent Replace\n");\r
2035    printf("%3d (%d)  %6.0f%6.0f%6.0f  %12.6f%12.6f\n", \r
2036       com.icode, 64-com.ncode, nStop,nSilent,nRepl,nStop*3/(com.ncode*9),nSilent*3/(com.ncode*9));\r
2037 */\r
2038    return (0);\r
2039 }\r
2040 \r
2041 int DistanceMatNG86 (FILE *fout, FILE*fds, FILE*fdn, FILE*ft, double alpha)\r
2042 {\r
2043 /* Estimation of dS and dN by the method of Nei & Gojobori (1986)\r
2044    This works with both coded (com.cleandata==1) and uncoded data.\r
2045    In the latter case (com.cleandata==0), the method does pairwise delection.\r
2046 \r
2047    alpha for gamma rates is used for dN only.\r
2048 */\r
2049    char *codon[2];\r
2050    int is,js, i,k,h, wname=20, status=0, ndiff,nsd[4];\r
2051    int nb[3],ib[3][4], missing;\r
2052    double ns,na, nst,nat, S,N, St,Nt, dS,dN,dN_dS,y, bigD=3, lst;\r
2053    double SEds, SEdn, p;\r
2054 \r
2055    if(fout) { \r
2056       fputs("\n\n\nNei & Gojobori 1986. dN/dS (dN, dS)",fout);\r
2057       if(com.cleandata==0) fputs("\n(Pairwise deletion)",fout);\r
2058       fputs("\n(Note: This matrix is not used in later ML. analysis.\n",fout);\r
2059       fputs("Use runmode = -2 for ML pairwise comparison.)\n",fout);\r
2060    }\r
2061 \r
2062    if(fds) {\r
2063       fprintf(fds,"%6d\n",com.ns);\r
2064       fprintf(fdn,"%6d\n",com.ns); \r
2065       fprintf(ft,"%6d\n",com.ns);\r
2066    }\r
2067    if(noisy>1 && com.ns>10)  puts("NG distances for seqs.:");\r
2068    for(is=0; is<com.ns; is++) {\r
2069       if(fout) \r
2070          fprintf(fout,"\n%-*s", wname,com.spname[is]);\r
2071       if(fds) {\r
2072          fprintf(fds,   "%-*s ",wname,com.spname[is]);\r
2073          fprintf(fdn,   "%-*s ",wname,com.spname[is]);\r
2074          fprintf(ft,    "%-*s ",wname,com.spname[is]);\r
2075       }\r
2076       for(js=0; js<is; js++) {\r
2077          for(k=0; k<4; k++) nsd[k] = 0;\r
2078          for (h=0,lst=0,nst=nat=S=N=0; h<com.npatt; h++)  {\r
2079             if(com.z[is][h]>=com.ncode || com.z[js][h]>=com.ncode) \r
2080                continue;\r
2081             codon[0] = CODONs[com.z[is][h]];\r
2082             codon[1] = CODONs[com.z[js][h]];\r
2083             lst += com.fpatt[h];\r
2084             ndiff = difcodonNG(codon[0], codon[1], &St, &Nt, &ns, &na, 0, com.icode);\r
2085             nsd[ndiff] += (int)com.fpatt[h];\r
2086             S += St*com.fpatt[h];\r
2087             N += Nt*com.fpatt[h];\r
2088             nst += ns*com.fpatt[h];\r
2089             nat += na*com.fpatt[h];\r
2090          }  /* for(h) */\r
2091          if(S<=0 || N<=0)\r
2092             y=0;\r
2093          else {       /* rescale for stop codons */\r
2094             y = lst*3./(S+N);\r
2095             S *= y;\r
2096             N *= y;\r
2097          }\r
2098          if(noisy>=9)\r
2099            printf("\n%3d %3d:Sites %7.1f +%7.1f =%7.1f\tDiffs %7.1f +%7.1f =%7.1f",\r
2100              is+1,js+1,S,N,S+N,nst,nat, nst+nat);\r
2101 \r
2102          dS = (S<=0 ? 0 : 1-4./3*nst/S);\r
2103          dN = (N<=0 ? 0 : 1-4./3*nat/N);\r
2104          if(noisy>=9 && (dS<=0 || dN<=0))\r
2105             { puts("\nNG86 unusable."); status=-1;}\r
2106          if(dS==1) dS = 0;\r
2107          else      dS = (dS<=0 ? -1 : 3./4*(-log(dS)));\r
2108          if(dN==1) dN = 0;\r
2109          else      dN = (dN<=0 ? -1 : 3./4*(alpha==0?-log(dN):alpha*(pow(dN,-1/alpha)-1)));\r
2110 \r
2111          dN_dS = (dS>0 && dN>0 ? dN/dS : -1);\r
2112          if(fout) fprintf(fout,"%7.4f (%5.4f %5.4f)",   dN_dS, dN, dS);\r
2113 \r
2114          if(N>0 && dN<0)  dN = bigD; \r
2115          if(S>0&&dS<0)    dS = bigD;\r
2116 \r
2117 #ifdef CODEML\r
2118          SeqDistance[is*(is-1)/2+js] = (S<=0||N<=0 ? 0 : (S*dS+N*dN)*3/(S+N));\r
2119 #endif\r
2120 \r
2121          if(fds) {\r
2122             fprintf(fds," %7.4f", dS);\r
2123             fprintf(fdn," %7.4f", dN);\r
2124             fprintf(ft," %7.4f", (S*dS+N*dN)*3/(S+N));\r
2125          }\r
2126          if(alpha==0 && dS<bigD) { p=nst/S; SEds=sqrt(9*p*(1-p)/(square(3-4*p)*S)); }\r
2127          if(alpha==0 && dN<bigD) { p=nat/N; SEdn=sqrt(9*p*(1-p)/(square(3-4*p)*N)); }\r
2128       }    /* for(js) */\r
2129       if(fds) {\r
2130          FPN(fds); FPN(fdn); FPN(ft);\r
2131       }\r
2132       if(noisy>1 && com.ns>10)  printf(" %3d", is+1);\r
2133    }    /* for(is) */\r
2134    FPN(F0); \r
2135    if(fout) FPN(fout);\r
2136    if(status) fprintf (fout, "NOTE: -1 means that NG86 is inapplicable.\n");\r
2137 \r
2138    SS=S, NN=N, Sd=nst, Nd=nat;  /* kostas */\r
2139 \r
2140    return (0);\r
2141 }\r
2142 \r
2143 \r
2144 #endif\r
2145 \r
2146 \r
2147 \r
2148 #ifdef BASEML\r
2149 \r
2150 int eigenQREVbase (FILE* fout, double Q[NCODE*NCODE], double kappa[], double pi[], int *nR, double Root[], double Cijk[])\r
2151 {\r
2152 /* pi[] is constant.\r
2153    This returns the Q matrix in Q.\r
2154 */\r
2155    int n=com.ncode, i,j,k;\r
2156    int nr = (com.ngene>1 && com.Mgene>=3 ? com.nrate/com.ngene : com.nrate);\r
2157    double Q0[NCODE*NCODE], U[NCODE*NCODE], V[NCODE*NCODE], mr, space_pisqrt[NCODE*NCODE];\r
2158 \r
2159    NPMatUVRoot=0;\r
2160    *nR=n;\r
2161    zero(Q, n*n);\r
2162    if(com.model==REV) {\r
2163       if(n!=4) error2("ncode != 4 for REV");\r
2164       Q[3*n+2] = Q[2*n+3] = 1;  /* r_AG = r_GA = 1. */\r
2165       for(i=0,k=0; i<n-1; i++) for (j=i+1; j<n; j++)\r
2166          if(i*n+j != 2*n+3)\r
2167             Q[i*n+j] = Q[j*n+i] = kappa[k++];\r
2168    }\r
2169    else       /* (model==REVu) */\r
2170       for(i=0; i<n-1; i++) for(j=i+1; j<n; j++)\r
2171          Q[i*n+j]=Q[j*n+i] = (StepMatrix[i*n+j] ? kappa[StepMatrix[i*n+j]-1] : 1);\r
2172 \r
2173    for(i=0; i<n; i++) for(j=0; j<n; j++)\r
2174       Q[i*n+j] *= pi[j];\r
2175 \r
2176    for (i=0,mr=0; i<n; i++) {\r
2177       Q[i*n+i] = 0; \r
2178       Q[i*n+i] = -sum(Q+i*n, n);\r
2179       mr -= pi[i]*Q[i*n+i]; \r
2180    }\r
2181    abyx(1/mr, Q, n*n);\r
2182 \r
2183    if (fout) {\r
2184       mr = 2*pi[0]*Q[0*n+1] + 2*pi[2]*Q[2*n+3];\r
2185       if(com.nhomo==0) {\r
2186          fprintf(fout, "\nRate parameters:  ");\r
2187          for(j=0; j<nr; j++) \r
2188             fprintf(fout, " %8.5f", kappa[j]);\r
2189          fprintf(fout, "\nBase frequencies: ");\r
2190          for(j=0; j<n; j++) \r
2191             fprintf(fout," %8.5f", pi[j]);\r
2192       }\r
2193       fprintf (fout, "\nRate matrix Q, Average Ts/Tv =%9.4f", mr/(1-mr));\r
2194       matout (fout, Q, n, n);\r
2195    }\r
2196    else {\r
2197       xtoy (Q, Q0, n*n);\r
2198       eigenQREV(Q0, pi, n, Root, U, V, space_pisqrt);\r
2199       for(i=0; i<n; i++) for(j=0; j<n; j++) for(k=0; k<n; k++) \r
2200          Cijk[i*n*n+j*n+k] = U[i*n+k]*V[k*n+j];\r
2201    }\r
2202    return (0);\r
2203 }\r
2204 \r
2205 \r
2206 int QUNREST (FILE *fout, double Q[], double rate[], double pi[])\r
2207 {\r
2208 /* This constructs the rate matrix Q for the unrestricted model.\r
2209    pi[] is changed in the routine.\r
2210 */\r
2211    int n=com.ncode, i,j,k;\r
2212    double mr, ts, space[20];\r
2213 \r
2214    if(com.model==UNREST) {\r
2215       if(n!=4) error2("ncode != 4 for UNREST");\r
2216       for (i=0,k=0,Q[14]=1; i<n; i++) for(j=0; j<n; j++) \r
2217          if (i!=j && i*n+j != 14)  Q[i*n+j] = rate[k++];\r
2218    }\r
2219    else  /* (model==UNRESTu) */\r
2220       for(i=0; i<n; i++) for(j=0; j<n; j++) \r
2221          if(i != j) \r
2222             Q[i*n+j] = (StepMatrix[i*n+j] ? rate[StepMatrix[i*n+j]-1] : 1);\r
2223 \r
2224    for(i=0; i<n; i++) {\r
2225       Q[i*n+i] = 0; \r
2226       Q[i*n+i] = -sum(Q+i*n, n); \r
2227    }\r
2228 \r
2229    /* get pi */\r
2230    QtoPi(Q, com.pi, n, space);\r
2231 \r
2232    for (i=0,mr=0; i<n; i++)  mr -= pi[i]*Q[i*n+i];\r
2233    for (i=0; i<n*n; i++)  Q[i] /= mr;\r
2234 \r
2235    if (fout) {\r
2236       ts = pi[0]*Q[0*n+1] + pi[1]*Q[1*n+0] + pi[2]*Q[2*n+3] + pi[3]*Q[3*n+2];\r
2237 \r
2238       fprintf(fout, "Rate parameters:  ");\r
2239       for(j=0; j<com.nrate; j++)  fprintf(fout, " %8.5f", rate[j]);\r
2240       fprintf(fout, "\nBase frequencies: ");\r
2241       for(j=0; j<n; j++) fprintf(fout," %8.5f", pi[j]);\r
2242       if(n==4)\r
2243          fprintf (fout,"\nrate matrix Q, Average Ts/Tv (similar to kappa/2) =%9.4f", ts/(1-ts));\r
2244       else \r
2245          fprintf (fout,"\nrate matrix Q");\r
2246       matout (fout, Q, n, n);\r
2247    }\r
2248    return (0);\r
2249 }\r
2250 \r
2251 #endif\r
2252 \r
2253 \r
2254 #ifdef LSDISTANCE\r
2255 \r
2256 double *SeqDistance=NULL; \r
2257 int *ancestor=NULL;\r
2258 \r
2259 int SetAncestor()\r
2260 {\r
2261 /* This finds the most recent common ancestor of species is and js.\r
2262 */\r
2263    int is, js, it, a1, a2;\r
2264 \r
2265    for(is=0; is<com.ns; is++) for(js=0; js<is; js++) {\r
2266       it = is*(is-1)/2+js;\r
2267       ancestor[it] = -1;\r
2268       for (a1=is; a1!=-1; a1=nodes[a1].father) {\r
2269          for (a2=js; a2!=-1; a2=nodes[a2].father)\r
2270             if (a1==a2) { ancestor[it] = a1; break; }\r
2271          if (ancestor[it] != -1) break;\r
2272       }\r
2273       if (ancestor[it] == -1) error2("no ancestor");\r
2274    }\r
2275    return(0);\r
2276 }\r
2277 \r
2278 int fun_LS (double x[], double diff[], int np, int npair);\r
2279 \r
2280 int fun_LS (double x[], double diff[], int np, int npair)\r
2281 {\r
2282    int i,j, aa, it=-np;\r
2283    double dexp;\r
2284 \r
2285    if (SetBranch(x) && noisy>2) puts ("branch len.");\r
2286    if (npair != com.ns*(com.ns-1)/2) error2("# seq pairs err.");\r
2287    for(i=0; i<com.ns; i++) for(j=0; j<i; j++) {\r
2288       it = i*(i-1)/2+j;\r
2289       for (aa=i,dexp=0; aa!=ancestor[it]; aa=nodes[aa].father)\r
2290          dexp += nodes[aa].branch;\r
2291       for (aa=j; aa!=ancestor[it]; aa=nodes[aa].father)\r
2292          dexp += nodes[aa].branch;\r
2293       diff[it] = SeqDistance[it] - dexp;\r
2294 \r
2295       if(fabs(diff[it])>1000) {\r
2296          printf("\ndistances very different: diff = %12.6f ", diff[it]);\r
2297       }\r
2298 \r
2299    }\r
2300    return(0);\r
2301 }\r
2302 \r
2303 int LSDistance (double *ss,double x[],int (*testx)(double x[],int np))\r
2304 {\r
2305 /* get Least Squares estimates of branch lengths for a given tree topology\r
2306    This uses nls2, a general least squares algorithm for nonlinear programming \r
2307    to estimate branch lengths, and it thus inefficient.\r
2308 */\r
2309    int i;\r
2310 \r
2311    if ((*testx)(x, com.ntime)) {\r
2312       matout (F0, x, 1, com.ntime);\r
2313       puts ("initial err in LSDistance()");\r
2314    }\r
2315    SetAncestor();\r
2316    i = nls2((com.ntime>20&&noisy>=3?F0:NULL),\r
2317       ss,x,com.ntime,fun_LS,NULL,testx,com.ns*(com.ns-1)/2,1e-6);\r
2318 \r
2319    return (i);\r
2320 }\r
2321 \r
2322 double PairDistanceML(int is, int js)\r
2323 {\r
2324 /* This calculates the ML distance between is and js, the sum of ML branch \r
2325    lengths along the path between is and js.\r
2326    LSdistance() has to be called once to set ancestor before calling this \r
2327    routine.\r
2328 */\r
2329    int it, a;\r
2330    double dij=0;\r
2331 \r
2332    if(is==js) return(0);\r
2333    if(is<js) { it=is; is=js; js=it; }\r
2334 \r
2335    it = is*(is-1)/2 + js;\r
2336    for (a=is; a!=ancestor[it]; a=nodes[a].father)\r
2337       dij += nodes[a].branch;\r
2338    for (a=js; a!=ancestor[it]; a=nodes[a].father)\r
2339       dij += nodes[a].branch;\r
2340    return(dij);\r
2341 }\r
2342 \r
2343 \r
2344 int GroupDistances()\r
2345 {\r
2346 /* This calculates average group distances by summing over the ML \r
2347    branch lengths */\r
2348    int newancestor=0, i,j, ig,jg;\r
2349 /*   int ngroup=2, Ningroup[10], group[200]={1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\r
2350 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\r
2351 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\r
2352 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\r
2353 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\r
2354 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\r
2355 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\r
2356 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\r
2357 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}; */ /* dloop for HC200.paup */\r
2358    int ngroup=10, Ningroup[10], group[115]={\r
2359        10, 9, 9, 9, 9, 9, 9, 9, 9, 10, \r
2360        9, 9, 3, 3, 1, 1, 1, 1, 1, 1, \r
2361        1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\r
2362        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \r
2363        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \r
2364        1, 2, 2, 2, 2, 2, 2, 4, 4, 4, \r
2365        4, 4, 4, 4, 4, 4, 4, 4, 4, 4, \r
2366        4, 4, 4, 4, 4, 4, 5, 5, 5, 5, \r
2367        5, 5, 5, 5, 5, 5, 5, 5, 5, 5, \r
2368        5, 5, 5, 5, 6, 6, 6, 6, 6, 6, \r
2369        6, 7, 7, 7, 7, 7, 7, 7, 7, 7, \r
2370        8, 8, 8, 8, 8};  /* dloop data for Anne Yoder, ns=115 */\r
2371    double dgroup, npairused;\r
2372 \r
2373 /* ngroup=2; for(j=0;j<com.ns; j++) group[j]=1+(group[j]>2); */\r
2374 \r
2375    for(j=0;j<ngroup;j++) Ningroup[j]=0;\r
2376    for(j=0;j<com.ns; j++) Ningroup[group[j]-1]++;\r
2377    printf("\n# sequences in group:");\r
2378    matIout(F0,Ningroup,1,ngroup);\r
2379    if(ancestor==NULL) {\r
2380       newancestor=1;\r
2381       ancestor=(int*)realloc(ancestor, com.ns*(com.ns-1)/2*sizeof(int));\r
2382       if(ancestor==NULL) error2("oom ancestor");\r
2383    }\r
2384    SetAncestor();\r
2385 \r
2386    for(ig=0; ig<ngroup; ig++) {\r
2387       printf("\ngroup %2d",ig+1);\r
2388       for(jg=0; jg<ig+1; jg++) {\r
2389          dgroup=0;  npairused=0;\r
2390          for(i=0;i<com.ns;i++) for(j=0;j<com.ns;j++) {\r
2391             if(i!=j && group[i]==ig+1 && group[j]==jg+1) {\r
2392                dgroup += PairDistanceML(i, j);\r
2393                npairused++;\r
2394             }\r
2395          }\r
2396          dgroup/=npairused;\r
2397          printf("%9.4f", dgroup);\r
2398 \r
2399          /* printf("%6.1f", dgroup/0.2604*5); */ /* 0.2604, 0.5611 */\r
2400       }\r
2401    }\r
2402    if(newancestor==1)  free(ancestor);\r
2403    return(0);\r
2404 }\r
2405 \r
2406 #endif \r
2407 \r
2408 #ifdef NODESTRUCTURE\r
2409 \r
2410 void BranchToNode (void)\r
2411 {\r
2412 /* tree.root need to be specified before calling this\r
2413 */\r
2414    int i, from, to;\r
2415    \r
2416    tree.nnode=tree.nbranch+1;\r
2417    for(i=0; i<tree.nnode; i++)\r
2418       { nodes[i].father=nodes[i].ibranch=-1;  nodes[i].nson=0; }\r
2419    for (i=0; i<tree.nbranch; i++) {\r
2420       from=tree.branches[i][0];\r
2421       to  =tree.branches[i][1];\r
2422       nodes[from].sons[nodes[from].nson++]=to;\r
2423       nodes[to].father=from;\r
2424       nodes[to].ibranch=i;\r
2425    }\r
2426    /*  nodes[tree.root].branch=0;  this breaks method=1 */\r
2427 }\r
2428 \r
2429 void NodeToBranchSub (int inode);\r
2430 \r
2431 void NodeToBranchSub (int inode)\r
2432 {\r
2433    int i, ison;\r
2434 \r
2435    for(i=0; i<nodes[inode].nson; i++) {\r
2436       tree.branches[tree.nbranch][0] = inode;\r
2437       tree.branches[tree.nbranch][1] = ison = nodes[inode].sons[i];\r
2438       nodes[ison].ibranch = tree.nbranch++;\r
2439       if(nodes[ison].nson>0)  NodeToBranchSub(ison);\r
2440    }\r
2441 }\r
2442 \r
2443 void NodeToBranch (void)\r
2444 {\r
2445    tree.nbranch=0;\r
2446    NodeToBranchSub (tree.root);\r
2447    if(tree.nnode != tree.nbranch+1)\r
2448       error2("nnode != nbranch + 1?");\r
2449 }\r
2450 \r
2451 \r
2452 void ClearNode (int inode)\r
2453 {\r
2454 /* a source of confusion. Try not to use this routine.\r
2455 */\r
2456    nodes[inode].father = nodes[inode].ibranch = -1;\r
2457    nodes[inode].nson = 0;\r
2458    nodes[inode].branch = nodes[inode].age = 0;\r
2459    /* nodes[inode].label = -1; */\r
2460    /* nodes[inode].branch = 0; clear node structure only, not branch lengths */\r
2461    /* for(i=0; i<com.ns; i++) nodes[inode].sons[i]=-1; */\r
2462 }\r
2463 \r
2464 int ReadTreeB (FILE *ftree, int popline)\r
2465 {\r
2466    char line[255];\r
2467    int nodemark[NS*2-1]={0}; /* 0: absent; 1: father only (root); 2: son */\r
2468    int i,j, state=0, YoungAncestor=0;\r
2469 \r
2470    if(com.clock) {\r
2471       puts("\nbranch representation of tree might not work with clock model.");\r
2472       getchar();\r
2473    }\r
2474 \r
2475    fscanf (ftree, "%d", &tree.nbranch);\r
2476    for(j=0; j<tree.nbranch; j++) {\r
2477       for(i=0; i<2; i++) {\r
2478          if (fscanf (ftree, "%d", & tree.branches[j][i]) != 1) state=-1;\r
2479          tree.branches[j][i]--;\r
2480          if(tree.branches[j][i]<0 || tree.branches[j][i]>com.ns*2-1) \r
2481             error2("ReadTreeB: node numbers out of range");\r
2482       }\r
2483       nodemark[tree.branches[j][1]]=2;\r
2484       if(nodemark[tree.branches[j][0]]!=2) nodemark[tree.branches[j][0]]=1;\r
2485       if (tree.branches[j][0]<com.ns)  YoungAncestor=1;\r
2486 \r
2487       printf ("\nBranch #%3d: %3d -> %3d",j+1,tree.branches[j][0]+1,tree.branches[j][1]+1);\r
2488 \r
2489    }\r
2490    if(popline) fgets(line, 254, ftree);\r
2491    for(i=0,tree.root=-1; i<tree.nbranch; i++) \r
2492       if(nodemark[tree.branches[i][0]]!=2) tree.root=tree.branches[i][0];\r
2493    if(tree.root==-1) error2("root err");\r
2494    for(i=0; i<com.ns; i++)\r
2495       if(nodemark[i]==0) {\r
2496          matIout(F0,nodemark,1,com.ns);\r
2497          error2("branch specification of tree");\r
2498       }\r
2499 \r
2500    if(YoungAncestor) {\r
2501       puts("\nAncestors in the data?  Take care.");\r
2502       if(!com.cleandata) {\r
2503          puts("This kind of tree does not work with unclean data.");\r
2504          getchar();\r
2505       }\r
2506    }\r
2507 \r
2508 /*\r
2509    com.ntime = com.clock ? (tree.nbranch+1)-com.ns+(tree.root<com.ns)\r
2510                          : tree.nbranch;\r
2511 */\r
2512 \r
2513    BranchToNode ();\r
2514    return (state);\r
2515 }\r
2516 \r
2517 \r
2518 int OutTreeB (FILE *fout)\r
2519 {\r
2520    int j;\r
2521    char *fmt[]={" %3d..%-3d", " %2d..%-2d"};\r
2522    FOR (j, tree.nbranch)\r
2523       fprintf(fout, fmt[0], tree.branches[j][0]+1,tree.branches[j][1]+1);\r
2524    return (0);\r
2525 }\r
2526 \r
2527 int GetTreeFileType(FILE *ftree, int *ntree, int *pauptree, int shortform);\r
2528 \r
2529 int GetTreeFileType(FILE *ftree, int *ntree, int *pauptree, int shortform)\r
2530 {\r
2531 /* paupstart="begin trees" and paupend="translate" identify paup tree files.\r
2532    paupch=";" will be the last character before the list of trees.\r
2533    Modify if necessary.\r
2534 */\r
2535    int i,k, lline=32000, ch=0, paupch=';';\r
2536    char line[32000];\r
2537    char *paupstart="begin tree", *paupend="translate";\r
2538 \r
2539    *pauptree=0;\r
2540    k=fscanf(ftree,"%d%d",&i,ntree);\r
2541    if(k==2) {\r
2542       if(i==com.ns)  return(0);                 /* old paml style */\r
2543       else           error2("Number of sequences different in tree and seq files.");\r
2544    }\r
2545    else if(k==1) { *ntree=i; return(0); }           /* phylip & molphy style */\r
2546    while(ch!='(' && !isalnum(ch) && ch!=EOF)  ch=fgetc(ftree);  /* treeview style */\r
2547    if(ch=='(') { *ntree=-1; ungetc(ch,ftree); return(0); }\r
2548 \r
2549    puts("\n# seqs in tree file does not match.  Read as the nexus format.");\r
2550    for ( ; ; ) {\r
2551       if(fgets(line,lline,ftree)==NULL) error2("tree err1: EOF");\r
2552       strcase(line,0);\r
2553       if (strstr(line,paupstart)) { *pauptree=1; *ntree=-1; break; }\r
2554    }\r
2555    if(shortform) return(0);\r
2556    for ( ; ; ) {\r
2557       if(fgets(line,lline,ftree)==NULL) error2("tree err2: EOF");\r
2558       strcase(line,0);\r
2559       if (strstr(line,paupend)) break;\r
2560    }\r
2561    for ( ; ; ) {\r
2562       if((ch=fgetc(ftree))==EOF) error2("tree err3: EOF");\r
2563       if (ch==paupch) break;\r
2564    }\r
2565    if(fgets(line,lline,ftree)==NULL) error2("tree err4: EOF");\r
2566 \r
2567    return(0);\r
2568 }\r
2569 \r
2570 int PopPaupTreeRubbish(FILE *ftree);\r
2571 int PopPaupTreeRubbish(FILE *ftree)\r
2572 {\r
2573 /* This reads out the string in front of the tree in the nexus format, \r
2574    typically "tree PAUP_1 = [&U]" with "[&U]" optional\r
2575 */\r
2576    int ch;\r
2577 \r
2578    for (; ;) {\r
2579       ch=fgetc(ftree);\r
2580       if(ch=='(')\r
2581          { ungetc(ch,ftree); return(0); }\r
2582       else if(ch==EOF || ch=='/') \r
2583          return(-1);\r
2584    }\r
2585    return(0);\r
2586 }\r
2587 \r
2588 \r
2589 static int *CladeLabel = NULL;\r
2590 \r
2591 void DownTreeCladeLabel (int inode, int cLabel)\r
2592 {\r
2593 /* This goes down the tree to change $ labels (stored in CladeLabel[]) into\r
2594    # labels (stored in nodes[].label).  To deal with nested clade labels,\r
2595    branches within a clade are labeled by negative numbers initially, and \r
2596    converted to positive labels at the end of the algorithm.\r
2597 \r
2598    nodes[].label and CladeLabel[] are initialized to -1 before this routine \r
2599    is called.\r
2600 */\r
2601    int i, label;\r
2602 \r
2603    label = cLabel;\r
2604    if(CladeLabel[inode] != -1)       \r
2605       label = CladeLabel[inode];\r
2606    if(inode != tree.root && nodes[inode].label == -1) \r
2607       nodes[inode].label = label;\r
2608    for(i=0; i<nodes[inode].nson; i++)\r
2609       DownTreeCladeLabel(nodes[inode].sons[i], label);\r
2610 }\r
2611 \r
2612 int IsNameNumber(char line[])\r
2613 {\r
2614 /* returns 0 if line has species number; 1 if it has name.\r
2615    It uses com.ns.\r
2616 */\r
2617    int isname=1, alldigits=1, n;\r
2618    char *p=line;\r
2619 \r
2620    while(*p)\r
2621       if(!isdigit(*p++)) { alldigits=0; break; }\r
2622    if(alldigits) {\r
2623       n = atoi(line);\r
2624       if(n>=1 && n<=com.ns) isname = 0;\r
2625    }\r
2626    return(isname);\r
2627 }\r
2628 \r
2629 \r
2630 \r
2631 int ReadTreeN (FILE *ftree, int *haslength, int *haslabel, int copyname, int popline)\r
2632 {\r
2633 /* Read a tree from ftree, using the parenthesis node representation of trees.\r
2634    Branch lengths are read in nodes[].branch, and branch (node) labels \r
2635    (integers) are preceeded by # and read in nodes[].label.  If the clade label\r
2636    $ is used, the label is read into CladeLabel[] first and then moved into\r
2637    nodes[].label in the routine DownTreeCladeLabel().\r
2638 \r
2639    This assumes that com.ns is known.\r
2640    Species names are considered case-sensitive, with trailing spaces ignored.\r
2641 \r
2642    copyname = 0: species numbers and names are both accepted, but names have \r
2643                  to match the names in com.spname[], which are from the \r
2644                  sequence data file.  Used by baseml and codeml, for example.\r
2645               1: species names are copied into com.spname[], but species \r
2646                  numbers are accepted.  Used by evolver for simulation, \r
2647                  in which case no species names were read before.\r
2648               2: the tree must have species names, which are copied into com.spname[].\r
2649                  Note that com.ns is assumed known.  To remove this restrition, \r
2650                  one has to consider the space for nodes[], CladeLabel, starting \r
2651                  node number etc.\r
2652 \r
2653    isname = 0:   species number; 1: species name;\r
2654 \r
2655    Ziheng note (18/12/2011): I have changed the code so that sequence number is not used \r
2656    anymore.  isname = 1 always.\r
2657 */\r
2658    int cnode, cfather=-1;  /* current node and father */\r
2659    int inodeb=0;  /* node number that will have the next branch length */\r
2660    int cladeLabels=0, i,j,k, level=0, isname, ch=' ', icurspecies=0;\r
2661    char check[NS], delimiters[]="(),:#$=@><;", quote[]="\"\'";\r
2662    int lline=32000;\r
2663    char line[32000], *pch;\r
2664 \r
2665    if(com.ns<=0)  error2("you should specify # seqs in the tree file.");\r
2666 \r
2667    if((CladeLabel=(int*)malloc((com.ns*2-1)*sizeof(int)))==NULL) \r
2668       error2("oom trying to get space for cladelabel");\r
2669    for(i=0; i<2*com.ns-1; i++) \r
2670       CladeLabel[i] = -1;\r
2671 \r
2672    /* initialization */\r
2673    for(i=0; i<com.ns; i++) check[i]=0;\r
2674    *haslength = 0;       *haslabel = 0;\r
2675    tree.nnode = com.ns;  tree.nbranch = 0;\r
2676    for(i=0; i<2*com.ns-1; i++) {\r
2677       nodes[i].father = nodes[i].ibranch = -1;\r
2678       nodes[i].nson = 0;  nodes[i].label = -1;  nodes[i].branch = 0;\r
2679       nodes[i].age = 0;  /* TipDate models set this for each tree later. */\r
2680 #if (defined(BASEML) || defined(CODEML))\r
2681       nodes[i].fossil = 0;\r
2682 #endif\r
2683    }\r
2684    while(isspace(ch))\r
2685       ch=fgetc(ftree);  /* skip spaces */\r
2686    ungetc(ch,ftree);\r
2687    if (isdigit(ch))\r
2688       { ReadTreeB(ftree,popline); return(0); }\r
2689 \r
2690    if(PopPaupTreeRubbish(ftree) == -1) return(-1);\r
2691 \r
2692    for ( ; ; ) {\r
2693       ch = fgetc (ftree);\r
2694       if (ch==EOF) return(-1);\r
2695       else if (ch == ';') {\r
2696          if(level!=0) error2("; in treefile");\r
2697          else         break;\r
2698       }\r
2699       else if (ch==',') ;\r
2700       else if (!isgraph(ch))\r
2701          continue;\r
2702       else if (ch == '(') {       /* left (  */\r
2703          level++;\r
2704          cnode=tree.nnode++;\r
2705          if(tree.nnode>2*com.ns-1)\r
2706                          error2("check #seqs and tree: perhaps too many '('?");\r
2707          if (cfather >= 0) {\r
2708             if(nodes[cfather].nson >= MAXNSONS) {\r
2709                printf("there are at least %d daughter nodes, raise MAXNSONS?", nodes[cfather].nson);\r
2710                exit(-1);\r
2711             }\r
2712             nodes[cfather].sons[nodes[cfather].nson++] = cnode;\r
2713             nodes[cnode].father = cfather;\r
2714             tree.branches[tree.nbranch][0] = cfather;\r
2715             tree.branches[tree.nbranch][1] = cnode;\r
2716             nodes[cnode].ibranch = tree.nbranch++;\r
2717          }\r
2718          else\r
2719             tree.root = cnode;\r
2720          cfather = cnode;\r
2721       }\r
2722       /* treating : and > in the same way is risky. */\r
2723       else if (ch==')') {\r
2724          level--;  inodeb=cfather; cfather=nodes[cfather].father; \r
2725       }\r
2726       else if (ch==':'||ch=='>') { \r
2727          if(ch==':') *haslength=1;\r
2728          fscanf(ftree, "%lf", &nodes[inodeb].branch); \r
2729       }\r
2730       else if (ch==quote[0] || ch==quote[1]) {\r
2731          for (k=0; ; k++) {  /* read notes into line[] */\r
2732             line[k] = (char)fgetc(ftree);\r
2733             if((int)line[k] == EOF)\r
2734                error2("EOF when reading node label");\r
2735             if(line[k] == quote[0] || line[k] == quote[1])\r
2736                break;\r
2737          }\r
2738          line[k++] = '\0';\r
2739          nodes[inodeb].nodeStr = (char*)malloc(k*sizeof(char));\r
2740          if (nodes[inodeb].nodeStr == NULL) error2("oom nodeStr");\r
2741          strcpy(nodes[inodeb].nodeStr, line);\r
2742          if((pch = strchr(line,'#')) || (pch = strchr(line,'<'))) {\r
2743             *haslabel=1; sscanf(pch+1, "%lf", &nodes[inodeb].label); \r
2744          }\r
2745          if((pch = strchr(line,'>'))) {\r
2746             sscanf(pch+1, "%lf", &nodes[inodeb].branch); \r
2747          }\r
2748          if((pch = strchr(line,'$'))) {\r
2749             *haslabel=1; sscanf(pch+1, "%d", &CladeLabel[inodeb]);\r
2750          }\r
2751          if((pch = strchr(line,'=')) || (pch = strchr(line,'@'))) {\r
2752             sscanf(pch+1, "%lf", &nodes[inodeb].age);\r
2753 #if (defined(BASEML) || defined(CODEML))\r
2754             if(com.clock) nodes[inodeb].fossil = 1;\r
2755 #endif\r
2756 #if (defined(CODEML))\r
2757             nodes[inodeb].omega = 0;\r
2758 #endif\r
2759          }\r
2760       }\r
2761       else if (ch=='#' || ch=='<') { *haslabel=1; fscanf(ftree, "%lf", &nodes[inodeb].label); }\r
2762       else if (ch=='$')            { *haslabel=1; fscanf(ftree, "%d",  &CladeLabel[inodeb]); }\r
2763       else if (ch=='@' || ch=='=') { \r
2764          fscanf(ftree,"%lf", &nodes[inodeb].age);\r
2765 #if (defined(BASEML) || defined(CODEML))\r
2766          if(com.clock) nodes[inodeb].fossil = 1;\r
2767 #endif\r
2768 #if (defined(CODEML))\r
2769          nodes[inodeb].omega = 0;\r
2770 #endif\r
2771       }\r
2772       else { /* read species name or number */\r
2773          if(level<=0) \r
2774             error2("expecting ; in the tree file");\r
2775          line[0]=(char)ch;  line[1]=(char)fgetc(ftree);\r
2776 /*         if(line[1]==(char)EOF) error2("eof in tree file"); */\r
2777 \r
2778          for (i=1; i<lline; )  { /* read species name into line[] until delimiter */\r
2779             if ((strchr(delimiters,line[i]) && line[i]!='@') \r
2780                || line[i]==(char)EOF || line[i]=='\n')\r
2781                { ungetc(line[i],ftree); line[i]=0; break; }\r
2782             line[++i]=(char)fgetc(ftree);\r
2783          }\r
2784          for(j=i-1;j>0;j--) /* trim spaces*/\r
2785             if(isgraph(line[j])) break; else line[j]=0;\r
2786 \r
2787          if(FullSeqNames)\r
2788             isname = 1;   /* numbers are part of names. */\r
2789          else\r
2790             isname = IsNameNumber(line);\r
2791 \r
2792          if (isname==0) {  /* number */\r
2793             if(copyname==2) error2("Use names in tree.");\r
2794             sscanf(line, "%d", &cnode);\r
2795             cnode--;\r
2796          }\r
2797          else {                 /* name */\r
2798             if(!copyname) {\r
2799                for(i=0; i<com.ns; i++) if (!strcmp(line,com.spname[i])) break;\r
2800                if((cnode=i)==com.ns) \r
2801                   { printf("\nSpecies %s?\n", line); exit(-1); }\r
2802             }\r
2803             else {\r
2804                if(icurspecies>com.ns-1) {\r
2805                   error2("error in tree: too many species in tree");\r
2806                }\r
2807                strcpy(com.spname[cnode=icurspecies++], line);\r
2808             }\r
2809          }\r
2810          nodes[cnode].father=cfather;\r
2811          if(nodes[cfather].nson>=MAXNSONS)\r
2812             error2("too many daughter nodes, raise MAXNSONS");\r
2813 \r
2814          nodes[cfather].sons[nodes[cfather].nson++] = cnode;\r
2815          tree.branches[tree.nbranch][0] = cfather;\r
2816          tree.branches[tree.nbranch][1] = cnode;\r
2817          nodes[cnode].ibranch = tree.nbranch++;\r
2818          inodeb = cnode;\r
2819          check[cnode]++;\r
2820       }\r
2821    }\r
2822 \r
2823    if (popline) \r
2824       fgets(line, lline, ftree);\r
2825    for(i=0; i<com.ns; i++) {\r
2826       if(check[i]>1) {\r
2827          printf("\nSeq #%d occurs more than once in the tree\n",i+1); exit(-1); \r
2828       }\r
2829       else if(check[i]<1) {\r
2830          printf("\nSeq #%d (%s) is missing in the tree\n", i+1, com.spname[i]);\r
2831          exit(-1); \r
2832       }\r
2833    }\r
2834    if(tree.nbranch>2*com.ns-2) { \r
2835       printf("nbranch %d", tree.nbranch); puts("too many branches in tree?");\r
2836    }\r
2837    if (tree.nnode != tree.nbranch+1) {\r
2838       printf ("\nnnode%6d != nbranch%6d + 1\n", tree.nnode, tree.nbranch);\r
2839       exit(-1);\r
2840    }\r
2841 \r
2842 /* check that it is o.k. to comment out this line\r
2843    com.ntime = com.clock ? (tree.nbranch+1)-com.ns+(tree.root<com.ns)\r
2844                          : tree.nbranch;\r
2845 */\r
2846 \r
2847 \r
2848    /* check and convert clade labels $ */\r
2849 #if(defined(BASEML) || defined(CODEML))\r
2850 #if(defined(BASEML))\r
2851    if(com.seqtype==0 && com.nhomo==5) cladeLabels = 1;\r
2852 #endif\r
2853    if(com.clock>1 || (com.seqtype==1 && com.model>=2)) cladeLabels = 1;\r
2854    if(cladeLabels) {\r
2855       for(i=0,j=0; i<tree.nnode; i++) {\r
2856          if(CladeLabel[i] != -1) j++;\r
2857       }\r
2858       if(j) {  /* j is number of clade labels */\r
2859          DownTreeCladeLabel(tree.root, 0);\r
2860       }\r
2861 \r
2862       /*** Somehow some labels are still -1 after this, so I changed this.  Needs checking.  ***/\r
2863       for(i=0; i<tree.nnode; i++) \r
2864          if(i!=tree.root && nodes[i].label==-1) nodes[i].label = 0;\r
2865 \r
2866       /* OutTreeN(F0,1,PrBranch|PrNodeNum);  FPN(F0); */\r
2867       /* FPN(F0);  OutTreeN(F0,1,PrLabel);   FPN(F0); */\r
2868 \r
2869       for(i=0,com.nbtype=0; i<tree.nnode; i++) { \r
2870          if(i == tree.root) continue;\r
2871          j = (int)nodes[i].label;\r
2872          if(j+1 > com.nbtype)  com.nbtype = j+1;\r
2873          if(j<0 || j>tree.nbranch-1)  \r
2874             error2("branch label in the tree (note labels start from 0 and are consecutive)");\r
2875       }\r
2876       if (com.nbtype<=1)\r
2877          error2("need branch labels in the tree for the model.");\r
2878       else {\r
2879          printf("\n%d branch types are in tree. Stop if wrong.", com.nbtype);\r
2880       }\r
2881 \r
2882 #if(defined(CODEML))\r
2883       if(com.seqtype==1 && com.NSsites==2 && com.model==3 && com.nbtype>NBTYPE) \r
2884          error2("nbtype too large.  Raise NBTYPE");\r
2885       else if(com.seqtype==1 && com.NSsites && com.model==2 && com.nbtype!=2)\r
2886          error2("only two branch types are allowed for branch models.");\r
2887 #endif\r
2888 \r
2889    }\r
2890 #endif\r
2891 \r
2892    free(CladeLabel);\r
2893    return (0);\r
2894 }\r
2895 \r
2896 \r
2897 \r
2898 int OutSubTreeN (FILE *fout, int inode, int spnames, int printopt, char *labelfmt);\r
2899 \r
2900 int OutSubTreeN (FILE *fout, int inode, int spnames, int printopt, char *labelfmt)\r
2901 {\r
2902    int i, dad = nodes[inode].father, nsib = (inode==tree.root ? 0 : nodes[dad].nson);\r
2903 \r
2904    if(inode != tree.root && inode == nodes[dad].sons[0])\r
2905       fputc ('(', fout);\r
2906 \r
2907    for(i=0; i<nodes[inode].nson; i++)\r
2908       OutSubTreeN(fout, nodes[inode].sons[i], spnames, printopt, labelfmt);\r
2909 \r
2910    if(nodes[inode].nson==0) { /* inode is tip */\r
2911       if(spnames) {\r
2912          if(printopt & PrNodeNum) fprintf(fout, "%d_", inode+1);\r
2913          fprintf(fout, "%s", com.spname[inode]);\r
2914       }\r
2915       else \r
2916          fprintf(fout, "%d", inode+1);\r
2917    }\r
2918    if((printopt & PrNodeNum) && nodes[inode].nson) \r
2919       fprintf(fout," %d ", inode+1);\r
2920    if((printopt & PrLabel) && nodes[inode].label>0)\r
2921       fprintf(fout, labelfmt, nodes[inode].label);\r
2922    if((printopt & PrAge) && nodes[inode].age) \r
2923       fprintf(fout, " @%.6f", nodes[inode].age);\r
2924 \r
2925 /*  Add branch labels to be read by Rod Page's TreeView. */\r
2926 #if (defined CODEML)\r
2927    if((printopt & PrOmega) && inode != tree.root)\r
2928       fprintf(fout, " #%.4f ", nodes[inode].omega);\r
2929 #elif (defined (EVOLVER) || defined (MCMCTREE))\r
2930    if((printopt & PrLabel) && nodes[inode].nodeStr && nodes[inode].nodeStr[0])\r
2931       fprintf(fout, " %s", nodes[inode].nodeStr);\r
2932 #endif\r
2933    \r
2934    if((printopt & PrBranch) && (inode!=tree.root || nodes[inode].branch>0))\r
2935       fprintf(fout, ": %.6f", nodes[inode].branch);\r
2936    /*\r
2937    if((printopt & PrBranch) && nodes[inode].age>0)  // print node ages instead of branch lengths \r
2938       fprintf(fout, ": %.6f", nodes[inode].age);\r
2939    */\r
2940 \r
2941    if(nsib == 0)            /* root */\r
2942       fputc(';', fout);\r
2943    else if (inode == nodes[dad].sons[nsib-1])  /* last sib */\r
2944       fputc(')', fout);\r
2945    else                     /* not last sib */\r
2946       fprintf(fout, ", ");\r
2947 \r
2948    return (0);\r
2949 }\r
2950 \r
2951 \r
2952 int OutTreeN (FILE *fout, int spnames, int printopt)\r
2953 {\r
2954 /* print the current tree.\r
2955    Can the block of print statements be moved inside the recursive function?\r
2956 */\r
2957    int i, intlabel=1;\r
2958    char* labelfmt[2]={"#%.6f", "#%.0f"};\r
2959 \r
2960    if(printopt & PrLabel) {\r
2961       for(i=0; i<tree.nnode; i++) \r
2962          if(nodes[i].label-(int)nodes[i].label != 0) intlabel=0;\r
2963    }\r
2964 \r
2965    OutSubTreeN(fout, tree.root, spnames, printopt, labelfmt[intlabel]);\r
2966 \r
2967    return(0);\r
2968 }\r
2969 \r
2970 \r
2971 int DeRoot (void)\r
2972 {\r
2973 /* This cnages the bifurcation at the root into a trifurcation, but setting one of \r
2974    the sons to be the new root.  The new root is the first son that is not a tip.  \r
2975    tree.nnode is updated, but the routine does not re-number the nodes, so the new\r
2976    node labels do not go from ns, ns + 1, ..., as they normally should.\r
2977 */\r
2978    int i, ison, sib, root = tree.root;\r
2979 \r
2980    if(nodes[root].nson!=2) error2("in DeRoot?");\r
2981 \r
2982    ison = nodes[root].sons[i = 0];\r
2983    if(nodes[ison].nson==0)\r
2984       ison = nodes[root].sons[i = 1];\r
2985    sib = nodes[root].sons[1 - i];\r
2986    nodes[sib].branch += nodes[ison].branch;\r
2987    nodes[sib].father = tree.root = ison;\r
2988    nodes[tree.root].father = -1;\r
2989    nodes[tree.root].sons[nodes[tree.root].nson++] = sib;  /* sib added as the last child of the new root */\r
2990    nodes[tree.root].branch = 0;\r
2991    tree.nnode --;  /* added 2007/4/9 */\r
2992    return(0);\r
2993 }\r
2994 \r
2995 int PruneSubTreeN (int inode, int keep[])\r
2996 {\r
2997 /* This prunes tips from the tree, using keep[com.ns].  Removed nodes in the \r
2998    big tree has nodes[].father=-1 and nodes[].nson=0.\r
2999    Do not change nodes[inode].nson and nodes[inode].sons[] until after the \r
3000    node's descendent nodes are all processed.  So when a son is deleted, \r
3001    only the father node's nson is changed, but not \r
3002 */\r
3003    int i,j, ison, father=nodes[inode].father, nson0=nodes[inode].nson;\r
3004 \r
3005    nodes[inode].label = 0;\r
3006    for(i=0; i<nson0; i++)\r
3007       PruneSubTreeN(nodes[inode].sons[i], keep);\r
3008 \r
3009    /* remove inode because of no descendents.  \r
3010       Note this does not touch the father node */\r
3011    if(inode<com.ns && keep[inode]==0)\r
3012       nodes[inode].father = -1;\r
3013    else if(inode>=com.ns) {\r
3014       for(i=0,nodes[inode].nson=0; i<nson0; i++) {\r
3015          ison = nodes[inode].sons[i];\r
3016          if(nodes[ison].father!=-1) \r
3017             nodes[inode].sons[ nodes[inode].nson++ ] = nodes[inode].sons[i];\r
3018       }\r
3019       if(nodes[inode].nson == 0)\r
3020          nodes[inode].father = -1;\r
3021    }\r
3022 \r
3023    /* remove inode if it has a single descendent ison */\r
3024    if(inode>=com.ns && nodes[inode].nson==1 && inode!=tree.root) {\r
3025       ison = nodes[inode].sons[0];\r
3026       nodes[ison].father = father;\r
3027       nodes[ison].branch += nodes[inode].branch;\r
3028       nodes[ison].label ++;  /* records # deleted nodes for branch ison */\r
3029       for(j=0; j<nodes[father].nson; j++) {\r
3030          if(nodes[father].sons[j]==inode)\r
3031             { nodes[father].sons[j] = ison;  break; }\r
3032       }\r
3033       nodes[inode].nson = 0;\r
3034       nodes[inode].father = -1;\r
3035    }\r
3036    else if(nodes[inode].nson==1 && inode==tree.root) { /* move down root if root has 1 descendent */\r
3037       nodes[inode].father = -1;\r
3038       nodes[inode].nson = 0;\r
3039       ison = nodes[tree.root].sons[0];\r
3040       tree.root = ison;\r
3041       nodes[tree.root].father = -1;\r
3042       nodes[tree.root].branch = 0;\r
3043    }\r
3044 \r
3045    /*\r
3046    printf("\nVisiting inode %d\n", inode);\r
3047    FOR(i, tree.nnode) printf(" %2d", i);  FPN(F0);\r
3048    FOR(i, tree.nnode) printf(" %2.0f", nodes[i].label); FPN(F0);\r
3049    */\r
3050    return(0);\r
3051 }\r
3052 \r
3053 \r
3054 int GetSubTreeN (int keep[], int space[])\r
3055 {\r
3056 /* This removes some tips to generate the subtree.  Branch lengths are \r
3057    preserved by summing them up when some nodes are removed.  \r
3058    The algorithm use post-order tree traversal to remove tips and nodes.  It \r
3059    then switches to the branch representation to renumber nodes.\r
3060    space[] can be NULL.  If not, it returns newnodeNO[], which holds the \r
3061    new node numbers; for exmaple, newnodeNO[12]=5 means that old node 12 now \r
3062    becomes node 5.\r
3063 \r
3064    The routine does not change com.ns or com.spname[], which have to be updated \r
3065    outside.\r
3066 \r
3067    CHANGE OF ROOT happens if the root in the old tree had >=3 sons, but has 2 \r
3068    sons in the new tree and if (!com.clock).  In that case, the tree is derooted.\r
3069 \r
3070    This routine does not work if a current seq is ancestral to some others \r
3071    and if that sequence is removed. (***check this comment ***)\r
3072    \r
3073    Different formats for keep[] are used.  Suppose the current tree is for \r
3074    nine species: a b c d e f g h i.\r
3075    \r
3076    (A) keep[]={1,0,1,1,1,0,0,1,0} means that a c d e h are kept in the tree.  \r
3077        The old tip numbers are not changed, so that OutTreeN(?,1,?) gives the \r
3078        correct species names or OutTreeN(?,0,?) gives the old species numbers.\r
3079 \r
3080    (B) keep[]={1,0,2,3,4,0,0,5,0} means that a c d e h are kept in the tree, and \r
3081        they are renumbered 0 1 2 3 4 and all the internal nodes are renumbered \r
3082        as well to be consecutive.  Note that the positive numbers have to be \r
3083        consecutive natural numbers.\r
3084 \r
3085        keep[]={5,0,2,1,4,0,0,3,0} means that a c d e h are kept in the tree.  \r
3086        However, the order of the sequences are changed to d c h e a, so that the \r
3087        numbers are now 0 1 2 3 4 for d c h e a.  This is useful when the subtree \r
3088        is extracted from a big tree for a subset of the sequence data, while the \r
3089        species are odered d c h e a in the sequence data file.\r
3090        This option can be used to renumber the tips in the complete tree.\r
3091 */\r
3092    int nsnew, i,j,k, nnode0=tree.nnode, sumnumber=0, newnodeNO[2*NS-1], ison, sib;\r
3093    int unrooted = (nodes[tree.root].nson>=3);  /* com.clock is not checked here! */\r
3094    double *branch0;\r
3095    int debug=0;\r
3096 \r
3097    if(debug) { FOR(i,com.ns) printf("%-30s %2d\n", com.spname[i], keep[i]); }\r
3098    for(i=0,nsnew=0; i<com.ns; i++)\r
3099       if(keep[i]) { nsnew++; sumnumber+=keep[i]; }\r
3100    if(nsnew<2)  return(-1);\r
3101 \r
3102    /* mark removed nodes in the big tree by father=-1 && nson=0.\r
3103       nodes[].label records the number of nodes collapsed.\r
3104    */\r
3105    PruneSubTreeN(tree.root, keep);\r
3106    /* If unrooted tree has a bifurcation at the new root, collapse root.  */\r
3107    if (unrooted && nodes[tree.root].nson==2) {\r
3108       ison = nodes[tree.root].sons[i = 0];\r
3109       if(nodes[ison].nson==0)\r
3110          ison = nodes[tree.root].sons[i = 1];\r
3111       sib = nodes[tree.root].sons[1 - i];\r
3112 \r
3113       nodes[sib].branch += nodes[ison].branch;\r
3114       nodes[sib].label += nodes[ison].label + 2;\r
3115       nodes[sib].father = tree.root = ison;\r
3116       nodes[tree.root].father = -1;\r
3117       nodes[tree.root].sons[nodes[tree.root].nson++] = sib;  /* sib added as the last child of the new root */\r
3118       nodes[tree.root].branch = 0;\r
3119    }\r
3120    if(debug) printtree(1);\r
3121 \r
3122    for(i=0,k=1; i<tree.nnode; i++) if(nodes[i].father!=-1) k++;\r
3123    tree.nnode = k;\r
3124    NodeToBranch();\r
3125 \r
3126    /* to renumber the nodes */\r
3127    if(sumnumber>nsnew) {\r
3128       if(sumnumber != nsnew*(nsnew+1)/2)\r
3129          error2("keep[] not right in GetSubTreeN");\r
3130 \r
3131       if((branch0=(double*)malloc(nnode0*sizeof(double)))==NULL) error2("oom#");\r
3132       FOR(i,nnode0) branch0[i] = nodes[i].branch;\r
3133       FOR(i,nnode0) newnodeNO[i] = -1;\r
3134       FOR(i,com.ns) if(keep[i]) newnodeNO[i] = keep[i]-1;\r
3135 \r
3136       newnodeNO[tree.root] = k = nsnew;\r
3137       tree.root = k++;\r
3138       for( ; i<nnode0; i++) {\r
3139          if(nodes[i].father==-1) continue;\r
3140          for(j=0; j<tree.nbranch; j++)\r
3141             if(i==tree.branches[j][1]) break;\r
3142          if(j==tree.nbranch)\r
3143             error2("strange here");\r
3144          newnodeNO[i] = k++;\r
3145       }\r
3146       for(j=0; j<tree.nbranch; j++) FOR(i,2)\r
3147          tree.branches[j][i] = newnodeNO[tree.branches[j][i]];\r
3148       BranchToNode();\r
3149       for(i=0; i<nnode0; i++) {\r
3150          if(newnodeNO[i]>-1)\r
3151             nodes[newnodeNO[i]].branch = branch0[i];\r
3152       }\r
3153       free(branch0);\r
3154    }\r
3155 \r
3156    if(space) memmove(space, newnodeNO, (com.ns*2-1)*sizeof(int));\r
3157    return (0);\r
3158 }\r
3159 \r
3160 \r
3161 void printtree (int timebranches)\r
3162 {\r
3163    int i,j;\r
3164 \r
3165    printf("\nns = %d  nnode = %d", com.ns, tree.nnode);\r
3166    printf("\n%7s%7s", "father","node");\r
3167    if(timebranches)  printf("%10s%10s%10s", "age", "branch", "label");\r
3168    printf(" %7s%7s", "nson:","sons");\r
3169    FOR (i, tree.nnode) {\r
3170       printf ("\n%7d%7d", nodes[i].father, i);\r
3171       if(timebranches)\r
3172          printf(" %9.6f %9.6f %9.0f", nodes[i].age, nodes[i].branch,nodes[i].label);\r
3173 \r
3174       printf ("%7d: ", nodes[i].nson);\r
3175       FOR(j,nodes[i].nson) printf(" %2d", nodes[i].sons[j]);\r
3176    }\r
3177    FPN(F0); \r
3178    OutTreeN(F0,0,0); FPN(F0); \r
3179    OutTreeN(F0,1,0); FPN(F0); \r
3180    OutTreeN(F0,1,1); FPN(F0); \r
3181 }\r
3182 \r
3183 \r
3184 void PointconPnodes (void)\r
3185 {\r
3186 /* This points the nodes[com.ns+inode].conP to the right space in com.conP.\r
3187    The space is different depending on com.cleandata (0 or 1)\r
3188    This routine updates internal nodes com.conP only.  \r
3189    End nodes (com.conP0) are updated in InitConditionalPNode().\r
3190 */\r
3191    int nintern=0, i;\r
3192 \r
3193    for(i=0; i<tree.nbranch+1; i++)\r
3194       if(nodes[i].nson>0)  /* more thinking */\r
3195          nodes[i].conP = com.conP + com.ncode*com.npatt*nintern ++;\r
3196 }\r
3197 \r
3198 \r
3199 int SetxInitials (int np, double x[], double xb[][2])\r
3200 {\r
3201 /* This forces initial values into the boundary of the space\r
3202 */\r
3203    int i;\r
3204 \r
3205    for (i=com.ntime; i<np; i++) {\r
3206       if (x[i]<xb[i][0]*1.005) x[i]=xb[i][0]*1.05;\r
3207       if (x[i]>xb[i][1]/1.005) x[i]=xb[i][1]/1.05;\r
3208    }\r
3209    for (i=0; i<com.np; i++) {\r
3210       if (x[i]<xb[i][0]) x[i]=xb[i][0]*1.2;\r
3211       if (x[i]>xb[i][1]) x[i]=xb[i][1]*.8;\r
3212    }\r
3213    return(0);\r
3214 }\r
3215 \r
3216 \r
3217 #if(defined(BASEML) || defined(CODEML) || defined(MCMCTREE))\r
3218 \r
3219 int GetTipDate (double *TipDate, double *TipDate_TimeUnit)\r
3220 {\r
3221 /* This scans sequence names to collect the sampling dates.  The last field of \r
3222    the sequence name is assumed to contain the date.\r
3223    Divergence times are rescaled by using TipDate_TimeUnit.\r
3224 */\r
3225    int i, j, indate, ndates=0;\r
3226    double young=-1, old=-1;\r
3227    char *p;\r
3228 \r
3229    *TipDate = 0;\r
3230    for(i=0,ndates=0; i<com.ns; i++) {\r
3231       nodes[i].age = 0;\r
3232       j = strlen(com.spname[i]);\r
3233       for(indate=0,p=com.spname[i]+j-1; j>=0; j--,p--) {\r
3234          if(isdigit(*p) || *p=='.') indate=1;\r
3235          else if(indate) \r
3236             break;\r
3237       }\r
3238       sscanf(p+1, "%lf", &nodes[i].age);\r
3239       if(nodes[i].age<=0)\r
3240          error2("Tip date <= 0");\r
3241       else \r
3242          ndates++;\r
3243 \r
3244       if(i==0)\r
3245          young = old = nodes[i].age;\r
3246       else {\r
3247          old = min2(old, nodes[i].age);\r
3248          young = max2(young, nodes[i].age);\r
3249       }\r
3250    }\r
3251    if(ndates==0) {\r
3252       if(*TipDate_TimeUnit == -1) *TipDate_TimeUnit = 1;\r
3253       return(0);\r
3254    }\r
3255    else if (ndates!=com.ns) {\r
3256       printf("TipDate model requires date for each sequence.");\r
3257    }\r
3258 \r
3259    /* TipDate models */\r
3260    if(ndates != com.ns) \r
3261       error2("TipDate model: each sequence must have a date");\r
3262    *TipDate = young;\r
3263    if(*TipDate_TimeUnit <= 0) \r
3264       *TipDate_TimeUnit = (young - old)*2.5;\r
3265    if(young - old < 1e-30)\r
3266       error2("TipDate: all sequences are of the same age?");\r
3267    for(i=0; i<tree.nnode; i++) {\r
3268       if(i<com.ns || nodes[i].fossil) {\r
3269          nodes[i].age = (young - nodes[i].age) / *TipDate_TimeUnit;\r
3270          if(nodes[i].age<1e-20) nodes[i].age = 0;\r
3271       }\r
3272    }\r
3273 \r
3274    if(noisy) printf("\nTipDate model\nDate range: (%.2f, %.2f) => (0, %.2f). TimeUnit = %.2f.\n",\r
3275                      young, old, (young-old)/ *TipDate_TimeUnit, *TipDate_TimeUnit);\r
3276 \r
3277    return(0);\r
3278 }\r
3279 \r
3280 #endif\r
3281 \r
3282 \r
3283 #if(defined(BASEML) || defined(CODEML))\r
3284 \r
3285 double *AgeLow=NULL;\r
3286 int NFossils=0, AbsoluteRate=0;\r
3287 /* TipDate models: \r
3288       MutationRate = mut/TipDate_TimeUnit; \r
3289       age = age*TipDate_TimeUnit \r
3290 */\r
3291 \r
3292 void SetAge(int inode, double x[]);\r
3293 void GetAgeLow (int inode);\r
3294 /* number of internal node times, usd to deal with known ancestors.  Broken? */\r
3295 static int innode_time=0;  \r
3296 \r
3297 /* Ziheng Yang, 25 January 2003\r
3298    The following routines deal with clock and local clock models, including \r
3299    Andrew Rambaut's TipDate models (Rambaut 2000 Bioinformatics 16:395-399;\r
3300    Yoder & Yang 2000 Mol Biol Evol 17:1081-1090; Yang & Yoder 2003 Syst Biol).\r
3301    The tree is rooted.  The routine SetAge assumes that ancestral nodes are\r
3302    arranged in the increasing order and so works only if the input tree uses \r
3303    the parenthesis notation and not the branch notation.  The option of known \r
3304    ancestors is probably broken.\r
3305 \r
3306    The flag AbsoluteRate=1 if(TipDate || NFossils).  This could be removed\r
3307    as the flags TipDate and NFossils are sufficient.\r
3308       clock = 1: global clock, deals with TipDate with no or many fossils, \r
3309                  ignores branch rates (#) in tree if any.\r
3310             = 2: local clock models, as above, but requires branch rates # \r
3311                  in tree.\r
3312             = 3: as 2, but requires Mgene and option G in sequence file.\r
3313 \r
3314    Order of variables in x[]: divergence times, rates for branches, rgene, ...\r
3315    In the following ngene=4, com.nbtype=3, with r_ij to be the rate \r
3316    of gene i and branch class j.\r
3317 \r
3318    clock=1 or 2:\r
3319       [times, r00(if absolute) r01 r02  rgene1 rgene2 rgene3]\r
3320       NOTE: rgene[] has relative rates\r
3321    clock=3:\r
3322       [times, r00(if absolute) r01 r02  r11 r12  r21 r22 r31 r32 rgene1 rgene2 rgene3]\r
3323       NOTE: rgene1=r10, rgene2=r20, rgene3=r30\r
3324 \r
3325    If(nodes[tree.root].fossil==0) x[0] has absolute time for the root.  \r
3326    Otherwise x[0] has proportional ages.\r
3327 */\r
3328 \r
3329 \r
3330 double GetBranchRate(int igene, int ibrate, double x[], int *ix)\r
3331 {\r
3332 /* This finds the right branch rate in x[].  The rate is absolute if AbsoluteRate.\r
3333    ibrate=0,1,..., indicates the branch rate class.\r
3334    This routine is used in the likeihood calculation and in formatting output.\r
3335    ix (k) has the position in x[] for the branch rate if the rate is a parameter.\r
3336    and is -1 if the rate is not a parameter in the ML iteration.  This is \r
3337    for printing SEs.\r
3338 */\r
3339    int nage=tree.nnode-com.ns-NFossils, k=nage+AbsoluteRate;\r
3340    double rate00=(AbsoluteRate?x[nage]:1), brate=rate00;\r
3341 \r
3342    if(igene==0 && ibrate==0)\r
3343       k = (AbsoluteRate?nage:-1);\r
3344    else if(com.clock==GlobalClock) {\r
3345       brate = x[k=com.ntime+igene-1];  /* igene>0, rgene[] has absolute rates */\r
3346    }\r
3347    else if(com.clock==LocalClock) {  /* rgene[] has relative rates */\r
3348       if(igene==0 && ibrate)     { brate = x[k+=ibrate-1]; }\r
3349       else if(igene && ibrate==0){ brate = rate00*x[com.ntime+igene-1]; k=-1; }\r
3350       else if(igene && ibrate)   { brate = x[k+ibrate-1]*x[com.ntime+igene-1]; k=-1; }\r
3351    }\r
3352    else if(com.clock==ClockCombined) {\r
3353       if(ibrate==0 && igene)  brate = x[k=com.ntime+igene-1];\r
3354       else                    brate = x[k+=ibrate-1+igene*(com.nbtype-1)]; /* ibrate>0 */\r
3355    }\r
3356 \r
3357    if(ix) *ix=k;\r
3358    return(brate);\r
3359 }\r
3360 \r
3361 \r
3362 void SetAge (int inode, double x[])\r
3363 {\r
3364 /* This is called from SetBranch(), to set up age for nodes under clock \r
3365    models (clock=1,2,3).\r
3366    if(TipDate||NFossil), that is, if(AbsoluteRate), this routine sets up \r
3367    times (nodes[].age) and then SetBranch() sets up branch lengths by\r
3368    multiplying times with rate:\r
3369       [].age[i] = AgeLow[i] + ([father].age - AgeLow[i])*x[i]\r
3370    \r
3371    The routine assumes that times are arranged in the order of node numbers, \r
3372    and should work if parenthesis notation of tree is used in the tree file, \r
3373    but not if the branch notation is used.\r
3374 */\r
3375    int i,ison;\r
3376 \r
3377    FOR (i,nodes[inode].nson) {\r
3378       ison=nodes[inode].sons[i];\r
3379       if(nodes[ison].nson) {\r
3380          if(AbsoluteRate) {\r
3381             if(!nodes[ison].fossil)\r
3382                nodes[ison].age = AgeLow[ison]\r
3383                                    +(nodes[inode].age - AgeLow[ison])*x[innode_time++];\r
3384          }\r
3385          else \r
3386             nodes[ison].age = nodes[inode].age*x[innode_time++];\r
3387          SetAge(ison, x);\r
3388       }\r
3389    }\r
3390 }\r
3391 \r
3392 void GetAgeLow (int inode)\r
3393 {\r
3394 /* This sets AgeLow[], the minimum age of each node.  It moves down the tree to \r
3395    scan [].age, which has tip dates and fossil dates.  It is needed if(AbsoluteRate)\r
3396    and is called by GetInitialsTimes().\r
3397 */\r
3398    int i,ison;\r
3399    double tlow=0;\r
3400 \r
3401    for(i=0; i<nodes[inode].nson; i++) {\r
3402       ison = nodes[inode].sons[i];\r
3403       if(nodes[ison].nson)\r
3404          GetAgeLow(ison);\r
3405       tlow = max2(tlow, nodes[ison].age);\r
3406    }\r
3407    if(nodes[inode].fossil) {\r
3408       if(nodes[inode].age < tlow) \r
3409          error2("age in tree is in conflict.");\r
3410       AgeLow[inode] = nodes[inode].age;\r
3411    }\r
3412    else\r
3413       AgeLow[inode] = nodes[inode].age = tlow;\r
3414 }\r
3415 \r
3416 \r
3417 \r
3418 int SetBranch (double x[])\r
3419 {\r
3420 /* if(AbsoluteRate), mutation rate is not multiplied here, but during the \r
3421    likelihood calculation.  It is copied into com.rgene[0].\r
3422 */\r
3423    int i, status=0;\r
3424    double small=-1e-5;\r
3425 \r
3426    if(com.clock==0) {\r
3427       for(i=0; i<tree.nnode; i++) {\r
3428          if(i!=tree.root) \r
3429             if((nodes[i].branch=x[nodes[i].ibranch])<small)  status = -1;\r
3430       }\r
3431       return(status);\r
3432    }\r
3433    innode_time = 0;\r
3434    if(!LASTROUND) { /* transformed variables (proportions) are used */\r
3435       if(!nodes[tree.root].fossil) /* note order of times in x[] */\r
3436          nodes[tree.root].age = x[innode_time++];\r
3437       SetAge(tree.root, x);\r
3438    }\r
3439    else {           /* times are used */\r
3440       for(i=com.ns; i<tree.nnode; i++) \r
3441          if(!nodes[i].fossil) nodes[i].age = x[innode_time++];\r
3442    }\r
3443 \r
3444    for(i=0; i<tree.nnode; i++) {  /* [].age to [].branch */\r
3445       if(i==tree.root) continue;\r
3446       nodes[i].branch = nodes[nodes[i].father].age-nodes[i].age;\r
3447       if(nodes[i].branch<small)\r
3448          status = -1;\r
3449    }\r
3450    return(status);\r
3451 }\r
3452 \r
3453 \r
3454 int GetInitialsTimes (double x[])\r
3455 {\r
3456 /* this counts com.ntime and initializes x[] under clock and local clock models,\r
3457    including TipDate and ClockCombined models.  See above for notes.\r
3458    Under local clock models, com.ntime includes both times and rates for \r
3459    lineages.\r
3460    A recursive algorithm is used to specify initials if(TipDate||NFossil).\r
3461 */\r
3462    int i,j,k;\r
3463    double maxage, t;\r
3464 \r
3465    /* no clock */\r
3466    if(com.fix_blength==2)\r
3467       { com.ntime=0; com.method=0; return(0); }\r
3468    else if(com.clock==0) {\r
3469       com.ntime = tree.nbranch;\r
3470       if(com.fix_blength==1)  return(0);\r
3471       for(i=0; i<com.ntime; i++) \r
3472          x[i] = rndu()*0.1+0.01;\r
3473 \r
3474       if(com.fix_blength==0 && com.clock<5 && ancestor && com.ntime<100)\r
3475          LSDistance (&t, x, testx);\r
3476 \r
3477       return(0);\r
3478    }\r
3479  \r
3480    /* clock models: check branch rate labels and fossil dates first */\r
3481    if(com.clock<5) {\r
3482       com.nbtype=1;\r
3483       if(com.clock==1) \r
3484          for(i=0; i<tree.nnode; i++) nodes[i].label=0;\r
3485       else {\r
3486          for(i=0; i<tree.nnode; i++) {\r
3487             if(i!=tree.root && (j=(int)nodes[i].label+1)>com.nbtype) {\r
3488                com.nbtype = j;\r
3489                if(j<0 || j>tree.nbranch-1) error2("branch label in the tree.");\r
3490             }\r
3491          }\r
3492          for(j=0; j<com.nbtype; j++) {\r
3493             for(i=0; i<tree.nnode; i++) \r
3494                if(i!=tree.root && j==(int)nodes[i].label) break;\r
3495             if(i==tree.nnode)\r
3496                printf("\nNot all branch labels (0, ..., %d) are found on tree?", com.nbtype-1);\r
3497          }\r
3498          if(noisy) printf("\nfound %d branch rates in tree.\n", com.nbtype);\r
3499          if(com.nbtype<=1) error2("use clock = 1 or add branch rate labels in tree");\r
3500 \r
3501          for(i=0; i<tree.nbranch; i++) \r
3502             printf("%3.0f",nodes[tree.branches[i][1]].label); FPN(F0);\r
3503       }\r
3504    }\r
3505    for(i=0,NFossils=0,maxage=0; i<tree.nnode; i++) {\r
3506       if(nodes[i].nson && nodes[i].fossil) {\r
3507          NFossils ++;\r
3508          maxage = max2(maxage,nodes[i].age);\r
3509       }\r
3510    }\r
3511    if(NFossils && maxage>10) \r
3512       error2("Change time unit so that fossil dates fall in (0.00001, 10).");\r
3513 \r
3514    if(com.TipDate)\r
3515       GetTipDate(&com.TipDate, &com.TipDate_TimeUnit);\r
3516 \r
3517    AbsoluteRate = (com.TipDate || NFossils);\r
3518    if(com.clock>=5 && AbsoluteRate==0) \r
3519       error2("needs fossil calibrations");\r
3520 \r
3521    com.ntime = AbsoluteRate + (tree.nnode-com.ns-NFossils) + (com.nbtype-1);\r
3522    if(com.clock == ClockCombined)\r
3523       com.ntime += (com.ngene-1)*(com.nbtype-1);\r
3524    com.ntime += (tree.root<com.ns); /* root is a known sequence. Broken? */\r
3525 \r
3526    /* DANGER! AgeLow is not freed in the program. Fix this? */\r
3527    k=0;\r
3528    if(AbsoluteRate) {\r
3529       AgeLow = (double*)realloc(AgeLow, tree.nnode*sizeof(double));\r
3530       GetAgeLow(tree.root);\r
3531    }\r
3532    if(!nodes[tree.root].fossil)\r
3533       x[k++] = (AbsoluteRate?nodes[tree.root].age*(1.2+rndu()) : rndu()*.5+.1);  /* root age */\r
3534    for(; k<tree.nnode-com.ns-NFossils; k++)   /* relative times */\r
3535       x[k] = 0.4+.5*rndu();\r
3536    if(com.clock!=6)                           /* branch rates */\r
3537       for( ; k<com.ntime; k++)\r
3538          x[k] = 0.1*(.5+rndu());\r
3539    else\r
3540       for(j=0,k=com.ntime-1; j<data.ngene; j++,k++) \r
3541          x[k] = 0.1*(.5+rndu());\r
3542    return(0);\r
3543 }\r
3544 \r
3545 int OutputTimesRates (FILE *fout, double x[], double var[])\r
3546 {\r
3547 /* SetBranch() has been called before calling this, so that [].age is up \r
3548    to date.\r
3549 */\r
3550    int i,j,k=AbsoluteRate+tree.nnode-com.ns-NFossils, jeffnode;\r
3551    double scale=(com.TipDate ? com.TipDate_TimeUnit : 1);\r
3552 \r
3553    /* rates */\r
3554    if(AbsoluteRate && com.clock<5) {\r
3555       fputs("\nSubstitution rate is per time unit\n", fout);\r
3556       if(com.nbtype>1) fprintf(fout,"Rates for branch groups\n");\r
3557       for(i=0; i<com.ngene; i++,FPN(fout)) {\r
3558          if(com.ngene>1) fprintf(fout,"Gene %2d: ", i+1);\r
3559          for(j=0; j<com.nbtype; j++) {\r
3560             fprintf(fout,"%12.6f", GetBranchRate(i,j,x,&k));\r
3561             if(i==0 && j==0 && !AbsoluteRate) continue;\r
3562             if((com.clock!=LocalClock||com.ngene==1) && com.getSE) {\r
3563                if(k==-1) error2("we are in trouble. k should not be -1 here.");\r
3564                fprintf(fout," +- %8.6f", sqrt(var[k*com.np+k]));\r
3565             }\r
3566          }\r
3567       }\r
3568    }\r
3569    else\r
3570       if(com.clock==2) {\r
3571          fprintf (fout,"rates for branches:    1");\r
3572          for(k=tree.nnode-com.ns; k<com.ntime; k++) fprintf(fout," %8.5f",x[k]);\r
3573       }\r
3574 \r
3575 \r
3576    /* times */\r
3577    if(AbsoluteRate) {\r
3578       fputs("\nNodes and Times\n",fout);\r
3579       fputs("(JeffNode is for Thorne's multidivtime.  ML analysis uses ingroup data only.)\n\n",fout);\r
3580    }\r
3581    if(com.TipDate) { /* DANGER! SE not printed if(TipDate && NFossil). */\r
3582       for(i=0,k=0; i<tree.nnode; i++,FPN(fout)) {\r
3583          jeffnode=(i<com.ns?i:tree.nnode-1+com.ns-i);\r
3584          fprintf(fout,"Node %3d (Jeffnode %3d) Time %7.2f ",i+1, jeffnode, \r
3585             com.TipDate - nodes[i].age*scale);\r
3586          if(com.getSE && i>=com.ns && !nodes[i].fossil) {\r
3587             fprintf(fout," +- %6.2f", sqrt(var[k*com.np+k])*scale);\r
3588             k++;\r
3589          }\r
3590       }\r
3591    }\r
3592    else if(AbsoluteRate) {\r
3593       for(i=com.ns,k=0; i<tree.nnode; i++,FPN(fout)) {\r
3594          jeffnode=tree.nnode-1+com.ns-i;\r
3595          fprintf(fout,"Node %3d (Jeffnode %3d) Time %9.5f", i+1, tree.nnode-1+com.ns-i, \r
3596             nodes[i].age);\r
3597          if(com.getSE && i>=com.ns && !nodes[i].fossil) {\r
3598             fprintf(fout," +- %7.5f", sqrt(var[k*com.np+k]));\r
3599             if(fabs(nodes[i].age-x[k])>1e-5) error2("node order wrong.");\r
3600             k++;\r
3601          }\r
3602       }\r
3603    }\r
3604 \r
3605    return(0);\r
3606 }\r
3607 \r
3608 int SetxBoundTimes (double xb[][2])\r
3609 {\r
3610 /* This sets bounds for times (or branch lengths) and branch rates\r
3611 */ \r
3612    int i=-1,j,k;\r
3613    double tb[]={4e-6,50}, rateb[]={1e-4,99}, pb[]={.000001,.999999};\r
3614 \r
3615    if(com.clock==0) {\r
3616       for(i=0;i<com.ntime;i++) {\r
3617          xb[i][0] = tb[0];\r
3618          xb[i][1] = tb[1];\r
3619       }\r
3620    }\r
3621    else {\r
3622       k=0;  xb[0][0]=tb[0];  xb[0][1]=tb[1];\r
3623       if(!nodes[tree.root].fossil) {\r
3624          if(AbsoluteRate)  xb[0][0]=AgeLow[tree.root];\r
3625          k=1;\r
3626       }\r
3627       for( ; k<tree.nnode-com.ns-NFossils; k++)  /* proportional ages */\r
3628          { xb[k][0]=pb[0]; xb[k][1]=pb[1]; }\r
3629       for(; k<com.ntime; k++)                    /* rate and branch rates */\r
3630          FOR(j,2) xb[k][j]=rateb[j];\r
3631    }\r
3632    return(0);\r
3633 }\r
3634 \r
3635 #endif\r
3636 \r
3637 \r
3638 #if(defined(BASEML) || defined(BASEMLG) || defined(CODEML))\r
3639 \r
3640 \r
3641 int readx(double x[], int *fromfile)\r
3642 {\r
3643 /* this reads parameters from file, used as initial values\r
3644    if(runmode>0), this reads common substitution parameters only into x[], which \r
3645    should be copied into another place before heuristic tree search.  This is broken\r
3646    right now.  Ziheng, 9 July 2003.\r
3647    fromfile=0: if nothing read from file, 1: read from file, -1:fix parameters\r
3648 */\r
3649    static int times=0;\r
3650    int i, npin;\r
3651    double *xin;\r
3652 \r
3653    times++;  *fromfile=0;\r
3654    if(finitials==NULL || (com.runmode>0 && times>1)) return(0);\r
3655    if(com.runmode<=0) { npin=com.np; xin=x; }\r
3656    else               { npin=com.np-com.ntime; xin=x+com.ntime; }\r
3657 \r
3658    if(npin<=0) return(0);\r
3659    if(com.runmode>0&&com.seqtype==1&&com.model) error2("option or in.codeml");\r
3660    printf("\nReading initials/paras from file (np=%d). Stop if wrong.\n",npin);\r
3661    fscanf(finitials,"%lf",&xin[i=0]);\r
3662    *fromfile=1;\r
3663    if(xin[0]==-1) { *fromfile=-1; LASTROUND=1; }\r
3664    else           i++;\r
3665    for( ; i<npin; i++) \r
3666       if(fscanf(finitials, "%lf", &xin[i])!=1) break;\r
3667    if(i<npin)\r
3668       { printf("err at #%d. Edit or remove it.\n",i+1); exit(-1); }\r
3669    if(com.runmode>0) {\r
3670       matout(F0,xin,1,npin);\r
3671       puts("Those are fixed for tree search.  Stop if wrong.");\r
3672    }\r
3673    return(0);\r
3674 }\r
3675 \r
3676 #endif\r
3677 \r
3678 #if(defined(BASEML) || defined(CODEML))\r
3679 \r
3680 int CollapsNode (int inode, double x[]) \r
3681 {\r
3682 /* Merge inode to its father. Update the first com.ntime elments of\r
3683    x[] only if (x!=NULL), by using either x[] if clock=1 or\r
3684    nodes[].branch if clock=0.  So when clock=0, the routine works\r
3685    properly only if SetBranch() is called before this routine, which\r
3686    is true if m.l. or l.s. has been used to estimate branch lengths.\r
3687 */\r
3688    int i,j, ifather, ibranch, ison;\r
3689 \r
3690    if (inode==tree.root || inode<com.ns) error2("err CollapsNode");\r
3691    ibranch=nodes[inode].ibranch;   ifather=nodes[inode].father; \r
3692    for (i=0; i<nodes[inode].nson; i++) {\r
3693       ison=nodes[inode].sons[i];\r
3694       tree.branches[nodes[ison].ibranch][0]=ifather;\r
3695    }\r
3696    for (i=ibranch+1; i<tree.nbranch; i++) \r
3697       for (j=0; j<2; j++) tree.branches[i-1][j]=tree.branches[i][j];\r
3698    tree.nbranch--; com.ntime--;\r
3699    for (i=0; i<tree.nbranch; i++)  for (j=0; j<2; j++) \r
3700         if (tree.branches[i][j]>inode)  tree.branches[i][j]--;\r
3701    BranchToNode();\r
3702 \r
3703    if (x) {\r
3704       if (com.clock) \r
3705          for (i=inode+1; i<tree.nnode+1; i++) x[i-1-com.ns]=x[i-com.ns];\r
3706       else {\r
3707          for (i=ibranch+1; i<tree.nbranch+1; i++)  x[i-1]=x[i];\r
3708          SetBranch (x);\r
3709       }\r
3710    }\r
3711    return (0);\r
3712 }\r
3713 \r
3714 #endif\r
3715 \r
3716 \r
3717 #if(defined(BPP) || defined(EVOLVER))\r
3718 \r
3719 void Tree2PartitionDescentTree (int inode, char split[])\r
3720 {\r
3721    int i, ison;\r
3722 \r
3723    for(i=0; i<nodes[inode].nson; i++) {\r
3724       ison = nodes[inode].sons[i];\r
3725       if(ison<com.ns)\r
3726          split[ison] = '1';\r
3727       else \r
3728          Tree2PartitionDescentTree(ison, split);\r
3729    }\r
3730 }\r
3731 \r
3732 void Tree2Partition (char splits[])\r
3733 {\r
3734 /* This generates branch partitions in splits[nib*(com.ns+1)].  \r
3735    splits[0,...,ns-1] is the first split, splits[ns,...,2*ns-1] the second, and so on.\r
3736    For large trees, the algorithm is inefficient.\r
3737    The root node has 2 sons if the tree is rooted and >=3 sons if the tree \r
3738    is unrooted.  For unrooted tree, the mark for the first species is set to 0.\r
3739    For rooted trees, the mark for the first species can be either 0 or 1.\r
3740 */\r
3741    int unrooted = (nodes[tree.root].nson>=3);\r
3742    int s=com.ns, lsplit=s+1, nsplit=tree.nnode-s-1, i, j, k;\r
3743    char *split;\r
3744 \r
3745    if(s<=2) return ;\r
3746    memset(splits, 0, nsplit*lsplit*sizeof(char));\r
3747    for(i=com.ns,k=0; i<tree.nnode; i++) {\r
3748       if(i==tree.root) continue;\r
3749       split = splits+k*lsplit;\r
3750       for(j=0; j<s; j++) split[j] = '0';\r
3751       Tree2PartitionDescentTree(i, split);\r
3752       /* If unrooted tree, set first species to 0 if tree is unrooted */\r
3753       if(unrooted && split[0]=='1')\r
3754          for(j=0; j<s; j++) split[j] = '0' + '1' - split[j];\r
3755       k++;\r
3756    }\r
3757 }\r
3758 \r
3759 int Partition2Tree (char splits[], int lsplit, int ns, int nsplit, double label[])\r
3760 {\r
3761 /* This generates the tree in nodes[] using splits or branch partitions.\r
3762    It constructs pptable[(2*s-1)*(2*s-1)] to generate the tree.  \r
3763    Split i corresponds to node ns+i, while the root is ns + nsplit.\r
3764    label[] has labels for splits and become labels for nodes on the tree.\r
3765    This works even if ns=1 and ns=2.\r
3766 */\r
3767    int i,j,k, s21=ns*2-1, a, minndesc, ndesc[NS]={0};  /* clade size */\r
3768    char debug=0, *p, *pptable;\r
3769 \r
3770    if(nsplit>ns-2) error2("too many splits for ns");\r
3771    if(nsplit<0) nsplit=0;\r
3772    if((pptable=(char*)malloc((s21*s21+1)*sizeof(char))) == NULL)\r
3773       error2("oom in Partition2Tree");\r
3774    memset(pptable, 0, s21*s21*sizeof(char));\r
3775 \r
3776    /* initialize tree */\r
3777    tree.nnode = ns+nsplit+1;\r
3778    tree.root  = ns+nsplit;\r
3779    tree.nbranch = 0;\r
3780    for(i=0; i<tree.nnode; i++) {\r
3781       nodes[i].father = nodes[i].ibranch = -1;\r
3782       nodes[i].nson = 0;  nodes[i].label = -1;  nodes[i].branch = nodes[i].age = 0;\r
3783    }\r
3784 \r
3785    /* set up pptable */\r
3786    for(i=0,p=splits,ndesc[tree.root-ns]=ns; i<nsplit; i++, p+=lsplit) {\r
3787       for(j=0; j<ns; j++) {\r
3788          if(p[j] == '1') {  /* clade (split) i includes tip j */\r
3789             pptable[j*s21 + ns+i] = 1;\r
3790             ndesc[i]++;\r
3791          }\r
3792       }\r
3793    }\r
3794    for(i=0; i<tree.nnode-1; i++) pptable[i*s21+tree.root] = 1;\r
3795    for(i=0; i<nsplit; i++) {\r
3796       for(j=0; j<i; j++) {\r
3797          if(pptable[(ns+i)*s21+ns+j] || pptable[(ns+j)*s21+ns+i] || ndesc[i] == ndesc[j])\r
3798             continue;\r
3799          for(k=0; k<ns; k++)\r
3800             if(pptable[k*s21+ns+i]==1 && pptable[k*s21+ns+j]==1) break;\r
3801          if(k<ns) {  /* i and j are ancestral to k, and are ancestral to each other. */\r
3802             if(ndesc[i] < ndesc[j])   pptable[(ns+i)*s21+ns+j] = 1;\r
3803             else                      pptable[(ns+j)*s21+ns+i] = 1;\r
3804          }\r
3805       }\r
3806    }\r
3807    if(debug) {\r
3808       printf("\npptable\n");\r
3809       for(i=0; i<s21; i++,FPN(F0))\r
3810          for(j=0; j<s21; j++)\r
3811             printf(" %1d", (int)pptable[i*s21+j]);\r
3812       printf("ndesc: ");\r
3813       for(i=0; i<nsplit; i++) printf(" %2d", ndesc[i]);  FPN(F0);\r
3814    }\r
3815 \r
3816    /* generate tree nodes and labels.  For each nonroot node, youngest ancestor is dad. */\r
3817    for(i=0; i<tree.nnode-1; i++) {\r
3818       minndesc=ns+1;  a=-1;\r
3819       for(j=ns; j<tree.nnode; j++) {\r
3820          if(pptable[i*s21+j]==1 && minndesc>ndesc[j-ns]) \r
3821             { minndesc = ndesc[j-ns];  a=j; }\r
3822       }\r
3823       if(a<0)\r
3824          error2("jgl");\r
3825       nodes[i].father = a;\r
3826       nodes[a].sons[nodes[a].nson++] = i;\r
3827       if(a!=tree.root && label) nodes[a].label = label[a-ns];\r
3828    }\r
3829    if(debug) {\r
3830       printtree(1);\r
3831       OutTreeN(F0,1,0);  FPN(F0);\r
3832    }\r
3833    free(pptable);\r
3834    return(0);\r
3835 }\r
3836 \r
3837 \r
3838 int GetNSfromTreeFile(FILE *ftree, int *ns, int *ntree)\r
3839 {\r
3840 /* This gets the sequence names from the tree file.\r
3841 */\r
3842    char separators[]="(,):#";\r
3843    int inname=0, k, c;\r
3844    double y;\r
3845 \r
3846    *ns = *ntree = -1;\r
3847    k = fscanf(ftree, "%d%d", ns, ntree);\r
3848    if(k==1) { *ntree = *ns; *ns = -1; }\r
3849    else if(k==0) {\r
3850       *ns = 0;\r
3851       while((c = fgetc(ftree)) != ';') {\r
3852          if(c==EOF) return(-1);\r
3853          if(strchr(separators, c)) {\r
3854             if     (c == ':') fscanf(ftree, "%lf", &y);\r
3855             else if(c == '#') fscanf(ftree, "%lf", &y);\r
3856             if(inname) { inname=0; (*ns)++; }\r
3857          }\r
3858          else if(isgraph(c))\r
3859             inname = 1;\r
3860       }\r
3861       rewind(ftree);\r
3862    }\r
3863    return(0);\r
3864 }\r
3865 \r
3866 void CladeSupport (FILE *fout, char treef[], int getSnames, char mastertreef[], int pick1tree)\r
3867 {\r
3868 /* This reads all bootstrap or Bayesian trees from treef to identify the best trees \r
3869    (trees with the highest posterior probabilities), and to construct the majority-rule \r
3870    consensus tree.  The set of the best trees constitute the 95% or 99% credibility set \r
3871    of trees.\r
3872    A tree (ptree) is represented by its splits, ordered lexicographically, and concatenated.\r
3873    It can also read a master tree file and goes through master trees and attach support \r
3874    values on splits on each master tree.\r
3875    split1 if for one tree, and split50 is for the majority-rule consensus tree.\r
3876 */\r
3877    int i,j,k, i1, ntreeM,ntree, itreeM, sizetree, found, lline=1024;\r
3878    int *index, *indexspace, maxnsplits, nsplits=0, s, nsplit1, nsplit50, lsplit, same;\r
3879    int maxnptree, nptree=0, sizeptree;\r
3880    char *split1, *splits=NULL, *splitM=NULL, *split50=NULL, *pM, *ptree, *ptrees=NULL, *pc;\r
3881    double *countsplits=NULL, *countptree=NULL, *Psplit50, *Psame, y, cdf;\r
3882    char pick1treef[32]="pick1tree.tre", line[1024];\r
3883    struct TREEN *nodes_t;\r
3884    FILE *ft, *fM=NULL, *f1tree=NULL;\r
3885    int debug=0;\r
3886 \r
3887    /* Count trees and splits */\r
3888    printf("\nRead tree sample, count trees & splits \n");\r
3889    ft = gfopen(treef, "r");\r
3890 \r
3891    if(getSnames)      /* species ordered as in first tree in file */\r
3892       GetNSfromTreeFile(ft, &com.ns, &k);\r
3893    if(com.ns<3) error2("need >=3 species to justify this much effort.");\r
3894    s=com.ns;  lsplit=s+1;  maxnsplits=maxnptree=s; sizeptree=(s-2)*lsplit;\r
3895    if((split1=(char*)malloc(3*(s-2) * lsplit * sizeof(char))) == NULL)\r
3896       error2("oom splits");\r
3897    ptree = split1 + (s-2)*lsplit;\r
3898    split50 = ptree + (s-2)*lsplit;\r
3899    memset(split1, 0, 3*(s-2) * lsplit * sizeof(char));\r
3900    if((Psplit50=(double*)malloc(s*sizeof(double))) == NULL)\r
3901       error2("oom Psplit50");\r
3902 \r
3903    sizetree=(s*2-1)*sizeof(struct TREEN);\r
3904    if((nodes=(struct TREEN*)malloc(sizetree*2))==NULL) error2("oom");\r
3905    for(i=0; i<s*2-1; i++) nodes[i].nodeStr=NULL;\r
3906    nodes_t = nodes + s*2-1;\r
3907 \r
3908    for(ntree=0;  ; ntree++) {\r
3909       if(nptree+s >= maxnptree) {\r
3910          maxnptree = (int)(maxnptree*(ntree<1000 ? 2 : 1.2));\r
3911          ptrees = (char *)realloc(ptrees, maxnptree*sizeptree);\r
3912          countptree = (double*)realloc(countptree, maxnptree*sizeof(double));\r
3913          if(ptrees==NULL || countptree==NULL) error2("oom ptrees || countptree");\r
3914          memset(ptrees+nptree*sizeptree, 0, (maxnptree-nptree)*sizeptree);\r
3915          memset(countptree+nptree, 0, (maxnptree-nptree)*sizeof(double));\r
3916       }\r
3917       if(nsplits+s >= maxnsplits) {\r
3918          maxnsplits *= 2;\r
3919          splits = (char*)realloc(splits, maxnsplits * lsplit * sizeof(char));\r
3920          countsplits = (double*)realloc(countsplits, maxnsplits*sizeof(double));\r
3921          if(splits==NULL || countsplits==NULL) error2("oom splits realloc");\r
3922       }\r
3923 \r
3924       /* if (getSnames), copy species/sequence names from first tree in file.  */\r
3925       if(ReadTreeN(ft, &i, &j, (getSnames && ntree==0), 1)) break;\r
3926       if(debug || (ntree+1)%5000==0) {\r
3927          printf("\rtree %5d  ", ntree+1);\r
3928          if(s<15) OutTreeN(F0, 1, 0);\r
3929       }\r
3930       Tree2Partition(split1);\r
3931       nsplit1 = tree.nnode - s - 1;\r
3932 \r
3933       /* Process the tree */      \r
3934       qsort(split1, nsplit1, lsplit, (int(*)(const void *, const void *))strcmp);\r
3935       if(debug)\r
3936          { for(i=0; i<nsplit1; i++) printf(" %s", split1+i*lsplit);  printf("\n"); }\r
3937       for(i=0,pc=ptree; i<nsplit1; i++) \r
3938          for(j=0; j<s; j++) *pc++ = split1[i*lsplit+j];\r
3939       j = binarysearch(ptree, ptrees, nptree, sizeptree, (int(*)(const void *, const void *))strcmp, &found);\r
3940       if(found)\r
3941          countptree[j]++;\r
3942       else {\r
3943          if(j<nptree) {\r
3944             memmove(ptrees+(j+1)*sizeptree, ptrees+j*sizeptree, (nptree-j)*sizeptree);\r
3945             memmove(countptree+j+1, countptree+j, (nptree-j)*sizeof(double));\r
3946          }\r
3947          memmove(ptrees+j*sizeptree, ptree, sizeptree);\r
3948          nptree++;\r
3949          countptree[j]=1;\r
3950       }\r
3951 \r
3952       /* Process the splits in the tree */\r
3953       for(i=0; i<nsplit1; i++) {  /* going through splits in current tree */\r
3954          j = binarysearch(split1+i*lsplit, splits, nsplits, lsplit, (int(*)(const void *, const void *))strcmp, &found);\r
3955          if(found)  /* found */\r
3956             countsplits[j]++;\r
3957          else {\r
3958             if(j<nsplits) {  /* check the size of the moved block here */\r
3959                memmove(splits+(j+1)*lsplit, splits+j*lsplit, (nsplits-j)*lsplit*sizeof(char));\r
3960                memmove(countsplits+(j+1), countsplits+j, (nsplits-j)*sizeof(double));\r
3961             }\r
3962             memcpy(splits+j*lsplit, split1+i*lsplit, lsplit*sizeof(char));\r
3963             nsplits++;\r
3964             countsplits[j]=1;\r
3965          }\r
3966       }\r
3967       if(debug) {\r
3968          printf("%4d splits: ", nsplits);\r
3969          for(k=0; k<nsplits; k++) printf(" %s (%.0f)", splits+k*lsplit, countsplits[k]);\r
3970          FPN(F0);\r
3971       }\r
3972    }\r
3973    printf("\n%6d trees read, %d distinct trees.\n", ntree, nptree);\r
3974 \r
3975    k = max2(nsplits, nptree);\r
3976    if((index=(int*)malloc(k*2*sizeof(int)))==NULL) error2("oom index");\r
3977    indexspace = index+k;\r
3978 \r
3979    printf("\nSpecies in order:\n");\r
3980    for(i=0; i<s; i++) printf("%2d. %s\n", i+1, com.spname[i]);\r
3981    printf("\n(A) Best trees in the sample (%d distinct trees in all)\n", nptree);\r
3982    fprintf(fout, "\n\nSpecies in order:\n");\r
3983    for(i=0; i<s; i++) fprintf(fout, "%2d. %s\n", i+1, com.spname[i]);\r
3984    fprintf(fout, "\n(A) Best trees in the sample (%d distinct trees in all)\n", nptree);\r
3985 \r
3986    indexing(countptree, nptree, index, 1, indexspace);\r
3987 \r
3988    for(k=0,cdf=0; k<nptree; k++) {\r
3989       j = index[k];  y=countptree[j];\r
3990       for(i=0,pc=split1; i<nsplit1; i++,*pc++='\0') for(i1=0; i1<s; i1++)\r
3991          *pc++ = ptrees[j*sizeptree + i*s + i1];\r
3992       Partition2Tree(split1, lsplit, s, nsplit1, NULL);\r
3993       printf(" %6.0f %8.5f %8.5f  ", y, y/ntree, (cdf+=y/ntree));\r
3994       OutTreeN(F0, 1, 0); \r
3995       /* for(i=0; i<nsplit1; i++) printf(" %s", split1+i*lsplit);  */\r
3996       printf("\n");\r
3997 \r
3998       fprintf(fout, " %6.0f %8.5f %8.5f  ", y, y/ntree, cdf);\r
3999       OutTreeN(fout, 1, 0); \r
4000       /* for(i=0; i<nsplit1; i++) fprintf(fout, " %s", split1+i*lsplit); */  \r
4001       fprintf(fout, "\n");\r
4002 \r
4003       if(cdf > 0.999) break;\r
4004    }\r
4005 \r
4006    printf("\n(B) Best splits in the sample of trees (%d splits in all)\n", nsplits);\r
4007    indexing(countsplits, nsplits, index, 1, indexspace);\r
4008    for(k=0; k<nsplits; k++) {\r
4009       j = index[k];  y=countsplits[j];\r
4010       printf(" %6.0f %9.5f  %s\n", y, y/ntree, splits+j*lsplit);\r
4011       if(y/ntree < 0.001) break;\r
4012    }\r
4013    fprintf(fout, "\n(B) Best splits in the sample of trees (%d splits in all)\n", nsplits);\r
4014    for(k=0; k<nsplits; k++) {\r
4015       j = index[k];  y=countsplits[j];\r
4016       fprintf(fout, " %6.0f %9.5f  %s\n", y, y/ntree, splits+j*lsplit);\r
4017       if(y/ntree < 0.001) break;\r
4018    }\r
4019 \r
4020    /* Majority-rule consensus tree */\r
4021    for(k=0,nsplit50=0; k<nsplits; k++)\r
4022       if(countsplits[k]/ntree >= 0.5) nsplit50++;\r
4023    for(k=0,nsplit50=0; k<nsplits; k++) {\r
4024       if(countsplits[k]/ntree > 0.5) {\r
4025          memmove(split50+nsplit50*lsplit, splits+k*lsplit, lsplit);\r
4026          Psplit50[nsplit50 ++] = countsplits[k]/ntree;\r
4027       }\r
4028    }\r
4029    Partition2Tree(split50, lsplit, s, nsplit50, Psplit50);\r
4030    printf("\n(C) Majority-rule consensus tree\n");\r
4031    OutTreeN(F0, 1, PrLabel);  FPN(F0);\r
4032    fprintf(fout, "\n(C) Majority-rule consensus tree\n");\r
4033    OutTreeN(fout, 1, PrLabel);  FPN(fout);\r
4034 \r
4035    if(mastertreef) fM = fopen(mastertreef, "r");\r
4036    if(fM==NULL) goto CleanUp;\r
4037 \r
4038    fscanf(fM, "%d%d", &i, &ntreeM);\r
4039    if(i!=s || ntreeM<1) error2("<ns> <ntree> on the first line in master tree.");\r
4040 \r
4041    /* Probabilities of trees in the master tree file */\r
4042    splitM = (char*)malloc(ntreeM * (s-2)*lsplit * sizeof(char));\r
4043    Psame = (double*)malloc(ntreeM * sizeof(double));\r
4044    if(splitM==NULL || Psame==NULL) error2("oom splitM");\r
4045    zero(Psame, ntreeM);\r
4046    if(pick1tree>=1 && pick1tree<=ntreeM && (f1tree=(FILE*)fopen(pick1treef,"w"))==NULL)\r
4047       error2("oom");\r
4048    for(itreeM=0,pM=splitM; itreeM<ntreeM; itreeM++,pM+=(s-2)*lsplit) {\r
4049       if(ReadTreeN(fM, &i, &j, 0, 1)) break;\r
4050       if(tree.nnode<s*2-2 || tree.nnode>s*2-1) error2("Master trees have to be binary");\r
4051       Tree2Partition(pM);\r
4052       qsort(pM, tree.nnode-s-1, lsplit, (int(*)(const void *, const void *))strcmp);\r
4053       if(debug) {\r
4054          printf("\nMaster tree %2d: ", itreeM+1);\r
4055          OutTreeN(F0, 1, 0);\r
4056          for(i=0; i<tree.nnode-s-1; i++) printf(" %s", pM+i*lsplit);\r
4057       }\r
4058    }\r
4059    /* read the tree sample again */\r
4060    rewind(ft);\r
4061    for(ntree=0;  ; ntree++) {\r
4062       if(ReadTreeN(ft, &i, &j, 0, 0)) break;\r
4063       fgets(line, lline, ft);\r
4064       Tree2Partition(split1);\r
4065       for(itreeM=0,pM=splitM; itreeM<ntreeM; itreeM++,pM+=(s-2)*lsplit) {\r
4066          for(i=0,same=1; i<tree.nnode-s-1; i++) {\r
4067             if(bsearch(split1+i*lsplit, pM, tree.nnode-s-1, lsplit, (int(*)(const void *, const void *))strcmp) == NULL) \r
4068                { same=0; break; }\r
4069          }\r
4070          if(same) {\r
4071             Psame[itreeM] ++;\r
4072             if(pick1tree-1==itreeM) {\r
4073                OutTreeN(f1tree, 1, 1); fprintf(f1tree, "%s", line);\r
4074             }\r
4075          }\r
4076       }\r
4077    }\r
4078 \r
4079    printf("\n(D) Probabilities of trees in the master tree file\n");\r
4080    fprintf(fout, "\n(D) Probabilities of trees in the master tree file\n");\r
4081    /* read the master trees another round just for printing. */\r
4082    rewind(fM);\r
4083    fscanf(fM, "%d%d", &s, &ntreeM);\r
4084    for(itreeM=0; itreeM<ntreeM; itreeM++) {\r
4085       if(ReadTreeN(fM, &i, &j, 0, 1)) break;\r
4086       Tree2Partition(split1);\r
4087       for(i=s,k=0; i<tree.nnode; i++) {\r
4088          if(i==tree.root) continue;\r
4089          j = binarysearch(split1+k*lsplit, splits, nsplits, lsplit, (int(*)(const void *, const void *))strcmp, &found);\r
4090          if(found) nodes[i].label = countsplits[j]/ntree;\r
4091          k++;\r
4092       }\r
4093       printf(" P = %6.4f  ", Psame[itreeM]/ntree);\r
4094       OutTreeN(F0, 1, PrLabel);  FPN(F0);\r
4095       \r
4096       fprintf(fout, " P = %6.4f  ", Psame[itreeM]/ntree);\r
4097       OutTreeN(fout, 1, PrLabel);  FPN(fout);\r
4098 \r
4099    }\r
4100    if(pick1tree) printf("\ntree #%d collected into %s\n", pick1tree, pick1treef);\r
4101    \r
4102 CleanUp:\r
4103    if(fM) { \r
4104       free(splitM);  free(Psame);  \r
4105       fclose(fM);    if(f1tree) fclose(f1tree);\r
4106    }\r
4107    free(split1);  free(splits);  free(countsplits);  free(Psplit50);  \r
4108    free(ptrees);  free(countptree);  free(index);\r
4109    free(nodes);\r
4110    fclose(ft);\r
4111    exit(0);\r
4112 }\r
4113 \r
4114 #endif\r
4115 \r
4116 \r
4117 int NSameBranch (char partition1[],char partition2[], int nib1,int nib2, int IBsame[])\r
4118 {\r
4119 /* counts the number of correct (identical) bipartitions.\r
4120    nib1 and nib2 are the numbers of interior branches in the two trees\r
4121    correctIB[0,...,(correctbranch-1)] lists the correct interior branches, \r
4122    that is, interior branches in tree 1 that is also in tree 2.\r
4123    IBsame[i]=1 if interior branch i is correct.\r
4124 */\r
4125    int i,j,k=0, nsamebranch;\r
4126 \r
4127 #if(1)\r
4128    for (i=0,nsamebranch=0; i<nib1; i++)\r
4129       for(j=0,IBsame[i]=0; j<nib2; j++) {\r
4130          if(strcmp(partition1+i*(com.ns+1), partition2+j*(com.ns+1)) == 0) {\r
4131             nsamebranch++;  IBsame[i]=1;  break; \r
4132          }\r
4133    }\r
4134 #else\r
4135    for (i=0,nsamebranch=0; i<nib1; i++)\r
4136       for(j=0,IBsame[i]=0; j<nib2; j++) {\r
4137          for (k=0;k<com.ns;k++)\r
4138             if(partition1[i*(com.ns+1)+k] != partition2[j*(com.ns+1)+k]) break;\r
4139          if (k==com.ns) {\r
4140             nsamebranch++;  IBsame[i]=1;  break; \r
4141          }\r
4142    }\r
4143 #endif\r
4144    return (nsamebranch);\r
4145 }\r
4146 \r
4147 \r
4148 int AddSpecies (int is, int ib)\r
4149 {\r
4150 /* Add species (is) to tree at branch ib.  The tree currently has \r
4151    is+1-1 species.  Interior node numbers are increased by 2 to make \r
4152    room for the new nodes.\r
4153    if(com.clock && ib==tree.nbranch), the new species is added as an\r
4154    outgroup to the rooted tree.\r
4155 */\r
4156    int i,j, it;\r
4157 \r
4158    if(ib>tree.nbranch+1 || (ib==tree.nbranch && !com.clock)) return(-1);\r
4159 \r
4160    if(ib==tree.nbranch && com.clock) { \r
4161       FOR(i,tree.nbranch) FOR(j,2)\r
4162          if (tree.branches[i][j]>=is) tree.branches[i][j]+=2;\r
4163       it=tree.root;  if(tree.root>=is) it+=2;\r
4164       FOR(i,2) tree.branches[tree.nbranch+i][0]=tree.root=is+1;\r
4165       tree.branches[tree.nbranch++][1]=it;\r
4166       tree.branches[tree.nbranch++][1]=is;\r
4167    }\r
4168    else {\r
4169       FOR(i,tree.nbranch) FOR(j,2)\r
4170          if (tree.branches[i][j]>=is) tree.branches[i][j]+=2;\r
4171       it=tree.branches[ib][1];\r
4172       tree.branches[ib][1]=is+1;\r
4173       tree.branches[tree.nbranch][0]=is+1;\r
4174       tree.branches[tree.nbranch++][1]=it;\r
4175       tree.branches[tree.nbranch][0]=is+1;\r
4176       tree.branches[tree.nbranch++][1]=is;\r
4177       if (tree.root>=is) tree.root+=2;\r
4178    }\r
4179    BranchToNode ();\r
4180    return (0);\r
4181 }\r
4182 \r
4183 \r
4184 #ifdef TREESEARCH\r
4185 \r
4186 static struct TREE\r
4187   {struct TREEB tree; struct TREEN nodes[2*NS-1]; double x[NP]; } \r
4188   treebest, treestar;\r
4189 /*\r
4190 static struct TREE \r
4191   {struct TREEB tree; struct TREEN nodes[2*NS-1];} treestar;\r
4192 */\r
4193 \r
4194 int Perturbation(FILE* fout, int initialMP, double space[]);\r
4195 \r
4196 int Perturbation(FILE* fout, int initialMP, double space[])\r
4197 {\r
4198 /* heuristic tree search by the NNI tree perturbation algorithm.  \r
4199    Some trees are evaluated multiple times as no trees are kept.\r
4200    This needs more work.\r
4201 */\r
4202    int step=0, ntree=0, nmove=0, improve=0, ineighb, i,j;\r
4203    int sizetree=(2*com.ns-1)*sizeof(struct TREEN);\r
4204    double *x=treestar.x;\r
4205    FILE *ftree;\r
4206 \r
4207    if(com.clock) error2("\n\aerr: pertubation does not work with a clock yet.\n");\r
4208    if(initialMP&&!com.cleandata)\r
4209       error2("\ncannot get initial parsimony tree for gapped data yet.");\r
4210 \r
4211    fprintf(fout, "\n\nHeuristic tree search by NNI perturbation\n");\r
4212    if (initialMP) {\r
4213       if (noisy) printf("\nInitial tree from stepwise addition with MP:\n");\r
4214       fprintf(fout, "\nInitial tree from stepwise addition with MP:\n");\r
4215       StepwiseAdditionMP (space);\r
4216    }\r
4217    else {\r
4218       if (noisy) printf ("\nInitial tree read from file %s:\n", com.treef);\r
4219       fprintf(fout, "\nInitial tree read from file.\n");\r
4220       if ((ftree=fopen (com.treef,"r"))==NULL) error2("treefile not exist?");\r
4221       fscanf (ftree, "%d%d", &i, &ntree);\r
4222       if (i!=com.ns) error2("ns in the tree file");\r
4223       if(ReadTreeN(ftree, &i, &j, 0, 1)) error2("err tree..");\r
4224       fclose(ftree);\r
4225    }\r
4226    if (noisy) { FPN (F0);  OutTreeN(F0,0,0);  FPN(F0); }\r
4227    tree.lnL=TreeScore(x, space);\r
4228    if (noisy) { OutTreeN(F0,0,1);  printf("\n lnL = %.4f\n",-tree.lnL); }\r
4229    OutTreeN(fout,1,1);  fprintf(fout, "\n lnL = %.4f\n",-tree.lnL);\r
4230    if (com.np>com.ntime) {\r
4231       fprintf(fout, "\tparameters:"); \r
4232       for(i=com.ntime; i<com.np; i++) fprintf(fout, "%9.5f", x[i]);\r
4233       FPN(fout);\r
4234    }\r
4235    fflush(fout);\r
4236    treebest.tree=tree;  memcpy(treebest.nodes, nodes, sizetree);\r
4237 \r
4238    for (step=0; ; step++) {\r
4239       for (ineighb=0,improve=0; ineighb<(tree.nbranch-com.ns)*2; ineighb++) {\r
4240          tree=treebest.tree; memcpy (nodes, treebest.nodes, sizetree);\r
4241          NeighborNNI (ineighb);\r
4242          if(noisy) {\r
4243             printf("\nTrying tree # %d (%d move[s]) \n", ++ntree,nmove);\r
4244             OutTreeN(F0,0,0);  FPN(F0);\r
4245          }\r
4246          tree.lnL=TreeScore(x, space);\r
4247          if (noisy) { OutTreeN(F0,1,1); printf("\n lnL = %.4f\n",-tree.lnL);}\r
4248          if (noisy && com.np>com.ntime) {\r
4249             printf("\tparameters:"); \r
4250             for(i=com.ntime; i<com.np; i++) printf("%9.5f", x[i]);\r
4251             FPN(F0);\r
4252          }\r
4253          if (tree.lnL<=treebest.tree.lnL) {\r
4254             treebest.tree=tree;  memcpy (treebest.nodes, nodes, sizetree);\r
4255             improve=1; nmove++;\r
4256             if (noisy) printf(" moving to this tree\n");\r
4257             if (fout) {\r
4258                fprintf(fout, "\nA better tree:\n");\r
4259                OutTreeN(fout,0,0); FPN(fout); OutTreeN(fout,1,1); FPN(fout); \r
4260                fprintf(fout, "\nlnL = %.4f\n", tree.lnL);\r
4261                if (com.np>com.ntime) {\r
4262                   fprintf(fout,"\tparameters:"); \r
4263                   for(i=com.ntime; i<com.np; i++) fprintf(fout,"%9.5f", x[i]);\r
4264                   FPN(fout);\r
4265                }\r
4266                fflush(fout);\r
4267           }\r
4268          }\r
4269       }\r
4270       if (!improve) break;\r
4271    }\r
4272    tree=treebest.tree;  memcpy (nodes, treebest.nodes, sizetree);\r
4273    if (noisy) {\r
4274       printf("\n\nBest tree found:\n");\r
4275       OutTreeN(F0,0,0);  FPN(F0);  OutTreeN(F0,1,1);  FPN(F0); \r
4276       printf("\nlnL = %.4f\n", tree.lnL);\r
4277    }\r
4278    if (fout) {\r
4279       fprintf(fout, "\n\nBest tree found:\n");\r
4280       OutTreeN(fout,0,0);  FPN(fout);  OutTreeN(fout,1,1);  FPN(fout); \r
4281       fprintf(fout, "\nlnL = %.4f\n", tree.lnL);\r
4282    }\r
4283    return (0);\r
4284 }\r
4285 \r
4286 \r
4287 static int *_U0, *_step0, _mnnode;\r
4288 /* up pass characters and changes for the star tree: each of size npatt*nnode*/\r
4289 \r
4290 int StepwiseAdditionMP (double space[])\r
4291 {\r
4292 /* tree search by species addition.\r
4293 */\r
4294    char *z0[NS];\r
4295    int  ns0=com.ns, is, i,j,h, tiestep=0,tie,bestbranch=0;\r
4296    int sizetree=(2*com.ns-1)*sizeof(struct TREEN);\r
4297    double bestscore=0,score;\r
4298 \r
4299    _mnnode=com.ns*2-1;\r
4300    _U0=(int*)malloc(com.npatt*_mnnode*sizeof(int));\r
4301    _step0=(int*)malloc(com.npatt*_mnnode*sizeof(int));\r
4302    if (noisy>2) \r
4303      printf("\n%9ld bytes for MP (U0 & N0)\n", 2*com.npatt*_mnnode*sizeof(int));\r
4304    if (_U0==NULL || _step0==NULL) error2("oom U0&step0");\r
4305 \r
4306    FOR (i,ns0)  z0[i]=com.z[i];\r
4307    tree.nbranch=tree.root=com.ns=3;\r
4308    FOR (i, tree.nbranch) { tree.branches[i][0]=com.ns; tree.branches[i][1]=i; }\r
4309    BranchToNode ();\r
4310    FOR (h, com.npatt)\r
4311       FOR (i,com.ns)\r
4312         { _U0[h*_mnnode+i]=1<<(com.z[i][h]-1); _step0[h*_mnnode+i]=0; }\r
4313    for (is=com.ns,tie=0; is<ns0; is++) {\r
4314       treestar.tree=tree;  memcpy (treestar.nodes, nodes, sizetree);\r
4315 \r
4316       for (j=0; j<treestar.tree.nbranch; j++,com.ns--) {\r
4317          tree=treestar.tree;  memcpy (nodes, treestar.nodes, sizetree);\r
4318          com.ns++;\r
4319          AddSpecies (is, j);\r
4320          score=MPScoreStepwiseAddition(is, space, 0);\r
4321 /*\r
4322 OutTreeN(F0, 0, 0); \r
4323 printf(" Add sp %d (ns=%d) at branch %d, score %.0f\n", is+1,com.ns,j+1,score);\r
4324 */\r
4325          if (j && score==bestscore) tiestep=1;\r
4326          if (j==0 || score<bestscore || (score==bestscore&&rndu()<.1)) {\r
4327             tiestep=0;\r
4328             bestscore=score; bestbranch=j;\r
4329          }\r
4330       }\r
4331       tie+=tiestep;\r
4332       tree=treestar.tree;  memcpy (nodes, treestar.nodes, sizetree);\r
4333       com.ns=is+1;\r
4334       AddSpecies (is, bestbranch);\r
4335       score=MPScoreStepwiseAddition(is, space, 1);\r
4336 \r
4337       if (noisy)\r
4338        { printf("\r  Added %d [%5.0f steps]",is+1,-bestscore); fflush(F0);}\r
4339    }\r
4340    if (noisy>2) printf("  %d stages with ties, ", tie);\r
4341    tree.lnL=bestscore;\r
4342    free(_U0); free(_step0);\r
4343    return (0);\r
4344 }\r
4345 \r
4346 double MPScoreStepwiseAddition (int is, double space[], int save)\r
4347 {\r
4348 /* this changes only the part of the tree affected by the newly added \r
4349    species is.\r
4350    save=1 for the best tree, so that _U0 & _step0 are updated\r
4351 */\r
4352    int *U,*N,U3[3], h,ist, i,father,son2,*pU0=_U0,*pN0=_step0;\r
4353    double score;\r
4354 \r
4355    U=(int*)space;  N=U+_mnnode;\r
4356    for (h=0,score=0; h<com.npatt; h++,pU0+=_mnnode,pN0+=_mnnode) {\r
4357       FOR (i, tree.nnode) { U[i]=pU0[i-2*(i>=is)]; N[i]=pN0[i-2*(i>=is)]; }\r
4358       U[is]=1<<(com.z[is][h]-1);  N[is]=0;\r
4359       for (ist=is; (father=nodes[ist].father)!=tree.root; ist=father) {\r
4360          if ((son2=nodes[father].sons[0])==ist)  son2=nodes[father].sons[1];\r
4361          N[father]=N[ist]+N[son2];\r
4362          if ((U[father]=U[ist]&U[son2])==0)\r
4363             { U[father]=U[ist]|U[son2];  N[father]++; }\r
4364       }\r
4365       FOR (i,3) U3[i]=U[nodes[tree.root].sons[i]];\r
4366       N[tree.root]=2;\r
4367       if (U3[0]&U3[1]&U3[2]) N[tree.root]=0;\r
4368       else if (U3[0]&U3[1] || U3[1]&U3[2] || U3[0]&U3[2]) N[tree.root]=1;\r
4369       FOR(i,3) N[tree.root]+=N[nodes[tree.root].sons[i]];\r
4370 \r
4371       if (save) {\r
4372          memcpy (pU0, U, tree.nnode*sizeof(int));\r
4373          memcpy (pN0, N, tree.nnode*sizeof(int));\r
4374       }\r
4375       score+=N[tree.root]*com.fpatt[h];\r
4376    }\r
4377    return (score);\r
4378 }\r
4379 \r
4380 \r
4381 double TreeScore(double x[], double space[])\r
4382 {\r
4383    static int fromfile=0;\r
4384    int i;\r
4385    double xb[NP][2], e=1e-9, lnL=0;\r
4386 \r
4387    if(com.clock==2) error2("local clock in TreeScore");\r
4388    com.ntime = com.clock ? tree.nnode-com.ns : tree.nbranch;\r
4389 \r
4390    GetInitials(x, &i);  /* this shoulbe be improved??? */\r
4391    if(i) fromfile=1;\r
4392    PointconPnodes();\r
4393 \r
4394    if(com.method==0 || !fromfile) SetxBound(com.np, xb);\r
4395 \r
4396    if(fromfile) {\r
4397       lnL = com.plfun(x,com.np);\r
4398       com.np = com.ntime;\r
4399    }\r
4400    NFunCall=0;\r
4401    if(com.method==0 || com.ntime==0)\r
4402       ming2(NULL,&lnL,com.plfun,NULL,x,xb, space,e,com.np);\r
4403    else\r
4404       minB(NULL, &lnL, x, xb, e, space);\r
4405 \r
4406    return(lnL);\r
4407 }\r
4408 \r
4409 \r
4410 int StepwiseAddition (FILE* fout, double space[])\r
4411 {\r
4412 /* heuristic tree search by species addition.  Species are added in the order \r
4413    of occurrence in the data.\r
4414    Try to get good initial values.\r
4415 */\r
4416    char *z0[NS], *spname0[NS];\r
4417    int ns0=com.ns, is, i,j, bestbranch=0, randadd=0, order[NS];\r
4418    int sizetree=(2*com.ns-1)*sizeof(struct TREEN);\r
4419    double bestscore=0,score, *x=treestar.x;\r
4420 \r
4421    if(com.ns>50) printf("if this crashes, increase com.sspace?");\r
4422 \r
4423    if(com.ns<3) error2("2 sequences, no need for tree search");\r
4424    if (noisy) printf("\n\nHeuristic tree search by stepwise addition\n");\r
4425    if (fout) fprintf(fout, "\n\nHeuristic tree search by stepwise addition\n");\r
4426    FOR (i,ns0)  { z0[i]=com.z[i]; spname0[i]=com.spname[i]; }\r
4427    tree.nbranch=tree.root=com.ns=(com.clock?2:3);  \r
4428 \r
4429    FOR(i,ns0) order[i]=i;\r
4430    if(randadd) {\r
4431       FOR(i,ns0)\r
4432          { j=(int)(ns0*rndu()); is=order[i]; order[i]=order[j]; order[j]=is; }\r
4433       if(noisy) FOR(i,ns0) printf(" %d", order[i]+1);\r
4434       if(fout) { \r
4435          fputs("\nOrder of species addition:\n",fout); \r
4436          FOR(i,ns0)fprintf(fout,"%3d  %-s\n", order[i]+1,com.spname[order[i]]);\r
4437       }\r
4438       for(i=0; i<ns0; i++) { \r
4439          com.z[i]=z0[order[i]]; \r
4440          com.spname[i]=spname0[order[i]]; \r
4441       }\r
4442    }\r
4443 \r
4444    for(i=0; i<tree.nbranch; i++) {\r
4445       tree.branches[i][0]=com.ns; tree.branches[i][1]=i; \r
4446    }\r
4447    BranchToNode ();\r
4448    for (is=com.ns; is<ns0; is++) {                  /* add the is_th species */\r
4449       treestar.tree=tree;  memcpy (treestar.nodes, nodes, sizetree);\r
4450 \r
4451       for (j=0; j<treestar.tree.nbranch+(com.clock>0); j++,com.ns--) { \r
4452          tree=treestar.tree;  memcpy(nodes, treestar.nodes, sizetree);\r
4453          com.ns++;\r
4454          AddSpecies(is,j);\r
4455          score=TreeScore(x, space);\r
4456          if (noisy>1)\r
4457             { printf("\n "); OutTreeN(F0, 0, 0); printf("%12.3f",-score); }\r
4458 \r
4459          if (j==0 || score<bestscore || (score==bestscore&&rndu()<.2)) {\r
4460             treebest.tree=tree;  memcpy(treebest.nodes, nodes, sizetree);\r
4461             xtoy (x, treebest.x, com.np);\r
4462             bestscore=score; bestbranch=j;\r
4463          }\r
4464       }\r
4465       tree=treebest.tree;  memcpy(nodes,treebest.nodes, sizetree);\r
4466       xtoy (treebest.x, x, com.np);\r
4467       com.ns=is+1;\r
4468 \r
4469       if (noisy) {\r
4470          printf("\n\nAdded sp. %d, %s [%.3f]\n",is+1,com.spname[is],-bestscore);\r
4471          OutTreeN(F0,0,0);  FPN(F0);  OutTreeN(F0,1,0);  FPN(F0);\r
4472          if (com.np>com.ntime) {\r
4473             printf("\tparameters:"); \r
4474             for(i=com.ntime; i<com.np; i++) printf("%9.5f", x[i]);\r
4475             FPN(F0);\r
4476          }\r
4477       }\r
4478       if (fout) {\r
4479          fprintf(fout,"\n\nAdded sp. %d, %s [%.3f]\n",\r
4480                  is+1, com.spname[is], -bestscore);\r
4481          OutTreeN(fout,0,0); FPN(fout);\r
4482          OutTreeN(fout,1,1); FPN(fout);\r
4483          if (com.np>com.ntime) {\r
4484             fprintf(fout, "\tparameters:"); \r
4485             for(i=com.ntime; i<com.np; i++) fprintf(fout, "%9.5f", x[i]);\r
4486             FPN(fout);\r
4487          }\r
4488          fflush(fout);\r
4489       }\r
4490    }\r
4491    tree.lnL=bestscore;\r
4492 \r
4493    return (0);\r
4494 }\r
4495 \r
4496 \r
4497 int DecompTree (int inode, int ison1, int ison2);\r
4498 #define hdID(i,j) (max2(i,j)*(max2(i,j)-1)/2+min2(i,j))\r
4499 \r
4500 int StarDecomposition (FILE *fout, double space[])\r
4501 {\r
4502 /* automatic tree search by star decomposition, nhomo<=1\r
4503    returns (0,1,2,3) for the 4s problem.\r
4504 */\r
4505    int status=0,stage=0, i,j, itree,ntree=0,ntreet,best=0,improve=1,collaps=0;\r
4506    int inode, nson=0, ison1,ison2, son1, son2;\r
4507    int sizetree=(2*com.ns-1)*sizeof(struct TREEN);\r
4508    double x[NP];\r
4509    FILE *ftree, *fsum=frst;\r
4510 \r
4511    if (com.runmode==1) {   /* read the star-like tree from tree file */\r
4512       if ((ftree=fopen (com.treef,"r"))==NULL)\r
4513          error2("no treefile");\r
4514       fscanf (ftree, "%d%d", &i, &ntree);\r
4515       if (ReadTreeN(ftree, &i, &j, 0, 1)) error2("err tree file");\r
4516       fclose (ftree);\r
4517    }\r
4518    else {                  /* construct the star tree of ns species */\r
4519       tree.nnode = (tree.nbranch=tree.root=com.ns)+1;\r
4520       for (i=0; i<tree.nbranch; i++)\r
4521          { tree.branches[i][0]=com.ns; tree.branches[i][1]=i; }\r
4522       com.ntime = com.clock?1:tree.nbranch;\r
4523       BranchToNode ();\r
4524    }\r
4525    if (noisy) { printf("\n\nstage 0: ");       OutTreeN(F0,0,0); }\r
4526    if (fsum) { fprintf(fsum,"\n\nstage 0: ");  OutTreeN(fsum,0,0); }\r
4527    if (fout) { fprintf(fout,"\n\nstage 0: ");  OutTreeN(fout,0,0); }\r
4528 \r
4529    tree.lnL=TreeScore(x,space);\r
4530 \r
4531    if (noisy)  printf("\nlnL:%14.6f%6d", -tree.lnL, NFunCall);\r
4532    if (fsum) fprintf(fsum,"\nlnL:%14.6f%6d", -tree.lnL, NFunCall);\r
4533    if (fout) {\r
4534       fprintf(fout,"\nlnL(ntime:%3d  np:%3d):%14.6f\n",\r
4535          com.ntime, com.np, -tree.lnL);\r
4536       OutTreeB (fout);  FPN(fout);\r
4537       FOR (i, com.np) fprintf (fout,"%9.5f", x[i]);  FPN (fout);\r
4538    }\r
4539    treebest.tree=tree;  memcpy(treebest.nodes,nodes,sizetree);\r
4540    FOR (i,com.np) treebest.x[i]=x[i];\r
4541    for (ntree=0,stage=1; ; stage++) {\r
4542       for (inode=treebest.tree.nnode-1; inode>=0; inode--) {\r
4543          nson=treebest.nodes[inode].nson;\r
4544          if (nson>3) break;\r
4545          if (com.clock) { if (nson>2) break; }\r
4546          else if (nson>2+(inode==treebest.tree.root)) break;\r
4547       }\r
4548       if (inode==-1 || /*stage>com.ns-3+com.clock ||*/ !improve) { /* end */\r
4549          tree=treebest.tree;  memcpy (nodes, treebest.nodes, sizetree);\r
4550 \r
4551          if (noisy) {\r
4552             printf("\n\nbest tree: ");  OutTreeN(F0,0,0);\r
4553             printf("   lnL:%14.6f\n", -tree.lnL);\r
4554          }\r
4555          if (fsum) {\r
4556             fprintf(fsum, "\n\nbest tree: ");  OutTreeN(fsum,0,0);\r
4557             fprintf(fsum, "   lnL:%14.6f\n", -tree.lnL);\r
4558          }\r
4559          if (fout) {\r
4560             fprintf(fout, "\n\nbest tree: ");  OutTreeN(fout,0,0);\r
4561             fprintf(fout, "   lnL:%14.6f\n", -tree.lnL);\r
4562             OutTreeN(fout,1,1);  FPN(fout);\r
4563          }\r
4564          break;\r
4565       }\r
4566       treestar=treebest;  memcpy(nodes,treestar.nodes,sizetree);\r
4567 \r
4568       if (collaps && stage) { \r
4569          printf ("\ncollapsing nodes\n");\r
4570          OutTreeN(F0, 1, 1);  FPN(F0);\r
4571 \r
4572          tree=treestar.tree;  memcpy(nodes, treestar.nodes, sizetree);\r
4573          for (i=com.ns,j=0; i<tree.nnode; i++)\r
4574             if (i!=tree.root && nodes[i].branch<1e-7) \r
4575                { CollapsNode (i, treestar.x);  j++; }\r
4576          treestar.tree=tree;  memcpy(treestar.nodes, nodes, sizetree);\r
4577 \r
4578          if (j)  { \r
4579             fprintf (fout, "\n%d node(s) collapsed\n", j);\r
4580             OutTreeN(fout, 1, 1);  FPN(fout);\r
4581          }\r
4582          if (noisy) {\r
4583             printf ("\n%d node(s) collapsed\n", j);\r
4584             OutTreeN(F0, 1, 1);  FPN(F0);\r
4585 /*            if (j) getchar (); */\r
4586          }\r
4587       }\r
4588 \r
4589       ntreet = nson*(nson-1)/2;\r
4590       if (!com.clock && inode==treestar.tree.root && nson==4)  ntreet=3;\r
4591       com.ntime++;  com.np++;\r
4592 \r
4593       if (noisy) {\r
4594          printf ("\n\nstage %d:%6d trees, ntime:%3d  np:%3d\nstar tree: ",\r
4595             stage, ntreet, com.ntime, com.np);\r
4596          OutTreeN(F0, 0, 0);\r
4597          printf ("  lnL:%10.3f\n", -treestar.tree.lnL);\r
4598       }\r
4599       if (fsum) {\r
4600        fprintf (fsum, "\n\nstage %d:%6d trees, ntime:%3d  np:%3d\nstar tree: ",\r
4601          stage, ntreet, com.ntime, com.np);\r
4602          OutTreeN(fsum, 0, 0);\r
4603          fprintf (fsum, "  lnL:%10.6f\n", -treestar.tree.lnL);\r
4604       }\r
4605       if (fout) {\r
4606          fprintf (fout,"\n\nstage %d:%6d trees\nstar tree: ", stage, ntreet);\r
4607          OutTreeN(fout, 0, 0);\r
4608          fprintf (fout, " lnL:%14.6f\n", -treestar.tree.lnL);\r
4609          OutTreeN(fout, 1, 1);  FPN (fout);\r
4610       }\r
4611 \r
4612       for (ison1=0,itree=improve=0; ison1<nson; ison1++)\r
4613       for (ison2=ison1+1; ison2<nson&&itree<ntreet; ison2++,itree++,ntree++) {\r
4614          DecompTree (inode, ison1, ison2);\r
4615          son1=nodes[tree.nnode-1].sons[0];\r
4616          son2=nodes[tree.nnode-1].sons[1];\r
4617 \r
4618          for(i=com.np-1; i>0; i--)  x[i]=treestar.x[i-1];\r
4619          if (!com.clock)\r
4620             for (i=0; i<tree.nbranch; i++)\r
4621                x[i]=max2(nodes[tree.branches[i][1]].branch*0.99, 0.0001);\r
4622          else\r
4623             for (i=1,x[0]=max2(x[0],.01); i<com.ntime; i++)  x[i]=.5;\r
4624 \r
4625          if (noisy) {\r
4626             printf("\nS=%d:%3d/%d  T=%4d  ", stage,itree+1,ntreet,ntree+1);\r
4627             OutTreeN(F0, 0, 0);\r
4628          }\r
4629          if (fsum) {\r
4630          fprintf(fsum, "\nS=%d:%3d/%d  T=%4d  ", stage,itree+1,ntreet,ntree+1);\r
4631             OutTreeN(fsum, 0, 0);\r
4632          }\r
4633          if (fout) {\r
4634            fprintf(fout,"\nS=%d:%4d/%4d  T=%4d ",stage,itree+1,ntreet,ntree+1);\r
4635            OutTreeN(fout, 0, 0);\r
4636          }\r
4637          tree.lnL=TreeScore(x, space);\r
4638 \r
4639          if (tree.lnL<treebest.tree.lnL) {\r
4640             treebest.tree=tree;  memcpy (treebest.nodes, nodes, sizetree);\r
4641             FOR(i,com.np) treebest.x[i]=x[i];\r
4642             best=itree+1;   improve=1;\r
4643          }\r
4644          if (noisy) \r
4645             printf("%6d%2c %+8.6f", NFunCall,(status?'?':'X'),treestar.tree.lnL-tree.lnL);\r
4646          if (fsum) {\r
4647             fprintf(fsum, "%6d%2c", NFunCall, (status?'?':'X'));\r
4648             for (i=com.ntime; i<com.np; i++)  fprintf(fsum, "%7.3f", x[i]);\r
4649             fprintf(fsum, " %+8.6f", treestar.tree.lnL-tree.lnL);\r
4650             fflush(fsum);\r
4651          }\r
4652          if (fout) {\r
4653             fprintf(fout,"\nlnL(ntime:%3d  np:%3d):%14.6f\n",\r
4654                          com.ntime, com.np, -tree.lnL);\r
4655             OutTreeB (fout);   FPN(fout);\r
4656             FOR (i,com.np) fprintf(fout,"%9.5f", x[i]); \r
4657             FPN(fout); fflush(fout);\r
4658          }\r
4659       }  /* for (itree) */\r
4660       son1=treebest.nodes[tree.nnode-1].sons[0];\r
4661       son2=treebest.nodes[tree.nnode-1].sons[1];\r
4662    }    /* for (stage) */\r
4663 \r
4664    if (com.ns<=4 && !improve && best) error2("strange");\r
4665 \r
4666    if (com.ns<=4) return (best);\r
4667    else return (0);\r
4668 }\r
4669 \r
4670 int DecompTree (int inode, int ison1, int ison2)\r
4671 {\r
4672 /* decompose treestar at NODE inode into tree and nodes[]\r
4673 */\r
4674    int i, son1, son2;\r
4675    int sizetree=(2*com.ns-1)*sizeof(struct TREEN);\r
4676    double bt, fmid=0.001, fclock=0.0001;\r
4677 \r
4678    tree=treestar.tree;  memcpy (nodes, treestar.nodes, sizetree);\r
4679    for (i=0,bt=0; i<tree.nnode; i++)\r
4680       if (i!=tree.root) bt+=nodes[i].branch/tree.nbranch;\r
4681 \r
4682    nodes[tree.nnode].nson=2;\r
4683    nodes[tree.nnode].sons[0]=son1=nodes[inode].sons[ison1];\r
4684    nodes[tree.nnode].sons[1]=son2=nodes[inode].sons[ison2];\r
4685    nodes[tree.nnode].father=inode;\r
4686    nodes[son1].father=nodes[son2].father=tree.nnode;\r
4687 \r
4688    nodes[inode].sons[ison1]=tree.nnode;\r
4689    for (i=ison2; i<nodes[inode].nson; i++)\r
4690       nodes[inode].sons[i]=nodes[inode].sons[i+1];\r
4691    nodes[inode].nson--;\r
4692 \r
4693    tree.nnode++;\r
4694    NodeToBranch();\r
4695    if (!com.clock)\r
4696       nodes[tree.nnode-1].branch=bt*fmid;\r
4697    else\r
4698       nodes[tree.nnode-1].age=nodes[inode].age*(1-fclock);\r
4699 \r
4700    return(0);\r
4701 }\r
4702 \r
4703 \r
4704 #ifdef REALSEQUENCE\r
4705 \r
4706 \r
4707 int MultipleGenes (FILE* fout, FILE*fpair[], double space[])\r
4708 {\r
4709 /* This does the separate analysis of multiple-gene data.\r
4710    Note that com.pose[] is not correct and so RateAncestor = 0 should be set\r
4711    in baseml and codeml.\r
4712 */\r
4713    int ig=0, j, ngene0, npatt0, lgene0[NGENE], posG0[NGENE+1];\r
4714    int nb = ((com.seqtype==1 && !com.cleandata) ? 3 : 1);\r
4715    \r
4716    if(com.ndata>1) error2("multiple data sets & multiple genes?");\r
4717 \r
4718    ngene0=com.ngene;  npatt0=com.npatt;\r
4719    for(ig=0; ig<ngene0; ig++)   lgene0[ig]=com.lgene[ig];\r
4720    for(ig=0; ig<ngene0+1; ig++) posG0[ig]=com.posG[ig];\r
4721 \r
4722    ig=0;\r
4723 /*\r
4724    printf("\nStart from gene (1-%d)? ", com.ngene);\r
4725    scanf("%d", &ig); \r
4726    ig--;\r
4727 */\r
4728 \r
4729    for ( ; ig<ngene0; ig++) {\r
4730 \r
4731       com.ngene=1;\r
4732       com.ls=com.lgene[0]= ig==0?lgene0[0]:lgene0[ig]-lgene0[ig-1];\r
4733       com.npatt =  ig==ngene0-1 ? npatt0-posG0[ig] : posG0[ig+1]-posG0[ig];\r
4734       com.posG[0]=0;  com.posG[1]=com.npatt;\r
4735       FOR (j,com.ns) com.z[j]+=posG0[ig]*nb;   com.fpatt+=posG0[ig];\r
4736       xtoy (com.piG[ig], com.pi, com.ncode);\r
4737 \r
4738       printf ("\n\nGene %2d  ls:%4d  npatt:%4d\n",ig+1,com.ls,com.npatt);\r
4739       fprintf(fout,"\nGene %2d  ls:%4d  npatt:%4d\n",ig+1,com.ls,com.npatt);\r
4740       fprintf(frst,"\nGene %2d  ls:%4d  npatt:%4d\n",ig+1,com.ls,com.npatt);\r
4741       fprintf(frst1,"%d\t%d\t%d",ig+1,com.ls,com.npatt);\r
4742 \r
4743 #ifdef CODEML\r
4744       if(com.seqtype==CODONseq) {\r
4745          DistanceMatNG86(fout,fpair[0],fpair[1],fpair[2],0);\r
4746          if(com.codonf>=F1x4MG) com.pf3x4 = com.f3x4[ig];\r
4747       }\r
4748 #else\r
4749       if(com.fix_alpha)\r
4750          DistanceMatNuc(fout,fpair[0],com.model,com.alpha);\r
4751 #endif\r
4752 \r
4753       if (com.runmode==0)  Forestry(fout);\r
4754 #ifdef CODEML\r
4755       else if (com.runmode==-2) {\r
4756          if(com.seqtype==CODONseq) PairwiseCodon(fout,fpair[3],fpair[4],fpair[5],space);\r
4757          else                      PairwiseAA(fout,fpair[0]);\r
4758       }\r
4759 #endif\r
4760       else                         StepwiseAddition(fout, space);\r
4761 \r
4762       for(j=0; j<com.ns; j++) com.z[j] -= posG0[ig]*nb;\r
4763       com.fpatt -= posG0[ig];\r
4764       FPN(frst1);\r
4765    }\r
4766    com.ngene = ngene0;\r
4767    com.npatt = npatt0;\r
4768    com.ls = lgene0[ngene0-1];\r
4769    for(ig=0; ig<ngene0; ig++)\r
4770       com.lgene[ig] = lgene0[ig];\r
4771    for(ig=0; ig<ngene0+1; ig++)\r
4772       com.posG[ig] = posG0[ig];\r
4773    return (0);\r
4774 }\r
4775 \r
4776 void printSeqsMgenes (void)\r
4777 {\r
4778 /* separate sites from different partitions (genes) into different files.\r
4779    called before sequences are coded.\r
4780    Note that this is called before PatternWeight and so posec or posei is used\r
4781    and com.pose is not yet allocated.\r
4782    In case of codons, com.ls is the number of codons.\r
4783 */\r
4784    FILE *fseq;\r
4785    char seqf[20];\r
4786    int ig, lg, i,j,h;\r
4787    int n31=(com.seqtype==CODONseq||com.seqtype==CODON2AAseq?3:1);\r
4788 \r
4789    puts("Separating sites in genes into different files.\n");\r
4790    for (ig=0, FPN(F0); ig<com.ngene; ig++) {\r
4791       for (h=0,lg=0; h<com.ls; h++)\r
4792          if(com.pose[h]==ig)\r
4793             lg++;\r
4794       sprintf(seqf, "Gene%d.seq", ig+1);\r
4795       if((fseq=fopen(seqf,"w"))==NULL) error2("file creation err.");\r
4796       printf("%d sites in gene %d go to file %s\n", lg, ig+1,seqf);\r
4797 \r
4798       fprintf (fseq, "%8d%8d\n", com.ns, lg*n31);\r
4799       for (j=0; j<com.ns; j++) {\r
4800 \r
4801          /* fprintf(fseq,"*\n>\n%s\n", com.spname[j]); */\r
4802          fprintf(fseq,"%-20s  ", com.spname[j]);\r
4803          if (n31==1)  {       /* nucleotide or aa sequences */\r
4804             FOR (h,com.ls)\r
4805                        if(com.pose[h]==ig)\r
4806                               fprintf(fseq, "%c", com.z[j][h]);\r
4807          }\r
4808          else {               /* codon sequences */\r
4809             FOR (h,com.ls)\r
4810                if(com.pose[h]==ig) {\r
4811                   FOR (i,3) fprintf(fseq,"%c", com.z[j][h*3+i]);\r
4812                   fputc(' ',fseq);\r
4813                }\r
4814          }\r
4815          FPN(fseq);\r
4816       }\r
4817       fclose (fseq);\r
4818    }\r
4819    return ;\r
4820 }\r
4821 \r
4822 void printSeqsMgenes2 (void)\r
4823 {\r
4824 /* This print sites from certain genes into one file.\r
4825    called before sequences are coded.\r
4826    In case of codons, com.ls is the number of codons.\r
4827 */\r
4828    FILE *fseq;\r
4829    char seqf[20]="newseqs";\r
4830    int ig, lg, i,j,h;\r
4831    int n31=(com.seqtype==CODONseq||com.seqtype==CODON2AAseq?3:1);\r
4832    \r
4833    int ngenekept=0;\r
4834    char *genenames[44]={"atpa", "atpb", "atpe", "atpf", "atph", "petb", "petg", "psaa",\r
4835 "psab", "psac", "psaj", "psba", "psbb", "psbc", "psbd", "psbe",\r
4836 "psbf", "psbh", "psbi", "psbj", "psbk", "psbl", "psbn", "psbt",\r
4837 "rl14", "rl16", "rl2", "rl20", "rl36", "rpob", "rpoc", "rpod", "rs11",\r
4838 "rs12", "rs14", "rs18", "rs19", "rs2", "rs3", "rs4", "rs7", "rs8",\r
4839 "ycf4", "ycf9"};\r
4840    int wantgene[44]={0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\r
4841                      0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \r
4842                      0, 0, 0, 0};\r
4843 /*\r
4844 for(ig=0,lg=0; ig<com.ngene; ig++) wantgene[ig]=!wantgene[ig];\r
4845 */\r
4846 \r
4847    if(com.ngene!=44) error2("ngene!=44");\r
4848    FOR(h,com.ls) { \r
4849       printf("%3d",com.pose[h]); \r
4850       if((h+1)%20==0) FPN(F0); if((h+1)%500==0) getchar();\r
4851    }\r
4852    matIout(F0,com.lgene,1,com.ngene);\r
4853    matIout(F0,wantgene,1,com.ngene);\r
4854 \r
4855    for(ig=0,lg=0; ig<com.ngene; ig++) \r
4856       if(wantgene[ig]) { ngenekept++; lg+=com.lgene[ig]; }\r
4857 \r
4858    if((fseq=fopen(seqf,"w"))==NULL) error2("file creation err.");\r
4859    fprintf(fseq,"%4d %4d  G\nG  %d  ", com.ns, lg*n31, ngenekept);\r
4860    FOR(ig,com.ngene) if(wantgene[ig]) fprintf(fseq," %3d", com.lgene[ig]);\r
4861    FPN(fseq);\r
4862 \r
4863    for (j=0; j<com.ns; FPN(fseq),j++) {\r
4864       fprintf(fseq,"%-20s  ", com.spname[j]);\r
4865       if (n31==1)  {       /* nucleotide or aa sequences */\r
4866          FOR (h,com.ls)   \r
4867             if(wantgene[ig=com.pose[h]]) fprintf(fseq,"%c",com.z[j][h]);\r
4868       }\r
4869       else {               /* codon sequences */\r
4870          FOR (h,com.ls)\r
4871             if (wantgene[ig=com.pose[h]]) {\r
4872                FOR (i,3) fprintf(fseq,"%c", com.z[j][h*3+i]);\r
4873                fputc(' ', fseq);\r
4874             }\r
4875       }\r
4876    }\r
4877    FPN(fseq); \r
4878    FOR(ig,com.ngene) if(wantgene[ig]) fprintf(fseq," %s", genenames[ig]);\r
4879    FPN(fseq);\r
4880    fclose (fseq);\r
4881 \r
4882    exit(0);\r
4883 }\r
4884 \r
4885 #endif   /* ifdef REALSEQUENCE */\r
4886 #endif   /* ifdef TREESEARCH */\r
4887 #endif   /* ifdef NODESTRUCTURE */\r
4888 \r
4889 \r
4890 \r
4891 #ifdef PARSIMONY\r
4892 \r
4893 void UpPassScoreOnly (int inode);\r
4894 void UpPassScoreOnlyB (int inode);\r
4895 \r
4896 static int *Nsteps, *chUB;   /* MM */\r
4897 static char *Kspace, *chU, *NchU; \r
4898 /* Elements of chU are character states (there are NchU of them).  This \r
4899    representation is used to speed up calculation for large trees.\r
4900    Bit operations on chUB are performed for binary trees\r
4901 */\r
4902 \r
4903 void UpPassScoreOnly (int inode)\r
4904 {\r
4905 /* => VU, VL, & MM, theorem 2 */\r
4906    int ison, i, j;\r
4907    char *K=Kspace, maxK;  /* chMark (VV) not used in up pass */\r
4908 \r
4909    FOR (i,nodes[inode].nson)\r
4910       if (nodes[nodes[inode].sons[i]].nson>0)\r
4911           UpPassScoreOnly (nodes[inode].sons[i]);\r
4912 \r
4913    FOR (i,com.ncode) K[i]=0;\r
4914    FOR (i,nodes[inode].nson) \r
4915       for (j=0,ison=nodes[inode].sons[i]; j<NchU[ison]; j++)\r
4916          K[(int)chU[ison*com.ncode+j]]++;\r
4917    for (i=0,maxK=0; i<com.ncode; i++)  if (K[i]>maxK) maxK=K[i];\r
4918    for (i=0,NchU[inode]=0; i<com.ncode; i++)\r
4919       if (K[i]==maxK)  chU[inode*com.ncode+NchU[inode]++]=(char)i;\r
4920    Nsteps[inode]=nodes[inode].nson-maxK;\r
4921    FOR (i, nodes[inode].nson)  Nsteps[inode]+=Nsteps[nodes[inode].sons[i]];\r
4922 }\r
4923 \r
4924 void UpPassScoreOnlyB (int inode)\r
4925 {\r
4926 /* uses bit operation, for binary trees only \r
4927 */\r
4928    int ison1,ison2, i, change=0;\r
4929 \r
4930    FOR (i,nodes[inode].nson)\r
4931       if (nodes[nodes[inode].sons[i]].nson>0)\r
4932           UpPassScoreOnlyB (nodes[inode].sons[i]);\r
4933 \r
4934    ison1=nodes[inode].sons[0];  ison2=nodes[inode].sons[1];\r
4935    if ((chUB[inode]=(chUB[ison1] & chUB[ison2]))==0)\r
4936       { chUB[inode]=(chUB[ison1] | chUB[ison2]);  change=1; }\r
4937    Nsteps[inode]=change+Nsteps[ison1]+Nsteps[ison2];\r
4938 }\r
4939 \r
4940 \r
4941 double MPScore (double space[])\r
4942 {\r
4943 /* calculates MP score for a given tree using Hartigan's (1973) algorithm.\r
4944    sizeof(space) = nnode*sizeof(int)+(nnode+2)*ncode*sizeof(char).\r
4945    Uses Nsteps[nnode], chU[nnode*ncode], NchU[nnode].\r
4946    if(BitOperation), bit operations are used on binary trees.\r
4947 */\r
4948    int h,i, BitOperation,U[3],change;\r
4949    double score;\r
4950 \r
4951    Nsteps=(int*)space;\r
4952    BitOperation=(tree.nnode==2*com.ns-1 - (nodes[tree.root].nson==3));\r
4953    BitOperation=(BitOperation&&com.ncode<32);\r
4954    if (BitOperation)  chUB=Nsteps+tree.nnode;\r
4955    else {\r
4956       chU=(char*)(Nsteps+tree.nnode);\r
4957       NchU=chU+tree.nnode*com.ncode;  Kspace=NchU+tree.nnode;\r
4958    }\r
4959    for (h=0,score=0; h<com.npatt; h++) {\r
4960       FOR (i,tree.nnode) Nsteps[i]=0;\r
4961       if (BitOperation) { \r
4962          FOR (i,com.ns)  chUB[i]=1<<(com.z[i][h]);\r
4963          UpPassScoreOnlyB (tree.root);\r
4964          if (nodes[tree.root].nson>2) {\r
4965             FOR (i,3) U[i]=chUB[nodes[tree.root].sons[i]];\r
4966             change=2;\r
4967             if (U[0]&U[1]&U[2]) change=0;\r
4968             else if (U[0]&U[1] || U[1]&U[2] || U[0]&U[2]) change=1;\r
4969             for (i=0,Nsteps[tree.root]=change; i<3; i++) \r
4970                Nsteps[tree.root]+=Nsteps[nodes[tree.root].sons[i]];\r
4971        }\r
4972       }\r
4973       else {                   /* polytomies, use characters */\r
4974          FOR(i,com.ns)\r
4975             {chU[i*com.ncode]=(char)(com.z[i][h]); NchU[i]=(char)1; }\r
4976          for (i=com.ns; i<tree.nnode; i++)  NchU[i]=0;\r
4977          UpPassScoreOnly (tree.root);\r
4978       }\r
4979       score+=Nsteps[tree.root]*com.fpatt[h];\r
4980 /*\r
4981 printf("\nh %3d:    ", h+1);\r
4982 FOR(i,com.ns) printf("%2d  ", com.z[i][h]);\r
4983 printf(" %6d ", Nsteps[tree.root]);\r
4984 if((h+1)%10==0) exit(1);\r
4985 */\r
4986    }\r
4987 \r
4988    return (score);\r
4989 }\r
4990 \r
4991 double RemoveMPNinfSites (double *nsiteNinf)\r
4992 {\r
4993 /* Removes parsimony-noninformative sites and return the number of changes \r
4994    at those sites.\r
4995    Changes .z[], .fpatt[], .npatt, etc.\r
4996 */\r
4997    int  h,j, it, npatt0=com.npatt, markb[NCODE], gt2;\r
4998    double MPScoreNinf;\r
4999 \r
5000    for (h=0,com.npatt=0,MPScoreNinf=0,*nsiteNinf=0; h<npatt0; h++) {\r
5001       FOR (j, com.ncode) markb[j]=0;\r
5002       FOR (j, com.ns)  markb[(int)com.z[j][h]]++;\r
5003       for (j=0,it=gt2=0; j<com.ncode; j++)\r
5004          if (markb[j]>=2) { it++; gt2=1; }\r
5005       if (it<2) {                         /* non-informative */\r
5006        *nsiteNinf+=com.fpatt[h];\r
5007          FOR (j,com.ncode) if(markb[j]==1) MPScoreNinf+=com.fpatt[h];\r
5008          if (!gt2) MPScoreNinf-=com.fpatt[h];\r
5009       }\r
5010       else {\r
5011          FOR (j, com.ns) com.z[j][com.npatt]=com.z[j][h];\r
5012          com.fpatt[com.npatt++]=com.fpatt[h];\r
5013       }\r
5014    }\r
5015    return (MPScoreNinf);\r
5016 }\r
5017 \r
5018 #endif\r
5019 \r
5020 \r
5021 #ifdef RECONSTRUCTION\r
5022 \r
5023 static char *chMark, *chMarkU, *chMarkL; /* VV, VU, VL */\r
5024 /* chMark, chMarkU, chMarkL (VV, VU, VL) have elements 0 or 1, marking\r
5025    whether the character state is present in the set */\r
5026 static char *PATHWay, *NCharaCur, *ICharaCur, *CharaCur;\r
5027 /* PATHWay, NCharaCur, ICharaCur, CharaCur are for the current \r
5028    reconstruction.  \r
5029 */\r
5030 \r
5031 int UpPass (int inode);\r
5032 int DownPass (int inode);\r
5033 \r
5034 int UpPass (int inode)\r
5035 {\r
5036 /* => VU, VL, & MM, theorem 2 */\r
5037    int n=com.ncode, i, j;\r
5038    char *K=chMark, maxK;   /* chMark (VV) not used in up pass */\r
5039 \r
5040    FOR (i,nodes[inode].nson)\r
5041       if (nodes[nodes[inode].sons[i]].nson>0) UpPass (nodes[inode].sons[i]);\r
5042 \r
5043    FOR (i, n) K[i]=0;\r
5044    FOR (i,nodes[inode].nson) \r
5045       FOR (j, n)  if(chMarkU[nodes[inode].sons[i]*n+j]) K[j]++;\r
5046    for (i=0,maxK=0; i<n; i++)  if (K[i]>maxK) maxK=K[i];\r
5047    for (i=0; i<n; i++) {\r
5048       if (K[i]==maxK)         chMarkU[inode*n+i]=1; \r
5049       else if (K[i]==maxK-1)  chMarkL[inode*n+i]=1;\r
5050    }\r
5051    Nsteps[inode]=nodes[inode].nson-maxK;\r
5052    FOR (i, nodes[inode].nson)  Nsteps[inode]+=Nsteps[nodes[inode].sons[i]];\r
5053    return (0);\r
5054 }\r
5055 \r
5056 int DownPass (int inode)\r
5057 {\r
5058 /* VU, VL => VV, theorem 3 */\r
5059    int n=com.ncode, i, j, ison;\r
5060 \r
5061    FOR (i,nodes[inode].nson) {\r
5062       ison=nodes[inode].sons[i];\r
5063       FOR (j,n) if (chMark[inode*n+j]>chMarkU[ison*n+j]) break;\r
5064       if (j==n) \r
5065          FOR (j,n) chMark[ison*n+j]=chMark[inode*n+j];\r
5066       else \r
5067          FOR (j,n)\r
5068             chMark[ison*n+j] = \r
5069              (char)(chMarkU[ison*n+j]||(chMark[inode*n+j]&&chMarkL[ison*n+j]));\r
5070    }\r
5071    FOR (i,nodes[inode].nson)\r
5072       if (nodes[nodes[inode].sons[i]].nson>0) DownPass (nodes[inode].sons[i]);\r
5073    return (0);\r
5074 }\r
5075 \r
5076 \r
5077 int DownStates (int inode)\r
5078 {\r
5079 /* VU, VL => NCharaCur, CharaCur, theorem 4 */\r
5080    int i;\r
5081 \r
5082    FOR (i,nodes[inode].nson) \r
5083       if (nodes[inode].sons[i]>=com.ns) \r
5084          DownStatesOneNode (nodes[inode].sons[i], inode);\r
5085    return (0);\r
5086 }\r
5087 \r
5088 int DownStatesOneNode (int ison, int father)\r
5089 {\r
5090 /* States down inode, given father */\r
5091    char chi=PATHWay[father-com.ns];\r
5092    int n=com.ncode, j, in;\r
5093 \r
5094    if((in=ison-com.ns)<0) return (0);\r
5095    if (chMarkU[ison*n+chi]) {\r
5096       NCharaCur[in]=1;   CharaCur[in*n+0]=chi;\r
5097    }\r
5098    else if (chMarkL[ison*n+chi]) {\r
5099       for (j=0,NCharaCur[in]=0; j<n; j++) \r
5100          if (chMarkU[ison*n+j] || j==chi) CharaCur[in*n+NCharaCur[in]++]=(char)j;\r
5101    }\r
5102    else {\r
5103       for (j=0,NCharaCur[in]=0; j<n; j++) \r
5104          if (chMarkU[ison*n+j]) CharaCur[in*n+NCharaCur[in]++]=(char)j;\r
5105    }\r
5106    PATHWay[in]=CharaCur[in*n+(ICharaCur[in]=0)];\r
5107    FOR (j, nodes[ison].nson)  if (nodes[ison].sons[j]>=com.ns) break;\r
5108    if (j<nodes[ison].nson) DownStates (ison);\r
5109 \r
5110    return (0);\r
5111 }\r
5112 \r
5113 int InteriorStatesMP (int job, int h, int *nchange, char NChara[NS-1], \r
5114     char Chara[(NS-1)*NCODE], double space[]);\r
5115 \r
5116 int InteriorStatesMP (int job, int h, int *nchange, char NChara[NS-1], \r
5117     char Chara[(NS-1)*NCODE], double space[])\r
5118 {\r
5119 /* sizeof(space) = nnode*sizeof(int)+3*nnode*ncode*sizeof(char)\r
5120    job: 0=# of changes; 1:equivocal states\r
5121 */\r
5122    int n=com.ncode, i,j;\r
5123 \r
5124    Nsteps=(int*)space;            chMark=(char*)(Nsteps+tree.nnode);\r
5125    chMarkU=chMark+tree.nnode*n;   chMarkL=chMarkU+tree.nnode*n;\r
5126    FOR (i,tree.nnode) Nsteps[i]=0;\r
5127    FOR (i,3*n*tree.nnode) chMark[i]=0;\r
5128    FOR (i,com.ns)  chMark[i*n+com.z[i][h]]=chMarkU[i*n+com.z[i][h]]=1;\r
5129    UpPass (tree.root);\r
5130    *nchange=Nsteps[tree.root];\r
5131    if (job==0) return (0);\r
5132    FOR (i,n) chMark[tree.root*n+i]=chMarkU[tree.root*n+i];\r
5133    DownPass (tree.root);\r
5134    FOR (i,tree.nnode-com.ns) \r
5135       for (j=0,NChara[i]=0; j<n; j++) \r
5136          if (chMark[(i+com.ns)*n+j])  Chara[i*n+NChara[i]++]=(char)j;\r
5137    return (0);\r
5138 }\r
5139 \r
5140 \r
5141 int PathwayMP (FILE *fout, double space[])\r
5142 {\r
5143 /* Hartigan, JA.  1973.  Minimum mutation fits to a given tree. \r
5144    Biometrics, 29:53-65.\r
5145 */\r
5146    char *pch=(com.seqtype==0?BASEs:AAs), visit[NS-1];\r
5147    int n=com.ncode, nid=tree.nbranch-com.ns+1, it, i,j,k, h, npath;\r
5148    int nchange, nchange0;\r
5149    char nodeb[NNODE], Equivoc[NS-1];\r
5150 \r
5151    PATHWay=(char*)malloc(nid*(n+3)*sizeof(char));\r
5152    NCharaCur=PATHWay+nid;  ICharaCur=NCharaCur+nid;  CharaCur=ICharaCur+nid;\r
5153 \r
5154    for (j=0,visit[i=0]=(char)(tree.root-com.ns); j<tree.nbranch; j++) \r
5155      if (tree.branches[j][1]>=com.ns) \r
5156         visit[++i]=(char)(tree.branches[j][1]-com.ns);\r
5157 /*\r
5158    printf ("\nOrder in nodes: ");\r
5159    FOR (j, nid) printf ("%4d", visit[j]+1+com.ns); FPN(F0);\r
5160 */\r
5161    for (h=0; h<com.npatt; h++) {\r
5162       fprintf (fout, "\n%4d%6.0f  ", h+1, com.fpatt[h]);\r
5163       FOR (j, com.ns) fprintf (fout, "%c", pch[(int)com.z[j][h]]);\r
5164       fprintf (fout, ":  ");\r
5165 \r
5166       FOR (j,com.ns) nodeb[j]=(char)(com.z[j][h]);\r
5167 \r
5168       InteriorStatesMP (1, h, &nchange, NCharaCur, CharaCur, space); \r
5169       ICharaCur[j=tree.root-com.ns]=0;  PATHWay[j]=CharaCur[j*n+0];\r
5170       FOR (j,nid) Equivoc[j]=(char)(NCharaCur[j]>1);\r
5171       DownStates (tree.root);\r
5172 \r
5173       for (npath=0; ;) {\r
5174          for (j=0,k=visit[nid-1]; j<NCharaCur[k]; j++) {\r
5175             PATHWay[k]=CharaCur[k*n+j]; npath++; \r
5176             FOR (i, nid) fprintf (fout, "%c", pch[(int)PATHWay[i]]);\r
5177             fprintf (fout, "  ");\r
5178 \r
5179             FOR (i,nid) nodeb[i+com.ns]=PATHWay[i];\r
5180             for (i=0,nchange0=0; i<tree.nbranch; i++) \r
5181             nchange0+=(nodeb[tree.branches[i][0]]!=nodeb[tree.branches[i][1]]);\r
5182             if (nchange0!=nchange) \r
5183                { puts("\a\nerr:PathwayMP"); fprintf(fout,".%d. ", nchange0);}\r
5184 \r
5185          }\r
5186          for (j=nid-2; j>=0; j--) {\r
5187             if(Equivoc[k=visit[j]] == 0) continue;\r
5188             if (ICharaCur[k]+1<NCharaCur[k]) {\r
5189                PATHWay[k] = CharaCur[k*n + (++ICharaCur[k])];\r
5190                DownStates (k+com.ns);\r
5191                break;\r
5192             }\r
5193             else { /* if (next equivocal node is not ancestor) update node k */\r
5194                for (i=j-1; i>=0; i--) if (Equivoc[(int)visit[i]]) break;\r
5195                if (i>=0) { \r
5196                   for (it=k+com.ns,i=visit[i]+com.ns; ; it=nodes[it].father)\r
5197                      if (it==tree.root || nodes[it].father==i) break;\r
5198                   if (it==tree.root)\r
5199                      DownStatesOneNode(k+com.ns, nodes[k+com.ns].father);\r
5200                }\r
5201             }\r
5202          }\r
5203          if (j<0) break;\r
5204        }\r
5205        fprintf (fout, " |%4d (%d)", npath, nchange);\r
5206    }   /* for (h) */\r
5207    free (PATHWay);\r
5208    return (0);\r
5209 }\r
5210 \r
5211 #endif\r
5212 \r
5213 \r
5214 \r
5215 #if(BASEML || CODEML)\r
5216 \r
5217 \r
5218 int BootstrapSeq (char* seqf)\r
5219 {\r
5220 /* This is called from within ReadSeq(), right after the sequences are read \r
5221    and before the data are coded.\r
5222    jackknife if(lsb<com.ls && com.ngene==1).\r
5223    gmark[start+19] marks the position of the 19th site in that gene.\r
5224 */\r
5225    int iboot,nboot=com.bootstrap, h, is, ig, lg[NGENE]={0}, j, start;\r
5226    int lsb=com.ls, n31=1,gap=10, gpos[NGENE];\r
5227    int *sites=(int*)malloc(com.ls*sizeof(int)), *gmark=NULL;\r
5228    FILE *fseq=(FILE*)gfopen(seqf, "w");\r
5229    enum {PAML=0, PAUP};\r
5230    char *datatype = (com.seqtype==AAseq?"protein":"dna");\r
5231    char *paupstart="paupstart", *paupblock="paupblock", *paupend="paupend";\r
5232    int  format=0;  /* 0: paml-phylip; 1:paup-nexus */\r
5233 \r
5234    if(com.readpattern) error2("work on bootstrapping pattern data.");\r
5235 \r
5236    printf("\nGenerating bootstrap samples in file %s\n", seqf);\r
5237    if(format==PAUP) {\r
5238       printf("%s, %s, & %s will be appended if existent.\n",\r
5239          paupstart,paupblock,paupend);\r
5240       appendfile(fseq, paupstart);\r
5241    }\r
5242 \r
5243    if(com.seqtype==CODONseq||com.seqtype==CODON2AAseq) { n31=3; gap=1; }\r
5244    if(sites==NULL) error2("oom in BootstrapSeq");\r
5245    if(com.ngene>1) {\r
5246       if(lsb<com.ls) error2("jackknife when #gene>1");\r
5247       if((gmark=(int*)malloc(com.ls*sizeof(int)))==NULL) \r
5248          error2("oom in BootstrapSeq");\r
5249 \r
5250       for(ig=0; ig<com.ngene; ig++)  com.lgene[ig] = gpos[ig] = 0;\r
5251       for(h=0; h<com.ls; h++)  com.lgene[com.pose[h]]++;\r
5252       for(j=0; j<com.ngene; j++) lg[j] = com.lgene[j];\r
5253       for(j=1; j<com.ngene; j++) com.lgene[j] += com.lgene[j-1];\r
5254 \r
5255       if(noisy && com.ngene>1) {\r
5256          printf("Bootstrap uses stratefied sampling for %d partitions.", com.ngene);\r
5257          printf("\nnumber of sites in each partition: ");\r
5258          for(ig=0; ig<com.ngene; ig++) printf(" %4d", lg[ig]);\r
5259          FPN(F0);\r
5260       }\r
5261 \r
5262       for(h=0; h<com.ls; h++) {     /* create gmark[] */\r
5263          ig = com.pose[h];\r
5264          start = (ig==0 ? 0 : com.lgene[ig-1]);\r
5265          gmark[start + gpos[ig]++] = h;\r
5266       }\r
5267    }\r
5268 \r
5269    for (iboot=0; iboot<nboot; iboot++,FPN(fseq)) {\r
5270       if(com.ngene<=1)\r
5271          for(h=0; h<lsb; h++) sites[h] = (int)(rndu()*com.ls);\r
5272       else {\r
5273          for(ig=0; ig<com.ngene; ig++) {\r
5274             start = (ig==0 ? 0 : com.lgene[ig-1]);\r
5275             for(h=0; h<lg[ig]; h++)\r
5276                sites[start+h] = gmark[start+(int)(rndu()*lg[ig])];\r
5277          }\r
5278       }\r
5279 \r
5280       /* print out the bootstrap sample */\r
5281       if(format==PAUP) {\r
5282          fprintf(fseq,"\n\n[Replicate # %d]\n", iboot+1);\r
5283          fprintf(fseq,"\nbegin data;\n");\r
5284          fprintf(fseq,"   dimensions ntax=%d nchar=%d;\n", com.ns, lsb*n31);\r
5285          fprintf(fseq,"   format datatype=%s missing=? gap=-;\n   matrix\n", datatype);\r
5286 \r
5287          for(is=0;is<com.ns;is++,FPN(fseq)) {\r
5288             fprintf(fseq,"%-20s  ", com.spname[is]);\r
5289             for(h=0; h<lsb; h++) {\r
5290                for(j=0; j<n31; j++) fprintf(fseq,"%c", com.z[is][sites[h]*n31+j]);\r
5291                if((h+1)%gap==0) fprintf(fseq," ");\r
5292             }\r
5293          }\r
5294 \r
5295          fprintf(fseq, "   ;\nend;");\r
5296          /* site partitions */\r
5297          if(com.ngene>1) {\r
5298             fprintf(fseq, "\n\nbegin paup;\n");\r
5299             for(ig=0; ig<com.ngene; ig++)\r
5300                fprintf(fseq, "   charset partition%-2d = %-4d - %-4d;\n", \r
5301                   ig+1, (ig==0 ? 1 : com.lgene[ig-1]+1), com.lgene[ig]);\r
5302             fprintf(fseq, "end;\n");\r
5303          }\r
5304          appendfile(fseq, paupblock);\r
5305       }\r
5306       else {\r
5307          if(com.ngene==1) \r
5308             fprintf(fseq,"%6d %6d\n", com.ns, lsb*n31);\r
5309          else {\r
5310             fprintf(fseq,"%6d %6d  G\nG %d  ", com.ns, lsb*n31, com.ngene);\r
5311             for(ig=0; ig<com.ngene; ig++)\r
5312                fprintf(fseq," %4d", lg[ig]);\r
5313             fprintf(fseq,"\n\n");\r
5314          }\r
5315          for(is=0; is<com.ns; is++,FPN(fseq)) {\r
5316             fprintf(fseq,"%-20s  ", com.spname[is]);\r
5317             for(h=0; h<lsb; h++) {\r
5318                for(j=0; j<n31; j++)\r
5319                   fprintf(fseq,"%c", com.z[is][sites[h]*n31+j]);\r
5320                if((h+1)%gap==0) fprintf(fseq," ");\r
5321             }\r
5322          }\r
5323       }\r
5324 \r
5325       if(noisy && (iboot+1)%10==0) printf("\rdid sample #%d", iboot+1);\r
5326    }  /* for(iboot) */\r
5327    free(sites);  if(com.ngene>1) free(gmark);\r
5328    fclose(fseq);\r
5329    return(0);\r
5330 }\r
5331 \r
5332 \r
5333 \r
5334 int rell (FILE*flnf, FILE*fout, int ntree)\r
5335 {\r
5336 /* This implements three methods for tree topology comparison.  The first \r
5337    tests the log likelihood difference using a normal approximation \r
5338    (Kishino and Hasegawa 1989).  The second does approximate bootstrap sampling\r
5339    (the RELL method, Kishino and Hasegawa 1989, 1993).  The third is a \r
5340    modification of the K-H test with a correction for multiple comparison \r
5341    (Shimodaira and Hasegawa 1999) .\r
5342    The routine reads input from the file lnf.\r
5343 \r
5344    fpattB[npatt] stores the counts of site patterns in the bootstrap sample, \r
5345    with sitelist[ls] listing sites by gene, for stratefied sampling. \r
5346   \r
5347    com.space[ntree*(npatt+nr+5)]: \r
5348    lnf[ntree*npatt] lnL0[ntree] lnL[ntree*nr] pRELL[ntree] pSH[ntree] vdl[ntree]\r
5349    btrees[ntree]\r
5350 */\r
5351    char *line, timestr[64];\r
5352    int nr=(com.ls<100000?10000:(com.ls<10000?5000:500));\r
5353    int lline=16000, ntree0,ns0=com.ns, ls0,npatt0;\r
5354    int itree, h,ir,j,k, ig, mltree, nbtree, *btrees, status=0;\r
5355    int *sitelist, *fpattB, *lgeneB, *psitelist;\r
5356    double *lnf, *lnL0, *lnL, *pRELL, *lnLmSH, *pSH, *vdl, y, mdl, small=1e-5;\r
5357    size_t s;\r
5358 \r
5359    fflush(fout);\r
5360    puts( "\nTree comparisons (Kishino & Hasegawa 1989; Shimodaira & Hasegawa 1999)");\r
5361    fputs("\nTree comparisons (Kishino & Hasegawa 1989; Shimodaira & Hasegawa 1999)\n",fout);\r
5362    fprintf(fout,"Number of replicates: %d\n", nr);\r
5363 \r
5364    fscanf(flnf,"%d%d%d", &ntree0, &ls0, & npatt0);\r
5365    if(ntree0!=-1 && ntree0!=ntree)  error2("rell: input data file strange.  Check.");\r
5366    if (ls0!=com.ls || npatt0!=com.npatt)\r
5367       error2("rell: input data file incorrect.");\r
5368    s = ntree*(com.npatt+nr+5)*sizeof(double);\r
5369    if(com.sspace < s) {\r
5370       if(noisy) printf("resetting space to %lu bytes in rell.\n",s);\r
5371       com.sspace = s;\r
5372       if((com.space=(double*)realloc(com.space,com.sspace))==NULL)\r
5373          error2("oom space");\r
5374    }\r
5375    lnf=com.space; lnL0=lnf+ntree*com.npatt; lnL=lnL0+ntree; pRELL=lnL+ntree*nr;\r
5376    pSH=pRELL+ntree; vdl=pSH+ntree; btrees=(int*)(vdl+ntree);\r
5377    fpattB=(int*)malloc((com.npatt+com.ls+com.ngene)*sizeof(int));\r
5378    if(fpattB==NULL) error2("oom fpattB in rell.");\r
5379    sitelist=fpattB+com.npatt;  lgeneB=sitelist+com.ls;\r
5380 \r
5381    lline = (com.seqtype==1 ? ns0*8 : ns0) + 100;\r
5382    lline = max2(16000, lline);\r
5383    if((line=(char*)malloc((lline+1)*sizeof(char)))==NULL) error2("oom rell");\r
5384 \r
5385    /* read lnf from file flnf, calculates lnL0[] & find ML tree */\r
5386    for(itree=0,mltree=0; itree<ntree; itree++) {\r
5387       printf("\r\tReading lnf for tree # %d", itree+1);\r
5388       fscanf(flnf, "%d", &j);\r
5389       if(j != itree+1) \r
5390          { printf("\nerr: lnf, reading tree %d.",itree+1); return(-1); }\r
5391       for(h=0,lnL0[itree]=0; h<com.npatt; h++) {\r
5392          fscanf (flnf, "%d%d%lf", &j, &k, &y);\r
5393          if(j!=h+1)\r
5394             { printf("\nlnf, patt %d.",h+1); return(-1); }\r
5395          fgets(line,lline,flnf);\r
5396          lnL0[itree]+=com.fpatt[h]*(lnf[itree*com.npatt+h]=y);\r
5397       }\r
5398       if(itree && lnL0[itree]>lnL0[mltree]) mltree=itree;\r
5399    }\r
5400    printf(", done.\n");\r
5401    free(line);\r
5402 \r
5403    /* calculates SEs (vdl) by sitewise comparison */\r
5404 \r
5405    printtime(timestr);\r
5406    printf("\r\tCalculating SEs by sitewise comparison");\r
5407    FOR(itree,ntree) {\r
5408       if(itree==mltree) { vdl[itree]=0; continue; }\r
5409       mdl=(lnL0[itree]-lnL0[mltree])/com.ls;\r
5410       for(h=0,vdl[itree]=0; h<com.npatt; h++) {\r
5411          y=lnf[itree*com.npatt+h]-lnf[mltree*com.npatt+h];\r
5412          vdl[itree]+=com.fpatt[h]*(y-mdl)*(y-mdl);\r
5413       }\r
5414       vdl[itree]=sqrt(vdl[itree]);\r
5415    }\r
5416    printf(", %s\n", printtime(timestr));\r
5417 \r
5418    /* bootstrap resampling */\r
5419    for(ig=0; ig<com.ngene; ig++)\r
5420       lgeneB[ig]=(ig?com.lgene[ig]-com.lgene[ig-1]:com.lgene[ig]);\r
5421    for(h=0,k=0;h<com.npatt;h++) \r
5422       FOR(j,(int)com.fpatt[h]) sitelist[k++]=h;\r
5423 \r
5424    zero(pRELL,ntree); zero(pSH,ntree); zero(lnL,ntree*nr);\r
5425    for(ir=0; ir<nr; ir++) {\r
5426       for(h=0; h<com.npatt; h++) fpattB[h]=0;\r
5427       for(ig=0,psitelist=sitelist; ig<com.ngene; psitelist+=lgeneB[ig++]) {\r
5428          for(k=0; k<lgeneB[ig]; k++) {\r
5429             j=(int)(lgeneB[ig]*rndu());\r
5430             h=psitelist[j];\r
5431             fpattB[h]++;\r
5432          }\r
5433       }\r
5434       for(h=0; h<com.npatt; h++) {\r
5435          if(fpattB[h])\r
5436             for(itree=0; itree<ntree; itree++) \r
5437                lnL[itree*nr+ir] += fpattB[h]*lnf[itree*com.npatt+h];\r
5438       }\r
5439       \r
5440       /* y is the lnL for the best tree from replicate ir. */\r
5441       for(j=1,nbtree=1,btrees[0]=0,y=lnL[ir]; j<ntree; j++) {\r
5442          if(fabs(lnL[j*nr+ir]-y)<small) \r
5443             btrees[nbtree++]=j;\r
5444          else if (lnL[j*nr+ir]>y)\r
5445             { nbtree=1; btrees[0]=j; y=lnL[j*nr+ir]; }\r
5446       }\r
5447 \r
5448       for(j=0; j<nbtree; j++) \r
5449          pRELL[btrees[j]]+=1./(nr*nbtree);\r
5450       if(nr>100 && (ir+1)%(nr/100)==0) \r
5451          printf("\r\tRELL Bootstrapping.. replicate: %6d / %d %s",ir+1,nr, printtime(timestr));\r
5452 \r
5453    }\r
5454    free(fpattB);\r
5455 \r
5456    if(fabs(1-sum(pRELL,ntree))>1e-6) error2("sum pRELL != 1.");\r
5457 \r
5458    /* Shimodaira & Hasegawa correction (1999), working on lnL[ntree*nr] */\r
5459    printf("\nnow doing S-H test");\r
5460    if((lnLmSH=(double*)malloc(nr*sizeof(double))) == NULL) error2("oom in rell");\r
5461    for(j=0; j<ntree; j++)  /* step 3: centering */\r
5462       for(ir=0,y=sum(lnL+j*nr,nr)/nr; ir<nr; ir++) lnL[j*nr+ir] -= y;\r
5463    for(ir=0; ir<nr; ir++) {\r
5464       for(j=1,lnLmSH[ir]=lnL[ir]; j<ntree; j++) \r
5465          if(lnL[j*nr+ir]>lnLmSH[ir]) lnLmSH[ir] = lnL[j*nr+ir];\r
5466    }\r
5467    for(itree=0; itree<ntree; itree++) {  /* steps 4 & 5 */\r
5468       for(ir=0; ir<nr; ir++)\r
5469          if(lnLmSH[ir]-lnL[itree*nr+ir] > lnL0[mltree]-lnL0[itree]) \r
5470             pSH[itree] += 1./nr;\r
5471    }\r
5472 \r
5473    fprintf(fout,"\n%6s %12s %9s %9s%8s%10s%9s\n\n",\r
5474       "tree","li","Dli"," +- SE","pKH","pSH","pRELL");\r
5475    FOR(j,ntree) {\r
5476       mdl=lnL0[j]-lnL0[mltree]; \r
5477       if(j==mltree || fabs(vdl[j])<1e-6) { y=-1; pSH[j]=-1; status=-1; }\r
5478       else y=1-CDFNormal(-mdl/vdl[j]);\r
5479       fprintf(fout,"%6d%c%12.3f %9.3f %9.3f%8.3f%10.3f%9.3f\n",\r
5480            j+1,(j==mltree?'*':' '),lnL0[j],mdl,vdl[j],y,pSH[j],pRELL[j]);\r
5481    }\r
5482 \r
5483 fprintf(frst1,"%3d %12.6f",mltree+1, lnL0[mltree]);\r
5484 for(j=0;j<ntree;j++) fprintf(frst1," %5.3f",pRELL[j]);\r
5485 /*\r
5486 for(j=0;j<ntree;j++) if(j!=mltree) fprintf(frst1,"%9.6f",pSH[j]);\r
5487 */\r
5488 \r
5489    fputs("\npKH: P value for KH normal test (Kishino & Hasegawa 1989)\n",fout);\r
5490    fputs("pRELL: RELL bootstrap proportions (Kishino & Hasegawa 1989)\n",fout);\r
5491    fputs("pSH: P value with multiple-comparison correction (MC in table 1 of Shimodaira & Hasegawa 1999)\n",fout);\r
5492    if(status) fputs("(-1 for P values means N/A)\n",fout);\r
5493 \r
5494    FPN(F0);\r
5495    free(lnLmSH);\r
5496    return(0);\r
5497 }\r
5498 \r
5499 #endif\r
5500 \r
5501 \r
5502 \r
5503 \r
5504 #ifdef LFUNCTIONS\r
5505 #ifdef RECONSTRUCTION\r
5506 \r
5507 \r
5508 void ListAncestSeq(FILE *fout, char *zanc);\r
5509 \r
5510 void ListAncestSeq(FILE *fout, char *zanc)\r
5511 {\r
5512 /* zanc[nintern*com.npatt] holds ancestral sequences.\r
5513    Extant sequences are coded if cleandata.\r
5514 */\r
5515    int wname=15, j,h, n31=(com.seqtype==CODONseq||com.seqtype==CODON2AAseq?3:1);\r
5516    int lst=(com.readpattern?com.npatt:com.ls);\r
5517 \r
5518    fputs("\n\n\nList of extant and reconstructed sequences\n\n",fout);\r
5519    if(!com.readpattern) fprintf(fout, "%6d %6d\n\n", tree.nnode, lst*n31);\r
5520    else                 fprintf(fout, "%6d %6d  P\n\n", tree.nnode, lst*n31);\r
5521    for(j=0;j<com.ns;j++,FPN(fout)) {\r
5522       fprintf(fout,"%-*s   ", wname,com.spname[j]);\r
5523       print1seq(fout, com.z[j], lst, com.pose);\r
5524    }\r
5525    for(j=0;j<tree.nnode-com.ns;j++,FPN(fout)) {\r
5526       fprintf(fout,"node #%-*d  ", wname-5,com.ns+j+1);\r
5527       print1seq(fout, zanc+j*com.npatt, lst, com.pose);\r
5528    }\r
5529    if(com.readpattern) {\r
5530       for(h=0,FPN(fout); h<com.npatt; h++) {\r
5531          fprintf(fout," %4.0f", com.fpatt[h]);\r
5532          if((h+1)%15==0) FPN(fout);\r
5533       }\r
5534       fprintf(fout,"\n\n");\r
5535    }\r
5536 }\r
5537 \r
5538 int ProbSitePattern(double x[], double *lnL, double fhsiteAnc[], double ScaleC[]);\r
5539 int AncestralMarginal(FILE *fout, double x[], double fhsiteAnc[], double Sir[]);\r
5540 int AncestralJointPPSG2000(FILE *fout, double x[]);\r
5541 \r
5542 \r
5543 int ProbSitePattern (double x[], double *lnL, double fhsiteAnc[], double ScaleC[])\r
5544 {\r
5545 /* This calculates probabilities for observing site patterns fhsite[].  \r
5546    The following notes are for ncatG>1 and method = 0.  \r
5547    The routine calculates the scale factor common to all site classes (ir), \r
5548    that is, the greatest of the scale factors among the ir classes.  \r
5549    The common scale factors will be used in scaling nodes[].conP for all site \r
5550    classes for all nodes in PostProbNode().  Small conP for some site classes \r
5551    will be essentially set to 0, which is fine.\r
5552 \r
5553    fhsite[npatt]\r
5554    ScaleSite[npatt]\r
5555 \r
5556    Ziheng Yang, 7 Sept, 2001\r
5557 */\r
5558    int ig, i,k,h, ir;\r
5559    double fh, S, y=1;\r
5560 \r
5561    if(com.ncatG>1 && com.method==1) error2("don't need this?");\r
5562    if (SetParameters(x)) puts ("par err.");\r
5563    for(h=0; h<com.npatt; h++)\r
5564       fhsiteAnc[h] = 0;\r
5565    if (com.ncatG<=1) {\r
5566       for (ig=0,*lnL=0; ig<com.ngene; ig++) {\r
5567          if(com.Mgene>1) SetPGene(ig, 1, 1, 0, x);\r
5568          ConditionalPNode (tree.root, ig, x);\r
5569          for (h=com.posG[ig]; h<com.posG[ig+1]; h++) {\r
5570             for (i=0; i<com.ncode; i++) \r
5571                fhsiteAnc[h] += com.pi[i]*nodes[tree.root].conP[h*com.ncode+i];\r
5572             *lnL -= log(fhsiteAnc[h])*com.fpatt[h];\r
5573             if(com.NnodeScale) \r
5574                for(k=0; k<com.NnodeScale; k++) \r
5575                *lnL -= com.nodeScaleF[k*com.npatt+h]*com.fpatt[h];\r
5576          }\r
5577       }\r
5578    }\r
5579    else {\r
5580       for (ig=0; ig<com.ngene; ig++) {\r
5581          if(com.Mgene>1 || com.nalpha>1)\r
5582             SetPGene(ig, com.Mgene>1, com.Mgene>1, com.nalpha>1, x);\r
5583          for (ir=0; ir<com.ncatG; ir++) {\r
5584 #ifdef CODEML\r
5585             if(com.seqtype==1 && com.NSsites /* && com.model */) IClass=ir;\r
5586 #endif\r
5587             SetPSiteClass(ir, x);\r
5588             ConditionalPNode (tree.root, ig, x);\r
5589 \r
5590             for (h=com.posG[ig]; h<com.posG[ig+1]; h++) {\r
5591                for (i=0,fh=0; i<com.ncode; i++)\r
5592                   fh += com.pi[i]*nodes[tree.root].conP[h*com.ncode+i];\r
5593    \r
5594                if(com.NnodeScale) {\r
5595                   for(k=0,S=0; k<com.NnodeScale; k++)  S += com.nodeScaleF[k*com.npatt+h];\r
5596                   y=1;\r
5597                   if(ir==0)               ScaleC[h]=S;\r
5598                   else if(S<=ScaleC[h])   y=exp(S-ScaleC[h]);\r
5599                   else      /* change of scale factor */\r
5600                      { fhsiteAnc[h] *= exp(ScaleC[h]-S);  ScaleC[h]=S; }\r
5601                }\r
5602                fhsiteAnc[h] += com.freqK[ir]*fh*y;\r
5603             }\r
5604          }\r
5605       }\r
5606       for(h=0, *lnL=0; h<com.npatt; h++)\r
5607          *lnL -= log(fhsiteAnc[h])*com.fpatt[h];\r
5608       if(com.NnodeScale) \r
5609          for(h=0; h<com.npatt; h++)\r
5610             *lnL -= ScaleC[h]*com.fpatt[h];\r
5611    }\r
5612    if(noisy) printf("\nlnL = %12.6f from ProbSitePattern.\n", - *lnL);\r
5613 \r
5614    return (0);\r
5615 }\r
5616 \r
5617 \r
5618 int updateconP(double x[], int inode);\r
5619 \r
5620 int PostProbNode (int inode, double x[], double fhsiteAnc[], double ScaleC[],\r
5621     double *lnL, double pChar1node[], char za[], double pnode[])\r
5622 {\r
5623 /* This calculates the full posterior distribution for node inode at each site.\r
5624    Below are special comments on gamma models and method = 0.\r
5625 \r
5626    Marginal reconstruction under gamma models, with complications arising from \r
5627    scaling on large trees (com.NnodeScale) and the use of two iteration algorithms \r
5628    (method).\r
5629    Z. Yang Sept 2001\r
5630    \r
5631    The algorithm is different depending on method, which makes the code clumsy.\r
5632 \r
5633    gamma method=0 or 2 (simultaneous updating):\r
5634       nodes[].conP overlap and get destroyed for different site classes (ir)\r
5635       The same for scale factors com.nodeScaleF. \r
5636       fhsite[npatt] and common scale factors ScaleC[npatt] are calculated for all \r
5637       nodes before this routine is called.  The common scale factors are then \r
5638       used to adjust nodes[].conP before they are summed across ir classes.\r
5639 \r
5640    gamma method=1 (one branch at a time):\r
5641       nodes[].conP (and com.nodeScaleF if node scaling is on) are separately \r
5642       allocated for different site classes (ir), so that all info needed is\r
5643       available.  Use of updateconP() saves computation on large trees.\r
5644       Scale factor Sir[] is of size ncatG and reused for each h.\r
5645 */\r
5646    int n=com.ncode, i,k,h, ir,it=-1,best, ig;\r
5647    double fh, y,pbest, *Sir=ScaleC, S;\r
5648 \r
5649    *lnL=0;\r
5650    zero(pChar1node,com.npatt*n);\r
5651 \r
5652    /* nodes[].conP are reused for different ir, with or without node scaling */\r
5653    if (com.ncatG>1 && com.method!=1) {\r
5654       ReRootTree(inode);\r
5655       for (ig=0; ig<com.ngene; ig++) {\r
5656          if(com.Mgene>1 || com.nalpha>1)\r
5657             SetPGene(ig,com.Mgene>1,com.Mgene>1,com.nalpha>1,x);\r
5658          for (ir=0; ir<com.ncatG; ir++) {\r
5659 #ifdef CODEML\r
5660             if(com.seqtype==1 && com.NSsites)  IClass=ir;\r
5661 #endif\r
5662             SetPSiteClass(ir, x);\r
5663             ConditionalPNode (tree.root, ig, x);\r
5664 \r
5665             for (h=com.posG[ig]; h<com.posG[ig+1]; h++) {\r
5666                if(!com.NnodeScale) S=1;\r
5667                else {\r
5668                   for(k=0,S=0; k<com.NnodeScale; k++) \r
5669                      S += com.nodeScaleF[k*com.npatt+h];\r
5670                   S=exp(S-ScaleC[h]);\r
5671                }\r
5672                for (i=0,fh=0; i<n; i++) {\r
5673                   y = com.freqK[ir]*com.pi[i]*nodes[tree.root].conP[h*n+i] * S;\r
5674                   fh += y;\r
5675                   pChar1node[h*n+i] += y ;\r
5676                }\r
5677             }\r
5678          }\r
5679       }\r
5680       for (h=0; h<com.npatt; h++) {\r
5681          for(i=0,y=0;i<n;i++) y += (pChar1node[h*n+i]/=fhsiteAnc[h]);\r
5682          if (fabs(1-y)>1e-5) \r
5683             error2("PostProbNode: sum!=1");\r
5684          for (i=0,best=-1,pbest=-1; i<n; i++)\r
5685             if (pChar1node[h*n+i]>pbest) {\r
5686                best=i;\r
5687                pbest=pChar1node[h*n+i]; \r
5688             }\r
5689          za[(inode-com.ns)*com.npatt+h] = (char)best;\r
5690          pnode[(inode-com.ns)*com.npatt+h] = pbest;\r
5691          *lnL -= log(fhsiteAnc[h])*com.fpatt[h];\r
5692          if(com.NnodeScale) *lnL -= ScaleC[h]*com.fpatt[h];\r
5693       }\r
5694    }\r
5695    else {  /* all other cases: (alpha==0 || method==1) */\r
5696       for(i=0; i<tree.nnode; i++) com.oldconP[i] = 1;\r
5697       ReRootTree(inode);\r
5698       updateconP(x,inode);\r
5699       if (com.alpha==0 && com.ncatG<=1) { /* (alpha==0) (ngene>1 OK) */\r
5700          for (ig=0; ig<com.ngene; ig++) {\r
5701             if(com.Mgene==2 || com.Mgene==4)\r
5702                xtoy(com.piG[ig], com.pi, n);\r
5703             for (h=com.posG[ig]; h<com.posG[ig+1]; h++) {\r
5704                for (i=0,fh=0,pbest=0,best=-1; i<n; i++) {\r
5705                   y = com.pi[i]*nodes[tree.root].conP[h*n+i];\r
5706                   fh +=  y;\r
5707                   if (y>pbest)\r
5708                      { pbest=y; best=i; }\r
5709                   pChar1node[h*n+i] = y;\r
5710                }\r
5711                za[(inode-com.ns)*com.npatt+h] = (char)best;\r
5712                pnode[(inode-com.ns)*com.npatt+h] = (pbest/=fh);\r
5713                for (i=0; i<n; i++)\r
5714                   pChar1node[h*n+i] /= fh;\r
5715                *lnL -= log(fh)*(double)com.fpatt[h];\r
5716                for(i=0; i<com.NnodeScale; i++)\r
5717                   *lnL -= com.nodeScaleF[i*com.npatt+h]*com.fpatt[h];\r
5718             }\r
5719          }\r
5720       }\r
5721       else {  /* (ncatG>1 && method = 1)  This should work for NSsites? */\r
5722          for (ig=0; ig<com.ngene; ig++) {\r
5723             if(com.Mgene==2 || com.Mgene==4)\r
5724                xtoy(com.piG[ig], com.pi, n);\r
5725             for (h=com.posG[ig]; h<com.posG[ig+1]; h++) {\r
5726                if(com.NnodeScale)\r
5727                   for(ir=0,it=0; ir<com.ncatG; ir++) {  /* Sir[it] is the biggest */\r
5728                      for(k=0,Sir[ir]=0; k<com.NnodeScale; k++)\r
5729                         Sir[ir] += com.nodeScaleF[ir*com.NnodeScale*com.npatt + k*com.npatt+h];\r
5730                      if(Sir[ir]>Sir[it]) it = ir;\r
5731                   }\r
5732                for (i=0,fh=0; i<n; i++)  {\r
5733                   for(ir=0; ir<com.ncatG; ir++) {\r
5734                      if(com.method==1)\r
5735                         y = nodes[tree.root].conP[ir*(tree.nnode-com.ns)*com.npatt*n+h*n+i];\r
5736                      else\r
5737                         y = nodes[tree.root].conP[h*n+i]; /* wrong right now */\r
5738                      y *= com.pi[i]*com.freqK[ir];\r
5739                      if(com.NnodeScale) y *= exp(Sir[ir]-Sir[it]);\r
5740    \r
5741                      pChar1node[h*n+i] += y;\r
5742                      fh += y;\r
5743                   }\r
5744                }\r
5745                for (i=0,best=0; i<n; i++)  {\r
5746                   pChar1node[h*n+i] /= fh;\r
5747                   if(i && pChar1node[h*n+best]<pChar1node[h*n+i])\r
5748                      best = i;\r
5749                }\r
5750                za[(inode-com.ns)*com.npatt+h] = (char)best;\r
5751                pnode[(inode-com.ns)*com.npatt+h] = pChar1node[h*n+best];\r
5752                *lnL -= log(fh)*com.fpatt[h];\r
5753                if(com.NnodeScale) *lnL -= Sir[it]*com.fpatt[h];\r
5754             }\r
5755          }\r
5756       }\r
5757    }\r
5758    return(0);\r
5759 }\r
5760 \r
5761 \r
5762 void getCodonNode1Site(char codon[], char zanc[], int inode, int site);\r
5763 \r
5764 int AncestralMarginal (FILE *fout, double x[], double fhsiteAnc[], double Sir[])\r
5765 {\r
5766 /* Ancestral reconstruction for each interior node.  This works under both \r
5767    the one rate and gamma rates models.\r
5768    pnode[npatt*nid] stores the prob for the best chara at a node and site.\r
5769    The best character is kept in za[], coded as 0,...,n-1.\r
5770    The data may be coded (com.cleandata==1) or not (com.cleandata==0).\r
5771    Call ProbSitePatt() before running this routine.\r
5772    pMAPnode[NS-1], pMAPnodeA[] stores the MAP probabilities (accuracy)\r
5773    for a site and for the entire sequence, respectively.\r
5774  \r
5775    The routine PostProbNode calculates pChar1node[npatt*ncode], which stores \r
5776    prob for each char at each pattern at each given node inode.  The rest of \r
5777    the routine is to output the results in different ways.\r
5778 \r
5779    Deals with node scaling to avoid underflows.  See above \r
5780    (Z. Yang, 2 Sept 2001)\r
5781 */\r
5782    char *pch=(com.seqtype==0 ? BASEs : (com.seqtype==2 ? AAs: (com.seqtype==5?BASEs5:BINs)));\r
5783    char *zanc, str[4]="",codon[2][4]={"   ","   "}, aa[4]="";\r
5784    char *sitepatt=(com.readpattern?"pattern":"site");\r
5785    int n=com.ncode, inode, ic=0,b[3],i,j,k1=-1,k2=-1,c1,c2,k3, lsc=com.ls;\r
5786    int lst=(com.readpattern?com.npatt:com.ls);\r
5787    int h,hp,ig, best, oldroot=tree.root;\r
5788    int nid=tree.nnode-com.ns, nchange;\r
5789    double lnL=0, fh, y, pbest, *pChar1node, *pnode, p1=-1,p2=-1;\r
5790    double pMAPnode[NS-1], pMAPnodeA[NS-1], smallp=0.001;\r
5791 \r
5792    char coding=0, *bestAA=NULL;\r
5793    double pAA[21], *pbestAA=NULL, ns,na, nst,nat,S,N;\r
5794     /* bestAA[nid*npatt], pbestAA[nid*npatt]: \r
5795        To reconstruct aa seqs using codon or nucleotide seqs, universal code */\r
5796 \r
5797    if(noisy) puts("Marginal reconstruction.");\r
5798 \r
5799    fprintf (fout,"\n(1) Marginal reconstruction of ancestral sequences\n");\r
5800    fprintf (fout,"(eqn. 4 in Yang et al. 1995 Genetics 141:1641-1650).\n");\r
5801    pChar1node = (double*)malloc(com.npatt*n*sizeof(double));\r
5802    pnode = (double*)malloc((nid*com.npatt+1)*(sizeof(double)+sizeof(char)));\r
5803    if (pnode==NULL||pChar1node==NULL) \r
5804       error2("oom pnode");\r
5805    zanc = (char*)(pnode+nid*com.npatt);\r
5806 \r
5807 #ifdef BASEML\r
5808    if(com.seqtype==0 && com.ls%3==0 && com.coding) { coding=1; lsc=com.ls/3; }\r
5809 #endif\r
5810    if(com.seqtype==1) { coding=1; lsc=com.npatt; }\r
5811    if(coding==1) {\r
5812       if((pbestAA=(double*)malloc(nid*lsc*2*sizeof(double)))==NULL) \r
5813          error2("oom pbestAA");\r
5814       bestAA = (char*)(pbestAA+nid*lsc);\r
5815    }\r
5816 \r
5817    if(SetParameters(x)) puts("par err."); \r
5818 \r
5819    if(com.verbose>1) \r
5820       fprintf(fout,"\nProb distribs at nodes, those with p < %.3f not listed\n", smallp);\r
5821 \r
5822    /* This loop reroots the tree at inode & reconstructs sequence at inode */\r
5823    for (inode=com.ns; inode<tree.nnode; inode++) {\r
5824 \r
5825       PostProbNode (inode, x, fhsiteAnc, Sir, &lnL, pChar1node, zanc, pnode);\r
5826       if(noisy) printf ("\tNode %3d: lnL = %12.6f\n", inode+1, -lnL);\r
5827 \r
5828       /* print Prob distribution at inode if com.verbose>1 */\r
5829       if (com.verbose>1) {\r
5830          fprintf(fout,"\nProb distribution at node %d, by %s\n", inode+1, sitepatt);\r
5831          fprintf(fout,"\n%7s  Freq   Data\n\n", sitepatt);\r
5832          for(h=0;h<lst;h++,FPN(fout)) {\r
5833             hp = (!com.readpattern ? com.pose[h] : h);\r
5834             fprintf (fout,"%7d%7.0f   ", h+1, com.fpatt[hp]);\r
5835             print1site(fout, hp);\r
5836             fputs(": ", fout);\r
5837             for(j=0; j<n; j++) {\r
5838                if (com.seqtype!=CODONseq) { \r
5839                   str[0] = pch[j];\r
5840                   str[1] = 0;\r
5841                }\r
5842                else\r
5843                   strcpy(str, CODONs[j]);\r
5844                fprintf(fout,"%s(%5.3f) ", str, pChar1node[hp*n+j]);\r
5845             }\r
5846          }\r
5847       }     /* if (verbose) */\r
5848 \r
5849 \r
5850       /* find the best amino acid for coding seqs */\r
5851 #ifdef CODEML\r
5852       if(com.seqtype==CODONseq)\r
5853          for(h=0; h<com.npatt; h++) {\r
5854             for(j=0; j<20; j++) pAA[j]=0; \r
5855             for(j=0; j<n; j++) {\r
5856                i = GeneticCode[com.icode][FROM61[j]];\r
5857                pAA[i] += pChar1node[h*n+j];\r
5858             }\r
5859             /* matout(F0,pAA,1,20); */\r
5860             for(j=0,best=0,pbest=0; j<20; j++) \r
5861                if(pAA[j]>pbest) { pbest=pAA[j]; best=j; }\r
5862             bestAA[(inode-com.ns)*com.npatt+h] = (char)best;\r
5863             pbestAA[(inode-com.ns)*com.npatt+h] = pbest;\r
5864          }\r
5865 #endif\r
5866       if(com.seqtype==0 && coding) { /* coding seqs analyzed by baseml */\r
5867          for(h=0; h<lsc; h++) {  /* h-th codon */\r
5868             /* sums up probs for the 20 AAs for each node. Stop codons are \r
5869                ignored, and so those probs are approxiamte. */\r
5870             for(j=0,y=0; j<20; j++) pAA[j]=0;\r
5871             for(k1=0; k1<4; k1++) for(k2=0; k2<4; k2++) for(k3=0; k3<4; k3++) {\r
5872                ic = k1*16+k2*4+k3;\r
5873                b[0] = com.pose[h*3+0]*n+k1; \r
5874                b[1] = com.pose[h*3+1]*n+k2; \r
5875                b[2] = com.pose[h*3+2]*n+k3;\r
5876                fh = pChar1node[b[0]]*pChar1node[b[1]]*pChar1node[b[2]];\r
5877                if((ic=GeneticCode[com.icode][ic])==-1) \r
5878                   y += fh;\r
5879                else\r
5880                   pAA[ic] += fh;\r
5881             }\r
5882             if(fabs(1-y-sum(pAA,20))>1e-6) error2("AncestralMarginal strange?");\r
5883 \r
5884             for(j=0,best=0,pbest=0; j<20; j++) \r
5885                if(pAA[j]>pbest) { pbest=pAA[j]; best=j; }\r
5886 \r
5887             bestAA[(inode-com.ns)*com.ls/3+h] = (char)best;\r
5888             pbestAA[(inode-com.ns)*com.ls/3+h] = pbest/(1-y);\r
5889          }\r
5890       }\r
5891    }        /* for (inode), This closes the big loop */\r
5892 \r
5893    for(i=0; i<tree.nnode; i++)\r
5894       com.oldconP[i] = 0;\r
5895    ReRootTree(oldroot);\r
5896 \r
5897    if(com.seqtype==0 && coding && !com.readpattern) { /* coding seqs analyzed by baseml */\r
5898       fputs("\nBest amino acids reconstructed from nucleotide model.\n",fout);\r
5899       fputs("Prob at each node listed by amino acid (codon) site\n",fout);\r
5900       fputs("(Please ignore if not relevant)\n\n",fout);\r
5901       for(h=0;h<com.ls/3;h++,FPN(fout)) {\r
5902          fprintf(fout,"%4d ", h+1);\r
5903          for(j=0; j<com.ns; j++) {\r
5904             getCodonNode1Site(codon[0], NULL, j, h);\r
5905             Codon2AA(codon[0], aa, com.icode, &i);\r
5906             fprintf(fout," %s(%c)",codon[0],AAs[i]);\r
5907          }\r
5908          fprintf(fout,": ");\r
5909          for (j=0; j<tree.nnode-com.ns; j++) {\r
5910             fprintf(fout," %1c (%5.3f)", AAs[bestAA[j*com.ls/3+h]], pbestAA[j*com.ls/3+h]);\r
5911          }\r
5912       }\r
5913    }\r
5914 \r
5915    /* calculate accuracy measures */\r
5916    zero(pMAPnode,nid);  fillxc(pMAPnodeA, 1., nid);\r
5917    for (inode=0; inode<tree.nnode-com.ns; inode++) {\r
5918       for(h=0; h<com.npatt; h++) {\r
5919          pMAPnode[inode] += com.fpatt[h]*pnode[inode*com.npatt+h]/com.ls;\r
5920          pMAPnodeA[inode] *= pow(pnode[inode*com.npatt+h], com.fpatt[h]);\r
5921       }\r
5922    }\r
5923 \r
5924    fprintf(fout,"\nProb of best state at each node, listed by %s", sitepatt);\r
5925    if (com.ngene>1) fprintf(fout,"\n\n%7s (g) Freq  Data: \n", sitepatt);\r
5926    else             fprintf(fout,"\n\n%7s   Freq   Data: \n", sitepatt);\r
5927 \r
5928    for(h=0; h<lst; h++) {\r
5929       hp = (!com.readpattern ? com.pose[h] : h);\r
5930       fprintf(fout,"\n%4d ",h+1);\r
5931       if (com.ngene>1) {  /* which gene the site is from */\r
5932          for(ig=1; ig<com.ngene; ig++) \r
5933             if(hp<com.posG[ig]) break;\r
5934          fprintf(fout,"(%d)",ig);\r
5935       }\r
5936       fprintf(fout," %5.0f   ", com.fpatt[hp]);\r
5937       print1site(fout, hp);\r
5938       fprintf(fout, ": ");\r
5939 \r
5940       for(j=0; j<nid; j++) {\r
5941          if (com.seqtype!=CODONseq)\r
5942             fprintf(fout,"%c(%5.3f) ", pch[(int)zanc[j*com.npatt+hp]],pnode[j*com.npatt+hp]);\r
5943 #ifdef CODEML\r
5944          else {\r
5945             ic = zanc[j*com.npatt+hp];\r
5946             Codon2AA(CODONs[ic], aa, com.icode, &i);\r
5947             fprintf(fout," %s %1c %5.3f (%1c %5.3f)",\r
5948                CODONs[ic], AAs[i], pnode[j*com.npatt+hp], AAs[(int)bestAA[j*com.npatt+hp]], pbestAA[j*com.npatt+hp]);\r
5949          }\r
5950 #endif\r
5951       }\r
5952       if(noisy && (h+1)%100000==0) printf("\r\tprinting, %d sites done", h+1);\r
5953    }\r
5954    if(noisy && h>=100000) printf("\n");\r
5955 \r
5956    /* Map changes onto branches \r
5957       k1 & k2 are the two characters; p1 and p2 are the two probs. */\r
5958 \r
5959    if(!com.readpattern) {\r
5960       fputs("\n\nSummary of changes along branches.\n",fout);\r
5961       fputs("Check root of tree for directions of change.\n",fout);\r
5962       if(!com.cleandata && com.seqtype==1) \r
5963          fputs("Counts of n & s are incorrect along tip branches with ambiguity data.\n",fout);\r
5964       for(j=0; j<tree.nbranch; j++,FPN(fout)) {\r
5965          inode = tree.branches[j][1];  \r
5966          nchange = 0;\r
5967          fprintf(fout,"\nBranch %d:%5d..%-2d",j+1,tree.branches[j][0]+1,inode+1);\r
5968          if(inode<com.ns) fprintf(fout," (%s) ",com.spname[inode]);\r
5969 \r
5970          if(coding) {\r
5971             lsc = (com.seqtype==1 ? com.ls : com.ls/3);\r
5972             for (h=0,nst=nat=0; h<lsc; h++)  {\r
5973                getCodonNode1Site(codon[0], zanc, inode, h);\r
5974                getCodonNode1Site(codon[1], zanc, tree.branches[j][0], h);\r
5975                difcodonNG(codon[0], codon[1], &S, &N, &ns,&na, 0, com.icode);\r
5976                nst += ns;\r
5977                nat += na;\r
5978             }\r
5979             fprintf(fout," (n=%4.1f s=%4.1f)",nat,nst);\r
5980          }\r
5981          fprintf(fout,"\n\n");\r
5982          for(h=0; h<lst; h++) {\r
5983             hp = (!com.readpattern ? com.pose[h] : h);\r
5984             if (com.seqtype!=CODONseq) {\r
5985                if(inode<com.ns)\r
5986                   k2 = pch[(int)com.z[inode][hp]];\r
5987                else {\r
5988                   k2 = pch[(int)zanc[(inode-com.ns)*com.npatt+hp]]; \r
5989                   p2 = pnode[(inode-com.ns)*com.npatt+hp];\r
5990                }\r
5991                k1 = pch[ zanc[(tree.branches[j][0]-com.ns)*com.npatt+hp] ];\r
5992                p1 = pnode[(tree.branches[j][0]-com.ns)*com.npatt+hp];\r
5993             }\r
5994 #ifdef CODEML\r
5995             else {\r
5996                if(inode<com.ns) {\r
5997                   strcpy(codon[1], CODONs[com.z[inode][hp]]);\r
5998                   k2 = GetAASiteSpecies(inode, hp);\r
5999                }\r
6000                else {\r
6001                   strcpy(codon[1], CODONs[(int)zanc[(inode-com.ns)*com.npatt+hp]]);\r
6002                   k2 = AAs[(int)bestAA[(inode-com.ns)*com.npatt+hp]];\r
6003                   p2 = pbestAA[(inode-com.ns)*com.npatt+hp];\r
6004                }\r
6005                strcpy(codon[0], CODONs[(int)zanc[(tree.branches[j][0]-com.ns)*com.npatt+hp]]);\r
6006                k1 = AAs[(int)bestAA[(tree.branches[j][0]-com.ns)*com.npatt+hp]];\r
6007                p1 = pbestAA[(tree.branches[j][0]-com.ns)*com.npatt+hp];\r
6008 \r
6009                if(strcmp(codon[0],codon[1])) {\r
6010                   if(inode<com.ns) \r
6011                      fprintf(fout,"\t%4d %s (%c) %.3f -> %s (%c)\n",     h+1,codon[0],k1,p1, codon[1],k2);\r
6012                   else\r
6013                      fprintf(fout,"\t%4d %s (%c) %.3f -> %s (%c) %.3f\n",h+1,codon[0],k1,p1, codon[1],k2,p2);\r
6014                }\r
6015                k1 = k2 = 0;\r
6016             }\r
6017 #endif\r
6018             if(k1==k2) continue;\r
6019             fprintf(fout,"\t%4d ",h+1);\r
6020 \r
6021 #ifdef SITELABELS\r
6022             if(sitelabels) fprintf(fout," %5s   ",sitelabels[h]);\r
6023 #endif\r
6024             if(inode<com.ns) fprintf(fout,"%c %.3f -> %1c\n",k1,p1,k2);\r
6025             else             fprintf(fout,"%c %.3f -> %1c %.3f\n",k1,p1,k2,p2);\r
6026             nchange++;\r
6027          }\r
6028       }\r
6029    }\r
6030 \r
6031    ListAncestSeq(fout, zanc);\r
6032    fprintf(fout,"\n\nOverall accuracy of the %d ancestral sequences:", nid);\r
6033    matout2(fout,pMAPnode, 1, nid, 9,5);  fputs("for a site.\n",fout);\r
6034    matout2(fout,pMAPnodeA, 1, nid, 9,5); fputs("for the sequence.\n", fout);\r
6035 \r
6036    /* best amino acid sequences from codonml */\r
6037 #ifdef CODEML\r
6038    if(com.seqtype==1) {\r
6039       fputs("\n\nAmino acid sequences inferred by codonml.\n",fout);\r
6040       if(!com.cleandata) \r
6041          fputs("Results unreliable for sites with alignment gaps.\n",fout);\r
6042       for(inode=0; inode<nid; inode++) {\r
6043          fprintf(fout,"\nNode #%-10d  ",com.ns+inode+1);\r
6044          for(h=0; h<lst; h++) {\r
6045             hp = (!com.readpattern ? com.pose[h] : h);\r
6046             fprintf(fout, "%c", AAs[(int)bestAA[inode*com.npatt+hp]]);\r
6047             if((h+1)%10==0) fputc(' ', fout);\r
6048          }\r
6049       }\r
6050       FPN(fout);\r
6051    }\r
6052 #endif\r
6053    ChangesSites(fout, coding, zanc);\r
6054 \r
6055    free(pnode);\r
6056    free(pChar1node);\r
6057    if(coding) free(pbestAA);\r
6058    return (0);\r
6059 }\r
6060 \r
6061 \r
6062 void getCodonNode1Site(char codon[], char zanc[], int inode, int site)\r
6063 {\r
6064 /* this is used to retrive the codon from a codon sequence for codonml \r
6065    or coding sequence in baseml, used in ancestral reconstruction\r
6066    zanc has ancestral sequences\r
6067    site is codon site\r
6068 */\r
6069    int i, hp;\r
6070 \r
6071    for(i=0; i<3; i++)  /* to force crashes */\r
6072       codon[i]=-1;\r
6073    if(com.seqtype==CODONseq) {\r
6074       hp = (!com.readpattern ? com.pose[site] : site);\r
6075 #ifdef CODEML\r
6076       if(inode>=com.ns)\r
6077          strcpy(codon, CODONs[zanc[(inode-com.ns)*com.npatt+hp]]);\r
6078       else\r
6079          strcpy(codon, CODONs[com.z[inode][hp]]);\r
6080 #endif\r
6081    }\r
6082    else {      /* baseml coding reconstruction */\r
6083       if(inode>=com.ns)\r
6084          for(i=0; i<3; i++)\r
6085             codon[i] = BASEs[(int)zanc[(inode-com.ns)*com.npatt+com.pose[site*3+i]]];\r
6086       else\r
6087          for(i=0; i<3; i++) codon[i] = BASEs[ com.z[inode][com.pose[site*3+i]] ];\r
6088    }\r
6089 \r
6090 }\r
6091 \r
6092 int ChangesSites(FILE*frst, int coding, char *zanc)\r
6093 {\r
6094 /* this lists and counts changes at sites from reconstructed ancestral sequences\r
6095    com.z[] has the data, and zanc[] has the ancestors\r
6096    For codon sequences (codonml or baseml with com.coding), synonymous and \r
6097    nonsynonymous changes are counted separately.\r
6098    Added in Nov 2000.\r
6099 */\r
6100    char *pch=(com.seqtype==0 ? BASEs : (com.seqtype==2 ? AAs: (com.seqtype==5?BASEs5:BINs)));\r
6101    char codon[2][4]={"   ","   "};\r
6102    int  h,hp,inode,k1,k2,d, ls1=(com.readpattern?com.npatt:com.ls);\r
6103    double S,N,Sd,Nd, S1,N1,Sd1,Nd1, b,btotal=0, p,C;\r
6104 \r
6105    if(com.seqtype==0 && coding) ls1/=3;\r
6106    if(coding) {\r
6107       fprintf(frst,"\n\nCounts of changes at sites, listed by %s\n\n", \r
6108          (com.readpattern?"pattern":"site"));\r
6109       fprintf(frst1,"\nList of sites with changes according to ancestral reconstruction\n");\r
6110       fprintf(frst1,"Suzuki-Gojobori (1999) style test\n");\r
6111       if(!com.cleandata)\r
6112          fprintf(frst, "(Counts of n & s are incorrect at sites with ambiguity data)\n\n");\r
6113 \r
6114       for(inode=0; inode<tree.nnode; inode++)  \r
6115          if(inode!=tree.root) btotal += nodes[inode].branch;\r
6116       for(h=0; h<ls1; h++) {\r
6117          fprintf(frst,"%4d ",h+1);\r
6118          for(inode=0,S=N=Sd=Nd=0; inode<tree.nnode; inode++) {\r
6119             if(inode==tree.root) continue;\r
6120             b = nodes[inode].branch;\r
6121             getCodonNode1Site(codon[0], zanc, nodes[inode].father, h);\r
6122             getCodonNode1Site(codon[1], zanc, inode, h);\r
6123 \r
6124             difcodonNG(codon[0], codon[1], &S1, &N1, &Sd1, &Nd1, 0, com.icode);\r
6125             S += S1*b/btotal;\r
6126             N += N1*b/btotal;\r
6127             if(Sd1 || Nd1) {\r
6128                Sd += Sd1;\r
6129                Nd += Nd1;\r
6130                fprintf(frst," %3s.%3s ",codon[0],codon[1]);\r
6131             }\r
6132          }\r
6133          b = S+N; S /= b;  N /= b;\r
6134          fprintf(frst,"(S N: %7.3f%7.3f Sd Nd: %6.1f %5.1f)\n", S*3,N*3,Sd,Nd);\r
6135          fprintf(frst1,"%4d S N: %7.3f%7.3f Sd Nd: %6.1f %5.1f ", h+1,S*3,N*3,Sd,Nd);\r
6136          if(Sd+Nd) {\r
6137             if(Nd/(Sd+Nd)<N) {\r
6138                for(d=0,p=0,C=1; d<=Nd; d++) {\r
6139                   p += C*pow(N,d) * pow(1-N,Sd+Nd-d);\r
6140                   C *= (Sd+Nd-d)/(d+1);\r
6141                }\r
6142                fprintf(frst1," - p =%6.3f %s", p,(p<.01?"**":(p<.05?"*":"")));\r
6143             }\r
6144             else {\r
6145                for(d=0,p=0,C=1; d<=Sd; d++) {\r
6146                   p += C*pow(S,d)*pow(1-S,Sd+Nd-d);\r
6147                   C *= (Sd+Nd-d)/(d+1);\r
6148                }\r
6149                fprintf(frst1," + p =%6.3f %s", p,(p<.01?"**":(p<.05?"*":"")));\r
6150             }\r
6151          }\r
6152          fprintf(frst1,"\n");\r
6153       }\r
6154    }\r
6155    else {  /* noncoding nucleotide or aa sequences */\r
6156       fprintf(frst,"\n\nCounts of changes at sites%s\n\n",\r
6157          (com.readpattern?", listed by pattern":""));\r
6158       for(h=0; h<ls1; h++) {\r
6159          hp=(!com.readpattern ? com.pose[h] : h);\r
6160          fprintf(frst,"%4d ",h+1);\r
6161          for(inode=0,d=0;inode<tree.nnode;inode++) {\r
6162             if(inode==tree.root) continue;\r
6163             k1 = pch[(int) zanc[(nodes[inode].father-com.ns)*com.npatt+hp] ];\r
6164             if(inode<com.ns)\r
6165                k2 = pch[com.z[inode][hp]];\r
6166             else  \r
6167                k2 = pch[(int) zanc[(inode-com.ns)*com.npatt+hp] ];\r
6168             if(k1!=k2) {\r
6169                d++;\r
6170                fprintf(frst," %c%c", k1,k2);\r
6171             }\r
6172          }\r
6173          fprintf(frst," (%d)\n", d);\r
6174       }\r
6175    }\r
6176    return(0);\r
6177 }\r
6178 \r
6179 \r
6180 \r
6181 #define  NBESTANC  4  /* use 1 2 3 or 4 */\r
6182 int  parsimony=0, *nBestScore, *icharNode[NBESTANC], *combIndex;\r
6183 double *fhsiteAnc, *lnPanc[NBESTANC], *PMatTips, *combScore;\r
6184 char *charNode[NBESTANC], *ancSeq, *ancState1site;\r
6185 FILE *fanc;\r
6186 int largeReconstruction;\r
6187 \r
6188 void DownPassPPSG2000OneSite (int h, int inode, int inodestate, int ipath);\r
6189 void PrintAncState1site (char ancState1site[], double prob);\r
6190 \r
6191 \r
6192 double P0[16]={0, 1, 1.5, 1.5, \r
6193                1, 0, 1.5, 1.5, \r
6194                1.5, 1.5, 0, 1, \r
6195                1.5, 1.5, 1, 0};\r
6196 \r
6197 double piroot[NCODE]={0};\r
6198 \r
6199 /* combIndex[] uses two bits for each son to record the path that is taken by \r
6200    each reconstruction; for 32-bit integers, the maximum number of sons for \r
6201    each node is 16.\r
6202 \r
6203    lnPanc[3][(tree.nnode-com.ns)*npatt*n] uses the space of com.conP.  \r
6204    It holds the ln(Pr) for the best reconstructions at the subtree down inode \r
6205    given the state of the father node.  \r
6206    charNode[0,1,2] holds the corresponding state at inode.   \r
6207    \r
6208    int nBestScore[maxnson];\r
6209    int   combIndex[2*n*ncomb];  \r
6210    double *combScore[n*ncomb];\r
6211    char ancSeq[nintern*npatt], ancState1site[nintern]; \r
6212    int  icharNode[NBESTANC][nintern*npatt*n];\r
6213    char  charNode[NBESTANC][nintern*npatt*n];\r
6214 */\r
6215 \r
6216 void UpPassPPSG2000 (int inode, int igene, double x[])\r
6217 {\r
6218 /* The algorithm of PPSG2000, modified.  This routine is based on ConditionalPNode(). \r
6219    lnPanc[h*n+i] is the best lnP, given that inode has state i.  \r
6220    charNode[] stores the characters that achieved the best lnP.\r
6221 */\r
6222    int debug=0;\r
6223    int n=com.ncode, it,ibest,i,j,k,h, ison, nson=nodes[inode].nson, *pc;\r
6224    int pos0=com.posG[igene],pos1=com.posG[igene+1], ichar,jchar;\r
6225    int ncomb=1,icomb, ipath;\r
6226    double t, y, psum1site=-1;\r
6227 \r
6228    if(com.ncode!=4) debug=0;   \r
6229 \r
6230    for(i=0; i<nson; i++)\r
6231       if(nodes[nodes[inode].sons[i]].nson>0)\r
6232          UpPassPPSG2000(nodes[inode].sons[i], igene, x);\r
6233    for(i=0,ncomb=1; i<nson; i++)\r
6234       ncomb *= (nBestScore[i] = (nodes[nodes[inode].sons[i]].nson>0 ? NBESTANC : 1));\r
6235    if(debug) {\r
6236       printf("\n\nNode %2d has sons ", inode+1);\r
6237       for(i=0; i<nson; i++) printf(" %2d", nodes[inode].sons[i]+1);\r
6238       printf("  ncomb=%2d: ", ncomb);\r
6239       for(i=0; i<nson; i++) printf(" %2d", nBestScore[i]);  FPN(F0);\r
6240    }\r
6241 \r
6242    if(inode!=tree.root) {    /* calculate log{P(t)} from father to inode */\r
6243       t = nodes[inode].branch*_rateSite;\r
6244       if(com.clock<5) {\r
6245          if(com.clock)  t *= GetBranchRate(igene,(int)nodes[inode].label,x,NULL);\r
6246          else           t *= com.rgene[igene];\r
6247       }\r
6248       GetPMatBranch(PMat, x, t, inode);\r
6249       for(j=0; j<n*n; j++)\r
6250          PMat[j] = (PMat[j]<1e-300 ? 300 : -log(PMat[j]));\r
6251    }\r
6252 \r
6253    for(h=pos0; h<pos1; h++) {  /* loop through site patterns */\r
6254       if(h) debug=0;\r
6255       /* The last round for inode==tree.root, shares some code with other nodes, \r
6256          and is thus embedded in the same loop.  Alternatively this round can be \r
6257          taken out of the loop with some code duplicated.\r
6258       */\r
6259       for(ichar=0; ichar<(inode!=tree.root?n:1); ichar++) { /* ichar for father */\r
6260          /* given ichar for the father, what are the best reconstructions at \r
6261             inode?  Look at n*ncomb possibilities, given father state ichar.\r
6262          */\r
6263          if(debug) {\r
6264             if(inode==tree.root) printf("\n\nfather is root\n");\r
6265             else  printf("\n\nichar = %2d  %c for father\n", ichar+1,BASEs[ichar]);\r
6266          }\r
6267 \r
6268          for(icomb=0; icomb<n*ncomb; icomb++) {\r
6269             jchar = icomb/ncomb;      /* jchar is for inode */\r
6270             if(inode==tree.root) \r
6271                combScore[icomb] = -log(com.pi[jchar]+1e-300);\r
6272             else\r
6273                combScore[icomb] = PMat[ichar*n+jchar];\r
6274 \r
6275             if(inode==tree.root && parsimony) combScore[icomb] = 0;\r
6276 \r
6277             if(debug) printf("comb %2d %c", icomb+1,BASEs[jchar]);\r
6278 \r
6279             for(i=0,it=icomb%ncomb; i<nson; i++) { /* The ibest-th state in ison. */\r
6280                ison = nodes[inode].sons[i];\r
6281                ibest = it%nBestScore[i];\r
6282                it /= nBestScore[i];\r
6283 \r
6284                if(nodes[ison].nson)    /* internal node */\r
6285                   y = lnPanc[ibest][(ison-com.ns)*com.npatt*n+h*n+jchar];\r
6286                else if (com.cleandata)  /* tip clean: PMatTips[] has log{P(t)}. */\r
6287                   y = PMatTips[ ison*n*n + jchar*n + com.z[ison][h] ];\r
6288                else {                   /* tip unclean: PMatTips[] has P(t). */\r
6289                   for(k=0,y=0; k<nChara[com.z[ison][h]]; k++)\r
6290                      y += PMatTips[ ison*n*n+jchar*n + CharaMap[com.z[ison][h]][k] ];\r
6291                   y = -log(y);\r
6292                }\r
6293 \r
6294                combScore[icomb] += y;\r
6295                if(debug) printf("%*s son %2d #%2d %7.1f\n", (i?10:1),"", ison+1, ibest+1,y);\r
6296             }\r
6297          }  /* for(icomb) */\r
6298 \r
6299          if(debug) { printf("score "); for(i=0;i<n*ncomb; i++) printf(" %4.1f",combScore[i]); FPN(F0); }\r
6300          indexing(combScore, n*ncomb, combIndex, 0, combIndex+n*ncomb);\r
6301          if(debug) { printf("index "); for(i=0;i<n*ncomb; i++) printf(" %4d",combIndex[i]); FPN(F0); }\r
6302 \r
6303          /* print out reconstructions at the site if inode is root. */\r
6304          if(inode==tree.root) {\r
6305             fprintf(fanc,"%4d ", h+1);\r
6306             if(com.ngene>1) fprintf(fanc,"(%d) ", igene+1);\r
6307             fprintf(fanc," %6.0f  ",com.fpatt[h]);\r
6308             print1site(fanc, h); \r
6309             fprintf(fanc, ": ");\r
6310          }\r
6311          psum1site=0;  /* used if inode is root */\r
6312 \r
6313          for(j=0; j<(inode!=tree.root ? NBESTANC : n*ncomb); j++) {\r
6314             jchar = (it=combIndex[j])/ncomb; it%=ncomb;\r
6315             if(j<NBESTANC) {\r
6316                lnPanc[j][(inode-com.ns)*com.npatt*n+h*n+ichar] = combScore[combIndex[j]];\r
6317                charNode[j][(inode-com.ns)*com.npatt*n+h*n+ichar] = jchar;\r
6318             }\r
6319             if(debug) printf("\t#%d: %6.1f %c ", j+1, combScore[combIndex[j]], BASEs[jchar]);\r
6320 \r
6321             for(i=0,ipath=0; i<nson; i++) {\r
6322                ison=nodes[inode].sons[i]; \r
6323                ibest=it%nBestScore[i];\r
6324                it/=nBestScore[i];\r
6325                ipath |= ibest<<(2*i);\r
6326                if(debug) printf("%2d", ibest+1);\r
6327             }\r
6328             if(j<NBESTANC) \r
6329                icharNode[j][(inode-com.ns)*com.npatt*n+h*n+ichar]=ipath;\r
6330 \r
6331             if(debug) printf(" (%o)", ipath);\r
6332    \r
6333             /* print if inode is root. */\r
6334             if(inode==tree.root) {\r
6335                ancState1site[inode-com.ns]=jchar;\r
6336                if(parsimony) y = combScore[combIndex[j]];\r
6337                else          psum1site += y = exp(-combScore[combIndex[j]]-fhsiteAnc[h]);\r
6338 \r
6339                for(i=0; i<nson; i++) {\r
6340                   if(nodes[ison=nodes[inode].sons[i]].nson)\r
6341                      DownPassPPSG2000OneSite(h, tree.root, jchar, ipath);\r
6342                }\r
6343                PrintAncState1site(ancState1site, y);\r
6344                if(j>NBESTANC && y<.001) break;\r
6345             }\r
6346          }  /* for(j) */\r
6347       }     /* for(ichar) */\r
6348       if(inode==tree.root) fprintf(fanc," (total %6.3f)\n", psum1site);\r
6349 \r
6350       if(largeReconstruction && (h+1)%2000==0)\r
6351          printf("\r\tUp pass for gene %d node %d sitepatt %d.", igene+1,inode+1,h+1);\r
6352 \r
6353    }        /* for(h) */\r
6354    if(largeReconstruction)\r
6355       printf("\r\tUp pass for gene %d node %d.", igene+1,inode+1);\r
6356 }\r
6357 \r
6358 void DownPassPPSG2000OneSite (int h, int inode, int inodestate, int ipath)\r
6359 {\r
6360 /* this puts the state in ancState1site[nintern], using \r
6361    int icharNode[NBESTANC][nintern*npatt*n],\r
6362    char charNode[NBESTANC][nintern*npatt*n].\r
6363    jchar is the state at inode, and ipath is the ipath code for inode.\r
6364 */\r
6365    int n=com.ncode, i, ison, ibest, sonstate;\r
6366 \r
6367    for(i=0; i<nodes[inode].nson; i++) {\r
6368       ison=nodes[inode].sons[i];\r
6369       if(nodes[ison].nson>1) {\r
6370          ibest = (ipath & (3<<(2*i))) >> (2*i);\r
6371          ancState1site[ison-com.ns] = sonstate =\r
6372             charNode[ibest][(ison-com.ns)*com.npatt*n+h*n+inodestate];\r
6373          DownPassPPSG2000OneSite(h, ison, sonstate, \r
6374            icharNode[ibest][(ison-com.ns)*com.npatt*n+h*n+inodestate]);\r
6375       }\r
6376    }\r
6377 }\r
6378 \r
6379 \r
6380 void PrintAncState1site (char ancState1site[], double prob)\r
6381 {\r
6382    int i;\r
6383    char codon[4]="";\r
6384    char *pch=(com.seqtype==0 ? BASEs : (com.seqtype==2 ? AAs: (com.seqtype==5?BASEs5:BINs)));\r
6385 \r
6386    for(i=0; i<tree.nnode-com.ns; i++) {\r
6387       if(com.seqtype==1) {\r
6388 #ifdef CODEML\r
6389          fprintf(fanc,"%s ",getcodon(codon,FROM61[(int)ancState1site[i]]));\r
6390 #endif   \r
6391       }\r
6392       else\r
6393          fprintf(fanc, "%c", pch[(int)ancState1site[i]]);\r
6394    }\r
6395    fprintf(fanc," (%5.3f) ", prob);\r
6396 }\r
6397 \r
6398 void DownPassPPSG2000 (int inode)\r
6399 {\r
6400 /* this reads out the best chara for inode from charNode[] into ancSeq[].\r
6401 */\r
6402    int i,ison, h;\r
6403    char c0=0;\r
6404 \r
6405    for(h=0; h<com.npatt; h++) {\r
6406       if(inode!=tree.root) \r
6407          c0=ancSeq[(nodes[inode].father-com.ns)*com.npatt+h];\r
6408       ancSeq[(inode-com.ns)*com.npatt+h]\r
6409          = charNode[0][(inode-com.ns)*com.npatt*com.ncode+h*com.ncode+c0];\r
6410    }\r
6411    for(i=0; i<nodes[inode].nson; i++)\r
6412       if(nodes[ison=nodes[inode].sons[i]].nson > 1)\r
6413          DownPassPPSG2000(ison);\r
6414 }\r
6415 \r
6416 \r
6417 \r
6418 int AncestralJointPPSG2000 (FILE *fout, double x[])\r
6419 {\r
6420 /* Ziheng Yang, 8 June 2000, rewritten on 8 June 2005.\r
6421    Joint ancestral reconstruction, taking character states for all nodes at a \r
6422    site as one entity, based on the algorithm of Pupko et al. (2000 \r
6423    Mol. Biol. Evol. 17:890-896).\r
6424 \r
6425    fhsiteAns[]: fh[] for each site pattern\r
6426    nodes[].conP[] are destroyed and restored at the end of the routine.\r
6427    ancSeq[] stores the ancestral seqs, the best reconstruction.\r
6428 \r
6429    This outputs results by pattern.  I tried to print results by sites, but gave up as \r
6430    some variables use the same memory (e.g., combIndex) for different site patterns.\r
6431 */\r
6432    char *pch=(com.seqtype==0 ? BASEs : (com.seqtype==2 ? AAs: (com.seqtype==5?BASEs5:BINs)));\r
6433    char codon[4]="";\r
6434    int n=com.ncode,nintern=tree.nnode-com.ns, i,j,k,h,hp,igene;\r
6435    int maxnson=0, maxncomb, lst=(com.readpattern?com.npatt:com.ls);\r
6436    char *sitepatt=(com.readpattern?"pattern":"site");\r
6437    double t;\r
6438    size_t sconPold = com.sconP, s;\r
6439 \r
6440    largeReconstruction = (noisy && (com.ns>300 || com.ls>1000000));\r
6441 \r
6442    if(noisy) puts("Joint reconstruction.");\r
6443 \r
6444    for(i=0; i<tree.nnode; i++) maxnson=max2(maxnson,nodes[i].nson);\r
6445    if(maxnson>16 || NBESTANC>4) /* for int at least 32 bits */\r
6446       error2("NBESTANC too large or too many sons.");\r
6447    for(i=0,maxncomb=1; i<maxnson; i++) maxncomb*=NBESTANC;\r
6448    if((PMatTips=(double*)malloc(com.ns*n*n*sizeof(double)))==NULL) \r
6449       error2("oom PMatTips");\r
6450    s = NBESTANC*nintern*(size_t)com.npatt*n*sizeof(double);\r
6451    if(s > sconPold) {\r
6452       com.sconP = s;\r
6453       printf("\n%9lu bytes for conP, adjusted\n", com.sconP);\r
6454       if((com.conP=(double*)realloc(com.conP,com.sconP))==NULL)\r
6455          error2("oom conP");\r
6456    }\r
6457    s = NBESTANC*nintern*com.npatt*n;\r
6458    s = ((s*sizeof(int)+(s+nintern)*sizeof(char)+16)/sizeof(double))*sizeof(double);\r
6459    if(s > com.sspace) {\r
6460       com.sspace=s;\r
6461       printf("\n%9lu bytes for space, adjusted\n",com.sspace);\r
6462       if((com.space=(double*)realloc(com.space,com.sspace))==NULL) error2("oom space");\r
6463    }\r
6464    for(i=0; i<NBESTANC; i++) {\r
6465       lnPanc[i]= com.conP+i*nintern*com.npatt*n;\r
6466       icharNode[i] = (int*)com.space+i*nintern*com.npatt*n;\r
6467       charNode[i] = (char*)((int*)com.space+NBESTANC*nintern*com.npatt*n)\r
6468                   + i*nintern*com.npatt*n;\r
6469       ancState1site = charNode[0]+NBESTANC*nintern*com.npatt*n;\r
6470    }\r
6471    if((ancSeq=(char*)malloc(nintern*com.npatt*n*sizeof(char)))==NULL)\r
6472       error2("oom charNode");\r
6473 \r
6474    if((combScore=(double*)malloc((3*n*maxncomb+com.ns)*sizeof(double)))==NULL)\r
6475       error2("oom combScore");\r
6476    nBestScore = (int*)(combScore+n*maxncomb);\r
6477    combIndex = nBestScore + com.ns;  /* combIndex[2*n*ncomb] contains work space */\r
6478 \r
6479    fanc = fout;\r
6480    fprintf(fout, "\n\n(2) Joint reconstruction of ancestral sequences\n");\r
6481    fprintf(fout, "(eqn. 2 in Yang et al. 1995 Genetics 141:1641-1650), using ");\r
6482    fprintf(fout, "the algorithm of Pupko et al. (2000 Mol Biol Evol 17:890-896),\n");\r
6483    fprintf(fout, "modified to generate sub-optimal reconstructions.\n");\r
6484    fprintf(fout, "\nReconstruction (prob.), listed by pattern (use the observed data to find the right site).\n");\r
6485    fprintf(fout, "\nPattern Freq   Data:\n\n"); \r
6486 \r
6487    for(igene=0; igene<com.ngene; igene++) {\r
6488       if(com.Mgene>1) SetPGene(igene,1,1,0,x);\r
6489       for(i=0; i<com.ns; i++) {\r
6490          t = nodes[i].branch*_rateSite;\r
6491          if(com.clock<5) {\r
6492             if(com.clock)  t *= GetBranchRate(igene,(int)nodes[i].label,x,NULL);\r
6493             else           t *= com.rgene[igene];\r
6494          }\r
6495          GetPMatBranch(PMatTips+i*n*n, x, t, i);\r
6496       }\r
6497 \r
6498       if(com.cleandata) {\r
6499          for(i=0; i<com.ns*n*n; i++)\r
6500             PMatTips[i] = (PMatTips[i]<1e-20 ? 300 : -log(PMatTips[i]));\r
6501       }\r
6502       if(parsimony) \r
6503          for(i=0; i<com.ns; i++)\r
6504              xtoy(P0, PMatTips+i*n*n, n*n);\r
6505 \r
6506       UpPassPPSG2000(tree.root, igene, x); /* this prints into frst as well */\r
6507    }\r
6508 \r
6509    if(largeReconstruction) puts("\n\tDown pass.");\r
6510    DownPassPPSG2000(tree.root);\r
6511 \r
6512    ListAncestSeq(fout, ancSeq);\r
6513 \r
6514    free(ancSeq);\r
6515    free(PMatTips);\r
6516    free(combScore);\r
6517    com.sconP = sconPold;\r
6518    if((com.conP=(double*)realloc(com.conP,com.sconP))==NULL)\r
6519       error2("conP");\r
6520    PointconPnodes();\r
6521    return (0);\r
6522 }\r
6523 \r
6524 \r
6525 \r
6526 int AncestralSeqs (FILE *fout, double x[])\r
6527 {\r
6528 /* Ancestral sequence reconstruction using likelihood (Yang et al. 1995).\r
6529    Marginal works with constant rate and variable rates among sites.\r
6530    Joint works only with constant rate among sites (ncatG=1).\r
6531 */\r
6532    int h, k, i;\r
6533    double lnL, *ScaleC=NULL;  /* collected scale factors */\r
6534 \r
6535    if(com.Mgene==1)\r
6536       error2("When Mgene=1, use RateAncestor = 0.");\r
6537    if (tree.nnode==com.ns) \r
6538       { puts("\nNo ancestral nodes to reconstruct..\n");  return(0); }\r
6539    if (noisy) printf ("\nReconstructed ancestral states go into file rst.\n");\r
6540    fprintf(fout, "\nAncestral reconstruction by %sML.\n",\r
6541           (com.seqtype==0?"BASE":(com.seqtype==1?"CODON":"AA")));\r
6542    FPN(fout);  OutTreeN(fout,1,1);  FPN(fout);  FPN(fout);\r
6543    OutTreeN(fout,0,0);  FPN(fout);  FPN(fout);\r
6544    OutTreeB(fout);      FPN(fout);\r
6545 \r
6546    fputs("\ntree with node labels for Rod Page's TreeView\n",fout);\r
6547    OutTreeN(fout,1,PrNodeNum);  FPN(fout);\r
6548 \r
6549    fprintf (fout, "\nNodes %d to %d are ancestral\n", com.ns+1,tree.nnode);\r
6550    if((fhsiteAnc=(double*)malloc(com.npatt*sizeof(double)))==NULL)\r
6551       error2("oom fhsiteAnc");\r
6552    if(com.NnodeScale && com.ncatG>1)\r
6553       if((ScaleC=(double*)malloc(max2(com.npatt,com.ncatG) *sizeof(double)))==NULL) \r
6554          error2("oom ScaleC in AncestralSeqs");\r
6555 \r
6556    if(com.alpha)\r
6557       puts("Rates are variable among sites, marginal reconstructions only.");\r
6558    if(!com.cleandata) fputs("Unreliable at sites with alignment gaps\n", fout);\r
6559 \r
6560    if(com.ncatG<=1 || com.method!=1)\r
6561       ProbSitePattern (x, &lnL, fhsiteAnc, ScaleC);\r
6562 \r
6563 #ifdef BASEML\r
6564    if(com.nhomo<=1)\r
6565 #endif\r
6566            AncestralMarginal(fout, x, fhsiteAnc, ScaleC);\r
6567    \r
6568    fflush(fout);\r
6569    /* fhsiteAnc[] is modified by both Marginal and Joint. */\r
6570    if(com.ncatG<=1 && tree.nnode>com.ns+1) {\r
6571       ProbSitePattern (x, &lnL, fhsiteAnc, ScaleC);\r
6572       for(h=0; h<com.npatt; h++) {\r
6573          fhsiteAnc[h] = log(fhsiteAnc[h]);\r
6574          for(k=0; k<com.NnodeScale; k++) \r
6575             fhsiteAnc[h] += com.nodeScaleF[k*com.npatt+h];\r
6576       }\r
6577       /* AncestralJointPPSG2000 corrupts com.conP[] and fhsiteAnc[]. \r
6578       */\r
6579       AncestralJointPPSG2000(fout, x);\r
6580    }\r
6581    FPN(fout);\r
6582    free(fhsiteAnc);\r
6583    if(com.NnodeScale && com.ncatG>1) free(ScaleC);\r
6584 \r
6585    return (0);\r
6586 }\r
6587 \r
6588 \r
6589 #endif\r
6590 \r
6591 \r
6592 int SetNodeScale(int inode);\r
6593 int NodeScale(int inode, int pos0, int pos1);\r
6594 \r
6595 void InitializeNodeScale(void)\r
6596 {\r
6597 /* This allocates memory to hold scale factors for nodes and also decide on the \r
6598    nodes for scaling by calling SetNodeScale().  \r
6599    The scaling node is chosen before the iteration by counting the number of \r
6600    nodes visited in the post-order tree travesal algorithm (see the routine \r
6601    SetNodeScale).\r
6602    See Yang (2000 JME 51:423-432) for details.\r
6603    The memory required is  com.NnodeScale*com.npatt*sizeof(double).\r
6604 */\r
6605    int i, nS;\r
6606 \r
6607    if(com.clock>=5) return;\r
6608 \r
6609    com.NnodeScale = 0;\r
6610    com.nodeScale = (char*)realloc(com.nodeScale, tree.nnode*sizeof(char));\r
6611    if(com.nodeScale==NULL) error2("oom");\r
6612    for(i=0; i<tree.nnode; i++) com.nodeScale[i] = 0;\r
6613    SetNodeScale(tree.root);\r
6614    nS = com.NnodeScale*com.npatt;\r
6615    if(com.conPSiteClass) nS *= com.ncatG;\r
6616    if(com.NnodeScale) {\r
6617       if((com.nodeScaleF=(double*)realloc(com.nodeScaleF, nS*sizeof(double)))==NULL)\r
6618          error2("oom nscale");\r
6619       for(i=0; i<nS; i++) com.nodeScaleF[i] = 0;\r
6620 \r
6621       if(noisy) {\r
6622          printf("\n%d node(s) used for scaling (Yang 2000 J Mol Evol 51:423-432):\n",com.NnodeScale);\r
6623          for(i=0; i<tree.nnode; i++)\r
6624             if(com.nodeScale[i]) printf(" %2d",i+1);\r
6625          FPN(F0);\r
6626       }\r
6627    }\r
6628 }\r
6629 \r
6630 \r
6631 int SetNodeScale (int inode)\r
6632 {\r
6633 /* This marks nodes for applying scaling factors when calculating f[h].\r
6634 */\r
6635    int i,ison, d=0, every;\r
6636 \r
6637    if(com.seqtype==0)       every=100;   /* baseml */\r
6638    else if(com.seqtype==1)  every=15;    /* codonml */\r
6639    else                     every=50;    /* aaml */\r
6640 \r
6641    for(i=0; i<nodes[inode].nson; i++) {\r
6642       ison = nodes[inode].sons[i];\r
6643       d += (nodes[ison].nson ? SetNodeScale(ison) : 1);\r
6644    }\r
6645    if(inode!=tree.root && d>every) {\r
6646       com.nodeScale[inode] = 1;\r
6647       d = 1;\r
6648       com.NnodeScale++; \r
6649    }\r
6650    return(d);\r
6651 }\r
6652 \r
6653 \r
6654 int NodeScale (int inode, int pos0, int pos1)\r
6655 {\r
6656 /* scale to avoid underflow\r
6657 */\r
6658    int h,k,j, n=com.ncode;\r
6659    double t, smallw=1e-12;\r
6660 \r
6661    for(j=0,k=0; j<tree.nnode; j++)   /* k-th node for scaling */\r
6662       if(j==inode) break;\r
6663       else if(com.nodeScale[j]) k++;\r
6664 \r
6665    for(h=pos0; h<pos1; h++) {\r
6666       for(j=0,t=0;j<n;j++)\r
6667          if(nodes[inode].conP[h*n+j]>t)\r
6668             t = nodes[inode].conP[h*n+j];\r
6669 \r
6670       if(t<1e-300) {\r
6671          for(j=0;j<n;j++)\r
6672             nodes[inode].conP[h*n+j]=1;  /* both 0 and 1 fine */\r
6673          com.nodeScaleF[k*com.npatt+h] = -800;  /* this is problematic? */\r
6674       }\r
6675       else {  \r
6676          for(j=0;j<n;j++)\r
6677             nodes[inode].conP[h*n+j]/=t;\r
6678          com.nodeScaleF[k*com.npatt+h] = log(t);\r
6679       }\r
6680    }\r
6681    return(0);\r
6682 }\r
6683 \r
6684 \r
6685 \r
6686 static double *dfsites;\r
6687 \r
6688 int fx_r(double x[], int np);\r
6689 \r
6690 \r
6691 #if (BASEML || CODEML)\r
6692 \r
6693 int HessianSKT2004 (double xmle[], double lnLm, double g[], double H[])\r
6694 {\r
6695 /* this calculates the hessian matrix of branch lengths using the approximation \r
6696    of Seo et al. (2004), especially useful for approximate likelihood calcualtion \r
6697    in divergence time estimation.\r
6698    df[0][i*com.npatt+h] has   d log(f_h)/d b_i.\r
6699    method = 0 uses difference approximation to first derivatives.\r
6700    method = 1 uses analytical calculation of first derivatives (Yang 2000).  \r
6701    I am under the impression that method = 1 may be useful for very large datasets \r
6702    with >10M sites, but I have not implemented this method because the analytical \r
6703    calculation of first derivatives is possible for branch lengths only, and not \r
6704    available for other parameters.  Right now with method = 0, H and the SEs are \r
6705    calculated for all parameters although the H matrix in rst2 is a subset for \r
6706    branch lengths only.  More thought about what to do.  Ziheng's note on 8 March 2010.\r
6707 */\r
6708    int method=0, backforth, h, i, j, lastround0=LASTROUND, nzero=0;\r
6709    double *x, *lnL[2], *df[2], eh0=Small_Diff*2, eh, small;\r
6710 \r
6711    if(com.np!=tree.nbranch && method==1)\r
6712       error2("I think HessianSKT2004 works for branch lengths only");\r
6713    df[0] = (double*)malloc((com.npatt*2+1)*com.np*sizeof(double));\r
6714    if(df[0]==NULL) error2("oom space in HessianSKT2004");\r
6715    df[1] = df[0] + com.npatt*com.np;\r
6716    x     = df[1] + com.npatt*com.np;\r
6717    lnL[0] = (double*)malloc(com.np*2*sizeof(double));\r
6718    lnL[1] = lnL[0]+com.np;\r
6719 \r
6720    LASTROUND = 2;\r
6721 \r
6722    for(backforth=0; backforth<2; backforth++) {\r
6723       for(i=0; i<com.np; i++) {\r
6724          xtoy(xmle, x, com.np);\r
6725          eh = eh0*(fabs(xmle[i]) + 1);\r
6726          if(backforth==0) x[i] = xmle[i] - eh;\r
6727          else             x[i] = xmle[i] + eh;\r
6728          if(x[i] <= 4e-6)\r
6729             nzero ++;\r
6730          dfsites = df[backforth] + i*com.npatt;\r
6731          lnL[backforth][i] = -com.plfun(x, com.np);\r
6732       }\r
6733    }\r
6734 \r
6735    for(i=0; i<com.np; i++) {\r
6736       eh = eh0*(fabs(xmle[i]) + 1);    \r
6737       g[i] = (lnL[1][i] - lnL[0][i])/(eh*2);\r
6738    }\r
6739    /*\r
6740    printf("\nx gL g H");\r
6741    matout(F0, xmle, 1, com.np);\r
6742    matout(F0, g, 1, com.np);\r
6743    */\r
6744    zero(H, com.np*com.np);\r
6745    for(i=0; i<com.np; i++) {\r
6746       eh = eh0*(fabs(xmle[i]) + 1);\r
6747       for(h=0; h<com.npatt; h++)\r
6748          df[0][i*com.npatt+h] = (df[1][i*com.npatt+h] - df[0][i*com.npatt+h])/(eh*2);\r
6749    }\r
6750 \r
6751    for(i=0; i<com.np; i++) {\r
6752       for(j=0; j<com.np; j++)\r
6753          for(h=0; h<com.npatt; h++)\r
6754             H[i*com.np+j] -= df[0][i*com.npatt+h] * df[0][j*com.npatt+h] * com.fpatt[h];\r
6755    }\r
6756 \r
6757    if(nzero) printf("\nWarning: Hessian matrix may be unreliable for zero branch lengths\n");\r
6758    LASTROUND = lastround0;\r
6759    free(df[0]);\r
6760    free(lnL[0]);\r
6761    return(0);\r
6762 }\r
6763 \r
6764 \r
6765 \r
6766 int lfunRates (FILE* fout, double x[], int np)\r
6767 {\r
6768 /* for dG, AdG or similar non-parametric models\r
6769    This distroys com.fhK[], and in return,\r
6770    fhK[<npatt] stores rates for conditional mean (re), and \r
6771    fhK[<2*npatt] stores the most probable rate category number.\r
6772    fhsite[npatt] stores fh=log(fh).\r
6773 */\r
6774    int ir,il,it, h,hp,j, nscale=1, direction=-1;\r
6775    int lst=(com.readpattern?com.npatt:com.ls);\r
6776    double lnL=0,fh,fh1, t, re,mre,vre, b1[NCATG],b2[NCATG],*fhsite;\r
6777 \r
6778    if (noisy) printf("\nEstimated rates for sites go into file %s\n",ratef);\r
6779    if (SetParameters(x)) puts ("par err. lfunRates");\r
6780 \r
6781    fprintf(fout, "\nEstimated rates for sites from %sML.\n",\r
6782           (com.seqtype==0?"BASE":(com.seqtype==1?"CODON":"AA")));\r
6783    OutTreeN(fout,1,1); FPN(fout);\r
6784    fprintf (fout,"\nFrequencies and rates for categories (K=%d)", com.ncatG);\r
6785    fprintf(fout, "\nrate:");  FOR(j,com.ncatG) fprintf(fout," %8.5f",com.rK[j]);\r
6786    fprintf(fout, "\nfreq:");  FOR(j,com.ncatG) fprintf(fout," %8.5f",com.freqK[j]);\r
6787    FPN(fout);\r
6788 \r
6789    if (com.rho) {\r
6790       fprintf(fout,"\nTransition prob matrix over sites");\r
6791       matout2(fout,com.MK,com.ncatG,com.ncatG,8,4);\r
6792    }\r
6793 \r
6794    if((fhsite=(double*)malloc(com.npatt*sizeof(double)))==NULL) error2("oom fhsite");\r
6795    fx_r(x, np);\r
6796    if(com.NnodeScale) {\r
6797       FOR(h,com.npatt) {\r
6798          for(ir=1,it=0; ir<com.ncatG; ir++)\r
6799             if(com.fhK[ir*com.npatt+h] > com.fhK[it*com.npatt+h])\r
6800                it = ir;\r
6801          t = com.fhK[it*com.npatt+h];\r
6802          lnL -= com.fpatt[h]*t;\r
6803          for(ir=0; ir<com.ncatG; ir++)\r
6804             com.fhK[ir*com.npatt+h] = exp(com.fhK[ir*com.npatt+h] - t);\r
6805       }\r
6806    }\r
6807    for(h=0; h<com.npatt; h++) {\r
6808       for(ir=0,fhsite[h]=0; ir<com.ncatG; ir++)\r
6809          fhsite[h] += com.freqK[ir]*com.fhK[ir*com.npatt+h];\r
6810    }\r
6811 \r
6812    if (com.rho==0) {     /* dG model */\r
6813       if(com.verbose>1) {\r
6814          fprintf(fout,"\nPosterior probabilities for site classes, by %s\n\n",\r
6815             (com.readpattern?"pattern":"site"));\r
6816          for (h=0; h<lst; h++,FPN(fout)) {\r
6817             fprintf(fout, " %5d  ", h+1);\r
6818             hp = (!com.readpattern ? com.pose[h] : h);\r
6819             for (ir=0; ir<com.ncatG; ir++)\r
6820                fprintf(fout, " %9.4f", com.freqK[ir]*com.fhK[ir*com.npatt+hp]/fhsite[hp]);\r
6821          }\r
6822       }\r
6823 \r
6824       fprintf(fout,"\n%7s  Freq   Data    Rate (posterior mean & category)\n\n", \r
6825          (com.readpattern?"Pattern":"Site"));\r
6826       for (h=0,mre=vre=0; h<com.npatt; h++) {\r
6827          for (ir=0,it=0,t=re=0; ir<com.ncatG; ir++) {\r
6828             fh1 = com.freqK[ir]*com.fhK[ir*com.npatt+h];\r
6829             if(fh1>t)  { t=fh1; it=ir; }\r
6830             re += fh1*com.rK[ir];\r
6831          }\r
6832          lnL -= com.fpatt[h]*log(fhsite[h]);\r
6833 \r
6834          re /= fhsite[h];\r
6835          mre += com.fpatt[h]*re/com.ls;\r
6836          vre += com.fpatt[h]*re*re/com.ls;\r
6837          com.fhK[h] = re;\r
6838          com.fhK[com.npatt+h] = it+1.;\r
6839       }\r
6840       vre-=mre*mre;\r
6841       for(h=0; h<lst; h++) {\r
6842          hp=(!com.readpattern ? com.pose[h] : h);\r
6843          fprintf(fout,"%7d %5.0f  ",h+1, com.fpatt[hp]);\r
6844          print1site(fout, hp);\r
6845          fprintf(fout," %8.3f%6.0f\n", com.fhK[hp], com.fhK[com.npatt+hp]);\r
6846       }\r
6847    }\r
6848    else {      /* Auto-dGamma model */\r
6849       fputs("\nSite Freq  Data  Rates\n\n",fout);\r
6850       h = (direction==1?com.ls-1:0);\r
6851       for (il=0,mre=vre=0; il<lst; h-=direction,il++) {\r
6852          hp=(!com.readpattern ? com.pose[h] : h);\r
6853          if (il==0)\r
6854             FOR(ir,com.ncatG) b1[ir]=com.fhK[ir*com.npatt+hp];\r
6855          else {\r
6856             for (ir=0; ir<com.ncatG; ir++) {\r
6857                for (j=0,fh=0; j<com.ncatG; j++)\r
6858                   fh+=com.MK[ir*com.ncatG+j]*b1[j];\r
6859                b2[ir] = fh*com.fhK[ir*com.npatt+hp];\r
6860             }\r
6861             xtoy (b2, b1, com.ncatG);\r
6862          }\r
6863          if ((il+1)%nscale==0)\r
6864             { fh=sum(b1,com.ncatG); abyx(1/fh,b1,com.ncatG); lnL-=log(fh); }\r
6865 \r
6866          for (ir=0,it=-1,re=fh1=t=0; ir<com.ncatG; ir++) {\r
6867             re+=com.freqK[ir]*b1[ir]*com.rK[ir];\r
6868             fh1+=com.freqK[ir]*b1[ir];\r
6869             if (b1[ir]>t) {it=ir; t=b1[ir]; }\r
6870          }\r
6871          re /= fh1;\r
6872          mre += re/com.ls;\r
6873          vre += re*re/com.ls;\r
6874 \r
6875          fprintf(fout,"%4d %5.0f  ",h+1, com.fpatt[hp]);\r
6876          print1site(fout, hp);\r
6877          fprintf(fout," %8.3f%6.0f\n", re, it+1.);\r
6878       }  /* for(il) */\r
6879       vre -= mre*mre;\r
6880       for (ir=0,fh=0; ir<com.ncatG; ir++)  fh += com.freqK[ir]*b1[ir];\r
6881       lnL -= log(fh);\r
6882    }\r
6883    if (noisy) printf ("lnL =%14.6f\n", -lnL);\r
6884    fprintf (fout,"\nlnL =%14.6f\n", -lnL);\r
6885    if(com.ngene==1) {\r
6886       fprintf (fout,"\nmean(r^)=%9.4f  var(r^)=%9.4f", mre, vre);\r
6887       fprintf (fout,"\nAccuracy of rate prediction: corr(r^,r) =%9.4f\n", \r
6888                sqrt(com.alpha*vre));\r
6889    }\r
6890    free(fhsite);\r
6891    return (0);\r
6892 }\r
6893 \r
6894 \r
6895 double lfunAdG (double x[], int np)\r
6896 {\r
6897 /* Auto-Discrete-Gamma rates for sites\r
6898    See notes in lfundG().\r
6899 */\r
6900    int  nscale=1, h,il, ir, j, FPE=0;\r
6901    int  direction=-1;  /* 1: n->1;  -1: 1->n */\r
6902    double lnL=0, b1[NCATG], b2[NCATG], fh;\r
6903 \r
6904    NFunCall++;\r
6905    fx_r(x, np);\r
6906    if(com.NnodeScale)\r
6907       FOR(h,com.npatt) {\r
6908          fh=com.fhK[0*com.npatt+h];\r
6909          lnL-=fh*com.fpatt[h];\r
6910          for(ir=1,com.fhK[h]=1; ir<com.ncatG; ir++) \r
6911             com.fhK[ir*com.npatt+h]=exp(com.fhK[ir*com.npatt+h]-fh);\r
6912       }\r
6913    h = (direction==1?com.ls-1:0);\r
6914    for (il=0; il<com.ls; h-=direction,il++) {\r
6915       if (il==0)\r
6916          FOR(ir,com.ncatG) b1[ir]=com.fhK[ir*com.npatt+com.pose[h]];\r
6917       else {\r
6918          for (ir=0; ir<com.ncatG; ir++) {\r
6919             for (j=0,fh=0; j<com.ncatG; j++)\r
6920                fh+=com.MK[ir*com.ncatG+j]*b1[j];\r
6921             b2[ir]=fh*com.fhK[ir*com.npatt+com.pose[h]];\r
6922          }\r
6923          xtoy(b2,b1,com.ncatG);\r
6924       }\r
6925       if((il+1)%nscale==0) {\r
6926          fh=sum(b1,com.ncatG);\r
6927          if(fh<1e-90) {\r
6928             if(!FPE) {\r
6929                FPE=1; printf ("h,fh%6d %12.4e\n", h+1,fh);\r
6930                print1site(F0,h);\r
6931                FPN(F0);\r
6932             }\r
6933             fh=1e-300;\r
6934          }\r
6935          abyx(1/fh,b1,com.ncatG); lnL-=log(fh);\r
6936       }\r
6937    }\r
6938    for (ir=0,fh=0; ir<com.ncatG; ir++)  fh+=com.freqK[ir]*b1[ir];\r
6939    lnL-=log(fh);\r
6940    return (lnL);\r
6941 }\r
6942 \r
6943 #endif\r
6944 \r
6945 \r
6946 \r
6947 \r
6948 #if (defined(BASEML))\r
6949 \r
6950 int GetPMatBranch (double Pt[], double x[], double t, int inode)\r
6951 {\r
6952 /* P(t) for branch leading to inode, called by routines ConditionalPNode()\r
6953    and AncestralSeq() in baseml and codeml.  x[] is not used by baseml.\r
6954 */\r
6955    int n=com.ncode, i;\r
6956    double space[NCODE*NCODE*3] = {0};\r
6957 \r
6958    if (com.model<=K80)\r
6959       PMatK80(Pt, t, (com.nhomo==2 ? *nodes[inode].pkappa : com.kappa));\r
6960    else {\r
6961       if (com.nhomo==2)\r
6962          eigenTN93(com.model, *nodes[inode].pkappa, -1, com.pi, &nR, Root, Cijk);\r
6963       else if (com.nhomo>2 && com.model<=TN93)\r
6964          eigenTN93(com.model, *nodes[inode].pkappa, *(nodes[inode].pkappa+1), nodes[inode].pi, &nR, Root, Cijk);\r
6965       else if (com.nhomo>2 && com.model==REV)\r
6966          eigenQREVbase(NULL, Pt, nodes[inode].pkappa, nodes[inode].pi, &nR, Root, Cijk);\r
6967 \r
6968       if(com.model<=REV||com.model==REVu)  \r
6969          PMatCijk(Pt, t);\r
6970       else {\r
6971          QUNREST(NULL, Pt, x+com.ntime+com.nrgene, com.pi);\r
6972          for(i=0; i<n*n; i++) Pt[i] *= t;\r
6973          matexp (Pt, n, 7, 5, space);\r
6974       }\r
6975    }\r
6976    return(0);\r
6977 }\r
6978 \r
6979 #elif (defined(CODEML))\r
6980 \r
6981 int GetPMatBranch (double Pt[], double x[], double t, int inode)\r
6982 {\r
6983 /* P(t) for branch leading to inode, called by routines ConditionalPNode()\r
6984    and AncestralSeq() in baseml and codeml.\r
6985 \r
6986    Qfactor in branch & site models (model = 2 or 3 and NSsites = 2 or 3):\r
6987    Qfactor scaling is applied here and not inside eigenQcodon().\r
6988 */\r
6989    int iUVR=0, nUVR=NBTYPE+2, ib = (int)nodes[inode].label, updateUVR=0;\r
6990    double *pkappa, w, mr=1, Qfactor=1;\r
6991    double *pomega = com.pomega; /* x+com.ntime+com.nrgene+com.nkappa; */\r
6992 \r
6993    pkappa = (com.hkyREV||com.codonf==FMutSel?x+com.ntime+com.nrgene:&com.kappa);\r
6994 \r
6995    if(com.seqtype==CODONseq  && com.NSsites && com.model) {\r
6996       /* branch&site models (both NSsites & model):\r
6997          Usual likelihood calculation, no need to re-calculate UVRoot.  \r
6998          Only need to point to the right place.\r
6999       */\r
7000       iUVR = Set_UVR_BranchSite (IClass, ib);\r
7001       Qfactor = Qfactor_NS_branch[ib];\r
7002    }\r
7003    else if (com.seqtype==CODONseq && BayesEB==2 && com.model>1) { /* BEB for A&C */\r
7004       /* branch&site models (both NSsites & model) BEB calculation:\r
7005          Need to calculate UVRoot, as w is different.  com.pomega points to wbranches[]\r
7006          in get_grid_para_like_M2M8() or get_grid_para_like_AC().\r
7007 \r
7008          Qfactor_NS_branch[] is fixed at the MLE: \r
7009          "we fix the branch lengths at the synonymous sites (i.e., the expected \r
7010          number of synonymous substitutions per codon) at their MLEs."\r
7011       */\r
7012       w = com.pomega[ib];\r
7013       eigenQcodon(1,-1,NULL,NULL,NULL,Root,U,V, &mr, pkappa, w, Pt);\r
7014       Qfactor = Qfactor_NS_branch[ib];\r
7015    }\r
7016    else if (com.seqtype==CODONseq && (com.model==1 ||com.model==2) && com.nbtype<=nUVR) { \r
7017       /* branch model, also for AAClasses */\r
7018       iUVR = (int)nodes[inode].label;\r
7019       U=_UU[iUVR]; V=_VV[iUVR]; Root=_Root[iUVR];\r
7020    }\r
7021    else if (com.seqtype==CODONseq && com.model) {\r
7022       mr = 0;\r
7023       if(com.aaDist==AAClasses) { /* AAClass model */\r
7024          com.pomega = PointOmega(x+com.ntime, -1, inode, -1);\r
7025          eigenQcodon(1,-1,NULL,NULL,NULL,Root,U,V, &mr, pkappa, -1, Pt);\r
7026       }\r
7027       else if(com.nbtype>nUVR) {  /* branch models, with more than 8 omega */\r
7028          eigenQcodon(1,-1,NULL,NULL,NULL,Root,U,V, &mr, pkappa, nodes[inode].omega, Pt);\r
7029       }\r
7030    }\r
7031 \r
7032    if (com.seqtype == AAseq && com.model == Poisson)\r
7033       PMatJC69like(Pt, t, com.ncode);\r
7034    else {\r
7035       t *= Qfactor;\r
7036       PMatUVRoot(Pt, t, com.ncode, U, V, Root);\r
7037    }\r
7038 \r
7039    return(0);\r
7040 }\r
7041 \r
7042 #endif\r
7043 \r
7044 \r
7045 \r
7046 void print_lnf_site (int h, double logfh)\r
7047 {\r
7048 #if(defined BASEML || defined CODEML)\r
7049    fprintf(flnf, "\n%6d %6.0f %16.10f %16.12f %12.4f  ",\r
7050                  h+1, com.fpatt[h], logfh, exp(logfh), com.ls*exp(logfh));\r
7051    print1site(flnf, h);\r
7052 \r
7053 #endif\r
7054 }\r
7055 \r
7056 double lfundG (double x[], int np)\r
7057 {\r
7058 /* likelihood function for site-class models.\r
7059    This deals with scaling for nodes to avoid underflow if(com.NnodeScale).\r
7060    The routine calls fx_r() to calculate com.fhK[], which holds log{f(x|r)} \r
7061    when scaling or f(x|r) when not.  Scaling factors are set and used for each \r
7062    site class (ir) to calculate log(f(x|r).  When scaling is used, the routine \r
7063    converts com.fhK[] into f(x|r), by collecting scaling factors into lnL.  \r
7064    The rest of the calculation then becomes the same and relies on f(x|r).  \r
7065    Check notes in fx_r.\r
7066    This is also used for NSsites models in codonml.  \r
7067    Note that scaling is used between fx_r() and ConditionalPNode()\r
7068    When this routine is used under the multiple-gene site-class model, note \r
7069    that right now it assumes one set of com.freqK[] for the different genes, \r
7070    which may be an issue.\r
7071 */\r
7072    int h,ir, it, FPE=0;\r
7073    double lnL=0, fh=0,t;\r
7074 \r
7075    NFunCall++;\r
7076    fx_r(x,np);\r
7077 \r
7078    for(h=0; h<com.npatt; h++) {\r
7079       if (com.fpatt[h]<=0 && com.print>=0) continue;\r
7080       if(com.NnodeScale) { /* com.fhK[] has log{f(x|r}.  Note the scaling for nodes */\r
7081          for(ir=1,it=0; ir<com.ncatG; ir++) /* select term for scaling */\r
7082             if(com.fhK[ir*com.npatt+h] > com.fhK[it*com.npatt+h]) it = ir;\r
7083          t = com.fhK[it*com.npatt+h];\r
7084          for(ir=0,fh=0; ir<com.ncatG; ir++)\r
7085             fh += com.freqK[ir]*exp(com.fhK[ir*com.npatt+h]-t);\r
7086          fh = t + log(fh);\r
7087       }\r
7088       else {\r
7089          for(ir=0,fh=0; ir<com.ncatG;ir++) \r
7090             fh += com.freqK[ir]*com.fhK[ir*com.npatt+h];\r
7091          if(fh<=0) {\r
7092             if(!FPE) {\r
7093                FPE=1;  matout(F0,x,1,np);\r
7094                printf("\nlfundG: h=%4d  fhK=%9.6e\ndata: ", h+1, fh);\r
7095                print1site(F0, h);\r
7096                FPN(F0);\r
7097             }\r
7098             fh = 1e-300;\r
7099          }\r
7100          fh = log(fh);\r
7101       }\r
7102       lnL -= fh*com.fpatt[h];\r
7103       if(LASTROUND==2) dfsites[h] = fh;\r
7104       if (com.print<0) print_lnf_site(h, fh);\r
7105    }\r
7106 \r
7107    return(lnL);\r
7108 }\r
7109 \r
7110 \r
7111 int SetPSiteClass(int iclass, double x[])\r
7112 {\r
7113 /* This sets parameters for the iclass-th site class\r
7114    This is used by ConditionalPNode() and also updateconP in both algorithms\r
7115    For method=0 and 1.\r
7116 */\r
7117    int k = com.nrgene + !com.fix_kappa;\r
7118    double *pkappa=NULL, *xcom=x+com.ntime, mr;\r
7119 \r
7120    _rateSite = com.rK[iclass];\r
7121 #if CODEML\r
7122    IClass = iclass;\r
7123    mr = 1/Qfactor_NS;\r
7124    pkappa = (com.hkyREV||com.codonf==FMutSel ? xcom+com.nrgene : &com.kappa);\r
7125    if(com.seqtype == CODONseq && com.NSsites) {\r
7126       _rateSite = 1;\r
7127       if (com.model==0) {\r
7128          if(com.aaDist) {\r
7129             if(com.aaDist<10)       com.pomega = xcom + k + com.ncatG - 1 + 2*iclass;\r
7130             else if(com.aaDist==11) com.pomega = xcom + k + com.ncatG - 1 + 4*iclass;\r
7131             else if(com.aaDist==12) com.pomega = xcom + k + com.ncatG - 1 + 5*iclass;\r
7132          }\r
7133          eigenQcodon(1,-1,NULL,NULL,NULL,Root,U,V, &mr, pkappa, com.rK[iclass], PMat);\r
7134       }\r
7135    }\r
7136 #endif\r
7137    return (0);\r
7138 }\r
7139 \r
7140 extern int prt, Locus, Ir;\r
7141 \r
7142 \r
7143 int fx_r (double x[], int np)\r
7144 {\r
7145 /* This calculates f(x|r) if(com.NnodeScale==0) or log{f(x|r)} \r
7146    if(com.NnodeScale>0), that is, the (log) probability of observing data x \r
7147    at a site, given the rate r or dN/dS ratio for the site.  This is used by \r
7148    the discrete-gamma models in baseml and codeml as well as the NSsites models \r
7149    in codeml.  \r
7150    The results are stored in com.fhK[com.ncatG*com.npatt].\r
7151    This deals with underflows with large trees using global variables \r
7152    com.nodeScale and com.nodeScaleF[com.NnodeScale*com.npatt].\r
7153 */\r
7154    int  h, ir, i,k, ig, FPE=0;\r
7155    double fh, smallw=1e-12; /* for testing site class with w=0 */\r
7156 \r
7157    if(!BayesEB)\r
7158       if(SetParameters(x)) puts("\npar err..");\r
7159 \r
7160    for(ig=0; ig<com.ngene; ig++) { /* alpha may differ over ig */\r
7161       if(com.Mgene>1 || com.nalpha>1)\r
7162          SetPGene(ig, com.Mgene>1, com.Mgene>1, com.nalpha>1, x);\r
7163       for(ir=0; ir<com.ncatG; ir++) {\r
7164          if(ir && com.conPSiteClass) {  /* shift com.nodeScaleF & conP */\r
7165             if(com.NnodeScale) \r
7166                com.nodeScaleF += (size_t)com.npatt*com.NnodeScale;\r
7167             for(i=com.ns; i<tree.nnode; i++)\r
7168                nodes[i].conP += (tree.nnode-com.ns)*com.ncode*(size_t)com.npatt;\r
7169          }\r
7170          SetPSiteClass(ir,x);\r
7171          ConditionalPNode(tree.root,ig, x);\r
7172 \r
7173          for (h=com.posG[ig]; h<com.posG[ig+1]; h++) {\r
7174             if (com.fpatt[h]<=0 && com.print>=0) continue;\r
7175             for (i=0,fh=0; i<com.ncode; i++)\r
7176                fh += com.pi[i]*nodes[tree.root].conP[h*com.ncode+i];\r
7177             if (fh<=0) {\r
7178                if(fh<-1e-10 /* && !FPE */) { /* note that 0 may be o.k. here */\r
7179                   FPE=1; matout(F0,x,1,np);\r
7180                   printf("\nfx_r: h = %d  r = %d fhK = %.5e ", h+1,ir+1,fh);\r
7181                   if(com.seqtype==0||com.seqtype==2) {\r
7182                      printf("Data: ");\r
7183                      print1site(F0, h);\r
7184                      FPN(F0);\r
7185                   }\r
7186                }\r
7187                fh = 1e-300;\r
7188             }\r
7189             if(!com.NnodeScale)\r
7190                com.fhK[ir*com.npatt+h] = fh;\r
7191             else\r
7192                for(k=0,com.fhK[ir*com.npatt+h]=log(fh); k<com.NnodeScale; k++)\r
7193                   com.fhK[ir*com.npatt+h] += com.nodeScaleF[k*com.npatt+h];\r
7194          }  /* for (h) */\r
7195       }     /* for (ir) */\r
7196 \r
7197       if(com.conPSiteClass) {  /* shift pointers conP back */\r
7198          if(com.NnodeScale) \r
7199             com.nodeScaleF -= (com.ncatG-1)*com.NnodeScale*(size_t)com.npatt;\r
7200          for(i=com.ns; i<tree.nnode; i++)\r
7201             nodes[i].conP -= (com.ncatG-1)*(tree.nnode-com.ns)*com.ncode*(size_t)com.npatt;\r
7202       }\r
7203    }  /* for(ig) */\r
7204    return(0);\r
7205 }\r
7206 \r
7207 \r
7208 double lfun (double x[], int np)\r
7209 {\r
7210 /* likelihood function for models of one rate for all sites including \r
7211    Mgene models.\r
7212 */\r
7213    int  h,i,k, ig, FPE=0;\r
7214    double lnL=0, fh;\r
7215 \r
7216    NFunCall++;\r
7217    if(SetParameters(x)) puts ("\npar err..");\r
7218    for(ig=0; ig<com.ngene; ig++) {\r
7219       if(com.Mgene>1) \r
7220          SetPGene(ig,1,1,0,x);\r
7221       ConditionalPNode (tree.root, ig, x);\r
7222 \r
7223       for(h=com.posG[ig]; h<com.posG[ig+1]; h++) {\r
7224          if (com.fpatt[h]<=0 && com.print>=0) continue;\r
7225          for(i=0,fh=0; i<com.ncode; i++)\r
7226             fh += com.pi[i]*nodes[tree.root].conP[h*com.ncode+i];\r
7227          if(fh<=0) {\r
7228             if(fh<-1e-5 && noisy) {\r
7229                printf("\nfh = %.6f negative\n",fh);\r
7230                exit(-1);\r
7231             }\r
7232             if(!FPE) {\r
7233                FPE=1;  matout(F0,x,1,np);\r
7234                printf("lfun: h=%4d  fh=%9.6e\nData: ", h+1,fh);\r
7235                print1site(F0, h);\r
7236                FPN(F0);\r
7237             }\r
7238             fh = 1e-80;\r
7239          }\r
7240          fh = log(fh);\r
7241          for(k=0; k<com.NnodeScale; k++)\r
7242             fh += com.nodeScaleF[k*com.npatt+h];\r
7243 \r
7244          lnL -= fh*com.fpatt[h];\r
7245          if(LASTROUND==2) dfsites[h] = fh;\r
7246          if (com.print<0)\r
7247             print_lnf_site(h,fh);\r
7248       }\r
7249    }\r
7250    return (lnL);\r
7251 }\r
7252 \r
7253 \r
7254 \r
7255 \r
7256 int print1site (FILE*fout, int h)\r
7257 {\r
7258 /* This print out one site in the sequence data, com.z[].  It may be the h-th \r
7259    site in the original data file or the h-th pattern.  The data are coded.\r
7260    naa > 1 if the codon codes for more than one amino acid.\r
7261 */\r
7262    char *pch=(com.seqtype==0 ? BASEs : (com.seqtype==2 ? AAs: (com.seqtype==5?BASEs5:BINs)));\r
7263    char compatibleAAs[20]="";\r
7264    int n=com.ncode, i, b, aa=0;\r
7265 \r
7266    for(i=0; i<com.ns; i++) {\r
7267       b = com.z[i][h];\r
7268       if(com.seqtype==0 || com.seqtype==2) \r
7269          fprintf(fout,"%c", pch[b]);\r
7270 #if defined(CODEML)\r
7271       else if(com.seqtype==1) {\r
7272          aa = GetAASiteSpecies(i, h);\r
7273          fprintf(fout, "%s (%c) ", CODONs[b], aa);\r
7274       }\r
7275 #endif\r
7276    }\r
7277    return(0);\r
7278 }\r
7279    \r
7280 \r
7281 #if(defined MINIMIZATION)\r
7282 \r
7283 /* November, 1999, Minimization branch by branch */\r
7284 int noisy_minbranches;\r
7285 double *space_minbranches, *g_minbranches, *varb_minbranches, e_minbranches;\r
7286 \r
7287 double minbranches(double xcom[], int np);\r
7288 int lfunt(double t, int a,int b,double x[],double *l, double space[]);\r
7289 int lfuntdd(double t, int a,int b,double x[], double *l,double*dl,double*ddl,\r
7290     double space[]);\r
7291 int lfunt_SiteClass(double t, int a,int b,double x[],double *l,double space[]);\r
7292 int lfuntdd_SiteClass(double t, int a,int b,double x[],\r
7293     double *l,double*dl,double*ddl,double space[]);\r
7294 \r
7295 int minB (FILE*fout, double *lnL,double x[],double xb[][2],double e0, double space[])\r
7296 {\r
7297 /* This calculates lnL for given values of common parameters by optimizing \r
7298    branch lengths, cycling through them.\r
7299    Z. Yang, November 1999\r
7300    This calls minbranches to optimize branch lengths and ming2 to \r
7301    estimate other paramters.\r
7302    At the end of the routine, there is a call to lfun to restore nodes[].conP.\r
7303    Returns variances of branch lengths in space[].\r
7304    space[] is com.space[].  com.space may be reallocated here, which may be unsafe \r
7305    as the pointers in the calling routine may not be pointing to the right places.\r
7306 \r
7307    return value: 0 convergent;  -1: not convergent.\r
7308 */\r
7309    int ntime0=com.ntime, fix_blength0=com.fix_blength;\r
7310    int status=0, i, npcom=com.np-com.ntime;\r
7311    size_t s;\r
7312    double *xcom=x+com.ntime, lnL0= *lnL, dl, e=1e-5;\r
7313    double (*xbcom)[2]=xb+ntime0;\r
7314    int small_times=0, max_small_times=100, ir,maxr=(npcom?200:1);\r
7315    double small_improvement=0.001;\r
7316    char timestr[64];\r
7317 \r
7318    if(com.conPSiteClass) {\r
7319       s = (2*com.ncode*com.ncode+com.ncode*(size_t)com.npatt)*sizeof(double);\r
7320       if(com.sspace < s) {  /* this assumes that space is com.space */\r
7321          printf("\n%lu bytes in space, %lu bytes needed\n", com.sspace, s);\r
7322          printf("minB: reallocating memory for working space.\n");\r
7323          com.space = (double*)realloc(com.space, s);\r
7324          if(com.space==NULL) error2("oom space");\r
7325          com.sspace = s;\r
7326       }\r
7327    }\r
7328    g_minbranches = com.space;\r
7329    varb_minbranches = com.space + com.np;\r
7330    s = (3*com.ncode*com.ncode + (com.conPSiteClass) * 4 *(size_t)com.npatt) *sizeof(double);\r
7331    if((space_minbranches=(double*)malloc(s))==NULL) \r
7332       error2("oom minB");\r
7333    if(com.ntime==0) error2("minB: should not come here");\r
7334 \r
7335    if(*lnL<=0)  *lnL = com.plfun(x,com.np);\r
7336    e = e_minbranches = (npcom ? 5.0 : e0);\r
7337    com.ntime = 0; com.fix_blength = 2;\r
7338 #if(CODEML)\r
7339    if(com.NSsites==0) com.pomega = xcom+com.nrgene+!com.fix_kappa;\r
7340 #endif\r
7341 \r
7342    for(ir=0; (npcom==0||com.method) && ir<maxr; ir++) {\r
7343       if(npcom) {\r
7344          if(noisy>2) printf("\n\nRound %da: Paras (%d) (e=%.6g)",ir+1,npcom,e);\r
7345          ming2(NULL,lnL,com.plfun,NULL,xcom, xbcom, com.space,e,npcom);\r
7346          if(noisy>2) {\r
7347             FPN(F0); FOR(i,npcom) printf(" %11.6f", xcom[i]);\r
7348             printf("%8s%s\n", "", printtime(timestr));\r
7349          }\r
7350       }\r
7351       noisy_minbranches = noisy;\r
7352       if(noisy>2)\r
7353          printf("\nRound %db: Blengths (%d, e=%.6g)\n",ir+1,tree.nbranch,e_minbranches);\r
7354 \r
7355       *lnL = minbranches(xcom, -1);\r
7356       for(i=0; i<tree.nnode; i++)  \r
7357          if(i != tree.root) \r
7358             x[nodes[i].ibranch] = nodes[i].branch;\r
7359       if(noisy>2) printf("\n%s\n", printtime(timestr));\r
7360 \r
7361       if((dl=fabs(*lnL-lnL0))<e0 && e<=0.02) break;\r
7362       if(dl<small_improvement) small_times++;\r
7363       else                     small_times=0;\r
7364       if((small_times>max_small_times && ntime0<200) || (com.method==2&&ir==1)) {\r
7365          if(noisy && com.method!=2) puts("\nToo slow, switching algorithm.");\r
7366          status=2;\r
7367          break;\r
7368       }\r
7369       if(noisy && small_times>5) \r
7370          printf("\n%d rounds of small improvement.",small_times);\r
7371 \r
7372       e/=2;  if(dl<1) e/=2;\r
7373       if(dl<0.5)     e = min2(e,1e-3); \r
7374       else if(dl>10) e = max2(e,0.1); \r
7375       e_minbranches = max2(e, 1e-6);\r
7376       e = max2(e,1e-6);\r
7377 \r
7378       lnL0= *lnL;\r
7379       if(fout) {\r
7380          fprintf(fout,"%4d %12.5f x ", ir+1,*lnL);\r
7381          for(i=0; i<com.np; i++)\r
7382             fprintf(fout, " %8.5f", x[i]);\r
7383          FPN(fout);  fflush(fout);\r
7384       }\r
7385    }\r
7386    if (npcom && ir==maxr) status=-1;\r
7387 \r
7388    if(npcom && status==2) {\r
7389       noisy_minbranches = 0;\r
7390       com.ntime = ntime0; \r
7391       com.fix_blength = fix_blength0;\r
7392       ming2(NULL,lnL,com.plfun,NULL,x,xb, com.space,e0,com.np);\r
7393       for(i=0; i<tree.nnode; i++) space[i] = -1;\r
7394    }\r
7395 \r
7396    for(i=0; i<tree.nnode; i++)\r
7397       if(i!=tree.root) x[nodes[i].ibranch] = nodes[i].branch;\r
7398 \r
7399    if(noisy>2) printf("\nlnL  = %12.6f\n",- *lnL);\r
7400 \r
7401    com.ntime = ntime0;  \r
7402    com.fix_blength = fix_blength0;\r
7403    *lnL = com.plfun(x,com.np); /* restore things, for e.g. AncestralSeqs */\r
7404    if(fabs(*lnL-lnL0) > 1e-5) \r
7405       printf("%.6f != %.6f lnL error.  Something is wrong in minB\n", *lnL, lnL0);\r
7406    free(space_minbranches);\r
7407 \r
7408    return (status==-1 ? -1 : 0);\r
7409 }\r
7410 \r
7411 \r
7412 /*********************  START: Testing iteration algorithm ******************/\r
7413 \r
7414 int minB2 (FILE*fout, double *lnL,double x[],double xb[][2],double e0, double space[])\r
7415 {\r
7416 /* \r
7417 */\r
7418    int ntime0=com.ntime, fix_blength0=com.fix_blength;\r
7419    int status=0, i, npcom=com.np-com.ntime;\r
7420    size_t s;\r
7421    double *xcom=x+com.ntime, lnL0= *lnL;\r
7422    double (*xbcom)[2]=xb+ntime0;\r
7423 \r
7424    s = (3*com.ncode*com.ncode + (com.conPSiteClass) * 4*(size_t)com.npatt) * sizeof(double);\r
7425    if((space_minbranches=(double*)malloc(s))==NULL)  error2("oom minB2");\r
7426    if(com.ntime==0 || npcom==0) error2("minB2: should not come here");\r
7427 \r
7428    noisy_minbranches=0;\r
7429    /* if(*lnL<=0)  *lnL=com.plfun(x,com.np); */\r
7430    com.ntime=0; com.fix_blength=2;\r
7431 #if(CODEML)\r
7432    if(com.NSsites==0) com.pomega=xcom+com.nrgene+!com.fix_kappa;\r
7433 #endif\r
7434 \r
7435    ming2(NULL, lnL, minbranches, NULL, xcom, xbcom, space, e0, npcom);\r
7436 \r
7437 \r
7438    com.ntime = ntime0;  com.fix_blength = fix_blength0;\r
7439    for(i=0; i<tree.nnode; i++)  \r
7440       if(i!=tree.root) x[nodes[i].ibranch] = nodes[i].branch;\r
7441    *lnL = com.plfun(x,com.np); /* restore things, for e.g. AncestralSeqs */\r
7442    free(space_minbranches);\r
7443 \r
7444    return (status==-1 ? -1 : 0);\r
7445 }\r
7446 \r
7447 /*********************  END: Testing iteration algorithm ******************/\r
7448 \r
7449 \r
7450 static int times=0;\r
7451 \r
7452 \r
7453 int updateconP (double x[], int inode)\r
7454 {\r
7455 /* update conP for inode.  \r
7456 \r
7457    Confusing decision about x[] follows.  Think about redesign.\r
7458 \r
7459    (1) Called by PostProbNode for ancestral reconstruction, with com.clock = 0, \r
7460        1, 2: x[] is passed over and com.ntime is used to get xcom in \r
7461        SetPSiteClass()\r
7462    (2) Called from minbranches(), with com.clock = 0.  xcom[] is passed \r
7463        over by minbranches and com.ntime=0 is set.  So SetPSiteClass()\r
7464        can still get the correct substitution parameters.  \r
7465        Also look at ConditionalPNode().\r
7466   \r
7467    Note that com.nodeScaleF and nodes[].conP are shifted if(com.conPSiteClass).\r
7468 */\r
7469    int ig,i,ir;\r
7470 \r
7471    if(com.conPSiteClass==0)\r
7472       for(ig=0; ig<com.ngene; ig++) {\r
7473          if(com.Mgene>1 || com.nalpha>1)\r
7474             SetPGene(ig,com.Mgene>1,com.Mgene>1,com.nalpha>1,x);\r
7475          /* x[] needed by local clock models and if(com.aaDist==AAClasses).\r
7476             This is called from PostProbNode\r
7477          */\r
7478          \r
7479          ConditionalPNode(inode, ig, x);\r
7480       }\r
7481    else {  /* site-class models */\r
7482       FOR(ir,com.ncatG) {\r
7483 #ifdef CODEML\r
7484          IClass = ir;\r
7485 #endif\r
7486          if(ir) {\r
7487             if(com.NnodeScale)\r
7488                com.nodeScaleF += com.NnodeScale*(size_t)com.npatt;\r
7489             for(i=com.ns; i<tree.nnode; i++)\r
7490                nodes[i].conP += (tree.nnode-com.ns)*com.ncode*(size_t)com.npatt;\r
7491          }\r
7492          SetPSiteClass(ir, x);\r
7493          for(ig=0; ig<com.ngene; ig++) {\r
7494             if(com.Mgene>1 || com.nalpha>1)\r
7495                SetPGene(ig, com.Mgene>1, com.Mgene>1, com.nalpha>1, x);\r
7496             if(com.nalpha>1) SetPSiteClass(ir, x);\r
7497             ConditionalPNode(inode,ig, x);\r
7498          }\r
7499       }\r
7500 \r
7501       /* shift positions */\r
7502       com.nodeScaleF -= (com.ncatG-1)*com.NnodeScale*com.npatt;\r
7503       for(i=com.ns; i<tree.nnode; i++)\r
7504          nodes[i].conP -= (com.ncatG-1)*(tree.nnode-com.ns)*com.ncode*(size_t)com.npatt;\r
7505    }\r
7506    return(0);\r
7507 }\r
7508 \r
7509 \r
7510 double minbranches (double x[], int np)\r
7511 {\r
7512 /* Ziheng, November 1999.\r
7513    optimizing one branch at a time\r
7514    \r
7515    for each branch a..b, reroot the tree at b, and \r
7516    then calculate conditional probability for node a.\r
7517    For each branch, this routine determines the Newton search direction \r
7518    p = -dl/dll.  It then halves the steplength to make sure -lnL is decreased.\r
7519    When the Newton solution is correct, this strategy will waste one \r
7520    extra call to lfunt.  It does not seem possible to remove calculation of \r
7521    l (lnL) in lfuntddl().\r
7522    lfun or lfundG and thus SetParameters are called once beforehand to set up \r
7523    globals like com.pomega.\r
7524    This works with NSsites and NSbranch models.\r
7525    \r
7526    com.oldconP[] marks nodes that need to be updated when the tree is rerooted.  \r
7527    The array is declared in baseml and codeml and used in the following \r
7528    routines: ReRootTree, minbranches, and ConditionalPNode.\r
7529 \r
7530    Note: At the end of the routine, nodes[].conP are not updated.\r
7531 */\r
7532    int ib,oldroot=tree.root, a,b;\r
7533    int icycle, maxcycle=500, icycleb, ncycleb=10, i;\r
7534    double lnL, lnL0=0, l0,l,dl,ddl=-1, t,t0,t00, p,step=1, small=1e-20,y;\r
7535    double tb[2]={1e-8,50}, e=e_minbranches, *space=space_minbranches;\r
7536    double *xcom=x+com.ntime;  /* this is incorrect as com.ntime=0 */\r
7537    double smallddl=0.25/com.ls*(1-0.25/com.ls)/com.ls;\r
7538 \r
7539    if(com.ntime) error2("ntime should be 0 in minbranches");\r
7540    lnL0 = l0 = l = lnL = com.plfun(xcom,-1);\r
7541 \r
7542    if(noisy_minbranches>2) printf("\tlnL0 =    %14.6f\n",-lnL0);\r
7543 \r
7544    for(icycle=0; icycle<maxcycle; icycle++) {\r
7545       for(ib=0; ib<tree.nbranch; ib++) {\r
7546          t = t0 = t00 = nodes[tree.branches[ib][1]].branch; \r
7547          l0 = l;\r
7548          a = tree.branches[ib][0];\r
7549          b = tree.branches[ib][1];\r
7550          /* if a is the root, why do we want to reroot the tree at b?  Just switch a with b? */\r
7551 \r
7552          for(i=0; i<tree.nnode; i++)\r
7553             com.oldconP[i]=1;\r
7554          ReRootTree(b);\r
7555          updateconP(x, a);\r
7556 \r
7557          for(icycleb=0; icycleb<ncycleb; icycleb++) {  /* iterating a branch */\r
7558             if(!com.conPSiteClass)\r
7559                lfuntdd(t, a, b, xcom, &y, &dl, &ddl, space);\r
7560             else\r
7561                lfuntdd_SiteClass(t, a, b, xcom, &y, &dl, &ddl, space);\r
7562 \r
7563             p = -dl/fabs(ddl);\r
7564             /* p = -dl/ddl; newton direction */\r
7565             if (fabs(p)<small) step = 0;\r
7566             else if(p<0)       step = min2(1, (tb[0]-t0)/p);\r
7567             else               step = min2(1, (tb[1]-t0)/p);\r
7568 \r
7569             if(icycle==0 && step!=1 && step!=0)\r
7570                step *= 0.99; /* avoid border */\r
7571             for (i=0; step>small; i++,step/=4) {\r
7572                t = t0 + step*p;\r
7573                if(!com.conPSiteClass) lfunt(t, a, b, xcom, &l, space);\r
7574                else                   lfunt_SiteClass(t, a, b, xcom, &l, space);\r
7575                if(l<l0) break;\r
7576             }\r
7577             if(step<=small) { t=t0; l=l0; break; }\r
7578             if(fabs(t-t0)<e*fabs(1+t) && fabs(l-l0)<e) break;\r
7579             t0=t; l0=l;\r
7580          }\r
7581          nodes[a].branch = t;\r
7582 \r
7583          g_minbranches[ib] = -dl;\r
7584          varb_minbranches[ib] = -ddl;\r
7585       }   /* for (ib) */\r
7586       lnL = l;\r
7587       if(noisy_minbranches>2) printf("\tCycle %2d: %14.6f\n",icycle+1, -l);\r
7588       if(fabs(lnL-lnL0) < e) break;\r
7589       lnL0 = lnL;\r
7590    }  /* for (icycle) */\r
7591    ReRootTree(oldroot);  /* did not update conP */\r
7592    FOR(i,tree.nnode) com.oldconP[i]=0;\r
7593    return(lnL);\r
7594 }\r
7595 \r
7596 \r
7597 \r
7598 int lfunt(double t, int a, int b, double xcom[], double *l, double space[])\r
7599 {\r
7600 /* See notes for lfunt_dd and minbranches\r
7601 */\r
7602    int i,j,k, h,ig, n=com.ncode, nroot=n;\r
7603    int n1 = (com.cleandata&&b<com.ns ? 1 : n), xb, nUVR;\r
7604    double expt,uexpt=0,multiply;\r
7605    double *P=space, piqi,pqj, fh, mr=0;\r
7606    double *pkappa;\r
7607 \r
7608 #if (CODEML)\r
7609    nUVR = NBTYPE+2;\r
7610    pkappa = (com.hkyREV||com.codonf==FMutSel ? xcom+com.nrgene : &com.kappa);\r
7611    if (com.seqtype==CODONseq && com.model) {\r
7612       if((com.model==NSbranchB || com.model==NSbranch2) && com.NSsites==0 && com.nbtype<=nUVR) {\r
7613          U = _UU[(int)nodes[a].label]; \r
7614          V = _VV[(int)nodes[a].label]; \r
7615          Root = _Root[(int)nodes[a].label]; \r
7616       }\r
7617       else {\r
7618          eigenQcodon(1, -1, NULL, NULL, NULL, Root, U, V, &mr, pkappa, nodes[a].omega, PMat);\r
7619       }\r
7620    }\r
7621 #endif\r
7622 \r
7623 #if (BASEML)\r
7624    if (com.nhomo==2)\r
7625       eigenTN93(com.model, *nodes[a].pkappa, 1, com.pi, &nR, Root, Cijk);\r
7626    nroot = nR;\r
7627 #endif\r
7628 \r
7629    *l = 0;\r
7630    for (ig=0; ig<com.ngene; ig++) {\r
7631       if(com.Mgene>1) SetPGene(ig,1,1,0,xcom); /* com.ntime=0 */\r
7632       for(i=0; i<n*n; i++) P[i] = 0;\r
7633 \r
7634       for(k=0,expt=1; k<nroot; k++) {\r
7635          multiply = com.rgene[ig]*Root[k];\r
7636          if(k) expt = exp(t*multiply);\r
7637 \r
7638 #if (CODEML)  /* uses U & V */\r
7639          for(i=0; i<n; i++)\r
7640             for(j=0,uexpt=U[i*n+k]*expt; j<n; j++)\r
7641                P[i*n+j] += uexpt*V[k*n+j];\r
7642 #elif (BASEML) /* uses Cijk */\r
7643          for(i=0; i<n; i++) for(j=0; j<n; j++)\r
7644             P[i*n+j] += Cijk[i*n*nroot+j*nroot+k]*expt;\r
7645 #endif\r
7646       }\r
7647 \r
7648       for (h=com.posG[ig]; h<com.posG[ig+1]; h++) {\r
7649          n1 = (b<com.ns ? nChara[com.z[b][h]] : n);\r
7650          for(i=0,fh=0; i<n1; i++) {\r
7651             xb = i;\r
7652             if(b<com.ns) piqi = com.pi[ xb = CharaMap[com.z[b][h]][i] ];\r
7653             else         piqi = com.pi[i] * nodes[b].conP[h*n+i];\r
7654 \r
7655             for(j=0,pqj=0; j<n; j++)\r
7656                pqj += P[xb*n+j]*nodes[a].conP[h*n+j];\r
7657             fh += piqi*pqj;\r
7658          }\r
7659          if(noisy && fh<1e-250)\r
7660             printf("a bit too small: fh[%d] = %10.6e\n",h,fh);\r
7661          if(fh<0) fh = -500;\r
7662          else     fh = log(fh);\r
7663 \r
7664          *l -= fh*com.fpatt[h];\r
7665          for(i=0; i<com.NnodeScale; i++)\r
7666             *l -= com.nodeScaleF[i*com.npatt+h]*com.fpatt[h];\r
7667       }\r
7668    }\r
7669    return(0);\r
7670 }\r
7671 \r
7672 \r
7673 int lfuntdd(double t, int a, int b, double xcom[], double *l, double*dl, double*ddl, double space[])\r
7674 {\r
7675 /* Calculates lnL for branch length t for branch b->a.\r
7676    See notes in minbranches().\r
7677    Conditional probability updated correctly already.\r
7678 \r
7679    i for b, j for a?\r
7680 */\r
7681    int i,j,k, h,ig,n=com.ncode, nroot=n;\r
7682    int n1 = (com.cleandata&&b<com.ns ? 1 : n), xb, nUVR;\r
7683    double expt, uexpt = 0, multiply;\r
7684    double *P=space, *dP=P+n*n,*ddP=dP+n*n, piqi,pqj,dpqj,ddpqj, fh, dfh, ddfh;\r
7685    double *pkappa, mr=0;\r
7686 \r
7687 #if(CODEML)\r
7688    nUVR = NBTYPE+2;\r
7689    pkappa=(com.hkyREV||com.codonf==FMutSel ? xcom+com.nrgene : &com.kappa);\r
7690    if (com.seqtype==CODONseq && com.model) {\r
7691       if((com.model==NSbranchB || com.model==NSbranch2) && com.NSsites==0 && com.nbtype<=nUVR) {\r
7692          U = _UU[(int)nodes[a].label]; \r
7693          V = _VV[(int)nodes[a].label]; \r
7694          Root = _Root[(int)nodes[a].label]; \r
7695       }\r
7696       else {\r
7697          eigenQcodon(1,-1,NULL,NULL,NULL,Root,U,V, &mr, pkappa, nodes[a].omega, PMat);\r
7698       }\r
7699    }\r
7700 #endif\r
7701 \r
7702 #if(BASEML)\r
7703    if (com.nhomo==2)\r
7704       eigenTN93(com.model, *nodes[a].pkappa, 1, com.pi, &nR, Root, Cijk);\r
7705    nroot=nR;\r
7706 #endif\r
7707    *l = *dl = *ddl = 0;\r
7708    for(ig=0; ig<com.ngene; ig++) {\r
7709       if(com.Mgene>1) SetPGene(ig,1,1,0,xcom);  /* com.ntime=0 */\r
7710       for(i=0; i<n*n; i++) P[i] = dP[i] = ddP[i] = 0;\r
7711 \r
7712       for(k=0,expt=1; k<nroot; k++) {\r
7713          multiply = com.rgene[ig]*Root[k];\r
7714          if(k) expt = exp(t*multiply);\r
7715 \r
7716 #if (CODEML)  /* uses U & V */\r
7717          for(i=0; i<n; i++) \r
7718             for(j=0,uexpt=U[i*n+k]*expt; j<n; j++) {\r
7719                P[i*n+j] += uexpt*V[k*n+j];\r
7720                if(k) {\r
7721                   dP[i*n+j]  += uexpt*V[k*n+j]*multiply;\r
7722                   ddP[i*n+j] += uexpt*V[k*n+j]*multiply*multiply;\r
7723                }\r
7724             }\r
7725 #elif (BASEML) /* uses Cijk */\r
7726          for(i=0; i<n; i++) for(j=0; j<n; j++) {\r
7727             P[i*n+j] += Cijk[i*n*nroot+j*nroot+k]*expt;\r
7728             if(k) {\r
7729                dP[i*n+j]  += Cijk[i*n*nroot+j*nroot+k]*expt*multiply;\r
7730                ddP[i*n+j] += Cijk[i*n*nroot+j*nroot+k]*expt*multiply*multiply;\r
7731             }\r
7732          }\r
7733 #endif\r
7734       }\r
7735 \r
7736       for (h=com.posG[ig]; h<com.posG[ig+1]; h++) {\r
7737          n1 = (b<com.ns ? nChara[com.z[b][h]] : n);\r
7738          for(i=0,fh=dfh=ddfh=0; i<n1; i++) {\r
7739             xb = i;\r
7740             if(b<com.ns) piqi = com.pi[ xb = CharaMap[com.z[b][h]][i] ];\r
7741             else         piqi = com.pi[i] * nodes[b].conP[h*n+i];\r
7742             for(j=0,pqj=dpqj=ddpqj=0; j<n; j++) {\r
7743                pqj   +=   P[xb*n+j] * nodes[a].conP[h*n+j];\r
7744                dpqj  +=  dP[xb*n+j] * nodes[a].conP[h*n+j];\r
7745                ddpqj += ddP[xb*n+j] * nodes[a].conP[h*n+j];\r
7746             }\r
7747             fh   += piqi*pqj;\r
7748             dfh  += piqi*dpqj;\r
7749             ddfh += piqi*ddpqj;\r
7750          }\r
7751          if(noisy && fh<1e-250) {\r
7752             printf("too small: fh[%d] = %10.6e\n",h,fh);\r
7753             OutTreeN(F0,0,1);\r
7754          }\r
7755          *l -= log(fh)*com.fpatt[h];\r
7756          for(i=0; i<com.NnodeScale; i++)\r
7757             *l -= com.nodeScaleF[i*com.npatt+h]*com.fpatt[h];\r
7758          *dl  -= dfh/fh * com.fpatt[h];\r
7759          *ddl -= (fh*ddfh - dfh*dfh)/(fh*fh) * com.fpatt[h];\r
7760       }\r
7761    }  /* for(ig) */\r
7762    return(0);\r
7763 }\r
7764 \r
7765 \r
7766 int lfunt_SiteClass(double t, int a, int b, double xcom[], double *l, double space[])\r
7767 {\r
7768 /* see notes in lfuntdd_SiteClass\r
7769    For branch&site models, look at the notes in GetPMatBranch()\r
7770 */\r
7771    int i,j,k, h,ig,ir,it, n=com.ncode, nroot=n;\r
7772    int n1=(com.cleandata&&b<com.ns?1:n), xb;\r
7773    double y,expt,uexpt=0,multiply, piqi,pqj;\r
7774    double *P=space, *fh=P+n*n;\r
7775    double *Sh=fh+com.npatt;  /* scale factor for each site pattern*/\r
7776    double *pK=com.fhK;  /* proportion for each site class after scaling */\r
7777    double smallw=1e-12; \r
7778 \r
7779 #if (BASEML)\r
7780    if (com.nhomo==2)\r
7781       eigenTN93(com.model, *nodes[a].pkappa,1,com.pi,&nR,Root,Cijk);\r
7782    nroot=nR;\r
7783 #endif\r
7784 \r
7785    if(com.NnodeScale==0) \r
7786       for(ir=0; ir<com.ncatG; ir++) \r
7787          for (h=0; h<com.npatt; h++)  \r
7788             pK[ir*com.npatt+h] = com.freqK[ir];\r
7789    else {\r
7790       for(h=0; h<com.npatt; h++) {\r
7791          for(ir=0,it=0; ir<com.ncatG; ir++) {\r
7792             for(k=0,y=0; k<com.NnodeScale; k++)\r
7793                y += com.nodeScaleF[ir*com.NnodeScale*com.npatt + k*com.npatt+h];\r
7794             if((pK[ir*com.npatt+h]=y) > pK[it*com.npatt+h])\r
7795                it = ir;\r
7796          }\r
7797          Sh[h] = pK[it*com.npatt+h];\r
7798          for(ir=0; ir<com.ncatG; ir++)\r
7799             pK[ir*com.npatt+h] = com.freqK[ir]*exp(pK[ir*com.npatt+h]-Sh[h]);\r
7800       }\r
7801    }\r
7802 \r
7803    for(h=0; h<com.npatt; h++) fh[h] = 0;\r
7804    for(ir=0; ir<com.ncatG; ir++) {\r
7805       SetPSiteClass(ir, xcom);  /* com.ntime=0 */\r
7806 \r
7807 #if CODEML  /* branch b->a */\r
7808       /* branch&site models */\r
7809       if(com.seqtype==CODONseq && com.NSsites && com.model)\r
7810          Set_UVR_BranchSite (ir, (int)nodes[a].label);\r
7811 #endif\r
7812 \r
7813       if(ir) {\r
7814          for(i=com.ns;i<tree.nnode;i++)\r
7815             nodes[i].conP += (tree.nnode-com.ns)*n*(size_t)com.npatt;\r
7816       }\r
7817       for (ig=0; ig<com.ngene; ig++) {\r
7818          if(com.Mgene>1 || com.nalpha>1)\r
7819             SetPGene(ig,com.Mgene>1,com.Mgene>1,com.nalpha>1,xcom);  /* com.ntime=0 */\r
7820          if(com.nalpha>1) SetPSiteClass(ir, xcom);    /* com.ntime=0 */\r
7821 \r
7822          for(i=0; i<n*n; i++) P[i] = 0;\r
7823          for(k=0,expt=1; k<nroot; k++) {\r
7824             multiply = com.rgene[ig]*Root[k]*_rateSite;\r
7825 #if (CODEML)\r
7826             if(com.seqtype==1 && com.model>=2) \r
7827                multiply *= Qfactor_NS_branch[(int)nodes[a].label];\r
7828 #endif\r
7829             if(k) expt = exp(t*multiply);\r
7830 \r
7831 #if (CODEML)  /* uses U & V */\r
7832             for(i=0; i<n; i++) \r
7833                for(j=0,uexpt=U[i*n+k]*expt; j<n; j++)\r
7834                   P[i*n+j] += uexpt*V[k*n+j];\r
7835 #elif (BASEML) /* uses Cijk */\r
7836             for(i=0; i<n; i++) \r
7837                for(j=0; j<n; j++) \r
7838                   P[i*n+j] += Cijk[i*n*nroot+j*nroot+k]*expt;\r
7839 #endif\r
7840          }  /* for (k), look through eigenroots */\r
7841          for (h=com.posG[ig]; h<com.posG[ig+1]; h++) {\r
7842             n1 = (b<com.ns ? nChara[com.z[b][h]] : n);\r
7843             for(i=0; i<n1; i++) {\r
7844                xb = i;\r
7845                if(b<com.ns) piqi = pK[ir*com.npatt+h] * com.pi[ xb = CharaMap[com.z[b][h]][i] ];\r
7846                else         piqi = pK[ir*com.npatt+h] * com.pi[i] * nodes[b].conP[h*n+i];\r
7847 \r
7848                for(j=0,pqj=0; j<n; j++)\r
7849                   pqj += P[xb*n+j]*nodes[a].conP[h*n+j];\r
7850                fh[h] += piqi*pqj;\r
7851             }\r
7852          }  /* for (h) */\r
7853       }     /* for (ig) */\r
7854    }        /* for(ir) */\r
7855 \r
7856    for(i=com.ns; i<tree.nnode; i++)  /* shift position */\r
7857       nodes[i].conP -= (com.ncatG-1)*(tree.nnode-com.ns)*n*(size_t)com.npatt;\r
7858    for(h=0,*l=0; h<com.npatt; h++) {\r
7859       if(fh[h]<1e-250) \r
7860          printf("small (lfunt_SiteClass): fh[%d] = %10.6e\n",h,fh[h]);\r
7861 \r
7862       *l -= log(fh[h])*com.fpatt[h];\r
7863       if(com.NnodeScale) *l -= Sh[h]*com.fpatt[h];\r
7864    }\r
7865    return(0);\r
7866 }\r
7867 \r
7868 \r
7869 int lfuntdd_SiteClass(double t, int a,int b,double xcom[],\r
7870     double *l,double*dl,double*ddl,double space[])\r
7871 {\r
7872 /* dt and ddt for site-class models, modified from lfuntdd()\r
7873    nodes[].conP (and com.nodeScaleF if scaling is used) is shifted for ir, \r
7874    and moved back to the rootal place at the end of the routine.\r
7875 \r
7876    At the start of this routine, nodes[].conP has the conditional probabilties \r
7877    for each node, each site pattern, for each site class (ir).  \r
7878    Scaling: When scaling is used, scale factors \r
7879    com.nodeScaleF[ir*com.NnodeScale*com.npatt + k*com.npatt+h] for all nodes \r
7880    are collected into Sh[h], after adjusting for rate classes, since the \r
7881    sum is taken over ir.  Sh[h] and pK[ir*com.npatt+h] together store the \r
7882    scale factors and proportions for site classes.  com.freqK[ir] is not \r
7883    used in this routine beyond this point.\r
7884    if(com.Malpha), com.freqK[]=1/com.ncatG and does not change with ig, \r
7885    and so the collection of Sh for sites at the start of the routine is o.k.\r
7886 \r
7887    The space for com.fhK[] is used.\r
7888    space[2*ncode*ncode + 4*npatt]:\r
7889      dP[ncode*ncode],ddP[ncode*ncode],fh[npatt],dfh[npatt],ddfh[npatt],Sh[npatt]\r
7890      pK[ncatG*npatt]=com.fhK[]\r
7891 */\r
7892    int i,j,k, h,ig,ir,it, n=com.ncode, nroot=n;\r
7893    int n1=(com.cleandata&&b<com.ns?1:n), xb;\r
7894    double y,expt,uexpt=0,multiply, piqi,pqj,dpqj,ddpqj;\r
7895    double *P=PMat, *dP=space,*ddP=dP+n*n;\r
7896    double *fh=ddP+n*n, *dfh=fh+com.npatt, *ddfh=dfh+com.npatt;\r
7897    double *Sh=ddfh+com.npatt;  /* scale factor for each site pattern */\r
7898    double *pK=com.fhK;  /* proportion for each site class after scaling */\r
7899    double smallw=1e-12; \r
7900    size_t s;\r
7901 \r
7902 #if (BASEML)\r
7903    if (com.nhomo==2)\r
7904       eigenTN93(com.model, *nodes[a].pkappa, 1, com.pi, &nR, Root, Cijk);\r
7905    nroot=nR;\r
7906 #endif\r
7907    if(com.NnodeScale==0)\r
7908       for(ir=0; ir<com.ncatG; ir++)\r
7909          for(h=0; h<com.npatt; h++)  \r
7910             pK[ir*com.npatt+h] = com.freqK[ir];\r
7911    else {\r
7912       for(h=0; h<com.npatt; h++) {\r
7913          for(ir=0,it=0; ir<com.ncatG; ir++) {\r
7914             for(k=0,y=0; k<com.NnodeScale; k++)\r
7915                y += com.nodeScaleF[ir*com.NnodeScale*com.npatt + k*com.npatt+h];\r
7916             if((pK[ir*com.npatt+h]=y) > pK[it*com.npatt+h]) \r
7917                it = ir;\r
7918          }\r
7919          Sh[h] = pK[it*com.npatt+h];\r
7920          for(ir=0; ir<com.ncatG; ir++)\r
7921             pK[ir*com.npatt+h] = com.freqK[ir] * exp(pK[ir*com.npatt+h]-Sh[h]);\r
7922       }\r
7923    }\r
7924 \r
7925    for(h=0; h<com.npatt; h++)\r
7926       fh[h] = dfh[h] = ddfh[h] = 0;\r
7927    for(ir=0; ir<com.ncatG; ir++) {\r
7928       SetPSiteClass(ir, xcom);   /* com.ntime=0 */\r
7929 \r
7930 #if CODEML  /* branch b->a */\r
7931       /* branch&site models */\r
7932       if(com.seqtype==CODONseq && com.NSsites && com.model)\r
7933          Set_UVR_BranchSite (ir, (int)nodes[a].label);\r
7934 #endif\r
7935 \r
7936       if(ir) {\r
7937          for(i=com.ns; i<tree.nnode; i++)\r
7938             nodes[i].conP += (tree.nnode-com.ns)*n*(size_t)com.npatt;\r
7939       }\r
7940       for (ig=0; ig<com.ngene; ig++) {\r
7941          if(com.Mgene>1 || com.nalpha>1)\r
7942             SetPGene(ig,com.Mgene>1,com.Mgene>1,com.nalpha>1,xcom);   /* com.ntime=0 */\r
7943          if(com.nalpha>1) SetPSiteClass(ir, xcom);   /* com.ntime=0 */\r
7944 \r
7945          for(i=0; i<n*n; i++) \r
7946             P[i] = dP[i] = ddP[i]=0;\r
7947          for(k=0,expt=1; k<nroot; k++) {   /* k loops through eigenroots */\r
7948             multiply = com.rgene[ig]*Root[k]*_rateSite;\r
7949 #if (CODEML)\r
7950             if(com.seqtype==1 && com.model>=2) \r
7951                multiply *= Qfactor_NS_branch[(int)nodes[a].label];\r
7952 #endif\r
7953             if(k) expt = exp(t*multiply);\r
7954 \r
7955 #if (CODEML)  /* uses U & V */\r
7956             for(i=0; i<n; i++) \r
7957                for(j=0,uexpt=U[i*n+k]*expt; j<n; j++) {\r
7958                   P[i*n+j] += uexpt*V[k*n+j];\r
7959                   if(k) {\r
7960                       dP[i*n+j] += uexpt*V[k*n+j]*multiply;\r
7961                      ddP[i*n+j] += uexpt*V[k*n+j]*multiply*multiply;\r
7962                   }\r
7963                }\r
7964 #elif (BASEML) /* uses Cijk */\r
7965             for(i=0; i<n; i++) for(j=0; j<n; j++) {\r
7966                P[i*n+j] += Cijk[i*n*nroot+j*nroot+k]*expt;\r
7967                if(k) {\r
7968                    dP[i*n+j] += Cijk[i*n*nroot+j*nroot+k]*expt*multiply;\r
7969                   ddP[i*n+j] += Cijk[i*n*nroot+j*nroot+k]*expt*multiply*multiply;\r
7970                }\r
7971             }\r
7972 #endif\r
7973          }\r
7974 \r
7975          for (h=com.posG[ig]; h<com.posG[ig+1]; h++) {\r
7976             n1 = (b<com.ns ? nChara[com.z[b][h]] : n);\r
7977             for(i=0; i<n1; i++) {\r
7978                xb = i;\r
7979                if(b<com.ns)\r
7980                   piqi = pK[ir*com.npatt+h] * com.pi[ xb = CharaMap[com.z[b][h]][i] ];\r
7981                else\r
7982                   piqi = pK[ir*com.npatt+h] * com.pi[i] * nodes[b].conP[h*n+i];\r
7983 \r
7984                for(j=0,pqj=dpqj=ddpqj=0; j<n; j++) {\r
7985                     pqj +=   P[xb*n+j]*nodes[a].conP[h*n+j];\r
7986                    dpqj +=  dP[xb*n+j]*nodes[a].conP[h*n+j];\r
7987                   ddpqj += ddP[xb*n+j]*nodes[a].conP[h*n+j];\r
7988                }\r
7989                  fh[h] += piqi*pqj;\r
7990                 dfh[h] += piqi*dpqj;\r
7991                ddfh[h] += piqi*ddpqj;\r
7992             }\r
7993          }  /* for (h) */\r
7994       }     /* for (ig) */\r
7995    }        /* for(ir) */\r
7996 \r
7997    for(i=com.ns; i<tree.nnode; i++)\r
7998       nodes[i].conP -= (com.ncatG-1)*(tree.nnode-com.ns)*n*(size_t)com.npatt;\r
7999    for(h=0,*l=*dl=*ddl=0; h<com.npatt; h++) {\r
8000       if(fh[h]<1e-250) \r
8001          printf("small fh[%d] = %10.6e\n",h,fh[h]);\r
8002 \r
8003       *l -= log(fh[h])*com.fpatt[h];\r
8004       if(com.NnodeScale) *l -= Sh[h]*com.fpatt[h];\r
8005       *dl  -= dfh[h]/fh[h] * com.fpatt[h];\r
8006       *ddl -= (fh[h]*ddfh[h] - dfh[h]*dfh[h])/(fh[h]*fh[h]) * com.fpatt[h];\r
8007    }\r
8008 \r
8009    return(0);\r
8010 }\r
8011 \r
8012 #endif\r
8013 \r
8014 \r
8015 #endif         /* #ifdef LFUNCTIONS */\r
8016 \r
8017 #ifdef BIRTHDEATH\r
8018 \r
8019 void BranchLengthBD(int rooted, double birth, double death, double sample, \r
8020      double mut)\r
8021 {\r
8022 /* Generate random branch lengths (nodes[].branch) using the birth and\r
8023    death process with species sampling, or the Yule (coalescent?) process\r
8024    if sample=0, when only parameter mut is used.\r
8025    Note: older interior nodes have larger node numbers, so root is at\r
8026    node com.ns*2-2 with time t[ns-2], while the youngest node is at \r
8027    node com.ns with time t[0].  When unrooted=0, the root is removed with\r
8028    branch lengths adjusted.\r
8029    This works with the tree generated from RandomLHistory().\r
8030 */\r
8031    int i,j, it, imin,fixt0=1;\r
8032    double la=birth, mu=death, rho=sample, tmin, r, t[NS-1];\r
8033    double phi, eml, y;\r
8034 \r
8035    if (sample==0)  /* coalescent model.  Check this!!!  */\r
8036       for (i=com.ns,y=0; i>1; i--) \r
8037           nodes[com.ns*2-i].age=y += -log(rndu())/(i*(i-1.)/2.)*mut/2;\r
8038    else  {         /* BD with sampling */\r
8039       if (fixt0) t[com.ns-2]=1;\r
8040       if (fabs(la-mu)>1e-6) {\r
8041          eml = exp(mu-la);  \r
8042          phi = (rho*la*(eml-1)+(mu-la)*eml)/(eml-1);\r
8043          for (i=0; i<com.ns-1-(fixt0); i++) {\r
8044            r = rndu();\r
8045            t[i] = log((phi-r*rho*la)/(phi-r*rho*la+r*(la-mu)))/(mu-la);\r
8046        }\r
8047       }\r
8048       else  \r
8049          for (i=0; i<com.ns-1-(fixt0); i++) {\r
8050             r = rndu();\r
8051             t[i] = r/(1+la*rho*(1-r)); \r
8052          }\r
8053       /* bubble sort */\r
8054       for (i=0; i<com.ns-1-1; i++) {\r
8055          for (j=i+1,tmin=t[i],imin=i; j<com.ns-1; j++) \r
8056             if (tmin>t[j]) { tmin=t[j]; imin=j; }\r
8057          t[imin] = t[i];\r
8058          t[i] = tmin;\r
8059       }\r
8060       for (i=com.ns; i>1; i--)\r
8061          nodes[com.ns*2-i].age = t[com.ns-i]*mut;\r
8062    }\r
8063    for(i=0; i<com.ns; i++) nodes[i].age = 0;\r
8064    for (i=0; i<tree.nnode; i++) \r
8065       if (i != tree.root) \r
8066          nodes[i].branch = nodes[nodes[i].father].age - nodes[i].age;\r
8067    if (!rooted) {\r
8068       it = nodes[tree.root].sons[2];\r
8069       nodes[it].branch = 2*nodes[2*com.ns-2].age - nodes[tree.root].age - nodes[it].age;\r
8070    }\r
8071 }\r
8072 \r
8073 #endif\r
8074 \r
8075 \r
8076 #ifdef NODESTRUCTURE\r
8077 #ifdef EVOLVER\r
8078 \r
8079 int RandomLHistory (int rooted, double space[])\r
8080 {\r
8081 /* random coalescence tree, with each labeled history having equal probability.\r
8082    interior nodes are numbered ns, ns+1, ..., 2*ns-1-!rooted\r
8083 */\r
8084    int ns=com.ns, i, j, it=0, *nodea=(int*)space;\r
8085    double t;\r
8086 \r
8087    for (i=0; i<2*ns-1-!rooted; i++) ClearNode(i);\r
8088 \r
8089    for (i=0; i<ns; i++) nodea[i]=i;\r
8090    for (i=ns,t=0; i>(1+!rooted); i--) {\r
8091       nodes[it=2*ns-i].nson = 2;\r
8092       j = (int)(i*rndu()); \r
8093       nodes[nodea[j]].father = it;\r
8094       nodes[it].sons[0] = nodea[j];\r
8095       nodea[j] = nodea[i-1];\r
8096       j = (int)((i-1)*rndu()); \r
8097       nodes[nodea[j]].father = it;\r
8098       nodes[it].sons[1] = nodea[j];\r
8099       nodea[j] = it;\r
8100       if (!rooted && i==3) {\r
8101          nodes[it].nson++; \r
8102          nodes[nodea[1-j]].father = it;\r
8103          nodes[it].sons[2] = nodea[1-j];\r
8104       }\r
8105    }\r
8106    tree.root = it;\r
8107    tree.nnode = ns*2-1-!rooted;\r
8108    NodeToBranch();\r
8109    return (0);\r
8110 }\r
8111 \r
8112 #endif\r
8113 \r
8114 #endif  /* NODESTRUCTURE */\r
8115 \r
8116 \r
8117 \r
8118 /* routines for dating analysis of heterogeneous data */\r
8119 #if (defined BASEML || defined CODEML || defined MCMCTREE)\r
8120 \r
8121 \r
8122 #if (defined MCMCTREE)\r
8123 \r
8124 int ProcessFossilInfo()\r
8125 {\r
8126 /* This processes fossil calibration information that has been read into \r
8127    nodes[].nodeStr.  It uses both sptree and nodes[], before it is destroyed. \r
8128    This is called before sequence alignments at loci are read.\r
8129 \r
8130    Possible confusions: \r
8131    Simple lower and upper bounds can be specified using <, >, or both < and > in \r
8132    the tree either with or without quotation marks.  These are read in ReadTreeN() \r
8133    and processed in ReadTreeSeqs().  \r
8134    Other distributions such as G, SN, ST must be specified using the format 'G(alpha, beta)',\r
8135    say, and are processed here.  Simple bounds can also be specified using the format \r
8136    'L(0.5)', 'U(1.0)', or 'B(0.5, 1.0)', in which case they are processed here.  \r
8137    I kept this complexity, (i) to keep the option of using <, >, which is intuitive, \r
8138    (ii) for ReadTreeN to be able to read other node labels such as #, $, either with\r
8139    or without ' '.\r
8140 */\r
8141    int i,j,k, nfossiltype=7;\r
8142    char *pch;\r
8143    double tailL=0.025, tailR=0.025, p_LOWERBOUND=0.1, c_LOWERBOUND=1.0;\r
8144 \r
8145    for(i=sptree.nspecies; i<tree.nnode; i++) {\r
8146       if(nodes[i].nodeStr == NULL) \r
8147          continue;\r
8148       if(sptree.nodes[i].fossil) {  /* fossila specified using <, >, already processed.  */\r
8149          free(nodes[i].nodeStr);\r
8150          continue;\r
8151       }\r
8152       for(j=1; j<nfossiltype+1; j++)\r
8153          if((pch = strstr(nodes[i].nodeStr, fossils[j]))) break;\r
8154       if(j == nfossiltype+1) \r
8155          printf("\nunrecognized fossil calibration: %s\n", nodes[i].nodeStr);\r
8156 \r
8157       sptree.nodes[i].fossil = j;\r
8158       pch = strchr(nodes[i].nodeStr, '(') + 1;\r
8159 \r
8160       switch(j) {\r
8161       case (LOWER_F): \r
8162          /* truncated Cauchy default prior L(tL, p, c) */\r
8163          sptree.nodes[i].pfossil[1] = p_LOWERBOUND;\r
8164          sptree.nodes[i].pfossil[2] = c_LOWERBOUND;\r
8165          sptree.nodes[i].pfossil[3] = tailL;\r
8166          sscanf(pch, "%lf,%lf,%lf,%lf", &sptree.nodes[i].pfossil[0], &sptree.nodes[i].pfossil[1],\r
8167                                         &sptree.nodes[i].pfossil[2], &sptree.nodes[i].pfossil[3]);\r
8168          break;\r
8169       case (UPPER_F): \r
8170          sptree.nodes[i].pfossil[2] = tailR;\r
8171          sscanf(pch, "%lf,%lf", &sptree.nodes[i].pfossil[1], &sptree.nodes[i].pfossil[2]);\r
8172          break;\r
8173       case (BOUND_F): \r
8174          sptree.nodes[i].pfossil[2] = tailL;\r
8175          sptree.nodes[i].pfossil[3] = tailR;\r
8176          sscanf(pch, "%lf,%lf,%lf,%lf", &sptree.nodes[i].pfossil[0], &sptree.nodes[i].pfossil[1],\r
8177                                         &sptree.nodes[i].pfossil[2], &sptree.nodes[i].pfossil[3]);\r
8178          if(sptree.nodes[i].pfossil[0] > sptree.nodes[i].pfossil[1]) { \r
8179             printf("fossil bounds (%.4f, %.4f)", sptree.nodes[i].pfossil[0], sptree.nodes[i].pfossil[1]);\r
8180             error2("fossil bounds in tree incorrect");\r
8181          }\r
8182          break;\r
8183       case (GAMMA_F): \r
8184          sscanf(pch, "%lf,%lf", &sptree.nodes[i].pfossil[0], &sptree.nodes[i].pfossil[1]);\r
8185          break;\r
8186       case (SKEWN_F):\r
8187          sscanf(pch, "%lf,%lf,%lf", &sptree.nodes[i].pfossil[0], &sptree.nodes[i].pfossil[1], &sptree.nodes[i].pfossil[2]);\r
8188          break;\r
8189       case (SKEWT_F): \r
8190          sscanf(pch, "%lf,%lf,%lf,%lf", &sptree.nodes[i].pfossil[0], &sptree.nodes[i].pfossil[1], &sptree.nodes[i].pfossil[2], &sptree.nodes[i].pfossil[3]);\r
8191          break;\r
8192       case (S2N_F): \r
8193          sscanf(pch, "%lf,%lf,%lf,%lf,%lf,%lf,%lf", &sptree.nodes[i].pfossil[0], &sptree.nodes[i].pfossil[1],\r
8194             &sptree.nodes[i].pfossil[2], &sptree.nodes[i].pfossil[3], &sptree.nodes[i].pfossil[4], \r
8195             &sptree.nodes[i].pfossil[5], &sptree.nodes[i].pfossil[6]);\r
8196          break;\r
8197       }\r
8198 \r
8199       sptree.nfossil++;\r
8200       sptree.nodes[i].usefossil = 1;\r
8201       nodes[i].branch = nodes[i].label = 0;\r
8202       free(nodes[i].nodeStr);\r
8203    }\r
8204 \r
8205    return(0);\r
8206 }\r
8207 \r
8208 #endif\r
8209 \r
8210 \r
8211 int GenerateGtree (int locus);\r
8212 \r
8213 int ReadTreeSeqs (FILE*fout)\r
8214 {\r
8215 /* This reads the combined species tree, the fossil calibration information, \r
8216    and sequence data at each locus.  sptree.nodes[].pfossil[] has tL, tU for \r
8217    bounds or alpha and beta for the gamma prior.  \r
8218 \r
8219    This routine also processes fossil calibration information specified using \r
8220    <, >, or both.  More complex specifications are stored in nodes[].nodeStr and \r
8221    processed in ProcessFossilInfo().  See notes in that routine.\r
8222 \r
8223    This also constructs the gene tree at each locus, by pruning the master \r
8224    species tree..\r
8225 */\r
8226    FILE *fseq, *ftree;\r
8227    int i,j, locus, clean0=com.cleandata;\r
8228    double tailL=0.025, tailR=0.025, p_LOWERBOUND=0.1, c_LOWERBOUND=1.0;\r
8229 \r
8230    ftree = gfopen(com.treef,"r");\r
8231 \r
8232    /* read master species tree and process fossil calibration info */\r
8233    fscanf(ftree, "%d%d", &sptree.nspecies, &i);\r
8234    com.ns = sptree.nspecies;\r
8235    if(com.ns>NS) error2("raise NS?");\r
8236    /* to read master species names into sptree.nodes[].name */\r
8237    if(noisy) puts("Reading master tree.");\r
8238    for(j=0; j<sptree.nspecies; j++) \r
8239       com.spname[j] = sptree.nodes[j].name;\r
8240    nodes = nodes_t;\r
8241 \r
8242    ReadTreeN(ftree, &i, &j, 1, 1);\r
8243    if(i) {\r
8244            for(i=j=0; i<tree.nnode; i++)\r
8245                    if(i!=tree.root && nodes[i].branch>0) j++;\r
8246            if(j==tree.nbranch) \r
8247                    printf("\aTree with fossil calibrations should not have branch lengths!");\r
8248    }\r
8249    if(com.clock==5 || com.clock==6)\r
8250       for(i=0; i<tree.nnode; i++) nodes[i].branch = nodes[i].label = 0;\r
8251    for(i=0; i<tree.nnode; i++) \r
8252       if(nodes[i].label<0) nodes[i].label = 0;  /* change -1 into 0 */\r
8253 \r
8254    /* OutTreeN(F0,0,0); FPN(F0); */\r
8255    OutTreeN(F0,1,0); FPN(F0);\r
8256    /* OutTreeN(F0,1,1); FPN(F0); */\r
8257    /* copy master tree into sptree */\r
8258    if(tree.nnode != 2*com.ns-1) \r
8259       error2("check and think about multificating trees.");\r
8260    sptree.nnode = tree.nnode;  sptree.nbranch = tree.nbranch; \r
8261    sptree.root = tree.root;    sptree.nfossil = 0;\r
8262    for(i=0; i<sptree.nspecies*2-1; i++) {\r
8263       sptree.nodes[i].father = nodes[i].father;\r
8264       sptree.nodes[i].nson = nodes[i].nson;\r
8265       if(nodes[i].nson!=0 && nodes[i].nson!=2) \r
8266          error2("master tree has to be binary.");\r
8267       for(j=0; j<sptree.nodes[i].nson; j++) \r
8268          sptree.nodes[i].sons[j] = nodes[i].sons[j];\r
8269 \r
8270       sptree.nodes[i].fossil = nodes[i].fossil;\r
8271       sptree.nodes[i].age = nodes[i].age;\r
8272       sptree.nodes[i].pfossil[0] = nodes[i].branch; /* ">": Lower bound */\r
8273       sptree.nodes[i].pfossil[1] = nodes[i].label;  /* "<": Upper bound */\r
8274 \r
8275       if(nodes[i].branch && nodes[i].label > 0) {  /* joint bound: >0.8<1.2 */\r
8276          if(nodes[i].age == 0) {\r
8277             sptree.nodes[i].fossil = BOUND_F;\r
8278             sptree.nodes[i].pfossil[2] = tailL;\r
8279             sptree.nodes[i].pfossil[3] = tailR;\r
8280          }\r
8281          else {\r
8282             error2("\nUse 'G(alpha, beta)' to specify the gamma calibration");\r
8283          }\r
8284          sptree.nfossil++;\r
8285       }\r
8286       else if(nodes[i].branch) {       /* lower bound: >0.8 */\r
8287          sptree.nodes[i].fossil = LOWER_F;\r
8288          sptree.nfossil++; \r
8289          /* truncated Cauchy default prior L(tL, p, c) */\r
8290          sptree.nodes[i].pfossil[1] = p_LOWERBOUND;\r
8291          sptree.nodes[i].pfossil[2] = c_LOWERBOUND;\r
8292          sptree.nodes[i].pfossil[3] = tailL;\r
8293       }\r
8294       else if(nodes[i].label > 0) {    /* upper bound: <1.2 */\r
8295          sptree.nodes[i].fossil = UPPER_F; \r
8296          sptree.nfossil++; \r
8297          sptree.nodes[i].pfossil[2] = tailR;\r
8298       }\r
8299 \r
8300       if(sptree.nodes[i].fossil)\r
8301          sptree.nodes[i].usefossil = 1;\r
8302 \r
8303       nodes[i].branch = nodes[i].label = 0;\r
8304    }\r
8305 \r
8306 #if (defined MCMCTREE)\r
8307    if(!com.TipDate) ProcessFossilInfo();\r
8308 #endif\r
8309 \r
8310    /* read sequences at each locus, construct gene tree by pruning sptree */\r
8311    data.ngene = com.ndata;\r
8312    com.ndata=1;\r
8313    fseq = gfopen(com.seqf, "r");\r
8314    if((gnodes=(struct TREEN**)malloc(sizeof(struct TREEN*)*data.ngene)) == NULL) \r
8315       error2("oom");\r
8316 \r
8317    printf("\nReading sequence data..  %d loci\n", data.ngene);\r
8318    for(locus=0; locus<data.ngene; locus++) {\r
8319       fprintf(fout, "\n\n*** Locus %d ***\n", locus+1);\r
8320       printf("\n\n*** Locus %d ***\n", locus+1);\r
8321 \r
8322       com.cleandata=(char)clean0;\r
8323       for(j=0; j<sptree.nspecies; j++)\r
8324                   com.spname[j] = NULL; /* points to nowhere */\r
8325 #if (defined CODEML)\r
8326       if(com.seqtype==1) {\r
8327          com.icode = data.icode[locus];\r
8328          setmark_61_64();\r
8329       }\r
8330 #endif\r
8331       ReadSeq(fout, fseq, clean0, locus);               /* allocates com.spname[] */\r
8332 #if (defined CODEML)\r
8333       if(com.seqtype == 1) {\r
8334          if(com.sspace < max2(com.ngene+1,com.ns)*(64+12+4)*sizeof(double)) {\r
8335             com.sspace = max2(com.ngene+1,com.ns)*(64+12+4)*sizeof(double);\r
8336             if((com.space = (double*)realloc(com.space,com.sspace))==NULL)\r
8337                error2("oom space for #c");\r
8338          }\r
8339          InitializeCodon(fout,com.space);\r
8340       }\r
8341 #endif\r
8342 \r
8343       data.ns[locus] = com.ns;\r
8344       data.ls[locus] = com.ls;\r
8345 #if(MCMCTREE)\r
8346       if(data.datatype[locus] == MORPHC) \r
8347          ;\r
8348       else \r
8349 #endif\r
8350       {\r
8351          if(com.seqtype==0 || com.seqtype==2)\r
8352             InitializeBaseAA(fout);\r
8353          fflush(fout);\r
8354          if((com.seqtype==0 || com.seqtype==2) && com.model==0)\r
8355             PatternWeightJC69like(fout);\r
8356          xtoy(com.pi, data.pi[locus], com.ncode);\r
8357          data.cleandata[locus] = (char)com.cleandata;\r
8358          data.npatt[locus] = com.npatt;\r
8359          data.fpatt[locus] = com.fpatt; com.fpatt=NULL;\r
8360          for(i=0; i<com.ns; i++) { \r
8361             data.z[locus][i] = com.z[i];\r
8362             com.z[i] = NULL; \r
8363          }\r
8364          printf("%3d patterns, %s\n", com.npatt, (com.cleandata? "clean": "messy"));\r
8365       }\r
8366 \r
8367       GenerateGtree(locus);      /* free com.spname[] */\r
8368    }\r
8369    for(i=0,com.cleandata=1; i<data.ngene; i++) \r
8370       if(data.cleandata[i]==0) \r
8371          com.cleandata = 0;\r
8372 \r
8373    fclose(ftree);  fclose(fseq);\r
8374    SetMapAmbiguity();\r
8375 \r
8376 \r
8377 #if(defined MCMCTREE)\r
8378    if(com.TipDate) {\r
8379       /* com.TipDate_TimeUnit is already initialized, and it won't be changed in GetTipDate() */\r
8380       GetTipDate(&com.TipDate, &com.TipDate_TimeUnit);\r
8381       for(i=0; i<sptree.nspecies; i++)\r
8382          sptree.nodes[i].age = nodes[i].age;\r
8383    }\r
8384 #endif\r
8385 \r
8386    return(0);\r
8387 }\r
8388 \r
8389 \r
8390 int GenerateGtree (int locus)\r
8391 {\r
8392 /* construct the gene tree at locus by pruning tips in the master species \r
8393    tree.  com.spname[] have names of species at the current locus (probably read \r
8394    from the sequence alignment at the locus).  They are used by the routine to compare \r
8395    with sptree.nodes[].name to decide which species to keep for the locus.  \r
8396    See GetSubTreeN() for more details.\r
8397 */\r
8398    int ns=data.ns[locus], i,j, ipop[NS], keep[NS], newnodeNO[2*NS-1];\r
8399 \r
8400    for(j=0; j<sptree.nspecies; j++) keep[j]=0;\r
8401    for(i=0;i<ns;i++) {\r
8402       for(j=0;j<sptree.nspecies;j++)\r
8403          if(!strcmp(com.spname[i], sptree.nodes[j].name)) break;\r
8404       if(j==sptree.nspecies) {\r
8405          printf("species %s not found in master tree\n", com.spname[i]);\r
8406          exit(-1);\r
8407       }\r
8408       if(keep[j]) {\r
8409          printf("\nspecies %s occurs twice in locus %d", com.spname[i], locus+1);\r
8410          error2("\ngiving up...");\r
8411       }\r
8412       keep[j] = i+1;  ipop[i] = j;  /* seq j in alignment is species i in master tree. */\r
8413       free(com.spname[i]);\r
8414    }\r
8415 \r
8416    /* copy master species tree and then prune it. */\r
8417    copySptree();\r
8418    GetSubTreeN(keep, newnodeNO);\r
8419    com.ns=ns;\r
8420 \r
8421    for(i=0;i<sptree.nnode;i++)  \r
8422       if(newnodeNO[i]!=-1) nodes[newnodeNO[i]].ipop = i;\r
8423    /* printGtree(0);  */\r
8424 \r
8425    gnodes[locus] = (struct TREEN*)malloc((ns*2-1)*sizeof(struct TREEN));\r
8426    if(gnodes[locus] == NULL) error2("oom gtree");\r
8427    memcpy(gnodes[locus], nodes, (ns*2-1)*sizeof(struct TREEN));\r
8428    data.root[locus]=tree.root;\r
8429 \r
8430    return(0);\r
8431 }\r
8432 \r
8433 \r
8434 int printGtree (int printBlength)\r
8435 {\r
8436    int i,j;\r
8437 \r
8438    for(i=0; i<com.ns; i++) \r
8439       com.spname[i]=sptree.nodes[nodes[i].ipop].name;\r
8440    for(i=0;i<tree.nnode;i++) \r
8441       if(i!=tree.root) \r
8442          nodes[i].branch=nodes[nodes[i].father].age-nodes[i].age;\r
8443    printf("\nns = %d  nnode = %d", com.ns, tree.nnode);\r
8444    printf("\n%7s%7s %8s %7s%7s","father","node","(ipop)","nson:","sons");\r
8445    for(i=0; i<tree.nnode; i++) {\r
8446       printf ("\n%7d%7d   (%2d) %7d  ",\r
8447          nodes[i].father+1, i+1, nodes[i].ipop+1, nodes[i].nson);\r
8448       for(j=0; j<nodes[i].nson; j++) printf (" %2d", nodes[i].sons[j]+1);\r
8449    }\r
8450    FPN(F0); OutTreeN(F0,0,0); FPN(F0); OutTreeN(F0,1,0); FPN(F0); \r
8451    if(printBlength) { OutTreeN(F0,1,1); FPN(F0); }\r
8452    return(0);\r
8453 }\r
8454 \r
8455 \r
8456 void copySptree (void)\r
8457 {\r
8458 /* This copies sptree into nodes = nodes_t, for printing or editing\r
8459 */\r
8460    int i,j;\r
8461 \r
8462    nodes = nodes_t;\r
8463    com.ns = sptree.nspecies;   tree.root = sptree.root;\r
8464    tree.nnode = sptree.nnode;  tree.nbranch = sptree.nbranch; \r
8465    for(i=0; i<sptree.nnode; i++) {\r
8466       /* this is used by mcmctree */\r
8467       if(i<com.ns) com.spname[i] = sptree.nodes[i].name;\r
8468       \r
8469       /* The following may be needed by bpp.  Check carefully. */\r
8470       /*\r
8471       if(i<com.ns) strcpy(com.spname[i], sptree.nodes[i].name);\r
8472       */\r
8473       nodes[i].father  =sptree.nodes[i].father;\r
8474       nodes[i].nson = sptree.nodes[i].nson;\r
8475       for(j=0;j<nodes[i].nson;j++) \r
8476          nodes[i].sons[j] = sptree.nodes[i].sons[j];\r
8477       nodes[i].fossil = sptree.nodes[i].fossil;\r
8478       nodes[i].age = sptree.nodes[i].age;\r
8479       if(i != tree.root) \r
8480          nodes[i].branch = sptree.nodes[nodes[i].father].age - sptree.nodes[i].age;\r
8481    }\r
8482 }\r
8483 \r
8484 void printSptree (void)\r
8485 {\r
8486    int i, j, k;\r
8487 \r
8488    printf("\n************\nSpecies tree\nns = %d  nnode = %d", sptree.nspecies, sptree.nnode);\r
8489    printf("\n%7s%7s  %-8s %12s %12s%16s\n","father","node","name","time","fossil","sons");\r
8490    for (i=0; i<sptree.nnode; i++) {\r
8491       printf("%7d%7d  %-14s %9.5f", \r
8492          sptree.nodes[i].father+1, i+1, sptree.nodes[i].name, sptree.nodes[i].age);\r
8493 \r
8494 #ifdef MCMCTREE\r
8495       if((k = sptree.nodes[i].fossil)) {\r
8496          printf(" %s ( ", fossils[k]);\r
8497          for(j=0; j<npfossils[k]; j++) {\r
8498             printf("%6.4f", sptree.nodes[i].pfossil[j + (k==UPPER_F)]);\r
8499             printf("%s", (j==npfossils[k]-1 ? " ) " : ", "));\r
8500          }\r
8501       }\r
8502 #endif\r
8503 \r
8504       if(sptree.nodes[i].nson)\r
8505          printf("  (%2d %2d)", sptree.nodes[i].sons[0]+1, sptree.nodes[i].sons[1]+1);\r
8506       printf("\n");\r
8507    }\r
8508    copySptree();\r
8509    FPN(F0); OutTreeN(F0,0,0); FPN(F0); OutTreeN(F0,1,0);  FPN(F0); \r
8510    OutTreeN(F0,1,1); FPN(F0);\r
8511 }\r
8512 \r
8513 \r
8514 #endif\r
8515 \r
8516 \r
8517 \r
8518 \r
8519 #if (defined BASEML || defined CODEML)\r
8520 \r
8521 #if (defined CODEML)\r
8522 \r
8523 int GetMemPUVR(int nc, int nUVR)\r
8524 {\r
8525 /* this gets mem for nUVR sets of matrices\r
8526 */\r
8527    int i;\r
8528 \r
8529    PMat=(double*)malloc((nc*nc+nUVR*nc*nc*2+nUVR*nc)*sizeof(double));\r
8530    if(PMat==NULL) error2("oom getting P&U&V&Root");\r
8531    U=_UU[0]=PMat+nc*nc;  V=_VV[0]=_UU[0]+nc*nc; Root=_Root[0]=_VV[0]+nc*nc;\r
8532    for(i=1; i<nUVR; i++) {\r
8533       _UU[i]=_UU[i-1]+nc*nc*2+nc; _VV[i]=_VV[i-1]+nc*nc*2+nc; \r
8534       _Root[i]=_Root[i-1]+nc*nc*2+nc;\r
8535    }\r
8536    return(0);\r
8537 }\r
8538 \r
8539 void FreeMemPUVR(void)\r
8540 {   \r
8541    free(PMat); \r
8542 }\r
8543 \r
8544 \r
8545 int GetUVRoot_codeml (void)\r
8546 {\r
8547 /* This uses data.daafile[] to set up the eigen matrices U, V, Root for \r
8548    combined clock analyses of multiple protein data sets (clock = 5 or 6).\r
8549 */\r
8550    int locus, nc=(com.seqtype==1?64:20), nUVR=data.ngene;\r
8551    double mr=0;\r
8552 \r
8553    if(com.seqtype==1 && (!com.fix_kappa || !com.fix_omega)) nUVR=1;\r
8554    GetMemPUVR(nc, nUVR);\r
8555 \r
8556    if(nUVR>6) error2("The maximum number of proteins is set to 6.");\r
8557    if(com.seqtype==2) {\r
8558       for(locus=0; locus<data.ngene; locus++) {\r
8559          if(data.ngene>1) \r
8560             strcpy(com.daafile, data.daafile[locus]);\r
8561          GetDaa(NULL, com.daa);\r
8562          if(com.model==Empirical_F) \r
8563             xtoy(data.pi[locus], com.pi, nc);\r
8564          eigenQaa(NULL, _Root[locus], _UU[locus], _VV[locus], NULL);\r
8565       }\r
8566    }\r
8567    else if(com.seqtype==1 && com.fix_kappa & com.fix_omega) {\r
8568       for(locus=0; locus<data.ngene; locus++) {\r
8569          if(com.seqtype==1) {\r
8570             com.icode=data.icode[locus];\r
8571             setmark_61_64 ();\r
8572          }\r
8573          com.kappa=data.kappa[locus];\r
8574          com.omega=data.omega[locus];\r
8575          xtoy(data.pi[locus], com.pi, com.ncode);\r
8576          eigenQcodon(1,-1,NULL,NULL,NULL, _Root[locus], _UU[locus], _VV[locus], &mr,\r
8577             &com.kappa, com.omega, PMat);\r
8578       }\r
8579    }\r
8580    return(0);\r
8581 }\r
8582 \r
8583 \r
8584 #endif\r
8585 \r
8586 \r
8587 int UseLocus (int locus, int copycondP, int setmodel, int setSeqName)\r
8588 {\r
8589 /* This point nodes to the gene tree at locus gnodes[locus] and set com.z[] \r
8590    etc. for likelihood calculation for the locus.  \r
8591 */\r
8592    int i, nS;\r
8593    double mr=0;\r
8594 \r
8595    com.ns=data.ns[locus]; com.ls=data.ls[locus];\r
8596    tree.root=data.root[locus];\r
8597    tree.nnode=2*com.ns-1;  /* assumes binary tree */\r
8598    tree.nbranch=tree.nnode-1;\r
8599 \r
8600    nodes=gnodes[locus];\r
8601 \r
8602    com.cleandata=data.cleandata[locus];\r
8603    com.npatt=com.posG[1]=data.npatt[locus];  com.posG[0]=0;\r
8604    com.fpatt=data.fpatt[locus];\r
8605    for(i=0; i<com.ns; i++) com.z[i] = data.z[locus][i];\r
8606 \r
8607    /* The following is model-dependent */\r
8608    if(setmodel) {\r
8609 \r
8610       com.kappa=data.kappa[locus];\r
8611       com.omega=data.omega[locus];\r
8612       com.alpha=data.alpha[locus];\r
8613 \r
8614 #if(defined CODEML)\r
8615       if(com.seqtype==1) {\r
8616          com.icode=data.icode[locus];\r
8617          setmark_61_64 ();\r
8618       }\r
8619 #endif\r
8620 \r
8621 #if(defined BASEML)\r
8622       if(com.seqtype==0 && com.model!=0 && com.model!=1)\r
8623          xtoy(data.pi[locus], com.pi, com.ncode);\r
8624       if(com.model<=TN93)\r
8625          eigenTN93(com.model, com.kappa, com.kappa, com.pi, &nR, Root, Cijk);\r
8626       else if (com.model==REV)\r
8627          eigenQREVbase (NULL, PMat, &com.kappa, com.pi, &nR, Root, Cijk);\r
8628 #else\r
8629       if((com.seqtype==1 && com.codonf) || (com.seqtype==2 && com.model==3))\r
8630          xtoy(data.pi[locus], com.pi, com.ncode);\r
8631 \r
8632       if((com.seqtype==2 && (com.model==2 || com.model==3))\r
8633          || (com.seqtype==1 && com.fix_kappa && com.fix_omega)) {\r
8634          Root=_Root[locus]; U=_UU[locus];  V=_VV[locus];\r
8635       }\r
8636       else {\r
8637          eigenQcodon(1,-1,NULL,NULL,NULL,Root,U,V, &mr, &com.kappa, com.omega,PMat);\r
8638       }\r
8639 \r
8640 #endif\r
8641       if(com.alpha)\r
8642          DiscreteGamma (com.freqK,com.rK,com.alpha,com.alpha,com.ncatG,DGammaUseMedian);\r
8643 \r
8644       com.NnodeScale = data.NnodeScale[locus];\r
8645       com.nodeScale = data.nodeScale[locus];\r
8646       nS = com.NnodeScale*com.npatt * (com.conPSiteClass ? com.ncatG : 1);\r
8647       for(i=0; i<nS; i++) com.nodeScaleF[i] = 0;\r
8648    }\r
8649    if(setSeqName)\r
8650       for(i=0; i<com.ns; i++)\r
8651          com.spname[i] = sptree.nodes[nodes[i].ipop].name;\r
8652    return(0);\r
8653 }\r
8654 \r
8655 \r
8656 void GetMemBC (void)\r
8657 {\r
8658 /* This gets memory for baseml and codeml under local clock models for analysis \r
8659    of combined data from multiple loci.\r
8660    com.conP[] is shared across loci.\r
8661    fhK[] uses shared space for loci.\r
8662 */\r
8663    int j, locus, nc = (com.seqtype==1?64:com.ncode);\r
8664    size_t maxsizeScale=0, nS, sfhK=0, s1, snode;\r
8665    double *p;\r
8666 \r
8667    for(locus=0,com.sconP=0; locus<data.ngene; locus++) {\r
8668       snode = nc*data.npatt[locus];\r
8669       s1 = snode*(data.ns[locus]-1)*sizeof(double);\r
8670       if(com.alpha) {     /* this is for step 1, using method = 1 */\r
8671          com.conPSiteClass = 1;\r
8672          s1 *= com.ncatG;\r
8673       }\r
8674       if(s1>com.sconP) com.sconP = s1;\r
8675       if(com.alpha && (size_t)data.npatt[locus]>sfhK) \r
8676          sfhK = data.npatt[locus];\r
8677    }\r
8678 \r
8679    com.conP = (double*)malloc(com.sconP);\r
8680    printf("\n%5lu bytes for conP\n", com.sconP); \r
8681    if(com.conP==NULL)\r
8682       error2("oom conP");\r
8683    if (com.alpha) {\r
8684       sfhK *= com.ncatG*sizeof(double);\r
8685       if((com.fhK=(double*)realloc(com.fhK,sfhK))==NULL) error2("oom");\r
8686    }\r
8687 \r
8688    /* set gnodes[locus][].conP for internal nodes */\r
8689    for(locus=0; locus<data.ngene; locus++) {\r
8690       snode = nc*data.npatt[locus];\r
8691       for(j=data.ns[locus]; j<data.ns[locus]*2-1; j++)\r
8692          gnodes[locus][j].conP = com.conP + (j-data.ns[locus])*snode;\r
8693    }\r
8694    for(locus=0; locus<data.ngene; locus++) {\r
8695       if(!data.cleandata[locus]) {\r
8696          UseLocus(locus, -1, 0, 0);\r
8697       }\r
8698    }\r
8699 \r
8700    if(sptree.nspecies>20) {\r
8701       for(locus=0; locus<data.ngene; locus++) {\r
8702          UseLocus(locus, -1, 0, 0);\r
8703          com.NnodeScale = 0;\r
8704          com.nodeScale = data.nodeScale[locus]=(char*)malloc(tree.nnode*sizeof(char));\r
8705          if(com.nodeScale==NULL)  error2("oom");\r
8706          for(j=0; j<tree.nnode; j++) com.nodeScale[j] = 0;\r
8707 \r
8708          SetNodeScale(tree.root);\r
8709 \r
8710          data.NnodeScale[locus] = com.NnodeScale;\r
8711          nS = com.NnodeScale*com.npatt;\r
8712          if(com.conPSiteClass) nS *= com.ncatG;\r
8713          maxsizeScale = max2(maxsizeScale, nS);\r
8714 \r
8715          if(com.NnodeScale) {\r
8716             printf("\n%d node(s) used for scaling at locus %d: \n",com.NnodeScale,locus+1);\r
8717             FOR(j,tree.nnode) if(com.nodeScale[j]) printf(" %2d",j+1);\r
8718             FPN(F0);\r
8719          }\r
8720       }\r
8721       if(maxsizeScale) {\r
8722          if((com.nodeScaleF=(double*)malloc(maxsizeScale*sizeof(double)))==NULL)\r
8723             error2("oom nscale");\r
8724          for(j=0; j<(int)maxsizeScale; j++) com.nodeScaleF[j] = 0;\r
8725       }\r
8726    }\r
8727 \r
8728 }\r
8729 \r
8730 void FreeMemBC (void)\r
8731 {\r
8732    int locus, j;\r
8733 \r
8734    for(locus=0; locus<data.ngene; locus++)\r
8735       free(gnodes[locus]);\r
8736    free(gnodes);\r
8737    free(com.conP);\r
8738    for(locus=0; locus<data.ngene; locus++) {\r
8739       free(data.fpatt[locus]);\r
8740       for(j=0;j<data.ns[locus]; j++)\r
8741          free(data.z[locus][j]);\r
8742    }\r
8743    if(com.alpha)\r
8744       free(com.fhK);\r
8745 \r
8746    if(sptree.nspecies>20) {\r
8747       for(locus=0; locus<data.ngene; locus++)\r
8748          free(data.nodeScale[locus]);\r
8749       if(com.nodeScaleF) free(com.nodeScaleF);\r
8750    }\r
8751 }\r
8752 \r
8753 \r
8754 \r
8755 \r
8756 double nu_AHRS=0.001, *varb_AHRS;\r
8757 \r
8758 \r
8759 double funSS_AHRS(double x[], int np);\r
8760 \r
8761 \r
8762 double lnLfunHeteroData (double x[], int np)\r
8763 {\r
8764 /* This calculates the log likelihood, the log of the probability of the data \r
8765    given gtree[] for each locus.  This is for step 3 of Yang (2004. Acta \r
8766    Zoologica Sinica 50:645-656)\r
8767    x[0,1,...s-k] has node ages in the species tree, followed by branch rates \r
8768    for genes 1, 2, ..., then kappa for genes, then alpha for genes\r
8769 */\r
8770    int i,k, locus;\r
8771    double lnL=0, lnLt, *pbrate;\r
8772 \r
8773    /* ??? need more work for codon sequences */\r
8774    for(locus=0,k=com.ntime-1; locus<data.ngene; locus++) \r
8775       k+=data.nbrate[locus];\r
8776    if(!com.fix_kappa) FOR(locus,data.ngene) data.kappa[locus]=x[k++];\r
8777    if(!com.fix_omega) FOR(locus,data.ngene) data.omega[locus]=x[k++];\r
8778    if(!com.fix_alpha) FOR(locus,data.ngene) data.alpha[locus]=x[k++];\r
8779 \r
8780    /* update node ages in species tree */\r
8781    copySptree();\r
8782    SetBranch(x);\r
8783    FOR(i,tree.nnode) sptree.nodes[i].age=nodes[i].age;\r
8784 \r
8785    for(locus=0,pbrate=x+com.ntime-1; locus<data.ngene; locus++) {\r
8786 \r
8787       UseLocus(locus, -1, 1, 1);\r
8788       /* copy node ages to gene tree */\r
8789       FOR(i,tree.nnode)  nodes[i].age=sptree.nodes[nodes[i].ipop].age;\r
8790       FOR(i,tree.nnode) {\r
8791          if(i!=tree.root) {\r
8792             nodes[i].branch = (nodes[nodes[i].father].age-nodes[i].age) \r
8793                             * pbrate[(int)nodes[i].label];\r
8794             if(nodes[i].branch<-1e-4)\r
8795                puts("b<0");\r
8796          }\r
8797       }\r
8798       lnL += lnLt = com.plfun(x, -1);\r
8799       pbrate += data.nbrate[locus];\r
8800    }\r
8801    return(lnL);\r
8802 }\r
8803 \r
8804 \r
8805 double funSS_AHRS (double x[], int np)\r
8806 {\r
8807 /* Function to be minimized in the ad hoc rate smoothing procedure: \r
8808       lnLb + lnLr\r
8809    nodes[].label has node rate.\r
8810    lnLb is weighted sum of squares using approximate variances for branch lengths.\r
8811 \r
8812    lnLr is the log of the prior of rates under the geometric Brownian motion \r
8813    model of rate evolution. There is no need for recursion as the order at \r
8814    which sptree.nodes are visited is unimportant.  The rates are stored in \r
8815    gnodes[].label.\r
8816    The root rate is fixed to be the weighted average rate of its two sons, \r
8817    inversely weighted by the divergence times.\r
8818 */\r
8819    int locus, j,k, root, pa, son0, son1;\r
8820    double lnLb, lnLr, lnLbi, lnLri;  /* lnLb & lnLr are sum of squares for b and r */\r
8821    double b,be,t, t0,t1, r,rA, w,y, small=1e-20, smallage=AgeLow[sptree.root]*small;\r
8822    double nu = nu_AHRS, *varb=varb_AHRS;\r
8823 \r
8824    /* set up node ages in species tree */\r
8825    copySptree();\r
8826    SetBranch(x);\r
8827    for(j=0; j<tree.nnode; j++)\r
8828       sptree.nodes[j].age = nodes[j].age;\r
8829 \r
8830    k=com.ntime-1;\r
8831    for(locus=0,lnLb=lnLr=0; locus<data.ngene; varb+=com.ns*2-1,locus++) {\r
8832       UseLocus(locus, -1, 0, 0);\r
8833       if(data.fix_nu==2)      nu = x[np-1];\r
8834       else if(data.fix_nu==3) nu = x[np-1-(data.ngene-1-locus)];\r
8835 \r
8836       root = tree.root;\r
8837       son0 = nodes[root].sons[0];\r
8838       son1 = nodes[root].sons[1];\r
8839       /* copy node ages and rates into gene tree nodes[]. */\r
8840       for(j=0; j<tree.nnode; j++) { /* age and rates */\r
8841          nodes[j].age=sptree.nodes[nodes[j].ipop].age;\r
8842          if(j!=root)\r
8843             nodes[j].label = x[k++];\r
8844       }\r
8845       t0 = nodes[root].age-nodes[son0].age;\r
8846       t1 = nodes[root].age-nodes[son1].age;\r
8847       if(t0+t1 < 1e-7)\r
8848          error2("small root branch.  Think about what to do.");\r
8849       nodes[root].label = (nodes[son0].label*t1+nodes[son1].label*t0)/(t0+t1);\r
8850 \r
8851       for(j=0,lnLbi=0; j<tree.nnode; j++) {\r
8852          if(j==son0 || j==son1) continue;\r
8853          pa = nodes[j].father;\r
8854          if(j==root) {\r
8855             b  = nodes[son0].branch+nodes[son1].branch;\r
8856             be = (nodes[j].age-nodes[son0].age) * (nodes[root].label+nodes[son0].label)/2\r
8857                + (nodes[j].age-nodes[son1].age) * (nodes[root].label+nodes[son1].label)/2;\r
8858          }\r
8859          else {\r
8860             b  = nodes[j].branch;\r
8861             be = (nodes[pa].age-nodes[j].age) * (nodes[pa].label+nodes[j].label)/2;\r
8862          }\r
8863          w = varb[j];\r
8864          if(w<small) \r
8865             puts("small variance");\r
8866          lnLbi -= square(be-b)/(2*w);\r
8867       }\r
8868 \r
8869       for(j=0,lnLri=0; j<tree.nnode; j++) {\r
8870          if(j==root) continue;\r
8871          pa = nodes[j].father;\r
8872          t = nodes[pa].age - nodes[j].age;\r
8873          t = max2(t,smallage);\r
8874          r = nodes[j].label;\r
8875          rA= nodes[pa].label;\r
8876 \r
8877          if(rA<small || t<small || r<small)  puts("small r, rA, or t");\r
8878          y = log(r/rA)+t*nu/2;\r
8879          lnLri -= y*y/(2*t*nu) - log(r) - log(2*Pi*t*nu)/2;\r
8880       }\r
8881 \r
8882       if(data.fix_nu>1) lnLri += -nu/nu_AHRS-log(nu);  /* exponential prior */\r
8883       lnLb -= lnLbi;\r
8884       lnLr -= lnLri;\r
8885    }\r
8886    return (lnLb + lnLr);\r
8887 }\r
8888 \r
8889 \r
8890 void SetBranchRates(int inode)\r
8891 {\r
8892 /* this uses node rates to set branch rates, and is used only after the ad hoc \r
8893    rate smoothing iteration is finished.\r
8894 */\r
8895    int i;\r
8896    if(inode<com.ns) \r
8897       nodes[inode].label = (nodes[inode].label + nodes[nodes[inode].father].label)/2;\r
8898    else\r
8899       for(i=0; i<nodes[inode].nson; i++) \r
8900          SetBranchRates(nodes[inode].sons[i]);\r
8901 }\r
8902 \r
8903 \r
8904 int GetInitialsClock6Step1 (double x[], double xb[][2])\r
8905 {\r
8906 /* This is for clock 6 step 1.\r
8907 */\r
8908    int i,k;\r
8909    double tb[]={.0001, 999};\r
8910 \r
8911    com.ntime=k=tree.nbranch;\r
8912    GetInitialsTimes (x);\r
8913 \r
8914    com.plfun = (com.alpha==0 ? lfun : lfundG);\r
8915    com.conPSiteClass = (com.method && com.plfun==lfundG);\r
8916 \r
8917 /*   InitializeNodeScale(); */\r
8918 \r
8919    if(com.seqtype==0)  com.nrate = !com.fix_kappa;\r
8920 \r
8921    com.np=com.ntime+!com.fix_kappa+!com.fix_alpha;\r
8922    if(com.seqtype==1 && !com.fix_omega) com.np++;\r
8923 \r
8924    if(!com.fix_kappa) x[k++]=com.kappa;\r
8925    if(!com.fix_omega) x[k++]=com.omega;\r
8926    if(!com.fix_alpha) x[k++]=com.alpha;\r
8927    NodeToBranch ();\r
8928    \r
8929    for(i=0; i<com.ntime; i++)  \r
8930       { xb[i][0]=tb[0]; xb[i][1]=tb[1]; }\r
8931    for( ; i<com.np; i++)  \r
8932       { xb[i][0]=.001; xb[i][1]=999; }\r
8933 \r
8934    if(noisy>3 && com.np<200) {\r
8935       printf("\nInitials (np=%d)\n", com.np);\r
8936       for(i=0; i<com.np; i++) printf(" %10.5f", x[i]);      FPN(F0);\r
8937       for(i=0; i<com.np; i++) printf(" %10.5f", xb[i][0]);  FPN(F0);\r
8938       for(i=0; i<com.np; i++) printf(" %10.5f", xb[i][1]);  FPN(F0);\r
8939    }\r
8940    return (0);\r
8941 }\r
8942 \r
8943 \r
8944 \r
8945 int GetInitialsClock56Step3 (double x[])\r
8946 {\r
8947 /* This is for clock 5 or clock 6 step 3\r
8948 */\r
8949    int i, j,k=0, naa=20;\r
8950 \r
8951    if(com.clock==5)\r
8952       GetInitialsTimes (x);\r
8953 \r
8954    com.plfun = (com.alpha==0 ? lfun : lfundG);\r
8955    com.conPSiteClass = (com.method && com.plfun==lfundG);\r
8956 \r
8957 /*   InitializeNodeScale(); */\r
8958 \r
8959    com.np = com.ntime-1 + (1+!com.fix_kappa+!com.fix_omega+!com.fix_alpha)*data.ngene;\r
8960    if(com.clock==5) \r
8961       for(i=com.ntime-1;i<com.np;i++) x[i]=.2+rndu();\r
8962    else if(com.clock==6) {\r
8963       for(j=0,k=com.ntime-1; j<data.ngene; k+=data.nbrate[j],j++) \r
8964          com.np += data.nbrate[j]-1;\r
8965       if(!com.fix_kappa)\r
8966          for(j=0; j<data.ngene; j++) x[k++]=data.kappa[j];\r
8967       if(!com.fix_omega) \r
8968          for(j=0; j<data.ngene; j++) x[k++]=data.omega[j];\r
8969       if(!com.fix_alpha) \r
8970          for(j=0; j<data.ngene; j++) x[k++]=data.alpha[j];\r
8971       for(i=k;i<com.np;i++) x[i]=(.5+rndu())/2;\r
8972    }\r
8973    return (0);\r
8974 }\r
8975 \r
8976 \r
8977 double GetMeanRate (void)\r
8978 {\r
8979 /* This gets the rough average rate for the locus \r
8980 */\r
8981    int inode, i,j,k, ipop, nleft,nright,marks[NS], sons[2], nfossil;\r
8982    double mr, md;\r
8983 \r
8984    mr=0; nfossil=0;\r
8985    for(inode=com.ns; inode<tree.nnode; inode++) {\r
8986       ipop = nodes[inode].ipop;  \r
8987       if(sptree.nodes[ipop].fossil == 0) continue;\r
8988       sons[0] = nodes[inode].sons[0];\r
8989       sons[1] = nodes[inode].sons[1];\r
8990       for(i=0,nleft=nright=0; i<com.ns; i++) {\r
8991          for(j=i,marks[i]=0; j!=tree.root; j=nodes[j].father) {\r
8992             if(j==sons[0])       { marks[i]=1; nleft++;  break; }\r
8993             else if (j==sons[1]) { marks[i]=2; nright++; break; }\r
8994          }\r
8995       }\r
8996       if(nleft==0 || nright==0) {\r
8997          puts("this calibration is not in gene tree.");\r
8998          continue;\r
8999       }\r
9000       nfossil++;\r
9001 \r
9002       for(i=0,md=0; i<com.ns; i++) {\r
9003          for(j=0; j<com.ns; j++) {\r
9004             if(marks[i]==1 && marks[j]==2) {\r
9005                for(k=i; k!=inode; k=nodes[k].father)\r
9006                   md+=nodes[k].branch;\r
9007                for(k=j; k!=inode; k=nodes[k].father)\r
9008                   md+=nodes[k].branch;\r
9009             }\r
9010          }\r
9011       }\r
9012       md /= (nleft*nright);\r
9013       mr += md/(sptree.nodes[ipop].age*2);\r
9014 \r
9015       /*\r
9016       printf("node age & mr n%-4d %9.5f%9.5f  ", inode, sptree.nodes[ipop].age, md);\r
9017       if(com.ns<100) FOR(i,com.ns) printf("%d",marks[i]); \r
9018       FPN(F0);\r
9019       */\r
9020    }\r
9021    mr /= nfossil;\r
9022    if(nfossil==0) \r
9023       { printf("need fossils for this locus\n"); exit(-1); }\r
9024 \r
9025    return(mr);\r
9026 }\r
9027 \r
9028 \r
9029 int AdHocRateSmoothing (FILE*fout, double x[NS*3], double xb[NS*3][2], double space[])\r
9030 {\r
9031 /* ad hoc rate smoothing for likelihood estimation of divergence times.\r
9032    Step 1: Use JC69 to estimate branch lengths under no-clock model.\r
9033    Step 2: ad hoc rate smoothing, estimating one set of divergence times\r
9034            and many sets of branch rates for loci.  Rate at root is set to \r
9035            weighted average of rate at the two sons.\r
9036 */\r
9037    int model0=com.model, ntime0=com.ntime;  /* is this useful? */\r
9038    int fix_kappa0=com.fix_kappa, fix_omega0=com.fix_omega, fix_alpha0=com.fix_alpha;\r
9039    int ib, son0, son1;\r
9040    double kappa0=com.kappa, omega0=com.omega, alpha0=com.alpha, t0,t1, *varb;\r
9041    double f, e=1e-8, pb=0.00001, rb[]={0.001,99}, lnL,lnLsum=0;\r
9042    double mbrate[20], Rj[20], r,minr,maxr, beta, *pnu=&nu_AHRS,nu, mr[NGENE];\r
9043    int i,j,k,k0, locus, nbrate[20],maxnbrate=20;\r
9044    char timestr[32];\r
9045    FILE *fBV = gfopen("in.BV","w");\r
9046    FILE *fdist = gfopen("RateDist.txt","w");\r
9047    FILE *finStep1 = fopen("in.ClockStep1","r"),\r
9048         *finStep2 = fopen("in.ClockStep2","r");\r
9049 \r
9050    noisy=4;\r
9051    for(locus=0,k=0; locus<data.ngene; locus++)\r
9052       k += 2*data.ns[locus]-1;\r
9053    if((varb_AHRS=(double*)malloc(k*sizeof(double)))==NULL) \r
9054       error2("oom AHRS");\r
9055    for(i=0; i<k;i++)  varb_AHRS[i]=-1;\r
9056 \r
9057 \r
9058    /* Step 1: Estimate branch lengths without clock.  */\r
9059    printf("\nStep 1: Estimate branch lengths under no clock.\n");\r
9060    fprintf(fout,"\n\nStep 1: Estimate branch lengths under no clock.\n");\r
9061    com.clock=0; com.method=1;\r
9062 /*\r
9063 com.model=0;  com.fix_kappa=1; com.kappa=1; \r
9064 com.fix_alpha=1; com.alpha=0;\r
9065 */\r
9066    for(locus=0; locus<data.ngene; locus++) {\r
9067       if(!com.fix_kappa) data.kappa[locus]=com.kappa;\r
9068       if(!com.fix_omega) data.omega[locus]=com.omega;\r
9069       if(!com.fix_alpha) data.alpha[locus]=com.alpha;\r
9070    }\r
9071    for(locus=0,varb=varb_AHRS; locus<data.ngene; varb+=com.ns*2-1,locus++) {\r
9072       UseLocus(locus, -1, 1, 1);\r
9073 \r
9074       fprintf(fout,"\nLocus %d (%d sequences)\n", locus+1, com.ns);\r
9075 \r
9076       son0 = nodes[tree.root].sons[0]; \r
9077       son1 = nodes[tree.root].sons[1];\r
9078 \r
9079       GetInitialsClock6Step1 (x, xb);\r
9080 \r
9081       lnL=0;\r
9082       if(com.ns>30) fprintf(frub, "\n\nLocus %d\n", locus+1);\r
9083       if(finStep1) {\r
9084          puts("read MLEs from step 1 from file");\r
9085          for(i=0; i<com.np; i++) \r
9086             fscanf(finStep1,"%lf",&x[i]);\r
9087       }\r
9088       else {\r
9089          j = minB((com.ns>30?frub:NULL), &lnL, x, xb, e, space);\r
9090          for(j=0; j<com.ns*2-1; j++) {\r
9091             ib = nodes[j].ibranch;\r
9092             if(j!=tree.root) varb[j] = (x[ib]>1e-8 ? -1/varb_minbranches[ib] : 999);\r
9093          }\r
9094 /*\r
9095 matout(F0, x, 1, com.ntime);\r
9096 matout2(F0, varb, 1, tree.nnode, 10, 7);\r
9097 fout = stdout;\r
9098 exit(0);\r
9099 */\r
9100       }\r
9101 \r
9102       if(!com.fix_kappa) data.kappa[locus] = x[com.ntime];\r
9103       if(!com.fix_omega) data.omega[locus] = x[com.ntime + !com.fix_kappa];\r
9104       if(!com.fix_alpha) data.alpha[locus] = x[com.ntime + !com.fix_kappa + !com.fix_omega];\r
9105 \r
9106       lnLsum += lnL;\r
9107 \r
9108       t0 = nodes[son0].branch; \r
9109       t1 = nodes[son1].branch;\r
9110       varb[tree.root] = varb[t0>t1?son0:son1];\r
9111       nodes[son0].branch = nodes[son1].branch = (t0+t1)/2;  /* arbitrary */\r
9112       mr[locus] = GetMeanRate();\r
9113 \r
9114       printf("   Locus %d: %d sequences, %d blengths, lnL = %15.6f mr=%.5f%10s\n", \r
9115          locus+1, com.ns, com.np-1,-lnL,mr[locus], printtime(timestr));\r
9116       fprintf(fout,"\nlnL = %.6f\n\n", -lnL);\r
9117       OutTreeB(fout);  FPN(fout);\r
9118       for(i=0; i<com.np; i++) fprintf(fout," %8.5f",x[i]); FPN(fout);\r
9119       for(i=0; i<tree.nbranch; i++) fprintf(fout," %8.5f", sqrt(varb[tree.branches[i][1]])); FPN(fout);\r
9120       FPN(fout);  OutTreeN(fout,1,1);  FPN(fout);  fflush(fout);\r
9121 \r
9122       fprintf(fBV, "\n\nLocus %d: %d sequences, %d+1 branches\nlnL = %15.6f\n\n", \r
9123          locus+1, com.ns, tree.nbranch-1, -lnL);\r
9124       OutTreeB(fBV);  FPN(fBV);\r
9125       for(i=0; i<tree.nbranch; i++) fprintf(fBV," %12.9f",x[i]); FPN(fBV);\r
9126       for(i=0; i<tree.nbranch; i++) fprintf(fBV," %12.9f", sqrt(varb[tree.branches[i][1]])); FPN(fBV);\r
9127       FPN(fBV);  OutTreeN(fBV,1,1);  FPN(fBV);  fflush(fBV);\r
9128    }\r
9129    fclose(fBV);\r
9130    if(data.ngene>1) fprintf(fout,"\nSum of lnL over loci = %15.6f\n", -lnLsum);\r
9131 \r
9132    /* Step 2: ad hoc rate smoothing to estimate branch rates.  */\r
9133    printf("\nStep 2: Ad hoc rate smoothing to estimate branch rates.\n");\r
9134    fprintf(fout, "\n\nStep 2: Ad hoc rate smoothing to estimate branch rates.\n");\r
9135    /* s - 1 - NFossils node ages, (2*s_i - 2) rates for branches at each locus */\r
9136    com.clock = 1;\r
9137    copySptree();\r
9138    GetInitialsTimes (x);\r
9139 \r
9140    for(locus=0,com.np=com.ntime-1; locus<data.ngene; locus++) \r
9141       com.np += data.ns[locus]*2-2;\r
9142    if(data.fix_nu==2) com.np++;\r
9143    if(data.fix_nu==3) com.np+=data.ngene;\r
9144 \r
9145    if(com.np>NS*6) error2("change NP for ad hoc rate smoothing.");\r
9146    for(i=0; i<com.ntime-1; i++)\r
9147       { xb[i][0]=pb;  xb[i][1]=1-pb; }\r
9148    if(!nodes[tree.root].fossil)  \r
9149       { xb[0][0]=AgeLow[tree.root]*1.0001; xb[0][1]=max2(AgeLow[tree.root]*10,50); }\r
9150    for( ; i<com.np; i++)  { /* for rates */\r
9151       xb[i][0]=rb[0]; xb[i][1]=rb[1];\r
9152    }\r
9153    for(locus=0,i=com.ntime-1; locus<data.ngene; locus++) \r
9154       for(j=0; j<data.ns[locus]*2-2; j++) \r
9155          x[i++]=mr[locus]*(.8+.4*rndu());\r
9156    for( ; i<com.np; i++)   /* nu */\r
9157       x[i]=0.001+0.1*rndu();\r
9158 \r
9159    if(noisy>3) {\r
9160       for(i=0; i<com.np; i++) \r
9161          { printf(" %10.5f", x[i]); if(i==com.ntime-2) FPN(F0); }  FPN(F0);\r
9162       if(com.np<200) {\r
9163          for(i=0; i<com.np; i++)  printf(" %10.5f", xb[i][0]);  FPN(F0);\r
9164          for(i=0; i<com.np; i++)  printf(" %10.5f", xb[i][1]);  FPN(F0);\r
9165       }\r
9166    }\r
9167 \r
9168    if(data.fix_nu>1) \r
9169       pnu = x+com.np-(data.fix_nu==2 ? 1 : data.ngene);\r
9170    printf("  %d times, %d rates, %d parameters, ", com.ntime-1,k,com.np);\r
9171 \r
9172    noisy=3;\r
9173    f = funSS_AHRS(x, com.np);\r
9174    if(noisy>2) printf("\nf0 = %12.6f\n",f );\r
9175 \r
9176    if(finStep2) {\r
9177       puts("read MLEs from step 2 from file");\r
9178       for(i=0; i<com.np; i++) fscanf(finStep2,"%lf",&x[i]);\r
9179       matout(F0,x,1,com.np);\r
9180    }\r
9181    else {\r
9182       j = ming2(frub, &f, funSS_AHRS, NULL, x, xb, space, 1e-9, com.np);\r
9183 \r
9184       /* generate output to in.clockStep2\r
9185       matout(fout,x,1,com.np);\r
9186       */\r
9187 \r
9188       if(j==-1) \r
9189          { puts("\nad hoc rate smoothing iteration may not have converged.\nEnter to continue; Ctrl-C to break."); \r
9190       getchar(); }\r
9191    }\r
9192    free(varb_AHRS);\r
9193 \r
9194    fputs("\nEstimated divergence times from ad hoc rate smoothing\n\n",fout);\r
9195    copySptree();\r
9196    FOR(i,tree.nnode) nodes[i].branch*=100;\r
9197    for(i=com.ns; i<tree.nnode; i++)\r
9198       fprintf(fout, "Node %2d   Time %9.5f\n", i+1, nodes[i].age*100);\r
9199    FPN(fout); OutTreeN(fout,1,1); FPN(fout);\r
9200 \r
9201    fprintf(fout, "\nEstimated rates from ad hoc rate smoothing\n");\r
9202    for(locus=0,k=k0=com.ntime-1; locus<data.ngene; k0+=data.nbrate[locus++]) {\r
9203 \r
9204       UseLocus(locus, -1, 0, 1);\r
9205       for(i=0; i<tree.nnode; i++)\r
9206          if(i!=tree.root)  nodes[i].label=x[k++];\r
9207       son0=nodes[tree.root].sons[0]; son1=nodes[tree.root].sons[1];\r
9208       t0=nodes[tree.root].age-nodes[son0].age; \r
9209       t1=nodes[tree.root].age-nodes[son1].age; \r
9210       nodes[tree.root].label = (nodes[son0].label*t1+nodes[son1].label*t0)/(t0+t1);\r
9211       SetBranchRates(tree.root);  /* node rates -> branch rates */\r
9212 \r
9213       nu = (data.fix_nu==3 ? *(pnu+locus) : *pnu);\r
9214       fprintf(fout,"\nLocus %d (%d sequences)\n\n", locus+1, com.ns);\r
9215       fprintf(fout,"nu = %.6g\n", nu);\r
9216 \r
9217       /* this block can be deleted? */\r
9218       fprintf(fout, "\nnode \tage \tlength \trate\n");\r
9219       for(i=0; i<tree.nnode; i++,FPN(fout)) {\r
9220          fprintf(fout, "%02d\t%.3f", i+1,nodes[i].age);\r
9221          if(i!=tree.root) \r
9222             fprintf(fout, "\t%.5f\t%.5f", nodes[i].branch,nodes[i].label);\r
9223       }\r
9224 \r
9225       fprintf(fout,"\nRates as labels in tree:\n"); \r
9226       OutTreeN(fout,1,PrLabel); FPN(fout);  fflush(fout);\r
9227 \r
9228       if(data.nbrate[locus]>maxnbrate) error2("too many rate classes?  Change source.");\r
9229       for(i=0,minr=1e6,maxr=0; i<tree.nnode; i++)\r
9230          if(i!=tree.root) {\r
9231             r=nodes[i].label;\r
9232             if(r<0 && noisy) \r
9233                puts("node label<0?");\r
9234             minr = min2(minr,r);\r
9235             maxr = max2(maxr,r);\r
9236          }\r
9237 \r
9238       fprintf(fdist, "\n%6d\n", tree.nnode-1);\r
9239       for(i=0; i<tree.nnode; i++) {\r
9240          if(i==tree.root) continue;\r
9241          fprintf(fdist, "R%-10.7f  ", nodes[i].label);\r
9242          for(j=0; j<i; j++)\r
9243             if(j!=tree.root)\r
9244                fprintf(fdist, " %9.6f", fabs(nodes[i].label-nodes[j].label));\r
9245          FPN(fdist);\r
9246       }\r
9247       fflush(fdist);\r
9248 /*\r
9249       for(j=0; j<data.nbrate[locus]; j++)\r
9250          Rj[j]=minr+(j+1)*(maxr-minr)/data.nbrate[locus];\r
9251 */\r
9252       beta = pow(1/(data.nbrate[locus]+1.), 1/(data.nbrate[locus]-1.));\r
9253       beta = 0.25+0.25*log((double)data.nbrate[locus]);\r
9254       if(beta>1) beta=0.99;\r
9255       for(j=0; j<data.nbrate[locus]; j++)\r
9256          Rj[j]=minr+(maxr-minr)*pow(beta, data.nbrate[locus]-1.-j);\r
9257 \r
9258 printf("\nLocus %d: nu = %.6f, rate range (%.6f, %.6f)\n", locus+1,nu,minr,maxr);\r
9259 printf("Cutting points:\n");\r
9260 for(j=0; j<data.nbrate[locus]; j++)\r
9261    printf(" < %.6f, ", Rj[j]);\r
9262 printf("\nThe number of rate groups (0 for no change)? ");\r
9263 /* scanf("%d", &j); */\r
9264 j=0;\r
9265 if(j) {\r
9266    data.nbrate[locus]=j;\r
9267    printf("input %d cutting points? ", data.nbrate[locus]-1);\r
9268    for(j=0,Rj[data.nbrate[locus]-1]=maxr; j<data.nbrate[locus]-1; j++)\r
9269       scanf("%lf", &Rj[j]);\r
9270 }\r
9271 \r
9272       for(i=0;i<data.nbrate[locus];i++) { mbrate[i]=0; nbrate[i]=0; }\r
9273       for(i=0; i<tree.nnode; i++) {\r
9274          if(i==tree.root) continue;\r
9275          r=nodes[i].label;\r
9276          for(j=0; j<data.nbrate[locus]-1; j++)\r
9277             if(r<Rj[j]) break;\r
9278          mbrate[j] += r;\r
9279          nbrate[j] ++;\r
9280          nodes[i].label = j;\r
9281       }\r
9282       nodes[tree.root].label=-1;\r
9283       for(i=0;i<data.nbrate[locus];i++) \r
9284          mbrate[i] = (nbrate[i]?mbrate[i]/nbrate[i]:-1);\r
9285 \r
9286       fprintf(fout,"\nCollapsing rates into groups\nRate range: (%.6f, %.6f)\n", minr,maxr);\r
9287 /*      fprintf(fout,"\nCollapsing rates into groups\nbeta = %.6g  Rate range: (%.6f, %.6f)\n", beta, minr,maxr);\r
9288 */\r
9289       for(j=0; j<data.nbrate[locus]; j++)\r
9290          fprintf(fout,"rate group %d  (%2d): <%9.6f, mean %9.6f\n", \r
9291             j, nbrate[j], Rj[j], mbrate[j]);\r
9292 \r
9293       FPN(fout); OutTreeN(fout,1,PrLabel); FPN(fout);\r
9294       fprintf(fout, "\n\nRough rates for branch groups at locus %d\n", locus+1);\r
9295       for(i=0; i<data.nbrate[locus]; i++)\r
9296          x[k0+i] = mbrate[i];\r
9297    }\r
9298 \r
9299 printf("\n\n%d times, %d timerates from AHRS:\n", com.ntime-1,k0);\r
9300 fprintf(fout,"\n\n%d times, %d timerates from AHRS\n", com.ntime-1,k0);\r
9301 for(i=0; i<k0; i++) {\r
9302    printf("%12.6f", x[i]);\r
9303    if(i==com.ntime-2) FPN(F0);\r
9304    fprintf(fout,"%12.6f", x[i]);\r
9305    if(i==com.ntime-2) FPN(fout);\r
9306 }\r
9307 FPN(F0);  FPN(fout);\r
9308 \r
9309    for(i=0; i<k0; i++) x[i]*=0.9+0.2*rndu(); \r
9310    \r
9311    com.model=model0;  com.clock=6;  \r
9312 \r
9313 \r
9314    com.fix_kappa=fix_kappa0; com.kappa=kappa0;\r
9315    com.fix_omega=fix_omega0; com.omega=omega0;\r
9316    com.fix_alpha=fix_alpha0; com.alpha=alpha0;\r
9317 \r
9318 #if 0\r
9319    /* fix parameters: value > 0, precise value unimportant */\r
9320    if(!fix_kappa0) { com.fix_kappa=1; com.kappa=0.1; }\r
9321    if(!fix_omega0) { com.fix_omega=1; com.omega=0.1; }\r
9322    if(!fix_alpha0) { com.fix_alpha=1; com.alpha=0.1; }\r
9323 #endif\r
9324 \r
9325    fclose(fdist);\r
9326    fflush(fout);\r
9327    printf(" %10s\n", printtime(timestr));\r
9328 \r
9329    if(finStep1) fclose(finStep1);\r
9330    if(finStep2) fclose(finStep2);\r
9331 \r
9332    return(0);\r
9333 }\r
9334 \r
9335 \r
9336 void DatingHeteroData (FILE* fout)\r
9337 {\r
9338 /* This is for clock and local-clock dating using heterogeneous data from \r
9339    multiple loci.  Some species might be missing at some loci.  Thus \r
9340    gnodes[locus] stores the gene tree at locus.  Branch lengths in the gene \r
9341    tree are constructed using the divergence times in the master species tree, \r
9342    and the rates for genes and branches.  \r
9343 \r
9344       com.clock = 5: global clock\r
9345                   6: local clock\r
9346 */\r
9347    char timestr[64];\r
9348    int i,j,k, s, np, sconP0=0, locus;\r
9349    double x[NS*6],xb[NS*6][2], lnL,e=1e-7, *var=NULL;\r
9350    int nbrate=4;\r
9351    size_t maxnpML, maxnpADRS;\r
9352 \r
9353    data.fix_nu=3;\r
9354 /*\r
9355 if(com.clock==6) {\r
9356   printf("nu (1:fix; 2:estimate one for all genes; 3:estimate one for every gene)? ");\r
9357   scanf("%d", &data.fix_nu);\r
9358   if(data.fix_nu==1) scanf("%lf", &nu_AHRS);\r
9359 }\r
9360 */\r
9361    ReadTreeSeqs(fout);\r
9362    com.nbtype=1;\r
9363    for(j=0; j<sptree.nnode; j++) {\r
9364       sptree.nodes[j].pfossil[0] = sptree.nodes[j].pfossil[1] = -1;\r
9365    }\r
9366    for(j=sptree.nspecies, com.ntime=j-1, sptree.nfossil=0; j<sptree.nnode; j++) {\r
9367       if(sptree.nodes[j].fossil) {\r
9368          com.ntime--;\r
9369          sptree.nfossil++;\r
9370          printf("node %2d age fixed at %.3f\n", j, sptree.nodes[j].age);\r
9371       }\r
9372    }\r
9373    GetMemBC();\r
9374    s = sptree.nspecies;\r
9375    maxnpML = s-1 + (5+2)*data.ngene;\r
9376    maxnpADRS = s-1 + (2*s-1)*data.ngene + 2*data.ngene;\r
9377    com.sspace = max2(com.sspace, spaceming2(maxnpADRS));\r
9378    com.sspace = max2(com.sspace, maxnpML*(maxnpML+1)*sizeof(double));\r
9379    if((com.space = (double*)realloc(com.space,com.sspace))==NULL) \r
9380       error2("oom space");\r
9381 \r
9382 #if (defined CODEML)\r
9383    GetUVRoot_codeml ();\r
9384 #endif\r
9385    if(com.clock==6) {\r
9386       if(data.fix_nu<=1) {\r
9387          printf("nu & nbrate? ");\r
9388          scanf("%lf%d? ", &nu_AHRS, &nbrate);\r
9389       }\r
9390       for(locus=0; locus<data.ngene; locus++)  \r
9391          data.nbrate[locus] = nbrate;\r
9392       AdHocRateSmoothing(fout, x, xb, com.space);\r
9393 \r
9394       printf("\nStep 3: ML estimation of times and rates.");\r
9395       fprintf(fout,"\n\nStep 3: ML estimation of times and rates.\n");\r
9396    }\r
9397    else {   /* clock = 5, global clock */\r
9398       for(locus=0; locus<data.ngene; locus++) \r
9399          for(i=0,data.nbrate[locus]=1; i<data.ns[locus]*2-1; i++)\r
9400             gnodes[locus][i].label=0;\r
9401    }\r
9402 \r
9403    noisy=3;\r
9404 \r
9405    copySptree();\r
9406    GetInitialsClock56Step3(x);\r
9407    np=com.np;\r
9408 \r
9409    SetxBound (com.np, xb);\r
9410    lnL = lnLfunHeteroData(x,np);\r
9411 \r
9412    if(noisy) {\r
9413       printf("\nntime & nrate & np:%6d%6d%6d\n",com.ntime-1,com.nrate,com.np);\r
9414       matout(F0,x,1,np);\r
9415       printf("\nlnL0 = %12.6f\n",-lnL);\r
9416    }\r
9417 \r
9418    j = ming2(noisy>2?frub:NULL,&lnL,lnLfunHeteroData,NULL,x,xb, com.space,e,np);\r
9419 \r
9420    if(noisy) printf("Out...\nlnL  = %12.6f\n", -lnL);\r
9421    \r
9422    LASTROUND=1;\r
9423    for(i=0,j=!sptree.nodes[sptree.root].fossil; i<sptree.nnode; i++) \r
9424       if(i!=sptree.root && sptree.nodes[i].nson && !sptree.nodes[i].fossil) \r
9425          x[j++]=sptree.nodes[i].age;       /* copy node ages into x[] */\r
9426 \r
9427    if (com.getSE) {\r
9428       if(np>100 || (com.seqtype && np>20)) puts("Calculating SE's");\r
9429       var=com.space+np;\r
9430       Hessian (np,x,lnL,com.space,var,lnLfunHeteroData,var+np*np);\r
9431       matinv(var,np,np,var+np*np);\r
9432    }\r
9433    copySptree();\r
9434    SetBranch(x);\r
9435    fprintf(fout,"\n\nTree:  ");  OutTreeN(fout,0,0);\r
9436    fprintf(fout,"\nlnL(ntime:%3d  np:%3d):%14.6f\n", com.ntime-1,np,-lnL);\r
9437    OutTreeB(fout);  FPN (fout);\r
9438    for(i=0;i<np;i++) fprintf(fout," %9.5f",x[i]);  FPN(fout);  fflush(fout);\r
9439 \r
9440    if(com.getSE) {\r
9441       fprintf(fout,"SEs for parameters:\n");\r
9442       for(i=0;i<np;i++) fprintf(fout," %9.5f",(var[i*np+i]>0.?sqrt(var[i*np+i]):-1));\r
9443       FPN(fout);\r
9444       if (com.getSE==2) matout2(fout, var, np, np, 15, 10);\r
9445    }\r
9446 \r
9447    fprintf(fout,"\nTree with node ages for TreeView\n");\r
9448    FOR(i,tree.nnode) nodes[i].branch*=100;\r
9449    FPN(fout);  OutTreeN(fout,1,1);  FPN(fout);\r
9450    FPN(fout);  OutTreeN(fout,1,PrNodeNum);  FPN(fout);\r
9451    FPN(fout);  OutTreeN(fout,1,PrLabel|PrAge);  FPN(fout);\r
9452    FPN(fout);  OutTreeN(fout,1,0);  FPN(fout);\r
9453    OutputTimesRates(fout, x, var);\r
9454 \r
9455    fprintf(fout,"\nSubstititon rates for genes (per time unit)\n");\r
9456    for(j=0,k=com.ntime-1; j<data.ngene; j++,FPN(fout)) {\r
9457       fprintf(fout,"   Gene %2d: ", j+1);\r
9458       for(i=0; i<data.nbrate[j]; i++,k++) {\r
9459          fprintf(fout,"%10.5f", x[k]);\r
9460          if(com.getSE) fprintf(fout," +- %.5f", sqrt(var[k*np+k]));\r
9461       }\r
9462       if(com.clock==6) fprintf(fout," ");\r
9463    }\r
9464    if(!com.fix_kappa) {\r
9465       fprintf(fout,"\nkappa for genes\n");\r
9466       for(j=0; j<data.ngene; j++,k++) {\r
9467          fprintf(fout,"%10.5f", data.kappa[j]);\r
9468          if(com.getSE) fprintf(fout," +- %.5f", sqrt(var[k*np+k]));\r
9469       }\r
9470    }\r
9471    if(!com.fix_omega) {\r
9472       fprintf(fout,"\nomega for genes\n");\r
9473       for(j=0; j<data.ngene; j++,k++) {\r
9474          fprintf(fout,"%10.5f", data.omega[j]);\r
9475          if(com.getSE) fprintf(fout," +- %.5f", sqrt(var[k*np+k]));\r
9476       }\r
9477    }\r
9478    if(!com.fix_alpha) {\r
9479       fprintf(fout,"\nalpha for genes\n");\r
9480       for(j=0; j<data.ngene; j++,k++) {\r
9481          fprintf(fout,"%10.5f", data.alpha[j]);\r
9482          if(com.getSE) fprintf(fout," +- %.5f", sqrt(var[k*np+k]));\r
9483       }\r
9484    }\r
9485    FPN(fout);\r
9486    FreeMemBC();\r
9487    printf("\nTime used: %s\n", printtime(timestr));\r
9488    exit(0);\r
9489 }\r
9490 \r
9491 #endif\r