src/likelihood.c

   1 /*
   2  *  MrBayes 3
   3  *
   4  *  (c) 2002-2013
   5  *
   6  *  John P. Huelsenbeck
   7  *  Dept. Integrative Biology
   8  *  University of California, Berkeley
   9  *  Berkeley, CA 94720-3140
  10  *  johnh@berkeley.edu
  11  *
  12  *  Fredrik Ronquist
  13  *  Swedish Museum of Natural History
  14  *  Box 50007
  15  *  SE-10405 Stockholm, SWEDEN
  16  *  fredrik.ronquist@nrm.se
  17  *
  18  *  With important contributions by
  19  *
  20  *  Paul van der Mark (paulvdm@sc.fsu.edu)
  21  *  Maxim Teslenko (maxim.teslenko@nrm.se)
  22  *
  23  *  and by many users (run 'acknowledgments' to see more info)
  24  *
  25  * This program is free software; you can redistribute it and/or
  26  * modify it under the terms of the GNU General Public License
  27  * as published by the Free Software Foundation; either version 2
  28  * of the License, or (at your option) any later version.
  29  *
  30  * This program is distributed in the hope that it will be useful,
  31  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  32  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  33  * GNU General Public License for more details (www.gnu.org).
  34  *
  35  */
  36
  37 #include "bayes.h"
  38 #include "likelihood.h"
  39 #include "mbbeagle.h"
  40 #include "model.h"
  41 #include "utils.h"
  42
  43 const char* const svnRevisionLikeliC = "$Rev: 1003 $";   /* Revision keyword which is expanded/updated by svn on each commit/update */
  44
  45 #define LIKE_EPSILON                1.0e-300
  46
  47 extern int      *chainId;
  48 extern int      numLocalChains;
  49 extern int      rateProbRowSize;            /* size of rate probs for one chain one state   */
  50 extern MrBFlt   **rateProbs;                /* pointers to rate probs used by adgamma model */
  51
  52 /* local prototypes */
  53 void      CopySiteScalers (ModelInfo *m, int chain);
  54 void      FlipCondLikeSpace (ModelInfo *m, int chain, int nodeIndex);
  55 void      FlipCijkSpace (ModelInfo *m, int chain);
  56 void      FlipNodeScalerSpace (ModelInfo *m, int chain, int nodeIndex);
  57 void      FlipSiteScalerSpace (ModelInfo *m, int chain);
  58 void      FlipTiProbsSpace (ModelInfo *m, int chain, int nodeIndex);
  59 MrBFlt    GetRate (int division, int chain);
  60 int       RemoveNodeScalers(TreeNode *p, int division, int chain);
  61 int       RemoveNodeScalers_SSE(TreeNode *p, int division, int chain);
  62 void      ResetSiteScalers (ModelInfo *m, int chain);
  63 int       UpDateCijk (int whichPart, int whichChain);
  64
  65
  66 #if !defined (SSE_ENABLED) || 1
  67 /*----------------------------------------------------------------
  68 |
  69 |   CondLikeDown_Bin: binary model with or without rate
  70 |       variation
  71 |
  72 -----------------------------------------------------------------*/
  73 int CondLikeDown_Bin (TreeNode *p, int division, int chain)
  74 {
  75     int             c, k;
  76     CLFlt           *clL, *clR, *clP, *pL, *pR, *tiPL, *tiPR;
  77     ModelInfo       *m;
  78
  79     /* find model settings for this division */
  80     m = &modelSettings[division];
  81
  82     /* Flip conditional likelihood space */
  83     FlipCondLikeSpace (m, chain, p->index);
  84
  85     /* find conditional likelihood pointers */
  86     clL = m->condLikes[m->condLikeIndex[chain][p->left->index ]];
  87     clR = m->condLikes[m->condLikeIndex[chain][p->right->index]];
  88     clP = m->condLikes[m->condLikeIndex[chain][p->index       ]];
  89
  90     /* find transition probabilities */
  91     pL = m->tiProbs[m->tiProbsIndex[chain][p->left->index ]];
  92     pR = m->tiProbs[m->tiProbsIndex[chain][p->right->index]];
  93
  94     tiPL = pL;
  95     tiPR = pR;
  96     for (k=0; k<m->numGammaCats; k++)
  97         {
  98         for (c=0; c<m->numChars; c++)
  99             {
 100             *(clP++) = (tiPL[0]*clL[0] + tiPL[1]*clL[1])
 101                       *(tiPR[0]*clR[0] + tiPR[1]*clR[1]);
 102             *(clP++) = (tiPL[2]*clL[0] + tiPL[3]*clL[1])
 103                       *(tiPR[2]*clR[0] + tiPR[3]*clR[1]);
 104
 105             clL += 2;
 106             clR += 2;
 107             }
 108         tiPL += 4;
 109         tiPR += 4;
 110         }
 111
 112     return NO_ERROR;
 113
 114 }
 115 #endif
 116
 117
 118 #if defined (SSE_ENABLED)
 119 /*----------------------------------------------------------------
 120 |
 121 |   CondLikeDown_Bin_SSE: binary model with or without rate
 122 |       variation
 123 |
 124 -----------------------------------------------------------------*/
 125 int CondLikeDown_Bin_SSE (TreeNode *p, int division, int chain)
 126 {
 127     int             c, k;
 128     CLFlt           *pL, *pR, *tiPL, *tiPR;
 129     __m128          *clL, *clR, *clP;
 130     __m128          m1, m2, m3, m4, m5, m6;
 131     ModelInfo       *m;
 132
 133     m = &modelSettings[division];
 134
 135     /* flip state of node so that we are not overwriting old cond likes */
 136     FlipCondLikeSpace (m, chain, p->index);
 137
 138     /* find conditional likelihood pointers */
 139     clL = (__m128 *) m->condLikes[m->condLikeIndex[chain][p->left->index ]];
 140     clR = (__m128 *) m->condLikes[m->condLikeIndex[chain][p->right->index]];
 141     clP = (__m128 *) m->condLikes[m->condLikeIndex[chain][p->index       ]];
 142
 143     /* find transition probabilities */
 144     pL = m->tiProbs[m->tiProbsIndex[chain][p->left->index ]];
 145     pR = m->tiProbs[m->tiProbsIndex[chain][p->right->index]];
 146
 147     tiPL = pL;
 148     tiPR = pR;
 149     for (k=0; k<m->numGammaCats; k++)
 150         {
 151         for (c=0; c<m->numSSEChars; c++)
 152             {
 153             m1 = _mm_load1_ps (&tiPL[0]);
 154             m2 = _mm_load1_ps (&tiPR[0]);
 155             m5 = _mm_mul_ps (m1, clL[0]);
 156             m6 = _mm_mul_ps (m2, clR[0]);
 157
 158             m1 = _mm_load1_ps (&tiPL[1]);
 159             m2 = _mm_load1_ps (&tiPR[1]);
 160             m3 = _mm_mul_ps (m1, clL[1]);
 161             m4 = _mm_mul_ps (m2, clR[1]);
 162
 163             m5 = _mm_add_ps (m3, m5);
 164             m6 = _mm_add_ps (m4, m6);
 165
 166             *clP++ = _mm_mul_ps (m5, m6);
 167
 168             m1 = _mm_load1_ps (&tiPL[2]);
 169             m2 = _mm_load1_ps (&tiPR[2]);
 170             m5 = _mm_mul_ps (m1, clL[0]);
 171             m6 = _mm_mul_ps (m2, clR[0]);
 172
 173             m1 = _mm_load1_ps (&tiPL[3]);
 174             m2 = _mm_load1_ps (&tiPR[3]);
 175             m3 = _mm_mul_ps (m1, clL[1]);
 176             m4 = _mm_mul_ps (m2, clR[1]);
 177
 178             m5 = _mm_add_ps (m3, m5);
 179             m6 = _mm_add_ps (m4, m6);
 180
 181             *clP++ = _mm_mul_ps (m5, m6);
 182             clL += 2;
 183             clR += 2;
 184             }
 185         tiPL += 4;
 186         tiPR += 4;
 187         }
 188
 189     return NO_ERROR;
 190 }
 191 #endif
 192
 193
 194 /*----------------------------------------------------------------
 195 |
 196 |   CondLikeDown_Gen: general n-state model with or without rate
 197 |       variation
 198 |
 199 -----------------------------------------------------------------*/
 200 int CondLikeDown_Gen (TreeNode *p, int division, int chain)
 201 {
 202     int             a, b, c, h, i, k, j, shortCut, *lState=NULL, *rState=NULL,
 203                     nObsStates, nStates, nStatesSquared, preLikeJump;
 204     CLFlt           likeL, likeR, *pL, *pR, *tiPL, *tiPR, *clL, *clR, *clP;
 205     ModelInfo       *m;
 206 #   if !defined (DEBUG_NOSHORTCUTS)
 207     int catStart;
 208 #   endif
 209
 210     /* find model settings for this division and nStates, nStatesSquared */
 211     m = &modelSettings[division];
 212     nObsStates = m->numStates;
 213     nStates = m->numModelStates;
 214     nStatesSquared = nStates * nStates;
 215     preLikeJump = nObsStates * nStates;
 216
 217     /* flip conditional likelihood space */
 218     FlipCondLikeSpace (m, chain, p->index);
 219
 220     /* find conditional likelihood pointers */
 221     clL = m->condLikes[m->condLikeIndex[chain][p->left->index ]];
 222     clR = m->condLikes[m->condLikeIndex[chain][p->right->index]];
 223     clP = m->condLikes[m->condLikeIndex[chain][p->index       ]];
 224
 225     /* find transition probabilities */
 226     pL = m->tiProbs[m->tiProbsIndex[chain][p->left->index ]];
 227     pR = m->tiProbs[m->tiProbsIndex[chain][p->right->index]];
 228
 229     /* find likelihoods of site patterns for left branch if terminal */
 230     shortCut = 0;
 231 #   if !defined (DEBUG_NOSHORTCUTS)
 232     if (p->left->left == NULL && m->isPartAmbig[p->left->index] == NO)
 233         {
 234         shortCut |= 1;
 235         lState = m->termState[p->left->index];
 236         tiPL = pL;
 237         for (k=a=0; k<m->numGammaCats; k++)
 238             {
 239             catStart = a;
 240             for (i=0; i<nObsStates; i++)
 241                 for (j=i; j<nStatesSquared; j+=nStates)
 242                     preLikeL[a++] = tiPL[j];
 243             for (b=1; b<nStates/nObsStates; b++)
 244                 {
 245                 a = catStart;
 246                 for (i=0; i<nObsStates; i++)
 247                     {
 248                     for (j=i+b*nObsStates; j<nStatesSquared; j+=nStates)
 249                         preLikeL[a++] += tiPL[j];
 250                     }
 251                 }
 252             /* for ambiguous */
 253             for (i=0; i<nStates; i++)
 254                 preLikeL[a++] = 1.0;
 255             tiPL += nStatesSquared;
 256             }
 257         }
 258
 259     /* find likelihoods of site patterns for right branch if terminal */
 260     if (p->right->left == NULL && m->isPartAmbig[p->right->index] == NO)
 261         {
 262         shortCut |= 2;
 263         rState = m->termState[p->right->index];
 264         tiPR = pR;
 265         for (k=a=0; k<m->numGammaCats; k++)
 266             {
 267             catStart = a;
 268             for (i=0; i<nObsStates; i++)
 269                 for (j=i; j<nStatesSquared; j+=nStates)
 270                     preLikeR[a++] = tiPR[j];
 271             for (b=1; b<nStates/nObsStates; b++)
 272                 {
 273                 a = catStart;
 274                 for (i=0; i<nObsStates; i++)
 275                     {
 276                     for (j=i+b*nObsStates; j<nStatesSquared; j+=nStates)
 277                         preLikeR[a++] += tiPR[j];
 278                     }
 279                 }
 280             /* for ambiguous */
 281             for (i=0; i<nStates; i++)
 282                 preLikeR[a++] = 1.0;
 283             tiPR += nStatesSquared;
 284             }
 285         }
 286 #   endif
 287     switch (shortCut)
 288         {
 289         case 0:
 290             tiPL = pL;
 291             tiPR = pR;
 292             for (k=0; k<m->numGammaCats; k++)
 293                 {
 294                 for (c=0; c<m->numChars; c++)
 295                     {
 296                     for (i=h=0; i<nStates; i++)
 297                         {
 298                         likeL = likeR = 0.0;
 299                         for (j=0; j<nStates; j++)
 300                             {
 301                             likeL += tiPL[h]*clL[j];
 302                             likeR += tiPR[h++]*clR[j];
 303                             }
 304                         *(clP++) = likeL * likeR;
 305                         }
 306                     clL += nStates;
 307                     clR += nStates;
 308                     }
 309                 tiPL += nStatesSquared;
 310                 tiPR += nStatesSquared;
 311                 }
 312             break;
 313         case 1:
 314             tiPR = pR;
 315             for (k=0; k<m->numGammaCats; k++)
 316                 {
 317                 for (c=0; c<m->numChars; c++)
 318                     {
 319                     a = lState[c] + k*(preLikeJump+nStates);
 320                     for (i=h=0; i<nStates; i++)
 321                         {
 322                         likeR = 0.0;
 323                         for (j=0; j<nStates; j++)
 324                             {
 325                             likeR += tiPR[h++]*clR[j];
 326                             }
 327                         *(clP++) = preLikeL[a++] * likeR;
 328                         }
 329                     clR += nStates;
 330                     }
 331                 tiPR += nStatesSquared;
 332                 }
 333             break;
 334         case 2:
 335             tiPL = pL;
 336             for (k=0; k<m->numGammaCats; k++)
 337                 {
 338                 for (c=0; c<m->numChars; c++)
 339                     {
 340                     a = rState[c] + k*(preLikeJump+nStates);
 341                     for (i=h=0; i<nStates; i++)
 342                         {
 343                         likeL = 0.0;
 344                         for (j=0; j<nStates; j++)
 345                             {
 346                             likeL += tiPL[h++]*clL[j];
 347                             }
 348                         *(clP++) = preLikeR[a++] * likeL;
 349                         }
 350                     clL += nStates;
 351                     }
 352                 tiPL += nStatesSquared;
 353                 }
 354             break;
 355         case 3:
 356             for (k=0; k<m->numGammaCats; k++)
 357                 {
 358                 for (c=0; c<m->numChars; c++)
 359                     {
 360                     a = rState[c] + k*(preLikeJump+nStates);
 361                     b = lState[c] + k*(preLikeJump+nStates);
 362                     for (i=0; i<nStates; i++)
 363                         {
 364                         *(clP++) = preLikeR[a++] * preLikeL[b++];
 365                         }
 366                     }
 367                 }
 368             break;
 369         }
 370
 371     return NO_ERROR;
 372 }
 373
 374
 375 #if defined (SSE_ENABLED)
 376 /*----------------------------------------------------------------
 377 |
 378 |   CondLikeDown_Gen_SSE: general n-state model with or without rate
 379 |       variation
 380 |
 381 -----------------------------------------------------------------*/
 382 int CondLikeDown_Gen_SSE (TreeNode *p, int division, int chain)
 383 {
 384     int             c, c1, h, i, j, k, t, shortCut, *lState=NULL, *rState=NULL, nStates, nStatesSquared, nObsStates, preLikeJump;
 385     CLFlt           *pL, *pR, *tiPL, *tiPR;
 386     __m128          *clL, *clR, *clP;
 387     __m128          mTiPL, mTiPR, mL, mR, mAcumL, mAcumR;
 388     ModelInfo       *m;
 389     CLFlt           *preLikeRV[FLOATS_PER_VEC];
 390     CLFlt           *preLikeLV[FLOATS_PER_VEC];
 391
 392 #   if !defined (DEBUG_NOSHORTCUTS)
 393     int             a, b, catStart;
 394 #   endif
 395
 396     /* find model settings for this division and nStates, nStatesSquared */
 397     m = &modelSettings[division];
 398     nObsStates = m->numStates;
 399     nStates = m->numModelStates;
 400     nStatesSquared = nStates * nStates;
 401     preLikeJump = nObsStates * nStates;
 402
 403     /* Flip conditional likelihood space */
 404     FlipCondLikeSpace (m, chain, p->index);
 405
 406     /* find conditional likelihood pointers */
 407     clL = (__m128 *)m->condLikes[m->condLikeIndex[chain][p->left->index ]];
 408     clR = (__m128 *)m->condLikes[m->condLikeIndex[chain][p->right->index]];
 409     clP = (__m128 *)m->condLikes[m->condLikeIndex[chain][p->index       ]];
 410
 411     /* find transition probabilities */
 412     pL = m->tiProbs[m->tiProbsIndex[chain][p->left->index ]];
 413     pR = m->tiProbs[m->tiProbsIndex[chain][p->right->index]];
 414
 415     /* find likelihoods of site patterns for left branch if terminal */
 416     shortCut = 0;
 417 #   if !defined (DEBUG_NOSHORTCUTS)
 418     if (p->left->left == NULL && m->isPartAmbig[p->left->index] == NO)
 419         {
 420         shortCut |= 1;
 421         lState = m->termState[p->left->index];
 422         tiPL = pL;
 423         for (k=a=0; k<m->numGammaCats; k++)
 424             {
 425             catStart = a;
 426             for (i=0; i<nObsStates; i++)
 427                 for (j=i; j<nStatesSquared; j+=nStates)
 428                     preLikeL[a++] = tiPL[j];
 429             for (b=1; b<nStates/nObsStates; b++)
 430                 {
 431                 a = catStart;
 432                 for (i=0; i<nObsStates; i++)
 433                     {
 434                     for (j=i+b*nObsStates; j<nStatesSquared; j+=nStates)
 435                         preLikeL[a++] += tiPL[j];
 436                     }
 437                 }
 438             /* for ambiguous */
 439             for (i=0; i<nStates; i++)
 440                 preLikeL[a++] = 1.0;
 441             tiPL += nStatesSquared;
 442             }
 443         }
 444
 445     /* find likelihoods of site patterns for right branch if terminal */
 446     if (p->right->left == NULL && m->isPartAmbig[p->right->index] == NO)
 447         {
 448         shortCut |= 2;
 449         rState = m->termState[p->right->index];
 450         tiPR = pR;
 451         for (k=a=0; k<m->numGammaCats; k++)
 452             {
 453             catStart = a;
 454             for (i=0; i<nObsStates; i++)
 455                 for (j=i; j<nStatesSquared; j+=nStates)
 456                     preLikeR[a++] = tiPR[j];
 457             for (b=1; b<nStates/nObsStates; b++)
 458                 {
 459                 a = catStart;
 460                 for (i=0; i<nObsStates; i++)
 461                     {
 462                     for (j=i+b*nObsStates; j<nStatesSquared; j+=nStates)
 463                         preLikeR[a++] += tiPR[j];
 464                     }
 465                 }
 466             /* for ambiguous */
 467             for (i=0; i<nStates; i++)
 468                 preLikeR[a++] = 1.0;
 469             tiPR += nStatesSquared;
 470             }
 471         }
 472 #   endif
 473
 474     switch (shortCut)
 475         {
 476         case 0:
 477             tiPL = pL;
 478             tiPR = pR;
 479             for (k=0; k<m->numGammaCats; k++)
 480                 {
 481                 for (c=0; c<m->numSSEChars; c++)
 482                     {
 483                     for (i=h=0; i<nStates; i++)
 484                         {
 485                         mAcumL = _mm_setzero_ps();
 486                         mAcumR = _mm_setzero_ps();
 487                         for (j=0; j<nStates; j++)
 488                             {
 489                             mTiPL  = _mm_load1_ps (&tiPL[h]);
 490                             mTiPR  = _mm_load1_ps (&tiPR[h++]);
 491                             mL     = _mm_mul_ps (mTiPL, clL[j]);
 492                             mR     = _mm_mul_ps (mTiPR, clR[j]);
 493                             mAcumL = _mm_add_ps (mL, mAcumL);
 494                             mAcumR = _mm_add_ps (mR, mAcumR);
 495                             }
 496                         *(clP++) = _mm_mul_ps (mAcumL, mAcumR);
 497                         }
 498                     clL += nStates;
 499                     clR += nStates;
 500                     }
 501                 tiPL += nStatesSquared;
 502                 tiPR += nStatesSquared;
 503                 }
 504             break;
 505         case 1:
 506             tiPR = pR;
 507             for (k=0; k<m->numGammaCats; k++)
 508                 {
 509                 for (c=t=0; c<m->numSSEChars; c++)
 510                     {
 511                     for (c1=0; c1<FLOATS_PER_VEC; c1++,t++)
 512                         {
 513                         preLikeLV[c1] = &preLikeL[lState[t] + k*(preLikeJump+nStates)];
 514                         }
 515                     for (i=h=0; i<nStates; i++)
 516                         {
 517                         assert (FLOATS_PER_VEC == 4); /* In the following statment we assume that SSE register can hold exactly 4 ClFlts. */
 518                         mAcumL = _mm_set_ps (*(preLikeLV[3]++), *(preLikeLV[2]++), *(preLikeLV[1]++), *(preLikeLV[0]++));
 519                         mAcumR = _mm_setzero_ps();
 520                         for (j=0; j<nStates; j++)
 521                             {
 522                             mTiPR  = _mm_load1_ps (&tiPR[h++]);
 523                             mR     = _mm_mul_ps (mTiPR, clR[j]);
 524                             mAcumR = _mm_add_ps (mR, mAcumR);
 525                             }
 526                         *(clP++) = _mm_mul_ps (mAcumL,mAcumR);
 527                         }
 528                     clR += nStates;
 529                     }
 530                 tiPR += nStatesSquared;
 531                 }
 532             break;
 533         case 2:
 534             tiPL = pL;
 535             for (k=0; k<m->numGammaCats; k++)
 536                 {
 537                 for (c=t=0; c<m->numSSEChars; c++)
 538                     {
 539                     for (c1=0; c1<FLOATS_PER_VEC; c1++,t++)
 540                         {
 541                         preLikeRV[c1] = &preLikeR[rState[t] + k*(preLikeJump+nStates)];
 542                         }
 543                     for (i=h=0; i<nStates; i++)
 544                         {
 545                         assert (FLOATS_PER_VEC == 4); /* In the following statment we assume that SSE register can hold exactly 4 ClFlts. */
 546                         mAcumR = _mm_set_ps (*(preLikeRV[3]++), *(preLikeRV[2]++), *(preLikeRV[1]++), *(preLikeRV[0]++));
 547                         mAcumL = _mm_setzero_ps();
 548                         for (j=0; j<nStates; j++)
 549                             {
 550                             mTiPL  = _mm_load1_ps (&tiPL[h++]);
 551                             mL     = _mm_mul_ps (mTiPL, clL[j]);
 552                             mAcumL = _mm_add_ps (mL, mAcumL);
 553                             }
 554                         *(clP++) = _mm_mul_ps (mAcumL,mAcumR);
 555                         }
 556                     clL += nStates;
 557                     }
 558                 tiPL += nStatesSquared;
 559                 }
 560             break;
 561         case 3:
 562             for (k=0; k<m->numGammaCats; k++)
 563                 {
 564                 for (c=t=0; c<m->numSSEChars; c++)
 565                     {
 566                     for (c1=0; c1<FLOATS_PER_VEC; c1++,t++)
 567                         {
 568                         preLikeRV[c1] = &preLikeR[rState[t] + k*(preLikeJump+nStates)];
 569                         preLikeLV[c1] = &preLikeL[lState[t] + k*(preLikeJump+nStates)];
 570                         }
 571                     for (i=0; i<nStates; i++)
 572                         {
 573                         assert (FLOATS_PER_VEC == 4); /* In the following 2 statments we assume that SSE register can hold exactly 4 ClFlts. */
 574                         mL = _mm_set_ps (*(preLikeLV[3]++), *(preLikeLV[2]++), *(preLikeLV[1]++), *(preLikeLV[0]++));
 575                         mR = _mm_set_ps (*(preLikeRV[3]++), *(preLikeRV[2]++), *(preLikeRV[1]++), *(preLikeRV[0]++));
 576                         *(clP++) = _mm_mul_ps (mL,mR);
 577                         }
 578                     }
 579                 }
 580             break;
 581         }
 582     return NO_ERROR;
 583 }
 584 #endif
 585
 586
 587 /*----------------------------------------------------------------
 588 |
 589 |   CondLikeDown_Gen_GibbsGamma: general n-state model with rate
 590 |       variation modeled using discrete gamma with Gibbs resampling
 591 |
 592 -----------------------------------------------------------------*/
 593 int CondLikeDown_Gen_GibbsGamma (TreeNode *p, int division, int chain)
 594 {
 595     int             a, b, c, i, j, r, *rateCat, shortCut, *lState=NULL, *rState=NULL,
 596                     nObsStates, nStates, nStatesSquared, nGammaCats;
 597     CLFlt           likeL, likeR, *pL, *pR, *tiPL, *tiPR, *clL, *clR, *clP;
 598     ModelInfo       *m;
 599 #   if !defined (DEBUG_NOSHORTCUTS)
 600     int k, catStart;
 601 #   endif
 602
 603     /* find model settings for this division and nStates, nStatesSquared */
 604     m = &modelSettings[division];
 605     nObsStates = m->numStates;
 606     nStates = m->numModelStates;
 607     nStatesSquared = nStates * nStates;
 608
 609     /* flip conditional likelihood space */
 610     FlipCondLikeSpace (m, chain, p->index);
 611
 612     /* find conditional likelihood pointers */
 613     clL = m->condLikes[m->condLikeIndex[chain][p->left->index ]];
 614     clR = m->condLikes[m->condLikeIndex[chain][p->right->index]];
 615     clP = m->condLikes[m->condLikeIndex[chain][p->index       ]];
 616
 617     /* find transition probabilities */
 618     pL = m->tiProbs[m->tiProbsIndex[chain][p->left->index ]];
 619     pR = m->tiProbs[m->tiProbsIndex[chain][p->right->index]];
 620
 621     /* find rate category index and number of gamma categories */
 622     rateCat = m->tiIndex + chain * m->numChars;
 623     nGammaCats = m->numGammaCats;
 624
 625     /* find likelihoods of site patterns for left branch if terminal */
 626     shortCut = 0;
 627 #   if !defined (DEBUG_NOSHORTCUTS)
 628     if (p->left->left == NULL && m->isPartAmbig[p->left->index] == NO)
 629         {
 630         shortCut |= 1;
 631         lState = m->termState[p->left->index];
 632         tiPL = pL;
 633         for (k=a=0; k<nGammaCats; k++)
 634             {
 635             catStart = a;
 636             for (i=0; i<nObsStates; i++)
 637                 for (j=i; j<nStatesSquared; j+=nStates)
 638                     preLikeL[a++] = tiPL[j];
 639             for (b=1; b<nStates/nObsStates; b++)
 640                 {
 641                 a = catStart;
 642                 for (i=0; i<nObsStates; i++)
 643                     {
 644                     for (j=i+b*nObsStates; j<nStatesSquared; j+=nStates)
 645                         preLikeL[a++] += tiPL[j];
 646                     }
 647                 }
 648             /* for ambiguous */
 649             for (i=0; i<nStates; i++)
 650                 preLikeL[a++] = 1.0;
 651             tiPL += nStatesSquared;
 652             }
 653         }
 654
 655     /* find likelihoods of site patterns for right branch if terminal */
 656     if (p->right->left == NULL && m->isPartAmbig[p->right->index] == NO)
 657         {
 658         shortCut |= 2;
 659         rState = m->termState[p->right->index];
 660         tiPR = pR;
 661         for (k=a=0; k<nGammaCats; k++)
 662             {
 663             catStart = a;
 664             for (i=0; i<nObsStates; i++)
 665                 for (j=i; j<nStatesSquared; j+=nStates)
 666                     preLikeR[a++] = tiPR[j];
 667             for (b=1; b<nStates/nObsStates; b++)
 668                 {
 669                 a = catStart;
 670                 for (i=0; i<nObsStates; i++)
 671                     {
 672                     for (j=i+b*nObsStates; j<nStatesSquared; j+=nStates)
 673                         preLikeR[a++] += tiPR[j];
 674                     }
 675                 }
 676             /* for ambiguous */
 677             for (i=0; i<nStates; i++)
 678                 preLikeR[a++] = 1.0;
 679             tiPR += nStatesSquared;
 680             }
 681         }
 682 #   endif
 683
 684     switch (shortCut)
 685         {
 686         case 0:
 687             for (c=0; c<m->numChars; c++)
 688                 {
 689                 r = (*rateCat++);
 690                 if (r < nGammaCats)
 691                     {
 692                     tiPL = pL + r*nStatesSquared;
 693                     tiPR = pR + r*nStatesSquared;
 694                     for (i=0; i<nStates; i++)
 695                         {
 696                         likeL = likeR = 0.0;
 697                         for (j=0; j<nStates; j++)
 698                             {
 699                             likeL += (*tiPL++) * clL[j];
 700                             likeR += (*tiPR++) * clR[j];
 701                             }
 702                         *(clP++) = likeL * likeR;
 703                         }
 704                     }
 705                 else
 706                     clP += nStates;
 707                 clL += nStates;
 708                 clR += nStates;
 709                 }
 710             break;
 711         case 1:
 712             for (c=0; c<m->numChars; c++)
 713                 {
 714                 r = (*rateCat++);
 715                 if (r < nGammaCats)
 716                     {
 717                     tiPR = pR + r*nStatesSquared;
 718                     a = lState[c] + r*(nStatesSquared+nStates);
 719                     for (i=0; i<nStates; i++)
 720                         {
 721                         likeR = 0.0;
 722                         for (j=0; j<nStates; j++)
 723                             {
 724                             likeR += (*tiPR++)*clR[j];
 725                             }
 726                         *(clP++) = preLikeL[a++] * likeR;
 727                         }
 728                     }
 729                 else
 730                     clP += nStates;
 731                 clR += nStates;
 732                 }
 733             break;
 734         case 2:
 735             for (c=0; c<m->numChars; c++)
 736                 {
 737                 r = (*rateCat++);
 738                 if (r < nGammaCats)
 739                     {
 740                     tiPL = pL + r*nStatesSquared;
 741                     a = rState[c] + r*(nStatesSquared+nStates);
 742                     for (i=0; i<nStates; i++)
 743                         {
 744                         likeL = 0.0;
 745                         for (j=0; j<nStates; j++)
 746                             {
 747                             likeL += (*tiPL++)*clL[j];
 748                             }
 749                         *(clP++) = preLikeR[a++] * likeL;
 750                         }
 751                     }
 752                 else
 753                     clP += nStates;
 754                 clL += nStates;
 755                 }
 756             break;
 757         case 3:
 758             for (c=0; c<m->numChars; c++)
 759                 {
 760                 r = (*rateCat++);
 761                 if (r < nGammaCats)
 762                     {
 763                     a = lState[c] + r*(nStatesSquared+nStates);
 764                     b = rState[c] + r*(nStatesSquared+nStates);
 765                     for (i=0; i<nStates; i++)
 766                         *(clP++) = preLikeL[a++]*preLikeR[b++];
 767                     }
 768                 else
 769                     clP += nStates;
 770                 }
 771             break;
 772         }
 773
 774     return NO_ERROR;
 775 }
 776
 777
 778 /*----------------------------------------------------------------
 779 |
 780 |   CondLikeDown_NUC4: 4by4 nucleotide model with or without rate
 781 |       variation
 782 |
 783 -----------------------------------------------------------------*/
 784 int CondLikeDown_NUC4 (TreeNode *p, int division, int chain)
 785 {
 786     int             c, h, i, j, k, shortCut, *lState=NULL, *rState=NULL;
 787     CLFlt           *clL, *clR, *clP, *pL, *pR, *tiPL, *tiPR;
 788     ModelInfo       *m;
 789
 790     m = &modelSettings[division];
 791
 792     /* flip space so that we do not overwrite old cond likes */
 793     FlipCondLikeSpace (m, chain, p->index);
 794
 795     /* find conditional likelihood pointers */
 796     clL = m->condLikes[m->condLikeIndex[chain][p->left->index ]];
 797     clR = m->condLikes[m->condLikeIndex[chain][p->right->index]];
 798     clP = m->condLikes[m->condLikeIndex[chain][p->index       ]];
 799
 800     /* find transition probabilities */
 801     pL = m->tiProbs[m->tiProbsIndex[chain][p->left->index ]];
 802     pR = m->tiProbs[m->tiProbsIndex[chain][p->right->index]];
 803
 804     /* find likelihoods of site patterns for left branch if terminal */
 805     shortCut = 0;
 806 #   if !defined (DEBUG_NOSHORTCUTS)
 807     if (p->left->left == NULL && m->isPartAmbig[p->left->index] == NO)
 808         {
 809         shortCut |= 1;
 810         lState = m->termState[p->left->index];
 811         tiPL = pL;
 812         for (k=j=0; k<m->numGammaCats; k++)
 813             {
 814             for (i=0; i<4; i++)
 815                 {
 816                 preLikeL[j++] = tiPL[0];
 817                 preLikeL[j++] = tiPL[4];
 818                 preLikeL[j++] = tiPL[8];
 819                 preLikeL[j++] = tiPL[12];
 820                 tiPL++;
 821                 }
 822             /* for ambiguous */
 823             for (i=0; i<4; i++)
 824                 preLikeL[j++] = 1.0;
 825             tiPL += 12;
 826             }
 827         }
 828
 829     /* find likelihoods of site patterns for right branch if terminal */
 830     if (p->right->left == NULL && m->isPartAmbig[p->right->index] == NO)
 831         {
 832         shortCut |= 2;
 833         rState = m->termState[p->right->index];
 834         tiPR = pR;
 835         for (k=j=0; k<m->numGammaCats; k++)
 836             {
 837             for (i=0; i<4; i++)
 838                 {
 839                 preLikeR[j++] = tiPR[0];
 840                 preLikeR[j++] = tiPR[4];
 841                 preLikeR[j++] = tiPR[8];
 842                 preLikeR[j++] = tiPR[12];
 843                 tiPR++;
 844                 }
 845             /* for ambiguous */
 846             for (i=0; i<4; i++)
 847                 preLikeR[j++] = 1.0;
 848             tiPR += 12;
 849             }
 850         }
 851 #   endif
 852
 853     switch (shortCut)
 854         {
 855         case 0:
 856             tiPL = pL;
 857             tiPR = pR;
 858             for (k=h=0; k<m->numGammaCats; k++)
 859                 {
 860                 for (c=0; c<m->numChars; c++)
 861                     {
 862                     clP[h++] =   (tiPL[AA]*clL[A] + tiPL[AC]*clL[C] + tiPL[AG]*clL[G] + tiPL[AT]*clL[T])
 863                                 *(tiPR[AA]*clR[A] + tiPR[AC]*clR[C] + tiPR[AG]*clR[G] + tiPR[AT]*clR[T]);
 864                     clP[h++] =   (tiPL[CA]*clL[A] + tiPL[CC]*clL[C] + tiPL[CG]*clL[G] + tiPL[CT]*clL[T])
 865                                 *(tiPR[CA]*clR[A] + tiPR[CC]*clR[C] + tiPR[CG]*clR[G] + tiPR[CT]*clR[T]);
 866                     clP[h++] =   (tiPL[GA]*clL[A] + tiPL[GC]*clL[C] + tiPL[GG]*clL[G] + tiPL[GT]*clL[T])
 867                                 *(tiPR[GA]*clR[A] + tiPR[GC]*clR[C] + tiPR[GG]*clR[G] + tiPR[GT]*clR[T]);
 868                     clP[h++] =   (tiPL[TA]*clL[A] + tiPL[TC]*clL[C] + tiPL[TG]*clL[G] + tiPL[TT]*clL[T])
 869                                 *(tiPR[TA]*clR[A] + tiPR[TC]*clR[C] + tiPR[TG]*clR[G] + tiPR[TT]*clR[T]);
 870                     clL += 4;
 871                     clR += 4;
 872                     }
 873                 tiPL += 16;
 874                 tiPR += 16;
 875                 }
 876             break;
 877         case 1:
 878             tiPR = pR;
 879             for (k=h=0; k<m->numGammaCats; k++)
 880                 {
 881                 for (c=0; c<m->numChars; c++)
 882                     {
 883                     i = lState[c] + k*20;
 884                     clP[h++] =   preLikeL[i++]
 885                                 *(tiPR[AA]*clR[A] + tiPR[AC]*clR[C] + tiPR[AG]*clR[G] + tiPR[AT]*clR[T]);
 886                     clP[h++] =   preLikeL[i++]
 887                                 *(tiPR[CA]*clR[A] + tiPR[CC]*clR[C] + tiPR[CG]*clR[G] + tiPR[CT]*clR[T]);
 888                     clP[h++] =   preLikeL[i++]
 889                                 *(tiPR[GA]*clR[A] + tiPR[GC]*clR[C] + tiPR[GG]*clR[G] + tiPR[GT]*clR[T]);
 890                     clP[h++] =   preLikeL[i++]
 891                                 *(tiPR[TA]*clR[A] + tiPR[TC]*clR[C] + tiPR[TG]*clR[G] + tiPR[TT]*clR[T]);
 892                     clR += 4;
 893                     }
 894                 tiPR += 16;
 895                 }
 896             break;
 897         case 2:
 898             tiPL = pL;
 899             for (k=h=0; k<m->numGammaCats; k++)
 900                 {
 901                 for (c=0; c<m->numChars; c++)
 902                     {
 903                     i = rState[c] + k*20;
 904                     clP[h++] =   (tiPL[AA]*clL[A] + tiPL[AC]*clL[C] + tiPL[AG]*clL[G] + tiPL[AT]*clL[T])
 905                                 *preLikeR[i++];
 906                     clP[h++] =   (tiPL[CA]*clL[A] + tiPL[CC]*clL[C] + tiPL[CG]*clL[G] + tiPL[CT]*clL[T])
 907                                 *preLikeR[i++];
 908                     clP[h++] =   (tiPL[GA]*clL[A] + tiPL[GC]*clL[C] + tiPL[GG]*clL[G] + tiPL[GT]*clL[T])
 909                                 *preLikeR[i++];
 910                     clP[h++] =   (tiPL[TA]*clL[A] + tiPL[TC]*clL[C] + tiPL[TG]*clL[G] + tiPL[TT]*clL[T])
 911                                 *preLikeR[i++];
 912                     clL += 4;
 913                     }
 914                 tiPL += 16;
 915                 }
 916             break;
 917         case 3:
 918             for (k=h=0; k<m->numGammaCats; k++)
 919                 {
 920                 for (c=0; c<m->numChars; c++)
 921                     {
 922                     i = j = k*20;
 923                     i += lState[c];
 924                     j += rState[c];
 925                     clP[h++] =   preLikeL[i++]*preLikeR[j++];
 926                     clP[h++] =   preLikeL[i++]*preLikeR[j++];
 927                     clP[h++] =   preLikeL[i++]*preLikeR[j++];
 928                     clP[h++] =   preLikeL[i++]*preLikeR[j++];
 929                     }
 930                 }
 931         }
 932
 933     return NO_ERROR;
 934 }
 935
 936
 937 /*----------------------------------------------------------------
 938 |
 939 |   CondLikeDown_NUC4_GibbsGamma: 4by4 nucleotide model with rate
 940 |       variation approximated using Gibbs sampling of gamma
 941 |
 942 -----------------------------------------------------------------*/
 943 int CondLikeDown_NUC4_GibbsGamma (TreeNode *p, int division, int chain)
 944 {
 945     int             c, h, i, j, r, *rateCat, shortCut, *lState=NULL, *rState=NULL,
 946                     nGammaCats;
 947     CLFlt           *clL, *clR, *clP, *pL, *pR, *tiPL, *tiPR;
 948     ModelInfo       *m;
 949 #   if !defined (DEBUG_NOSHORTCUTS)
 950     int k;
 951 #   endif
 952
 953     m = &modelSettings[division];
 954
 955     /* flip conditional likelihood space */
 956     FlipCondLikeSpace (m, chain, p->index);
 957
 958     /* find conditional likelihood pointers */
 959     clL = m->condLikes[m->condLikeIndex[chain][p->left->index ]];
 960     clR = m->condLikes[m->condLikeIndex[chain][p->right->index]];
 961     clP = m->condLikes[m->condLikeIndex[chain][p->index       ]];
 962
 963     /* find transition probabilities */
 964     pL = m->tiProbs[m->tiProbsIndex[chain][p->left->index ]];
 965     pR = m->tiProbs[m->tiProbsIndex[chain][p->right->index]];
 966
 967     /* find rate category index  and number of gamma categories */
 968     rateCat = m->tiIndex + chain * m->numChars;
 969     nGammaCats = m->numGammaCats;
 970
 971     /* find likelihoods of site patterns for left branch if terminal */
 972     shortCut = 0;
 973 #   if !defined (DEBUG_NOSHORTCUTS)
 974     if (p->left->left == NULL && m->isPartAmbig[p->left->index] == NO)
 975         {
 976         shortCut |= 1;
 977         lState = m->termState[p->left->index];
 978         tiPL = pL;
 979         for (k=j=0; k<nGammaCats; k++)
 980             {
 981             for (i=0; i<4; i++)
 982                 {
 983                 preLikeL[j++] = tiPL[0];
 984                 preLikeL[j++] = tiPL[4];
 985                 preLikeL[j++] = tiPL[8];
 986                 preLikeL[j++] = tiPL[12];
 987                 tiPL++;
 988                 }
 989             /* for ambiguous */
 990             for (i=0; i<4; i++)
 991                 preLikeL[j++] = 1.0;
 992             tiPL += 12;
 993             }
 994         }
 995
 996     /* find likelihoods of site patterns for right branch if terminal */
 997     if (p->right->left == NULL && m->isPartAmbig[p->right->index] == NO)
 998         {
 999         shortCut |= 2;
1000         rState =  m->termState[p->right->index];
1001         tiPR = pR;
1002         for (k=j=0; k<nGammaCats; k++)
1003             {
1004             for (i=0; i<4; i++)
1005                 {
1006                 preLikeR[j++] = tiPR[0];
1007                 preLikeR[j++] = tiPR[4];
1008                 preLikeR[j++] = tiPR[8];
1009                 preLikeR[j++] = tiPR[12];
1010                 tiPR++;
1011                 }
1012             /* for ambiguous */
1013             for (i=0; i<4; i++)
1014                 preLikeR[j++] = 1.0;
1015             tiPR += 12;
1016             }
1017         }
1018 #   endif
1019
1020     switch (shortCut)
1021         {
1022         case 0:
1023             for (c=h=0; c<m->numChars; c++)
1024                 {
1025                 r = rateCat[c];
1026                 if (r < nGammaCats)
1027                     {
1028                     tiPL = pL + r * 16;
1029                     tiPR = pR + r * 16;
1030                     clP[h++] =   (tiPL[AA]*clL[A] + tiPL[AC]*clL[C] + tiPL[AG]*clL[G] + tiPL[AT]*clL[T])
1031                                 *(tiPR[AA]*clR[A] + tiPR[AC]*clR[C] + tiPR[AG]*clR[G] + tiPR[AT]*clR[T]);
1032                     clP[h++] =   (tiPL[CA]*clL[A] + tiPL[CC]*clL[C] + tiPL[CG]*clL[G] + tiPL[CT]*clL[T])
1033                                 *(tiPR[CA]*clR[A] + tiPR[CC]*clR[C] + tiPR[CG]*clR[G] + tiPR[CT]*clR[T]);
1034                     clP[h++] =   (tiPL[GA]*clL[A] + tiPL[GC]*clL[C] + tiPL[GG]*clL[G] + tiPL[GT]*clL[T])
1035                                 *(tiPR[GA]*clR[A] + tiPR[GC]*clR[C] + tiPR[GG]*clR[G] + tiPR[GT]*clR[T]);
1036                     clP[h++] =   (tiPL[TA]*clL[A] + tiPL[TC]*clL[C] + tiPL[TG]*clL[G] + tiPL[TT]*clL[T])
1037                                 *(tiPR[TA]*clR[A] + tiPR[TC]*clR[C] + tiPR[TG]*clR[G] + tiPR[TT]*clR[T]);
1038                     }
1039                 else
1040                     h += 4;
1041                 clL += 4;
1042                 clR += 4;
1043                 }
1044             break;
1045         case 1:
1046             for (c=h=0; c<m->numChars; c++)
1047                 {
1048                 r = rateCat[c];
1049                 if (r < nGammaCats)
1050                     {
1051                     tiPR = pR + r * 16;
1052                     i = lState[c] + r * 20;
1053                     clP[h++] =   preLikeL[i++]
1054                                 *(tiPR[AA]*clR[A] + tiPR[AC]*clR[C] + tiPR[AG]*clR[G] + tiPR[AT]*clR[T]);
1055                     clP[h++] =   preLikeL[i++]
1056                                 *(tiPR[CA]*clR[A] + tiPR[CC]*clR[C] + tiPR[CG]*clR[G] + tiPR[CT]*clR[T]);
1057                     clP[h++] =   preLikeL[i++]
1058                                 *(tiPR[GA]*clR[A] + tiPR[GC]*clR[C] + tiPR[GG]*clR[G] + tiPR[GT]*clR[T]);
1059                     clP[h++] =   preLikeL[i++]
1060                                 *(tiPR[TA]*clR[A] + tiPR[TC]*clR[C] + tiPR[TG]*clR[G] + tiPR[TT]*clR[T]);
1061                     }
1062                 else
1063                     h += 4;
1064                 clR += 4;
1065                 }
1066             break;
1067         case 2:
1068             for (c=h=0; c<m->numChars; c++)
1069                 {
1070                 r = rateCat[c];
1071                 if (r < nGammaCats)
1072                     {
1073                     tiPL = pL + r * 16;
1074                     i = rState[c] + r * 20;
1075                     clP[h++] =   (tiPL[AA]*clL[A] + tiPL[AC]*clL[C] + tiPL[AG]*clL[G] + tiPL[AT]*clL[T])
1076                                 *preLikeR[i++];
1077                     clP[h++] =   (tiPL[CA]*clL[A] + tiPL[CC]*clL[C] + tiPL[CG]*clL[G] + tiPL[CT]*clL[T])
1078                                 *preLikeR[i++];
1079                     clP[h++] =   (tiPL[GA]*clL[A] + tiPL[GC]*clL[C] + tiPL[GG]*clL[G] + tiPL[GT]*clL[T])
1080                                 *preLikeR[i++];
1081                     clP[h++] =   (tiPL[TA]*clL[A] + tiPL[TC]*clL[C] + tiPL[TG]*clL[G] + tiPL[TT]*clL[T])
1082                                 *preLikeR[i++];
1083                     }
1084                 else
1085                     h += 4;
1086                 clL += 4;
1087                 }
1088             break;
1089         case 3:
1090             for (c=h=0; c<m->numChars; c++)
1091                 {
1092                 r = rateCat[c];
1093                 if (r < nGammaCats)
1094                     {
1095                     i = lState[c] + r * 20;
1096                     j = rState[c] + r * 20;
1097                     clP[h++] =   preLikeL[i++]*preLikeR[j++];
1098                     clP[h++] =   preLikeL[i++]*preLikeR[j++];
1099                     clP[h++] =   preLikeL[i++]*preLikeR[j++];
1100                     clP[h++] =   preLikeL[i++]*preLikeR[j++];
1101                     }
1102                 else
1103                     h += 4;
1104                 }
1105             break;
1106         }
1107
1108     return NO_ERROR;
1109 }
1110
1111
1112 #if defined (SSE_ENABLED)
1113 /*----------------------------------------------------------------
1114 |
1115 |   CondLikeDown_NUC4_SSE: 4by4 nucleotide model with or without rate
1116 |       variation, using SSE instructions
1117 |
1118 -----------------------------------------------------------------*/
1119 int CondLikeDown_NUC4_SSE (TreeNode *p, int division, int chain)
1120 {
1121     int             c, k;
1122     CLFlt           *pL, *pR, *tiPL, *tiPR;
1123     __m128          *clL, *clR, *clP;
1124     __m128          m1, m2, m3, m4, m5, m6;
1125     ModelInfo       *m;
1126
1127     m = &modelSettings[division];
1128
1129     /* flip state of node so that we are not overwriting old cond likes */
1130     FlipCondLikeSpace (m, chain, p->index);
1131
1132     /* find conditional likelihood pointers */
1133     clL = (__m128 *) m->condLikes[m->condLikeIndex[chain][p->left->index ]];
1134     clR = (__m128 *) m->condLikes[m->condLikeIndex[chain][p->right->index]];
1135     clP = (__m128 *) m->condLikes[m->condLikeIndex[chain][p->index       ]];
1136
1137     /* find transition probabilities */
1138     pL = m->tiProbs[m->tiProbsIndex[chain][p->left->index ]];
1139     pR = m->tiProbs[m->tiProbsIndex[chain][p->right->index]];
1140
1141     tiPL = pL;
1142     tiPR = pR;
1143     for (k=0; k<m->numGammaCats; k++)
1144         {
1145         for (c=0; c<m->numSSEChars; c++)
1146             {
1147             m1 = _mm_load1_ps (&tiPL[AA]);
1148             m2 = _mm_load1_ps (&tiPR[AA]);
1149             m5 = _mm_mul_ps (m1, clL[A]);
1150             m6 = _mm_mul_ps (m2, clR[A]);
1151
1152             m1 = _mm_load1_ps (&tiPL[AC]);
1153             m2 = _mm_load1_ps (&tiPR[AC]);
1154             m3 = _mm_mul_ps (m1, clL[C]);
1155             m4 = _mm_mul_ps (m2, clR[C]);
1156             m5 = _mm_add_ps (m3, m5);
1157             m6 = _mm_add_ps (m4, m6);
1158
1159             m1 = _mm_load1_ps (&tiPL[AG]);
1160             m2 = _mm_load1_ps (&tiPR[AG]);
1161             m3 = _mm_mul_ps (m1, clL[G]);
1162             m4 = _mm_mul_ps (m2, clR[G]);
1163             m5 = _mm_add_ps (m3, m5);
1164             m6 = _mm_add_ps (m4, m6);
1165
1166             m1 = _mm_load1_ps (&tiPL[AT]);
1167             m2 = _mm_load1_ps (&tiPR[AT]);
1168             m3 = _mm_mul_ps (m1, clL[T]);
1169             m4 = _mm_mul_ps (m2, clR[T]);
1170             m5 = _mm_add_ps (m3, m5);
1171             m6 = _mm_add_ps (m4, m6);
1172
1173             *clP++ = _mm_mul_ps (m5, m6);
1174
1175             m1 = _mm_load1_ps (&tiPL[CA]);
1176             m2 = _mm_load1_ps (&tiPR[CA]);
1177             m5 = _mm_mul_ps (m1, clL[A]);
1178             m6 = _mm_mul_ps (m2, clR[A]);
1179
1180             m1 = _mm_load1_ps (&tiPL[CC]);
1181             m2 = _mm_load1_ps (&tiPR[CC]);
1182             m3 = _mm_mul_ps (m1, clL[C]);
1183             m4 = _mm_mul_ps (m2, clR[C]);
1184             m5 = _mm_add_ps (m3, m5);
1185             m6 = _mm_add_ps (m4, m6);
1186
1187             m1 = _mm_load1_ps (&tiPL[CG]);
1188             m2 = _mm_load1_ps (&tiPR[CG]);
1189             m3 = _mm_mul_ps (m1, clL[G]);
1190             m4 = _mm_mul_ps (m2, clR[G]);
1191             m5 = _mm_add_ps (m3, m5);
1192             m6 = _mm_add_ps (m4, m6);
1193
1194             m1 = _mm_load1_ps (&tiPL[CT]);
1195             m2 = _mm_load1_ps (&tiPR[CT]);
1196             m3 = _mm_mul_ps (m1, clL[T]);
1197             m4 = _mm_mul_ps (m2, clR[T]);
1198             m5 = _mm_add_ps (m3, m5);
1199             m6 = _mm_add_ps (m4, m6);
1200
1201             *clP++ = _mm_mul_ps (m5, m6);
1202
1203             m1 = _mm_load1_ps (&tiPL[GA]);
1204             m2 = _mm_load1_ps (&tiPR[GA]);
1205             m5 = _mm_mul_ps (m1, clL[A]);
1206             m6 = _mm_mul_ps (m2, clR[A]);
1207
1208             m1 = _mm_load1_ps (&tiPL[GC]);
1209             m2 = _mm_load1_ps (&tiPR[GC]);
1210             m3 = _mm_mul_ps (m1, clL[C]);
1211             m4 = _mm_mul_ps (m2, clR[C]);
1212             m5 = _mm_add_ps (m3, m5);
1213             m6 = _mm_add_ps (m4, m6);
1214
1215             m1 = _mm_load1_ps (&tiPL[GG]);
1216             m2 = _mm_load1_ps (&tiPR[GG]);
1217             m3 = _mm_mul_ps (m1, clL[G]);
1218             m4 = _mm_mul_ps (m2, clR[G]);
1219             m5 = _mm_add_ps (m3, m5);
1220             m6 = _mm_add_ps (m4, m6);
1221
1222             m1 = _mm_load1_ps (&tiPL[GT]);
1223             m2 = _mm_load1_ps (&tiPR[GT]);
1224             m3 = _mm_mul_ps (m1, clL[T]);
1225             m4 = _mm_mul_ps (m2, clR[T]);
1226             m5 = _mm_add_ps (m3, m5);
1227             m6 = _mm_add_ps (m4, m6);
1228
1229             *clP++ = _mm_mul_ps (m5, m6);
1230
1231             m1 = _mm_load1_ps (&tiPL[TA]);
1232             m2 = _mm_load1_ps (&tiPR[TA]);
1233             m5 = _mm_mul_ps (m1, clL[A]);
1234             m6 = _mm_mul_ps (m2, clR[A]);
1235
1236             m1 = _mm_load1_ps (&tiPL[TC]);
1237             m2 = _mm_load1_ps (&tiPR[TC]);
1238             m3 = _mm_mul_ps (m1, clL[C]);
1239             m4 = _mm_mul_ps (m2, clR[C]);
1240             m5 = _mm_add_ps (m3, m5);
1241             m6 = _mm_add_ps (m4, m6);
1242
1243             m1 = _mm_load1_ps (&tiPL[TG]);
1244             m2 = _mm_load1_ps (&tiPR[TG]);
1245             m3 = _mm_mul_ps (m1, clL[G]);
1246             m4 = _mm_mul_ps (m2, clR[G]);
1247             m5 = _mm_add_ps (m3, m5);
1248             m6 = _mm_add_ps (m4, m6);
1249
1250             m1 = _mm_load1_ps (&tiPL[TT]);
1251             m2 = _mm_load1_ps (&tiPR[TT]);
1252             m3 = _mm_mul_ps (m1, clL[T]);
1253             m4 = _mm_mul_ps (m2, clR[T]);
1254             m5 = _mm_add_ps (m3, m5);
1255             m6 = _mm_add_ps (m4, m6);
1256
1257             *clP++ = _mm_mul_ps (m5, m6);
1258             clL += 4;
1259             clR += 4;
1260             }
1261         tiPL += 16;
1262         tiPR += 16;
1263         }
1264
1265     return NO_ERROR;
1266
1267 }
1268 #endif
1269
1270
1271 #if !defined (SSE_ENABLED) || 1
1272 /*----------------------------------------------------------------
1273 |
1274 |   CondLikeDown_NY98: codon model with omega variation
1275 |
1276 -----------------------------------------------------------------*/
1277 int CondLikeDown_NY98 (TreeNode *p, int division, int chain)
1278 {
1279     int             a, b, c, h, i, j, k, shortCut, *lState=NULL, *rState=NULL, nStates, nStatesSquared;
1280     CLFlt           likeL, likeR, *pL, *pR, *tiPL, *tiPR, *clL, *clR, *clP;
1281     ModelInfo       *m;
1282
1283     /* find model settings for this division and nStates, nStatesSquared */
1284     m = &modelSettings[division];
1285     nStates = m->numModelStates;
1286     nStatesSquared = nStates * nStates;
1287
1288     /* Flip conditional likelihood space */
1289     FlipCondLikeSpace (m, chain, p->index);
1290
1291     /* find conditional likelihood pointers */
1292     clL = m->condLikes[m->condLikeIndex[chain][p->left->index ]];
1293     clR = m->condLikes[m->condLikeIndex[chain][p->right->index]];
1294     clP = m->condLikes[m->condLikeIndex[chain][p->index       ]];
1295
1296     /* find transition probabilities */
1297     pL = m->tiProbs[m->tiProbsIndex[chain][p->left->index ]];
1298     pR = m->tiProbs[m->tiProbsIndex[chain][p->right->index]];
1299
1300     /* find likelihoods of site patterns for left branch if terminal */
1301     shortCut = 0;
1302 #   if !defined (DEBUG_NOSHORTCUTS)
1303     if (p->left->left == NULL && m->isPartAmbig[p->left->index] == NO)
1304         {
1305         shortCut |= 1;
1306         lState = m->termState[p->left->index];
1307         tiPL = pL;
1308         for (k=a=0; k<m->numOmegaCats; k++)
1309             {
1310             for (i=0; i<nStates; i++)
1311                 for (j=i; j<nStatesSquared; j+=nStates)
1312                     preLikeL[a++] = tiPL[j];
1313             /* for ambiguous */
1314             for (i=0; i<nStates; i++)
1315                 preLikeL[a++] = 1.0;
1316             tiPL += nStatesSquared;
1317             }
1318         }
1319
1320     /* find likelihoods of site patterns for right branch if terminal */
1321     if (p->right->left == NULL && m->isPartAmbig[p->right->index] == NO)
1322         {
1323         shortCut |= 2;
1324         rState = m->termState[p->right->index];
1325         tiPR = pR;
1326         for (k=a=0; k<m->numOmegaCats; k++)
1327             {
1328             for (i=0; i<nStates; i++)
1329                 for (j=i; j<nStatesSquared; j+=nStates)
1330                     preLikeR[a++] = tiPR[j];
1331             /* for ambiguous */
1332             for (i=0; i<nStates; i++)
1333                 preLikeR[a++] = 1.0;
1334             tiPR += nStatesSquared;
1335             }
1336         }
1337 #   endif
1338
1339     switch (shortCut)
1340         {
1341         case 0:
1342             tiPL = pL;
1343             tiPR = pR;
1344             for (k=0; k<m->numOmegaCats; k++)
1345                 {
1346                 for (c=0; c<m->numChars; c++)
1347                     {
1348                     for (i=h=0; i<nStates; i++)
1349                         {
1350                         likeL = likeR = 0.0;
1351                         for (j=0; j<nStates; j++)
1352                             {
1353                             likeL += tiPL[h]*clL[j];
1354                             likeR += tiPR[h++]*clR[j];
1355                             }
1356                         *(clP++) = likeL * likeR;
1357                         }
1358                     clL += nStates;
1359                     clR += nStates;
1360                     }
1361                 tiPL += nStatesSquared;
1362                 tiPR += nStatesSquared;
1363                 }
1364             break;
1365         case 1:
1366             tiPR = pR;
1367             for (k=0; k<m->numOmegaCats; k++)
1368                 {
1369                 for (c=0; c<m->numChars; c++)
1370                     {
1371                     a = lState[c] + k*(nStatesSquared+nStates);
1372                     for (i=h=0; i<nStates; i++)
1373                         {
1374                         likeR = 0.0;
1375                         for (j=0; j<nStates; j++)
1376                             {
1377                             likeR += tiPR[h++]*clR[j];
1378                             }
1379                         *(clP++) = preLikeL[a++] * likeR;
1380                         }
1381                     clR += nStates;
1382                     }
1383                 tiPR += nStatesSquared;
1384                 }
1385             break;
1386         case 2:
1387             tiPL = pL;
1388             for (k=0; k<m->numOmegaCats; k++)
1389                 {
1390                 for (c=0; c<m->numChars; c++)
1391                     {
1392                     a = rState[c] + k*(nStatesSquared+nStates);
1393                     for (i=h=0; i<nStates; i++)
1394                         {
1395                         likeL = 0.0;
1396                         for (j=0; j<nStates; j++)
1397                             {
1398                             likeL += tiPL[h++]*clL[j];
1399                             }
1400                         *(clP++) = preLikeR[a++] * likeL;
1401                         }
1402                     clL += nStates;
1403                     }
1404                 tiPL += nStatesSquared;
1405                 }
1406             break;
1407         case 3:
1408             for (k=0; k<m->numOmegaCats; k++)
1409                 {
1410                 for (c=0; c<m->numChars; c++)
1411                     {
1412                     a = rState[c] + k*(nStatesSquared+nStates);
1413                     b = lState[c] + k*(nStatesSquared+nStates);
1414                     for (i=0; i<nStates; i++)
1415                         {
1416                         *(clP++) = preLikeR[a++] * preLikeL[b++];
1417                         }
1418                     }
1419                 }
1420             break;
1421         }
1422
1423     return NO_ERROR;
1424 }
1425 #endif
1426
1427
1428 #if defined (SSE_ENABLED)
1429 /*----------------------------------------------------------------
1430 |
1431 |   CondLikeDown_NY98_SSE: codon model with omega variation
1432 |
1433 -----------------------------------------------------------------*/
1434 int CondLikeDown_NY98_SSE (TreeNode *p, int division, int chain)
1435 {
1436     int             c, c1, h, i, j, k, t, shortCut, *lState=NULL, *rState=NULL, nStates, nStatesSquared;
1437     CLFlt           *pL, *pR, *tiPL, *tiPR;
1438     __m128          *clL, *clR, *clP;
1439     __m128          mTiPL, mTiPR, mL, mR, mAcumL, mAcumR;
1440     ModelInfo       *m;
1441     CLFlt           *preLikeRV[FLOATS_PER_VEC];
1442     CLFlt           *preLikeLV[FLOATS_PER_VEC];
1443 #   if !defined (DEBUG_NOSHORTCUTS)
1444     int             a;
1445 #   endif
1446
1447     /* find model settings for this division and nStates, nStatesSquared */
1448     m = &modelSettings[division];
1449     nStates = m->numModelStates;
1450     nStatesSquared = nStates * nStates;
1451
1452     /* Flip conditional likelihood space */
1453     FlipCondLikeSpace (m, chain, p->index);
1454
1455     /* find conditional likelihood pointers */
1456     clL = (__m128 *)m->condLikes[m->condLikeIndex[chain][p->left->index ]];
1457     clR = (__m128 *)m->condLikes[m->condLikeIndex[chain][p->right->index]];
1458     clP = (__m128 *)m->condLikes[m->condLikeIndex[chain][p->index       ]];
1459
1460     /* find transition probabilities */
1461     pL = m->tiProbs[m->tiProbsIndex[chain][p->left->index ]];
1462     pR = m->tiProbs[m->tiProbsIndex[chain][p->right->index]];
1463
1464     /* find likelihoods of site patterns for left branch if terminal */
1465     shortCut = 0;
1466 #   if !defined (DEBUG_NOSHORTCUTS)
1467     if (p->left->left == NULL && m->isPartAmbig[p->left->index] == NO)
1468         {
1469         shortCut |= 1;
1470         lState = m->termState[p->left->index];
1471         tiPL = pL;
1472         for (k=a=0; k<m->numOmegaCats; k++)
1473             {
1474             for (i=0; i<nStates; i++)
1475                 for (j=i; j<nStatesSquared; j+=nStates)
1476                     preLikeL[a++] = tiPL[j];
1477             /* for ambiguous */
1478             for (i=0; i<nStates; i++)
1479                 preLikeL[a++] = 1.0;
1480             tiPL += nStatesSquared;
1481             }
1482         }
1483
1484     /* find likelihoods of site patterns for right branch if terminal */
1485     if (p->right->left == NULL && m->isPartAmbig[p->right->index] == NO)
1486         {
1487         shortCut |= 2;
1488         rState = m->termState[p->right->index];
1489         tiPR = pR;
1490         for (k=a=0; k<m->numOmegaCats; k++)
1491             {
1492             for (i=0; i<nStates; i++)
1493                 for (j=i; j<nStatesSquared; j+=nStates)
1494                     preLikeR[a++] = tiPR[j];
1495             /* for ambiguous */
1496             for (i=0; i<nStates; i++)
1497                 preLikeR[a++] = 1.0;
1498             tiPR += nStatesSquared;
1499             }
1500         }
1501 #   endif
1502
1503     switch (shortCut)
1504         {
1505         case 0:
1506             tiPL = pL;
1507             tiPR = pR;
1508             for (k=0; k<m->numOmegaCats; k++)
1509                 {
1510                 for (c=0; c<m->numSSEChars; c++)
1511                     {
1512                     for (i=h=0; i<nStates; i++)
1513                         {
1514                         mAcumL = _mm_setzero_ps();
1515                         mAcumR = _mm_setzero_ps();
1516                         for (j=0; j<nStates; j++)
1517                             {
1518                             mTiPL  = _mm_load1_ps (&tiPL[h]);
1519                             mTiPR  = _mm_load1_ps (&tiPR[h++]);
1520                             mL     = _mm_mul_ps (mTiPL, clL[j]);
1521                             mR     = _mm_mul_ps (mTiPR, clR[j]);
1522                             mAcumL = _mm_add_ps (mL, mAcumL);
1523                             mAcumR = _mm_add_ps (mR, mAcumR);
1524                             }
1525                         *(clP++) = _mm_mul_ps (mAcumL, mAcumR);
1526                         }
1527                     clL += nStates;
1528                     clR += nStates;
1529                     }
1530                 tiPL += nStatesSquared;
1531                 tiPR += nStatesSquared;
1532                 }
1533             break;
1534         case 1:
1535             tiPR = pR;
1536             for (k=0; k<m->numOmegaCats; k++)
1537                 {
1538                 for (c=t=0; c<m->numSSEChars; c++)
1539                     {
1540                     for (c1=0; c1<FLOATS_PER_VEC; c1++,t++)
1541                         {
1542                         preLikeLV[c1] = &preLikeL[lState[t] + k*(nStatesSquared+nStates)];
1543                         }
1544                     for (i=h=0; i<nStates; i++)
1545                         {
1546                         assert (FLOATS_PER_VEC == 4); /* In the following statment we assume that SSE register can hold exactly 4 ClFlts. */
1547                         mAcumL = _mm_set_ps (*(preLikeLV[3]++), *(preLikeLV[2]++), *(preLikeLV[1]++), *(preLikeLV[0]++));
1548                         mAcumR = _mm_setzero_ps();
1549                         for (j=0; j<nStates; j++)
1550                             {
1551                             mTiPR  = _mm_load1_ps (&tiPR[h++]);
1552                             mR     = _mm_mul_ps (mTiPR, clR[j]);
1553                             mAcumR = _mm_add_ps (mR, mAcumR);
1554                             }
1555                         *(clP++) = _mm_mul_ps (mAcumL,mAcumR);
1556                         }
1557                     clR += nStates;
1558                     }
1559                 tiPR += nStatesSquared;
1560                 }
1561             break;
1562         case 2:
1563             tiPL = pL;
1564             for (k=0; k<m->numOmegaCats; k++)
1565                 {
1566                 for (c=t=0; c<m->numSSEChars; c++)
1567                     {
1568                     for (c1=0; c1<FLOATS_PER_VEC; c1++,t++)
1569                         {
1570                         preLikeRV[c1] = &preLikeR[rState[t] + k*(nStatesSquared+nStates)];
1571                         }
1572                     for (i=h=0; i<nStates; i++)
1573                         {
1574                         assert (FLOATS_PER_VEC == 4); /* In the following statment we assume that SSE register can hold exactly 4 ClFlts. */
1575                         mAcumR = _mm_set_ps (*(preLikeRV[3]++), *(preLikeRV[2]++), *(preLikeRV[1]++), *(preLikeRV[0]++));
1576                         mAcumL = _mm_setzero_ps();
1577                         for (j=0; j<nStates; j++)
1578                             {
1579                             mTiPL  = _mm_load1_ps (&tiPL[h++]);
1580                             mL     = _mm_mul_ps (mTiPL, clL[j]);
1581                             mAcumL = _mm_add_ps (mL, mAcumL);
1582                             }
1583                         *(clP++) = _mm_mul_ps (mAcumL,mAcumR);
1584                         }
1585                     clL += nStates;
1586                     }
1587                 tiPL += nStatesSquared;
1588                 }
1589             break;
1590         case 3:
1591             for (k=0; k<m->numOmegaCats; k++)
1592                 {
1593                 for (c=t=0; c<m->numSSEChars; c++)
1594                     {
1595                     for (c1=0; c1<FLOATS_PER_VEC; c1++,t++)
1596                         {
1597                         preLikeRV[c1] = &preLikeR[rState[t] + k*(nStatesSquared+nStates)];
1598                         preLikeLV[c1] = &preLikeL[lState[t] + k*(nStatesSquared+nStates)];
1599                         }
1600                     for (i=0; i<nStates; i++)
1601                         {
1602                         assert (FLOATS_PER_VEC == 4); /* In the following 2 statments we assume that SSE register can hold exactly 4 ClFlts. */
1603                         mL = _mm_set_ps (*(preLikeLV[3]++), *(preLikeLV[2]++), *(preLikeLV[1]++), *(preLikeLV[0]++));
1604                         mR = _mm_set_ps (*(preLikeRV[3]++), *(preLikeRV[2]++), *(preLikeRV[1]++), *(preLikeRV[0]++));
1605                         *(clP++) = _mm_mul_ps (mL,mR);
1606                         }
1607                     }
1608                 }
1609             break;
1610         }
1611
1612     return NO_ERROR;
1613 }
1614 #endif
1615
1616
1617 /*----------------------------------------------------------------
1618 |
1619 |   CondLikeDown_Std: variable number of states model
1620 |       with or without rate variation
1621 |
1622 -----------------------------------------------------------------*/
1623 int CondLikeDown_Std (TreeNode *p, int division, int chain)
1624 {
1625     int             a, c, h, i, j, k, nStates, nCats, tmp;
1626     CLFlt           *clL, *clR, *clP, *pL, *pR, *tiPL, *tiPR, likeL, likeR;
1627     ModelInfo       *m;
1628
1629     m = &modelSettings[division];
1630
1631     /* Flip conditional likelihood space */
1632     FlipCondLikeSpace (m, chain, p->index);
1633
1634     /* find conditional likelihood pointers */
1635     clL = m->condLikes[m->condLikeIndex[chain][p->left->index ]];
1636     clR = m->condLikes[m->condLikeIndex[chain][p->right->index]];
1637     clP = m->condLikes[m->condLikeIndex[chain][p->index       ]];
1638
1639     /* find transition probabilities */
1640     pL = m->tiProbs[m->tiProbsIndex[chain][p->left->index ]];
1641     pR = m->tiProbs[m->tiProbsIndex[chain][p->right->index]];
1642
1643     /* Conditional likelihood space is assumed to be arranged in numGammaCats blocks of data. Each block contains all data for one gamma category.
1644     Each gamma cat block consist of numChars sequences of data, each of this sequences corresponds to a character of data matrix.
1645     A sequence consists of nStates for all non-binary data, otherwise length of sequence is nStates*numBetaCats (i.e. 2*numBetaCats) */
1646
1647     /* calculate ancestral probabilities */
1648     for (k=h=0; k<m->numGammaCats; k++)
1649         {
1650         /* calculate ancestral probabilities */
1651         for (c=0; c<m->numChars; c++)
1652             {
1653             nStates = m->nStates[c];
1654
1655             /* the following lines ensure that nCats is 1 unless */
1656             /* the character is binary and beta categories are used  */
1657             if (nStates == 2)
1658                 nCats = m->numBetaCats;
1659             else
1660                 nCats = 1;
1661
1662             tmp = k*nStates*nStates; /* tmp contains offset to skip gamma cats that already processed*/
1663             tiPL = pL + m->tiIndex[c] + tmp;
1664             tiPR = pR + m->tiIndex[c] + tmp;
1665             tmp = (m->numGammaCats-1)*2*2; /* tmp contains size of block of tpi matrices across all gamma cats (minus one) for single beta category. Further used only if character is binary to jump to next beta category */
1666
1667             for (j=0; j<nCats;j++)
1668                 {
1669                 for (a=0; a<nStates; a++)
1670                     {
1671                     likeL = likeR = 0.0;
1672                     for (i=0; i<nStates; i++)
1673                         {
1674                         likeL += *(tiPL++) * clL[i];
1675                         likeR += *(tiPR++) * clR[i];
1676                         }
1677                     clP[h++] = likeL * likeR;
1678                     }
1679                 clL += nStates;
1680                 clR += nStates;
1681
1682                 tiPL += tmp;
1683                 tiPR += tmp;
1684                 }
1685             }
1686         }
1687
1688     return NO_ERROR;
1689 }
1690
1691
1692 #if !defined (SSE_ENABLED) || 1
1693 /*----------------------------------------------------------------
1694 |
1695 |   CondLikeRoot_Bin: binary model with or without rate
1696 |       variation
1697 |
1698 -----------------------------------------------------------------*/
1699 int CondLikeRoot_Bin (TreeNode *p, int division, int chain)
1700 {
1701     int             c, k;
1702     CLFlt           *clL, *clR, *clP, *clA, *pL, *pR, *pA, *tiPL, *tiPR, *tiPA;
1703     ModelInfo       *m;
1704
1705     /* find model settings for this division */
1706     m = &modelSettings[division];
1707
1708     /* flip state of node so that we are not overwriting old cond likes */
1709     FlipCondLikeSpace (m, chain, p->index);
1710
1711     /* find conditional likelihood pointers */
1712     clL = m->condLikes[m->condLikeIndex[chain][p->left->index ]];
1713     clR = m->condLikes[m->condLikeIndex[chain][p->right->index]];
1714     clP = m->condLikes[m->condLikeIndex[chain][p->index       ]];
1715     clA = m->condLikes[m->condLikeIndex[chain][p->anc->index  ]];
1716
1717     /* find transition probabilities (or calculate instead) */
1718     pL = m->tiProbs[m->tiProbsIndex[chain][p->left->index ]];
1719     pR = m->tiProbs[m->tiProbsIndex[chain][p->right->index]];
1720     pA = m->tiProbs[m->tiProbsIndex[chain][p->index       ]];
1721
1722     tiPL = pL;
1723     tiPR = pR;
1724     tiPA = pA;
1725     for (k=0; k<m->numGammaCats; k++)
1726         {
1727         for (c=0; c<m->numChars; c++)
1728             {
1729             *(clP++) = (tiPL[0]*clL[0] + tiPL[1]*clL[1])
1730                       *(tiPR[0]*clR[0] + tiPR[1]*clR[1])
1731                       *(tiPA[0]*clA[0] + tiPA[1]*clA[1]);
1732             *(clP++) = (tiPL[2]*clL[0] + tiPL[3]*clL[1])
1733                       *(tiPR[2]*clR[0] + tiPR[3]*clR[1])
1734                       *(tiPA[2]*clA[0] + tiPA[3]*clA[1]);
1735
1736             clA += 2;
1737             clL += 2;
1738             clR += 2;
1739             }
1740         tiPA += 4;
1741         tiPL += 4;
1742         tiPR += 4;
1743         }
1744
1745     return NO_ERROR;
1746 }
1747 #endif
1748
1749
1750 #if defined (SSE_ENABLED)
1751 /*----------------------------------------------------------------
1752 |
1753 |   CondLikeRoot_Bin_SSE:binary model with or without rate
1754 |       variation
1755 |
1756 -----------------------------------------------------------------*/
1757 int CondLikeRoot_Bin_SSE (TreeNode *p, int division, int chain)
1758 {
1759     int             c, k;
1760     CLFlt           *pL, *pR, *pA, *tiPL, *tiPR, *tiPA;
1761     __m128          *clL, *clR, *clP, *clA;
1762     __m128          m1, m2, m3, m4, m5, m6, m7;
1763     ModelInfo       *m;
1764
1765     m = &modelSettings[division];
1766
1767     /* flip state of node so that we are not overwriting old cond likes */
1768     FlipCondLikeSpace (m, chain, p->index);
1769
1770     /* find conditional likelihood pointers */
1771     clL = (__m128 *) m->condLikes[m->condLikeIndex[chain][p->left->index ]];
1772     clR = (__m128 *) m->condLikes[m->condLikeIndex[chain][p->right->index]];
1773     clP = (__m128 *) m->condLikes[m->condLikeIndex[chain][p->index       ]];
1774     clA = (__m128 *) m->condLikes[m->condLikeIndex[chain][p->anc->index  ]];
1775
1776     /* find transition probabilities */
1777     pL = m->tiProbs[m->tiProbsIndex[chain][p->left->index ]];
1778     pR = m->tiProbs[m->tiProbsIndex[chain][p->right->index]];
1779     pA = m->tiProbs[m->tiProbsIndex[chain][p->index       ]];
1780
1781     tiPL = pL;
1782     tiPR = pR;
1783     tiPA = pA;
1784     for (k=0; k<m->numGammaCats; k++)
1785         {
1786         for (c=0; c<m->numSSEChars; c++)
1787             {
1788             m1 = _mm_load1_ps (&tiPL[0]);
1789             m5 = *clL++;
1790             m2 = _mm_mul_ps (m1, m5);
1791             m1 = _mm_load1_ps (&tiPL[2]);
1792             m6 = _mm_mul_ps (m1, m5);
1793
1794             m1 = _mm_load1_ps (&tiPL[1]);
1795             m5 = *clL++;
1796             m3 = _mm_mul_ps (m1, m5);
1797             m1 = _mm_load1_ps (&tiPL[3]);
1798             m5 = _mm_mul_ps (m1, m5);
1799
1800             m4 = _mm_add_ps (m2, m3); /* in m4 we get (tiPL[0]*clL[0] + tiPL[1]*clL[1]) */
1801             m6 = _mm_add_ps (m5, m6); /* in m6 we get (tiPL[2]*clL[0] + tiPL[3]*clL[1]) */
1802
1803             m1 = _mm_load1_ps (&tiPR[0]);
1804             m5 = *clR++;
1805             m2 = _mm_mul_ps (m1, m5);
1806             m1 = _mm_load1_ps (&tiPR[2]);
1807             m7 = _mm_mul_ps (m1, m5);
1808
1809             m1 = _mm_load1_ps (&tiPR[1]);
1810             m5 = *clR++;
1811             m3 = _mm_mul_ps (m1, m5);
1812             m1 = _mm_load1_ps (&tiPR[3]);
1813             m5 = _mm_mul_ps (m1, m5);
1814
1815             m1 = _mm_add_ps (m2, m3); /* in m1 we get (tiPR[0]*clR[0] + tiPR[1]*clR[1]) */
1816             m7 = _mm_add_ps (m5, m7); /* in m7 we get (tiPR[2]*clR[0] + tiPR[3]*clR[1]) */
1817
1818             m4 = _mm_mul_ps (m1, m4); /* in m4 we get (tiPL[0]*clL[0] + tiPL[1]*clL[1])*(tiPR[0]*clR[0] + tiPR[1]*clR[1]) */
1819             m7 = _mm_mul_ps (m6, m7); /* in m7 we get (tiPL[2]*clL[0] + tiPL[3]*clL[1])*(tiPR[2]*clR[0] + tiPR[3]*clR[1]) */
1820
1821             m1 = _mm_load1_ps (&tiPA[0]);
1822             m5 = *clA++;
1823             m2 = _mm_mul_ps (m1, m5);
1824             m1 = _mm_load1_ps (&tiPA[2]);
1825             m6 = _mm_mul_ps (m1, m5);
1826
1827             m1 = _mm_load1_ps (&tiPA[1]);
1828             m5 = *clA++;
1829             m3 = _mm_mul_ps (m1, m5);
1830             m1 = _mm_load1_ps (&tiPA[3]);
1831             m1 = _mm_mul_ps (m1, m5);
1832
1833             m2 = _mm_add_ps (m2, m3); /* in m1 we get (tiPA[0]*clA[0] + tiPA[1]*clA[1]) */
1834             m1 = _mm_add_ps (m1, m6); /* in m1 we get (tiPA[2]*clA[0] + tiPA[3]*clA[1]) */
1835
1836             *clP++ = _mm_mul_ps (m2, m4);
1837             *clP++ = _mm_mul_ps (m1, m7);
1838
1839             }
1840         tiPL += 4;
1841         tiPR += 4;
1842         tiPA += 4;
1843         }
1844
1845     return NO_ERROR;
1846
1847 }
1848 #endif
1849
1850
1851 /*----------------------------------------------------------------
1852 |
1853 |   CondLikeRoot_Gen: general n-state model with or without rate
1854 |       variation
1855 |
1856 -----------------------------------------------------------------*/
1857 int CondLikeRoot_Gen (TreeNode *p, int division, int chain)
1858 {
1859     int             a, b, c, d, h, i, j, k, shortCut, *lState=NULL, *rState=NULL, *aState=NULL,
1860                     nObsStates, nStates, nStatesSquared, preLikeJump;
1861     CLFlt           likeL, likeR, likeA, *clL, *clR, *clP, *clA, *pL, *pR, *pA,
1862                     *tiPL, *tiPR, *tiPA;
1863     ModelInfo       *m;
1864 #   if !defined (DEBUG_NOSHORTCUTS)
1865     int catStart;
1866 #   endif
1867
1868     /* find model settings for this division and nStates, nStatesSquared */
1869     m = &modelSettings[division];
1870     nObsStates = m->numStates;
1871     nStates = m->numModelStates;
1872     nStatesSquared = nStates * nStates;
1873     preLikeJump = nObsStates * nStates;
1874
1875     /* flip state of node so that we are not overwriting old cond likes */
1876     FlipCondLikeSpace (m, chain, p->index);
1877
1878     /* find conditional likelihood pointers */
1879     clL = m->condLikes[m->condLikeIndex[chain][p->left->index ]];
1880     clR = m->condLikes[m->condLikeIndex[chain][p->right->index]];
1881     clP = m->condLikes[m->condLikeIndex[chain][p->index       ]];
1882     clA = m->condLikes[m->condLikeIndex[chain][p->anc->index  ]];
1883
1884     /* find transition probabilities (or calculate instead) */
1885     pL = m->tiProbs[m->tiProbsIndex[chain][p->left->index ]];
1886     pR = m->tiProbs[m->tiProbsIndex[chain][p->right->index]];
1887     pA = m->tiProbs[m->tiProbsIndex[chain][p->index       ]];
1888
1889     /* find likelihoods of site patterns for left branch if terminal */
1890     shortCut = 0;
1891 #   if !defined (DEBUG_NOSHORTCUTS)
1892     if (p->left->left == NULL && m->isPartAmbig[p->left->index] == NO)
1893         {
1894         shortCut |= 1;
1895         lState = m->termState[p->left->index];
1896         tiPL = pL;
1897         for (k=a=0; k<m->numGammaCats; k++)
1898             {
1899             catStart = a;
1900             for (i=0; i<nObsStates; i++)
1901                 for (j=i; j<nStatesSquared; j+=nStates)
1902                     preLikeL[a++] = tiPL[j];
1903             for (b=1; b<nStates/nObsStates; b++)
1904                 {
1905                 a = catStart;
1906                 for (i=0; i<nObsStates; i++)
1907                     {
1908                     for (j=i+b*nObsStates; j<nStatesSquared; j+=nStates)
1909                         preLikeL[a++] += tiPL[j];
1910                     }
1911                 }
1912             /* for ambiguous */
1913             for (i=0; i<nStates; i++)
1914                 preLikeL[a++] = 1.0;
1915             tiPL += nStatesSquared;
1916             }
1917         }
1918
1919     /* find likelihoods of site patterns for right branch if terminal */
1920     if (p->right->left == NULL && m->isPartAmbig[p->right->index] == NO)
1921         {
1922         shortCut |= 2;
1923         rState = m->termState[p->right->index];
1924         tiPR = pR;
1925         for (k=a=0; k<m->numGammaCats; k++)
1926             {
1927             catStart = a;
1928             for (i=0; i<nObsStates; i++)
1929                 for (j=i; j<nStatesSquared; j+=nStates)
1930                     preLikeR[a++] = tiPR[j];
1931             for (b=1; b<nStates/nObsStates; b++)
1932                 {
1933                 a = catStart;
1934                 for (i=0; i<nObsStates; i++)
1935                     {
1936                     for (j=i+b*nObsStates; j<nStatesSquared; j+=nStates)
1937                         preLikeR[a++] += tiPR[j];
1938                     }
1939                 }
1940             /* for ambiguous */
1941             for (i=0; i<nStates; i++)
1942                 preLikeR[a++] = 1.0;
1943             tiPR += nStatesSquared;
1944             }
1945         }
1946
1947     /* find likelihoods of site patterns for anc branch, always terminal */
1948     if (m->isPartAmbig[p->anc->index] == YES)
1949         {
1950         shortCut = 4;
1951         }
1952     else
1953         {
1954         aState = m->termState[p->anc->index];
1955         tiPA = pA;
1956         for (k=a=0; k<m->numGammaCats; k++)
1957             {
1958             catStart = a;
1959             for (i=0; i<nObsStates; i++)
1960                 for (j=i; j<nStatesSquared; j+=nStates)
1961                     preLikeA[a++] = tiPA[j];
1962             for (b=1; b<nStates/nObsStates; b++)
1963                 {
1964                 a = catStart;
1965                 for (i=0; i<nObsStates; i++)
1966                     {
1967                     for (j=i+b*nObsStates; j<nStatesSquared; j+=nStates)
1968                         preLikeA[a++] += tiPA[j];
1969                     }
1970                 }
1971             /* for ambiguous */
1972             for (i=0; i<nStates; i++)
1973                 preLikeA[a++] = 1.0;
1974             tiPA += nStatesSquared;
1975             }
1976         }
1977 #   else
1978     shortCut = 4;
1979 #   endif
1980
1981     //shortCut = 4;
1982     switch (shortCut)
1983         {
1984         case 4:
1985             tiPL = pL;
1986             tiPR = pR;
1987             tiPA = pA;
1988             for (k=0; k<m->numGammaCats; k++)
1989                 {
1990                 for (c=0; c<m->numChars; c++)
1991                     {
1992                     for (i=h=0; i<nStates; i++)
1993                         {
1994                         likeL = likeR = likeA = 0.0;
1995                         for (j=0; j<nStates; j++)
1996                             {
1997                             likeL += tiPL[h]*clL[j];
1998                             likeR += tiPR[h]*clR[j];
1999                             likeA += tiPA[h++]*clA[j];
2000                             }
2001                         *(clP++) = likeL * likeR * likeA;
2002                         }
2003                     clL += nStates;
2004                     clR += nStates;
2005                     clA += nStates;
2006                     }
2007                 tiPL += nStatesSquared;
2008                 tiPR += nStatesSquared;
2009                 tiPA += nStatesSquared;
2010                 }
2011             break;
2012         case 0:
2013             tiPR = pR;
2014             tiPL = pL;
2015             for (k=0; k<m->numGammaCats; k++)
2016                 {
2017                 for (c=0; c<m->numChars; c++)
2018                     {
2019                     a = aState[c] + k*(preLikeJump+nStates);
2020                     for (i=h=0; i<nStates; i++)
2021                         {
2022                         likeR = likeL = 0.0;
2023                         for (j=0; j<nStates; j++)
2024                             {
2025                             likeR += tiPR[h]*clR[j];
2026                             likeL += tiPL[h++]*clL[j];
2027                             }
2028                         *(clP++) = preLikeA[a++] * likeR * likeL;
2029                         }
2030                     clR += nStates;
2031                     clL += nStates;
2032                     }
2033                 tiPR += nStatesSquared;
2034                 tiPL += nStatesSquared;
2035                 }
2036             break;
2037         case 1:
2038             tiPR = pR;
2039             for (k=0; k<m->numGammaCats; k++)
2040                 {
2041                 for (c=0; c<m->numChars; c++)
2042                     {
2043                     a = lState[c] + k*(preLikeJump+nStates);
2044                     b = aState[c] + k*(preLikeJump+nStates);
2045                     for (i=h=0; i<nStates; i++)
2046                         {
2047                         likeR = 0.0;
2048                         for (j=0; j<nStates; j++)
2049                             {
2050                             likeR += tiPR[h++]*clR[j];
2051                             }
2052                         *(clP++) = preLikeL[a++] * preLikeA[b++] * likeR;
2053                         }
2054                     clR += nStates;
2055                     }
2056                 tiPR += nStatesSquared;
2057                 }
2058             break;
2059         case 2:
2060             tiPL = pL;
2061             for (k=0; k<m->numGammaCats; k++)
2062                 {
2063                 for (c=0; c<m->numChars; c++)
2064                     {
2065                     a = rState[c] + k*(preLikeJump+nStates);
2066                     b = aState[c] + k*(preLikeJump+nStates);
2067                     for (i=h=0; i<nStates; i++)
2068                         {
2069                         likeL = 0.0;
2070                         for (j=0; j<nStates; j++)
2071                             {
2072                             likeL += tiPL[h++]*clL[j];
2073                             }
2074                         *(clP++) = preLikeR[a++] * preLikeA[b++] * likeL;
2075                         }
2076                     clL += nStates;
2077                     }
2078                 tiPL += nStatesSquared;
2079                 }
2080             break;
2081         case 3:
2082             for (k=0; k<m->numGammaCats; k++)
2083                 {
2084                 for (c=0; c<m->numChars; c++)
2085                     {
2086                     a = rState[c] + k*(preLikeJump+nStates);
2087                     b = lState[c] + k*(preLikeJump+nStates);
2088                     d = aState[c] + k*(preLikeJump+nStates);
2089                     for (i=0; i<nStates; i++)
2090                         {
2091                         *(clP++) = preLikeR[a++] * preLikeL[b++] * preLikeA[d++];
2092                         }
2093                     }
2094                 }
2095             break;
2096         }
2097
2098     return NO_ERROR;
2099 }
2100
2101
2102 #if defined (SSE_ENABLED)
2103 /*----------------------------------------------------------------
2104 |
2105 |   CondLikeRoot_Gen_SSE:general n-state model with or without rate
2106 |       variation
2107 |
2108 -----------------------------------------------------------------*/
2109 int CondLikeRoot_Gen_SSE (TreeNode *p, int division, int chain)
2110 {
2111     int             c, c1, t, h, i, j, k, shortCut, *lState=NULL, *rState=NULL, *aState=NULL, nObsStates, preLikeJump,
2112                     nStates, nStatesSquared;
2113     CLFlt           *pL, *pR, *pA,
2114                     *tiPL, *tiPR, *tiPA;
2115     __m128          *clL, *clR, *clP, *clA;
2116     __m128          mTiPL, mTiPR, mTiPA, mL, mR, mA, mAcumL, mAcumR, mAcumA;
2117     ModelInfo       *m;
2118     CLFlt           *preLikeRV[FLOATS_PER_VEC];
2119     CLFlt           *preLikeLV[FLOATS_PER_VEC];
2120     CLFlt           *preLikeAV[FLOATS_PER_VEC];
2121
2122 #   if !defined (DEBUG_NOSHORTCUTS)
2123     int a, b, catStart;
2124 #   endif
2125
2126     /* find model settings for this division and nStates, nStatesSquared */
2127     m = &modelSettings[division];
2128     nObsStates = m->numStates;
2129     nStates = m->numModelStates;
2130     nStatesSquared = nStates * nStates;
2131     preLikeJump = nObsStates * nStates;
2132
2133     /* flip state of node so that we are not overwriting old cond likes */
2134     FlipCondLikeSpace (m, chain, p->index);
2135
2136     /* find conditional likelihood pointers */
2137     clL = (__m128 *)m->condLikes[m->condLikeIndex[chain][p->left->index ]];
2138     clR = (__m128 *)m->condLikes[m->condLikeIndex[chain][p->right->index]];
2139     clP = (__m128 *)m->condLikes[m->condLikeIndex[chain][p->index       ]];
2140     clA = (__m128 *)m->condLikes[m->condLikeIndex[chain][p->anc->index  ]];
2141
2142     /* find transition probabilities (or calculate instead) */
2143     pL = m->tiProbs[m->tiProbsIndex[chain][p->left->index ]];
2144     pR = m->tiProbs[m->tiProbsIndex[chain][p->right->index]];
2145     pA = m->tiProbs[m->tiProbsIndex[chain][p->index       ]];
2146
2147     /* find likelihoods of site patterns for left branch if terminal */
2148     shortCut = 0;
2149 #   if !defined (DEBUG_NOSHORTCUTS)
2150     if (p->left->left == NULL && m->isPartAmbig[p->left->index] == NO)
2151         {
2152         shortCut |= 1;
2153         lState = m->termState[p->left->index];
2154         tiPL = pL;
2155         for (k=a=0; k<m->numGammaCats; k++)
2156             {
2157             catStart = a;
2158             for (i=0; i<nObsStates; i++)
2159                 for (j=i; j<nStatesSquared; j+=nStates)
2160                     preLikeL[a++] = tiPL[j];
2161             for (b=1; b<nStates/nObsStates; b++)
2162                 {
2163                 a = catStart;
2164                 for (i=0; i<nObsStates; i++)
2165                     {
2166                     for (j=i+b*nObsStates; j<nStatesSquared; j+=nStates)
2167                         preLikeL[a++] += tiPL[j];
2168                     }
2169                 }
2170             /* for ambiguous */
2171             for (i=0; i<nStates; i++)
2172                 preLikeL[a++] = 1.0;
2173             tiPL += nStatesSquared;
2174             }
2175         }
2176
2177     /* find likelihoods of site patterns for right branch if terminal */
2178     if (p->right->left == NULL && m->isPartAmbig[p->right->index] == NO)
2179         {
2180         shortCut |= 2;
2181         rState = m->termState[p->right->index];
2182         tiPR = pR;
2183         for (k=a=0; k<m->numGammaCats; k++)
2184             {
2185             catStart = a;
2186             for (i=0; i<nObsStates; i++)
2187                 for (j=i; j<nStatesSquared; j+=nStates)
2188                     preLikeR[a++] = tiPR[j];
2189             for (b=1; b<nStates/nObsStates; b++)
2190                 {
2191                 a = catStart;
2192                 for (i=0; i<nObsStates; i++)
2193                     {
2194                     for (j=i+b*nObsStates; j<nStatesSquared; j+=nStates)
2195                         preLikeR[a++] += tiPR[j];
2196                     }
2197                 }
2198             /* for ambiguous */
2199             for (i=0; i<nStates; i++)
2200                 preLikeR[a++] = 1.0;
2201             tiPR += nStatesSquared;
2202             }
2203         }
2204
2205     /* find likelihoods of site patterns for anc branch, always terminal */
2206     if (m->isPartAmbig[p->anc->index] == YES)
2207         {
2208         shortCut = 4;
2209         }
2210     else
2211         {
2212         aState = m->termState[p->anc->index];
2213         tiPA = pA;
2214         for (k=a=0; k<m->numGammaCats; k++)
2215             {
2216             catStart = a;
2217             for (i=0; i<nObsStates; i++)
2218                 for (j=i; j<nStatesSquared; j+=nStates)
2219                     preLikeA[a++] = tiPA[j];
2220             for (b=1; b<nStates/nObsStates; b++)
2221                 {
2222                 a = catStart;
2223                 for (i=0; i<nObsStates; i++)
2224                     {
2225                     for (j=i+b*nObsStates; j<nStatesSquared; j+=nStates)
2226                         preLikeA[a++] += tiPA[j];
2227                     }
2228                 }
2229             /* for ambiguous */
2230             for (i=0; i<nStates; i++)
2231                 preLikeA[a++] = 1.0;
2232             tiPA += nStatesSquared;
2233             }
2234         }
2235 #   else
2236     shortCut = 4;
2237 #   endif
2238
2239         switch (shortCut)
2240         {
2241         case 4:
2242             tiPL = pL;
2243             tiPR = pR;
2244             tiPA = pA;
2245             for (k=0; k<m->numGammaCats; k++)
2246                 {
2247                 for (c=0; c<m->numSSEChars; c++)
2248                     {
2249                     for (i=h=0; i<nStates; i++)
2250                         {
2251                         mAcumL = _mm_setzero_ps();
2252                         mAcumR = _mm_setzero_ps();
2253                         mAcumA = _mm_setzero_ps();
2254                         for (j=0; j<nStates; j++)
2255                             {
2256                             mTiPL  = _mm_load1_ps (&tiPL[h]);
2257                             mTiPR  = _mm_load1_ps (&tiPR[h]);
2258                             mTiPA  = _mm_load1_ps (&tiPA[h++]);
2259                             mL     = _mm_mul_ps (mTiPL, clL[j]);
2260                             mR     = _mm_mul_ps (mTiPR, clR[j]);
2261                             mA     = _mm_mul_ps (mTiPA, clA[j]);
2262                             mAcumL = _mm_add_ps (mL, mAcumL);
2263                             mAcumR = _mm_add_ps (mR, mAcumR);
2264                             mAcumA = _mm_add_ps (mA, mAcumA);
2265                             }
2266                         mAcumL = _mm_mul_ps (mAcumL, mAcumR);
2267                         *(clP++) = _mm_mul_ps (mAcumL, mAcumA);
2268                         }
2269                     clL += nStates;
2270                     clR += nStates;
2271                     clA += nStates;
2272                     }
2273                 tiPL += nStatesSquared;
2274                 tiPR += nStatesSquared;
2275                 tiPA += nStatesSquared;
2276                 }
2277             break;
2278         case 0:
2279             tiPL =pL;
2280             tiPR =pR;
2281             for (k=0; k<m->numGammaCats; k++)
2282                 {
2283                 for (c=t=0; c<m->numSSEChars; c++)
2284                     {
2285                     for (c1=0; c1<FLOATS_PER_VEC; c1++,t++)
2286                         {
2287                         preLikeAV[c1] = &preLikeA[aState[t] + k*(preLikeJump+nStates)];
2288                         }
2289                     for (i=h=0; i<nStates; i++)
2290                         {
2291                         assert (FLOATS_PER_VEC == 4); /* In the following statment we assume that SSE register can hold exactly 4 ClFlts. */
2292                         mAcumA = _mm_set_ps (*(preLikeAV[3]++), *(preLikeAV[2]++), *(preLikeAV[1]++), *(preLikeAV[0]++));
2293                         mAcumL = _mm_setzero_ps();
2294                         mAcumR = _mm_setzero_ps();
2295                         for (j=0; j<nStates; j++)
2296                             {
2297                             mTiPL  = _mm_load1_ps (&tiPL[h]);
2298                             mL     = _mm_mul_ps (mTiPL, clL[j]);
2299                             mAcumL = _mm_add_ps (mL, mAcumL);
2300                             mTiPR  = _mm_load1_ps (&tiPR[h++]);
2301                             mR     = _mm_mul_ps (mTiPR, clR[j]);
2302                             mAcumR = _mm_add_ps (mR, mAcumR);
2303                             }
2304                         mAcumL = _mm_mul_ps (mAcumL, mAcumR);
2305                         *(clP++) = _mm_mul_ps (mAcumL, mAcumA);
2306                         }
2307                     clR += nStates;
2308                     clL += nStates;
2309                     }
2310                 tiPL += nStatesSquared;
2311                 tiPR += nStatesSquared;
2312                 }
2313             break;
2314         case 1:
2315             tiPR = pR;
2316             for (k=0; k<m->numGammaCats; k++)
2317                 {
2318                 for (c=t=0; c<m->numSSEChars; c++)
2319                     {
2320                     for (c1=0; c1<FLOATS_PER_VEC; c1++,t++)
2321                         {
2322                         preLikeLV[c1] = &preLikeL[lState[t] + k*(preLikeJump+nStates)];
2323                         preLikeAV[c1] = &preLikeA[aState[t] + k*(preLikeJump+nStates)];
2324                         }
2325                     for (i=h=0; i<nStates; i++)
2326                         {
2327                         assert (FLOATS_PER_VEC == 4); /* In the following statment we assume that SSE register can hold exactly 4 ClFlts. */
2328                         mAcumL = _mm_set_ps (*(preLikeLV[3]++), *(preLikeLV[2]++), *(preLikeLV[1]++), *(preLikeLV[0]++));
2329                         mAcumA = _mm_set_ps (*(preLikeAV[3]++), *(preLikeAV[2]++), *(preLikeAV[1]++), *(preLikeAV[0]++));
2330                         mAcumR = _mm_setzero_ps();
2331                         for (j=0; j<nStates; j++)
2332                             {
2333                             mTiPR  = _mm_load1_ps (&tiPR[h++]);
2334                             mR     = _mm_mul_ps (mTiPR, clR[j]);
2335                             mAcumR = _mm_add_ps (mR, mAcumR);
2336                             }
2337                         mAcumL = _mm_mul_ps (mAcumL, mAcumR);
2338                         *(clP++) = _mm_mul_ps (mAcumL, mAcumA);
2339                         }
2340                     clR += nStates;
2341                     }
2342                 tiPR += nStatesSquared;
2343                 }
2344             break;
2345         case 2:
2346             tiPL = pL;
2347             for (k=0; k<m->numGammaCats; k++)
2348                 {
2349                 for (c=t=0; c<m->numSSEChars; c++)
2350                     {
2351                     for (c1=0; c1<FLOATS_PER_VEC; c1++,t++)
2352                         {
2353                         preLikeRV[c1] = &preLikeR[rState[t] + k*(preLikeJump+nStates)];
2354                         preLikeAV[c1] = &preLikeA[aState[t] + k*(preLikeJump+nStates)];
2355                         }
2356                     for (i=h=0; i<nStates; i++)
2357                         {
2358                         assert (FLOATS_PER_VEC == 4); /* In the following statment we assume that SSE register can hold exactly 4 ClFlts. */
2359                         mAcumR = _mm_set_ps (*(preLikeRV[3]++), *(preLikeRV[2]++), *(preLikeRV[1]++), *(preLikeRV[0]++));
2360                         mAcumA = _mm_set_ps (*(preLikeAV[3]++), *(preLikeAV[2]++), *(preLikeAV[1]++), *(preLikeAV[0]++));
2361                         mAcumL = _mm_setzero_ps();
2362                         for (j=0; j<nStates; j++)
2363                             {
2364                             mTiPL  = _mm_load1_ps (&tiPL[h++]);
2365                             mL     = _mm_mul_ps (mTiPL, clL[j]);
2366                             mAcumL = _mm_add_ps (mL, mAcumL);
2367                             }
2368                         mAcumL = _mm_mul_ps (mAcumL, mAcumR);
2369                         *(clP++) = _mm_mul_ps (mAcumL,mAcumA);
2370                         }
2371                     clL += nStates;
2372                     }
2373                 tiPL += nStatesSquared;
2374                 }
2375             break;
2376         case 3:
2377             for (k=0; k<m->numGammaCats; k++)
2378                 {
2379                 for (c=t=0; c<m->numSSEChars; c++)
2380                     {
2381                     for (c1=0; c1<FLOATS_PER_VEC; c1++,t++)
2382                         {
2383                         preLikeRV[c1] = &preLikeR[rState[t] + k*(preLikeJump+nStates)];
2384                         preLikeLV[c1] = &preLikeL[lState[t] + k*(preLikeJump+nStates)];
2385                         preLikeAV[c1] = &preLikeA[aState[t] + k*(preLikeJump+nStates)];
2386                         }
2387                     for (i=0; i<nStates; i++)
2388                         {
2389                         assert (FLOATS_PER_VEC == 4); /* In the following 2 statments we assume that SSE register can hold exactly 4 ClFlts. */
2390                         mL = _mm_set_ps (*(preLikeLV[3]++), *(preLikeLV[2]++), *(preLikeLV[1]++), *(preLikeLV[0]++));
2391                         mR = _mm_set_ps (*(preLikeRV[3]++), *(preLikeRV[2]++), *(preLikeRV[1]++), *(preLikeRV[0]++));
2392                         mA = _mm_set_ps (*(preLikeAV[3]++), *(preLikeAV[2]++), *(preLikeAV[1]++), *(preLikeAV[0]++));
2393                         mL = _mm_mul_ps (mL,mR);
2394                         *(clP++) = _mm_mul_ps (mL,mA);
2395                         }
2396                     }
2397                 }
2398             break;
2399         }
2400
2401     return NO_ERROR;
2402 }
2403 #endif
2404
2405
2406 /*----------------------------------------------------------------
2407 |
2408 |   CondLikeRoot_Gen_GibbsGamma: general n-state model with rate
2409 |       variation modeled using a discrete gamma distribution with
2410 |       Gibbs resampling of rate categories
2411 |
2412 -----------------------------------------------------------------*/
2413 int CondLikeRoot_Gen_GibbsGamma (TreeNode *p, int division, int chain)
2414 {
2415     int             a, b, c, i, j, r, *rateCat, shortCut, *lState=NULL,
2416                     *rState=NULL, *aState=NULL, nObsStates, nStates,
2417                     nStatesSquared, nGammaCats;
2418     CLFlt           likeL, likeR, likeA, *clL, *clR, *clP, *clA, *pL, *pR, *pA,
2419                     *tiPL, *tiPR, *tiPA;
2420     ModelInfo       *m;
2421 #   if !defined (DEBUG_NOSHORTCUTS)
2422     int k, catStart;
2423 #endif
2424
2425     /* find model settings for this division and nStates, nStatesSquared */
2426     m = &modelSettings[division];
2427     nObsStates = m->numStates;
2428     nStates = m->numModelStates;
2429     nStatesSquared = nStates * nStates;
2430
2431     /* flip conditional likelihood space */
2432     FlipCondLikeSpace (m, chain, p->index);
2433
2434     /* find conditional likelihood pointers */
2435     clL = m->condLikes[m->condLikeIndex[chain][p->left->index ]];
2436     clR = m->condLikes[m->condLikeIndex[chain][p->right->index]];
2437     clP = m->condLikes[m->condLikeIndex[chain][p->index       ]];
2438     clA = m->condLikes[m->condLikeIndex[chain][p->anc->index  ]];
2439
2440     /* find transition probabilities (or calculate instead) */
2441     pL = m->tiProbs[m->tiProbsIndex[chain][p->left->index ]];
2442     pR = m->tiProbs[m->tiProbsIndex[chain][p->right->index]];
2443     pA = m->tiProbs[m->tiProbsIndex[chain][p->index       ]];
2444
2445     /* find rate category index and number of gamma categories */
2446     rateCat = m->tiIndex + chain * m->numChars;
2447     nGammaCats = m->numGammaCats;
2448
2449     /* find likelihoods of site patterns for left branch if terminal */
2450     shortCut = 0;
2451 #   if !defined (DEBUG_NOSHORTCUTS)
2452     if (p->left->left == NULL && m->isPartAmbig[p->left->index] == NO)
2453         {
2454         shortCut |= 1;
2455         lState = m->termState[p->left->index];
2456         tiPL = pL;
2457         for (k=a=0; k<nGammaCats; k++)
2458             {
2459             catStart = a;
2460             for (i=0; i<nObsStates; i++)
2461                 for (j=i; j<nStatesSquared; j+=nStates)
2462                     preLikeL[a++] = tiPL[j];
2463             for (b=1; b<nStates/nObsStates; b++)
2464                 {
2465                 a = catStart;
2466                 for (i=0; i<nObsStates; i++)
2467                     {
2468                     for (j=i+b*nObsStates; j<nStatesSquared; j+=nStates)
2469                         preLikeL[a++] += tiPL[j];
2470                     }
2471                 }
2472             /* for ambiguous */
2473             for (i=0; i<nStates; i++)
2474                 preLikeL[a++] = 1.0;
2475             tiPL += nStatesSquared;
2476             }
2477         }
2478
2479     /* find likelihoods of site patterns for right branch if terminal */
2480     if (p->right->left == NULL && m->isPartAmbig[p->right->index] == NO)
2481         {
2482         shortCut |= 2;
2483         rState = m->termState[p->right->index];
2484         tiPR = pR;
2485         for (k=a=0; k<nGammaCats; k++)
2486             {
2487             catStart = a;
2488             for (i=0; i<nObsStates; i++)
2489                 for (j=i; j<nStatesSquared; j+=nStates)
2490                     preLikeR[a++] = tiPR[j];
2491             for (b=1; b<nStates/nObsStates; b++)
2492                 {
2493                 a = catStart;
2494                 for (i=0; i<nObsStates; i++)
2495                     {
2496                     for (j=i+b*nObsStates; j<nStatesSquared; j+=nStates)
2497                         preLikeR[a++] += tiPR[j];
2498                     }
2499                 }
2500             /* for ambiguous */
2501             for (i=0; i<nStates; i++)
2502                 preLikeR[a++] = 1.0;
2503             tiPR += nStatesSquared;
2504             }
2505         }
2506
2507     /* find likelihoods of site patterns for anc branch, always terminal */
2508     if (m->isPartAmbig[p->anc->index] == YES)
2509         {
2510         shortCut = 4;
2511         }
2512     else
2513         {
2514         aState = m->termState[p->anc->index];
2515         tiPA = pA;
2516         for (k=a=0; k<nGammaCats; k++)
2517             {
2518             catStart = a;
2519             for (i=0; i<nObsStates; i++)
2520                 for (j=i; j<nStatesSquared; j+=nStates)
2521                     preLikeA[a++] = tiPA[j];
2522             for (b=1; b<nStates/nObsStates; b++)
2523                 {
2524                 a = catStart;
2525                 for (i=0; i<nObsStates; i++)
2526                     {
2527                     for (j=i+b*nObsStates; j<nStatesSquared; j+=nStates)
2528                         preLikeA[a++] += tiPA[j];
2529                     }
2530                 }
2531             /* for ambiguous */
2532             for (i=0; i<nStates; i++)
2533                 preLikeA[a++] = 1.0;
2534             tiPA += nStatesSquared;
2535             }
2536         }
2537 #   else
2538     shortCut = 4;
2539 #   endif
2540
2541     switch (shortCut)
2542         {
2543     case 4:
2544         for (c=0; c<m->numChars; c++)
2545             {
2546             r = (*rateCat++);
2547             if (r < nGammaCats)
2548                 {
2549                 tiPL = pL + r*nStatesSquared;
2550                 tiPR = pR + r*nStatesSquared;
2551                 tiPA = pA + r*nStatesSquared;
2552                 for (i=0; i<nStates; i++)
2553                     {
2554                     likeL = likeR = likeA = 0.0;
2555                     for (j=0; j<nStates; j++)
2556                         {
2557                         likeL += (*tiPL++) * clL[j];
2558                         likeR += (*tiPR++) * clR[j];
2559                         likeA += (*tiPA++) * clA[j];
2560                         }
2561                     *(clP++) = likeL * likeR * likeA;
2562                     }
2563                 }
2564             else
2565                 clP += nStates;
2566             clL += nStates;
2567             clR += nStates;
2568             clA += nStates;
2569             }
2570         break;
2571     case 0:
2572     case 3:
2573         for (c=0; c<m->numChars; c++)
2574             {
2575             r = (*rateCat++);
2576             if (r < nGammaCats)
2577                 {
2578                 tiPL = pL + r*nStatesSquared;
2579                 tiPR = pR + r*nStatesSquared;
2580                 a = aState[c] + r*(nStatesSquared+nStates);
2581                 for (i=0; i<nStates; i++)
2582                     {
2583                     likeL = likeR = 0.0;
2584                     for (j=0; j<nStates; j++)
2585                         {
2586                         likeL += (*tiPL++) * clL[j];
2587                         likeR += (*tiPR++) * clR[j];
2588                         }
2589                     *(clP++) = likeL * likeR * preLikeA[a++];
2590                     }
2591                 }
2592             else
2593                 clP += nStates;
2594             clL += nStates;
2595             clR += nStates;
2596             }
2597         break;
2598     case 1:
2599         for (c=0; c<m->numChars; c++)
2600             {
2601             r = (*rateCat++);
2602             if (r < nGammaCats)
2603                 {
2604                 tiPR = pR + r*nStatesSquared;
2605                 a = lState[c] + r*(nStatesSquared+nStates);
2606                 b = aState[c] + r*(nStatesSquared+nStates);
2607                 for (i=0; i<nStates; i++)
2608                     {
2609                     likeR = 0.0;
2610                     for (j=0; j<nStates; j++)
2611                         {
2612                         likeR += (*tiPR++) * clR[j];
2613                         }
2614                     *(clP++) = preLikeL[a++] * likeR * preLikeA[b++];
2615                     }
2616                 }
2617             else
2618                 clP += nStates;
2619             clR += nStates;
2620             }
2621         break;
2622     case 2:
2623         for (c=0; c<m->numChars; c++)
2624             {
2625             r = (*rateCat++);
2626             if (r < nGammaCats)
2627                 {
2628                 tiPL = pL + r*nStatesSquared;
2629                 a = rState[c] + r*(nStatesSquared+nStates);
2630                 b = aState[c] + r*(nStatesSquared+nStates);
2631                 for (i=0; i<nStates; i++)
2632                     {
2633                     likeL = 0.0;
2634                     for (j=0; j<nStates; j++)
2635                         {
2636                         likeL += (*tiPL++) * clL[j];
2637                         }
2638                     *(clP++) = likeL * preLikeR[a++] * preLikeA[b++];
2639                     }
2640                 }
2641             else
2642                 clP += nStates;
2643             clL += nStates;
2644             }
2645         break;
2646         }
2647
2648     return NO_ERROR;
2649 }
2650
2651
2652 /*----------------------------------------------------------------
2653 |
2654 |   CondLikeRoot_NUC4: 4by4 nucleotide model with or without rate
2655 |       variation
2656 |
2657 -----------------------------------------------------------------*/
2658 int CondLikeRoot_NUC4 (TreeNode *p, int division, int chain)
2659 {
2660     int             a, c, h, i, j, k, shortCut, *lState=NULL, *rState=NULL, *aState=NULL;
2661     CLFlt           *clL, *clR, *clP, *clA, *pL, *pR, *pA, *tiPL, *tiPR, *tiPA;
2662     ModelInfo       *m;
2663
2664     m = &modelSettings[division];
2665
2666     /* flip state of node so that we are not overwriting old cond likes */
2667     FlipCondLikeSpace (m, chain, p->index);
2668
2669     /* find conditional likelihood pointers */
2670     clL = m->condLikes[m->condLikeIndex[chain][p->left->index ]];
2671     clR = m->condLikes[m->condLikeIndex[chain][p->right->index]];
2672     clP = m->condLikes[m->condLikeIndex[chain][p->index       ]];
2673     clA = m->condLikes[m->condLikeIndex[chain][p->anc->index  ]];
2674
2675     /* find transition probabilities (or calculate instead) */
2676     pL = m->tiProbs[m->tiProbsIndex[chain][p->left->index ]];
2677     pR = m->tiProbs[m->tiProbsIndex[chain][p->right->index]];
2678     pA = m->tiProbs[m->tiProbsIndex[chain][p->index       ]];
2679
2680     /* find likelihoods of site patterns for left branch if terminal */
2681     shortCut = 0;
2682 #   if !defined (DEBUG_NOSHORTCUTS)
2683     if (p->left->left == NULL && m->isPartAmbig[p->left->index] == NO)
2684         {
2685         shortCut |= 1;
2686         lState = m->termState[p->left->index];
2687         tiPL = pL;
2688         for (k=j=0; k<m->numGammaCats; k++)
2689             {
2690             for (i=0; i<4; i++)
2691                 {
2692                 preLikeL[j++] = tiPL[0];
2693                 preLikeL[j++] = tiPL[4];
2694                 preLikeL[j++] = tiPL[8];
2695                 preLikeL[j++] = tiPL[12];
2696                 tiPL++;
2697                 }
2698             /* for ambiguous */
2699             for (i=0; i<4; i++)
2700                 preLikeL[j++] = 1.0;
2701             tiPL += 12;
2702             }
2703         }
2704
2705     /* find likelihoods of site patterns for right branch if terminal */
2706     if (p->right->left == NULL && m->isPartAmbig[p->right->index] == NO)
2707         {
2708         shortCut |= 2;
2709         rState = m->termState[p->right->index];
2710         tiPR = pR;
2711         for (k=j=0; k<m->numGammaCats; k++)
2712             {
2713             for (i=0; i<4; i++)
2714                 {
2715                 preLikeR[j++] = tiPR[0];
2716                 preLikeR[j++] = tiPR[4];
2717                 preLikeR[j++] = tiPR[8];
2718                 preLikeR[j++] = tiPR[12];
2719                 tiPR++;
2720                 }
2721             /* for ambiguous */
2722             for (i=0; i<4; i++)
2723                 preLikeR[j++] = 1.0;
2724             tiPR += 12;
2725             }
2726         }
2727
2728     /* find likelihoods of site patterns for anc branch, always terminal */
2729     if (m->isPartAmbig[p->anc->index] == YES)
2730         {
2731         shortCut = 4;
2732         }
2733     else
2734         {
2735         aState = m->termState[p->anc->index];
2736         tiPA = pA;
2737         for (k=j=0; k<m->numGammaCats; k++)
2738             {
2739             for (i=0; i<4; i++)
2740                 {
2741                 preLikeA[j++] = tiPA[0];
2742                 preLikeA[j++] = tiPA[4];
2743                 preLikeA[j++] = tiPA[8];
2744                 preLikeA[j++] = tiPA[12];
2745                 tiPA++;
2746                 }
2747             /* for ambiguous */
2748             for (i=0; i<4; i++)
2749                 preLikeA[j++] = 1.0;
2750             tiPA += 12;
2751             }
2752         }
2753 #   else
2754     shortCut = 4;
2755 #   endif
2756
2757     switch (shortCut)
2758         {
2759     case 4:
2760         tiPL = pL;
2761         tiPR = pR;
2762         tiPA = pA;
2763         for (k=h=0; k<m->numGammaCats; k++)
2764             {
2765             for (c=0; c<m->numChars; c++)
2766                 {
2767                 clP[h++] =   (tiPL[AA]*clL[A] + tiPL[AC]*clL[C] + tiPL[AG]*clL[G] + tiPL[AT]*clL[T])
2768                             *(tiPR[AA]*clR[A] + tiPR[AC]*clR[C] + tiPR[AG]*clR[G] + tiPR[AT]*clR[T])
2769                             *(tiPA[AA]*clA[A] + tiPA[AC]*clA[C] + tiPA[AG]*clA[G] + tiPA[AT]*clA[T]);
2770                 clP[h++] =   (tiPL[CA]*clL[A] + tiPL[CC]*clL[C] + tiPL[CG]*clL[G] + tiPL[CT]*clL[T])
2771                             *(tiPR[CA]*clR[A] + tiPR[CC]*clR[C] + tiPR[CG]*clR[G] + tiPR[CT]*clR[T])
2772                             *(tiPA[CA]*clA[A] + tiPA[CC]*clA[C] + tiPA[CG]*clA[G] + tiPA[CT]*clA[T]);
2773                 clP[h++] =   (tiPL[GA]*clL[A] + tiPL[GC]*clL[C] + tiPL[GG]*clL[G] + tiPL[GT]*clL[T])
2774                             *(tiPR[GA]*clR[A] + tiPR[GC]*clR[C] + tiPR[GG]*clR[G] + tiPR[GT]*clR[T])
2775                             *(tiPA[GA]*clA[A] + tiPA[GC]*clA[C] + tiPA[GG]*clA[G] + tiPA[GT]*clA[T]);
2776                 clP[h++] =   (tiPL[TA]*clL[A] + tiPL[TC]*clL[C] + tiPL[TG]*clL[G] + tiPL[TT]*clL[T])
2777                             *(tiPR[TA]*clR[A] + tiPR[TC]*clR[C] + tiPR[TG]*clR[G] + tiPR[TT]*clR[T])
2778                             *(tiPA[TA]*clA[A] + tiPA[TC]*clA[C] + tiPA[TG]*clA[G] + tiPA[TT]*clA[T]);
2779                 clL += 4;
2780                 clR += 4;
2781                 clA += 4;
2782                 }
2783             tiPL += 16;
2784             tiPR += 16;
2785             tiPA += 16;
2786             }
2787         break;
2788
2789     case 0:
2790         tiPL = pL;
2791         tiPR = pR;
2792         for (k=h=0; k<m->numGammaCats; k++)
2793             {
2794             for (c=0; c<m->numChars; c++)
2795                 {
2796                 i = aState[c] + k*20;
2797                 clP[h++] =   (tiPL[AA]*clL[A] + tiPL[AC]*clL[C] + tiPL[AG]*clL[G] + tiPL[AT]*clL[T])
2798                             *(tiPR[AA]*clR[A] + tiPR[AC]*clR[C] + tiPR[AG]*clR[G] + tiPR[AT]*clR[T])
2799                             *preLikeA[i++];
2800                 clP[h++] =   (tiPL[CA]*clL[A] + tiPL[CC]*clL[C] + tiPL[CG]*clL[G] + tiPL[CT]*clL[T])
2801                             *(tiPR[CA]*clR[A] + tiPR[CC]*clR[C] + tiPR[CG]*clR[G] + tiPR[CT]*clR[T])
2802                             *preLikeA[i++];
2803                 clP[h++] =   (tiPL[GA]*clL[A] + tiPL[GC]*clL[C] + tiPL[GG]*clL[G] + tiPL[GT]*clL[T])
2804                             *(tiPR[GA]*clR[A] + tiPR[GC]*clR[C] + tiPR[GG]*clR[G] + tiPR[GT]*clR[T])
2805                             *preLikeA[i++];
2806                 clP[h++] =   (tiPL[TA]*clL[A] + tiPL[TC]*clL[C] + tiPL[TG]*clL[G] + tiPL[TT]*clL[T])
2807                             *(tiPR[TA]*clR[A] + tiPR[TC]*clR[C] + tiPR[TG]*clR[G] + tiPR[TT]*clR[T])
2808                             *preLikeA[i++];
2809                 clL += 4;
2810                 clR += 4;
2811                 }
2812             tiPL += 16;
2813             tiPR += 16;
2814             }
2815         break;
2816
2817     case 1:
2818         tiPR = pR;
2819         for (k=h=0; k<m->numGammaCats; k++)
2820             {
2821             for (c=0; c<m->numChars; c++)
2822                 {
2823                 i = lState[c] + k*20;
2824                 j = aState[c] + k*20;
2825                 clP[h++] =   (tiPR[AA]*clR[A] + tiPR[AC]*clR[C] + tiPR[AG]*clR[G] + tiPR[AT]*clR[T])
2826                             *preLikeL[i++]*preLikeA[j++];
2827                 clP[h++] =   (tiPR[CA]*clR[A] + tiPR[CC]*clR[C] + tiPR[CG]*clR[G] + tiPR[CT]*clR[T])
2828                             *preLikeL[i++]*preLikeA[j++];
2829                 clP[h++] =   (tiPR[GA]*clR[A] + tiPR[GC]*clR[C] + tiPR[GG]*clR[G] + tiPR[GT]*clR[T])
2830                             *preLikeL[i++]*preLikeA[j++];
2831                 clP[h++] =   (tiPR[TA]*clR[A] + tiPR[TC]*clR[C] + tiPR[TG]*clR[G] + tiPR[TT]*clR[T])
2832                             *preLikeL[i++]*preLikeA[j++];
2833                 clR += 4;
2834                 }
2835             tiPR += 16;
2836             }
2837         break;
2838
2839     case 2:
2840         tiPL = pL;
2841         for (k=h=0; k<m->numGammaCats; k++)
2842             {
2843             for (c=0; c<m->numChars; c++)
2844                 {
2845                 i = rState[c] + k*20;
2846                 j = aState[c] + k*20;
2847                 clP[h++] =   (tiPL[AA]*clL[A] + tiPL[AC]*clL[C] + tiPL[AG]*clL[G] + tiPL[AT]*clL[T])
2848                             *preLikeR[i++]*preLikeA[j++];
2849                 clP[h++] =   (tiPL[CA]*clL[A] + tiPL[CC]*clL[C] + tiPL[CG]*clL[G] + tiPL[CT]*clL[T])
2850                             *preLikeR[i++]*preLikeA[j++];
2851                 clP[h++] =   (tiPL[GA]*clL[A] + tiPL[GC]*clL[C] + tiPL[GG]*clL[G] + tiPL[GT]*clL[T])
2852                             *preLikeR[i++]*preLikeA[j++];
2853                 clP[h++] =   (tiPL[TA]*clL[A] + tiPL[TC]*clL[C] + tiPL[TG]*clL[G] + tiPL[TT]*clL[T])
2854                             *preLikeR[i++]*preLikeA[j++];
2855                 clL += 4;
2856                 }
2857             tiPL += 16;
2858             }
2859         break;
2860
2861     case 3:
2862         for (k=h=0; k<m->numGammaCats; k++)
2863             {
2864             for (c=0; c<m->numChars; c++)
2865                 {
2866                 a = lState[c] + k*20;
2867                 i = rState[c] + k*20;
2868                 j = aState[c] + k*20;
2869                 clP[h++] =   preLikeL[a++]*preLikeR[i++]*preLikeA[j++];
2870                 clP[h++] =   preLikeL[a++]*preLikeR[i++]*preLikeA[j++];
2871                 clP[h++] =   preLikeL[a++]*preLikeR[i++]*preLikeA[j++];
2872                 clP[h++] =   preLikeL[a++]*preLikeR[i++]*preLikeA[j++];
2873                 }
2874             }
2875         break;
2876         }
2877
2878     return NO_ERROR;
2879 }
2880
2881
2882 /*----------------------------------------------------------------
2883 |
2884 |   CondLikeRoot_NUC4_GibbsGamma: 4by4 nucleotide model with rate
2885 |       variation approimated by Gibbs sampling from gamma
2886 |
2887 -----------------------------------------------------------------*/
2888 int CondLikeRoot_NUC4_GibbsGamma (TreeNode *p, int division, int chain)
2889 {
2890     int             c, h, i, j, r, *rateCat, shortCut, *lState=NULL, *rState=NULL, *aState=NULL,
2891                     nGammaCats;
2892     CLFlt           *clL, *clR, *clP, *clA, *pL, *pR, *pA, *tiPL, *tiPR, *tiPA;
2893     ModelInfo       *m;
2894 #   if !defined (DEBUG_NOSHORTCUTS)
2895     int k;
2896 #   endif
2897
2898     m = &modelSettings[division];
2899
2900     /* flip conditional likelihood space */
2901     FlipCondLikeSpace (m, chain, p->index);
2902
2903         /* find conditional likelihood pointers */
2904     clL = m->condLikes[m->condLikeIndex[chain][p->left->index ]];
2905     clR = m->condLikes[m->condLikeIndex[chain][p->right->index]];
2906     clP = m->condLikes[m->condLikeIndex[chain][p->index       ]];
2907     clA = m->condLikes[m->condLikeIndex[chain][p->anc->index  ]];
2908
2909     /* find transition probabilities (or calculate instead) */
2910     pL = m->tiProbs[m->tiProbsIndex[chain][p->left->index ]];
2911     pR = m->tiProbs[m->tiProbsIndex[chain][p->right->index]];
2912     pA = m->tiProbs[m->tiProbsIndex[chain][p->index       ]];
2913
2914     /* find rate category index and number of gamma categories */
2915     rateCat = m->tiIndex + chain * m->numChars;
2916     nGammaCats = m->numGammaCats;
2917
2918     /* find likelihoods of site patterns for left branch if terminal */
2919     shortCut = 0;
2920 #   if !defined (DEBUG_NOSHORTCUTS)
2921     if (p->left->left == NULL && m->isPartAmbig[p->left->index] == NO)
2922         {
2923         shortCut |= 1;
2924         lState = m->termState[p->left->index];
2925         tiPL = pL;
2926         for (k=j=0; k<nGammaCats; k++)
2927             {
2928             for (i=0; i<4; i++)
2929                 {
2930                 preLikeL[j++] = tiPL[0];
2931                 preLikeL[j++] = tiPL[4];
2932                 preLikeL[j++] = tiPL[8];
2933                 preLikeL[j++] = tiPL[12];
2934                 tiPL++;
2935                 }
2936             /* for ambiguous */
2937             for (i=0; i<4; i++)
2938                 preLikeL[j++] = 1.0;
2939             tiPL += 12;
2940             }
2941         }
2942
2943     /* find likelihoods of site patterns for right branch if terminal */
2944     if (p->right->left == NULL && m->isPartAmbig[p->right->index] == NO)
2945         {
2946         shortCut |= 2;
2947         rState = m->termState[p->right->index];
2948         tiPR = pR;
2949         for (k=j=0; k<nGammaCats; k++)
2950             {
2951             for (i=0; i<4; i++)
2952                 {
2953                 preLikeR[j++] = tiPR[0];
2954                 preLikeR[j++] = tiPR[4];
2955                 preLikeR[j++] = tiPR[8];
2956                 preLikeR[j++] = tiPR[12];
2957                 tiPR++;
2958                 }
2959             /* for ambiguous */
2960             for (i=0; i<4; i++)
2961                 preLikeR[j++] = 1.0;
2962             tiPR += 12;
2963             }
2964         }
2965
2966     /* find likelihoods of site patterns for anc branch, always terminal */
2967     if (m->isPartAmbig[p->anc->index] == YES)
2968         {
2969         shortCut = 4;
2970         }
2971     else
2972         {
2973         aState = m->termState[p->anc->index];
2974         tiPA = pA;
2975         for (k=j=0; k<nGammaCats; k++)
2976             {
2977             for (i=0; i<4; i++)
2978                 {
2979                 preLikeA[j++] = tiPA[0];
2980                 preLikeA[j++] = tiPA[4];
2981                 preLikeA[j++] = tiPA[8];
2982                 preLikeA[j++] = tiPA[12];
2983                 tiPA++;
2984                 }
2985             /* for ambiguous */
2986             for (i=0; i<4; i++)
2987                 preLikeA[j++] = 1.0;
2988             tiPA += 12;
2989             }
2990         }
2991 #   else
2992     shortCut = 4;
2993 #   endif
2994
2995     switch (shortCut)
2996         {
2997     case 4:
2998         for (c=h=0; c<m->numChars; c++)
2999             {
3000             r = rateCat[c];
3001             if (r < nGammaCats)
3002                 {
3003                 tiPL = pL + r * 16;
3004                 tiPR = pR + r * 16;
3005                 tiPA = pA + r * 16;
3006                 clP[h++] =   (tiPL[AA]*clL[A] + tiPL[AC]*clL[C] + tiPL[AG]*clL[G] + tiPL[AT]*clL[T])
3007                             *(tiPR[AA]*clR[A] + tiPR[AC]*clR[C] + tiPR[AG]*clR[G] + tiPR[AT]*clR[T])
3008                             *(tiPA[AA]*clA[A] + tiPA[AC]*clA[C] + tiPA[AG]*clA[G] + tiPA[AT]*clA[T]);
3009                 clP[h++] =   (tiPL[CA]*clL[A] + tiPL[CC]*clL[C] + tiPL[CG]*clL[G] + tiPL[CT]*clL[T])
3010                             *(tiPR[CA]*clR[A] + tiPR[CC]*clR[C] + tiPR[CG]*clR[G] + tiPR[CT]*clR[T])
3011                             *(tiPA[CA]*clA[A] + tiPA[CC]*clA[C] + tiPA[CG]*clA[G] + tiPA[CT]*clA[T]);
3012                 clP[h++] =   (tiPL[GA]*clL[A] + tiPL[GC]*clL[C] + tiPL[GG]*clL[G] + tiPL[GT]*clL[T])
3013                             *(tiPR[GA]*clR[A] + tiPR[GC]*clR[C] + tiPR[GG]*clR[G] + tiPR[GT]*clR[T])
3014                             *(tiPA[GA]*clA[A] + tiPA[GC]*clA[C] + tiPA[GG]*clA[G] + tiPA[GT]*clA[T]);
3015                 clP[h++] =   (tiPL[TA]*clL[A] + tiPL[TC]*clL[C] + tiPL[TG]*clL[G] + tiPL[TT]*clL[T])
3016                             *(tiPR[TA]*clR[A] + tiPR[TC]*clR[C] + tiPR[TG]*clR[G] + tiPR[TT]*clR[T])
3017                             *(tiPA[TA]*clA[A] + tiPA[TC]*clA[C] + tiPA[TG]*clA[G] + tiPA[TT]*clA[T]);
3018                 }
3019             else
3020                 h += 4;
3021             clL += 4;
3022             clR += 4;
3023             clA += 4;
3024             }
3025         break;
3026
3027     case 0:
3028     case 3:
3029         for (c=h=0; c<m->numChars; c++)
3030             {
3031             r = rateCat[c];
3032             if (r < nGammaCats)
3033                 {
3034                 tiPL = pL + r * 16;
3035                 tiPR = pR + r * 16;
3036                 i = aState[c] + r * 20;
3037                 clP[h++] =   (tiPL[AA]*clL[A] + tiPL[AC]*clL[C] + tiPL[AG]*clL[G] + tiPL[AT]*clL[T])
3038                             *(tiPR[AA]*clR[A] + tiPR[AC]*clR[C] + tiPR[AG]*clR[G] + tiPR[AT]*clR[T])
3039                             *preLikeA[i++];
3040                 clP[h++] =   (tiPL[CA]*clL[A] + tiPL[CC]*clL[C] + tiPL[CG]*clL[G] + tiPL[CT]*clL[T])
3041                             *(tiPR[CA]*clR[A] + tiPR[CC]*clR[C] + tiPR[CG]*clR[G] + tiPR[CT]*clR[T])
3042                             *preLikeA[i++];
3043                 clP[h++] =   (tiPL[GA]*clL[A] + tiPL[GC]*clL[C] + tiPL[GG]*clL[G] + tiPL[GT]*clL[T])
3044                             *(tiPR[GA]*clR[A] + tiPR[GC]*clR[C] + tiPR[GG]*clR[G] + tiPR[GT]*clR[T])
3045                             *preLikeA[i++];
3046                 clP[h++] =   (tiPL[TA]*clL[A] + tiPL[TC]*clL[C] + tiPL[TG]*clL[G] + tiPL[TT]*clL[T])
3047                             *(tiPR[TA]*clR[A] + tiPR[TC]*clR[C] + tiPR[TG]*clR[G] + tiPR[TT]*clR[T])
3048                             *preLikeA[i++];
3049                 }
3050             else
3051                 h += 4;
3052             clL += 4;
3053             clR += 4;
3054             }
3055         break;
3056
3057     case 1:
3058         for (c=h=0; c<m->numChars; c++)
3059             {
3060             r = rateCat[c];
3061             if (r < nGammaCats)
3062                 {
3063                 tiPR = pR + r * 16;
3064                 i = lState[c] + r * 20;
3065                 j = aState[c] + r * 20;
3066                 clP[h++] =   (tiPR[AA]*clR[A] + tiPR[AC]*clR[C] + tiPR[AG]*clR[G] + tiPR[AT]*clR[T])
3067                             *preLikeL[i++]*preLikeA[j++];
3068                 clP[h++] =   (tiPR[CA]*clR[A] + tiPR[CC]*clR[C] + tiPR[CG]*clR[G] + tiPR[CT]*clR[T])
3069                             *preLikeL[i++]*preLikeA[j++];
3070                 clP[h++] =   (tiPR[GA]*clR[A] + tiPR[GC]*clR[C] + tiPR[GG]*clR[G] + tiPR[GT]*clR[T])
3071                             *preLikeL[i++]*preLikeA[j++];
3072                 clP[h++] =   (tiPR[TA]*clR[A] + tiPR[TC]*clR[C] + tiPR[TG]*clR[G] + tiPR[TT]*clR[T])
3073                             *preLikeL[i++]*preLikeA[j++];
3074                 }
3075             else
3076                 h += 4;
3077             clR += 4;
3078             }
3079         break;
3080
3081     case 2:
3082         for (c=h=0; c<m->numChars; c++)
3083             {
3084             r = rateCat[c];
3085             if (r < nGammaCats)
3086                 {
3087                 tiPL = pL + r * 16;
3088                 i = rState[c] + r * 20;
3089                 j = aState[c] + r * 20;
3090                 clP[h++] =   (tiPL[AA]*clL[A] + tiPL[AC]*clL[C] + tiPL[AG]*clL[G] + tiPL[AT]*clL[T])
3091                             *preLikeR[i++]*preLikeA[j++];
3092                 clP[h++] =   (tiPL[CA]*clL[A] + tiPL[CC]*clL[C] + tiPL[CG]*clL[G] + tiPL[CT]*clL[T])
3093                             *preLikeR[i++]*preLikeA[j++];
3094                 clP[h++] =   (tiPL[GA]*clL[A] + tiPL[GC]*clL[C] + tiPL[GG]*clL[G] + tiPL[GT]*clL[T])
3095                             *preLikeR[i++]*preLikeA[j++];
3096                 clP[h++] =   (tiPL[TA]*clL[A] + tiPL[TC]*clL[C] + tiPL[TG]*clL[G] + tiPL[TT]*clL[T])
3097                             *preLikeR[i++]*preLikeA[j++];
3098                 }
3099             else
3100                 h += 4;
3101             clL += 4;
3102             }
3103         break;
3104         }
3105
3106     return NO_ERROR;
3107 }
3108
3109
3110 #if defined (SSE_ENABLED)
3111 /*----------------------------------------------------------------
3112 |
3113 |   CondLikeRoot_NUC4_SSE: 4by4 nucleotide model with or without rate
3114 |       variation using SSE instructions
3115 |
3116 -----------------------------------------------------------------*/
3117 int CondLikeRoot_NUC4_SSE (TreeNode *p, int division, int chain)
3118 {
3119     int             c, k;
3120     CLFlt           *pL, *pR, *pA, *tiPL, *tiPR, *tiPA;
3121     __m128          *clL, *clR, *clP, *clA;
3122     __m128          m1, m2, m3, m4, m5, m6, m7, m8, m9;
3123     ModelInfo       *m;
3124
3125     m = &modelSettings[division];
3126
3127     /* flip state of node so that we are not overwriting old cond likes */
3128     FlipCondLikeSpace (m, chain, p->index);
3129
3130     /* find conditional likelihood pointers */
3131     clL = (__m128 *) m->condLikes[m->condLikeIndex[chain][p->left->index ]];
3132     clR = (__m128 *) m->condLikes[m->condLikeIndex[chain][p->right->index]];
3133     clP = (__m128 *) m->condLikes[m->condLikeIndex[chain][p->index       ]];
3134     clA = (__m128 *) m->condLikes[m->condLikeIndex[chain][p->anc->index  ]];
3135
3136     /* find transition probabilities */
3137     pL = m->tiProbs[m->tiProbsIndex[chain][p->left->index ]];
3138     pR = m->tiProbs[m->tiProbsIndex[chain][p->right->index]];
3139     pA = m->tiProbs[m->tiProbsIndex[chain][p->index       ]];
3140
3141     tiPL = pL;
3142     tiPR = pR;
3143     tiPA = pA;
3144     for (k=0; k<m->numGammaCats; k++)
3145         {
3146         for (c=0; c<m->numSSEChars; c++)
3147             {
3148             m1 = _mm_load1_ps (&tiPL[AA]);
3149             m2 = _mm_load1_ps (&tiPR[AA]);
3150             m3 = _mm_load1_ps (&tiPA[AA]);
3151             m7 = _mm_mul_ps (m1, clL[A]);
3152             m8 = _mm_mul_ps (m2, clR[A]);
3153             m9 = _mm_mul_ps (m3, clA[A]);
3154
3155             m1 = _mm_load1_ps (&tiPL[AC]);
3156             m2 = _mm_load1_ps (&tiPR[AC]);
3157             m3 = _mm_load1_ps (&tiPA[AC]);
3158             m4 = _mm_mul_ps (m1, clL[C]);
3159             m5 = _mm_mul_ps (m2, clR[C]);
3160             m6 = _mm_mul_ps (m3, clA[C]);
3161             m7 = _mm_add_ps (m4, m7);
3162             m8 = _mm_add_ps (m5, m8);
3163             m9 = _mm_add_ps (m6, m9);
3164
3165             m1 = _mm_load1_ps (&tiPL[AG]);
3166             m2 = _mm_load1_ps (&tiPR[AG]);
3167             m3 = _mm_load1_ps (&tiPA[AG]);
3168             m4 = _mm_mul_ps (m1, clL[G]);
3169             m5 = _mm_mul_ps (m2, clR[G]);
3170             m6 = _mm_mul_ps (m3, clA[G]);
3171             m7 = _mm_add_ps (m4, m7);
3172             m8 = _mm_add_ps (m5, m8);
3173             m9 = _mm_add_ps (m6, m9);
3174
3175             m1 = _mm_load1_ps (&tiPL[AT]);
3176             m2 = _mm_load1_ps (&tiPR[AT]);
3177             m3 = _mm_load1_ps (&tiPA[AT]);
3178             m4 = _mm_mul_ps (m1, clL[T]);
3179             m5 = _mm_mul_ps (m2, clR[T]);
3180             m6 = _mm_mul_ps (m3, clA[T]);
3181             m7 = _mm_add_ps (m4, m7);
3182             m8 = _mm_add_ps (m5, m8);
3183             m9 = _mm_add_ps (m6, m9);
3184
3185             m7 = _mm_mul_ps (m7, m8);
3186             *clP++ = _mm_mul_ps (m7, m9);
3187
3188             m1 = _mm_load1_ps (&tiPL[CA]);
3189             m2 = _mm_load1_ps (&tiPR[CA]);
3190             m3 = _mm_load1_ps (&tiPA[CA]);
3191             m7 = _mm_mul_ps (m1, clL[A]);
3192             m8 = _mm_mul_ps (m2, clR[A]);
3193             m9 = _mm_mul_ps (m3, clA[A]);
3194
3195             m1 = _mm_load1_ps (&tiPL[CC]);
3196             m2 = _mm_load1_ps (&tiPR[CC]);
3197             m3 = _mm_load1_ps (&tiPA[CC]);
3198             m4 = _mm_mul_ps (m1, clL[C]);
3199             m5 = _mm_mul_ps (m2, clR[C]);
3200             m6 = _mm_mul_ps (m3, clA[C]);
3201             m7 = _mm_add_ps (m4, m7);
3202             m8 = _mm_add_ps (m5, m8);
3203             m9 = _mm_add_ps (m6, m9);
3204
3205             m1 = _mm_load1_ps (&tiPL[CG]);
3206             m2 = _mm_load1_ps (&tiPR[CG]);
3207             m3 = _mm_load1_ps (&tiPA[CG]);
3208             m4 = _mm_mul_ps (m1, clL[G]);
3209             m5 = _mm_mul_ps (m2, clR[G]);
3210             m6 = _mm_mul_ps (m3, clA[G]);
3211             m7 = _mm_add_ps (m4, m7);
3212             m8 = _mm_add_ps (m5, m8);
3213             m9 = _mm_add_ps (m6, m9);
3214
3215             m1 = _mm_load1_ps (&tiPL[CT]);
3216             m2 = _mm_load1_ps (&tiPR[CT]);
3217             m3 = _mm_load1_ps (&tiPA[CT]);
3218             m4 = _mm_mul_ps (m1, clL[T]);
3219             m5 = _mm_mul_ps (m2, clR[T]);
3220             m6 = _mm_mul_ps (m3, clA[T]);
3221             m7 = _mm_add_ps (m4, m7);
3222             m8 = _mm_add_ps (m5, m8);
3223             m9 = _mm_add_ps (m6, m9);
3224
3225             m7 = _mm_mul_ps (m7, m8);
3226             *clP++ = _mm_mul_ps (m7, m9);
3227
3228             m1 = _mm_load1_ps (&tiPL[GA]);
3229             m2 = _mm_load1_ps (&tiPR[GA]);
3230             m3 = _mm_load1_ps (&tiPA[GA]);
3231             m7 = _mm_mul_ps (m1, clL[A]);
3232             m8 = _mm_mul_ps (m2, clR[A]);
3233             m9 = _mm_mul_ps (m3, clA[A]);
3234
3235             m1 = _mm_load1_ps (&tiPL[GC]);
3236             m2 = _mm_load1_ps (&tiPR[GC]);
3237             m3 = _mm_load1_ps (&tiPA[GC]);
3238             m4 = _mm_mul_ps (m1, clL[C]);
3239             m5 = _mm_mul_ps (m2, clR[C]);
3240             m6 = _mm_mul_ps (m3, clA[C]);
3241             m7 = _mm_add_ps (m4, m7);
3242             m8 = _mm_add_ps (m5, m8);
3243             m9 = _mm_add_ps (m6, m9);
3244
3245             m1 = _mm_load1_ps (&tiPL[GG]);
3246             m2 = _mm_load1_ps (&tiPR[GG]);
3247             m3 = _mm_load1_ps (&tiPA[GG]);
3248             m4 = _mm_mul_ps (m1, clL[G]);
3249             m5 = _mm_mul_ps (m2, clR[G]);
3250             m6 = _mm_mul_ps (m3, clA[G]);
3251             m7 = _mm_add_ps (m4, m7);
3252             m8 = _mm_add_ps (m5, m8);
3253             m9 = _mm_add_ps (m6, m9);
3254
3255             m1 = _mm_load1_ps (&tiPL[GT]);
3256             m2 = _mm_load1_ps (&tiPR[GT]);
3257             m3 = _mm_load1_ps (&tiPA[GT]);
3258             m4 = _mm_mul_ps (m1, clL[T]);
3259             m5 = _mm_mul_ps (m2, clR[T]);
3260             m6 = _mm_mul_ps (m3, clA[T]);
3261             m7 = _mm_add_ps (m4, m7);
3262             m8 = _mm_add_ps (m5, m8);
3263             m9 = _mm_add_ps (m6, m9);
3264
3265             m7 = _mm_mul_ps (m7, m8);
3266             *clP++ = _mm_mul_ps (m7, m9);
3267
3268             m1 = _mm_load1_ps (&tiPL[TA]);
3269             m2 = _mm_load1_ps (&tiPR[TA]);
3270             m3 = _mm_load1_ps (&tiPA[TA]);
3271             m7 = _mm_mul_ps (m1, clL[A]);
3272             m8 = _mm_mul_ps (m2, clR[A]);
3273             m9 = _mm_mul_ps (m3, clA[A]);
3274
3275             m1 = _mm_load1_ps (&tiPL[TC]);
3276             m2 = _mm_load1_ps (&tiPR[TC]);
3277             m3 = _mm_load1_ps (&tiPA[TC]);
3278             m4 = _mm_mul_ps (m1, clL[C]);
3279             m5 = _mm_mul_ps (m2, clR[C]);
3280             m6 = _mm_mul_ps (m3, clA[C]);
3281             m7 = _mm_add_ps (m4, m7);
3282             m8 = _mm_add_ps (m5, m8);
3283             m9 = _mm_add_ps (m6, m9);
3284
3285             m1 = _mm_load1_ps (&tiPL[TG]);
3286             m2 = _mm_load1_ps (&tiPR[TG]);
3287             m3 = _mm_load1_ps (&tiPA[TG]);
3288             m4 = _mm_mul_ps (m1, clL[G]);
3289             m5 = _mm_mul_ps (m2, clR[G]);
3290             m6 = _mm_mul_ps (m3, clA[G]);
3291             m7 = _mm_add_ps (m4, m7);
3292             m8 = _mm_add_ps (m5, m8);
3293             m9 = _mm_add_ps (m6, m9);
3294
3295             m1 = _mm_load1_ps (&tiPL[TT]);
3296             m2 = _mm_load1_ps (&tiPR[TT]);
3297             m3 = _mm_load1_ps (&tiPA[TT]);
3298             m4 = _mm_mul_ps (m1, clL[T]);
3299             m5 = _mm_mul_ps (m2, clR[T]);
3300             m6 = _mm_mul_ps (m3, clA[T]);
3301             m7 = _mm_add_ps (m4, m7);
3302             m8 = _mm_add_ps (m5, m8);
3303             m9 = _mm_add_ps (m6, m9);
3304
3305             m7 = _mm_mul_ps (m7, m8);
3306             *clP++ = _mm_mul_ps (m7, m9);
3307
3308             clL += 4;
3309             clR += 4;
3310             clA += 4;
3311             }
3312         tiPL += 16;
3313         tiPR += 16;
3314         tiPA += 16;
3315         }
3316
3317     return NO_ERROR;
3318 }
3319 #endif
3320
3321
3322 #if !defined (SSE_ENABLED) || 1
3323 /*----------------------------------------------------------------
3324 |
3325 |   CondLikeRoot_NY98: codon model with omega variation
3326 |
3327 -----------------------------------------------------------------*/
3328 int CondLikeRoot_NY98 (TreeNode *p, int division, int chain)
3329 {
3330     int             a, b, c, d, h, i, j, k, shortCut, *lState=NULL, *rState=NULL, *aState=NULL,
3331                     nStates, nStatesSquared;
3332     CLFlt           likeL, likeR, likeA, *clL, *clR, *clP, *clA, *pL, *pR, *pA,
3333                     *tiPL, *tiPR, *tiPA;
3334     ModelInfo       *m;
3335
3336     /* find model settings for this division and nStates, nStatesSquared */
3337     m = &modelSettings[division];
3338     nStates = m->numModelStates;
3339     nStatesSquared = nStates * nStates;
3340
3341     /* flip state of node so that we are not overwriting old cond likes */
3342     FlipCondLikeSpace (m, chain, p->index);
3343
3344     /* find conditional likelihood pointers */
3345     clL = m->condLikes[m->condLikeIndex[chain][p->left->index ]];
3346     clR = m->condLikes[m->condLikeIndex[chain][p->right->index]];
3347     clP = m->condLikes[m->condLikeIndex[chain][p->index       ]];
3348     clA = m->condLikes[m->condLikeIndex[chain][p->anc->index  ]];
3349
3350     /* find transition probabilities (or calculate instead) */
3351     pL = m->tiProbs[m->tiProbsIndex[chain][p->left->index ]];
3352     pR = m->tiProbs[m->tiProbsIndex[chain][p->right->index]];
3353     pA = m->tiProbs[m->tiProbsIndex[chain][p->index       ]];
3354
3355     /* find likelihoods of site patterns for left branch if terminal */
3356     shortCut = 0;
3357 #   if !defined (DEBUG_NOSHORTCUTS)
3358     if (p->left->left == NULL && m->isPartAmbig[p->left->index] == NO)
3359         {
3360         shortCut |= 1;
3361         lState = m->termState[p->left->index];
3362         tiPL = pL;
3363         for (k=a=0; k<m->numOmegaCats; k++)
3364             {
3365             for (i=0; i<nStates; i++)
3366                 for (j=i; j<nStatesSquared; j+=nStates)
3367                     preLikeL[a++] = tiPL[j];
3368             /* for ambiguous */
3369             for (i=0; i<nStates; i++)
3370                 preLikeL[a++] = 1.0;
3371             tiPL += nStatesSquared;
3372             }
3373         }
3374
3375     /* find likelihoods of site patterns for right branch if terminal */
3376     if (p->right->left == NULL && m->isPartAmbig[p->right->index] == NO)
3377         {
3378         shortCut |= 2;
3379         rState = m->termState[p->right->index];
3380         tiPR = pR;
3381         for (k=a=0; k<m->numOmegaCats; k++)
3382             {
3383             for (i=0; i<nStates; i++)
3384                 for (j=i; j<nStatesSquared; j+=nStates)
3385                     preLikeR[a++] = tiPR[j];
3386             /* for ambiguous */
3387             for (i=0; i<nStates; i++)
3388                 preLikeR[a++] = 1.0;
3389             tiPR += nStatesSquared;
3390             }
3391         }
3392
3393     /* find likelihoods of site patterns for anc branch, always terminal */
3394     if (m->isPartAmbig[p->anc->index] == YES)
3395         {
3396         shortCut = 4;
3397         }
3398     else
3399         {
3400         aState = m->termState[p->anc->index];
3401         tiPA = pA;
3402         for (k=a=0; k<m->numOmegaCats; k++)
3403             {
3404             for (i=0; i<nStates; i++)
3405                 for (j=i; j<nStatesSquared; j+=nStates)
3406                     preLikeA[a++] = tiPA[j];
3407             /* for ambiguous */
3408             for (i=0; i<nStates; i++)
3409                 preLikeA[a++] = 1.0;
3410             tiPA += nStatesSquared;
3411             }
3412         }
3413 #   else
3414     shortCut = 4;
3415 #   endif
3416
3417         switch (shortCut)
3418         {
3419         case 4:
3420             tiPL = pL;
3421             tiPR = pR;
3422             tiPA = pA;
3423             for (k=0; k<m->numOmegaCats; k++)
3424                 {
3425                 for (c=0; c<m->numChars; c++)
3426                     {
3427                     for (i=h=0; i<nStates; i++)
3428                         {
3429                         likeL = likeR = likeA = 0.0;
3430                         for (j=0; j<nStates; j++)
3431                             {
3432                             likeA += tiPA[h]*clA[j];
3433                             likeL += tiPL[h]*clL[j];
3434                             likeR += tiPR[h++]*clR[j];
3435                             }
3436                         *(clP++) = likeL * likeR * likeA;
3437                         }
3438                     clL += nStates;
3439                     clR += nStates;
3440                     clA += nStates;
3441                     }
3442                 tiPL += nStatesSquared;
3443                 tiPR += nStatesSquared;
3444                 tiPA += nStatesSquared;
3445                 }
3446             break;
3447         case 0:
3448             tiPR = pR;
3449             tiPL = pL;
3450             for (k=0; k<m->numOmegaCats; k++)
3451                 {
3452                 for (c=0; c<m->numChars; c++)
3453                     {
3454                     b = aState[c] + k*(nStatesSquared+nStates);
3455                     for (i=h=0; i<nStates; i++)
3456                         {
3457                         likeR = likeL = 0.0;
3458                         for (j=0; j<nStates; j++)
3459                             {
3460                             likeR += tiPR[h]*clR[j];
3461                             likeL += tiPL[h++]*clL[j];
3462                             }
3463                         *(clP++) =  preLikeA[b++] * likeL * likeR;
3464                         }
3465                     clR += nStates;
3466                     clL += nStates;
3467                     }
3468                 tiPR += nStatesSquared;
3469                 tiPL += nStatesSquared;
3470                 }
3471             break;
3472         case 1:
3473             tiPR = pR;
3474             for (k=0; k<m->numOmegaCats; k++)
3475                 {
3476                 for (c=0; c<m->numChars; c++)
3477                     {
3478                     a = lState[c] + k*(nStatesSquared+nStates);
3479                     b = aState[c] + k*(nStatesSquared+nStates);
3480                     for (i=h=0; i<nStates; i++)
3481                         {
3482                         likeR = 0.0;
3483                         for (j=0; j<nStates; j++)
3484                             {
3485                             likeR += tiPR[h++]*clR[j];
3486                             }
3487                         *(clP++) = preLikeL[a++] * preLikeA[b++] * likeR;
3488                         }
3489                     clR += nStates;
3490                     }
3491                 tiPR += nStatesSquared;
3492                 }
3493             break;
3494         case 2:
3495             tiPL = pL;
3496             for (k=0; k<m->numOmegaCats; k++)
3497                 {
3498                 for (c=0; c<m->numChars; c++)
3499                     {
3500                     a = rState[c] + k*(nStatesSquared+nStates);
3501                     b = aState[c] + k*(nStatesSquared+nStates);
3502                     for (i=h=0; i<nStates; i++)
3503                         {
3504                         likeL = 0.0;
3505                         for (j=0; j<nStates; j++)
3506                             {
3507                             likeL += tiPL[h++]*clL[j];
3508                             }
3509                         *(clP++) = preLikeR[a++] * preLikeA[b++] * likeL;
3510                         }
3511                     clL += nStates;
3512                     }
3513                 tiPL += nStatesSquared;
3514                 }
3515             break;
3516         case 3:
3517             for (k=0; k<m->numOmegaCats; k++)
3518                 {
3519                 for (c=0; c<m->numChars; c++)
3520                     {
3521                     a = rState[c] + k*(nStatesSquared+nStates);
3522                     b = lState[c] + k*(nStatesSquared+nStates);
3523                     d = aState[c] + k*(nStatesSquared+nStates);
3524                     for (i=0; i<nStates; i++)
3525                         {
3526                         *(clP++) = preLikeR[a++] * preLikeL[b++] * preLikeA[d++];
3527                         }
3528                     }
3529                 }
3530             break;
3531         }
3532
3533     return NO_ERROR;
3534 }
3535 #endif
3536
3537
3538 #if defined (SSE_ENABLED)
3539 /*----------------------------------------------------------------
3540 |
3541 |   CondLikeRoot_NY98_SSE: codon model with omega variation
3542 |
3543 -----------------------------------------------------------------*/
3544 int CondLikeRoot_NY98_SSE (TreeNode *p, int division, int chain)
3545 {
3546     int             c, c1, t, h, i, j, k, shortCut, *lState=NULL, *rState=NULL, *aState=NULL,
3547                     nStates, nStatesSquared;
3548     CLFlt           *pL, *pR, *pA,
3549                     *tiPL, *tiPR, *tiPA;
3550     __m128          *clL, *clR, *clP, *clA;
3551     __m128          mTiPL, mTiPR, mTiPA, mL, mR, mA, mAcumL, mAcumR, mAcumA;
3552     ModelInfo       *m;
3553     CLFlt           *preLikeRV[FLOATS_PER_VEC];
3554     CLFlt           *preLikeLV[FLOATS_PER_VEC];
3555     CLFlt           *preLikeAV[FLOATS_PER_VEC];
3556
3557 #   if !defined (DEBUG_NOSHORTCUTS)
3558     int             a;
3559
3560 #   endif
3561
3562     /* find model settings for this division and nStates, nStatesSquared */
3563     m = &modelSettings[division];
3564     nStates = m->numModelStates;
3565     nStatesSquared = nStates * nStates;
3566
3567     /* flip state of node so that we are not overwriting old cond likes */
3568     FlipCondLikeSpace (m, chain, p->index);
3569
3570     /* find conditional likelihood pointers */
3571     clL = (__m128 *)m->condLikes[m->condLikeIndex[chain][p->left->index ]];
3572     clR = (__m128 *)m->condLikes[m->condLikeIndex[chain][p->right->index]];
3573     clP = (__m128 *)m->condLikes[m->condLikeIndex[chain][p->index       ]];
3574     clA = (__m128 *)m->condLikes[m->condLikeIndex[chain][p->anc->index  ]];
3575
3576     /* find transition probabilities (or calculate instead) */
3577     pL = m->tiProbs[m->tiProbsIndex[chain][p->left->index ]];
3578     pR = m->tiProbs[m->tiProbsIndex[chain][p->right->index]];
3579     pA = m->tiProbs[m->tiProbsIndex[chain][p->index       ]];
3580
3581     /* find likelihoods of site patterns for left branch if terminal */
3582     shortCut = 0;
3583 #   if !defined (DEBUG_NOSHORTCUTS)
3584     if (p->left->left == NULL && m->isPartAmbig[p->left->index] == NO)
3585         {
3586         shortCut |= 1;
3587         lState = m->termState[p->left->index];
3588         tiPL = pL;
3589         for (k=a=0; k<m->numOmegaCats; k++)
3590             {
3591             for (i=0; i<nStates; i++)
3592                 for (j=i; j<nStatesSquared; j+=nStates)
3593                     preLikeL[a++] = tiPL[j];
3594             /* for ambiguous */
3595             for (i=0; i<nStates; i++)
3596                 preLikeL[a++] = 1.0;
3597             tiPL += nStatesSquared;
3598             }
3599         }
3600
3601     /* find likelihoods of site patterns for right branch if terminal */
3602     if (p->right->left == NULL && m->isPartAmbig[p->right->index] == NO)
3603         {
3604         shortCut |= 2;
3605         rState = m->termState[p->right->index];
3606         tiPR = pR;
3607         for (k=a=0; k<m->numOmegaCats; k++)
3608             {
3609             for (i=0; i<nStates; i++)
3610                 for (j=i; j<nStatesSquared; j+=nStates)
3611                     preLikeR[a++] = tiPR[j];
3612             /* for ambiguous */
3613             for (i=0; i<nStates; i++)
3614                 preLikeR[a++] = 1.0;
3615             tiPR += nStatesSquared;
3616             }
3617         }
3618
3619     /* find likelihoods of site patterns for anc branch, always terminal */
3620     if (m->isPartAmbig[p->anc->index] == YES)
3621         {
3622         shortCut = 4;
3623         }
3624     else
3625         {
3626         aState = m->termState[p->anc->index];
3627         tiPA = pA;
3628         for (k=a=0; k<m->numOmegaCats; k++)
3629             {
3630             for (i=0; i<nStates; i++)
3631                 for (j=i; j<nStatesSquared; j+=nStates)
3632                     preLikeA[a++] = tiPA[j];
3633             /* for ambiguous */
3634             for (i=0; i<nStates; i++)
3635                 preLikeA[a++] = 1.0;
3636             tiPA += nStatesSquared;
3637             }
3638         }
3639 #   else
3640     shortCut = 4;
3641 #   endif
3642         switch (shortCut)
3643         {
3644         case 4:
3645             tiPL = pL;
3646             tiPR = pR;
3647             tiPA = pA;
3648             for (k=0; k<m->numOmegaCats; k++)
3649                 {
3650                 for (c=0; c<m->numSSEChars; c++)
3651                     {
3652                     for (i=h=0; i<nStates; i++)
3653                         {
3654                         mAcumL = _mm_setzero_ps();
3655                         mAcumR = _mm_setzero_ps();
3656                         mAcumA = _mm_setzero_ps();
3657                         for (j=0; j<nStates; j++)
3658                             {
3659                             mTiPL  = _mm_load1_ps (&tiPL[h]);
3660                             mTiPR  = _mm_load1_ps (&tiPR[h]);
3661                             mTiPA  = _mm_load1_ps (&tiPA[h++]);
3662                             mL     = _mm_mul_ps (mTiPL, clL[j]);
3663                             mR     = _mm_mul_ps (mTiPR, clR[j]);
3664                             mA     = _mm_mul_ps (mTiPA, clA[j]);
3665                             mAcumL = _mm_add_ps (mL, mAcumL);
3666                             mAcumR = _mm_add_ps (mR, mAcumR);
3667                             mAcumA = _mm_add_ps (mA, mAcumA);
3668                             }
3669                         mAcumL = _mm_mul_ps (mAcumL, mAcumR);
3670                         *(clP++) = _mm_mul_ps (mAcumL, mAcumA);
3671                         }
3672                     clL += nStates;
3673                     clR += nStates;
3674                     clA += nStates;
3675                     }
3676                 tiPL += nStatesSquared;
3677                 tiPR += nStatesSquared;
3678                 tiPA += nStatesSquared;
3679                 }
3680             break;
3681         case 0:
3682             tiPL =pL;
3683             tiPR =pR;
3684             for (k=0; k<m->numOmegaCats; k++)
3685                 {
3686                 for (c=t=0; c<m->numSSEChars; c++)
3687                     {
3688                     for (c1=0; c1<FLOATS_PER_VEC; c1++,t++)
3689                         {
3690                         preLikeAV[c1] = &preLikeA[aState[t] + k*(nStatesSquared+nStates)];
3691                         }
3692                     for (i=h=0; i<nStates; i++)
3693                         {
3694                         assert (FLOATS_PER_VEC == 4); /* In the following statment we assume that SSE register can hold exactly 4 ClFlts. */
3695                         mAcumA = _mm_set_ps (*(preLikeAV[3]++), *(preLikeAV[2]++), *(preLikeAV[1]++), *(preLikeAV[0]++));
3696                         mAcumL = _mm_setzero_ps();
3697                         mAcumR = _mm_setzero_ps();
3698                         for (j=0; j<nStates; j++)
3699                             {
3700                             mTiPL  = _mm_load1_ps (&tiPL[h]);
3701                             mL     = _mm_mul_ps (mTiPL, clL[j]);
3702                             mAcumL = _mm_add_ps (mL, mAcumL);
3703                             mTiPR  = _mm_load1_ps (&tiPR[h++]);
3704                             mR     = _mm_mul_ps (mTiPR, clR[j]);
3705                             mAcumR = _mm_add_ps (mR, mAcumR);
3706                             }
3707                         mAcumL = _mm_mul_ps (mAcumL, mAcumR);
3708                         *(clP++) = _mm_mul_ps (mAcumL, mAcumA);
3709                         }
3710                     clR += nStates;
3711                     clL += nStates;
3712                     }
3713                 tiPL += nStatesSquared;
3714                 tiPR += nStatesSquared;
3715                 }
3716             break;
3717         case 1:
3718             tiPR = pR;
3719             for (k=0; k<m->numOmegaCats; k++)
3720                 {
3721                 for (c=t=0; c<m->numSSEChars; c++)
3722                     {
3723                     for (c1=0; c1<FLOATS_PER_VEC; c1++,t++)
3724                         {
3725                         preLikeLV[c1] = &preLikeL[lState[t] + k*(nStatesSquared+nStates)];
3726                         preLikeAV[c1] = &preLikeA[aState[t] + k*(nStatesSquared+nStates)];
3727                         }
3728                     for (i=h=0; i<nStates; i++)
3729                         {
3730                         assert (FLOATS_PER_VEC == 4); /* In the following statment we assume that SSE register can hold exactly 4 ClFlts. */
3731                         mAcumL = _mm_set_ps (*(preLikeLV[3]++), *(preLikeLV[2]++), *(preLikeLV[1]++), *(preLikeLV[0]++));
3732                         mAcumA = _mm_set_ps (*(preLikeAV[3]++), *(preLikeAV[2]++), *(preLikeAV[1]++), *(preLikeAV[0]++));
3733                         mAcumR = _mm_setzero_ps();
3734                         for (j=0; j<nStates; j++)
3735                             {
3736                             mTiPR  = _mm_load1_ps (&tiPR[h++]);
3737                             mR     = _mm_mul_ps (mTiPR, clR[j]);
3738                             mAcumR = _mm_add_ps (mR, mAcumR);
3739                             }
3740                         mAcumL = _mm_mul_ps (mAcumL, mAcumR);
3741                         *(clP++) = _mm_mul_ps (mAcumL, mAcumA);
3742                         }
3743                     clR += nStates;
3744                     }
3745                 tiPR += nStatesSquared;
3746                 }
3747             break;
3748         case 2:
3749             tiPL = pL;
3750             for (k=0; k<m->numOmegaCats; k++)
3751                 {
3752                 for (c=t=0; c<m->numSSEChars; c++)
3753                     {
3754                     for (c1=0; c1<FLOATS_PER_VEC; c1++,t++)
3755                         {
3756                         preLikeRV[c1] = &preLikeR[rState[t] + k*(nStatesSquared+nStates)];
3757                         preLikeAV[c1] = &preLikeA[aState[t] + k*(nStatesSquared+nStates)];
3758                         }
3759                     for (i=h=0; i<nStates; i++)
3760                         {
3761                         assert (FLOATS_PER_VEC == 4); /* In the following statment we assume that SSE register can hold exactly 4 ClFlts. */
3762                         mAcumR = _mm_set_ps (*(preLikeRV[3]++), *(preLikeRV[2]++), *(preLikeRV[1]++), *(preLikeRV[0]++));
3763                         mAcumA = _mm_set_ps (*(preLikeAV[3]++), *(preLikeAV[2]++), *(preLikeAV[1]++), *(preLikeAV[0]++));
3764                         mAcumL = _mm_setzero_ps();
3765                         for (j=0; j<nStates; j++)
3766                             {
3767                             mTiPL  = _mm_load1_ps (&tiPL[h++]);
3768                             mL     = _mm_mul_ps (mTiPL, clL[j]);
3769                             mAcumL = _mm_add_ps (mL, mAcumL);
3770                             }
3771                         mAcumL = _mm_mul_ps (mAcumL, mAcumR);
3772                         *(clP++) = _mm_mul_ps (mAcumL,mAcumA);
3773                         }
3774                     clL += nStates;
3775                     }
3776                 tiPL += nStatesSquared;
3777                 }
3778             break;
3779         case 3:
3780             for (k=0; k<m->numOmegaCats; k++)
3781                 {
3782                 for (c=t=0; c<m->numSSEChars; c++)
3783                     {
3784                     for (c1=0; c1<FLOATS_PER_VEC; c1++,t++)
3785                         {
3786                         preLikeRV[c1] = &preLikeR[rState[t] + k*(nStatesSquared+nStates)];
3787                         preLikeLV[c1] = &preLikeL[lState[t] + k*(nStatesSquared+nStates)];
3788                         preLikeAV[c1] = &preLikeA[aState[t] + k*(nStatesSquared+nStates)];
3789                         }
3790                     for (i=0; i<nStates; i++)
3791                         {
3792                         assert (FLOATS_PER_VEC == 4); /* In the following 2 statments we assume that SSE register can hold exactly 4 ClFlts. */
3793                         mL = _mm_set_ps (*(preLikeLV[3]++), *(preLikeLV[2]++), *(preLikeLV[1]++), *(preLikeLV[0]++));
3794                         mR = _mm_set_ps (*(preLikeRV[3]++), *(preLikeRV[2]++), *(preLikeRV[1]++), *(preLikeRV[0]++));
3795                         mA = _mm_set_ps (*(preLikeAV[3]++), *(preLikeAV[2]++), *(preLikeAV[1]++), *(preLikeAV[0]++));
3796                         mL = _mm_mul_ps (mL,mR);
3797                         *(clP++) = _mm_mul_ps (mL,mA);
3798                         }
3799                     }
3800                 }
3801             break;
3802         }
3803
3804     return NO_ERROR;
3805 }
3806 #endif
3807
3808
3809 /*----------------------------------------------------------------
3810 |
3811 |   CondLikeRoot_Std: variable number of states model
3812 |       with or without rate variation
3813 |
3814 -----------------------------------------------------------------*/
3815 int CondLikeRoot_Std (TreeNode *p, int division, int chain)
3816 {
3817     int             a, c, h, i, j, k, nStates=0, nCats=0, tmp;
3818     CLFlt           *clL, *clR, *clP, *clA, *pL, *pR, *pA, *tiPL, *tiPR, *tiPA,
3819                     likeL, likeR, likeA;
3820     ModelInfo       *m;
3821
3822     m = &modelSettings[division];
3823
3824     /* flip state of node so that we are not overwriting old cond likes */
3825     FlipCondLikeSpace (m, chain, p->index);
3826
3827     /* find conditional likelihood pointers */
3828     clL = m->condLikes[m->condLikeIndex[chain][p->left->index ]];
3829     clR = m->condLikes[m->condLikeIndex[chain][p->right->index]];
3830     clP = m->condLikes[m->condLikeIndex[chain][p->index       ]];
3831     clA = m->condLikes[m->condLikeIndex[chain][p->anc->index  ]];
3832
3833     /* find transition probabilities (or calculate instead) */
3834     pL = m->tiProbs[m->tiProbsIndex[chain][p->left->index ]];
3835     pR = m->tiProbs[m->tiProbsIndex[chain][p->right->index]];
3836     pA = m->tiProbs[m->tiProbsIndex[chain][p->index       ]];
3837
3838     /* calculate ancestral probabilities */
3839     for (k=h=0; k<m->numGammaCats; k++)
3840         {
3841         /* calculate ancestral probabilities */
3842         for (c=0; c<m->numChars; c++)
3843             {
3844             nStates = m->nStates[c];
3845
3846             /* the following lines ensure that nCats is 1 unless */
3847             /* the character is binary and beta categories are used  */
3848             if (nStates == 2)
3849                 nCats = m->numBetaCats;
3850             else
3851                 nCats = 1;
3852
3853             tmp = k*nStates*nStates; /* tmp contains offset to skip gamma cats that already processed*/
3854             tiPL = pL + m->tiIndex[c] + tmp;
3855             tiPR = pR + m->tiIndex[c] + tmp;
3856             tiPA = pA + m->tiIndex[c] + tmp;
3857             tmp = (m->numGammaCats-1)*2*2; /* tmp contains size of block of tpi matrices across all gamma cats (minus one) for single beta category. Further used only if character is binary to jump to next beta category */
3858
3859             for (j=0; j<nCats;j++)
3860                 {
3861                 for (a=0; a<nStates; a++)
3862                     {
3863                     likeL = likeR = likeA = 0.0;
3864                     for (i=0; i<nStates; i++)
3865                         {
3866                         likeL += *(tiPL++) * clL[i];
3867                         likeR += *(tiPR++) * clR[i];
3868                         likeA += *(tiPA++) * clA[i];
3869                         }
3870                     clP[h++] = likeL * likeR * likeA;
3871                     }
3872                 clL += nStates;
3873                 clR += nStates;
3874                 clA += nStates;
3875
3876                 tiPL += tmp;
3877                 tiPR += tmp;
3878                 tiPA += tmp;
3879                 }
3880             }
3881         }
3882
3883     return NO_ERROR;
3884 }
3885
3886
3887 /*----------------------------------------------------------------
3888 |
3889 |   CondLikeUp_Bin: pull likelihoods up and calculate scaled
3890 |       finals, binary model with or without rate variation
3891 |
3892 -----------------------------------------------------------------*/
3893 int CondLikeUp_Bin (TreeNode *p, int division, int chain)
3894 {
3895     int             c, k;
3896     CLFlt           *clFA, *clFP, *clDP, *tiP, condLikeUp[2], sum[2];
3897     ModelInfo       *m;
3898
3899     /* find model settings for this division */
3900     m = &modelSettings[division];
3901
3902     if (p->anc->anc == NULL)
3903         {
3904         /* this is the root node */
3905         /* find conditional likelihood pointers = down cond likes */
3906         /* use conditional likelihood scratch space for final cond likes */
3907         clDP = m->condLikes[m->condLikeIndex[chain][p->index]];
3908         clFP = m->condLikes[m->condLikeScratchIndex[p->index]];
3909
3910         for (k=0; k<m->numGammaCats; k++)
3911             {
3912             for (c=0; c<m->numChars; c++)
3913                 {
3914                 *(clFP++) = *(clDP++);
3915                 *(clFP++) = *(clDP++);
3916                 }
3917             }
3918         }
3919     else
3920         {
3921         /* find conditional likelihood pointers */
3922         /* use conditional likelihood scratch space for final cond likes */
3923         clFA = m->condLikes[m->condLikeScratchIndex[p->anc->index]];
3924         clFP = m->condLikes[m->condLikeScratchIndex[p->index     ]];
3925         clDP = m->condLikes[m->condLikeIndex[chain][p->index     ]];
3926
3927         /* find transition probabilities */
3928         tiP = m->tiProbs[m->tiProbsIndex[chain][p->index]];
3929
3930         for (k=0; k<m->numGammaCats; k++)
3931             {
3932             for (c=0; c<m->numChars; c++)
3933                 {
3934                 condLikeUp[0] = condLikeUp[1] = 0.0;
3935
3936                 sum[0] = tiP[0]*clDP[0] + tiP[1]*clDP[1];
3937                 sum[1] = tiP[2]*clDP[0] + tiP[3]*clDP[1];
3938
3939                 if (sum[0] != 0.0) condLikeUp[0] = clFA[0] / sum[0];
3940                 if (sum[1] != 0.0) condLikeUp[1] = clFA[1] / sum[1];
3941
3942                 *(clFP++) = (condLikeUp[0]*tiP[0] + condLikeUp[1]*tiP[1])*clDP[0];
3943                 *(clFP++) = (condLikeUp[0]*tiP[2] + condLikeUp[1]*tiP[3])*clDP[1];
3944
3945                 clFA += 2;
3946                 clDP += 2;
3947                 }
3948             tiP += 4;
3949             }
3950         }
3951
3952     return NO_ERROR;
3953 }
3954
3955
3956 /*----------------------------------------------------------------
3957 |
3958 |   CondLikeUp_Gen: pull likelihoods up and calculate scaled
3959 |       finals for an interior node
3960 |
3961 -----------------------------------------------------------------*/
3962 int CondLikeUp_Gen (TreeNode *p, int division, int chain)
3963 {
3964     int             a, c, i, j, k, nStates, nStatesSquared, nGammaCats;
3965     CLFlt           *clFA, *clFP, *clDP, *tiP, *condLikeUp, sum;
3966     ModelInfo       *m;
3967
3968     /* find model settings for this division */
3969     m = &modelSettings[division];
3970
3971     /* find number of states in the model */
3972     nStates = m->numModelStates;
3973     nStatesSquared = nStates * nStates;
3974
3975     /* find number of gamma cats */
3976     nGammaCats = m->numGammaCats;
3977     if (m->gibbsGamma == YES)
3978         nGammaCats = 1;
3979
3980     /* use preallocated scratch space */
3981     condLikeUp = m->ancStateCondLikes;
3982
3983     /* calculate final states */
3984     if (p->anc->anc == NULL)
3985         {
3986         /* this is the root node */
3987         /* find conditional likelihood pointers = down cond likes */
3988         /* use conditional likelihood scratch space for final cond likes */
3989         clDP = m->condLikes[m->condLikeIndex[chain][p->index]];
3990         clFP = m->condLikes[m->condLikeScratchIndex[p->index]];
3991
3992         /* final cond likes = downpass cond likes */
3993         for (k=0; k<nGammaCats; k++)
3994             {
3995             /* copy cond likes */
3996             for (c=0; c<m->numChars*nStates; c++)
3997                 *(clFP++) = *(clDP++);
3998             }
3999         }
4000     else
4001         {
4002         /* find conditional likelihood pointers */
4003         /* use conditional likelihood scratch space for final cond likes */
4004         clFA = m->condLikes[m->condLikeScratchIndex[p->anc->index]];
4005         clFP = m->condLikes[m->condLikeScratchIndex[p->index     ]];
4006         clDP = m->condLikes[m->condLikeIndex[chain][p->index     ]];
4007
4008         /* find transition probabilities */
4009         tiP = m->tiProbs[m->tiProbsIndex[chain][p->index]];
4010
4011         for (k=0; k<nGammaCats; k++)
4012             {
4013             for (c=0; c<m->numChars; c++)
4014                 {
4015                 for (a=j=0; a<nStates; a++)
4016                     {
4017                     sum = 0.0;
4018                     for (i=0; i<nStates; i++)
4019                         sum += tiP[j++]*clDP[i];
4020                     if (sum != 0.0) condLikeUp[a] = clFA[a] / sum;
4021                     }
4022
4023                 for (a=j=0; a<nStates; a++)
4024                     {
4025                     sum = 0.0;
4026                     for (i=0; i<nStates; i++)
4027                         {
4028                         sum += condLikeUp[i] * tiP[j++];
4029                         }
4030                     *(clFP++) = sum * clDP[a];
4031                     }
4032
4033                 clFA += nStates;
4034                 clDP += nStates;
4035                 }
4036             tiP += nStatesSquared;
4037             }
4038         }
4039
4040     return NO_ERROR;
4041 }
4042
4043
4044 /*----------------------------------------------------------------
4045 |
4046 |   CondLikeUp_NUC4: pull likelihoods up and calculate scaled
4047 |       finals for an interior node
4048 |
4049 -----------------------------------------------------------------*/
4050 int     CondLikeUp_NUC4 (TreeNode *p, int division, int chain)
4051 {
4052     int             c, k, nGammaCats;
4053     CLFlt           *clFA, *clFP, *clDP, *tiP, condLikeUp[4], sum[4];
4054     ModelInfo       *m;
4055
4056     /* find model settings for this division */
4057     m = &modelSettings[division];
4058
4059     /* find number of gamma cats */
4060     nGammaCats = m->numGammaCats;
4061     if (m->gibbsGamma == YES)
4062         nGammaCats = 1;
4063
4064     /* calculate final states */
4065     if (p->anc->anc == NULL)
4066         {
4067         /* this is the root node */
4068         /* find conditional likelihood pointers = down cond likes */
4069         /* use conditional likelihood scratch space for final cond likes */
4070         clDP = m->condLikes[m->condLikeIndex[chain][p->index]];
4071         clFP = m->condLikes[m->condLikeScratchIndex[p->index]];
4072
4073         /* final cond likes = downpass cond likes */
4074         for (k=0; k<nGammaCats; k++)
4075             {
4076             /* copy cond likes */
4077             for (c=0; c<m->numChars; c++)
4078                 {
4079                 *(clFP++) = *(clDP++);
4080                 *(clFP++) = *(clDP++);
4081                 *(clFP++) = *(clDP++);
4082                 *(clFP++) = *(clDP++);
4083                 }
4084             }
4085         }
4086     else
4087         {
4088         /* find conditional likelihood pointers */
4089         /* use conditional likelihood scratch space for final cond likes */
4090         clFA = m->condLikes[m->condLikeScratchIndex[p->anc->index]];
4091         clFP = m->condLikes[m->condLikeScratchIndex[p->index     ]];
4092         clDP = m->condLikes[m->condLikeIndex[chain][p->index     ]];
4093
4094         /* find transition probabilities */
4095         tiP = m->tiProbs[m->tiProbsIndex[chain][p->index]];
4096
4097         for (k=0; k<nGammaCats; k++)
4098             {
4099             for (c=0; c<m->numChars; c++)
4100                 {
4101                 condLikeUp[A] = condLikeUp[C] = condLikeUp[G] = condLikeUp[T] = 0.0;
4102
4103                 sum[A] = (tiP[AA]*clDP[A] + tiP[AC]*clDP[C] + tiP[AG]*clDP[G] + tiP[AT]*clDP[T]);
4104                 sum[C] = (tiP[CA]*clDP[A] + tiP[CC]*clDP[C] + tiP[CG]*clDP[G] + tiP[CT]*clDP[T]);
4105                 sum[G] = (tiP[GA]*clDP[A] + tiP[GC]*clDP[C] + tiP[GG]*clDP[G] + tiP[GT]*clDP[T]);
4106                 sum[T] = (tiP[TA]*clDP[A] + tiP[TC]*clDP[C] + tiP[TG]*clDP[G] + tiP[TT]*clDP[T]);
4107
4108                 if (sum[A] != 0.0) condLikeUp[A] = clFA[A] / sum[A];
4109                 if (sum[C] != 0.0) condLikeUp[C] = clFA[C] / sum[C];
4110                 if (sum[G] != 0.0) condLikeUp[G] = clFA[G] / sum[G];
4111                 if (sum[T] != 0.0) condLikeUp[T] = clFA[T] / sum[T];
4112
4113 /*
4114                 clFP[A] = (condLikeUp[A]*tiP[AA] + condLikeUp[C]*tiP[CA] + condLikeUp[G]*tiP[GA] + condLikeUp[T]*tiP[TA])*clDP[A];
4115                 clFP[C] = (condLikeUp[A]*tiP[AC] + condLikeUp[C]*tiP[CC] + condLikeUp[G]*tiP[GC] + condLikeUp[T]*tiP[TC])*clDP[C];
4116                 clFP[G] = (condLikeUp[A]*tiP[AG] + condLikeUp[C]*tiP[CG] + condLikeUp[G]*tiP[GG] + condLikeUp[T]*tiP[TG])*clDP[G];
4117                 clFP[T] = (condLikeUp[A]*tiP[AT] + condLikeUp[C]*tiP[CT] + condLikeUp[G]*tiP[GT] + condLikeUp[T]*tiP[TT])*clDP[T];
4118 */
4119
4120                 clFP[A] = (condLikeUp[A]*tiP[AA] + condLikeUp[C]*tiP[AC] + condLikeUp[G]*tiP[AG] + condLikeUp[T]*tiP[AT])*clDP[A];
4121                 clFP[C] = (condLikeUp[A]*tiP[CA] + condLikeUp[C]*tiP[CC] + condLikeUp[G]*tiP[CG] + condLikeUp[T]*tiP[CT])*clDP[C];
4122                 clFP[G] = (condLikeUp[A]*tiP[GA] + condLikeUp[C]*tiP[GC] + condLikeUp[G]*tiP[GG] + condLikeUp[T]*tiP[GT])*clDP[G];
4123                 clFP[T] = (condLikeUp[A]*tiP[TA] + condLikeUp[C]*tiP[TC] + condLikeUp[G]*tiP[TG] + condLikeUp[T]*tiP[TT])*clDP[T];
4124
4125                 clFA += 4;
4126                 clFP += 4;
4127                 clDP += 4;
4128                 }
4129             tiP += 16;
4130             }
4131         }
4132
4133     return NO_ERROR;
4134 }
4135
4136
4137 /*----------------------------------------------------------------
4138 |
4139 |   CondLikeUp_Std: pull likelihoods up and calculate scaled
4140 |       finals for an interior node
4141 |
4142 -----------------------------------------------------------------*/
4143 int     CondLikeUp_Std (TreeNode *p, int division, int chain)
4144 {
4145     int             a, c, i, j, k, t, nStates, nCats, coppySize,tmp;
4146     CLFlt           *clFA, *clFP, *clDP, *pA, *tiP, condLikeUp[10], sum;
4147     ModelInfo       *m;
4148
4149     /* find model settings for this division */
4150     m = &modelSettings[division];
4151
4152     /* calculate final states */
4153     if (p->anc->anc == NULL)
4154         {
4155         /* this is the root node */
4156         /* find conditional likelihood pointers = down cond likes */
4157         /* use conditional likelihood scratch space for final cond likes */
4158         clDP = m->condLikes[m->condLikeIndex[chain][p->index]];
4159         clFP = m->condLikes[m->condLikeScratchIndex[p->index]];
4160
4161         coppySize=0;
4162         /* final cond likes = downpass cond likes */
4163         for (c=0; c<m->numChars; c++)
4164             {
4165             /* calculate nStates and nCats */
4166             nStates = m->nStates[c];
4167
4168             /* the following lines ensure that nCats is 1 unless */
4169             /* the character is binary and beta categories are used  */
4170             if (nStates == 2)
4171                 nCats = m->numBetaCats;
4172             else
4173                 nCats = 1;
4174
4175             coppySize+=nCats*nStates;
4176             }
4177
4178         /* finally multiply with the gamma cats */
4179         coppySize *= m->numGammaCats;
4180
4181         /* copy cond likes */
4182         for (k=0; k<coppySize; k++)
4183             *(clFP++) = *(clDP++);
4184         }
4185     else
4186         {
4187         /* find conditional likelihood pointers */
4188         /* use conditional likelihood scratch space for final cond likes */
4189         clFA = m->condLikes[m->condLikeScratchIndex[p->anc->index]];
4190         clFP = m->condLikes[m->condLikeScratchIndex[p->index     ]];
4191         clDP = m->condLikes[m->condLikeIndex[chain][p->index     ]];
4192
4193         /* find transition probabilities */
4194         pA = m->tiProbs[m->tiProbsIndex[chain][p->index]];
4195
4196         for (k=0; k<m->numGammaCats; k++)
4197             {
4198             for (c=0; c<m->numChars; c++)
4199                 {
4200
4201                 /* calculate nStates and nCats */
4202                 nStates = m->nStates[c];
4203
4204                 /* the following lines ensure that nCats is 1 unless */
4205                 /* the character is binary and beta categories are used  */
4206                 if (nStates == 2)
4207                     nCats = m->numBetaCats;
4208                 else
4209                     nCats = 1;
4210
4211                 tmp = k*nStates*nStates; /* tmp contains offset to skip gamma cats that already processed*/
4212                 tiP = pA + m->tiIndex[c] + tmp;
4213                 tmp = (m->numGammaCats-1)*2*2; /* tmp contains size of block of tpi matrices across all gamma cats (minus one) for single beta category. Further used only if character is binary to jump to next beta category */
4214
4215                 /* finally multiply with the gamma cats */
4216                 //nCats *= m->numGammaCats;
4217
4218                 /* now calculate the final cond likes */
4219                 for (t=0; t<nCats; t++)
4220                     {
4221                     for (a=j=0; a<nStates; a++)
4222                         {
4223                         sum = 0.0;
4224                         for (i=0; i<nStates; i++)
4225                             sum += tiP[j++]*clDP[i];
4226                         if (sum == 0.0)
4227                             condLikeUp[a] = 0.0;    /* we lost the conditional likelihood in the downpass (can occur in gamma model) */
4228                         else
4229                             condLikeUp[a] = clFA[a] / sum;
4230                         }
4231
4232                     for (a=j=0; a<nStates; a++)
4233                         {
4234                         sum = 0.0;
4235                         for (i=0; i<nStates; i++)
4236                             {
4237                             sum += condLikeUp[i] * tiP[j++];
4238                             }
4239                         clFP[a] = sum * clDP[a];
4240                         }
4241
4242                     clFP += nStates;
4243                     clFA += nStates;
4244                     clDP += nStates;
4245                     tiP += tmp;
4246                     }
4247                 }
4248             }
4249         }
4250
4251     return NO_ERROR;
4252 }
4253
4254
4255 /*----------------------------------------------------------------
4256 |
4257 |   CondLikeScaler_Gen: general n-state model with or without rate
4258 |       variation
4259 |
4260 -----------------------------------------------------------------*/
4261 int CondLikeScaler_Gen (TreeNode *p, int division, int chain)
4262 {
4263     int             c, k, n, nStates;
4264     CLFlt           scaler, **clP, *clPtr, *scP, *lnScaler;
4265     ModelInfo       *m;
4266 #   if defined (FAST_LOG)
4267     int             index;
4268 #   endif
4269
4270     assert (p->scalerNode == YES);
4271
4272     m = &modelSettings[division];
4273     nStates = m->numModelStates;
4274
4275     /* find conditional likelihood pointers */
4276     clPtr = m->condLikes[m->condLikeIndex[chain][p->index]];
4277     clP   = m->clP;
4278     for (k=0; k<m->numGammaCats; k++)
4279         {
4280         clP[k] = clPtr;
4281         clPtr += m->numChars * m->numModelStates;
4282         }
4283
4284     /* find node scalers */
4285     scP = m->scalers[m->nodeScalerIndex[chain][p->index]];
4286
4287     /* find site scalers */
4288     lnScaler = m->scalers[m->siteScalerIndex[chain]];
4289
4290     /* rescale */
4291     for (c=0; c<m->numChars; c++)
4292         {
4293         scaler = 0.0;
4294         for (k=0; k<m->numGammaCats; k++)
4295             {
4296             for (n=0; n<nStates; n++)
4297                 {
4298                 if (clP[k][n] > scaler)
4299                     scaler = clP[k][n];
4300                 }
4301             }
4302
4303 #   if defined (FAST_LOG)
4304         frexp (scaler, &index);
4305         index = 1-index;
4306         scaler = scalerValue[index];
4307 #   endif
4308         for (k=0; k<m->numGammaCats; k++)
4309             {
4310             for (n=0; n<nStates; n++)
4311                 clP[k][n] /= scaler;
4312             clP[k] += n;
4313             }
4314
4315 #   if defined (FAST_LOG)
4316         scP[c]       = logValue[index];         /* store node scaler */
4317         lnScaler[c] += scP[c];              /* add into tree scaler  */
4318 #   else
4319         scP[c]       = (CLFlt) log (scaler);    /* store node scaler */
4320         lnScaler[c] += scP[c];  /* add into tree scaler  */
4321 #   endif
4322         }
4323
4324     m->scalersSet[chain][p->index] = YES;
4325
4326     return (NO_ERROR);
4327 }
4328
4329
4330 #if defined (SSE_ENABLED)
4331 /*----------------------------------------------------------------
4332 |
4333 |   CondLikeScaler_Gen_SSE: general n-state model with or without rate
4334 |       variation
4335 |
4336 -----------------------------------------------------------------*/
4337 int CondLikeScaler_Gen_SSE (TreeNode *p, int division, int chain)
4338 {
4339     int             c, k, n, nStates;
4340     CLFlt           *scP, *lnScaler;
4341     __m128          *clPtr, **clP, m1;
4342     ModelInfo       *m;
4343 #   if defined (FAST_LOG)
4344     int             index;
4345 #   endif
4346
4347     m = &modelSettings[division];
4348     nStates = m->numModelStates;
4349
4350     /* find conditional likelihood pointers */
4351     clPtr = (__m128 *) m->condLikes[m->condLikeIndex[chain][p->index]];
4352     clP   = m->clP_SSE;
4353     for (k=0; k<m->numGammaCats; k++)
4354         {
4355         clP[k] = clPtr;
4356         clPtr += m->numSSEChars * m->numModelStates;
4357         }
4358
4359     /* find node scalers */
4360     scP = m->scalers[m->nodeScalerIndex[chain][p->index]];
4361     //scP_SSE = (__m128 *) scP;
4362
4363     /* find site scalers */
4364     lnScaler = m->scalers[m->siteScalerIndex[chain]];
4365
4366     /* rescale */
4367     for (c=0; c<m->numSSEChars; c++)
4368         {
4369         //scaler = 0.0;
4370         m1 = _mm_setzero_ps ();
4371         for (k=0; k<m->numGammaCats; k++)
4372             {
4373             for (n=0; n<nStates; n++)
4374                 {
4375                 m1 = _mm_max_ps (m1, clP[k][n]);
4376                 }
4377             }
4378         _mm_store_ps (scP,  m1);
4379         scP += FLOATS_PER_VEC;
4380
4381 #   if defined (FAST_LOG)
4382         frexp (scaler, &index);
4383         index = 1-index;
4384         scaler = scalerValue[index];
4385 #   endif
4386         for (k=0; k<m->numGammaCats; k++)
4387             {
4388             for (n=0; n<nStates; n++)
4389                 {
4390                 *clP[k] = _mm_div_ps (*clP[k], m1);
4391                 clP[k]++;
4392                 }
4393             }
4394         }
4395
4396     /* Reset scP to original position*/
4397     scP = m->scalers[m->nodeScalerIndex[chain][p->index]];
4398     for (c=0; c<m->numChars; c++)
4399         {
4400 #   if defined (FAST_LOG)
4401         scP[c]       = logValue[index];         /* store node scaler */
4402         lnScaler[c] += scP[c];                  /* add into tree scaler  */
4403 #   else
4404         scP[c]       = (CLFlt) log (scP[c]);    /* store node scaler */
4405         lnScaler[c] += scP[c];                  /* add into tree scaler  */
4406 #   endif
4407         }
4408
4409     m->scalersSet[chain][p->index] = YES;
4410
4411     return (NO_ERROR);
4412 }
4413 #endif
4414
4415
4416 /*----------------------------------------------------------------
4417 |
4418 |   CondLikeScaler_Gen_GibbsGamma: general n-state model with Gibbs
4419 |       sampling of rate categories in discrete gamma
4420 |
4421 -----------------------------------------------------------------*/
4422 int CondLikeScaler_Gen_GibbsGamma (TreeNode *p, int division, int chain)
4423 {
4424     int             c, i, j, n, nStates, *rateCat, nGammaCats;
4425     CLFlt           scaler, *clP, *scP, *lnScaler;
4426     ModelInfo       *m;
4427 #   if defined (FAST_LOG)
4428     int             index;
4429 #   endif
4430
4431     assert (p->scalerNode ==  YES);
4432
4433     m = &modelSettings[division];
4434     nStates = m->numModelStates;
4435
4436     /* find conditional likelihood pointer */
4437     clP = m->condLikes[m->condLikeIndex[chain][p->index]];
4438
4439     /* flip node scalers */
4440     scP = m->scalers[m->nodeScalerIndex[chain][p->index]];
4441
4442     /* find site scalers */
4443     lnScaler = m->scalers[m->siteScalerIndex[chain]];
4444
4445     /* find rate category index and number of gamma categories */
4446     rateCat = m->tiIndex + chain * m->numChars;
4447     nGammaCats = m->numGammaCats;
4448
4449     /* scale */
4450     i = j = 0;
4451     for (c=0; c<m->numChars; c++)
4452         {
4453         if (rateCat[c] < nGammaCats)
4454             {
4455             scaler = 0.0;
4456             for (n=0; n<nStates; n++)
4457                 {
4458                 if (clP[i] > scaler)
4459                     scaler = clP[i];
4460                 i++;
4461                 }
4462
4463 #   if defined (FAST_LOG)
4464             frexp (scaler, &index);
4465             index = 1-index;
4466             scaler = scalerValue[index];
4467 #   endif
4468
4469             for (n=0; n<nStates; n++)
4470                 clP[j++] /= scaler;
4471
4472 #   if defined (FAST_LOG)
4473             scP[c]       = logValue[index];         /* store node scaler */
4474             lnScaler[c] += scP[c];                  /* add into tree scaler  */
4475 #   else
4476             scP[c]       = (CLFlt) log (scaler);    /* store node scaler */
4477             lnScaler[c] += scP[c];                  /* add into tree scaler  */
4478 #   endif
4479
4480             }
4481         else
4482             {
4483             scP[c] = 0.0;
4484             /* no need to add it to the lnScaler */
4485             i += nStates;
4486             j += nStates;
4487             }
4488         }
4489
4490     m->scalersSet[chain][p->index] = YES;
4491
4492     return (NO_ERROR);
4493 }
4494
4495
4496 /*----------------------------------------------------------------
4497 |
4498 |   CondLikeScaler_NUC4: 4by4 nucleotide model with or without rate
4499 |       variation
4500 |
4501 -----------------------------------------------------------------*/
4502 int CondLikeScaler_NUC4 (TreeNode *p, int division, int chain)
4503 {
4504     int             c, k;
4505     CLFlt           scaler, *scP, *lnScaler, *clPtr, **clP;
4506     ModelInfo       *m;
4507
4508 #   if defined (FAST_LOG)
4509     int             index;
4510 #   endif
4511
4512     m = &modelSettings[division];
4513     assert (p->scalerNode == YES);
4514
4515     /* find conditional likelihood pointers */
4516     clPtr = m->condLikes[m->condLikeIndex[chain][p->index]];
4517     clP   = m->clP;
4518     for (k=0; k<m->numGammaCats; k++)
4519         {
4520         clP[k] = clPtr;
4521         clPtr += m->numChars * m->numModelStates;
4522         }
4523
4524     /* find node scalers */
4525     scP = m->scalers[m->nodeScalerIndex[chain][p->index]];
4526
4527     /* find site scalers */
4528     lnScaler = m->scalers[m->siteScalerIndex[chain]];
4529
4530     /* rescale values */
4531     for (c=0; c<m->numChars; c++)
4532         {
4533         scaler = 0.0;
4534         for (k=0; k<m->numGammaCats; k++)
4535             {
4536             if (clP[k][A] > scaler)
4537                 scaler = clP[k][A];
4538             if (clP[k][C] > scaler)
4539                 scaler = clP[k][C];
4540             if (clP[k][G] > scaler)
4541                 scaler = clP[k][G];
4542             if (clP[k][T] > scaler)
4543                 scaler = clP[k][T];
4544             }
4545
4546 #   if defined (FAST_LOG)
4547         frexp (scaler, &index);
4548         index = 1-index;
4549         scaler = scalerValue[index];
4550 #   endif
4551         for (k=0; k<m->numGammaCats; k++)
4552             {
4553             clP[k][A] /= scaler;
4554             clP[k][C] /= scaler;
4555             clP[k][G] /= scaler;
4556             clP[k][T] /= scaler;
4557             clP[k] += 4;
4558             }
4559
4560 #   if defined (FAST_LOG)
4561         scP[c]       = logValue[index];     /* store node scaler */
4562         lnScaler[c] += scP[c];              /* add into tree scaler  */
4563 #   else
4564         scP[c]       = (CLFlt) log(scaler); /* store node scaler */
4565         lnScaler[c] += scP[c];  /* add into tree scaler  */
4566 #   endif
4567         }
4568
4569     m->scalersSet[chain][p->index] = YES;   /* set flag marking scalers set */
4570
4571     return NO_ERROR;
4572 }
4573
4574
4575 #if defined (SSE_ENABLED)
4576 /*----------------------------------------------------------------
4577 |
4578 |   CondLikeScaler_NUC4_SSE: 4by4 nucleotide model with or without rate
4579 |       variation using SSE code
4580 |
4581 -----------------------------------------------------------------*/
4582 int CondLikeScaler_NUC4_SSE (TreeNode *p, int division, int chain)
4583 {
4584     int             c, k;
4585     CLFlt           *scP, *lnScaler;
4586     __m128          *clPtr, **clP, *scP_SSE, m1;
4587     ModelInfo       *m;
4588
4589     m = &modelSettings[division];
4590     assert (p->scalerNode == YES);
4591
4592     /* find conditional likelihood pointers */
4593     clPtr = (__m128 *) m->condLikes[m->condLikeIndex[chain][p->index]];
4594     clP   = m->clP_SSE;
4595     for (k=0; k<m->numGammaCats; k++)
4596         {
4597         clP[k] = clPtr;
4598         clPtr += m->numSSEChars * m->numModelStates;
4599         }
4600
4601     /* find node scalers */
4602     scP = m->scalers[m->nodeScalerIndex[chain][p->index]];
4603     scP_SSE = (__m128 *) scP;
4604
4605     /* find site scalers */
4606     lnScaler = m->scalers[m->siteScalerIndex[chain]];
4607
4608     /* rescale */
4609     for (c=0; c<m->numSSEChars; c++)
4610         {
4611         m1 = _mm_setzero_ps ();
4612         for (k=0; k<m->numGammaCats; k++)
4613             {
4614             m1 = _mm_max_ps (m1, clP[k][A]);
4615             m1 = _mm_max_ps (m1, clP[k][C]);
4616             m1 = _mm_max_ps (m1, clP[k][G]);
4617             m1 = _mm_max_ps (m1, clP[k][T]);
4618             }
4619
4620         for (k=0; k<m->numGammaCats; k++)
4621             {
4622             *clP[k] = _mm_div_ps (*clP[k], m1);
4623             clP[k]++;
4624             *clP[k] = _mm_div_ps (*clP[k], m1);
4625             clP[k]++;
4626             *clP[k] = _mm_div_ps (*clP[k], m1);
4627             clP[k]++;
4628             *clP[k] = _mm_div_ps (*clP[k], m1);
4629             clP[k]++;
4630             }
4631
4632         (*scP_SSE++) = m1;
4633         }
4634
4635     /* update site scalers */
4636     for (c=0; c<m->numChars; c++)
4637         lnScaler[c] += (scP[c] = (CLFlt)(log (scP[c])));    /* add log of new scaler into tree scaler  */
4638
4639     m->scalersSet[chain][p->index] = YES;   /* set flag marking scalers set */
4640
4641     return NO_ERROR;
4642
4643 }
4644 #endif
4645
4646
4647 /*----------------------------------------------------------------
4648 |
4649 |   CondLikeScaler_NUC4_GibbsGamma: 4by4 nucleotide model with rate
4650 |       variation approximated by Gibbs sampling from gamma
4651 |
4652 -----------------------------------------------------------------*/
4653 int CondLikeScaler_NUC4_GibbsGamma (TreeNode *p, int division, int chain)
4654 {
4655     int             c, i, j, nGammaCats, *rateCat;
4656     CLFlt           scaler, *clP, *scP, *lnScaler;
4657     ModelInfo       *m;
4658
4659 #   if defined (FAST_LOG)
4660     int             index;
4661 #   endif
4662
4663     assert (p->scalerNode == YES);
4664
4665     m = &modelSettings[division];
4666
4667     /* find conditional likelihood pointer */
4668     clP = m->condLikes[m->condLikeIndex[chain][p->index]];
4669
4670     /* find node scalers */
4671     scP = m->scalers[m->nodeScalerIndex[chain][p->index]];
4672
4673     /* find site scalers */
4674     lnScaler = m->scalers[m->siteScalerIndex[chain]];
4675
4676     /* find rate category index and number of gamma categories */
4677     rateCat = m->tiIndex + chain * m->numChars;
4678     nGammaCats = m->numGammaCats;
4679
4680     /* scale */
4681     i = j = 0;
4682     for (c=0; c<m->numChars; c++)
4683         {
4684         if (rateCat[c] < nGammaCats)
4685             {
4686             scaler = 0.0;
4687             if (clP[i] > scaler)
4688                 scaler = clP[i];
4689             i++;
4690             if (clP[i] > scaler)
4691                 scaler = clP[i];
4692             i++;
4693             if (clP[i] > scaler)
4694                 scaler = clP[i];
4695             i++;
4696             if (clP[i] > scaler)
4697                 scaler = clP[i];
4698             i++;
4699
4700 #   if defined (FAST_LOG)
4701             frexp (scaler, &index);
4702             index = 1-index;
4703             scaler = scalerValue[index];
4704 #   endif
4705
4706             clP[j++] /= scaler;
4707             clP[j++] /= scaler;
4708             clP[j++] /= scaler;
4709             clP[j++] /= scaler;
4710
4711 #   if defined (FAST_LOG)
4712             scP[c]       = logValue[index];         /* store node scaler */
4713             lnScaler[c] += scP[c];                  /* add into tree scaler  */
4714 #   else
4715             scP[c]       = (CLFlt) log (scaler);    /* store node scaler */
4716             lnScaler[c] += scP[c];                  /* add into tree scaler  */
4717 #   endif
4718             }
4719         else
4720             {
4721             scP[c] = 0.0;   /* store node scaler */
4722             /* no need to add it to the lnScaler */
4723             i += 4;
4724             j += 4;
4725             }
4726         }
4727
4728     m->scalersSet[chain][p->index] = YES;
4729
4730     return NO_ERROR;
4731 }
4732
4733
4734 #if !defined (SSE_ENABLED) || 1
4735 /*----------------------------------------------------------------
4736 |
4737 |   CondLikeScaler_NY98: codon model with omega variation
4738 |
4739 -----------------------------------------------------------------*/
4740 int CondLikeScaler_NY98 (TreeNode *p, int division, int chain)
4741 {
4742     int             c, k, n, nStates;
4743     CLFlt           scaler, **clP, *clPtr, *scP, *lnScaler;
4744     ModelInfo       *m;
4745 #   if defined (FAST_LOG)
4746     int             index;
4747 #   endif
4748
4749     m = &modelSettings[division];
4750     nStates = m->numModelStates;
4751
4752     /* find conditional likelihood pointers */
4753     clPtr = m->condLikes[m->condLikeIndex[chain][p->index]];
4754     clP   = m->clP;
4755     for (k=0; k<m->numOmegaCats; k++)
4756         {
4757         clP[k] = clPtr;
4758         clPtr += m->numChars * m->numModelStates;
4759         }
4760
4761     /* find node scalers */
4762     scP = m->scalers[m->nodeScalerIndex[chain][p->index]];
4763
4764     /* find site scalers */
4765     lnScaler = m->scalers[m->siteScalerIndex[chain]];
4766
4767     /* rescale */
4768     for (c=0; c<m->numChars; c++)
4769         {
4770         scaler = 0.0;
4771         for (k=0; k<m->numOmegaCats; k++)
4772             {
4773             for (n=0; n<nStates; n++)
4774                 {
4775                 if (clP[k][n] > scaler)
4776                     scaler = clP[k][n];
4777                 }
4778             }
4779
4780 #   if defined (FAST_LOG)
4781         frexp (scaler, &index);
4782         index = 1-index;
4783         scaler = scalerValue[index];
4784 #   endif
4785         for (k=0; k<m->numOmegaCats; k++)
4786             {
4787             for (n=0; n<nStates; n++)
4788                 {
4789                 clP[k][n] /= scaler;
4790                 }
4791             clP[k] += n;
4792             }
4793
4794 #   if defined (FAST_LOG)
4795         scP[c]       = logValue[index];         /* store node scaler */
4796         lnScaler[c] += scP[c];                  /* add into tree scaler  */
4797 #   else
4798         scP[c]       = (CLFlt) log (scaler);    /* store node scaler */
4799         lnScaler[c] += scP[c];                  /* add into tree scaler  */
4800 #   endif
4801         }
4802
4803     m->scalersSet[chain][p->index] = YES;
4804
4805     return (NO_ERROR);
4806 }
4807 #endif
4808
4809
4810 #if defined (SSE_ENABLED)
4811 /*----------------------------------------------------------------
4812 |
4813 |   CondLikeScaler_NY98_SSE: codon model with omega variation
4814 |
4815 -----------------------------------------------------------------*/
4816 int CondLikeScaler_NY98_SSE (TreeNode *p, int division, int chain)
4817 {
4818     int             c, k, n, nStates;
4819     CLFlt           *scP, *lnScaler;
4820     __m128          *clPtr, **clP, m1;
4821     ModelInfo       *m;
4822 #   if defined (FAST_LOG)
4823     int             index;
4824 #   endif
4825
4826     m = &modelSettings[division];
4827     nStates = m->numModelStates;
4828
4829     /* find conditional likelihood pointers */
4830     clPtr = (__m128 *) m->condLikes[m->condLikeIndex[chain][p->index]];
4831     clP   = m->clP_SSE;
4832     for (k=0; k<m->numOmegaCats; k++)
4833         {
4834         clP[k] = clPtr;
4835         clPtr += m->numSSEChars * m->numModelStates;
4836         }
4837
4838     /* find node scalers */
4839     scP = m->scalers[m->nodeScalerIndex[chain][p->index]];
4840     //scP_SSE = (__m128 *) scP;
4841
4842     /* find site scalers */
4843     lnScaler = m->scalers[m->siteScalerIndex[chain]];
4844
4845     /* rescale */
4846     for (c=0; c<m->numSSEChars; c++)
4847         {
4848         //scaler = 0.0;
4849         m1 = _mm_setzero_ps ();
4850         for (k=0; k<m->numOmegaCats; k++)
4851             {
4852             for (n=0; n<nStates; n++)
4853                 {
4854                 m1 = _mm_max_ps (m1, clP[k][n]);
4855                 }
4856             }
4857         _mm_store_ps (scP,  m1);
4858         scP += FLOATS_PER_VEC;
4859
4860 #   if defined (FAST_LOG)
4861         frexp (scaler, &index);
4862         index = 1-index;
4863         scaler = scalerValue[index];
4864 #   endif
4865         for (k=0; k<m->numOmegaCats; k++)
4866             {
4867             for (n=0; n<nStates; n++)
4868                 {
4869                 *clP[k] = _mm_div_ps (*clP[k], m1);
4870                 clP[k]++;
4871                 }
4872             }
4873         }
4874
4875     /* Reset scP to original position*/
4876     scP = m->scalers[m->nodeScalerIndex[chain][p->index]];
4877     for (c=0; c<m->numChars; c++)
4878         {
4879 #   if defined (FAST_LOG)
4880         scP[c]       = logValue[index];         /* store node scaler */
4881         lnScaler[c] += scP[c];                  /* add into tree scaler  */
4882 #   else
4883         scP[c]       = (CLFlt) log (scP[c]);    /* store node scaler */
4884         lnScaler[c] += scP[c];                  /* add into tree scaler  */
4885 #   endif
4886         }
4887
4888     m->scalersSet[chain][p->index] = YES;
4889
4890     return (NO_ERROR);
4891 }
4892 #endif
4893
4894
4895 /*----------------------------------------------------------------
4896 |
4897 |   CondLikeScaler_Std: variable states model with or without
4898 |       rate variation
4899 |
4900 -----------------------------------------------------------------*/
4901 int CondLikeScaler_Std (TreeNode *p, int division, int chain)
4902 {
4903     int             c, n, k, nStates, numReps;
4904     CLFlt           scaler, *clPtr, **clP, *scP, *lnScaler;
4905     ModelInfo       *m;
4906 #   if defined (FAST_LOG)
4907     int             index;
4908 #   endif
4909
4910     assert (p->scalerNode == YES);
4911
4912     m = &modelSettings[division];
4913
4914     numReps=0;
4915     for (c=0; c<m->numChars; c++)
4916         {
4917         if (m->nStates[c] == 2)
4918             numReps += m->numBetaCats * 2;
4919         else
4920             numReps += m->nStates[c];
4921         }
4922
4923     /* find conditional likelihood pointers */
4924     clPtr = m->condLikes[m->condLikeIndex[chain][p->index]];
4925     clP   = m->clP;
4926     for (k=0; k<m->numGammaCats; k++)
4927         {
4928         clP[k] = clPtr;
4929         clPtr += numReps;
4930         }
4931
4932     /* find node scalers */
4933     scP = m->scalers[m->nodeScalerIndex[chain][p->index]];
4934
4935     /* find site scalers */
4936     lnScaler = m->scalers[m->siteScalerIndex[chain]];
4937
4938     /* rescale */
4939     for (c=0; c<m->numChars; c++)
4940         {
4941         scaler = 0.0;
4942         nStates = m->nStates[c];
4943         if (nStates == 2)
4944             nStates = m->numBetaCats * 2;
4945
4946         for (k=0; k<m->numGammaCats; k++)
4947             {
4948             for (n=0; n<nStates; n++)
4949                 {
4950                 if (clP[k][n] > scaler)
4951                     scaler = clP[k][n];
4952                 }
4953             }
4954
4955 #   if defined (FAST_LOG)
4956         frexp (scaler, &index);
4957         index = 1-index;
4958         scaler = scalerValue[index];
4959 #   endif
4960         for (k=0; k<m->numGammaCats; k++)
4961             {
4962             for (n=0; n<nStates; n++)
4963                 clP[k][n] /= scaler;
4964             clP[k] += nStates;
4965             }
4966
4967 #   if defined (FAST_LOG)
4968         scP[c]       = logValue[index];         /* store node scaler */
4969         lnScaler[c] += scP[c];                  /* add into tree scaler  */
4970 #   else
4971         scP[c]       = (CLFlt) log (scaler);    /* store node scaler */
4972         lnScaler[c] += scP[c];                  /* add into tree scaler  */
4973 #   endif
4974         }
4975
4976     m->scalersSet[chain][p->index] = YES;
4977
4978     return NO_ERROR;
4979 }
4980
4981
4982 /*------------------------------------------------------------------
4983 |
4984 |   Likelihood_Adgamma: all n-state models with autocorrelated
4985 |        discrete gamma rate variation, NOT morph, restriction,
4986 |        codon or doublet models; just fill in rateProbs
4987 |
4988 -------------------------------------------------------------------*/
4989 int Likelihood_Adgamma (TreeNode *p, int division, int chain, MrBFlt *lnL, int whichSitePats)
4990 {
4991     int             c, j, k, i, nStates, nStatesDiv2;
4992     MrBFlt          *bs, *swr, s01, s10, probOn, probOff, covBF[40];
4993     MrBFlt          like, *rP;
4994     CLFlt           *clP;
4995     ModelInfo       *m;
4996
4997     /* NOTE: whichSitePats offsets numSitesOfPat by whichSitePats X numCompressedChars.
4998        This is done so we can use the character reweighting scheme for "heating" chains. This was easy to
4999        accomplish for all of the models except this one, which doesn't use numSitesOfPat when calculating
5000        likelihoods. Either we disallow autocorrelated rates when using MCMC with character reweighting, or
5001        we properly calculate likelihoods when some site patterns have increased or decreased weight. For
5002        now, we do not allow MCMCMC with character reweighting with this HMM; we bail out in the function
5003        FillNumSitesOfPat if we have Adgamma rate variation and reweighting. */
5004     k = whichSitePats;
5005
5006     /* find model settings */
5007     m = &modelSettings[division];
5008
5009     /* get the number of states */
5010     nStates = m->numModelStates;
5011     nStatesDiv2 = nStates / 2;
5012
5013     /* find base frequencies */
5014     bs = GetParamSubVals (m->stateFreq, chain, state[chain]);
5015
5016     /* find conditional likelihood pointer */
5017     clP = m->condLikes[m->condLikeIndex[chain][p->index]];
5018
5019     /* find pointer to rate probabilities */
5020     rP = rateProbs[chain] + state[chain] * rateProbRowSize + m->rateProbStart;
5021
5022     /* loop over characters and calculate rate probs */
5023     if (m->switchRates != NULL)
5024         {
5025         swr = GetParamVals (m->switchRates, chain, state[chain]);
5026         s01 = swr[0];
5027         s10 = swr[1];
5028         probOn = s01 / (s01 + s10);
5029         probOff =  1.0 - probOn;
5030         for (j=0; j<nStatesDiv2; j++)
5031             {
5032             covBF[j] = bs[j] * probOn;
5033             covBF[j+nStatesDiv2] = bs[j] * probOff;
5034             }
5035         bs = covBF;
5036         }
5037
5038     for (c=i=0; c<m->numChars; c++)
5039         {
5040         for (k=0; k<m->numGammaCats; k++)
5041             {
5042             like =  0.0;
5043             for (j=0; j<nStates; j++)
5044                 like += (*(clP++)) *  bs[j];
5045             rP[i++] = like;
5046             }
5047         }
5048
5049     /* reset lnL, likelihood calculated later for this model */
5050     *lnL =  0.0;
5051
5052     return (NO_ERROR);
5053 }
5054
5055
5056 /*------------------------------------------------------------------
5057 |
5058 |   Likelihood_Gen: general n-state models with or without rate
5059 |       variation
5060 |
5061 -------------------------------------------------------------------*/
5062 int Likelihood_Gen (TreeNode *p, int division, int chain, MrBFlt *lnL, int whichSitePats)
5063 {
5064     int             c, j, k, nStates, hasPInvar;
5065     MrBFlt          s01, s10, probOn, probOff, *swr;
5066     MrBFlt          covBF[40], freq, *bs, like, likeI, pInvar=0.0, lnLike;
5067     CLFlt           *clPtr, **clP, *lnScaler, *nSitesOfPat, *clInvar=NULL;
5068     ModelInfo       *m;
5069
5070     /* find model settings and nStates, pInvar, invar cond likes */
5071     m = &modelSettings[division];
5072     nStates = m->numModelStates;
5073     if (m->pInvar == NULL)
5074         {
5075         hasPInvar = NO;
5076         }
5077     else
5078         {
5079         hasPInvar = YES;
5080         pInvar =  *(GetParamVals (m->pInvar, chain, state[chain]));
5081         clInvar = m->invCondLikes;
5082         }
5083
5084     /* find conditional likelihood pointers */
5085     clPtr = m->condLikes[m->condLikeIndex[chain][p->index]];
5086     clP = m->clP;
5087     for (k=0; k<m->numGammaCats; k++)
5088         {
5089         clP[k] = clPtr;
5090         clPtr += m->numChars * m->numModelStates;
5091         }
5092
5093
5094     /* find base frequencies */
5095     bs = GetParamSubVals (m->stateFreq, chain, state[chain]);
5096
5097     /* if covarion model, adjust base frequencies */
5098     if (m->switchRates != NULL)
5099         {
5100         /* find the stationary frequencies */
5101         swr = GetParamVals(m->switchRates, chain, state[chain]);
5102         s01 = swr[0];
5103         s10 = swr[1];
5104         probOn = s01 / (s01 + s10);
5105         probOff =  1.0 - probOn;
5106
5107         /* now adjust the base frequencies; on-state stored first in cond likes */
5108         for (j=0; j<nStates/2; j++)
5109             {
5110             covBF[j] = bs[j] * probOn;
5111             covBF[j+nStates/2] = bs[j] * probOff;
5112             }
5113
5114         /* finally set bs pointer to adjusted values */
5115         bs = covBF;
5116         }
5117
5118     /* find category frequencies */
5119     if (hasPInvar == NO)
5120         freq =  1.0 /  m->numGammaCats;
5121     else
5122         freq = (1.0 - pInvar) /  m->numGammaCats;
5123
5124     /* find site scaler */
5125     lnScaler = m->scalers[m->siteScalerIndex[chain]];
5126
5127     /* find nSitesOfPat */
5128     nSitesOfPat = numSitesOfPat + (whichSitePats*numCompressedChars) + m->compCharStart;
5129
5130     /* reset lnL */
5131     *lnL = 0.0;
5132
5133     /* loop over characters */
5134     if (hasPInvar == NO)
5135         {
5136         for (c=0; c<m->numChars; c++)
5137             {
5138             like = 0.0;
5139             for (k=0; k<m->numGammaCats; k++)
5140                 for (j=0; j<nStates; j++)
5141                     {
5142                     like += (*(clP[k]++)) * bs[j];
5143 #   ifdef DEBUG_LIKELIHOOD
5144                     // printf ("char=%d cat=%d j=%d like %E\n",c, k,j,like);
5145 #   endif
5146                     }
5147             like *= freq;
5148
5149             /* check against LIKE_EPSILON (values close to zero are problematic) */
5150             if (like < LIKE_EPSILON)
5151                 {
5152 #   ifdef DEBUG_LIKELIHOOD
5153                 MrBayesPrint ("%s   WARNING: In LIKE_EPSILON - for division %d char %d has like = %1.30le\n", spacer, division+1, c+1, like);
5154 #   endif
5155                 (*lnL) = MRBFLT_NEG_MAX;
5156                 abortMove = YES;
5157                 return ERROR;
5158                 }
5159             else
5160                 {
5161                 (*lnL) += (lnScaler[c] +  log(like)) * nSitesOfPat[c];
5162                 }
5163             }
5164         }
5165     else
5166         {
5167         /* has invariable category */
5168         for (c=0; c<m->numChars; c++)
5169             {
5170             likeI = like = 0.0;
5171             for (k=0; k<m->numGammaCats; k++)
5172                 for (j=0; j<nStates; j++)
5173                     {
5174                     like += (*(clP[k]++)) * bs[j];
5175                     }
5176             like *= freq;
5177             for (j=0; j<nStates; j++)
5178                 likeI += (*(clInvar++)) * bs[j] * pInvar;
5179             if (lnScaler[c] < -200.0)
5180                 {
5181                 /* we are not going to be able to exponentiate the scaling factor */
5182                 if (likeI > 1E-70)
5183                     {
5184                     /* forget about like; it is going to be insignificant compared to likeI */
5185                     lnLike = log(likeI);
5186                     }
5187                 else
5188                     {
5189                     /* treat likeI as if 0.0, that is, ignore it completely */
5190                     lnLike = log(like) + lnScaler[c];
5191                     }
5192                 }
5193             else
5194                 lnLike = log (like + (likeI / exp (lnScaler[c]))) + lnScaler[c];
5195
5196             /* check against LIKE_EPSILON (values close to zero are problematic) */
5197             if (like < LIKE_EPSILON)
5198                 {
5199 #   ifdef DEBUG_LIKELIHOOD
5200                 MrBayesPrint ("%s   WARNING: In LIKE_EPSILON - for division %d char %d has like = %1.30le\n", spacer, division+1, c+1, like);
5201 #   endif
5202                 (*lnL) = MRBFLT_NEG_MAX;
5203                 abortMove = YES;
5204                 return ERROR;
5205                 }
5206             else
5207                 {
5208                 (*lnL) += lnLike * nSitesOfPat[c];
5209                 }
5210             }
5211         }
5212
5213     return NO_ERROR;
5214 }
5215
5216
5217 #if defined (SSE_ENABLED)
5218 //#   if 0
5219 //CLFlt DeleteME[1000];
5220 //int PrintOld_SSE (TreeNode *p, int division, int chain){
5221 //
5222 //    int             c, c1, j, k, nStates;
5223 //    //MrBFlt            *swr, likeI, pInvar=0.0, lnLike;
5224 //    CLFlt           *temp_vector;
5225 //    __m128          *clPtr, **clP;
5226 //    ModelInfo       *m;
5227 //
5228 //    m = &modelSettings[division];
5229 //    nStates = m->numModelStates;
5230 //    /* find conditional likelihood pointers */
5231 //
5232 //    temp_vector =  DeleteME;
5233 //
5234 //    clPtr = (__m128 *) (m->condLikes[m->condLikeIndex[chain][p->index]]);
5235 //    clP = m->clP_SSE;
5236 //    for (k=0; k<m->numGammaCats; k++)
5237 //        {
5238 //        clP[k] = clPtr;
5239 //        clPtr += m->numSSEChars * m->numModelStates;
5240 //        }
5241 //
5242 //    for (c=0; c<m->numChars; c++)
5243 //        {
5244 //        c1 = c / FLOATS_PER_VEC;
5245 //        for (k=0; k<m->numGammaCats; k++)
5246 //            {
5247 //            for (j=0; j<nStates; j++)
5248 //                {
5249 //                *temp_vector++ = *(((CLFlt*)&clP[k][c1*nStates+j])+c % FLOATS_PER_VEC);
5250 //                }
5251 //            }
5252 //        }
5253 //    temp_vector=DeleteME;
5254 //
5255 //    return 1;
5256 //}
5257 //#   endif
5258
5259
5260 /*------------------------------------------------------------------
5261 |
5262 |   Likelihood_Gen_SSE: general n-state model with or without rate
5263 |       variation
5264 |
5265 -------------------------------------------------------------------*/
5266 int Likelihood_Gen_SSE (TreeNode *p, int division, int chain, MrBFlt *lnL, int whichSitePats)
5267 {
5268     int             c, j, k, nStates, hasPInvar;
5269     MrBFlt          like, *bs;
5270     MrBFlt          s01, s10, probOn, probOff, *swr, covBF[40], freq, likeI, pInvar=0.0, lnLike;
5271     CLFlt           *lnScaler, *nSitesOfPat, *lnL_SSE, *lnLI_SSE;
5272     __m128          *clPtr, **clP, *clInvar=NULL;
5273     __m128          m1, mCatLike, mLike, mFreq;
5274     ModelInfo       *m;
5275
5276     /* find model settings and nStates, pInvar, invar cond likes */
5277     m = &modelSettings[division];
5278     nStates = m->numModelStates;
5279     if (m->pInvar == NULL)
5280         {
5281         hasPInvar = NO;
5282         }
5283     else
5284         {
5285         hasPInvar = YES;
5286         pInvar =  *(GetParamVals (m->pInvar, chain, state[chain]));
5287         clInvar = (__m128 *) (m->invCondLikes);
5288         }
5289
5290     /* find conditional likelihood pointers */
5291     clPtr = (__m128 *) (m->condLikes[m->condLikeIndex[chain][p->index]]);
5292     clP = m->clP_SSE;
5293     for (k=0; k<m->numGammaCats; k++)
5294         {
5295         clP[k] = clPtr;
5296         clPtr += m->numSSEChars * m->numModelStates;
5297         }
5298     lnL_SSE  = m->lnL_SSE;
5299     lnLI_SSE = m->lnLI_SSE;
5300
5301     /* find base frequencies */
5302     bs = GetParamSubVals (m->stateFreq, chain, state[chain]);
5303
5304     /* if covarion model, adjust base frequencies */
5305     if (m->switchRates != NULL)
5306         {
5307         /* find the stationary frequencies */
5308         swr = GetParamVals(m->switchRates, chain, state[chain]);
5309         s01 = swr[0];
5310         s10 = swr[1];
5311         probOn = s01 / (s01 + s10);
5312         probOff =  1.0 - probOn;
5313
5314         /* now adjust the base frequencies; on-state stored first in cond likes */
5315         for (j=0; j<nStates/2; j++)
5316             {
5317             covBF[j] = bs[j] * probOn;
5318             covBF[j+nStates/2] = bs[j] * probOff;
5319             }
5320
5321         /* finally set bs pointer to adjusted values */
5322         bs = covBF;
5323         }
5324
5325     /* find category frequencies */
5326     if (hasPInvar == NO)
5327         freq =  1.0 /  m->numGammaCats;
5328     else
5329         freq = (1.0 - pInvar) /  m->numGammaCats;
5330
5331     mFreq = _mm_set1_ps ((CLFlt)(freq));
5332
5333     /* find site scaler */
5334     lnScaler = m->scalers[m->siteScalerIndex[chain]];
5335
5336     /* find nSitesOfPat */
5337     nSitesOfPat = numSitesOfPat + (whichSitePats*numCompressedChars) + m->compCharStart;
5338
5339     /* reset lnL */
5340     *lnL = 0.0;
5341
5342     for (c=0; c<m->numSSEChars; c++)
5343         {
5344         mLike = _mm_setzero_ps ();
5345         for (k=0; k<m->numGammaCats; k++)
5346             {
5347             mCatLike = _mm_setzero_ps ();
5348             for (j=0; j<nStates; j++)
5349                 {
5350                 m1 = _mm_mul_ps (clP[k][j], _mm_set1_ps ((CLFlt)bs[j]));
5351                 mCatLike = _mm_add_ps (mCatLike, m1);
5352                 }
5353             m1 = _mm_mul_ps (mCatLike, mFreq);
5354             mLike = _mm_add_ps (mLike, m1);
5355             clP[k] += nStates;
5356             }
5357         _mm_store_ps (lnL_SSE, mLike);
5358         lnL_SSE += FLOATS_PER_VEC;
5359         }
5360
5361     /* loop over characters */
5362     if (hasPInvar == NO)
5363         {
5364         for (c=0; c<m->numChars; c++)
5365             {
5366             like = m->lnL_SSE[c];
5367             /* check against LIKE_EPSILON (values close to zero are problematic) */
5368             if (like < LIKE_EPSILON)
5369                 {
5370 #   ifdef DEBUG_LIKELIHOOD
5371                 MrBayesPrint ("%s   WARNING: In LIKE_EPSILON - for division %d char %d has like = %1.30le\n", spacer, division+1, c+1, like);
5372 #   endif
5373                 (*lnL) = MRBFLT_NEG_MAX;
5374                 abortMove = YES;
5375                 return ERROR;
5376                 }
5377             else
5378                 {
5379                 (*lnL) += (lnScaler[c] +  log(like)) * nSitesOfPat[c];
5380                 }
5381             }
5382         }
5383     else
5384         {
5385         /* has invariable category */
5386         for (c=0; c<m->numSSEChars; c++)
5387             {
5388             mCatLike = _mm_setzero_ps ();
5389             for (j=0; j<nStates; j++)
5390                 {
5391                 m1 = _mm_mul_ps (clInvar[j], _mm_set1_ps ((CLFlt)bs[j]));
5392                 mCatLike = _mm_add_ps (mCatLike, m1);
5393                 }
5394             clInvar += nStates;
5395             _mm_store_ps (lnL_SSE, mCatLike);
5396             lnLI_SSE += FLOATS_PER_VEC;
5397             }
5398
5399         for (c=0; c<m->numChars; c++)
5400             {
5401             like  = m->lnL_SSE[c];
5402             likeI = m->lnLI_SSE[c];
5403             if (lnScaler[c] < -200.0)
5404                 {
5405                 /* we are not going to be able to exponentiate the scaling factor */
5406                 if (likeI > 1E-70)
5407                     {
5408                     /* forget about like; it is going to be insignificant compared to likeI */
5409                     lnLike = log(likeI);
5410                     }
5411                 else
5412                     {
5413                     /* treat likeI as if 0.0, that is, ignore it completely */
5414                     lnLike = log(like) + lnScaler[c];
5415                     }
5416                 }
5417             else
5418                 lnLike = log (like + (likeI / exp (lnScaler[c]))) + lnScaler[c];
5419
5420             /* check against LIKE_EPSILON (values close to zero are problematic) */
5421             if (like < LIKE_EPSILON)
5422                 {
5423 #   ifdef DEBUG_LIKELIHOOD
5424                 MrBayesPrint ("%s   WARNING: In LIKE_EPSILON - for division %d char %d has like = %1.30le\n", spacer, division+1, c+1, like);
5425 #   endif
5426                 (*lnL) = MRBFLT_NEG_MAX;
5427                 abortMove = YES;
5428                 return ERROR;
5429                 }
5430             else
5431                 {
5432                 (*lnL) += lnLike * nSitesOfPat[c];
5433                 }
5434             }
5435         }
5436
5437     return NO_ERROR;
5438
5439 }
5440 #endif
5441
5442
5443 /*------------------------------------------------------------------
5444 |
5445 |   Likelihood_Gen_GibbsGamma: general n-state models using
5446 |       Gibbs resampling of discrete gamma rate categories
5447 |
5448 -------------------------------------------------------------------*/
5449 int Likelihood_Gen_GibbsGamma (TreeNode *p, int division, int chain, MrBFlt *lnL, int whichSitePats)
5450 {
5451     int             c, j, nStates, nGammaCats, *rateCat;
5452     MrBFlt          s01, s10, probOn, probOff, *swr;
5453     MrBFlt          covBF[40], *bs, like;
5454     CLFlt           *clP, *lnScaler, *nSitesOfPat, *clInvar=NULL;
5455     ModelInfo       *m;
5456
5457     /* find model settings, nStates and invar cond likes */
5458     m = &modelSettings[division];
5459     nStates = m->numModelStates;
5460     clInvar = m->invCondLikes;
5461
5462     /* find conditional likelihood pointer */
5463     clP = m->condLikes[m->condLikeIndex[chain][p->index]];
5464
5465     /* find base frequencies */
5466     bs = GetParamSubVals (m->stateFreq, chain, state[chain]);
5467
5468     /* if covarion model, adjust base frequencies */
5469     if (m->switchRates != NULL)
5470         {
5471         /* find the stationary frequencies */
5472         swr = GetParamVals(m->switchRates, chain, state[chain]);
5473         s01 = swr[0];
5474         s10 = swr[1];
5475         probOn = s01 / (s01 + s10);
5476         probOff =  1.0 - probOn;
5477
5478         /* now adjust the base frequencies; on-state stored first in cond likes */
5479         for (j=0; j<nStates/2; j++)
5480             {
5481             covBF[j] = bs[j] * probOn;
5482             covBF[j+nStates/2] = bs[j] * probOff;
5483             }
5484
5485         /* finally set bs pointer to adjusted values */
5486         bs = covBF;
5487         }
5488
5489     /* find site scaler */
5490     lnScaler = m->scalers[m->siteScalerIndex[chain]];
5491
5492     /* find nSitesOfPat */
5493     nSitesOfPat = numSitesOfPat + (whichSitePats*numCompressedChars) + m->compCharStart;
5494
5495     /* find rate category index and number of gamma categories */
5496     rateCat = m->tiIndex + chain * m->numChars;
5497     nGammaCats = m->numGammaCats;
5498
5499     /* reset lnL */
5500     *lnL = 0.0;
5501
5502     /* loop over characters */
5503     if (m->pInvar == NULL)
5504         {
5505         for (c=0; c<m->numChars; c++)
5506             {
5507             like = 0.0;
5508             for (j=0; j<nStates; j++)
5509                 {
5510                 like += (*(clP++)) * bs[j];
5511 #   ifdef DEBUG_LIKELIHOOD
5512                 // printf ("char=%d cat=%d j=%d like %E\n",c, k,j,like);
5513 #   endif
5514                 }
5515
5516             /* check against LIKE_EPSILON (values close to zero are problematic) */
5517             if (like < LIKE_EPSILON)
5518                 {
5519 #   ifdef DEBUG_LIKELIHOOD
5520                 MrBayesPrint ("%s   WARNING: In LIKE_EPSILON - for division %d char %d has like = %1.30le\n", spacer, division+1, c+1, like);
5521 #   endif
5522                 (*lnL) = MRBFLT_NEG_MAX;
5523                 abortMove = YES;
5524                 return ERROR;
5525                 }
5526             else
5527                 {
5528                 (*lnL) += (lnScaler[c] +  log(like)) * nSitesOfPat[c];
5529                 }
5530             }
5531         }
5532     else
5533         {
5534         /* has invariable category */
5535         for (c=0; c<m->numChars; c++)
5536             {
5537             like = 0.0;
5538             if (rateCat[c] < nGammaCats)
5539                 {
5540                 for (j=0; j<nStates; j++)
5541                     like += (*(clP++)) * bs[j];
5542                 clInvar += nStates;
5543                 }
5544             else
5545                 {
5546                 for (j=0; j<nStates; j++)
5547                     like += (*(clInvar++)) * bs[j];
5548                 clP += nStates;
5549                 }
5550
5551             /* check against LIKE_EPSILON (values close to zero are problematic) */
5552             if (like < LIKE_EPSILON)
5553                 {
5554 #   ifdef DEBUG_LIKELIHOOD
5555                 MrBayesPrint ("%s   WARNING: In LIKE_EPSILON - for division %d char %d has like = %1.30le\n", spacer, division+1, c+1, like);
5556 #   endif
5557                 (*lnL) = MRBFLT_NEG_MAX;
5558                 abortMove = YES;
5559                 return ERROR;
5560                 }
5561             else
5562                 {
5563                 (*lnL) += (log(like) + lnScaler[c]) * nSitesOfPat[c];
5564                 }
5565             }
5566         }
5567
5568     return NO_ERROR;
5569 }
5570
5571
5572 /*------------------------------------------------------------------
5573 |
5574 |   Likelihood_NUC4: 4by4 nucleotide models with or without rate
5575 |       variation
5576 |
5577 -------------------------------------------------------------------*/
5578 int Likelihood_NUC4 (TreeNode *p, int division, int chain, MrBFlt *lnL, int whichSitePats)
5579 {
5580     int             c, k, hasPInvar;
5581     MrBFlt          freq, likeI, *bs, like, pInvar=0.0;
5582     CLFlt           *clPtr, **clP, *lnScaler, *nSitesOfPat, *clInvar=NULL;
5583     ModelInfo       *m;
5584
5585 #   if defined (FAST_LOG)
5586     int             index;
5587     MrBFlt          likeAdjust = 1.0, f;
5588 #   endif
5589
5590     /* find model settings and pInvar, invar cond likes */
5591     m = &modelSettings[division];
5592     if (m->pInvar == NULL)
5593         {
5594         hasPInvar = NO;
5595         }
5596     else
5597         {
5598         hasPInvar = YES;
5599         pInvar =  *(GetParamVals (m->pInvar, chain, state[chain]));
5600         clInvar = m->invCondLikes;
5601         }
5602
5603     /* find conditional likelihood pointers */
5604     clPtr = m->condLikes[m->condLikeIndex[chain][p->index]];
5605     clP = m->clP;
5606     for (k=0; k<m->numGammaCats; k++)
5607         {
5608         clP[k] = clPtr;
5609         clPtr += m->numChars * m->numModelStates;
5610         }
5611
5612     /* find base frequencies */
5613     bs = GetParamSubVals (m->stateFreq, chain, state[chain]);
5614
5615     /* find category frequencies */
5616     if (hasPInvar == NO)
5617         freq =  1.0 /  m->numGammaCats;
5618     else
5619         freq =  (1.0 - pInvar) /  m->numGammaCats;
5620
5621     /* find tree scaler */
5622     lnScaler = m->scalers[m->siteScalerIndex[chain]];
5623
5624     /* find nSitesOfPat */
5625     nSitesOfPat = numSitesOfPat + (whichSitePats*numCompressedChars) + m->compCharStart;
5626
5627     /* reset lnL */
5628     *lnL = 0.0;
5629
5630     /* loop over characters */
5631     if (hasPInvar == NO)
5632         {
5633         for (c=0; c<m->numChars; c++)
5634             {
5635             like = 0.0;
5636             for (k=0; k<m->numGammaCats; k++)
5637                 {
5638                 like += (clP[k][A] * bs[A] + clP[k][C] * bs[C] + clP[k][G] * bs[G] + clP[k][T] * bs[T]);
5639                 clP[k] += 4;
5640                 }
5641             like *= freq;
5642
5643             /* check against LIKE_EPSILON (values close to zero are problematic) */
5644             if (like < LIKE_EPSILON)
5645                 {
5646 #   ifdef DEBUG_LIKELIHOOD
5647                 MrBayesPrint ("%s   WARNING: In LIKE_EPSILON - for division %d char %d has like = %1.30le\n", spacer, division+1, c+1, like);
5648 #   endif
5649                 (*lnL) = MRBFLT_NEG_MAX;
5650                 abortMove = YES;
5651                 return ERROR;
5652                 }
5653             else
5654                 {
5655 #   if defined (FAST_LOG)
5656                 f = frexp (like, &index);
5657                 index = 1-index;
5658                 (*lnL) += (lnScaler[c] +  logValue[index]) * nSitesOfPat[c];
5659                 for (k=0; k<(int)nSitesOfPat[c]; k++)
5660                     likeAdjust *= f;
5661 #   else
5662                 (*lnL) += (lnScaler[c] +  log(like)) * nSitesOfPat[c];
5663 #   endif
5664                 }
5665             }
5666         }
5667     else
5668         {
5669         /* has invariable category */
5670         for (c=0; c<m->numChars; c++)
5671             {
5672             like = 0.0;
5673             for (k=0; k<m->numGammaCats; k++)
5674                 {
5675                 like += (clP[k][A] * bs[A] + clP[k][C] * bs[C] + clP[k][G] * bs[G] + clP[k][T] * bs[T]);
5676                 clP[k] += 4;
5677                 }
5678             like *= freq;
5679             likeI = (clInvar[A] * bs[A] + clInvar[C] * bs[C] + clInvar[G] * bs[G] + clInvar[T] * bs[T]) * pInvar;
5680             if (lnScaler[c] < -200)
5681                 {
5682                 /* we are not going to be able to exponentiate the scaling factor */
5683                 if (likeI > 1E-70)
5684                     {
5685                     /* forget about like; it is going to be insignificant compared to likeI */
5686                     like = likeI;
5687                     }
5688                 else
5689                     {
5690                     /* treat likeI as if 0.0, that is, ignore it completely */
5691                     }
5692                 }
5693             else
5694                 like = like + (likeI / exp (lnScaler[c]));
5695
5696             clInvar += 4;
5697
5698             /* check against LIKE_EPSILON (values close to zero are problematic) */
5699             if (like < LIKE_EPSILON)
5700                 {
5701 #   ifdef DEBUG_LIKELIHOOD
5702                 MrBayesPrint ("%s   WARNING: In LIKE_EPSILON - for division %d char %d has like = %1.30le\n", spacer, division+1, c+1, like);
5703 #   endif
5704                 (*lnL) = MRBFLT_NEG_MAX;
5705                 abortMove = YES;
5706                 return ERROR;
5707                 }
5708             else
5709                 {
5710 #   if defined (FAST_LOG)
5711                 f = frexp (like, &index);
5712                 index = 1-index;
5713                 (*lnL) += (lnScaler[c] +  logValue[index]) * nSitesOfPat[c];
5714                 for (k=0; k<(int)nSitesOfPat[c]; k++)
5715                     likeAdjust *= f;
5716 #   else
5717                 (*lnL) += (lnScaler[c] +  log(like)) * nSitesOfPat[c];
5718 #   endif
5719                 }
5720             }
5721         }
5722
5723 #   if defined (FAST_LOG)
5724     (*lnL) += log (likeAdjust);
5725 #   endif
5726
5727     return NO_ERROR;
5728 }
5729
5730
5731 /*------------------------------------------------------------------
5732 |
5733 |   Likelihood_NUC4_GibbsGamma: 4by4 nucleotide models with rate
5734 |       variation using Gibbs sampling from gamma rate categories
5735 |
5736 -------------------------------------------------------------------*/
5737 int Likelihood_NUC4_GibbsGamma (TreeNode *p, int division, int chain, MrBFlt *lnL, int whichSitePats)
5738 {
5739     int             c, i, r, nGammaCats, *rateCat;
5740     MrBFlt          *bs, like;
5741     CLFlt           *clP, *lnScaler, *nSitesOfPat, *clInvar;
5742     ModelInfo       *m;
5743
5744 #   if defined (FAST_LOG)
5745     int             k, index;
5746     MrBFlt          likeAdjust = 1.0, f;
5747 #   endif
5748
5749     /* find model settings and invar cond likes */
5750     m = &modelSettings[division];
5751     clInvar = m->invCondLikes;
5752
5753     /* find conditional likelihood pointer */
5754     clP = m->condLikes[m->condLikeIndex[chain][p->index]];
5755
5756     /* find base frequencies */
5757     bs = GetParamSubVals (m->stateFreq, chain, state[chain]);
5758
5759     /* find tree scaler */
5760     lnScaler = m->scalers[m->siteScalerIndex[chain]];
5761
5762     /* find nSitesOfPat */
5763     nSitesOfPat = numSitesOfPat + (whichSitePats*numCompressedChars) + m->compCharStart;
5764
5765     /* find rate category index  and number of gamma categories */
5766     rateCat = m->tiIndex + chain * m->numChars;
5767     nGammaCats = m->numGammaCats;
5768
5769     /* reset lnL */
5770     *lnL = 0.0;
5771
5772     /* loop over characters */
5773     if (m->pInvar == NULL)
5774         {
5775         for (c=i=0; c<m->numChars; c++)
5776             {
5777             like = (clP[A] * bs[A] + clP[C] * bs[C] + clP[G] * bs[G] + clP[T] * bs[T]);
5778             clP += 4;
5779
5780             /* check against LIKE_EPSILON (values close to zero are problematic) */
5781             if (like < LIKE_EPSILON)
5782                 {
5783 #   ifdef DEBUG_LIKELIHOOD
5784                 MrBayesPrint ("%s   WARNING: In LIKE_EPSILON - for division %d char %d has like = %1.30le\n", spacer, division+1, c+1, like);
5785 #   endif
5786                 (*lnL) = MRBFLT_NEG_MAX;
5787                 abortMove = YES;
5788                 return ERROR;
5789                 }
5790             else
5791                 {
5792 #   if defined (FAST_LOG)
5793                 f = frexp (like, &index);
5794                 index = 1-index;
5795                 (*lnL) += (lnScaler[c] +  logValue[index]) * nSitesOfPat[c];
5796                 for (k=0; k<(int)nSitesOfPat[c]; k++)
5797                     likeAdjust *= f;
5798 #   else
5799                 (*lnL) += (lnScaler[c] +  log(like)) * nSitesOfPat[c];
5800 #   endif
5801                 }
5802             }
5803         }
5804     else
5805         {
5806         /* has invariable category */
5807         for (c=i=0; c<m->numChars; c++)
5808             {
5809             r = rateCat[c];
5810             if (r < nGammaCats)
5811                 like = (clP[A] * bs[A] + clP[C] * bs[C] + clP[G] * bs[G] + clP[T] * bs[T]);
5812             else
5813                 like = (clInvar[A] * bs[A] + clInvar[C] * bs[C] + clInvar[G] * bs[G] + clInvar[T] * bs[T]);
5814             clInvar += 4;
5815             clP += 4;
5816
5817             /* check against LIKE_EPSILON (values close to zero are problematic) */
5818             if (like < LIKE_EPSILON)
5819                 {
5820 #   ifdef DEBUG_LIKELIHOOD
5821                 MrBayesPrint ("%s   WARNING: In LIKE_EPSILON - for division %d char %d has like = %1.30le\n", spacer, division+1, c+1, like);
5822 #   endif
5823                 (*lnL) = MRBFLT_NEG_MAX;
5824                 abortMove = YES;
5825                 return ERROR;
5826                 }
5827             else
5828                 {
5829                 (*lnL) += (log (like) + lnScaler[c]) * nSitesOfPat[c];
5830                 }
5831             }
5832         }
5833
5834 #   if defined (FAST_LOG)
5835     (*lnL) += log (likeAdjust);
5836 #   endif
5837
5838     return NO_ERROR;
5839 }
5840
5841
5842 //#if defined (SSE_ENABLED)
5843 ///*------------------------------------------------------------------
5844 // |
5845 // | Likelihood_NUC4_GibbsGamma: 4by4 nucleotide models with rate
5846 // |     variation using Gibbs sampling from gamma rate categories
5847 // |
5848 // -------------------------------------------------------------------*/
5849 //int Likelihood_NUC4_GibbsGamma_SSE (TreeNode *p, int division, int chain, MrBFlt *lnL, int whichSitePats)
5850 //{
5851 //    int             c, i, r, nGammaCats, *rateCat;
5852 //    MrBFlt          *bs, like;
5853 //    CLFlt           *lnScaler, *nSitesOfPat, *lnL_SSE, *lnLI_SSE;
5854 //    __m128          *clP, *clInvar=NULL;
5855 //    __m128          m1, mA, mC, mG, mT, mFreq, mPInvar, mLike;
5856 //    ModelInfo       *m;
5857 //
5858 //#if defined (FAST_LOG)
5859 //    int             k, index;
5860 //    MrBFlt          likeAdjust = 1.0, f;
5861 //#endif
5862 //
5863 //    /* find model settings and invar cond likes */
5864 //    m = &modelSettings[division];
5865 //    clInvar = (__m128 *)m->invCondLikes;
5866 //    /* find conditional likelihood pointer */
5867 //    clP = (__m128 *)m->condLikes[m->condLikeIndex[chain][p->index]];
5868 //
5869 //    lnL_SSE  = m->lnL_SSE;
5870 //    lnLI_SSE = m->lnLI_SSE;
5871 //
5872 //    /* find base frequencies */
5873 //    bs = GetParamSubVals (m->stateFreq, chain, state[chain]);
5874 //
5875 //    /* find tree scaler */
5876 //    lnScaler = m->scalers[m->siteScalerIndex[chain]];
5877 //
5878 //    /* find nSitesOfPat */
5879 //    nSitesOfPat = numSitesOfPat + (whichSitePats*numCompressedChars) + m->compCharStart;
5880 //
5881 //    /* find rate category index  and number of gamma categories */
5882 //    rateCat = m->tiIndex + chain * m->numChars;
5883 //    nGammaCats = m->numGammaCats;
5884 //
5885 //    /* reset lnL */
5886 //    *lnL = 0.0;
5887 //
5888 //    /* calculate variable likelihood */
5889 //    for (c=0; c<m->numSSEChars; c++)
5890 //    {
5891 //        mLike = _mm_mul_ps (clP[A], mA);
5892 //        m1    = _mm_mul_ps (clP[C], mC);
5893 //        mLike = _mm_add_ps (mLike, m1);
5894 //        m1    = _mm_mul_ps (clP[G], mG);
5895 //        mLike = _mm_add_ps (mLike, m1);
5896 //        m1    = _mm_mul_ps (clP[T], mT);
5897 //        mLike = _mm_add_ps (mLike, m1);
5898 //
5899 //        clP += 4;
5900 //        _mm_store_ps (lnL_SSE, mLike);
5901 //        lnL_SSE += FLOATS_PER_VEC;
5902 //    }
5903 //
5904 //    /* calculate invariable likelihood */
5905 //    if (hasPInvar == YES)
5906 //    {
5907 //        for (c=0; c<m->numSSEChars; c++)
5908 //        {
5909 //            mLike = _mm_mul_ps (clInvar[A], mA);
5910 //            m1    = _mm_mul_ps (clInvar[C], mC);
5911 //            mLike = _mm_add_ps (mLike, m1);
5912 //            m1    = _mm_mul_ps (clInvar[G], mG);
5913 //            mLike = _mm_add_ps (mLike, m1);
5914 //            m1    = _mm_mul_ps (clInvar[T], mT);
5915 //            mLike = _mm_add_ps (mLike, m1);
5916 //            mLike = _mm_mul_ps (mLike, mPInvar);
5917 //
5918 //            _mm_store_ps (lnLI_SSE, mLike);
5919 //            clInvar += 4;
5920 //            lnLI_SSE += FLOATS_PER_VEC;
5921 //        }
5922 //    }
5923 //
5924 //
5925 //    /* loop over characters */
5926 //    if (m->pInvar == NULL)
5927 //    {
5928 //        for (c=i=0; c<m->numChars; c++)
5929 //        {
5930 //            like = m->lnL_SSE[c];
5931 //            /* check against LIKE_EPSILON (values close to zero are problematic) */
5932 //            if (like < LIKE_EPSILON)
5933 //            {
5934 //                MrBayesPrint ("%s   WARNING: In LIKE_EPSILON - for division %d char %d has like = %1.30lf\n", spacer, division, c, like);
5935 //                (*lnL) = MRBFLT_NEG_MAX;
5936 //                return ERROR;
5937 //            }
5938 //            else
5939 //            {
5940 //#if defined (FAST_LOG)
5941 //                f = frexp (like, &index);
5942 //                index = 1-index;
5943 //                (*lnL) += (lnScaler[c] +  logValue[index]) * nSitesOfPat[c];
5944 //                for (k=0; k<(int)nSitesOfPat[c]; k++)
5945 //                    likeAdjust *= f;
5946 //#else
5947 //                (*lnL) += (lnScaler[c] +  log(like)) * nSitesOfPat[c];
5948 //#endif
5949 //            }
5950 //        }
5951 //    }
5952 //    else
5953 //    {
5954 //        /* has invariable category */
5955 //        for (c=i=0; c<m->numChars; c++)
5956 //        {
5957 //            r = rateCat[c];
5958 //            if (r < nGammaCats)
5959 //                like = m->lnL_SSE[c];
5960 //            else
5961 //                like = m->lnLI_SSE[c];
5962 //
5963 //            /* check against LIKE_EPSILON (values close to zero are problematic) */
5964 //            if (like < LIKE_EPSILON)
5965 //            {
5966 //                MrBayesPrint ("%s   WARNING: In LIKE_EPSILON - for division %d char %d has like = %1.30lf\n", spacer, division, c, like);
5967 //                (*lnL) = MRBFLT_NEG_MAX;
5968 //                return ERROR;
5969 //            }
5970 //            else
5971 //            {
5972 //                (*lnL) += (log (like) + lnScaler[c]) * nSitesOfPat[c];
5973 //            }
5974 //        }
5975 //    }
5976 //
5977 //#if defined (FAST_LOG)
5978 //    (*lnL) += log (likeAdjust);
5979 //#endif
5980 //
5981 //    return NO_ERROR;
5982 //}
5983 //#endif
5984
5985
#if defined (SSE_ENABLED)
/*------------------------------------------------------------------
|
|   Likelihood_NUC4_SSE: 4by4 nucleotide models with or without rate
|       variation
|
|   SSE version. The per-character likelihoods are first computed
|   FLOATS_PER_VEC at a time into m->lnL_SSE (variable sites) and,
|   when the model has pInvar, into m->lnLI_SSE (invariable sites);
|   the results are then accumulated character by character into
|   *lnL.
|
|   The invariable-site likelihood likeI is not scaled, so it is
|   divided by exp(lnScaler) before it is added to the scaled
|   variable-site likelihood. If the scaler is too small to
|   exponentiate (< -200) and likeI is not negligible, the site
|   likelihood is taken to be likeI alone; no scaler is added to
|   its log in that case because likeI is already unscaled.
|
|   Returns NO_ERROR, or ERROR (with *lnL = MRBFLT_NEG_MAX and
|   abortMove = YES) if a site likelihood is below LIKE_EPSILON.
|
-------------------------------------------------------------------*/
int Likelihood_NUC4_SSE (TreeNode *p, int division, int chain, MrBFlt *lnL, int whichSitePats)
{
    int             c, k, hasPInvar;
    MrBFlt          freq, *bs, pInvar=0.0, like, likeI, siteLnScaler;
    CLFlt           *lnScaler, *nSitesOfPat, *lnL_SSE, *lnLI_SSE;
    __m128          *clPtr, **clP, *clInvar=NULL;
    __m128          m1, mA, mC, mG, mT, mFreq, mPInvar=_mm_set1_ps(0.0f), mLike;
    ModelInfo       *m;

#   if defined (FAST_LOG)
    int             index;
    MrBFlt          likeAdjust = 1.0, f;
#   endif

    /* find model settings and pInvar, invar cond likes */
    m = &modelSettings[division];
    if (m->pInvar == NULL)
        {
        hasPInvar = NO;
        }
    else
        {
        hasPInvar = YES;
        pInvar =  *(GetParamVals (m->pInvar, chain, state[chain]));
        mPInvar = _mm_set1_ps ((CLFlt)(pInvar));
        clInvar = (__m128 *) (m->invCondLikes);
        }

    /* find conditional likelihood pointers, one per gamma category */
    clPtr = (__m128 *) (m->condLikes[m->condLikeIndex[chain][p->index]]);
    clP = m->clP_SSE;
    for (k=0; k<m->numGammaCats; k++)
        {
        clP[k] = clPtr;
        clPtr += m->numSSEChars * m->numModelStates;
        }
    lnL_SSE  = m->lnL_SSE;
    lnLI_SSE = m->lnLI_SSE;

    /* find base frequencies and broadcast them into vectors */
    bs = GetParamSubVals (m->stateFreq, chain, state[chain]);
    mA = _mm_set1_ps ((CLFlt)(bs[A]));
    mC = _mm_set1_ps ((CLFlt)(bs[C]));
    mG = _mm_set1_ps ((CLFlt)(bs[G]));
    mT = _mm_set1_ps ((CLFlt)(bs[T]));

    /* find category frequencies (the variable categories share 1 - pInvar) */
    if (hasPInvar == NO)
        freq =  1.0 / m->numGammaCats;
    else
        freq =  (1.0 - pInvar) / m->numGammaCats;
    mFreq = _mm_set1_ps ((CLFlt)(freq));

    /* find tree scaler */
    lnScaler = m->scalers[m->siteScalerIndex[chain]];

    /* find nSitesOfPat */
    nSitesOfPat = numSitesOfPat + (whichSitePats*numCompressedChars) + m->compCharStart;

    /* reset lnL */
    *lnL = 0.0;

    /* calculate variable likelihood */
    for (c=0; c<m->numSSEChars; c++)
        {
        mLike = _mm_setzero_ps ();
        for (k=0; k<m->numGammaCats; k++)
            {
            m1    = _mm_mul_ps (clP[k][A], mA);
            mLike = _mm_add_ps (mLike, m1);
            m1    = _mm_mul_ps (clP[k][C], mC);
            mLike = _mm_add_ps (mLike, m1);
            m1    = _mm_mul_ps (clP[k][G], mG);
            mLike = _mm_add_ps (mLike, m1);
            m1    = _mm_mul_ps (clP[k][T], mT);
            mLike = _mm_add_ps (mLike, m1);
            clP[k] += 4;
            }
        mLike = _mm_mul_ps (mLike, mFreq);
        _mm_store_ps (lnL_SSE, mLike);
        lnL_SSE += FLOATS_PER_VEC;
        }

    /* calculate invariable likelihood */
    if (hasPInvar == YES)
        {
        for (c=0; c<m->numSSEChars; c++)
            {
            mLike = _mm_mul_ps (clInvar[A], mA);
            m1    = _mm_mul_ps (clInvar[C], mC);
            mLike = _mm_add_ps (mLike, m1);
            m1    = _mm_mul_ps (clInvar[G], mG);
            mLike = _mm_add_ps (mLike, m1);
            m1    = _mm_mul_ps (clInvar[T], mT);
            mLike = _mm_add_ps (mLike, m1);
            mLike = _mm_mul_ps (mLike, mPInvar);

            _mm_store_ps (lnLI_SSE, mLike);
            clInvar += 4;
            lnLI_SSE += FLOATS_PER_VEC;
            }
        }

    /* accumulate results */
    if (hasPInvar == NO)
        {
        for (c=0; c<m->numChars; c++)
            {
            like = m->lnL_SSE[c];
            /* check against LIKE_EPSILON (values close to zero are problematic) */
            if (like < LIKE_EPSILON)
                {
#   ifdef DEBUG_LIKELIHOOD
                MrBayesPrint ("%s   WARNING: In LIKE_EPSILON - for division %d char %d has like = %1.30le\n", spacer, division+1, c+1, like);
#   endif
                (*lnL) = MRBFLT_NEG_MAX;
                abortMove = YES;
                return ERROR;
                }
            else
                {
#   if defined (FAST_LOG)
                /* split like into mantissa and exponent; the mantissas are multiplied up and logged once at the end */
                f = frexp (like, &index);
                index = 1-index;
                (*lnL) += (lnScaler[c] +  logValue[index]) * nSitesOfPat[c];
                for (k=0; k<(int)nSitesOfPat[c]; k++)
                    likeAdjust *= f;
#   else
                (*lnL) += (lnScaler[c] +  log(like)) * nSitesOfPat[c];
#   endif
                }
            }
        }
    else
        {
        /* has invariable category */
        for (c=0; c<m->numChars; c++)
            {
            like  = m->lnL_SSE[c];
            likeI = m->lnLI_SSE[c];

            /* scaler that applies to the value finally stored in like */
            siteLnScaler = lnScaler[c];
            if (siteLnScaler < -200)
                {
                /* we are not going to be able to exponentiate the scaling factor */
                if (likeI > 1E-70)
                    {
                    /* forget about like; it is going to be insignificant compared to likeI.
                       likeI is unscaled, so the site scaler must not be added to its log */
                    like = likeI;
                    siteLnScaler = 0.0;
                    }
                else
                    {
                    /* treat likeI as if 0.0, that is, ignore it completely */
                    }
                }
            else
                like = like + (likeI / exp (siteLnScaler));

            /* check against LIKE_EPSILON (values close to zero are problematic) */
            if (like < LIKE_EPSILON)
                {
#   ifdef DEBUG_LIKELIHOOD
                MrBayesPrint ("%s   WARNING: In LIKE_EPSILON - for division %d char %d has like = %1.30le\n", spacer, division+1, c+1, like);
#   endif
                (*lnL) = MRBFLT_NEG_MAX;
                abortMove = YES;
                return ERROR;
                }
            else
                {
#   if defined (FAST_LOG)
                f = frexp (like, &index);
                index = 1-index;
                (*lnL) += (siteLnScaler +  logValue[index]) * nSitesOfPat[c];
                for (k=0; k<(int)nSitesOfPat[c]; k++)
                    likeAdjust *= f;
#   else
                (*lnL) += (siteLnScaler +  log(like)) * nSitesOfPat[c];
#   endif
                }
            }
        }

#   if defined (FAST_LOG)
    (*lnL) += log (likeAdjust);
#   endif

    return NO_ERROR;
}
#endif
6181
6182
6183 /*------------------------------------------------------------------
6184 |
6185 |   Likelihood_NY98: Codon model with three selection categories,
6186 |       after Nielsen and Yang (1998).
6187 |
6188 -------------------------------------------------------------------*/
6189 int Likelihood_NY98 (TreeNode *p, int division, int chain, MrBFlt *lnL, int whichSitePats)
6190 {
6191     int             c, j, k, nStates;
6192     MrBFlt          catLike, like, *bs, *omegaCatFreq;
6193     CLFlt           **clP,*clPtr, *lnScaler, *nSitesOfPat;
6194     ModelInfo       *m;
6195
6196     m = &modelSettings[division];
6197
6198     /* number of states */
6199     nStates = m->numModelStates;
6200
6201     /* find conditional likelihood pointers */
6202     clPtr = m->condLikes[m->condLikeIndex[chain][p->index]];
6203     clP   = m->clP;
6204     for (k=0; k<m->numOmegaCats; k++)
6205         {
6206         clP[k] = clPtr;
6207         clPtr += m->numChars * m->numModelStates;
6208         }
6209
6210     /* find codon frequencies */
6211     bs = GetParamSubVals (m->stateFreq, chain, state[chain]);
6212
6213     /* find category frequencies */
6214     omegaCatFreq = GetParamSubVals (m->omega, chain, state[chain]);
6215
6216     /* find site scaler */
6217     lnScaler = m->scalers[m->siteScalerIndex[chain]];
6218
6219     /* find nSitesOfPat */
6220     nSitesOfPat = numSitesOfPat + (whichSitePats*numCompressedChars) + m->compCharStart;
6221
6222     *lnL = 0.0; /* reset lnL */
6223
6224     for (c=m->numDummyChars; c<m->numChars; c++)
6225         {
6226         like = 0.0;
6227         for (k=0; k<m->numOmegaCats; k++)
6228             {
6229             catLike = 0.0;
6230             for (j=0; j<nStates; j++)
6231                 catLike += clP[k][j] * bs[j];
6232             like += catLike * omegaCatFreq[k];
6233             clP[k] += nStates;
6234             }
6235         /* check against LIKE_EPSILON (values close to zero are problematic) */
6236         if (like < LIKE_EPSILON)
6237             {
6238 #   ifdef DEBUG_LIKELIHOOD
6239             MrBayesPrint ("%s   WARNING: In LIKE_EPSILON - for division %d char %d has like = %1.30le\n", spacer, division+1, c+1, like);
6240 #   endif
6241             (*lnL) = MRBFLT_NEG_MAX;
6242             abortMove = YES;
6243             return ERROR;
6244             }
6245         else
6246             {
6247             (*lnL) += (lnScaler[c] +  log(like)) * nSitesOfPat[c];
6248             }
6249         }
6250
6251     return NO_ERROR;
6252 }
6253
6254
#if defined (SSE_ENABLED)
/*------------------------------------------------------------------
|
|   Likelihood_NY98_SSE: Codon model with three selection categories,
|       after Nielsen and Yang (1998).
|
|   SSE version. The mixed site likelihoods are first computed
|   FLOATS_PER_VEC at a time into m->lnL_SSE; the characters from
|   numDummyChars up to numChars are then logged, scaled, weighted
|   by the pattern counts and summed into *lnL.
|
|   Returns NO_ERROR, or ERROR (with *lnL = MRBFLT_NEG_MAX and
|   abortMove = YES) if a site likelihood is below LIKE_EPSILON.
|
-------------------------------------------------------------------*/
int Likelihood_NY98_SSE (TreeNode *p, int division, int chain, MrBFlt *lnL, int whichSitePats)
{
    int             vec, site, s, cat, numStates;
    MrBFlt          siteLike, *codonFreq, *catFreq;
    CLFlt           *siteScaler, *patWeight, *out;
    __m128          *clBase, **catCL;
    __m128          mTerm, mCat, mSite;
    ModelInfo       *mod;

    mod = &modelSettings[division];
    numStates = mod->numModelStates;

    /* one conditional likelihood pointer per omega category */
    clBase = (__m128 *) mod->condLikes[mod->condLikeIndex[chain][p->index]];
    catCL  = mod->clP_SSE;
    for (cat=0; cat<mod->numOmegaCats; cat++, clBase += mod->numSSEChars * numStates)
        catCL[cat] = clBase;

    /* codon frequencies and omega category frequencies */
    codonFreq = GetParamSubVals (mod->stateFreq, chain, state[chain]);
    catFreq   = GetParamSubVals (mod->omega, chain, state[chain]);

    /* site scalers and pattern weights */
    siteScaler = mod->scalers[mod->siteScalerIndex[chain]];
    patWeight  = numSitesOfPat + (whichSitePats*numCompressedChars) + mod->compCharStart;

    *lnL = 0.0;

    /* vector pass: store the mixed site likelihoods in lnL_SSE */
    out = mod->lnL_SSE;
    for (vec=0; vec<mod->numSSEChars; vec++, out += FLOATS_PER_VEC)
        {
        mSite = _mm_setzero_ps ();
        for (cat=0; cat<mod->numOmegaCats; cat++)
            {
            mCat = _mm_setzero_ps ();
            for (s=0; s<numStates; s++)
                {
                mTerm = _mm_mul_ps (catCL[cat][s], _mm_set1_ps ((CLFlt)codonFreq[s]));
                mCat  = _mm_add_ps (mCat, mTerm);
                }
            mTerm = _mm_mul_ps (mCat, _mm_set1_ps ((CLFlt)catFreq[cat]));
            mSite = _mm_add_ps (mSite, mTerm);
            catCL[cat] += numStates;
            }
        _mm_store_ps (out, mSite);
        }

    /* scalar pass: accumulate the log likelihood */
    for (site=mod->numDummyChars; site<mod->numChars; site++)
        {
        siteLike = mod->lnL_SSE[site];

        /* values close to zero are problematic: give up on this move */
        if (siteLike < LIKE_EPSILON)
            {
#   ifdef DEBUG_LIKELIHOOD
            MrBayesPrint ("%s   WARNING: In LIKE_EPSILON - for division %d char %d has like = %1.30le\n", spacer, division+1, site+1, siteLike);
#   endif
            (*lnL) = MRBFLT_NEG_MAX;
            abortMove = YES;
            return ERROR;
            }

        (*lnL) += (siteScaler[site] +  log(siteLike)) * patWeight[site];
        }

    return NO_ERROR;
}
#endif
6340
6341
6342 /*------------------------------------------------------------------
6343 |
6344 |   Likelihood_Res: restriction site model with or without rate
6345 |       variation
6346 |
6347 -------------------------------------------------------------------*/
6348 int Likelihood_Res (TreeNode *p, int division, int chain, MrBFlt *lnL, int whichSitePats)
6349 {
6350     int             c, k;
6351     MrBFlt          *bs, freq, like, pUnobserved, pObserved;
6352     CLFlt           *clPtr, **clP, *lnScaler, *nSitesOfPat;
6353     ModelInfo       *m;
6354
6355
6356     m = &modelSettings[division];
6357
6358     /* find conditional likelihood pointer */
6359     clPtr = m->condLikes[m->condLikeIndex[chain][p->index]];
6360     clP = m->clP;
6361     for (k=0; k<m->numGammaCats; k++)
6362         {
6363         clP[k] = clPtr;
6364         clPtr += m->numChars * m->numModelStates;
6365         }
6366
6367     /* find base frequencies */
6368     bs = GetParamSubVals (m->stateFreq, chain, state[chain]);
6369
6370     /* find category frequencies */
6371     freq =  1.0 /  m->numGammaCats;
6372
6373     /* find site scaler */
6374     lnScaler = m->scalers[m->siteScalerIndex[chain]];
6375
6376     /* find nSitesOfPat */
6377     nSitesOfPat = numSitesOfPat + (whichSitePats*numCompressedChars) + m->compCharStart;
6378
6379     *lnL = 0.0; /* reset lnL */
6380
6381     pUnobserved = 0.0;
6382     for (c=0; c<m->numDummyChars; c++)
6383         {
6384         like = 0.0;
6385         for (k=0; k<m->numGammaCats; k++)
6386             {
6387             like += (clP[k][0]*bs[0] + clP[k][1]*bs[1]) * freq;
6388             clP[k] += 2;
6389             }
6390         pUnobserved += like *  exp(lnScaler[c]);
6391         }
6392
6393     pObserved =  1.0 - pUnobserved;
6394     if (pObserved < LIKE_EPSILON)
6395         {
6396 #   ifdef DEBUG_LIKELIHOOD
6397         MrBayesPrint ("%s   WARNING: p(Observed) < LIKE_EPSILON - for division %d p(Observed) = %1.30le\n", spacer, division+1, pObserved);
6398 #   endif
6399         (*lnL) = MRBFLT_NEG_MAX;
6400         abortMove = YES;
6401         return ERROR;
6402         }
6403
6404     for (c=m->numDummyChars; c<m->numChars; c++)
6405         {
6406         like = 0.0;
6407         for (k=0; k<m->numGammaCats; k++)
6408             {
6409             like += (clP[k][0]*bs[0] + clP[k][1]*bs[1]) * freq;
6410             clP[k] += 2;
6411             }
6412         /* check against LIKE_EPSILON (values close to zero are problematic) */
6413         if (like < LIKE_EPSILON)
6414             {
6415 #   ifdef DEBUG_LIKELIHOOD
6416             MrBayesPrint ("%s   WARNING: In LIKE_EPSILON - for division %d char %d has like = %1.30le\n", spacer, division+1, c+1, like);
6417 #   endif
6418             (*lnL) = MRBFLT_NEG_MAX;
6419             abortMove = YES;
6420             return ERROR;
6421             }
6422         else
6423             {
6424             (*lnL) += (lnScaler[c] +  log(like)) * nSitesOfPat[c];
6425             }
6426         }
6427
6428     /* correct for absent characters */
6429     (*lnL) -=  log(pObserved) * (m->numUncompressedChars);
6430
6431     return NO_ERROR;
6432 }
6433
6434
#if defined (SSE_ENABLED)
/*------------------------------------------------------------------
|
|   Likelihood_Res_SSE: restriction site model with or without rate
|       variation
|
|   SSE version. The two-state site likelihoods are first computed
|   FLOATS_PER_VEC at a time into m->lnL_SSE. The first
|   numDummyChars entries are then summed (after undoing their
|   scaling) into the probability of the unobserved patterns, the
|   remaining ones are accumulated into *lnL, and the total is
|   corrected by log(1 - pUnobserved) times the number of
|   uncompressed characters.
|
|   Returns NO_ERROR, or ERROR (with *lnL = MRBFLT_NEG_MAX and
|   abortMove = YES) if the probability of being observed or a site
|   likelihood is below LIKE_EPSILON.
|
-------------------------------------------------------------------*/
int Likelihood_Res_SSE (TreeNode *p, int division, int chain, MrBFlt *lnL, int whichSitePats)
{
    int             vec, site, cat;
    MrBFlt          catWeight, *stateFreq, siteLike, probMissing, probSeen;
    CLFlt           *siteScaler, *patWeight, *out;
    __m128          *clBase, **catCL;
    __m128          mFreq0, mFreq1, mWeight, mSite;
    ModelInfo       *mod;

    /* find model settings */
    mod = &modelSettings[division];

    /* one conditional likelihood pointer per gamma category */
    clBase = (__m128 *) (mod->condLikes[mod->condLikeIndex[chain][p->index]]);
    catCL  = mod->clP_SSE;
    for (cat=0; cat<mod->numGammaCats; cat++, clBase += mod->numSSEChars * mod->numModelStates)
        catCL[cat] = clBase;
    out = mod->lnL_SSE;

    /* broadcast the two state frequencies and the category weight */
    stateFreq = GetParamSubVals (mod->stateFreq, chain, state[chain]);
    mFreq0 = _mm_set1_ps ((CLFlt)(stateFreq[0]));
    mFreq1 = _mm_set1_ps ((CLFlt)(stateFreq[1]));

    catWeight =  1.0 / mod->numGammaCats;
    mWeight = _mm_set1_ps ((CLFlt)(catWeight));

    /* site scalers and pattern weights */
    siteScaler = mod->scalers[mod->siteScalerIndex[chain]];
    patWeight  = numSitesOfPat + (whichSitePats*numCompressedChars) + mod->compCharStart;

    *lnL = 0.0;

    /* vector pass: store the site likelihoods in lnL_SSE */
    for (vec=0; vec<mod->numSSEChars; vec++, out += FLOATS_PER_VEC)
        {
        mSite = _mm_setzero_ps ();
        for (cat=0; cat<mod->numGammaCats; cat++)
            {
            mSite = _mm_add_ps (mSite, _mm_mul_ps (catCL[cat][0], mFreq0));
            mSite = _mm_add_ps (mSite, _mm_mul_ps (catCL[cat][1], mFreq1));
            catCL[cat] += 2;
            }
        mSite = _mm_mul_ps (mSite, mWeight);
        _mm_store_ps (out, mSite);
        }

    /* dummy characters: add up the probability of the unobserved patterns */
    probMissing = 0.0;
    for (site=0; site<mod->numDummyChars; site++)
        {
        siteLike = mod->lnL_SSE[site];
        probMissing += siteLike *  exp(siteScaler[site]);
        }

    probSeen =  1.0 - probMissing;
    if (probSeen < LIKE_EPSILON)
        {
#   ifdef DEBUG_LIKELIHOOD
        MrBayesPrint ("%s   WARNING: p(Observed) < LIKE_EPSILON - for division %d p(Observed) = %1.30le\n", spacer, division+1, probSeen);
#   endif
        (*lnL) = MRBFLT_NEG_MAX;
        abortMove = YES;
        return ERROR;
        }

    /* remaining characters: accumulate the log likelihood */
    for (site=mod->numDummyChars; site<mod->numChars; site++)
        {
        siteLike = mod->lnL_SSE[site];

        /* values close to zero are problematic: give up on this move */
        if (siteLike < LIKE_EPSILON)
            {
#   ifdef DEBUG_LIKELIHOOD
            MrBayesPrint ("%s   WARNING: In LIKE_EPSILON - for division %d char %d has like = %1.30le\n", spacer, division+1, site+1, siteLike);
#   endif
            (*lnL) = MRBFLT_NEG_MAX;
            abortMove = YES;
            return ERROR;
            }

        (*lnL) += (siteScaler[site] +  log(siteLike)) * patWeight[site];
        }

    /* correct for absent characters */
    (*lnL) -=  log(probSeen) * (mod->numUncompressedChars);

    return NO_ERROR;
}
#endif
6541
6542
6543 /*------------------------------------------------------------------
6544 |
6545 |   Likelihood_Std: variable states model with or without rate
6546 |       variation
6547 |
6548 -------------------------------------------------------------------*/
6549 int Likelihood_Std (TreeNode *p, int division, int chain, MrBFlt *lnL, int whichSitePats)
6550 {
6551     int             b, c, j, k, nBetaCats, nGammaCats, nStates, numReps;
6552     MrBFlt          catLike, catFreq, gammaFreq, like, *bs, *bsBase,
6553                     pUnobserved, pObserved;
6554     CLFlt           *clPtr, **clP, *lnScaler, *nSitesOfPat;
6555     ModelInfo       *m;
6556
6557     m = &modelSettings[division];
6558
6559     numReps=0;
6560     for (c=0; c<m->numChars; c++)
6561         {
6562         if (m->nStates[c] == 2)
6563             numReps += m->numBetaCats * 2;
6564         else
6565             numReps += m->nStates[c];
6566         }
6567     /* find conditional likelihood pointers */
6568     clPtr = m->condLikes[m->condLikeIndex[chain][p->index]];
6569     clP   = m->clP;
6570     for (k=0; k<m->numGammaCats; k++)
6571         {
6572         clP[k] = clPtr;
6573         clPtr += numReps;
6574         }
6575
6576     /* find base frequencies */
6577     bsBase = GetParamStdStateFreqs (m->stateFreq, chain, state[chain]);
6578
6579     /* find gamma category number and frequencies */
6580     nGammaCats = m->numGammaCats;
6581     gammaFreq = 1.0 / nGammaCats;
6582
6583     /* find site scaler */
6584     lnScaler = m->scalers[m->siteScalerIndex[chain]];
6585
6586     /* find nSitesOfPat */
6587     nSitesOfPat = numSitesOfPat + (whichSitePats*numCompressedChars) + m->compCharStart;
6588
6589     *lnL = 0.0; /* reset lnL */
6590
6591     if (m->numBetaCats == 1)
6592         {
6593         pUnobserved = 0.0;
6594         catFreq = gammaFreq;
6595         for (c=j=0; c<m->numDummyChars; c++)
6596             {
6597             like = 0.0;
6598             nStates = m->nStates[c];
6599             bs = bsBase + m->bsIndex[c];
6600             for (k=0; k<nGammaCats; k++)
6601                 {
6602                 catLike = 0.0;
6603                 for (j=0; j<nStates; j++)
6604                     catLike += clP[k][j] * bs[j];
6605                 like += catLike * catFreq;
6606                 clP[k] += nStates;
6607                 }
6608             pUnobserved += like *  exp(lnScaler[c]);
6609             }
6610
6611         pObserved =  1.0 - pUnobserved;
6612         if (pObserved < LIKE_EPSILON)
6613             pObserved = LIKE_EPSILON;
6614
6615         for (c=m->numDummyChars; c<m->numChars; c++)
6616             {
6617             like = 0.0;
6618             nStates = m->nStates[c];
6619             bs = bsBase + m->bsIndex[c];
6620
6621             for (k=0; k<nGammaCats; k++)
6622                 {
6623                 catLike = 0.0;
6624                 for (j=0; j<nStates; j++)
6625                     catLike += clP[k][j] * bs[j];
6626                 like += catLike * catFreq;
6627                 clP[k] += nStates;
6628                 }
6629             /* check against LIKE_EPSILON (values close to zero are problematic) */
6630             if (like < LIKE_EPSILON)
6631                 {
6632 #   ifdef DEBUG_LIKELIHOOD
6633                 MrBayesPrint ("%s   WARNING: In LIKE_EPSILON - for division %d char %d has like = %1.30le\n", spacer, division+1, c+1, like);
6634 #   endif
6635                 (*lnL) = MRBFLT_NEG_MAX;
6636                 abortMove = YES;
6637                 return ERROR;
6638                 }
6639             else
6640                 {
6641                 (*lnL) += (lnScaler[c] +  log(like)) * nSitesOfPat[c];
6642                 }
6643             }
6644         }
6645     else
6646         {
6647         pUnobserved = 0.0;
6648         for (c=j=0; c<m->numDummyChars; c++)
6649             {
6650             like = 0.0;
6651             nStates = m->nStates[c];
6652             bs = bsBase + m->bsIndex[c];
6653             if (nStates == 2)
6654                 {
6655                 nBetaCats = m->numBetaCats;
6656                 catFreq = gammaFreq / nBetaCats;
6657                 }
6658             else
6659                 {
6660                 nBetaCats = 1;
6661                 catFreq = gammaFreq;
6662                 }
6663             for (b=0; b<nBetaCats; b++)
6664                 {
6665                 for (k=0; k<nGammaCats; k++)
6666                     {
6667                     catLike = 0.0;
6668                     for (j=0; j<nStates; j++)
6669                         catLike += clP[k][j] * bs[j];
6670                     like += catLike * catFreq;
6671                     clP[k] += nStates;
6672                     }
6673                 bs += nStates;
6674                 }
6675             pUnobserved += like *  exp(lnScaler[c]);
6676             }
6677
6678         pObserved =  1.0 - pUnobserved;
6679         if (pObserved < LIKE_EPSILON)
6680             pObserved = LIKE_EPSILON;
6681
6682         for (c=m->numDummyChars; c<m->numChars; c++)
6683             {
6684             like = 0.0;
6685             nStates = m->nStates[c];
6686             bs = bsBase + m->bsIndex[c];
6687             if (nStates == 2)
6688                 {
6689                 nBetaCats = m->numBetaCats;
6690                 catFreq = gammaFreq / nBetaCats;
6691                 }
6692             else
6693                 {
6694                 nBetaCats = 1;
6695                 catFreq = gammaFreq;
6696                 }
6697             for (b=0; b<nBetaCats; b++)
6698                 {
6699                 for (k=0; k<nGammaCats; k++)
6700                     {
6701                     catLike = 0.0;
6702                     for (j=0; j<nStates; j++)
6703                         catLike += clP[k][j] * bs[j];
6704                     like += catLike * catFreq;
6705                     clP[k] += nStates;
6706                     }
6707                 bs += nStates;
6708                 }
6709             /* check against LIKE_EPSILON (values close to zero are problematic) */
6710             if (like < LIKE_EPSILON)
6711                 {
6712 #   ifdef DEBUG_LIKELIHOOD
6713                 MrBayesPrint ("%s   WARNING: In LIKE_EPSILON - for division %d char %d has like = %1.30le\n", spacer, division+1, c+1, like);
6714 #   endif
6715                 (*lnL) = MRBFLT_NEG_MAX;
6716                 abortMove = YES;
6717                 return ERROR;
6718                 }
6719             else
6720                 {
6721                 (*lnL) += (lnScaler[c] +  log(like)) * nSitesOfPat[c];
6722                 }
6723             }
6724         }
6725
6726     /* correct for absent characters */
6727     (*lnL) -=  log(pObserved) * (m->numUncompressedChars);
6728
6729     return NO_ERROR;
6730 }
6731
6732
6733 /*------------------------------------------------------------------
6734 |
6735 |   Likelihood_Pars: likelihood under the Tuffley and Steel (1997)
6736 |       model for characters with constant number of states. The idea
6737 |       is described in:
6738 |
6739 |       Tuffley, C., and M. Steel. 1997. Links between maximum likelihood
6740 |          and maximum parsimony under a simple model of site substitution.
6741 |          Bull. Math. Bio. 59:581-607.
6742 |
6743 |       The likelihood under the Tuffley and Steel (1997) model is:
6744 |
6745 |       L = k^[-(T + n)]
6746 |
6747 |       where L is the likelihood
6748 |             k is the number of character states
6749 |             T is the parsimony tree length
6750 |             n is the number of characters
6751 |
6752 |   The parsimony calculator does not use character packing; this is
6753 |       to enable reweighting of characters
6754 |
6755 |   Note that this is an empirical Bayes approach in that it uses the
6756 |       maximum likelihood branch length.
6757 |
6758 -------------------------------------------------------------------*/
6759 int Likelihood_Pars (TreeNode *p, int division, int chain, MrBFlt *lnL, int whichSitePats)
6760 {
6761     int             c, i, nStates;
6762     BitsLong        done, *pL, *pR, *pP, *pA, *oldpP, x;
6763     CLFlt           nParsChars, treeLength;
6764     CLFlt           length, *nSitesOfPat, *newNodeLength, oldNodeLength;
6765     Tree            *t;
6766     ModelInfo       *m;
6767
6768     /* Find model settings */
6769     m = &modelSettings[division];
6770
6771     /* Get tree */
6772     t = GetTree(m->brlens,chain,state[chain]);
6773
6774     /* Get parsimony tree length */
6775     treeLength = (CLFlt) m->parsTreeLength[2 * chain + state[chain]];
6776
6777     /* Get number of states */
6778     nStates = m->numStates;
6779
6780     /* Get number of sites of pat */
6781     nSitesOfPat = numSitesOfPat + (whichSitePats*numCompressedChars) + m->compCharStart;
6782
6783     /* Mark the nodes that can be stop nodes                 */
6784     /* (there must not be any touched side nodes below them) */
6785     p = t->root;
6786     p->marked = YES;
6787     for (i=t->nIntNodes-1; i>=0; i--)
6788         {
6789         p = t->intDownPass[i];
6790         p->marked = NO;
6791         if (p->upDateCl == YES && p->anc->marked == YES)
6792             {
6793             if (p->left->upDateCl == NO || p->right->upDateCl == NO)
6794                 p->marked = YES;
6795             }
6796         }
6797
6798     /* Now make downpass node by node */
6799     for (i=0; i<t->nIntNodes; i++)
6800         {
6801         p = t->intDownPass[i];
6802
6803         /* continue if no work needs to be done */
6804         if (p->upDateCl == NO)
6805             continue;
6806
6807         /* flip space */
6808         FlipCondLikeSpace(m, chain, p->index);
6809
6810         /* find parsimony sets for the node and its environment */
6811         pL    = m->parsSets[m->condLikeIndex[chain][p->left->index ]];
6812         pR    = m->parsSets[m->condLikeIndex[chain][p->right->index]];
6813         oldpP = m->parsSets[m->condLikeScratchIndex[p->index       ]];
6814         pP    = m->parsSets[m->condLikeIndex[chain][p->index       ]];
6815
6816         /* find old and new node lengths */
6817         oldNodeLength =  m->parsNodeLens[m->condLikeScratchIndex[p->index]];
6818         newNodeLength = &m->parsNodeLens[m->condLikeIndex[chain][p->index]];
6819
6820         if (t->isRooted == NO && p->anc->anc == NULL)
6821             {
6822             pA = m->parsSets[m->condLikeIndex[chain][p->anc->index]];
6823             length = 0.0;
6824             for (c=0; c<m->numChars; c++)
6825                 {
6826                 x = pL[c] & pR[c];
6827                 if (x == 0)
6828                     {
6829                     x = pL[c] | pR[c];
6830                     length += nSitesOfPat[c];
6831                     }
6832                 if ((x & pA[c]) == 0)
6833                     length += nSitesOfPat[c];
6834                 pP[c] = x;
6835                 }
6836             treeLength += (length - oldNodeLength);
6837             newNodeLength[0] = length;
6838             }
6839         else
6840             {
6841             length = 0.0;
6842             done = 0;
6843             for (c=0; c<m->numChars; c++)
6844                 {
6845                 x = pL[c] & pR[c];
6846                 if (x == 0)
6847                     {
6848                     x = pL[c] | pR[c];
6849                     length += nSitesOfPat[c];
6850                     }
6851                 pP[c] = x;
6852                 done |= (x^oldpP[c]);
6853                 }
6854             treeLength += (length - oldNodeLength);
6855             newNodeLength[0] = length;
6856             if (p->marked == YES && done == 0)
6857                 break;
6858             }
6859         }
6860
6861     /* Count number of characters in the partition. It is calculated
6862        on the fly because this number is going to differ for
6863        different chains if character reweighting is used. */
6864     nSitesOfPat = numSitesOfPat + (whichSitePats*numCompressedChars) + m->compCharStart;
6865     nParsChars = 0.0;
6866     for (c=0; c<m->numChars; c++)
6867         nParsChars += nSitesOfPat[c];
6868
6869     /* Calculate likelihood from parsimony tree length */
6870     *lnL = - ((treeLength + nParsChars) *  log (nStates));
6871
6872     /* Store current parsimony tree length */
6873     m->parsTreeLength[2 * chain + state[chain]] = treeLength;
6874
6875     return (NO_ERROR);
6876 }
6877
6878
#if 0
/* Placeholder for a codon parsimony calculator (compiled out): zeroes *lnL,
   prints a "not yet implemented" message and returns ERROR. */
int Likelihood_ParsCodon (TreeNode *p, int division, int chain, MrBFlt *lnL, int whichSitePats)
{
    /* the remaining arguments are unused; reference them to avoid warnings */
    (void) p;
    (void) division;
    (void) chain;
    (void) whichSitePats;

    *lnL = 0.0;

    MrBayesPrint ("%s   Parsimony calculator for codons not yet implemented\n", spacer);

    return ERROR;
}
#endif
6897
6898
6899 /*------------------------------------------------------------------
6900 |
6901 |   Likelihood_Pars: likelihood under the Tuffley and Steel (1997)
6902 |       model for characters with constant number of states. The idea
6903 |       is described in:
6904 |
6905 |       Tuffley, C., and M. Steel. 1997. Links between maximum likelihood
6906 |          and maximum parsimony under a simple model of site substitution.
6907 |          Bull. Math. Bio. 59:581-607.
6908 |
6909 |       The likelihood under the Tuffley and Steel (1997) model is:
6910 |
6911 |       L = k^[-(T + n)]
6912 |
6913 |       where L is the likelihood
6914 |             k is the number of character states
6915 |             T is the parsimony tree length
6916 |             n is the number of characters
6917 |
6918 |   The parsimony calculator does not use character packing; this is
6919 |       to enable reweighting of characters
6920 |
6921 |   Note that this is an empirical Bayes approach in that it uses the
6922 |       maximum likelihood branch length.
6923 |
6924 |   This variant of the calculator assumes that the number of states
6925 |       is variable. It does not take state order into account.
6926 |
6927 -------------------------------------------------------------------*/
6928 int Likelihood_ParsStd (TreeNode *p, int division, int chain, MrBFlt *lnL, int whichSitePats)
6929 {
6930     int             c, i, *nStates;
6931     BitsLong        *pL, *pR, *pP, *pA, x;
6932     CLFlt           *treeLength;
6933     CLFlt           *nSitesOfPat;
6934     Tree            *t;
6935     ModelInfo       *m;
6936
6937     /* Find model settings */
6938     m = &modelSettings[division];
6939
6940     /* Get tree */
6941     t = GetTree(m->brlens,chain,state[chain]);
6942
6943     /* Allocate space for parsimony tree length */
6944     treeLength = (CLFlt *) SafeCalloc (m->numChars, sizeof (CLFlt));
6945
6946     /* Get number of states */
6947     nStates = m->nStates;
6948
6949     /* Get number of sites of pat */
6950     nSitesOfPat = numSitesOfPat + (whichSitePats*numCompressedChars) + m->compCharStart;
6951
6952     /* Make downpass node by node; do not skip any nodes */
6953     for (i=0; i<t->nIntNodes; i++)
6954         {
6955         p = t->intDownPass[i];
6956
6957         /* flip space */
6958         FlipCondLikeSpace(m, chain, p->index);
6959
6960         /* find parsimony sets for the node and its environment */
6961         pL    = m->parsSets[m->condLikeIndex[chain][p->left->index ]];
6962         pR    = m->parsSets[m->condLikeIndex[chain][p->right->index]];
6963         pP    = m->parsSets[m->condLikeIndex[chain][p->index       ]];
6964
6965         if (t->isRooted == NO && p->anc->anc == NULL)
6966             {
6967             pA = m->parsSets[m->condLikeIndex[chain][p->anc->index]];
6968             for (c=0; c<m->numChars; c++)
6969                 {
6970                 x = pL[c] & pR[c];
6971                 if (x == 0)
6972                     {
6973                     x = pL[c] | pR[c];
6974                     treeLength[c] += nSitesOfPat[c];
6975                     }
6976                 if ((x & pA[c]) == 0)
6977                     treeLength[c] += nSitesOfPat[c];
6978                 pP[c] = x;
6979                 }
6980             }
6981         else
6982             {
6983             for (c=0; c<m->numChars; c++)
6984                 {
6985                 x = pL[c] & pR[c];
6986                 if (x == 0)
6987                     {
6988                     x = pL[c] | pR[c];
6989                     treeLength[c] += nSitesOfPat[c];
6990                     }
6991                 pP[c] = x;
6992                 }
6993             }
6994         }
6995
6996     /* Calculate the likelihood one character at a time */
6997     *lnL = 0.0;
6998     for (c=0; c<m->numChars; c++)
6999         {
7000         *lnL -= ((treeLength[c] + nSitesOfPat[c]) * log (nStates[c]));
7001         }
7002
7003     /* Free space for parsimony character states */
7004     free (treeLength);
7005
7006     return (NO_ERROR);
7007 }
7008
7009
7010 /*-----------------------------------------------------------------
7011 |
7012 |   LaunchLogLikeForDivision: calculate the log likelihood of the
7013 |       new state of the chain for a single division
7014 |
7015 -----------------------------------------------------------------*/
7016 void LaunchLogLikeForDivision(int chain, int d, MrBFlt* lnL)
7017 {
7018     int i;
7019     TreeNode        *p;
7020     ModelInfo       *m;
7021     Tree            *tree;
7022 #   if defined (TIMING_ANALIZ)
7023     clock_t         CPUTimeStart;
7024 #   endif
7025
7026     m = &modelSettings[d];
7027     tree = GetTree(m->brlens, chain, state[chain]);
7028
7029     if (m->upDateCijk == YES)
7030         {
7031         if (UpDateCijk(d, chain)== ERROR)
7032             {
7033             (*lnL) = MRBFLT_NEG_MAX; /* effectively abort the move */
7034             return;
7035             }
7036         m->upDateAll = YES;
7037         }
7038
7039 #   if defined (BEAGLE_ENABLED)
7040     if (m->useBeagle == YES)
7041         {
7042         LaunchBEAGLELogLikeForDivision(chain, d, m, tree, lnL);
7043         return;
7044         }
7045 #   endif
7046
7047     /* Flip and copy or reset site scalers */
7048     FlipSiteScalerSpace(m, chain);
7049     if (m->upDateAll == YES)
7050         ResetSiteScalers(m, chain);
7051     else
7052         CopySiteScalers(m, chain);
7053
7054     if (m->parsModelId == NO)
7055         {
7056         for (i=0; i<tree->nIntNodes; i++)
7057             {
7058             p = tree->intDownPass[i];
7059
7060             if (p->left->upDateTi == YES)
7061                 {
7062                 /* shift state of ti probs for node */
7063                 FlipTiProbsSpace (m, chain, p->left->index);
7064                 m->TiProbs (p->left, d, chain);
7065                 }
7066
7067             if (p->right->upDateTi == YES)
7068                 {
7069                 /* shift state of ti probs for node */
7070                 FlipTiProbsSpace (m, chain, p->right->index);
7071                 m->TiProbs (p->right, d, chain);
7072                 }
7073
7074             if (tree->isRooted == NO)
7075                 {
7076                 if (p->anc->anc == NULL /* && p->upDateTi == YES */)
7077                     {
7078                     /* shift state of ti probs for node */
7079                     FlipTiProbsSpace (m, chain, p->index);
7080                     m->TiProbs (p, d, chain);
7081                     }
7082                 }
7083
7084             if (p->upDateCl == YES)
7085                 {
7086                 if (tree->isRooted == NO)
7087                     {
7088                     if (p->anc->anc == NULL)
7089                         {
7090                         TIME(m->CondLikeRoot (p, d, chain),CPUCondLikeRoot);
7091                         }
7092                     else
7093                         {
7094                         TIME(m->CondLikeDown (p, d, chain),CPUCondLikeDown);
7095                         }
7096                     }
7097                 else
7098                     {
7099                     TIME(m->CondLikeDown (p, d, chain),CPUCondLikeDown);
7100                     }
7101
7102                 if (m->scalersSet[chain][p->index] == YES && m->upDateAll == NO)
7103                     {
7104 #   if defined (SSE_ENABLED)
7105                     if (m->useSSE == YES)
7106                         {
7107                         TIME(RemoveNodeScalers_SSE (p, d, chain),CPUScalersRemove);
7108                         }
7109                     else
7110                         {
7111                         TIME(RemoveNodeScalers (p, d, chain),CPUScalersRemove);
7112                         }
7113 #   else
7114                     TIME(RemoveNodeScalers (p, d, chain),CPUScalersRemove);
7115 #   endif
7116                     }
7117                 FlipNodeScalerSpace (m, chain, p->index);
7118                 m->scalersSet[chain][p->index] = NO;
7119
7120                 if (p->scalerNode == YES)
7121                     {
7122                     TIME(m->CondLikeScaler (p, d, chain),CPUScalers);
7123                     }
7124                 }
7125             }
7126         }
7127     TIME(m->Likelihood (tree->root->left, d, chain, lnL, (chainId[chain] % chainParams.numChains)),CPULilklihood);
7128     return;
7129 }
7130
7131
7132 /*----------------------------------------------------------------
7133 |
7134 |   RemoveNodeScalers: Remove node scalers
7135 |
7136 -----------------------------------------------------------------*/
7137 int RemoveNodeScalers (TreeNode *p, int division, int chain)
7138 {
7139     int             c;
7140     CLFlt           *scP, *lnScaler;
7141     ModelInfo       *m;
7142
7143     m = &modelSettings[division];
7144     assert (m->scalersSet[chain][p->index] == YES);
7145
7146     /* find scalers */
7147     scP = m->scalers[m->nodeScalerIndex[chain][p->index]];
7148
7149     /* find site scalers */
7150     lnScaler = m->scalers[m->siteScalerIndex[chain]];
7151
7152     /* remove scalers */
7153     for (c=0; c<m->numChars; c++)
7154         lnScaler[c] -= scP[c];
7155
7156     return NO_ERROR;
7157 }
7158
7159
#if defined (SSE_ENABLED)
/*----------------------------------------------------------------
|
|   RemoveNodeScalers_SSE: Remove node scalers, SSE code
|
|   Subtracts the scalers stored for node p from the site scalers
|       of the chain, one vector of characters at a time. The node
|       must have its scalers set (asserted). Always returns
|       NO_ERROR.
|
-----------------------------------------------------------------*/
int RemoveNodeScalers_SSE (TreeNode *p, int division, int chain)
{
    int             vec;
    ModelInfo       *m = &modelSettings[division];
    __m128          *nodeScaler, *siteScaler;

    assert (m->scalersSet[chain][p->index] == YES);

    /* scalers of the node and the accumulated site scalers, viewed as vectors */
    nodeScaler = (__m128*)(m->scalers[m->nodeScalerIndex[chain][p->index]]);
    siteScaler = (__m128*)(m->scalers[m->siteScalerIndex[chain]]);

    vec = 0;
    while (vec < m->numSSEChars)
        {
        siteScaler[vec] = _mm_sub_ps(siteScaler[vec], nodeScaler[vec]);
        vec++;
        }

    return NO_ERROR;
}
#endif
7191
7192
7193 int SetBinaryQMatrix (MrBFlt **a, int whichChain, int division)
7194 {
7195     MrBFlt          scaler, *bs;
7196     ModelInfo       *m;
7197
7198     /* set up pointers to the appropriate model information */
7199     m = &modelSettings[division];
7200     assert (m->numModelStates == 2);
7201
7202     bs = GetParamSubVals (m->stateFreq, whichChain, state[whichChain]);
7203     scaler = 1.0 / (2*bs[0]*bs[1]);
7204     a[0][0]= -bs[1]*scaler;
7205     a[0][1]=  bs[1]*scaler;
7206     a[1][0]=  bs[0]*scaler;
7207     a[1][1]= -bs[0]*scaler;
7208
7209     return (NO_ERROR);
7210 }
7211
7212
7213 int SetNucQMatrix (MrBFlt **a, int n, int whichChain, int division, MrBFlt rateMult, MrBFlt *rA, MrBFlt *rS)
7214 {
7215     register int    i, j, k;
7216     int             isTransition=0, nDiff, rtNum=0;
7217     MrBFlt          scaler, mult=0.0, probOn, sum, *swr, s01, s10, s[4][4], nonsyn, *rateValues=NULL, *bs, dN, dS;
7218     ModelInfo       *m;
7219     ModelParams     *mp;
7220 #   if defined BEAGLE_ENABLED
7221     MrBFlt          trans;
7222 #   endif
7223
7224     /* set up pointers to the appropriate model information */
7225     mp = &modelParams[division];
7226     m = &modelSettings[division];
7227     assert (m->numModelStates == n);
7228
7229     /* All of the models that are set up in this function require the frequencies
7230        of the nucleotides (or doublets or codons). They will also require either
7231        a transition/transversion rate ratio or the GTR rate parameters. The
7232        "rateValues" will either be
7233
7234           rateValues[0] = transtion/transversion rate (kappa)
7235
7236        for nst=2 models or
7237
7238           rateValues[0] = A <-> C rate
7239           rateValues[1] = A <-> G rate
7240           rateValues[2] = A <-> T rate
7241           rateValues[3] = C <-> G rate
7242           rateValues[4] = C <-> T rate
7243           rateValues[5] = G <-> T rate
7244
7245        for nst=6 models. */
7246     bs = GetParamSubVals (m->stateFreq, whichChain, state[whichChain]);
7247     if (m->nst == 2)
7248         {
7249         rateValues = GetParamVals(m->tRatio, whichChain, state[whichChain]);
7250 #   if defined (BEAGLE_ENABLED)
7251         /* transversions assumed to have rate 1.0; */
7252         trans = rateValues[0];
7253         if (m->numModelStates == 4)   /* code to satisfy Beagle */
7254             {
7255             rateValues = (MrBFlt *) SafeCalloc (6, sizeof(MrBFlt));
7256             rateValues[0] = rateValues[2] = rateValues[3] = rateValues[5] =1.0; /* Setting transversions */
7257             rateValues[1] = rateValues[4] = trans; /* Setting transitions */
7258             }
7259 #   endif
7260         }
7261
7262     else if (m->nst == 6 || m->nst == NST_MIXED)
7263         rateValues = GetParamVals(m->revMat, whichChain, state[whichChain]);
7264 #   if defined (BEAGLE_ENABLED)
7265     else if (m->nst == 1 && m->numModelStates == 4)   /* code to satisfy Beagle */
7266         {
7267         rateValues = (MrBFlt *) SafeCalloc (6, sizeof(MrBFlt));
7268         for (i=0; i<6; i++)
7269             rateValues[i] = 1.0;
7270         }
7271 #   endif
7272
7273     if (n == 4)
7274         {
7275         /* 4 X 4 model:
7276
7277            Here, we set the rate matrix for the GTR model (Tavare, 1986). We
7278            need not only the 6 rates for this model (rateValues), but also the
7279            base frequencies (bs). */
7280
7281         /* set diagonal of Q matrix to 0 */
7282         for (i=0; i<4; i++)
7283             a[i][i] = 0.0;
7284
7285         /* initialize Q matrix */
7286         scaler = 0.0;
7287         for (i=0; i<4; i++)
7288             {
7289             for (j=i+1; j<4; j++)
7290                 {
7291                 if (i == 0 && j == 1)
7292                     mult = rateValues[0];
7293                 else if (i == 0 && j == 2)
7294                     mult = rateValues[1];
7295                 else if (i == 0 && j == 3)
7296                     mult = rateValues[2];
7297                 else if (i == 1 && j == 2)
7298                     mult = rateValues[3];
7299                 else if (i == 1 && j == 3)
7300                     mult = rateValues[4];
7301                 else if (i == 2 && j == 3)
7302                     mult = rateValues[5];
7303                 a[i][i] -= (a[i][j] = bs[j] * mult);
7304                 a[j][j] -= (a[j][i] = bs[i] * mult);
7305                 scaler += bs[i] * a[i][j];
7306                 scaler += bs[j] * a[j][i];
7307                 }
7308             }
7309
7310         /* rescale Q matrix */
7311         scaler = 1.0 / scaler;
7312         for (i=0; i<4; i++)
7313             for (j=0; j<4; j++)
7314                 a[i][j] *= scaler;
7315         }
7316     else if (n == 8) /* we have a 4 X 4 covarion model */
7317         {
7318         /* 8 X 8 covarion model:
7319
7320            Here, we set the rate matrix for the covarion model (Tuffley and
7321            Steel, 1997). We need the rate parameters of the model
7322            (contained in rateValues), the frequencies of the four nucleotides,
7323            and the switching rates to completely specify the rate matrix. We
7324            first set up the 4 X 4 submatrix that represents changes (the upper
7325            left portion of the 8 X 8 matrix). Note that if we have rate
7326            variation across sites, that we need to deal with the multiplication
7327            in the rate matrix (i.e., we cannot simply deal with rate variation
7328            by multiplying the branch length by a rate multiplier as we can
7329            with other models). Instead, we multiply the scaled rate matrix
7330            by the rate multiplier. */
7331
7332         /* Get the switching rates. The rate of off->on is s01 and the rate
7333            of on->off is s10. The stationary probability of the switch process
7334            is prob1 = s01/(s01+s10) and prob0 = s10/(s01+s10). */
7335         swr = GetParamVals (m->switchRates, whichChain, state[whichChain]);
7336         s01 = swr[0];
7337         s10 = swr[1];
7338         probOn = s01 / (s01 + s10);
7339
7340         /* set matrix a to 0 */
7341         for (i=0; i<8; i++)
7342             for (j=0; j<8; j++)
7343                 a[i][j] = 0.0;
7344
7345         /* set up the 4 X 4 matrix representing substitutions (s[][]; upper left) */
7346         if (m->nst == 1)
7347             {
7348             scaler = 0.0;
7349             for (i=0; i<4; i++)
7350                 {
7351                 for (j=i+1; j<4; j++)
7352                     {
7353                     s[i][j] = bs[j];
7354                     s[j][i] = bs[i];
7355                     scaler += bs[i] * s[i][j] * probOn;
7356                     scaler += bs[j] * s[j][i] * probOn;
7357                     }
7358                 }
7359             }
7360         else if (m->nst == 2)
7361             {
7362             scaler = 0.0;
7363             for (i=0; i<4; i++)
7364                 {
7365                 for (j=i+1; j<4; j++)
7366                     {
7367                     if ((i == 0 && j == 2) || (i == 2 && j == 0) || (i == 1 && j == 3) || (i == 3 && j == 1))
7368                         mult = rateValues[0];
7369                     else
7370                         mult = 1.0;
7371                     s[i][j] = bs[j] * mult;
7372                     s[j][i] = bs[i] * mult;
7373                     scaler += bs[i] * s[i][j] * probOn;
7374                     scaler += bs[j] * s[j][i] * probOn;
7375                     }
7376                 }
7377             }
7378         else
7379             {
7380             scaler = 0.0;
7381             for (i=0; i<4; i++)
7382                 {
7383                 for (j=i+1; j<4; j++)
7384                     {
7385                     if (i == 0 && j == 1)
7386                         mult = rateValues[0];
7387                     else if (i == 0 && j == 2)
7388                         mult = rateValues[1];
7389                     else if (i == 0 && j == 3)
7390                         mult = rateValues[2];
7391                     else if (i == 1 && j == 2)
7392                         mult = rateValues[3];
7393                     else if (i == 1 && j == 3)
7394                         mult = rateValues[4];
7395                     else if (i == 2 && j == 3)
7396                         mult = rateValues[5];
7397
7398                     s[i][j] = bs[j] * mult;
7399                     s[j][i] = bs[i] * mult;
7400                     scaler += bs[i] * s[i][j] * probOn;
7401                     scaler += bs[j] * s[j][i] * probOn;
7402                     }
7403                 }
7404             }
7405
7406         /* rescale off diagonal elements of s[][] matrix */
7407         scaler = 1.0 / scaler;
7408         for (i=0; i<4; i++)
7409             {
7410             for (j=0; j<4; j++)
7411                 {
7412                 if (i != j)
7413                     s[i][j] *= scaler;
7414                 }
7415             }
7416
7417         /* now, scale s[][] by rate factor */
7418         for (i=0; i<4; i++)
7419             {
7420             for (j=0; j<4; j++)
7421                 {
7422                 if (i != j)
7423                     s[i][j] *= rateMult;
7424                 }
7425             }
7426
7427         /* put in diagonal elements of s[][] */
7428         for (i=0; i<4; i++)
7429             {
7430             sum = 0.0;
7431             for (j=0; j<4; j++)
7432                 {
7433                 if (i != j)
7434                     sum += s[i][j];
7435                 }
7436             s[i][i] = -(sum + s10);
7437             }
7438
7439         /* Now, put s[][] into top left portion of a matrix and fill in the
7440            other parts of the matrix with the appropriate switching rates. */
7441         for (i=0; i<4; i++)
7442             for (j=0; j<4; j++)
7443                 a[i][j] = s[i][j];
7444         for (i=4; i<8; i++)
7445             a[i][i] = -s01;
7446         a[0][4] = s10;
7447         a[1][5] = s10;
7448         a[2][6] = s10;
7449         a[3][7] = s10;
7450         a[4][0] = s01;
7451         a[5][1] = s01;
7452         a[6][2] = s01;
7453         a[7][3] = s01;
7454
7455 #       if 0
7456         for (i=0; i<8; i++)
7457             {
7458             for (j=0; j<8; j++)
7459                 printf ("%1.10lf ", a[i][j]);
7460             printf ("\n");
7461             }
7462         for (i=0; i<4; i++)
7463             printf ("%lf ", bs[i]);
7464         printf ("\n");
7465         printf ("s01 = %lf s10 = %lf pi1 = %lf pi0 = %lf\n", s01, s10, probOn, 1-probOn);
7466 #       endif
7467         }
7468     else if (n == 16)
7469         {
7470         /* 16 X 16 doublet model:
7471
7472            We have a doublet model. The states are in the order AA, AC, AG, AT, CA, CC
7473            CG, CT, GA, GC, GG, GT, TA, TC, TG, TT. The rate matrix is straight-forward
7474            to set up. We simply multiply the rate parameter (e.g., the ti/tv rate
7475            ratio) by the doublet frequencies. */
7476
7477         /* set diagonal of Q matrix to 0 */
7478         for (i=0; i<16; i++)
7479             a[i][i] = 0.0;
7480
7481         if (m->nst == 1) /* F81-like doublet model */
7482             {
7483             scaler = 0.0;
7484             for (i=0; i<16; i++)
7485                 {
7486                 for (j=i+1; j<16; j++)
7487                     {
7488                     if (((doublet[i].first & doublet[j].first) == 0) && ((doublet[i].second & doublet[j].second) == 0))
7489                         mult = 0.0;
7490                     else
7491                         mult = 1.0;
7492                     a[i][i] -= (a[i][j] = bs[j] * mult);
7493                     a[j][j] -= (a[j][i] = bs[i] * mult);
7494                     scaler += bs[i] * a[i][j];
7495                     scaler += bs[j] * a[j][i];
7496                     }
7497                 }
7498             }
7499         else if (m->nst == 2) /* HKY-like doublet model */
7500             {
7501             scaler = 0.0;
7502             for (i=0; i<16; i++)
7503                 {
7504                 for (j=i+1; j<16; j++)
7505                     {
7506                     if (((doublet[i].first & doublet[j].first) == 0) && ((doublet[i].second & doublet[j].second) == 0))
7507                         mult = 0.0;
7508                     else
7509                         {
7510                         if ((doublet[i].first & doublet[j].first) == 0)
7511                             {
7512                             if ((doublet[i].first + doublet[j].first) == 5 || (doublet[i].first + doublet[j].first) == 10)
7513                                 mult = rateValues[0];
7514                             else
7515                                 mult = 1.0;
7516                             }
7517                         else
7518                             {
7519                             if ((doublet[i].second + doublet[j].second) == 5 || (doublet[i].second + doublet[j].second) == 10)
7520                                 mult = rateValues[0];
7521                             else
7522                                 mult = 1.0;
7523                             }
7524                         }
7525                     a[i][i] -= (a[i][j] = bs[j] * mult);
7526                     a[j][j] -= (a[j][i] = bs[i] * mult);
7527                     scaler += bs[i] * a[i][j];
7528                     scaler += bs[j] * a[j][i];
7529                     }
7530                 }
7531             }
7532         else /* GTR-like doublet model */
7533             {
7534             scaler = 0.0;
7535             for (i=0; i<16; i++)
7536                 {
7537                 for (j=i+1; j<16; j++)
7538                     {
7539                     if (((doublet[i].first & doublet[j].first) == 0) && ((doublet[i].second & doublet[j].second) == 0))
7540                         mult = 0.0;
7541                     else
7542                         {
7543                         if ((doublet[i].first & doublet[j].first) == 0)
7544                             {
7545                             if ((doublet[i].first + doublet[j].first) == 3)
7546                                 mult = rateValues[0];
7547                             else if ((doublet[i].first + doublet[j].first) == 5)
7548                                 mult = rateValues[1];
7549                             else if ((doublet[i].first + doublet[j].first) == 9)
7550                                 mult = rateValues[2];
7551                             else if ((doublet[i].first + doublet[j].first) == 6)
7552                                 mult = rateValues[3];
7553                             else if ((doublet[i].first + doublet[j].first) == 10)
7554                                 mult = rateValues[4];
7555                             else
7556                                 mult = rateValues[5];
7557                             }
7558                         else
7559                             {
7560                             if ((doublet[i].second + doublet[j].second) == 3)
7561                                 mult = rateValues[0];
7562                             else if ((doublet[i].second + doublet[j].second) == 5)
7563                                 mult = rateValues[1];
7564                             else if ((doublet[i].second + doublet[j].second) == 9)
7565                                 mult = rateValues[2];
7566                             else if ((doublet[i].second + doublet[j].second) == 6)
7567                                 mult = rateValues[3];
7568                             else if ((doublet[i].second + doublet[j].second) == 10)
7569                                 mult = rateValues[4];
7570                             else
7571                                 mult = rateValues[5];
7572                             }
7573                         }
7574                     a[i][i] -= (a[i][j] = bs[j] * mult);
7575                     a[j][j] -= (a[j][i] = bs[i] * mult);
7576                     scaler += bs[i] * a[i][j];
7577                     scaler += bs[j] * a[j][i];
7578                     }
7579                 }
7580             }
7581
7582
7583         /* rescale Q matrix */
7584         scaler = 1.0 / scaler;
7585         for (i=0; i<16; i++)
7586             for (j=0; j<16; j++)
7587                 a[i][j] *= scaler;
7588         }
7589     else
7590         {
7591         /* 64(ish) X 64(ish) codon model:
7592
7593            Here, we set the rate matrix for the codon model (see Goldman and
7594            Yang, 1994). Note that we can specify any general type of codon
7595            model, with these constraints:
7596
7597             a[i][j] = 0                      -> if i and j differ at 2 or 3 nucleotides
7598             a[i][j] = rateValues[0] * bs[j]  -> if synonymous A <-> C change
7599             a[i][j] = rateValues[1] * bs[j]  -> if synonymous A <-> G change
7600             a[i][j] = rateValues[2] * bs[j]  -> if synonymous A <-> T change
7601             a[i][j] = rateValues[3] * bs[j]  -> if synonymous C <-> G change
7602             a[i][j] = rateValues[4] * bs[j]  -> if synonymous C <-> T change
7603             a[i][j] = rateValues[5] * bs[j]  -> if synonymous G <-> T change
7604
7605             a[i][j] = rateValues[0] * nonsyn * bs[j]  -> if nonsynonymous A <-> C change
7606             a[i][j] = rateValues[1] * nonsyn * bs[j]  -> if nonsynonymous A <-> G change
7607             a[i][j] = rateValues[2] * nonsyn * bs[j]  -> if nonsynonymous A <-> T change
7608             a[i][j] = rateValues[3] * nonsyn * bs[j]  -> if nonsynonymous C <-> G change
7609             a[i][j] = rateValues[4] * nonsyn * bs[j]  -> if nonsynonymous C <-> T change
7610             a[i][j] = rateValues[5] * nonsyn * bs[j]  -> if nonsynonymous G <-> T change
7611
7612           Other models, such as the one used by Nielsen & Yang (1998) can be obtained
7613           from this model by restricting transitions and transversions to have the same rate.
7614           nonsyn is the nonsynonymous/synonymous rate ratio (often called the
7615           dN/dS ratio). If we are in this part of the function, then we rely on it
7616           being called with the "rateMult" parameter specifying the dN/dS ratio. Note
7617           that the size of the matrix will never be 64 X 64 as we only consider changes
7618           among coding triplets (i.e., we exclude the stop codons). */
7619
7620         /* get the nonsynonymous/synonymous rate ratio */
7621         nonsyn = rateMult;
7622
7623         /* set diagonal of Q matrix to 0 */
7624         for (i=0; i<n; i++)
7625             a[i][i] = 0.0;
7626
7627         /* set dN and dS rates to zero */
7628         dN = dS = 0.0;
7629
7630         if (m->nst == 1) /* F81-like codon model */
7631             {
7632             scaler = 0.0;
7633             for (i=0; i<n; i++)
7634                 {
7635                 for (j=i+1; j<n; j++)
7636                     {
7637                     nDiff = 0;
7638                     for (k=0; k<3; k++)
7639                         {
7640                         if (mp->codonNucs[i][k] != mp->codonNucs[j][k])
7641                             nDiff++;
7642                         }
7643                     if (nDiff > 1)
7644                         {
7645                         mult = 0.0;
7646                         }
7647                     else
7648                         {
7649                         if (mp->codonAAs[i] == mp->codonAAs[j])
7650                             mult = 1.0;
7651                         else
7652                             mult = nonsyn;
7653                         }
7654
7655                     a[i][i] -= (a[i][j] = bs[j] * mult);
7656                     a[j][j] -= (a[j][i] = bs[i] * mult);
7657                     if (mp->codonAAs[i] == mp->codonAAs[j])
7658                         dS += (bs[i] * a[i][j] + bs[j] * a[j][i]);
7659                     else
7660                         dN += (bs[i] * a[i][j] + bs[j] * a[j][i]);
7661                     scaler += bs[i] * a[i][j];
7662                     scaler += bs[j] * a[j][i];
7663                     }
7664                 }
7665             }
7666         else if (m->nst == 2) /* HKY-like codon model */
7667             {
7668             scaler = 0.0;
7669             for (i=0; i<n; i++)
7670                 {
7671                 for (j=i+1; j<n; j++)
7672                     {
7673                     nDiff = 0;
7674                     for (k=0; k<3; k++)
7675                         {
7676                         if (mp->codonNucs[i][k] != mp->codonNucs[j][k])
7677                             {
7678                             nDiff++;
7679                             if ((mp->codonNucs[i][k] == 0 && mp->codonNucs[j][k] == 2) || (mp->codonNucs[i][k] == 2 && mp->codonNucs[j][k] == 0) ||
7680                                 (mp->codonNucs[i][k] == 1 && mp->codonNucs[j][k] == 3) || (mp->codonNucs[i][k] == 3 && mp->codonNucs[j][k] == 1))
7681                                 isTransition = YES;
7682                             else
7683                                 isTransition = NO;
7684                             }
7685                         }
7686                     if (nDiff > 1)
7687                         {
7688                         mult = 0.0;
7689                         }
7690                     else
7691                         {
7692                         if (mp->codonAAs[i] == mp->codonAAs[j])
7693                             mult = 1.0;
7694                         else
7695                             mult = nonsyn;
7696                         if (isTransition == YES)
7697                             mult *= rateValues[0];
7698                         }
7699
7700                     a[i][i] -= (a[i][j] = bs[j] * mult);
7701                     a[j][j] -= (a[j][i] = bs[i] * mult);
7702                     if (mp->codonAAs[i] == mp->codonAAs[j])
7703                         dS += (bs[i] * a[i][j] + bs[j] * a[j][i]);
7704                     else
7705                         dN += (bs[i] * a[i][j] + bs[j] * a[j][i]);
7706                     scaler += bs[i] * a[i][j];
7707                     scaler += bs[j] * a[j][i];
7708                     }
7709                 }
7710             }
7711         else /* GTR-like codon model */
7712             {
7713             scaler = 0.0;
7714             for (i=0; i<n; i++)
7715                 {
7716                 for (j=i+1; j<n; j++)
7717                     {
7718                     nDiff = 0;
7719                     for (k=0; k<3; k++)
7720                         {
7721                         if (mp->codonNucs[i][k] != mp->codonNucs[j][k])
7722                             {
7723                             nDiff++;
7724                             if ((mp->codonNucs[i][k] == 0 && mp->codonNucs[j][k] == 1) || (mp->codonNucs[i][k] == 1 && mp->codonNucs[j][k] == 0))
7725                                 rtNum = 0;
7726                             else if ((mp->codonNucs[i][k] == 0 && mp->codonNucs[j][k] == 2) || (mp->codonNucs[i][k] == 2 && mp->codonNucs[j][k] == 0))
7727                                 rtNum = 1;
7728                             else if ((mp->codonNucs[i][k] == 0 && mp->codonNucs[j][k] == 3) || (mp->codonNucs[i][k] == 3 && mp->codonNucs[j][k] == 0))
7729                                 rtNum = 2;
7730                             else if ((mp->codonNucs[i][k] == 1 && mp->codonNucs[j][k] == 2) || (mp->codonNucs[i][k] == 2 && mp->codonNucs[j][k] == 1))
7731                                 rtNum = 3;
7732                             else if ((mp->codonNucs[i][k] == 1 && mp->codonNucs[j][k] == 3) || (mp->codonNucs[i][k] == 3 && mp->codonNucs[j][k] == 1))
7733                                 rtNum = 4;
7734                             else
7735                                 rtNum = 5;
7736                             }
7737                         }
7738                     if (nDiff > 1)
7739                         {
7740                         mult = 0.0;
7741                         }
7742                     else
7743                         {
7744                         if (mp->codonAAs[i] == mp->codonAAs[j])
7745                             mult = 1.0;
7746                         else
7747                             mult = nonsyn;
7748                         if (rtNum == 0)
7749                             mult *= rateValues[0];
7750                         else if (rtNum == 1)
7751                             mult *= rateValues[1];
7752                         else if (rtNum == 2)
7753                             mult *= rateValues[2];
7754                         else if (rtNum == 3)
7755                             mult *= rateValues[3];
7756                         else if (rtNum == 4)
7757                             mult *= rateValues[4];
7758                         else
7759                             mult *= rateValues[5];
7760                         }
7761
7762                     a[i][i] -= (a[i][j] = bs[j] * mult);
7763                     a[j][j] -= (a[j][i] = bs[i] * mult);
7764                     if (mp->codonAAs[i] == mp->codonAAs[j])
7765                         dS += (bs[i] * a[i][j] + bs[j] * a[j][i]);
7766                     else
7767                         dN += (bs[i] * a[i][j] + bs[j] * a[j][i]);
7768                     scaler += bs[i] * a[i][j];
7769                     scaler += bs[j] * a[j][i];
7770                     }
7771                 }
7772             }
7773
7774         /* rescale Q matrix */
7775         if (m->nucModelId == NUCMODEL_CODON && m->numOmegaCats > 1)
7776             {
7777             /* If we have a positive selection model with multiple categories, then
7778                we do not rescale the rate matrix until we have finished generating
7779                all of the necessary rate matrices. The rescaling occurs in
7780                UpDateCijk. */
7781             (*rA) = dN;
7782             (*rS) = dS;
7783             }
7784         else
7785             {
7786             scaler = 1.0 / scaler;
7787             for (i=0; i<n; i++)
7788                 for (j=0; j<n; j++)
7789                     a[i][j] *= scaler;
7790             (*rA) = (*rS) = 1.0;
7791             }
7792         }
7793
7794 #   if 0
7795     for (i=0; i<n; i++)
7796         {
7797         for (j=0; j<n; j++)
7798             printf ("%0.5lf ", a[i][j]);
7799         printf ("\n");
7800         }
7801 #   endif
7802
7803 #   if defined (BEAGLE_ENABLED)
7804     if ((m->nst == 1 || m->nst == 2) && m->numModelStates == 4)
7805         free (rateValues);
7806 #   endif
7807
7808     return (NO_ERROR);
7809 }
7810
7811
7812 int SetProteinQMatrix (MrBFlt **a, int n, int whichChain, int division, MrBFlt rateMult)
7813 {
7814     register int    i, j, k;
7815     int             aaModelID;
7816     MrBFlt          scaler, probOn, sum, *swr, s01, s10, *bs, *rt;
7817     ModelInfo       *m;
7818
7819     /* set up pointers to the appropriate model information */
7820     m = &modelSettings[division];
7821
7822     /* get amino acid model ID
7823         AAMODEL_POISSON         0
7824         AAMODEL_JONES           1
7825         AAMODEL_DAY             2
7826         AAMODEL_MTREV           3
7827         AAMODEL_MTMAM           4
7828         AAMODEL_WAG             5
7829         AAMODEL_RTREV           6
7830         AAMODEL_CPREV           7
7831         AAMODEL_VT              8
7832         AAMODEL_BLOSUM          9
7833         AAMODEL_LG             10
7834         AAMODEL_EQ             11
7835         AAMODEL_GTR            12 */
7836
7837     if (m->aaModelId >= 0)
7838         aaModelID = m->aaModelId;
7839     else
7840         aaModelID = (int)*GetParamVals(m->aaModel, whichChain, state[whichChain]);
7841
7842     /* Make certain that we have either 20 or 40 states. Anything
7843        else means we have a real problem. */
7844     if (n != 20 && n != 40)
7845         {
7846         MrBayesPrint ("%s   ERROR: There should be 20 or 40 states for the aa model\n");
7847         return (ERROR);
7848         }
7849
7850     if (n == 20)
7851         {
7852         /* We have a run-of-the-mill amino acid model (i.e., 20 X 20). */
7853         if (aaModelID == AAMODEL_POISSON)
7854             {
7855             scaler = 1.0 / 19.0;
7856             for (i=0; i<20; i++)
7857                 {
7858                 for (j=i+1; j<20; j++)
7859                     {
7860                     a[i][j] = scaler;
7861                     a[j][i] = scaler;
7862                     }
7863                 }
7864             for (i=0; i<20; i++)
7865                 a[i][i] = -1.0;
7866             }
7867         else if (aaModelID == AAMODEL_EQ)
7868             {
7869             bs = GetParamSubVals (m->stateFreq, whichChain, state[whichChain]);
7870             for (i=0; i<20; i++)
7871                 for (j=0; j<20; j++)
7872                     a[i][j] = 0.0;
7873             scaler = 0.0;
7874             for (i=0; i<20; i++)
7875                 {
7876                 for (j=i+1; j<20; j++)
7877                     {
7878                     a[i][i] -= (a[i][j] = bs[j]);
7879                     a[j][j] -= (a[j][i] = bs[i]);
7880                     scaler += bs[i] * a[i][j];
7881                     scaler += bs[j] * a[j][i];
7882                     }
7883                 }
7884             scaler = 1.0 / scaler;
7885             for (i=0; i<20; i++)
7886                 for (j=0; j<20; j++)
7887                     a[i][j] *= scaler;
7888             }
7889         else if (aaModelID == AAMODEL_GTR)
7890             {
7891             bs = GetParamSubVals (m->stateFreq, whichChain, state[whichChain]);
7892             rt = GetParamVals (m->revMat, whichChain, state[whichChain]);
7893             for (i=0; i<20; i++)
7894                 for (j=0; j<20; j++)
7895                     a[i][j] = 0.0;
7896             scaler = 0.0;
7897             for (i=k=0; i<20; i++)
7898                 {
7899                 for (j=i+1; j<20; j++)
7900                     {
7901                     a[i][i] -= (a[i][j] = bs[j] * rt[k]);
7902                     a[j][j] -= (a[j][i] = bs[i] * rt[k]);
7903                     k++;
7904                     }
7905                 }
7906             for (i=0; i<20; i++)
7907                 scaler += -(bs[i] * a[i][i]);
7908             for (i=0; i<20; i++)
7909                 for (j=0; j<20; j++)
7910                     a[i][j] /= scaler;
7911             }
7912         else if (aaModelID == AAMODEL_JONES)
7913             {
7914             for (i=0; i<20; i++)
7915                 for (j=0; j<20; j++)
7916                     a[i][j] = aaJones[i][j];
7917             }
7918         else if (aaModelID == AAMODEL_DAY)
7919             {
7920             for (i=0; i<20; i++)
7921                 for (j=0; j<20; j++)
7922                     a[i][j] = aaDayhoff[i][j];
7923             }
7924         else if (aaModelID == AAMODEL_MTREV)
7925             {
7926             for (i=0; i<20; i++)
7927                 for (j=0; j<20; j++)
7928                     a[i][j] = aaMtrev24[i][j];
7929             }
7930         else if (aaModelID == AAMODEL_MTMAM)
7931             {
7932             for (i=0; i<20; i++)
7933                 for (j=0; j<20; j++)
7934                     a[i][j] = aaMtmam[i][j];
7935             }
7936         else if (aaModelID == AAMODEL_RTREV)
7937             {
7938             for (i=0; i<20; i++)
7939                 for (j=0; j<20; j++)
7940                     a[i][j] = aartREV[i][j];
7941             }
7942         else if (aaModelID == AAMODEL_WAG)
7943             {
7944             for (i=0; i<20; i++)
7945                 for (j=0; j<20; j++)
7946                     a[i][j] = aaWAG[i][j];
7947             }
7948         else if (aaModelID == AAMODEL_CPREV)
7949             {
7950             for (i=0; i<20; i++)
7951                 for (j=0; j<20; j++)
7952                     a[i][j] = aacpREV[i][j];
7953             }
7954         else if (aaModelID == AAMODEL_VT)
7955             {
7956             for (i=0; i<20; i++)
7957                 for (j=0; j<20; j++)
7958                     a[i][j] = aaVt[i][j];
7959             }
7960         else if (aaModelID == AAMODEL_BLOSUM)
7961             {
7962             for (i=0; i<20; i++)
7963                 for (j=0; j<20; j++)
7964                     a[i][j] = aaBlosum[i][j];
7965             }
7966         else if (aaModelID == AAMODEL_LG)
7967             {
7968             for (i=0; i<20; i++)
7969                 for (j=0; j<20; j++)
7970                     a[i][j] = aaLG[i][j];
7971             }
7972         else
7973             {
7974             MrBayesPrint ("%s   ERROR: Don't understand which amino acid model is needed\n");
7975             return (ERROR);
7976             }
7977 #       if 0
7978         for (i=0; i<20; i++)
7979             {
7980             for (j=0; j<20; j++)
7981                 printf ("%1.3lf ", a[i][j]);
7982             printf ("\n");
7983             }
7984 #       endif
7985         }
7986     else
7987         {
7988         /* 40 X 40 covarion model:
7989
7990            We have a covarion model, and must set up the other quadrants. Note that if
7991            we are at this point in the code, that we have already set up the upper left
7992            portion of the 40 X 40 rate matrix. Note that if we have rate
7993            variation across sites, that we need to deal with the multiplication
7994            in the rate matrix (i.e., we cannot simply deal with rate variation
7995            by multiplying the branch length by a rate multiplier as we can
7996            with other models). Instead, we multiply the scaled rate matrix
7997            by the rate multiplier. */
7998
7999         /* Get the switching rates. The rate of off->on is s01 and the rate
8000            of on->off is s10. The stationary probability of the switch process
8001            is prob1 = s01/(s01+s10) and prob0 = s10/(s01+s10). */
8002         swr = GetParamVals (m->switchRates, whichChain, state[whichChain]);
8003         s01 = swr[0];
8004         s10 = swr[1];
8005         probOn = s01 / (s01 + s10);
8006
8007         /* set matrix a[][] to 0 */
8008         for (i=0; i<40; i++)
8009             for (j=0; j<40; j++)
8010                 a[i][j] = 0.0;
8011
8012         /* fill in upper-left sub matrix (where substitutions occur) */
8013         if (aaModelID == AAMODEL_POISSON)
8014             {
8015             scaler = 0.0;
8016             for (i=0; i<20; i++)
8017                 {
8018                 for (j=i+1; j<20; j++)
8019                     {
8020                     a[i][j] = 0.05;
8021                     a[j][i] = 0.05;
8022                     scaler += 0.05 * a[i][j] * probOn;
8023                     scaler += 0.05 * a[j][i] * probOn;
8024                     }
8025                 }
8026             }
8027         else if (aaModelID == AAMODEL_EQ)
8028             {
8029             bs = GetParamSubVals (m->stateFreq, whichChain, state[whichChain]);
8030             scaler = 0.0;
8031             for (i=0; i<20; i++)
8032                 {
8033                 for (j=i+1; j<20; j++)
8034                     {
8035                     a[i][j] = bs[j];
8036                     a[j][i] = bs[i];
8037                     scaler += bs[i] * a[i][j] * probOn;
8038                     scaler += bs[j] * a[j][i] * probOn;
8039                     }
8040                 }
8041             }
8042         else if (aaModelID == AAMODEL_GTR)
8043             {
8044             bs = GetParamSubVals (m->stateFreq, whichChain, state[whichChain]);
8045             rt = GetParamVals (m->revMat, whichChain, state[whichChain]);
8046             for (i=0; i<20; i++)
8047                 for (j=0; j<20; j++)
8048                     a[i][j] = 0.0;
8049             scaler = 0.0;
8050             for (i=k=0; i<20; i++)
8051                 {
8052                 for (j=i+1; j<20; j++)
8053                     {
8054                     a[i][i] -= (a[i][j] = bs[j] * rt[k]);
8055                     a[j][j] -= (a[j][i] = bs[i] * rt[k]);
8056                     k++;
8057                     }
8058                 }
8059             for (i=0; i<20; i++)
8060                 scaler += -(bs[i] * a[i][i]);
8061             for (i=0; i<20; i++)
8062                 for (j=0; j<20; j++)
8063                     a[i][j] /= scaler;
8064             for (i=0; i<20; i++)
8065                 {
8066                 for (j=i+1; j<20; j++)
8067                     {
8068                     a[i][j] = bs[j];
8069                     a[j][i] = bs[i];
8070                     scaler += bs[i] * a[i][j] * probOn;
8071                     scaler += bs[j] * a[j][i] * probOn;
8072                     }
8073                 }
8074             }
8075         else if (aaModelID == AAMODEL_JONES)
8076             {
8077             scaler = 0.0;
8078             for (i=0; i<20; i++)
8079                 {
8080                 for (j=i+1; j<20; j++)
8081                     {
8082                     a[i][j] = aaJones[i][j];
8083                     a[j][i] = aaJones[j][i];
8084                     scaler += jonesPi[i] * a[i][j] * probOn;
8085                     scaler += jonesPi[j] * a[j][i] * probOn;
8086                     }
8087                 }
8088             }
8089         else if (aaModelID == AAMODEL_DAY)
8090             {
8091             scaler = 0.0;
8092             for (i=0; i<20; i++)
8093                 {
8094                 for (j=i+1; j<20; j++)
8095                     {
8096                     a[i][j] = aaDayhoff[i][j];
8097                     a[j][i] = aaDayhoff[j][i];
8098                     scaler += dayhoffPi[i] * a[i][j] * probOn;
8099                     scaler += dayhoffPi[j] * a[j][i] * probOn;
8100                     }
8101                 }
8102             }
8103         else if (aaModelID == AAMODEL_MTREV)
8104             {
8105             scaler = 0.0;
8106             for (i=0; i<20; i++)
8107                 {
8108                 for (j=i+1; j<20; j++)
8109                     {
8110                     a[i][j] = aaMtrev24[i][j];
8111                     a[j][i] = aaMtrev24[j][i];
8112                     scaler += mtrev24Pi[i] * a[i][j] * probOn;
8113                     scaler += mtrev24Pi[j] * a[j][i] * probOn;
8114                     }
8115                 }
8116             }
8117         else if (aaModelID == AAMODEL_MTMAM)
8118             {
8119             scaler = 0.0;
8120             for (i=0; i<20; i++)
8121                 {
8122                 for (j=i+1; j<20; j++)
8123                     {
8124                     a[i][j] = aaMtmam[i][j];
8125                     a[j][i] = aaMtmam[j][i];
8126                     scaler += mtmamPi[i] * a[i][j] * probOn;
8127                     scaler += mtmamPi[j] * a[j][i] * probOn;
8128                     }
8129                 }
8130             }
8131         else if (aaModelID == AAMODEL_RTREV)
8132             {
8133             scaler = 0.0;
8134             for (i=0; i<20; i++)
8135                 {
8136                 for (j=i+1; j<20; j++)
8137                     {
8138                     a[i][j] = aartREV[i][j];
8139                     a[j][i] = aartREV[j][i];
8140                     scaler += rtrevPi[i] * a[i][j] * probOn;
8141                     scaler += rtrevPi[j] * a[j][i] * probOn;
8142                     }
8143                 }
8144             }
8145         else if (aaModelID == AAMODEL_WAG)
8146             {
8147             scaler = 0.0;
8148             for (i=0; i<20; i++)
8149                 {
8150                 for (j=i+1; j<20; j++)
8151                     {
8152                     a[i][j] = aaWAG[i][j];
8153                     a[j][i] = aaWAG[j][i];
8154                     scaler += wagPi[i] * a[i][j] * probOn;
8155                     scaler += wagPi[j] * a[j][i] * probOn;
8156                     }
8157                 }
8158             }
8159         else if (aaModelID == AAMODEL_CPREV)
8160             {
8161             scaler = 0.0;
8162             for (i=0; i<20; i++)
8163                 {
8164                 for (j=i+1; j<20; j++)
8165                     {
8166                     a[i][j] = aacpREV[i][j];
8167                     a[j][i] = aacpREV[j][i];
8168                     scaler += cprevPi[i] * a[i][j] * probOn;
8169                     scaler += cprevPi[j] * a[j][i] * probOn;
8170                     }
8171                 }
8172             }
8173         else if (aaModelID == AAMODEL_VT)
8174             {
8175             scaler = 0.0;
8176             for (i=0; i<20; i++)
8177                 {
8178                 for (j=i+1; j<20; j++)
8179                     {
8180                     a[i][j] = aaVt[i][j];
8181                     a[j][i] = aaVt[j][i];
8182                     scaler += vtPi[i] * a[i][j] * probOn;
8183                     scaler += vtPi[j] * a[j][i] * probOn;
8184                     }
8185                 }
8186             }
8187         else if (aaModelID == AAMODEL_BLOSUM)
8188             {
8189             scaler = 0.0;
8190             for (i=0; i<20; i++)
8191                 {
8192                 for (j=i+1; j<20; j++)
8193                     {
8194                     a[i][j] = aaBlosum[i][j];
8195                     a[j][i] = aaBlosum[j][i];
8196                     scaler += blosPi[i] * a[i][j] * probOn;
8197                     scaler += blosPi[j] * a[j][i] * probOn;
8198                     }
8199                 }
8200             }
8201         else if (aaModelID == AAMODEL_LG)
8202             {
8203             scaler = 0.0;
8204             for (i=0; i<20; i++)
8205                 {
8206                 for (j=i+1; j<20; j++)
8207                     {
8208                     a[i][j] = aaLG[i][j];
8209                     a[j][i] = aaLG[j][i];
8210                     scaler += lgPi[i] * a[i][j] * probOn;
8211                     scaler += lgPi[j] * a[j][i] * probOn;
8212                     }
8213                 }
8214             }
8215         else
8216             {
8217             MrBayesPrint ("%s   ERROR: Don't understand which amino acid model is needed\n");
8218             return (ERROR);
8219             }
8220
8221         /* rescale off diagonal elements of Q matrix */
8222         scaler = 1.0 / scaler;
8223         for (i=0; i<20; i++)
8224             {
8225             for (j=0; j<20; j++)
8226                 {
8227                 if (i != j)
8228                     a[i][j] *= scaler;
8229                 }
8230             }
8231
8232         /* now, scale by rate factor */
8233         for (i=0; i<20; i++)
8234             {
8235             for (j=0; j<20; j++)
8236                 {
8237                 if (i != j)
8238                     a[i][j] *= rateMult;
8239                 }
8240             }
8241
8242         /* put in diagonal elements */
8243         for (i=0; i<20; i++)
8244             {
8245             sum = 0.0;
8246             for (j=0; j<20; j++)
8247                 {
8248                 if (i != j)
8249                     sum += a[i][j];
8250                 a[i][i] = -(sum + s10);
8251                 }
8252             }
8253
8254         /* fill in the other three submatrices */
8255         for (i=20; i<40; i++)
8256             a[i][i] = -s01;
8257         for (i=0; i<20; i++)
8258             {
8259             a[i][20+i] = s10;
8260             a[20+i][i] = s01;
8261             }
8262
8263         }
8264
8265     return (NO_ERROR);
8266 }
8267
8268
8269 int SetStdQMatrix (MrBFlt **a, int nStates, MrBFlt *bs, int cType)
8270 {
8271     register int    i, j;
8272     MrBFlt          scaler;
8273
8274     /* This function sets up ordered or unordered models for standard characters
8275        with unequal stationary state frequencies. It requires the stationary
8276        frequencies of the states (passed when calling the function). It also
8277        needs to know the number of states and the type (ordered or unordered)
8278        of the character. */
8279
8280     /* set Q matrix to 0 */
8281     for (i=0; i<nStates; i++)
8282         for (j=0; j<nStates; j++)
8283             a[i][j] = 0.0;
8284
8285     /* initialize Q matrix */
8286     scaler = 0.0;
8287     if (cType == UNORD)
8288         {
8289         /* unordered characters */
8290         for (i=0; i<nStates; i++)
8291             {
8292             for (j=0; j<nStates; j++)
8293                 {
8294                 if (i != j)
8295                     {
8296                     a[i][i] -= (a[i][j] = bs[j]);
8297                     scaler += bs[i] * a[i][j];
8298                     }
8299                 }
8300             }
8301         }
8302     else
8303         {
8304         /* ordered characters */
8305         for (i=0; i<nStates; i++)
8306             {
8307             for (j=0; j<nStates; j++)
8308                 {
8309                 if (abs(i - j) == 1)
8310                     {
8311                     a[i][i] -= (a[i][j] = bs[j]);
8312                     scaler += bs[i] * a[i][j];
8313                     }
8314                 }
8315             }
8316         }
8317
8318     /* rescale Q matrix */
8319     for (i=0; i<nStates; i++)
8320         for (j=0; j<nStates; j++)
8321             a[i][j] /= scaler;
8322
8323 #   if defined DEBUG_SETSTDQMATRIX
8324     for (i=0; i<nStates; i++)
8325         {
8326         for (j=0; j<nStates; j++)
8327             printf ("%0.5lf ", a[i][j]);
8328         printf ("\n");
8329         }
8330 #   endif
8331
8332     return (NO_ERROR);
8333 }
8334
8335
8336 int TiProbs_Fels (TreeNode *p, int division, int chain)
8337 {
8338     int         i, j, k, index;
8339     MrBFlt      t, u, x, z, beta, bigPi_j[4], pij, bigPij,
8340                 *catRate, baseRate, theRate, *pis, length;
8341     CLFlt       *tiP;
8342     ModelInfo   *m;
8343
8344     m = &modelSettings[division];
8345
8346     /* find transition probabilities */
8347     tiP = m->tiProbs[m->tiProbsIndex[chain][p->index]];
8348
8349     /* get base frequencies */
8350     pis = GetParamSubVals (m->stateFreq, chain, state[chain]);
8351
8352     /* get rate multipliers (for gamma & partition specific rates) */
8353     theRate = 1.0;
8354     baseRate = GetRate (division, chain);
8355     /* compensate for invariable sites if appropriate */
8356     if (m->pInvar != NULL)
8357         baseRate /= (1.0 - (*GetParamVals(m->pInvar, chain, state[chain])));
8358     /* get category rates */
8359     if (m->shape == NULL)
8360         catRate = &theRate;
8361     else
8362         catRate = GetParamSubVals (m->shape, chain, state[chain]);
8363
8364     /* rescale beta */
8365     beta =  (0.5 / ((pis[0] + pis[2])*(pis[1] + pis[3]) + ((pis[0]*pis[2]) + (pis[1]*pis[3]))));
8366
8367     bigPi_j[0] =  (pis[0] + pis[2]);
8368     bigPi_j[1] =  (pis[1] + pis[3]);
8369     bigPi_j[2] =  (pis[0] + pis[2]);
8370     bigPi_j[3] =  (pis[1] + pis[3]);
8371
8372     /* find length */
8373     if (m->cppEvents != NULL)
8374         {
8375         length = GetParamSubVals (m->cppEvents, chain, state[chain])[p->index];
8376         }
8377     else if (m->tk02BranchRates != NULL)
8378         {
8379         length = GetParamSubVals (m->tk02BranchRates, chain, state[chain])[p->index];
8380         }
8381     else if (m->igrBranchRates != NULL)
8382         {
8383         length = GetParamSubVals (m->igrBranchRates, chain, state[chain])[p->index];
8384         }
8385     else if (m->mixedBrchRates != NULL)
8386         {
8387         length = GetParamSubVals (m->mixedBrchRates, chain, state[chain])[p->index];
8388         }
8389     else
8390         length = p->length;
8391
8392     /* numerical errors will ensue if we allow very large or very small branch lengths,
8393        which might occur in relaxed clock models */
8394
8395     /* fill in values */
8396     for (k=index=0; k<m->numGammaCats; k++)
8397         {
8398         t =  length * baseRate * catRate[k];
8399
8400         if (t < TIME_MIN)
8401             {
8402             /* Fill in identity matrix */
8403             for (i=0; i<4; i++)
8404                 {
8405                 for (j=0; j<4; j++)
8406                     {
8407                     if (i == j)
8408                         tiP[index++] = 1.0;
8409                     else
8410                         tiP[index++] = 0.0;
8411                     }
8412                 }
8413             }
8414         else if (t > TIME_MAX)
8415             {
8416             /* Fill in stationary matrix */
8417             for (i=0; i<4; i++)
8418                 for (j=0; j<4; j++)
8419                     tiP[index++] = (CLFlt) pis[j];
8420             }
8421         else
8422             {
8423             /* calculate probabilities */
8424             for (i=0; i<4; i++)
8425                 {
8426                 for (j=0; j<4; j++)
8427                     {
8428                     bigPij = bigPi_j[j];
8429                     pij =  pis[j];
8430                     u =  1.0/bigPij -  1.0;
8431                     x =  exp(-beta * t);
8432                     z = (bigPij - pij) / bigPij;
8433
8434                     if (i == j)
8435                         tiP[index++] = (CLFlt) (pij + pij * u * x + z * x);
8436                     else
8437                         tiP[index++] = (CLFlt) (pij + pij * u * x - (pij/bigPij) * x);
8438                     }
8439                 }
8440             }
8441         }
8442
8443     return NO_ERROR;
8444 }
8445
8446
8447 /*----------------------------------------------------------------
8448 |
8449 |   TiProbs_Gen: Calculates transition probabilities for general
8450 |       models with or without rate variation. This function does
8451 |       not work with:
8452 |
8453 |       1. codon models with omega variation or
8454 |       2. covarion models with rate variation
8455 |
8456 |   In either of these cases, TiProbs_GenCov is used
8457 |
8458 -----------------------------------------------------------------*/
8459 int TiProbs_Gen (TreeNode *p, int division, int chain)
8460 {
8461     register int    i, j, k, n, s, index;
8462     MrBFlt          t, *catRate, baseRate, *eigenValues, *cijk, *bs,
8463                     EigValexp[64], sum, *ptr, theRate, correctionFactor,
8464                     length;
8465     CLFlt           *tiP;
8466     ModelInfo       *m;
8467
8468     m = &modelSettings[division];
8469     n = m->numModelStates;
8470
8471     /* find the correction factor to make branch lengths
8472        in terms of expected number of substitutions per character */
8473     correctionFactor = 1.0;
8474     if (m->dataType == DNA || m->dataType == RNA)
8475         {
8476         if (m->nucModelId == NUCMODEL_DOUBLET)
8477             correctionFactor = 2.0;
8478         else if (m->nucModelId == NUCMODEL_CODON)
8479             correctionFactor = 3.0;
8480         }
8481
8482     /* find transition probabilities */
8483     tiP = m->tiProbs[m->tiProbsIndex[chain][p->index]];
8484
8485     /* get rate multipliers (for gamma & partition specific rates) */
8486     theRate = 1.0;
8487     baseRate = GetRate (division, chain);
8488
8489     /* compensate for invariable sites if appropriate */
8490     if (m->pInvar != NULL)
8491         baseRate /= (1.0 - (*GetParamVals(m->pInvar, chain, state[chain])));
8492
8493     /* get category rates */
8494     if (m->shape == NULL)
8495         catRate = &theRate;
8496     else
8497         catRate = GetParamSubVals (m->shape, chain, state[chain]);
8498
8499     /* get eigenvalues and cijk pointers */
8500     eigenValues = m->cijks[m->cijkIndex[chain]];
8501     cijk        = eigenValues + (2 * n);
8502
8503     /* find length */
8504     if (m->cppEvents != NULL)
8505         {
8506         length = GetParamSubVals (m->cppEvents, chain, state[chain])[p->index];
8507         }
8508     else if (m->tk02BranchRates != NULL)
8509         {
8510         length = GetParamSubVals (m->tk02BranchRates, chain, state[chain])[p->index];
8511         }
8512     else if (m->igrBranchRates != NULL)
8513         {
8514         length = GetParamSubVals (m->igrBranchRates, chain, state[chain])[p->index];
8515         }
8516     else if (m->mixedBrchRates != NULL)
8517         {
8518         length = GetParamSubVals (m->mixedBrchRates, chain, state[chain])[p->index];
8519         }
8520     else
8521         length = p->length;
8522
8523     /* fill in values */
8524     for (k=index=0; k<m->numGammaCats; k++)
8525         {
8526         t =  length * baseRate * catRate[k] * correctionFactor;
8527
8528         if (t < TIME_MIN)
8529             {
8530             /* Fill in identity matrix */
8531             for (i=0; i<n; i++)
8532                 {
8533                 for (j=0; j<n; j++)
8534                     {
8535                     if (i == j)
8536                         tiP[index++] = 1.0;
8537                     else
8538                         tiP[index++] = 0.0;
8539                     }
8540                 }
8541             }
8542         else if (t > TIME_MAX)
8543             {
8544             /* Get base freq */
8545             bs = GetParamSubVals(m->stateFreq, chain, state[chain]);
8546             /* Fill in stationary matrix */
8547             for (i=0; i<n; i++)
8548                 for (j=0; j<n; j++)
8549                     tiP[index++] = (CLFlt) bs[j];
8550             }
8551         else
8552             {
8553             /* We actually need to do some work... */
8554             for (s=0; s<n; s++)
8555                 EigValexp[s] =  exp(eigenValues[s] * t);
8556
8557             ptr = cijk;
8558             for (i=0; i<n; i++)
8559                 {
8560                 for (j=0; j<n; j++)
8561                     {
8562                     sum = 0.0;
8563                     for (s=0; s<n; s++)
8564                         sum += (*ptr++) * EigValexp[s];
8565                     tiP[index++] = (CLFlt) ((sum < 0.0) ? 0.0 : sum);
8566                     }
8567                 }
8568             }
8569         }
8570
8571 #   if 0
8572     printf ("v = %lf (%d)\n", t, p->index);
8573     for (i=index=0; i<n; i++)
8574         {
8575         for (j=0; j<n; j++)
8576             printf ("%1.4lf ", tiP[index++]);
8577         printf ("\n");
8578         }
8579     printf ("\n");
8580 #   endif
8581
8582     return NO_ERROR;
8583 }
8584
8585
8586 /*----------------------------------------------------------------
8587 |
8588 |   TiProbs_GenCov: Calculates transition probabilities for codon
8589 |       models with omega variation or covarion models with
8590 |       rate variation.
8591 |
8592 -----------------------------------------------------------------*/
8593 int TiProbs_GenCov (TreeNode *p, int division, int chain)
8594 {
8595     register int    i, j, k, n, s, index;
8596     int             sizeOfSingleCijk;
8597     MrBFlt          t, *eigenValues, *cijk, EigValexp[64], sum, *ptr, correctionFactor,
8598                     length, *bs;
8599     CLFlt           *tiP;
8600     ModelInfo       *m;
8601
8602     m = &modelSettings[division];
8603     n = m->numModelStates;
8604
8605     /* find the correction factor to make branch lengths
8606        in terms of expected number of substitutions per character */
8607     correctionFactor = 1.0;
8608     if (m->dataType == DNA || m->dataType == RNA)
8609         {
8610         if (m->nucModelId == NUCMODEL_DOUBLET)
8611             correctionFactor = 2.0;
8612         else if (m->nucModelId == NUCMODEL_CODON)
8613             correctionFactor = 3.0;
8614         }
8615
8616     /* find transition probabilities */
8617     tiP = m->tiProbs[m->tiProbsIndex[chain][p->index]];
8618
8619     /* get eigenvalues and cijk pointers */
8620     eigenValues = m->cijks[m->cijkIndex[chain]];
8621     cijk        = eigenValues + (2 * n);
8622
8623     /* get offset size (we need to move the pointers to the appropriate
8624        cijk information for these models) */
8625     sizeOfSingleCijk = m->cijkLength / m->nCijkParts;
8626
8627     /* find length */
8628     if (m->cppEvents != NULL)
8629         {
8630         length = GetParamSubVals (m->cppEvents, chain, state[chain])[p->index];
8631         }
8632     else if (m->tk02BranchRates != NULL)
8633         {
8634         length = GetParamSubVals (m->tk02BranchRates, chain, state[chain])[p->index];
8635         }
8636     else if (m->igrBranchRates != NULL)
8637         {
8638         length = GetParamSubVals (m->igrBranchRates, chain, state[chain])[p->index];
8639         }
8640     else if (m->mixedBrchRates != NULL)
8641         {
8642         length = GetParamSubVals (m->mixedBrchRates, chain, state[chain])[p->index];
8643         }
8644     else
8645         length = p->length;
8646
8647     /* numerical errors will ensue if we allow very large or very small branch lengths,
8648        which might occur in relaxed clock models */
8649
8650     /* fill in values */
8651     for (k=index=0; k<m->nCijkParts; k++)
8652         {
8653         t =  length * correctionFactor;
8654
8655         if (t < TIME_MIN)
8656             {
8657             /* Fill in identity matrix */
8658             for (i=0; i<n; i++)
8659                 {
8660                 for (j=0; j<n; j++)
8661                     {
8662                     if (i == j)
8663                         tiP[index++] = 1.0;
8664                     else
8665                         tiP[index++] = 0.0;
8666                     }
8667                 }
8668             }
8669         else if (t > TIME_MAX)
8670             {
8671             /* Get base freq */
8672             bs = GetParamSubVals(m->stateFreq, chain, state[chain]);
8673             /* Fill in stationary matrix */
8674             for (i=0; i<n; i++)
8675                 for (j=0; j<n; j++)
8676                     tiP[index++] = (CLFlt) bs[j];
8677             }
8678         else
8679             {
8680             /* We actually need to do some work... */
8681             for (s=0; s<n; s++)
8682                 EigValexp[s] =  exp(eigenValues[s] * t);
8683
8684             ptr = cijk;
8685             for (i=0; i<n; i++)
8686                 {
8687                 for (j=0; j<n; j++)
8688                     {
8689                     sum = 0.0;
8690                     for (s=0; s<n; s++)
8691                         sum += (*ptr++) * EigValexp[s];
8692                     tiP[index++] = (CLFlt) ((sum < 0.0) ? 0.0 : sum);
8693                     }
8694                 }
8695
8696             /* increment pointers by m->cijkLength */
8697             if (k+1 < m->nCijkParts)
8698                 {
8699                 /* shift pointers */
8700                 eigenValues += sizeOfSingleCijk;
8701                 cijk        += sizeOfSingleCijk;
8702                 }
8703             }
8704         }
8705
8706 #   if 0
8707     for (i=index=0; i<n; i++)
8708         {
8709         for (j=0; j<n; j++)
8710             printf ("%1.4lf ", tiP[index++]);
8711         printf ("\n");
8712         }
8713 #   endif
8714
8715     return NO_ERROR;
8716 }
8717
8718
8719 /*-----------------------------------------------------------------
8720 |
8721 |   TiProbs_Hky: update transition probabilities for 4by4
8722 |       nucleotide model with nst == 2 (K80/HKY85)
8723 |       with or without rate variation
8724 |
8725 ------------------------------------------------------------------*/
8726 int TiProbs_Hky (TreeNode *p, int division, int chain)
8727 {
8728     int         i, j, k, index;
8729     MrBFlt      t, kap, u, w, x, y, z, beta, bigPi_j[4], pij, bigPij, *pis,
8730                 *catRate, baseRate, theRate, length;
8731     CLFlt       *tiP;
8732     ModelInfo   *m;
8733
8734     m = &modelSettings[division];
8735
8736     /* find transition probabilities */
8737     tiP = m->tiProbs[m->tiProbsIndex[chain][p->index]];
8738
8739     /* get kappa */
8740     kap =  *GetParamVals (m->tRatio, chain, state[chain]);
8741
8742     /* get base frequencies */
8743     pis = GetParamSubVals (m->stateFreq, chain, state[chain]);
8744
8745     /* get rate multipliers (for gamma & partition specific rates) */
8746     theRate = 1.0;
8747     baseRate = GetRate (division, chain);
8748     /* compensate for invariable sites if appropriate */
8749     if (m->pInvar != NULL)
8750         baseRate /= (1.0 - (*GetParamVals(m->pInvar, chain, state[chain])));
8751     /* get category rates */
8752     if (m->shape == NULL)
8753         catRate = &theRate;
8754     else
8755         catRate = GetParamSubVals (m->shape, chain, state[chain]);
8756
8757     /* rescale beta */
8758     beta =  0.5 / ((pis[0] + pis[2])*(pis[1] + pis[3]) + kap*((pis[0]*pis[2]) + (pis[1]*pis[3])));
8759
8760     bigPi_j[0] = pis[0] + pis[2];
8761     bigPi_j[1] = pis[1] + pis[3];
8762     bigPi_j[2] = pis[0] + pis[2];
8763     bigPi_j[3] = pis[1] + pis[3];
8764
8765     /* find length */
8766     if (m->cppEvents != NULL)
8767         {
8768         length = GetParamSubVals (m->cppEvents, chain, state[chain])[p->index];
8769         }
8770     else if (m->tk02BranchRates != NULL)
8771         {
8772         length = GetParamSubVals (m->tk02BranchRates, chain, state[chain])[p->index];
8773         }
8774     else if (m->igrBranchRates != NULL)
8775         {
8776         length = GetParamSubVals (m->igrBranchRates, chain, state[chain])[p->index];
8777         }
8778     else if (m->mixedBrchRates != NULL)
8779         {
8780         length = GetParamSubVals (m->mixedBrchRates, chain, state[chain])[p->index];
8781         }
8782     else
8783         length = p->length;
8784
8785     /* numerical errors will ensue if we allow very large or very small branch lengths,
8786        which might occur in relaxed clock models */
8787
8788     /* fill in values */
8789     for (k=index=0; k<m->numGammaCats; k++)
8790         {
8791         t =  length * baseRate * catRate[k];
8792
8793         if (t < TIME_MIN)
8794             {
8795             /* Fill in identity matrix */
8796             for (i=0; i<4; i++)
8797                 {
8798                 for (j=0; j<4; j++)
8799                     {
8800                     if (i == j)
8801                         tiP[index++] = 1.0;
8802                     else
8803                         tiP[index++] = 0.0;
8804                     }
8805                 }
8806             }
8807         else if (t > TIME_MAX)
8808             {
8809             /* Fill in stationary matrix */
8810             for (i=0; i<4; i++)
8811                 for (j=0; j<4; j++)
8812                     tiP[index++] = (CLFlt) pis[j];
8813             }
8814         else
8815             {
8816             /* calculate probabilities */
8817             for (i=0; i<4; i++)
8818                 {
8819                 for (j=0; j<4; j++)
8820                     {
8821                     bigPij = bigPi_j[j];
8822                     pij = pis[j];
8823                     u =  1.0/bigPij -  1.0;
8824                     w = -beta * (1.0 + bigPij * (kap -  1.0));
8825                     x =  exp(-beta * t);
8826                     y =  exp(w * t);
8827                     z = (bigPij - pij) / bigPij;
8828
8829                     if (i == j)
8830                         tiP[index++] = (CLFlt) (pij + pij * u * x + z * y);
8831                     else if ((i == 0 && j == 2) || (i == 2 && j == 0) || (i == 1 && j == 3) || (i == 3 && j == 1))
8832                         tiP[index++] = (CLFlt) (pij + pij * u * x - (pij/bigPij) * y);
8833                     else
8834                         tiP[index++] = (CLFlt) (pij * (1.0 - x));
8835                     }
8836                 }
8837             }
8838         }
8839
8840     return NO_ERROR;
8841 }
8842
8843
8844 /*-----------------------------------------------------------------
8845 |
8846 |   TiProbs_JukesCantor: update transition probabilities for 4by4
8847 |       nucleotide model with nst == 1 (Jukes-Cantor)
8848 |       with or without rate variation
8849 |
8850 ------------------------------------------------------------------*/
8851 int TiProbs_JukesCantor (TreeNode *p, int division, int chain)
8852 {
8853     /* calculate Jukes Cantor transition probabilities */
8854
8855     int         i, j, k, index;
8856     MrBFlt      t, *catRate, baseRate, length;
8857     CLFlt       pNoChange, pChange, *tiP;
8858     ModelInfo   *m;
8859
8860     m = &modelSettings[division];
8861
8862     /* find transition probabilities */
8863     tiP = m->tiProbs[m->tiProbsIndex[chain][p->index]];
8864
8865     baseRate =  1.0;
8866     if (m->shape == NULL)
8867         catRate = &baseRate;
8868     else
8869         catRate = GetParamSubVals (m->shape, chain, state[chain]);
8870
8871     /* find length */
8872     if (m->cppEvents != NULL)
8873         {
8874         length = GetParamSubVals (m->cppEvents, chain, state[chain])[p->index];
8875         }
8876     else if (m->tk02BranchRates != NULL)
8877         {
8878         length = GetParamSubVals (m->tk02BranchRates, chain, state[chain])[p->index];
8879         }
8880     else if (m->igrBranchRates != NULL)
8881         {
8882         length = GetParamSubVals (m->igrBranchRates, chain, state[chain])[p->index];
8883         }
8884     else if (m->mixedBrchRates != NULL)
8885         {
8886         length = GetParamSubVals (m->mixedBrchRates, chain, state[chain])[p->index];
8887         }
8888     else
8889         length = p->length;
8890
8891     /* numerical errors will ensue if we allow very large or very small branch lengths,
8892        which might occur in relaxed clock models */
8893
8894     /* fill in values */
8895     for (k=index=0; k<m->numGammaCats; k++)
8896         {
8897         t = length*catRate[k];
8898
8899         if (t < TIME_MIN)
8900             {
8901             /* Fill in identity matrix */
8902             for (i=0; i<4; i++)
8903                 {
8904                 for (j=0; j<4; j++)
8905                     {
8906                     if (i == j)
8907                         tiP[index++] = 1.0;
8908                     else
8909                         tiP[index++] = 0.0;
8910                     }
8911                 }
8912             }
8913         else if (t > TIME_MAX)
8914             {
8915             /* Fill in stationary matrix */
8916             for (i=0; i<4; i++)
8917                 for (j=0; j<4; j++)
8918                     tiP[index++] = 0.25;
8919             }
8920         else
8921             {
8922             /* calculate probabilities */
8923             pChange   = (CLFlt) (0.25 - 0.25 * exp(-(4.0/3.0)*t));
8924             pNoChange = (CLFlt) (0.25 + 0.75 * exp(-(4.0/3.0)*t));
8925             for (i=0; i<4; i++)
8926                 {
8927                 for (j=0; j<4; j++)
8928                     {
8929                     if (i == j)
8930                         tiP[index++] = pNoChange;
8931                     else
8932                         tiP[index++] = pChange;
8933                     }
8934                 }
8935             }
8936         }
8937
8938     return NO_ERROR;
8939 }
8940
8941
8942 /*-----------------------------------------------------------------
8943 |
8944 |   TiProbs_Res: update transition probabilities for binary
8945 |       restriction site model with or without rate variation
8946 |
8947 ------------------------------------------------------------------*/
8948 int TiProbs_Res (TreeNode *p, int division, int chain)
8949 {
8950     int         k, index;
8951     MrBFlt      baseRate, eV, mu, theRate, v,
8952                 *bs, *catRate, length;
8953     CLFlt       *tiP;
8954     ModelInfo   *m;
8955
8956     /* find model settings for the division */
8957     m = &modelSettings[division];
8958
8959     /* find transition probabilities */
8960     tiP = m->tiProbs[m->tiProbsIndex[chain][p->index]];
8961
8962     /* find rates */
8963     baseRate = GetRate (division, chain);
8964     theRate = 1.0;
8965     if (m->shape == NULL)
8966         catRate = &theRate;
8967     else
8968         catRate = GetParamSubVals (m->shape, chain, state[chain]);
8969
8970     /* find base frequencies */
8971     bs = GetParamSubVals(m->stateFreq, chain, state[chain]);
8972
8973     /* calculate scaling factor */
8974     mu =  1.0 / (2.0 * bs[0] * bs[1]);
8975
8976     /* find length */
8977     if (m->cppEvents != NULL)
8978         {
8979         length = GetParamSubVals (m->cppEvents, chain, state[chain])[p->index];
8980         }
8981     else if (m->tk02BranchRates != NULL)
8982         {
8983         length = GetParamSubVals (m->tk02BranchRates, chain, state[chain])[p->index];
8984         }
8985     else if (m->igrBranchRates != NULL)
8986         {
8987         length = GetParamSubVals (m->igrBranchRates, chain, state[chain])[p->index];
8988         }
8989     else if (m->mixedBrchRates != NULL)
8990         {
8991         length = GetParamSubVals (m->mixedBrchRates, chain, state[chain])[p->index];
8992         }
8993     else
8994         length = p->length;
8995
8996     /* numerical errors will ensue if we allow very large or very small branch lengths,
8997        which might occur in relaxed clock models */
8998
8999     /* fill in values */
9000     for (k=index=0; k<m->numGammaCats; k++)
9001         {
9002         v =  length * baseRate * catRate[k];
9003
9004         if (v < TIME_MIN)
9005             {
9006             /* Fill in identity matrix */
9007             tiP[index++] = (CLFlt) (bs[0] + bs[1]);
9008             tiP[index++] = (CLFlt) (bs[1] - bs[1]);
9009             tiP[index++] = (CLFlt) (bs[0] - bs[0]);
9010             tiP[index++] = (CLFlt) (bs[1] + bs[0]);
9011             }
9012         else if (v > TIME_MAX)
9013             {
9014             /* Fill in stationary matrix */
9015             tiP[index++] = (CLFlt) bs[0];
9016             tiP[index++] = (CLFlt) bs[1];
9017             tiP[index++] = (CLFlt) bs[0];
9018             tiP[index++] = (CLFlt) bs[1];
9019             }
9020         else
9021             {
9022             /* calculate probabilities */
9023             eV =  exp(-mu * v);
9024             tiP[index++] = (CLFlt) (bs[0] + bs[1] * eV);
9025             tiP[index++] = (CLFlt) (bs[1] - bs[1] * eV);
9026             tiP[index++] = (CLFlt) (bs[0] - bs[0] * eV);
9027             tiP[index++] = (CLFlt) (bs[1] + bs[0] * eV);
9028             }
9029         }
9030
9031     return NO_ERROR;
9032 }
9033
9034
9035 /*-----------------------------------------------------------------
9036 |
9037 |   TiProbs_Std: update transition probabilities for
9038 |       variable states model with or without rate variation
9039 |
9040 ------------------------------------------------------------------*/
9041 int TiProbs_Std (TreeNode *p, int division, int chain)
9042 {
9043     int         b, c, i, j, k, n, s, nStates, index=0, index2;
9044     MrBFlt      v, eV1, eV2, eV3, eV4, eV5, *catRate,
9045                 baseRate, theRate, pi, f1, f2, f3, f4, f5, f6, f7, root,
9046                 *eigenValues, *cijk, sum, *bs, mu, length;
9047     CLFlt       pNoChange, pChange, *tiP;
9048     ModelInfo   *m;
9049 #   if defined (DEBUG_TIPROBS_STD)
9050     int         index3;
9051 #   endif
9052
9053     m = &modelSettings[division];
9054
9055     /* find transition probabilities */
9056     tiP = m->tiProbs[m->tiProbsIndex[chain][p->index]];
9057
9058     /* get rate multiplier */
9059     theRate = 1.0;
9060     baseRate = GetRate (division, chain);
9061
9062     /* get category rates */
9063     if (m->shape == NULL)
9064         catRate = &theRate;
9065     else
9066         catRate = GetParamSubVals (m->shape, chain, state[chain]);
9067
9068 #   if defined (DEBUG_TIPROBS_STD)
9069     /* find base frequencies */
9070     bs = GetParamStdStateFreqs (m->stateFreq, chain, state[chain]);
9071 #   endif
9072
9073     /* find length */
9074     if (m->cppEvents != NULL)
9075         {
9076         length = GetParamSubVals (m->cppEvents, chain, state[chain])[p->index];
9077         }
9078     else if (m->tk02BranchRates != NULL)
9079         {
9080         length = GetParamSubVals (m->tk02BranchRates, chain, state[chain])[p->index];
9081         }
9082     else if (m->igrBranchRates != NULL)
9083         {
9084         length = GetParamSubVals (m->igrBranchRates, chain, state[chain])[p->index];
9085         }
9086     else if (m->mixedBrchRates != NULL)
9087         {
9088         length = GetParamSubVals (m->mixedBrchRates, chain, state[chain])[p->index];
9089         }
9090     else
9091         length = p->length;
9092
9093     /* numerical errors will ensue if we allow very large or very small branch lengths, which might
9094        occur in relaxed clock models; an elegant solution would be to substitute the stationary
9095        probs and initial probs but for now we truncate lengths at small or large values TODO */
9096     if (length > BRLENS_MAX)
9097         length = BRLENS_MAX;
9098     else if (length < BRLENS_MIN)
9099         length = BRLENS_MIN;
9100
9101     /* fill in values; this has to be done differently if state freqs are not equal */
9102     if (m->stateFreq->paramId == SYMPI_EQUAL)
9103         {
9104         /* equal state frequencies */
9105         /* fill in values for unordered characters */
9106         index = 0;
9107 #   if defined (DEBUG_TIPROBS_STD)
9108         index3 = 0;
9109 #   endif
9110         for (nStates=2; nStates<=10; nStates++)
9111             {
9112             if (m->isTiNeeded[nStates-2] == NO)
9113                 continue;
9114             for (k=0; k<m->numGammaCats; k++)
9115                 {
9116                 /* calculate probabilities */
9117                 v =  length*catRate[k]*baseRate;
9118                 eV1 =  exp(-(nStates / (nStates -  1.0)) * v);
9119                 pChange   = (CLFlt) ((1.0 / nStates) - ((1.0 / nStates) * eV1));
9120                 pNoChange = (CLFlt) ((1.0 / nStates) + ((nStates - 1.0) / nStates) * eV1);
9121                 if (pChange<0.0)
9122                     pChange = (CLFlt) 0.0;
9123                 for (i=0; i<nStates; i++)
9124                     {
9125                     for (j=0; j<nStates; j++)
9126                         {
9127                         if (i == j)
9128                             tiP[index++] = pNoChange;
9129                         else
9130                             tiP[index++] = pChange;
9131                         }
9132                     }
9133 #   if defined (DEBUG_TIPROBS_STD)
9134                 PrintTiProbs (tiP+index-(nStates*nStates), bs+index3, nStates);
9135 #   endif
9136                 }
9137 #   if defined (DEBUG_TIPROBS_STD)
9138             index3 += nStates;
9139 #   endif
9140             }
9141
9142         /* fill in values for 3-state ordered character */
9143         if (m->isTiNeeded[9] == YES)
9144             {
9145             nStates = 3;
9146             for (k=0; k<m->numGammaCats; k++)
9147                 {
9148                 /* calculate probabilities */
9149                 v =  length * catRate[k] * baseRate;
9150                 eV1 =  exp (-(3.0 / 4.0) * v);
9151                 eV2 =  exp (-(9.0 / 4.0) * v);
9152
9153                 /* pij(0,0) */
9154                 tiP[index] = (CLFlt) ((1.0 / 3.0) + (eV1 / 2.0) + (eV2 / 6.0));
9155                 /* pij(0,1) = pij(1,0) */
9156                 tiP[index+1] = tiP[index+3] = (CLFlt) ((1.0 / 3.0) - (eV2 / 3.0));
9157                 /* pij(0,2) */
9158                 tiP[index+2] = (CLFlt) ((1.0 / 3.0) - (eV1 / 2.0) + (eV2 / 6.0));
9159                 /* pij(1,1) */
9160                 tiP[index+4] = (CLFlt) ((1.0 / 3.0) + (2.0 * eV2 / 3.0));
9161
9162                 /* fill in mirror part of matrix */
9163                 index += 5;
9164                 index2 = index - 2;
9165                 for (i=0; i<4; i++)
9166                     tiP[index++] = tiP[index2--];
9167
9168                 /* make sure no value is negative */
9169                 for (i=index-(nStates*nStates); i<index; i++) {
9170                     if (tiP[i] < 0.0)
9171                         tiP[i] = (CLFlt) 0.0;
9172                 }
9173 #   if defined (DEBUG_TIPROBS_STD)
9174                 PrintTiProbs (tiP+index-(nStates*nStates), bs+index3, nStates);
9175 #   endif
9176                 }
9177
9178 #   if defined (DEBUG_TIPROBS_STD)
9179             index3 += nStates;
9180 #   endif
9181             }
9182
9183         /* 4-state ordered character */
9184         if (m->isTiNeeded[10] == YES)
9185             {
9186             nStates = 4;
9187             pi = 1.0 / 4.0;
9188             root =  sqrt (2.0);
9189             f1 = root +  1.0;
9190             f2 = root -  1.0;
9191
9192             for (k=0; k<m->numGammaCats; k++)
9193                 {
9194                 /* calculate probabilities */
9195                 v =  length * catRate[k] * baseRate;
9196                 eV1 =  1.0 / (exp ((4.0 * v) / 3.0));
9197                 eV2 =  exp ((2.0 * (root - 2.0) * v) / 3.0) / root;
9198                 eV3 =  1.0 / (root *  exp ((2.0 * (root + 2.0) * v) / 3.0));
9199
9200                 /* pij(0,0) */
9201                 tiP[index] = (CLFlt) (pi * (1.0 + eV1 + (f1*eV2) + (f2*eV3)));
9202                 /* pij(0,1) = pij(1,0) */
9203                 tiP[index+1] = tiP[index+4] = (CLFlt) (pi * (1.0 - eV1 + eV2 - eV3));
9204                 /* pij(0,2) = tiP(1,3) */
9205                 tiP[index+2] = tiP[index+7] = (CLFlt) (pi * (1.0 - eV1 - eV2 + eV3));
9206                 /* pij(0,3) */
9207                 tiP[index+3] = (CLFlt) (pi * (1.0 + eV1 - (f1*eV2) - (f2*eV3)));
9208                 /* pij(1,1) */
9209                 tiP[index+5] = (CLFlt) (pi * (1.0 + eV1 + (f2*eV2) + (f1*eV3)));
9210                 /* pij(1,2) */
9211                 tiP[index+6] = (CLFlt) (pi * (1.0 + eV1 - (f2*eV2) - (f1*eV3)));
9212
9213                 /* fill in mirror part of matrix */
9214                 index += 8;
9215                 index2 = index - 1;
9216                 for (i=0; i<8; i++)
9217                     tiP[index++] = tiP[index2--];
9218
9219                 /* make sure no value is negative */
9220                 for (i=index-(nStates*nStates); i<index; i++) {
9221                     if (tiP[i] < 0.0)
9222                         tiP[i] = (CLFlt) 0.0;
9223                 }
9224 #   if defined (DEBUG_TIPROBS_STD)
9225                 PrintTiProbs (tiP+index-(nStates*nStates), bs+index3, nStates);
9226 #   endif
9227                 }
9228 #   if defined (DEBUG_TIPROBS_STD)
9229             index3 += nStates;
9230 #   endif
9231             }
9232
9233         /* 5-state ordered character */
9234         if (m->isTiNeeded[11] == YES)
9235             {
9236             nStates = 5;
9237             pi = 1.0 / 5.0;
9238             root =  sqrt (5.0);
9239
9240             f5 = root /  4.0;
9241             f1 =  0.75 + f5;;
9242             f2 =  1.25 + f5;
9243             f3 =  1.25 - f5;
9244             f4 =  0.75 - f5;
9245             f5 = f5 *  2.0;
9246             f6 = f5 +  0.5;
9247             f7 = f5 -  0.5;
9248
9249             for (k=0; k<m->numGammaCats; k++)
9250                 {
9251                 /* calculate probabilities */
9252                 v =  length * catRate[k] * baseRate;
9253                 v *=  5.0 /  16.0;
9254
9255                 eV1 =  exp ((root -  3.0) * v);
9256                 eV2 =  exp (-(root +  3.0) * v);
9257                 eV3 =  exp ((root -  5.0) * v);
9258                 eV4 =  exp (-(root +  5.0) * v);
9259
9260                 /* pij(0,0) */
9261                 tiP[index] = (CLFlt) (pi* (1.0 + (f1*eV3) + (f2*eV1) + (f3*eV2) + (f4*eV4)));
9262                 /* pij(0,1) = pij(1,0) */
9263                 tiP[index+1] = tiP[index+5] =
9264                     (CLFlt) (pi*(1.0 - (eV3/2.0) + (f5*eV1) - (f5*eV2) - (eV4/2.0)));
9265                 /* pij(0,2) = pij(2,0) */
9266                 tiP[index+2] = tiP[index+10] = (CLFlt) (pi*(1.0 - (f6*eV3) + (f7*eV4)));
9267                 /* pij(0,3) = pij(1,4) */
9268                 tiP[index+3] = tiP[index+9] =
9269                     (CLFlt) (pi*(1.0 - (eV3/2.0) - (f5*eV1) + (f5*eV2) - (eV4/2.0)));
9270                 /* pij(0,4) */
9271                 tiP[index+4] = (CLFlt) (pi*(1.0 + (f1*eV3) - (f2*eV1) - (f3*eV2) + (f4*eV4)));
9272                 /* pij(1,1) */
9273                 tiP[index+6] = (CLFlt) (pi*(1.0 + (f4*eV3) + (f3*eV1) + (f2*eV2) + (f1*eV4)));
9274                 /* pij(1,2) = pij(2,1) */
9275                 tiP[index+7] = tiP[index+11] = (CLFlt) (pi*(1.0 + (f7*eV3) - (f6*eV4)));
9276                 /* pij(1,3) */
9277                 tiP[index+8] = (CLFlt) (pi*(1.0 + (f4*eV3) - (f3*eV1) - (f2*eV2) + (f1*eV4)));
9278                 /* pij(2,2) */
9279                 tiP[index+12] = (CLFlt) (pi*(1.0 + (2.0*eV3) + (2.0*eV4)));
9280
9281                 /* fill in mirror part of matrix */
9282                 index += 13;
9283                 index2 = index - 2;
9284                 for (i=0; i<12; i++)
9285                     tiP[index++] = tiP[index2--];
9286
9287                 /* make sure no value is negative */
9288                 for (i=index-(nStates*nStates); i<index; i++) {
9289                     if (tiP[i] < 0.0)
9290                         tiP[i] = (CLFlt) 0.0;
9291                 }
9292 #   if defined (DEBUG_TIPROBS_STD)
9293                 PrintTiProbs (tiP+index-(nStates*nStates), bs+index3, nStates);
9294 #   endif
9295                 }
9296 #   if defined (DEBUG_TIPROBS_STD)
9297             index3 += nStates;
9298 #   endif
9299             }
9300
9301         /* 6-state ordered character */
9302         if (m->isTiNeeded[12] == YES)
9303             {
9304             nStates = 6;
9305             pi =  1.0 /  6.0;
9306             root =  sqrt (3.0);
9307
9308             f4 = (3.0 / (2.0 * root));
9309             f1 =  1.0 + f4;
9310             f2 =  1.0 - f4;
9311             f3 =  0.5 + f4;
9312             f4 =  0.5 - f4;
9313
9314             for (k=0; k<m->numGammaCats; k++)
9315                 {
9316                 /* calculate probabilities */
9317                 v =  length * catRate[k] * baseRate;
9318                 v /=  5.0;
9319
9320                 eV1 =  exp (-9 * v);
9321                 eV2 =  exp (-6 * v);
9322                 eV3 =  exp (-3 * v);
9323                 eV4 =  exp (3.0 * (root - 2.0) * v);
9324                 eV5 =  exp (-3.0 * (root + 2.0) * v);
9325
9326                 /* pij(0,0) */
9327                 tiP[index] = (CLFlt) (pi* (1.0 + (0.5*eV1) + eV2 + (1.5*eV3) + (f1*eV4) + (f2*eV5)));
9328                 /* pij(0,1) = pij(1,0) */
9329                 tiP[index+1] = tiP[index+6] = (CLFlt) (pi*(1.0 - eV1 - eV2 + (f3*eV4) + (f4*eV5)));
9330                 /* pij(0,2) = pij(2,0) */
9331                 tiP[index+2] = tiP[index+12] =
9332                     (CLFlt) (pi*(1.0 + (0.5*eV1) - eV2 - (1.5*eV3) + (0.5*eV4) + (0.5*eV5)));
9333                 /* pij(0,3) = pij(2,5) */
9334                 tiP[index+3] = tiP[index+17] =
9335                     (CLFlt) (pi*(1.0 + (0.5*eV1) + eV2 - (1.5*eV3) - (0.5*eV4) - (0.5*eV5)));
9336                 /* pij(0,4) = pij(1,5) */
9337                 tiP[index+4] = tiP[index+11] = (CLFlt) (pi*(1.0 - eV1 + eV2 - (f3*eV4) - (f4*eV5)));
9338                 /* pij(0,5) */
9339                 tiP[index+5] = (CLFlt) (pi*(1.0 + (0.5*eV1) - eV2 + (1.5*eV3) - (f1*eV4) - (f2*eV5)));
9340                 /* pij(1,1) */
9341                 tiP[index+7] = (CLFlt) (pi*(1.0 + (2.0*eV1) + eV2 + eV4 + eV5));
9342                 /* pij(1,2) = pij(2,1) */
9343                 tiP[index+8] = tiP[index+13] = (CLFlt) (pi*(1.0 - eV1 + eV2 - (f4*eV4) - (f3*eV5)));
9344                 /* pij(1,3) = pij(2,4) */
9345                 tiP[index+9] = tiP[index+16] = (CLFlt) (pi*(1.0 - eV1 - eV2 + (f4*eV4) + (f3*eV5)));
9346                 /* pij(1,4) */
9347                 tiP[index+10] = (CLFlt) (pi*(1.0 + (2.0*eV1) - eV2 - eV4 - eV5));
9348                 /* pij(2,2) */
9349                 tiP[index+14] = (CLFlt) (pi*(1.0 + (0.5*eV1) + eV2 + (1.5*eV3) + (f2*eV4) + (f1*eV5)));
9350                 /* pij(2,3) */
9351                 tiP[index+15] = (CLFlt) (pi*(1.0 + (0.5*eV1) - eV2 + (1.5*eV3) - (f2*eV4) - (f1*eV5)));
9352
9353                 /* fill in mirror part of matrix */
9354                 index += 18;
9355                 index2 = index - 1;
9356                 for (i=0; i<18; i++)
9357                     tiP[index++] = tiP[index2--];
9358
9359                 /* make sure no value is negative */
9360                 for (i=index-(nStates*nStates); i<index; i++) {
9361                     if (tiP[i] < 0.0)
9362                         tiP[i] = (CLFlt) 0.0;
9363                 }
9364 #   if defined (DEBUG_TIPROBS_STD)
9365                 PrintTiProbs (tiP+index-(nStates*nStates), bs+index3, nStates);
9366 #   endif
9367                 }
9368 #   if defined (DEBUG_TIPROBS_STD)
9369             index3 += nStates;
9370 #   endif
9371             }
9372         }
9373     else
9374         {
9375         /* unequal state frequencies */
9376         index = 0;
9377
9378         /* first fill in for binary characters using beta categories if needed */
9379         if (m->isTiNeeded[0] == YES)
9380             {
9381             /* find base frequencies */
9382             bs = GetParamStdStateFreqs (m->stateFreq, chain, state[chain]);
9383
9384             /* cycle through beta and gamma cats */
9385             for (b=0; b<m->numBetaCats; b++)
9386                 {
9387                 mu =  1.0 / (2.0 * bs[0] * bs[1]);
9388                 for (k=0; k<m->numGammaCats; k++)
9389                     {
9390                     /* calculate probabilities */
9391                     v =  length*catRate[k]*baseRate;
9392                     eV1 =  exp(- mu * v);
9393                     tiP[index++] = (CLFlt) (bs[0] + (bs[1] * eV1));
9394                     tiP[index++] = (CLFlt) (bs[1] - (bs[1] * eV1));
9395                     tiP[index++] = (CLFlt) (bs[0] - (bs[0] * eV1));
9396                     tiP[index++] = (CLFlt) (bs[1] + (bs[0] * eV1));
9397                     }
9398                 /* update stationary state frequency pointer */
9399                 bs += 2;
9400                 }
9401             }
9402
9403         /* now use general algorithm for the other cases */
9404         if (m->cijkLength > 0)
9405             {
9406             /* first update cijk if necessary */
9407             if (m->cijkLength > 0 && m->upDateCijk == YES)
9408                 {
9409                 if (UpDateCijk (division, chain) == ERROR)
9410                     return (ERROR);
9411                 }
9412
9413             /* then get first set of eigenvalues */
9414             eigenValues = m->cijks[m->cijkIndex[chain]];
9415
9416             /* and cycle through the relevant characters */
9417             for (c=0; c<m->stateFreq->nSympi; c++)
9418                 {
9419                 n = m->stateFreq->sympinStates[c];
9420
9421                 /* fill in values */
9422                 for (k=0; k<m->numGammaCats; k++)
9423                     {
9424                     v =  length * baseRate * catRate[k];
9425                     cijk = eigenValues + (2 * n);
9426
9427                     for (i=0; i<n; i++)
9428                         {
9429                         for (j=0; j<n; j++)
9430                             {
9431                             sum = 0.0;
9432                             for (s=0; s<n; s++)
9433                                 sum += (*cijk++) * exp(eigenValues[s] * v);
9434                             tiP[index++] = (CLFlt) ((sum <  0.0) ?  0.0 : sum);
9435                             }
9436                         }
9437                     }
9438
9439                 /* update eigenValues pointer */
9440                 eigenValues += (n * n * n) + (2 * n);
9441                 }
9442             }
9443         }
9444
9445     return NO_ERROR;
9446 }
9447
9448
9449 int UpDateCijk (int whichPart, int whichChain)
9450 {
9451     int         c, i, j, k, n, n3, isComplex, sizeOfSingleCijk, cType, numQAllocated;
9452     MrBFlt      **q[100], **eigvecs, **inverseEigvecs;
9453     MrBFlt      *eigenValues, *eigvalsImag, *cijk;
9454     MrBFlt      *bs, *bsBase, *rateOmegaValues=NULL, rA=0.0, rS=0.0, posScaler, *omegaCatFreq=NULL;
9455     complex     **Ceigvecs, **CinverseEigvecs;
9456     ModelInfo   *m;
9457     Param       *p;
9458 #   if defined (BEAGLE_ENABLED)
9459     int         u;
9460     double      *beagleEigvecs=NULL, *beagleInverseEigvecs=NULL;
9461 #   endif
9462
9463     /* get a pointer to the model settings for this partition */
9464     m = &modelSettings[whichPart];
9465     assert (m->upDateCijk == YES);
9466
9467     /* we should only go through here if we have cijk information available for the partition */
9468     if (m->cijkLength > 0)
9469         {
9470         /* flip cijk space */
9471         FlipCijkSpace(m, whichChain);
9472
9473         /* figure out information on either omega values or rate values, if necessary */
9474         if (m->dataType == DNA || m->dataType == RNA)
9475             {
9476             if (m->nucModelId == NUCMODEL_CODON)                                                    /* we have a NY98 model     */
9477                 {
9478                 rateOmegaValues = GetParamVals(m->omega, whichChain, state[whichChain]);
9479                 if (m->numOmegaCats > 1)
9480                     omegaCatFreq = GetParamSubVals (m->omega, whichChain, state[whichChain]);
9481                 }
9482             else if (m->nCijkParts > 1 && m->nucModelId == NUCMODEL_4BY4 && m->numModelStates == 8) /* we have a covarion model */
9483                 rateOmegaValues = GetParamSubVals (m->shape, whichChain, state[whichChain]);        /* with rate variation      */
9484             }
9485         else if (m->dataType == PROTEIN)
9486             {
9487             if (m->nCijkParts > 1)                                                                  /* we have a covarion model */
9488                 rateOmegaValues = GetParamSubVals (m->shape, whichChain, state[whichChain]);        /* with rate variation      */
9489             }
9490 #   if defined (BEAGLE_ENABLED)
9491         else if (m->dataType == RESTRICTION){}
9492 #   endif
9493         else if (m->dataType != STANDARD)
9494             {
9495             MrBayesPrint ("%s   ERROR: Should not be updating cijks!\n", spacer);
9496             return (ERROR);
9497             }
9498
9499         if (m->dataType == STANDARD)
9500             {
9501             /* set pointers and other stuff needed */
9502             numQAllocated = 1;
9503             p = m->stateFreq;
9504             eigenValues = m->cijks[m->cijkIndex[whichChain]];
9505             q[0] = AllocateSquareDoubleMatrix (10);
9506             eigvecs = AllocateSquareDoubleMatrix (10);
9507             inverseEigvecs = AllocateSquareDoubleMatrix (10);
9508             Ceigvecs = AllocateSquareComplexMatrix (10);
9509             CinverseEigvecs = AllocateSquareComplexMatrix (10);
9510             bsBase = GetParamStdStateFreqs (m->stateFreq, whichChain, state[whichChain]);
9511
9512             /* cycle over characters needing cijks */
9513             for (c=0; c<p->nSympi; c++)
9514                 {
9515                 n = p->sympinStates[c];
9516                 bs = bsBase + p->sympiBsIndex[c];
9517                 cType = p->sympiCType[c];
9518                 n3 = n * n * n;
9519                 eigvalsImag = eigenValues + n;
9520                 cijk = eigenValues + (2 * n);
9521                 if (SetStdQMatrix (q[0], n, bs, cType) == ERROR)
9522                     return (ERROR);
9523                 isComplex = GetEigens (n, q[0], eigenValues, eigvalsImag, eigvecs, inverseEigvecs, Ceigvecs, CinverseEigvecs);
9524                 if (isComplex == NO)
9525                     {
9526                     CalcCijk (n, cijk, eigvecs, inverseEigvecs);
9527                     }
9528                 else
9529                     {
9530                     if (isComplex == YES)
9531                         MrBayesPrint ("%s   ERROR: Complex eigenvalues found!\n", spacer);
9532                     else
9533                         MrBayesPrint ("%s   ERROR: Computing eigenvalues problem!\n", spacer);
9534                     goto errorExit;
9535                     }
9536                 eigenValues += (n3 + (2 * n));
9537                 }
9538             }
9539         else
9540             {
9541             /* all other data types */
9542             numQAllocated = m->nCijkParts;
9543             sizeOfSingleCijk = m->cijkLength / m->nCijkParts;
9544             n = m->numModelStates;
9545             n3 = n * n * n;
9546 #   if defined (BEAGLE_ENABLED)
9547             if (m->useBeagle == YES)
9548                 eigenValues = m->cijks[m->cijkIndex[whichChain]/m->nCijkParts];
9549             else
9550                 eigenValues = m->cijks[m->cijkIndex[whichChain]];
9551 #   else
9552             eigenValues = m->cijks[m->cijkIndex[whichChain]];
9553 #   endif
9554             eigvalsImag = eigenValues + n;
9555             cijk        = eigenValues + (2 * n);
9556             for (k=0; k<numQAllocated; k++)
9557                 q[k] = AllocateSquareDoubleMatrix (n);
9558             eigvecs = AllocateSquareDoubleMatrix (n);
9559             inverseEigvecs = AllocateSquareDoubleMatrix (n);
9560             Ceigvecs = AllocateSquareComplexMatrix (n);
9561             CinverseEigvecs = AllocateSquareComplexMatrix (n);
9562             bs = GetParamSubVals (m->stateFreq, whichChain, state[whichChain]);
9563
9564             if (m->nCijkParts == 1)
9565                 {
9566                 if (m->dataType == DNA || m->dataType == RNA)
9567                     {
9568                     if (m->nucModelId == NUCMODEL_CODON)
9569                         {
9570                         if (SetNucQMatrix (q[0], n, whichChain, whichPart, rateOmegaValues[0], &rA, &rS) == ERROR)
9571                             goto errorExit;
9572                         }
9573                     else
9574                         {
9575                         if (SetNucQMatrix (q[0], n, whichChain, whichPart, 1.0, &rA, &rS) == ERROR)
9576                             goto errorExit;
9577                         }
9578                     }
9579 #   if defined (BEAGLE_ENABLED)
9580                 else if (m->dataType == RESTRICTION)
9581                     {
9582                     SetBinaryQMatrix (q[0], whichChain, whichPart);
9583                     }
9584 #   endif
9585                 else
9586                     {
9587                     if (SetProteinQMatrix (q[0], n, whichChain, whichPart, 1.0) == ERROR)
9588                         goto errorExit;
9589                     }
9590                 isComplex = GetEigens (n, q[0], eigenValues, eigvalsImag, eigvecs, inverseEigvecs, Ceigvecs, CinverseEigvecs);
9591 #   if defined (BEAGLE_ENABLED)
9592                 if (isComplex == YES)
9593                     {
9594                     if (isComplex == YES)
9595                         MrBayesPrint ("%s   ERROR: Complex eigenvalues found!\n", spacer);
9596                     else
9597                         MrBayesPrint ("%s   ERROR: Computing eigenvalues problem!\n", spacer);
9598                     goto errorExit;
9599                     }
9600                 if (m->useBeagle == YES)
9601                     {
9602                     /* TODO: only allocate this space once at initialization */
9603                     beagleEigvecs = (double*) SafeCalloc (2*n*n, sizeof(double));
9604                     beagleInverseEigvecs = beagleEigvecs + n*n;
9605                     for (i=k=0; i<n; i++)
9606                         {
9607                         // eigenValues[i] = 0.1;
9608                         for (j=0; j<n; j++)
9609                             {
9610                             beagleEigvecs[k] = eigvecs[i][j];
9611                             beagleInverseEigvecs[k] = inverseEigvecs[i][j];
9612                             k++;
9613                             }
9614                         }
9615                     beagleSetEigenDecomposition(m->beagleInstance,
9616                                                 m->cijkIndex[whichChain],
9617                                                 beagleEigvecs,
9618                                                 beagleInverseEigvecs,
9619                                                 eigenValues);
9620                     free(beagleEigvecs);
9621                     }
9622                 else
9623                     {
9624                     CalcCijk (n, cijk, eigvecs, inverseEigvecs);
9625                     }
9626 #   else
9627                 if (isComplex == NO)
9628                     {
9629                     CalcCijk (n, cijk, eigvecs, inverseEigvecs);
9630                     }
9631                 else
9632                     {
9633                     MrBayesPrint ("%s   ERROR: Complex eigenvalues found!\n", spacer);
9634                     goto errorExit;
9635                     }
9636 #   endif
9637                 }
9638             else
9639                 {
9640                 /* Here, we calculate the rate matrices (Q) for various nucleotide and amino acid
9641                    data models. Usually, when the rate matrix is set in SetNucQMatrix, it is scaled
9642                    such that the average substitution rate is one. However, there is a complication
9643                    for positive selection models using codon rate matrices. First, we have more than
9644                    one matrix; in fact, we have as many rate matrices as there are omega values. Second,
9645                    the mean substitution rate still has to be one. And third, we want the synonymous
9646                    rate to be the same across the rate matrices. For positive selection models, the Q
9647                    matrix comes out of SetNucQMatrix unscaled. Once we have all m->nCijkParts rate
9648                    matrices, we then scale again, this time to ensure that the mean substitution rate is one. */
9649
9650                 /* First, calculate rate matrices for each category: */
9651                 posScaler = 0.0;
9652                 for (k=0; k<m->nCijkParts; k++)
9653                     {
9654                     if (m->dataType == DNA || m->dataType == RNA)
9655                         {
9656                         if (SetNucQMatrix (q[k], n, whichChain, whichPart, rateOmegaValues[k], &rA, &rS) == ERROR)
9657                             goto errorExit;
9658                         }
9659                     else
9660                         {
9661                         if (SetProteinQMatrix (q[k], n, whichChain, whichPart, rateOmegaValues[k]) == ERROR)
9662                             goto errorExit;
9663                         }
9664                     if (m->nucModelId == NUCMODEL_CODON && m->numOmegaCats > 1)
9665                         posScaler += omegaCatFreq[k] * (rS + rA);
9666                     }
9667
9668                 /* Then rescale the rate matrices, if this is a positive selection model: */
9669                 if (m->nucModelId == NUCMODEL_CODON && m->numOmegaCats > 1)
9670                     {
9671                     posScaler = 1.0 / posScaler;
9672                     for (k=0; k<m->nCijkParts; k++)
9673                         {
9674                         for (i=0; i<n; i++)
9675                             for (j=0; j<n; j++)
9676                                 q[k][i][j] *= posScaler;
9677                         }
9678                     }
9679
9680                 /* Finally, calculate eigenvalues, etc.: */
9681 #   if defined (BEAGLE_ENABLED)
9682                 if (m->useBeagle == YES)
9683                     {
9684                     /* TODO: only allocate this space once at initialization */
9685                     beagleEigvecs = (double*) SafeCalloc (2*n*n, sizeof(double));
9686                     beagleInverseEigvecs = beagleEigvecs + n*n;
9687                     }
9688 #   endif
9689                 for (k=0; k<m->nCijkParts; k++)
9690                     {
9691                     isComplex = GetEigens (n, q[k], eigenValues, eigvalsImag, eigvecs, inverseEigvecs, Ceigvecs, CinverseEigvecs);
9692 #   if defined (BEAGLE_ENABLED)
9693                     if (isComplex == YES)
9694                         {
9695                         if (isComplex == YES)
9696                             MrBayesPrint ("%s   ERROR: Complex eigenvalues found!\n", spacer);
9697                         else
9698                             MrBayesPrint ("%s   ERROR: Computing eigenvalues problem!\n", spacer);
9699                         goto errorExit;
9700                         }
9701                     if (m->useBeagle == YES)
9702                         {
9703                         for (i=u=0; i<n; i++)
9704                             {
9705                             for (j=0; j<n; j++)
9706                                 {
9707                                 beagleEigvecs[u] = eigvecs[i][j];
9708                                 beagleInverseEigvecs[u] = inverseEigvecs[i][j];
9709                                 u++;
9710                                 }
9711                             }
9712
9713                         beagleSetEigenDecomposition(m->beagleInstance,
9714                                                     m->cijkIndex[whichChain] + k,
9715                                                     beagleEigvecs,
9716                                                     beagleInverseEigvecs,
9717                                                     eigenValues);
9718                         }
9719                     else
9720                         {
9721                         CalcCijk (n, cijk, eigvecs, inverseEigvecs);
9722                         }
9723 #   else
9724                     if (isComplex == NO)
9725                         {
9726                         CalcCijk (n, cijk, eigvecs, inverseEigvecs);
9727                         }
9728                     else
9729                         {
9730                         MrBayesPrint ("%s   ERROR: Complex eigenvalues found!\n", spacer);
9731                         goto errorExit;
9732                         }
9733 #   endif
9734                     /* shift pointers */
9735                     eigenValues += sizeOfSingleCijk;
9736                     eigvalsImag += sizeOfSingleCijk;
9737                     cijk        += sizeOfSingleCijk;
9738                     }
9739 #   if defined (BEAGLE_ENABLED)
9740                 free(beagleEigvecs);
9741 #   endif
9742                 }
9743             }
9744
9745         for (k=0; k<numQAllocated; k++)
9746             FreeSquareDoubleMatrix (q[k]);
9747         FreeSquareDoubleMatrix (eigvecs);
9748         FreeSquareDoubleMatrix (inverseEigvecs);
9749         FreeSquareComplexMatrix (Ceigvecs);
9750         FreeSquareComplexMatrix (CinverseEigvecs);
9751         }
9752
9753     return (NO_ERROR);
9754
9755     errorExit:
9756         for (k=0; k<numQAllocated; k++)
9757             FreeSquareDoubleMatrix (q[k]);
9758         FreeSquareDoubleMatrix (eigvecs);
9759         FreeSquareDoubleMatrix (inverseEigvecs);
9760         FreeSquareComplexMatrix (Ceigvecs);
9761         FreeSquareComplexMatrix (CinverseEigvecs);
9762
9763         return ERROR;
9764 }
9765