-/******************************************************************\r
- Copyright 2006 by Michael Farrar. All rights reserved.\r
- This program may not be sold or incorporated into a commercial product,\r
- in whole or in part, without written consent of Michael Farrar. For \r
- further information regarding permission for use or reproduction, please \r
- contact: Michael Farrar at farrar.michael@gmail.com.\r
-*******************************************************************/\r
-\r
-/* Written by Michael Farrar, 2006.\r
- Please send bug reports and/or suggestions to farrar.michael@gmail.com.\r
-*/\r
-\r
-/* Implementation of the Wozniak "vertical" vectorization\r
- strategy for Smith-Waterman comparison, Wozniak (1997) Comp.\r
- Appl. Biosci. 13:145-150\r
-*/\r
-\r
-#include <stdio.h>\r
-#include <stdlib.h>\r
-#include <emmintrin.h>\r
-\r
-#include "swsse2.h"\r
-#include "swwozniak.h"\r
-\r
-#define MATRIX_ROW_SIZE 32\r
-#define MATRIX_SIZE (MATRIX_ROW_SIZE * (ALPHA_SIZE + 1))\r
-\r
-typedef struct {\r
- char *pbMatrix;\r
- short *psMatrix;\r
- __m128i *pvHStore;\r
- __m128i *pvEStore;\r
- unsigned char *pData;\r
- unsigned short bias;\r
-} SwWozniakData;\r
-\r
-int\r
-swWozniakWord (unsigned char *querySeq,\r
- int queryLength,\r
- unsigned char *dbSeq,\r
- int dbLength,\r
- unsigned short gapOpen,\r
- unsigned short gapExtend,\r
- short *pMatrix,\r
- __m128i *pvHStore,\r
- __m128i *pvEStore);\r
-\r
-int\r
-swWozniakByte (unsigned char *querySeq,\r
- int queryLength,\r
- unsigned char *dbSeq,\r
- int dbLength,\r
- unsigned short gapOpen,\r
- unsigned short gapExtend,\r
- char *pMatrix,\r
- __m128i *pvHStore,\r
- __m128i *pvEStore,\r
- unsigned short bias);\r
-\r
-void *\r
-swWozniakInit(unsigned char *querySeq,\r
- int queryLength,\r
- signed char *matrix)\r
-{\r
- int i, j;\r
-\r
- int nCount;\r
-\r
- int lenQryByte;\r
- int lenQryShort;\r
-\r
- int bias;\r
-\r
- int weight;\r
-\r
- short *ps;\r
- char *pc;\r
-\r
- signed char *matrixRow;\r
-\r
- size_t aligned;\r
-\r
- SwWozniakData *pSwData;\r
- \r
- lenQryByte = (queryLength + 15) / 16 + 2;\r
- lenQryShort = (queryLength + 7) / 8 + 2;\r
-\r
- pSwData = (SwWozniakData *) malloc (sizeof (SwWozniakData));\r
- if (!pSwData) {\r
- fprintf (stderr, "Unable to allocate memory for SW data\n");\r
- exit (-1);\r
- }\r
-\r
- nCount = 64 + /* slack bytes */\r
- 4 * (ALPHA_SIZE + 1) + /* byte matrix */\r
- 4 * (ALPHA_SIZE + 1) + /* short matrix */\r
- ((queryLength + 16) * 2); /* vH and vE */\r
-\r
-\r
- pSwData->pData = (unsigned char *) calloc (nCount, sizeof (__m128i));\r
- if (!pSwData->pData) {\r
- fprintf (stderr, "Unable to allocate memory for SW data buffers\n");\r
- exit (-1);\r
- }\r
-\r
- pSwData->pbMatrix = (char *) pSwData->pData;\r
- pSwData->psMatrix = (short *) (pSwData->pbMatrix + MATRIX_SIZE);\r
-\r
- /* since we might port this to another platform, lets align the data */\r
- /* to 16 byte boundries ourselves */\r
- aligned = (size_t) (pSwData->psMatrix + MATRIX_SIZE);\r
- aligned = (aligned + 15) & ~(0x0f);\r
-\r
- pSwData->pvHStore = (__m128i *) aligned;\r
- pSwData->pvEStore = pSwData->pvHStore + queryLength + 16;\r
-\r
- /* Find the bias to use in the substitution matrix */\r
- bias = 127;\r
- for (i = 0; i < ALPHA_SIZE * ALPHA_SIZE; i++) {\r
- if (matrix[i] < bias) {\r
- bias = matrix[i];\r
- }\r
- }\r
- if (bias > 0) {\r
- bias = 0;\r
- }\r
-\r
- pc = pSwData->pbMatrix;\r
- ps = pSwData->psMatrix;\r
-\r
- for (i = 0; i < ALPHA_SIZE; i++) {\r
- matrixRow = matrix + i * ALPHA_SIZE;\r
-\r
- for (j = 0; j < ALPHA_SIZE; j++) {\r
- weight = matrixRow[j];\r
- *pc++ = weight - bias;\r
- *ps++ = weight;\r
- }\r
-\r
- for ( ; j < MATRIX_ROW_SIZE; j++) {\r
- *pc++ = -bias;\r
- *ps++ = 0;\r
- }\r
- }\r
-\r
- /* add the weights for the NULL rows */\r
- for (j = 0; j < MATRIX_ROW_SIZE; j++) {\r
- *pc++ = -bias;\r
- *ps++ = 0;\r
- }\r
-\r
- pSwData->bias = (unsigned short) -bias;\r
-\r
- return pSwData;\r
-}\r
-\r
-void swWozniakScan (unsigned char *querySeq,\r
- int queryLength,\r
- FASTA_LIB *dbLib,\r
- void *swData,\r
- SEARCH_OPTIONS *options,\r
- SCORE_LIST *scores)\r
-{\r
- int score;\r
-\r
- int threshold = options->threshold;\r
-\r
- unsigned char *dbSeq;\r
- int dbLen;\r
-\r
- int gapInit = -(options->gapInit + options->gapExt);\r
- int gapExt = -options->gapExt;\r
-\r
- SwWozniakData *wozniakData = (SwWozniakData *) swData;\r
-\r
- dbSeq = nextSeq (dbLib, &dbLen);\r
- while (dbLen > 0) {\r
-\r
- score = swWozniakByte (querySeq, queryLength, \r
- dbSeq, dbLen, \r
- gapInit, gapExt, \r
- wozniakData->pbMatrix,\r
- wozniakData->pvHStore,\r
- wozniakData->pvEStore,\r
- wozniakData->bias);\r
-\r
- /* check if needs a run with higher precision */\r
- if (score >= 255) {\r
- score = swWozniakWord (querySeq, queryLength, \r
- dbSeq, dbLen, \r
- gapInit, gapExt, \r
- wozniakData->psMatrix,\r
- wozniakData->pvHStore,\r
- wozniakData->pvEStore);\r
- }\r
-\r
- if (score >= threshold) {\r
- int minScore = insertList (scores, score, seqName (dbLib));\r
- if (minScore >= threshold) {\r
- threshold = minScore;\r
- }\r
- }\r
-\r
- dbSeq = nextSeq (dbLib, &dbLen);\r
- }\r
-}\r
-\r
-void\r
-swWozniakComplete(void *pSwData)\r
-{\r
- SwWozniakData *pWozniakData = (SwWozniakData *) pSwData;\r
-\r
- free (pWozniakData->pData);\r
- free (pWozniakData);\r
-}\r
-\r
-\r
-int\r
-swWozniakWord (unsigned char *querySeq,\r
- int queryLength,\r
- unsigned char *dbSeq,\r
- int dbLength,\r
- unsigned short gapOpen,\r
- unsigned short gapExtend,\r
- short *pMatrix,\r
- __m128i *pvHStore,\r
- __m128i *pvEStore)\r
-{\r
- int i, j, k, l, m;\r
- int score;\r
-\r
- short *pScore;\r
-\r
- __m128i vE, vF, vH;\r
- __m128i vEUp, vHUp1, vHUp2;\r
-\r
- __m128i vMaxScore;\r
- __m128i vGapOpen;\r
- __m128i vGapExtend;\r
- __m128i vScore;\r
-\r
- __m128i vMin;\r
- __m128i vMinimums;\r
- __m128i vTemp;\r
-\r
- /* remove unreferenced warning */\r
- querySeq;\r
-\r
- /* Load gap opening penalty to all elements of a constant */\r
- vGapOpen = _mm_insert_epi16 (vGapOpen, gapOpen, 0);\r
- vGapOpen = _mm_shufflelo_epi16 (vGapOpen, 0);\r
- vGapOpen = _mm_shuffle_epi32 (vGapOpen, 0);\r
-\r
- /* Load gap extension penalty to all elements of a constant */\r
- vGapExtend = _mm_insert_epi16 (vGapExtend, gapExtend, 0);\r
- vGapExtend = _mm_shufflelo_epi16 (vGapExtend, 0);\r
- vGapExtend = _mm_shuffle_epi32 (vGapExtend, 0);\r
-\r
- /* load vMaxScore with the zeros. since we are using signed */\r
- /* math, we will bias the maxscore to -32768 so we have the */\r
- /* full range of the short. */\r
- vMaxScore = _mm_cmpeq_epi16 (vMaxScore, vMaxScore);\r
- vMaxScore = _mm_slli_epi16 (vMaxScore, 15);\r
-\r
- vMinimums = _mm_shuffle_epi32 (vMaxScore, 0);\r
-\r
- vMin = _mm_shuffle_epi32 (vMaxScore, 0);\r
- vMin = _mm_srli_si128 (vMin, 14);\r
-\r
- for (i = 0; i < queryLength + 8; i++)\r
- {\r
- _mm_store_si128 (pvEStore + i, vMaxScore);\r
- _mm_store_si128 (pvHStore + i, vMaxScore);\r
- }\r
-\r
- pScore = (short *) &vScore;\r
-\r
- for (i = 0; i < dbLength; i += 8, dbSeq += 8)\r
- {\r
- /* zero lots of stuff. */\r
- vE = _mm_shuffle_epi32 (vMinimums, 0);\r
- vF = _mm_shuffle_epi32 (vMinimums, 0);\r
- vH = _mm_shuffle_epi32 (vMinimums, 0);\r
- vHUp2 = _mm_shuffle_epi32 (vMinimums, 0);\r
-\r
- vScore = _mm_xor_si128 (vScore, vScore);\r
-\r
- for (j = 0; j < 8; ++j)\r
- {\r
- for (k = 0; k <= j; ++k) {\r
- int matrixOffset = *(dbSeq + k) * MATRIX_ROW_SIZE;\r
- pScore[k] = *(pMatrix + matrixOffset + *(querySeq + j - k));\r
- }\r
- for ( ; k < 8; ++k) {\r
- pScore[k] = 0;\r
- }\r
-\r
- /* load values of vE and vH from previous row (one unit up) */\r
- vEUp = _mm_load_si128 (pvEStore + j);\r
- vHUp1 = _mm_load_si128 (pvHStore + j);\r
-\r
- /* shift into place so we have complete vE and vH vectors */\r
- /* that refer to the values one unit up from each cell */\r
- /* that we are currently working on. */\r
- vTemp = _mm_slli_si128 (vE, 2);\r
- vEUp = _mm_srli_si128 (vEUp, 14);\r
- vEUp = _mm_or_si128 (vEUp, vTemp);\r
-\r
- vTemp = _mm_slli_si128 (vH, 2);\r
- vHUp1 = _mm_srli_si128 (vHUp1, 14);\r
- vHUp1 = _mm_or_si128 (vHUp1, vTemp);\r
-\r
- /* do the dynamic programming */\r
-\r
- /* update vE value */\r
- vE = _mm_subs_epi16 (vEUp, vGapExtend);\r
- vTemp = _mm_subs_epi16 (vHUp1, vGapOpen);\r
- vE = _mm_max_epi16 (vE, vTemp);\r
-\r
- /* update vF value */\r
- vF = _mm_subs_epi16 (vF, vGapExtend);\r
- vTemp = _mm_subs_epi16 (vH, vGapOpen);\r
- vF = _mm_max_epi16 (vF, vTemp);\r
-\r
- /* add score to vH */\r
- vH = _mm_adds_epi16 (vHUp2, vScore);\r
-\r
- /* set vH to max of vH, vE, vF */\r
- vH = _mm_max_epi16 (vH, vE);\r
- vH = _mm_max_epi16 (vH, vF);\r
-\r
- /* Save value to use for next diagonal vH */\r
- vHUp2 = vHUp1;\r
-\r
- /* Update highest score encountered this far */\r
- vMaxScore = _mm_max_epi16 (vMaxScore, vH);\r
- }\r
-\r
- for (l = 0; j < queryLength; ++j, ++l)\r
- {\r
- for (k = 0; k < 8; ++k) {\r
- int matrixOffset = *(dbSeq + k) * MATRIX_ROW_SIZE;\r
- pScore[k] = *(pMatrix + matrixOffset + *(querySeq + j - k));\r
- }\r
-\r
- /* load values of vE and vH from previous row (one unit up) */\r
- vEUp = _mm_load_si128 (pvEStore + j);\r
- vHUp1 = _mm_load_si128 (pvHStore + j);\r
-\r
- /* save old values of vE and vH to use on next row */\r
- _mm_store_si128 (pvEStore + l, vE);\r
- _mm_store_si128 (pvHStore + l, vH);\r
- \r
- /* shift into place so we have complete vE and vH vectors */\r
- /* that refer to the values one unit up from each cell */\r
- /* that we are currently working on. */\r
- vTemp = _mm_slli_si128 (vE, 2);\r
- vEUp = _mm_srli_si128 (vEUp, 14);\r
- vEUp = _mm_or_si128 (vEUp, vTemp);\r
-\r
- vTemp = _mm_slli_si128 (vH, 2);\r
- vHUp1 = _mm_srli_si128 (vHUp1, 14);\r
- vHUp1 = _mm_or_si128 (vHUp1, vTemp);\r
-\r
- /* do the dynamic programming */\r
-\r
- /* update vE value */\r
- vE = _mm_subs_epi16 (vEUp, vGapExtend);\r
- vTemp = _mm_subs_epi16 (vHUp1, vGapOpen);\r
- vE = _mm_max_epi16 (vE, vTemp);\r
-\r
- /* update vF value */\r
- vF = _mm_subs_epi16 (vF, vGapExtend);\r
- vTemp = _mm_subs_epi16 (vH, vGapOpen);\r
- vF = _mm_max_epi16 (vF, vTemp);\r
-\r
- /* add score to vH */\r
- vH = _mm_adds_epi16(vHUp2, vScore);\r
-\r
- /* set vH to max of vH, vE, vF */\r
- vH = _mm_max_epi16 (vH, vE);\r
- vH = _mm_max_epi16 (vH, vF);\r
-\r
- /* Save value to use for next diagonal vH */\r
- vHUp2 = vHUp1;\r
-\r
- /* Update highest score encountered this far */\r
- vMaxScore = _mm_max_epi16 (vMaxScore, vH);\r
- }\r
-\r
- for (m = 0 ; m < 7; ++j, ++l, ++m)\r
- {\r
- for (k = 0; k <= m; ++k) {\r
- pScore[k] = 0;\r
- }\r
- for ( ; k < 8; ++k) {\r
- int matrixOffset = *(dbSeq + k) * MATRIX_ROW_SIZE;\r
- pScore[k] = *(pMatrix + matrixOffset + *(querySeq + j - k));\r
- }\r
-\r
- /* save old values of vE and vH to use on next row */\r
- _mm_store_si128 (pvEStore + l, vE);\r
- _mm_store_si128 (pvHStore + l, vH);\r
-\r
- /* v_score_load contains all zeros */\r
- vTemp = _mm_slli_si128 (vE, 2);\r
- vEUp = _mm_or_si128 (vMin, vTemp);\r
- vTemp = _mm_slli_si128 (vH, 2);\r
- vHUp1 = _mm_or_si128 (vMin, vTemp);\r
-\r
- /* do the dynamic programming */\r
-\r
- /* update vE value */\r
- vE = _mm_subs_epi16 (vEUp, vGapExtend);\r
- vTemp = _mm_subs_epi16 (vHUp1, vGapOpen);\r
- vE = _mm_max_epi16 (vE, vTemp);\r
-\r
- /* update vF value */\r
- vF = _mm_subs_epi16 (vF, vGapExtend);\r
- vTemp = _mm_subs_epi16 (vH, vGapOpen);\r
- vF = _mm_max_epi16 (vF, vTemp);\r
-\r
- /* add score to vH */\r
- vH = _mm_adds_epi16 (vHUp2, vScore);\r
-\r
- /* set vH to max of vH, vE, vF */\r
- vH = _mm_max_epi16 (vH, vE);\r
- vH = _mm_max_epi16 (vH, vF);\r
-\r
- /* Save value to use for next diagonal vH */\r
- vHUp2 = vHUp1;\r
-\r
- /* Update highest score encountered this far */\r
- vMaxScore = _mm_max_epi16(vMaxScore,vH);\r
- }\r
-\r
- _mm_store_si128 (pvEStore + l, vE);\r
- _mm_store_si128 (pvHStore + l, vH);\r
- }\r
-\r
- /* find largest score in the vMaxScore vector */\r
- vTemp = _mm_srli_si128 (vMaxScore, 8);\r
- vMaxScore = _mm_max_epi16 (vMaxScore, vTemp);\r
- vTemp = _mm_srli_si128 (vMaxScore, 4);\r
- vMaxScore = _mm_max_epi16 (vMaxScore, vTemp);\r
- vTemp = _mm_srli_si128 (vMaxScore, 2);\r
- vMaxScore = _mm_max_epi16 (vMaxScore, vTemp);\r
-\r
- /* store in temporary variable */\r
- score = (short) _mm_extract_epi16 (vMaxScore, 0);\r
-\r
- /* return largest score */\r
- return score + SHORT_BIAS;\r
-}\r
-\r
-\r
-\r
-\r
-int\r
-swWozniakByte (unsigned char *querySeq,\r
- int queryLength,\r
- unsigned char *dbSeq,\r
- int dbLength,\r
- unsigned short gapOpen,\r
- unsigned short gapExtend,\r
- char *pMatrix,\r
- __m128i *pvHStore,\r
- __m128i *pvEStore,\r
- unsigned short bias)\r
-{\r
- int i, j, k, l, m;\r
- int score;\r
-\r
- int dup;\r
-\r
- char *pScore;\r
-\r
- __m128i vE, vF, vH;\r
- __m128i vEUp, vHUp1, vHUp2;\r
-\r
- __m128i vMaxScore;\r
- __m128i vBias;\r
- __m128i vGapOpen;\r
- __m128i vGapExtend;\r
- __m128i vScore;\r
-\r
- __m128i vTemp;\r
-\r
- /* remove unreferenced warning */\r
- querySeq;\r
-\r
- /* Load the bias to all elements of a constant */\r
- dup = (bias << 8) | (bias & 0x00ff);\r
- vBias = _mm_insert_epi16 (vBias, dup, 0);\r
- vBias = _mm_shufflelo_epi16 (vBias, 0);\r
- vBias = _mm_shuffle_epi32 (vBias, 0);\r
-\r
- /* Load gap opening penalty to all elements of a constant */\r
- dup = (gapOpen << 8) | (gapOpen & 0x00ff);\r
- vGapOpen = _mm_insert_epi16 (vGapOpen, dup, 0);\r
- vGapOpen = _mm_shufflelo_epi16 (vGapOpen, 0);\r
- vGapOpen = _mm_shuffle_epi32 (vGapOpen, 0);\r
-\r
- /* Load gap extension penalty to all elements of a constant */\r
- dup = (gapExtend << 8) | (gapExtend & 0x00ff);\r
- vGapExtend = _mm_insert_epi16 (vGapExtend, dup, 0);\r
- vGapExtend = _mm_shufflelo_epi16 (vGapExtend, 0);\r
- vGapExtend = _mm_shuffle_epi32 (vGapExtend, 0);\r
-\r
- vScore = _mm_xor_si128 (vScore, vScore);\r
- vMaxScore = _mm_xor_si128 (vMaxScore, vMaxScore);\r
-\r
- for (i = 0; i < queryLength + 16; i++)\r
- {\r
- _mm_store_si128 (pvEStore + i, vMaxScore);\r
- _mm_store_si128 (pvHStore + i, vMaxScore);\r
- }\r
-\r
- pScore = (char *) &vScore;\r
-\r
- for (i = 0; i < dbLength; i += 16, dbSeq += 16)\r
- {\r
- // zero lots of stuff.\r
- vE = _mm_xor_si128 (vE, vE);\r
- vF = _mm_xor_si128 (vF, vF);\r
- vH = _mm_xor_si128 (vH, vH);\r
- vHUp2 = _mm_xor_si128 (vHUp2, vHUp2);\r
-\r
- vScore = _mm_xor_si128 (vScore, vScore);\r
-\r
- for (j = 0; j < 16; ++j)\r
- {\r
- for (k = 0; k <= j; ++k) {\r
- int matrixOffset = *(dbSeq + k) * MATRIX_ROW_SIZE;\r
- pScore[k] = *(pMatrix + matrixOffset + *(querySeq + j - k));\r
- }\r
- for ( ; k < 16; ++k) {\r
- pScore[k] = (char) bias;\r
- }\r
-\r
- // load values of vE and vH from previous row (one unit up)\r
- vEUp = _mm_load_si128 (pvEStore + j);\r
- vHUp1 = _mm_load_si128 (pvHStore + j);\r
-\r
- // shift into place so we have complete vE and vH vectors\r
- // that refer to the values one unit up from each cell\r
- // that we are currently working on.\r
- vTemp = _mm_slli_si128 (vE, 1);\r
- vEUp = _mm_srli_si128 (vEUp, 15);\r
- vEUp = _mm_or_si128 (vEUp, vTemp);\r
-\r
- vTemp = _mm_slli_si128 (vH, 1);\r
- vHUp1 = _mm_srli_si128 (vHUp1, 15);\r
- vHUp1 = _mm_or_si128 (vHUp1, vTemp);\r
-\r
- // do the dynamic programming\r
-\r
- // update vE value\r
- vE = _mm_subs_epu8 (vEUp, vGapExtend);\r
- vTemp = _mm_subs_epu8 (vHUp1, vGapOpen);\r
- vE = _mm_max_epu8 (vE, vTemp);\r
-\r
- // update vF value\r
- vF = _mm_subs_epu8 (vF, vGapExtend);\r
- vTemp = _mm_subs_epu8 (vH, vGapOpen);\r
- vF = _mm_max_epu8 (vF, vTemp);\r
-\r
- // add score to vH\r
- vH = _mm_adds_epu8 (vHUp2, *((__m128i *) pScore));\r
- vH = _mm_subs_epu8 (vH, vBias);\r
-\r
- // set vH to max of vH, vE, vF\r
- vH = _mm_max_epu8 (vH, vE);\r
- vH = _mm_max_epu8 (vH, vF);\r
-\r
- // Save value to use for next diagonal vH\r
- vHUp2 = vHUp1;\r
-\r
- // Update highest score encountered this far\r
- vMaxScore = _mm_max_epu8 (vMaxScore, vH);\r
- }\r
-\r
- for (l = 0; j < queryLength; ++j, ++l)\r
- {\r
- for (k = 0; k < 16; ++k) {\r
- int matrixOffset = *(dbSeq + k) * MATRIX_ROW_SIZE;\r
- pScore[k] = *(pMatrix + matrixOffset + *(querySeq + j - k));\r
- }\r
-\r
- // load values of vE and vH from previous row (one unit up)\r
- vEUp = _mm_load_si128 (pvEStore + j);\r
- vHUp1 = _mm_load_si128 (pvHStore + j);\r
-\r
- // save old values of vE and vH to use on next row\r
- _mm_store_si128 (pvEStore + l, vE);\r
- _mm_store_si128 (pvHStore + l, vH);\r
- \r
- // shift into place so we have complete vE and vH vectors\r
- // that refer to the values one unit up from each cell\r
- // that we are currently working on.\r
- vTemp = _mm_slli_si128 (vE, 1);\r
- vEUp = _mm_srli_si128 (vEUp, 15);\r
- vEUp = _mm_or_si128 (vEUp, vTemp);\r
-\r
- vTemp = _mm_slli_si128 (vH, 1);\r
- vHUp1 = _mm_srli_si128 (vHUp1, 15);\r
- vHUp1 = _mm_or_si128 (vHUp1, vTemp);\r
-\r
- // do the dynamic programming\r
-\r
- // update vE value\r
- vE = _mm_subs_epu8 (vEUp, vGapExtend);\r
- vTemp = _mm_subs_epu8 (vHUp1, vGapOpen);\r
- vE = _mm_max_epu8 (vE, vTemp);\r
-\r
- // update vF value\r
- vF = _mm_subs_epu8 (vF, vGapExtend);\r
- vTemp = _mm_subs_epu8 (vH, vGapOpen);\r
- vF = _mm_max_epu8 (vF, vTemp);\r
-\r
- // add score to vH\r
- vH = _mm_adds_epu8(vHUp2, vScore);\r
- vH = _mm_subs_epu8 (vH, vBias);\r
-\r
- // set vH to max of vH, vE, vF\r
- vH = _mm_max_epu8 (vH, vE);\r
- vH = _mm_max_epu8 (vH, vF);\r
-\r
- // Save value to use for next diagonal vH\r
- vHUp2 = vHUp1;\r
-\r
- // Update highest score encountered this far\r
- vMaxScore = _mm_max_epu8 (vMaxScore, vH);\r
- }\r
-\r
- for (m = 0 ; m < 15; ++j, ++l, ++m)\r
- {\r
- for (k = 0; k <= m; ++k) {\r
- pScore[k] = (char) bias;\r
- }\r
- for ( ; k < 16; ++k) {\r
- int matrixOffset = *(dbSeq + k) * MATRIX_ROW_SIZE;\r
- pScore[k] = *(pMatrix + matrixOffset + *(querySeq + j - k));\r
- }\r
-\r
- // save old values of vE and vH to use on next row\r
- _mm_store_si128 (pvEStore + l, vE);\r
- _mm_store_si128 (pvHStore + l, vH);\r
-\r
- // v_score_load contains all zeros\r
- vEUp = _mm_slli_si128 (vE, 1);\r
- vHUp1 = _mm_slli_si128 (vH, 1);\r
-\r
- // do the dynamic programming\r
-\r
- // update vE value\r
- vE = _mm_subs_epu8 (vEUp, vGapExtend);\r
- vTemp = _mm_subs_epu8 (vHUp1, vGapOpen);\r
- vE = _mm_max_epu8 (vE, vTemp);\r
-\r
- // update vF value\r
- vF = _mm_subs_epu8 (vF, vGapExtend);\r
- vTemp = _mm_subs_epu8 (vH, vGapOpen);\r
- vF = _mm_max_epu8 (vF, vTemp);\r
-\r
- // add score to vH\r
- vH = _mm_adds_epu8 (vHUp2, vScore);\r
- vH = _mm_subs_epu8 (vH, vBias);\r
-\r
- // set vH to max of vH, vE, vF\r
- vH = _mm_max_epu8 (vH, vE);\r
- vH = _mm_max_epu8 (vH, vF);\r
-\r
- // Save value to use for next diagonal vH\r
- vHUp2 = vHUp1;\r
-\r
- // Update highest score encountered this far\r
- vMaxScore = _mm_max_epu8(vMaxScore,vH);\r
- }\r
-\r
- _mm_store_si128 (pvEStore + l, vE);\r
- _mm_store_si128 (pvHStore + l, vH);\r
- }\r
-\r
- // find largest score in the vMaxScore vector\r
- vTemp = _mm_srli_si128 (vMaxScore, 8);\r
- vMaxScore = _mm_max_epu8 (vMaxScore, vTemp);\r
- vTemp = _mm_srli_si128 (vMaxScore, 4);\r
- vMaxScore = _mm_max_epu8 (vMaxScore, vTemp);\r
- vTemp = _mm_srli_si128 (vMaxScore, 2);\r
- vMaxScore = _mm_max_epu8 (vMaxScore, vTemp);\r
- vTemp = _mm_srli_si128 (vMaxScore, 1);\r
- vMaxScore = _mm_max_epu8 (vMaxScore, vTemp);\r
-\r
- // store in temporary variable\r
- score = (short) _mm_extract_epi16 (vMaxScore, 0);\r
- score = score & 0x00ff;\r
-\r
- // check if we might have overflowed\r
- if (score + bias >= 255)\r
- {\r
- score = 255;\r
- }\r
-\r
-\r
- // return largest score\r
- return score;\r
-}\r