+++ /dev/null
-/******************************************************************\r
- Copyright 2006 by Michael Farrar. All rights reserved.\r
- This program may not be sold or incorporated into a commercial product,\r
- in whole or in part, without written consent of Michael Farrar. For \r
- further information regarding permission for use or reproduction, please \r
- contact: Michael Farrar at farrar.michael@gmail.com.\r
-*******************************************************************/\r
-\r
-/*\r
- Written by Michael Farrar, 2006.\r
- Please send bug reports and/or suggestions to farrar.michael@gmail.com.\r
-*/\r
-\r
-#include <stdio.h>\r
-#include <stdlib.h>\r
-#include <emmintrin.h>\r
-\r
-#include "swsse2.h"\r
-#include "swstriped.h"\r
-\r
-void *\r
-swStripedInit(unsigned char *querySeq,\r
- int queryLength,\r
- signed char *matrix)\r
-{\r
- int i, j, k;\r
-\r
- int segSize;\r
- int nCount;\r
-\r
- int bias;\r
-\r
- int lenQryByte;\r
- int lenQryShort;\r
-\r
- int weight;\r
-\r
- short *ps;\r
- char *pc;\r
-\r
- signed char *matrixRow;\r
-\r
- size_t aligned;\r
-\r
- SwStripedData *pSwData;\r
- \r
- lenQryByte = (queryLength + 15) / 16;\r
- lenQryShort = (queryLength + 7) / 8;\r
-\r
- pSwData = (SwStripedData *) malloc (sizeof (SwStripedData));\r
- if (!pSwData) {\r
- fprintf (stderr, "Unable to allocate memory for SW data\n");\r
- exit (-1);\r
- }\r
-\r
- nCount = 64 + /* slack bytes */\r
- lenQryByte * ALPHA_SIZE + /* query profile byte */\r
- lenQryShort * ALPHA_SIZE + /* query profile short */\r
- (lenQryShort * 3); /* vH1, vH2 and vE */\r
-\r
- pSwData->pData = (unsigned char *) calloc (nCount, sizeof (__m128i));\r
- if (!pSwData->pData) {\r
- fprintf (stderr, "Unable to allocate memory for SW data buffers\n");\r
- exit (-1);\r
- }\r
-\r
- /* since we might port this to another platform, lets align the data */\r
- /* to 16 byte boundries ourselves */\r
- aligned = ((size_t) pSwData->pData + 15) & ~(0x0f);\r
-\r
- pSwData->pvbQueryProf = (__m128i *) aligned;\r
- pSwData->pvsQueryProf = pSwData->pvbQueryProf + lenQryByte * ALPHA_SIZE;\r
-\r
- pSwData->pvH1 = pSwData->pvsQueryProf + lenQryShort * ALPHA_SIZE;\r
- pSwData->pvH2 = pSwData->pvH1 + lenQryShort;\r
- pSwData->pvE = pSwData->pvH2 + lenQryShort;\r
-\r
- /* Use a scoring profile for the SSE2 implementation, but the layout\r
- * is a bit strange. The scoring profile is parallel to the query, but is\r
- * accessed in a stripped pattern. The query is divided into equal length\r
- * segments. The number of segments is equal to the number of elements\r
- * processed in the SSE2 register. For 8-bit calculations, the query will\r
- * be divided into 16 equal length parts. If the query is not long enough\r
- * to fill the last segment, it will be filled with neutral weights. The\r
- * first element in the SSE register will hold a value from the first segment,\r
- * the second element of the SSE register will hold a value from the\r
- * second segment and so on. So if the query length is 288, then each\r
- * segment will have a length of 18. So the first 16 bytes will have\r
- * the following weights: Q1, Q19, Q37, ... Q271; the next 16 bytes will\r
- * have the following weights: Q2, Q20, Q38, ... Q272; and so on until\r
- * all parts of all segments have been written. The last seqment will\r
- * have the following weights: Q18, Q36, Q54, ... Q288. This will be\r
- * done for the entire alphabet.\r
- */\r
-\r
- /* Find the bias to use in the substitution matrix */\r
- bias = 127;\r
- for (i = 0; i < ALPHA_SIZE * ALPHA_SIZE; i++) {\r
- if (matrix[i] < bias) {\r
- bias = matrix[i];\r
- }\r
- }\r
- if (bias > 0) {\r
- bias = 0;\r
- }\r
-\r
- /* Fill in the byte query profile */\r
- pc = (char *) pSwData->pvbQueryProf;\r
- segSize = (queryLength + 15) / 16;\r
- nCount = segSize * 16;\r
- for (i = 0; i < ALPHA_SIZE; ++i) {\r
- matrixRow = matrix + i * ALPHA_SIZE;\r
- for (j = 0; j < segSize; ++j) {\r
- for (k = j; k < nCount; k += segSize) {\r
- if (k >= queryLength) {\r
- weight = 0;\r
- } else {\r
- weight = matrixRow[*(querySeq + k)];\r
- }\r
- *pc++ = (char) (weight - bias);\r
- }\r
- }\r
- }\r
-\r
- /* Fill in the short query profile */\r
- ps = (short *) pSwData->pvsQueryProf;\r
- segSize = (queryLength + 7) / 8;\r
- nCount = segSize * 8;\r
- for (i = 0; i < ALPHA_SIZE; ++i) {\r
- matrixRow = matrix + i * ALPHA_SIZE;\r
- for (j = 0; j < segSize; ++j) {\r
- for (k = j; k < nCount; k += segSize) {\r
- if (k >= queryLength) {\r
- weight = 0;\r
- } else {\r
- weight = matrixRow[*(querySeq + k)];\r
- }\r
- *ps++ = (unsigned short) weight;\r
- }\r
- }\r
- }\r
-\r
- pSwData->bias = (unsigned short) -bias;\r
-\r
- return pSwData;\r
-}\r
-\r
-void swStripedScan (unsigned char *querySeq,\r
- int queryLength,\r
- FASTA_LIB *dbLib,\r
- void *swData,\r
- SEARCH_OPTIONS *options,\r
- SCORE_LIST *scores)\r
-{\r
- int score;\r
-\r
- int threshold = options->threshold;\r
-\r
- unsigned char *dbSeq;\r
- int dbLen;\r
-\r
- int gapInit = -(options->gapInit + options->gapExt);\r
- int gapExt = -options->gapExt;\r
-\r
- SwStripedData *stripedData = (SwStripedData *) swData;\r
-\r
- dbSeq = nextSeq (dbLib, &dbLen);\r
- while (dbLen > 0) {\r
-\r
- score = swStripedByte (querySeq, queryLength, \r
- dbSeq, dbLen, \r
- gapInit, gapExt, \r
- stripedData->pvbQueryProf,\r
- stripedData->pvH1,\r
- stripedData->pvH2,\r
- stripedData->pvE,\r
- stripedData->bias);\r
-\r
- /* check if needs a run with higher precision */\r
- if (score >= 255) {\r
- score = swStripedWord (querySeq, queryLength, \r
- dbSeq, dbLen, \r
- gapInit, gapExt, \r
- stripedData->pvsQueryProf,\r
- stripedData->pvH1,\r
- stripedData->pvH2,\r
- stripedData->pvE);\r
- }\r
-\r
- if (score >= threshold) {\r
- int minScore = insertList (scores, score, seqName (dbLib));\r
- if (minScore >= threshold) {\r
- threshold = minScore;\r
- }\r
- }\r
-\r
- dbSeq = nextSeq (dbLib, &dbLen);\r
- }\r
-}\r
-\r
-void\r
-swStripedComplete(void *pSwData)\r
-{\r
- SwStripedData *pStripedData = (SwStripedData *) pSwData;\r
-\r
- free (pStripedData->pData);\r
- free (pStripedData);\r
-}\r
-\r
-int\r
-swStripedWord(unsigned char *querySeq,\r
- int queryLength,\r
- unsigned char *dbSeq,\r
- int dbLength,\r
- unsigned short gapOpen,\r
- unsigned short gapExtend,\r
- __m128i *pvQueryProf,\r
- __m128i *pvHLoad,\r
- __m128i *pvHStore,\r
- __m128i *pvE)\r
-{\r
- int i, j;\r
- int score;\r
-\r
- int cmp;\r
- int iter = (queryLength + 7) / 8;\r
- \r
- __m128i *pv;\r
-\r
- __m128i vE, vF, vH;\r
-\r
- __m128i vMaxScore;\r
- __m128i vGapOpen;\r
- __m128i vGapExtend;\r
-\r
- __m128i vMin;\r
- __m128i vMinimums;\r
- __m128i vTemp;\r
-\r
- __m128i *pvScore;\r
-\r
- /* remove unreferenced warning */\r
- querySeq;\r
-\r
- /* Load gap opening penalty to all elements of a constant */\r
- vGapOpen = _mm_insert_epi16 (vGapOpen, gapOpen, 0);\r
- vGapOpen = _mm_shufflelo_epi16 (vGapOpen, 0);\r
- vGapOpen = _mm_shuffle_epi32 (vGapOpen, 0);\r
-\r
- /* Load gap extension penalty to all elements of a constant */\r
- vGapExtend = _mm_insert_epi16 (vGapExtend, gapExtend, 0);\r
- vGapExtend = _mm_shufflelo_epi16 (vGapExtend, 0);\r
- vGapExtend = _mm_shuffle_epi32 (vGapExtend, 0);\r
-\r
- /* load vMaxScore with the zeros. since we are using signed */\r
- /* math, we will bias the maxscore to -32768 so we have the */\r
- /* full range of the short. */\r
- vMaxScore = _mm_cmpeq_epi16 (vMaxScore, vMaxScore);\r
- vMaxScore = _mm_slli_epi16 (vMaxScore, 15);\r
-\r
- vMinimums = _mm_shuffle_epi32 (vMaxScore, 0);\r
-\r
- vMin = _mm_shuffle_epi32 (vMaxScore, 0);\r
- vMin = _mm_srli_si128 (vMin, 14);\r
-\r
- /* Zero out the storage vector */\r
- for (i = 0; i < iter; i++)\r
- {\r
- _mm_store_si128 (pvE + i, vMaxScore);\r
- _mm_store_si128 (pvHStore + i, vMaxScore);\r
- }\r
-\r
- for (i = 0; i < dbLength; ++i)\r
- {\r
- /* fetch first data asap. */\r
- pvScore = pvQueryProf + dbSeq[i] * iter;\r
-\r
- /* zero out F. */\r
- vF = _mm_cmpeq_epi16 (vF, vF);\r
- vF = _mm_slli_epi16 (vF, 15);\r
-\r
- /* load the next h value */\r
- vH = _mm_load_si128 (pvHStore + iter - 1);\r
- vH = _mm_slli_si128 (vH, 2);\r
- vH = _mm_or_si128 (vH, vMin);\r
-\r
- pv = pvHLoad;\r
- pvHLoad = pvHStore;\r
- pvHStore = pv;\r
-\r
- for (j = 0; j < iter; j++)\r
- {\r
- /* load values of vF and vH from previous row (one unit up) */\r
- vE = _mm_load_si128 (pvE + j);\r
-\r
- /* add score to vH */\r
- vH = _mm_adds_epi16 (vH, *pvScore++);\r
-\r
- /* Update highest score encountered this far */\r
- vMaxScore = _mm_max_epi16 (vMaxScore, vH);\r
-\r
- /* get max from vH, vE and vF */\r
- vH = _mm_max_epi16 (vH, vE);\r
- vH = _mm_max_epi16 (vH, vF);\r
-\r
- /* save vH values */\r
- _mm_store_si128 (pvHStore + j, vH);\r
-\r
- /* update vE value */\r
- vH = _mm_subs_epi16 (vH, vGapOpen);\r
- vE = _mm_subs_epi16 (vE, vGapExtend);\r
- vE = _mm_max_epi16 (vE, vH);\r
-\r
- /* update vF value */\r
- vF = _mm_subs_epi16 (vF, vGapExtend);\r
- vF = _mm_max_epi16 (vF, vH);\r
-\r
- /* save vE values */\r
- _mm_store_si128 (pvE + j, vE);\r
-\r
- /* load the next h value */\r
- vH = _mm_load_si128 (pvHLoad + j);\r
- }\r
-\r
- /* reset pointers to the start of the saved data */\r
- j = 0;\r
- vH = _mm_load_si128 (pvHStore + j);\r
-\r
- /* the computed vF value is for the given column. since */\r
- /* we are at the end, we need to shift the vF value over */\r
- /* to the next column. */\r
- vF = _mm_slli_si128 (vF, 2);\r
- vF = _mm_or_si128 (vF, vMin);\r
- vTemp = _mm_subs_epi16 (vH, vGapOpen);\r
- vTemp = _mm_cmpgt_epi16 (vF, vTemp);\r
- cmp = _mm_movemask_epi8 (vTemp);\r
- while (cmp != 0x0000) \r
- {\r
- vE = _mm_load_si128 (pvE + j);\r
-\r
- vH = _mm_max_epi16 (vH, vF);\r
-\r
- /* save vH values */\r
- _mm_store_si128 (pvHStore + j, vH);\r
-\r
- /* update vE incase the new vH value would change it */\r
- vH = _mm_subs_epi16 (vH, vGapOpen);\r
- vE = _mm_max_epi16 (vE, vH);\r
- _mm_store_si128 (pvE + j, vE);\r
-\r
- /* update vF value */\r
- vF = _mm_subs_epi16 (vF, vGapExtend);\r
-\r
- j++;\r
- if (j >= iter)\r
- {\r
- j = 0;\r
- vF = _mm_slli_si128 (vF, 2);\r
- vF = _mm_or_si128 (vF, vMin);\r
- }\r
-\r
- vH = _mm_load_si128 (pvHStore + j);\r
-\r
- vTemp = _mm_subs_epi16 (vH, vGapOpen);\r
- vTemp = _mm_cmpgt_epi16 (vF, vTemp);\r
- cmp = _mm_movemask_epi8 (vTemp);\r
- }\r
- }\r
-\r
- /* find largest score in the vMaxScore vector */\r
- vTemp = _mm_srli_si128 (vMaxScore, 8);\r
- vMaxScore = _mm_max_epi16 (vMaxScore, vTemp);\r
- vTemp = _mm_srli_si128 (vMaxScore, 4);\r
- vMaxScore = _mm_max_epi16 (vMaxScore, vTemp);\r
- vTemp = _mm_srli_si128 (vMaxScore, 2);\r
- vMaxScore = _mm_max_epi16 (vMaxScore, vTemp);\r
-\r
- /* store in temporary variable */\r
- score = (short) _mm_extract_epi16 (vMaxScore, 0);\r
-\r
- /* return largest score */\r
- return score + SHORT_BIAS;\r
-}\r
-\r
-\r
-\r
-\r
-int\r
-swStripedByte(unsigned char *querySeq,\r
- int queryLength,\r
- unsigned char *dbSeq,\r
- int dbLength,\r
- unsigned short gapOpen,\r
- unsigned short gapExtend,\r
- __m128i *pvQueryProf,\r
- __m128i *pvHLoad,\r
- __m128i *pvHStore,\r
- __m128i *pvE,\r
- unsigned short bias)\r
-{\r
- int i, j;\r
- int score;\r
-\r
- int dup;\r
- int cmp;\r
- int iter = (queryLength + 15) / 16;\r
- \r
- __m128i *pv;\r
-\r
- __m128i vE, vF, vH;\r
-\r
- __m128i vMaxScore;\r
- __m128i vBias;\r
- __m128i vGapOpen;\r
- __m128i vGapExtend;\r
-\r
- __m128i vTemp;\r
- __m128i vZero;\r
-\r
- __m128i *pvScore;\r
-\r
- /* remove unreferenced warning */\r
- querySeq;\r
-\r
- /* Load the bias to all elements of a constant */\r
- dup = (bias << 8) | (bias & 0x00ff);\r
- vBias = _mm_insert_epi16 (vBias, dup, 0);\r
- vBias = _mm_shufflelo_epi16 (vBias, 0);\r
- vBias = _mm_shuffle_epi32 (vBias, 0);\r
-\r
- /* Load gap opening penalty to all elements of a constant */\r
- dup = (gapOpen << 8) | (gapOpen & 0x00ff);\r
- vGapOpen = _mm_insert_epi16 (vGapOpen, dup, 0);\r
- vGapOpen = _mm_shufflelo_epi16 (vGapOpen, 0);\r
- vGapOpen = _mm_shuffle_epi32 (vGapOpen, 0);\r
-\r
- /* Load gap extension penalty to all elements of a constant */\r
- dup = (gapExtend << 8) | (gapExtend & 0x00ff);\r
- vGapExtend = _mm_insert_epi16 (vGapExtend, dup, 0);\r
- vGapExtend = _mm_shufflelo_epi16 (vGapExtend, 0);\r
- vGapExtend = _mm_shuffle_epi32 (vGapExtend, 0);\r
-\r
- vMaxScore = _mm_xor_si128 (vMaxScore, vMaxScore);\r
-\r
- vZero = _mm_xor_si128 (vZero, vZero);\r
-\r
- /* Zero out the storage vector */\r
- for (i = 0; i < iter; i++)\r
- {\r
- _mm_store_si128 (pvE + i, vMaxScore);\r
- _mm_store_si128 (pvHStore + i, vMaxScore);\r
- }\r
-\r
- for (i = 0; i < dbLength; ++i)\r
- {\r
- /* fetch first data asap. */\r
- pvScore = pvQueryProf + dbSeq[i] * iter;\r
-\r
- /* zero out F. */\r
- vF = _mm_xor_si128 (vF, vF);\r
-\r
- /* load the next h value */\r
- vH = _mm_load_si128 (pvHStore + iter - 1);\r
- vH = _mm_slli_si128 (vH, 1);\r
-\r
- pv = pvHLoad;\r
- pvHLoad = pvHStore;\r
- pvHStore = pv;\r
-\r
- for (j = 0; j < iter; j++)\r
- {\r
- /* load values of vF and vH from previous row (one unit up) */\r
- vE = _mm_load_si128 (pvE + j);\r
-\r
- /* add score to vH */\r
- vH = _mm_adds_epu8 (vH, *(pvScore + j));\r
- vH = _mm_subs_epu8 (vH, vBias);\r
-\r
- /* Update highest score encountered this far */\r
- vMaxScore = _mm_max_epu8 (vMaxScore, vH);\r
-\r
- /* get max from vH, vE and vF */\r
- vH = _mm_max_epu8 (vH, vE);\r
- vH = _mm_max_epu8 (vH, vF);\r
-\r
- /* save vH values */\r
- _mm_store_si128 (pvHStore + j, vH);\r
-\r
- /* update vE value */\r
- vH = _mm_subs_epu8 (vH, vGapOpen);\r
- vE = _mm_subs_epu8 (vE, vGapExtend);\r
- vE = _mm_max_epu8 (vE, vH);\r
-\r
- /* update vF value */\r
- vF = _mm_subs_epu8 (vF, vGapExtend);\r
- vF = _mm_max_epu8 (vF, vH);\r
-\r
- /* save vE values */\r
- _mm_store_si128 (pvE + j, vE);\r
-\r
- /* load the next h value */\r
- vH = _mm_load_si128 (pvHLoad + j);\r
- }\r
-\r
- /* reset pointers to the start of the saved data */\r
- j = 0;\r
- vH = _mm_load_si128 (pvHStore + j);\r
-\r
- /* the computed vF value is for the given column. since */\r
- /* we are at the end, we need to shift the vF value over */\r
- /* to the next column. */\r
- vF = _mm_slli_si128 (vF, 1);\r
- vTemp = _mm_subs_epu8 (vH, vGapOpen);\r
- vTemp = _mm_subs_epu8 (vF, vTemp);\r
- vTemp = _mm_cmpeq_epi8 (vTemp, vZero);\r
- cmp = _mm_movemask_epi8 (vTemp);\r
-\r
- while (cmp != 0xffff) \r
- {\r
- vE = _mm_load_si128 (pvE + j);\r
-\r
- vH = _mm_max_epu8 (vH, vF);\r
-\r
- /* save vH values */\r
- _mm_store_si128 (pvHStore + j, vH);\r
-\r
- /* update vE incase the new vH value would change it */\r
- vH = _mm_subs_epu8 (vH, vGapOpen);\r
- vE = _mm_max_epu8 (vE, vH);\r
- _mm_store_si128 (pvE + j, vE);\r
-\r
- /* update vF value */\r
- vF = _mm_subs_epu8 (vF, vGapExtend);\r
-\r
- j++;\r
- if (j >= iter)\r
- {\r
- j = 0;\r
- vF = _mm_slli_si128 (vF, 1);\r
- }\r
-\r
- vH = _mm_load_si128 (pvHStore + j);\r
-\r
- vTemp = _mm_subs_epu8 (vH, vGapOpen);\r
- vTemp = _mm_subs_epu8 (vF, vTemp);\r
- vTemp = _mm_cmpeq_epi8 (vTemp, vZero);\r
- cmp = _mm_movemask_epi8 (vTemp);\r
- }\r
- }\r
-\r
- /* find largest score in the vMaxScore vector */\r
- vTemp = _mm_srli_si128 (vMaxScore, 8);\r
- vMaxScore = _mm_max_epu8 (vMaxScore, vTemp);\r
- vTemp = _mm_srli_si128 (vMaxScore, 4);\r
- vMaxScore = _mm_max_epu8 (vMaxScore, vTemp);\r
- vTemp = _mm_srli_si128 (vMaxScore, 2);\r
- vMaxScore = _mm_max_epu8 (vMaxScore, vTemp);\r
- vTemp = _mm_srli_si128 (vMaxScore, 1);\r
- vMaxScore = _mm_max_epu8 (vMaxScore, vTemp);\r
-\r
- /* store in temporary variable */\r
- score = _mm_extract_epi16 (vMaxScore, 0);\r
- score = score & 0x00ff;\r
-\r
- /* check if we might have overflowed */\r
- if (score + bias >= 255)\r
- {\r
- score = 255;\r
- }\r
-\r
- /* return largest score */\r
- return score;\r
-}\r