1 /******************************************************************
\r
2 Copyright 2006 by Michael Farrar. All rights reserved.
\r
3 This program may not be sold or incorporated into a commercial product,
\r
4 in whole or in part, without written consent of Michael Farrar. For
\r
5 further information regarding permission for use or reproduction, please
\r
6 contact: Michael Farrar at farrar.michael@gmail.com.
\r
7 *******************************************************************/
\r
10 Written by Michael Farrar, 2006.
\r
11 Please send bug reports and/or suggestions to farrar.michael@gmail.com.
\r
16 #include <emmintrin.h>
\r
19 #include "swstriped.h"
\r
22 swStripedInit(unsigned char *querySeq,
\r
24 signed char *matrix)
\r
41 signed char *matrixRow;
\r
45 SwStripedData *pSwData;
\r
47 lenQryByte = (queryLength + 15) / 16;
\r
48 lenQryShort = (queryLength + 7) / 8;
\r
50 pSwData = (SwStripedData *) malloc (sizeof (SwStripedData));
\r
52 fprintf (stderr, "Unable to allocate memory for SW data\n");
\r
56 nCount = 64 + /* slack bytes */
\r
57 lenQryByte * ALPHA_SIZE + /* query profile byte */
\r
58 lenQryShort * ALPHA_SIZE + /* query profile short */
\r
59 (lenQryShort * 3); /* vH1, vH2 and vE */
\r
61 pSwData->pData = (unsigned char *) calloc (nCount, sizeof (__m128i));
\r
62 if (!pSwData->pData) {
\r
63 fprintf (stderr, "Unable to allocate memory for SW data buffers\n");
\r
67 /* since we might port this to another platform, lets align the data */
\r
68 /* to 16 byte boundries ourselves */
\r
69 aligned = ((size_t) pSwData->pData + 15) & ~(0x0f);
\r
71 pSwData->pvbQueryProf = (__m128i *) aligned;
\r
72 pSwData->pvsQueryProf = pSwData->pvbQueryProf + lenQryByte * ALPHA_SIZE;
\r
74 pSwData->pvH1 = pSwData->pvsQueryProf + lenQryShort * ALPHA_SIZE;
\r
75 pSwData->pvH2 = pSwData->pvH1 + lenQryShort;
\r
76 pSwData->pvE = pSwData->pvH2 + lenQryShort;
\r
78 /* Use a scoring profile for the SSE2 implementation, but the layout
\r
79 * is a bit strange. The scoring profile is parallel to the query, but is
\r
80 * accessed in a stripped pattern. The query is divided into equal length
\r
81 * segments. The number of segments is equal to the number of elements
\r
82 * processed in the SSE2 register. For 8-bit calculations, the query will
\r
83 * be divided into 16 equal length parts. If the query is not long enough
\r
84 * to fill the last segment, it will be filled with neutral weights. The
\r
85 * first element in the SSE register will hold a value from the first segment,
\r
86 * the second element of the SSE register will hold a value from the
\r
87 * second segment and so on. So if the query length is 288, then each
\r
88 * segment will have a length of 18. So the first 16 bytes will have
\r
89 * the following weights: Q1, Q19, Q37, ... Q271; the next 16 bytes will
\r
90 * have the following weights: Q2, Q20, Q38, ... Q272; and so on until
\r
91 * all parts of all segments have been written. The last seqment will
\r
92 * have the following weights: Q18, Q36, Q54, ... Q288. This will be
\r
93 * done for the entire alphabet.
\r
96 /* Find the bias to use in the substitution matrix */
\r
98 for (i = 0; i < ALPHA_SIZE * ALPHA_SIZE; i++) {
\r
99 if (matrix[i] < bias) {
\r
107 /* Fill in the byte query profile */
\r
108 pc = (char *) pSwData->pvbQueryProf;
\r
109 segSize = (queryLength + 15) / 16;
\r
110 nCount = segSize * 16;
\r
111 for (i = 0; i < ALPHA_SIZE; ++i) {
\r
112 matrixRow = matrix + i * ALPHA_SIZE;
\r
113 for (j = 0; j < segSize; ++j) {
\r
114 for (k = j; k < nCount; k += segSize) {
\r
115 if (k >= queryLength) {
\r
118 weight = matrixRow[*(querySeq + k)];
\r
120 *pc++ = (char) (weight - bias);
\r
125 /* Fill in the short query profile */
\r
126 ps = (short *) pSwData->pvsQueryProf;
\r
127 segSize = (queryLength + 7) / 8;
\r
128 nCount = segSize * 8;
\r
129 for (i = 0; i < ALPHA_SIZE; ++i) {
\r
130 matrixRow = matrix + i * ALPHA_SIZE;
\r
131 for (j = 0; j < segSize; ++j) {
\r
132 for (k = j; k < nCount; k += segSize) {
\r
133 if (k >= queryLength) {
\r
136 weight = matrixRow[*(querySeq + k)];
\r
138 *ps++ = (unsigned short) weight;
\r
143 pSwData->bias = (unsigned short) -bias;
\r
148 void swStripedScan (unsigned char *querySeq,
\r
152 SEARCH_OPTIONS *options,
\r
153 SCORE_LIST *scores)
\r
157 int threshold = options->threshold;
\r
159 unsigned char *dbSeq;
\r
162 int gapInit = -(options->gapInit + options->gapExt);
\r
163 int gapExt = -options->gapExt;
\r
165 SwStripedData *stripedData = (SwStripedData *) swData;
\r
167 dbSeq = nextSeq (dbLib, &dbLen);
\r
168 while (dbLen > 0) {
\r
170 score = swStripedByte (querySeq, queryLength,
\r
173 stripedData->pvbQueryProf,
\r
177 stripedData->bias);
\r
179 /* check if needs a run with higher precision */
\r
180 if (score >= 255) {
\r
181 score = swStripedWord (querySeq, queryLength,
\r
184 stripedData->pvsQueryProf,
\r
190 if (score >= threshold) {
\r
191 int minScore = insertList (scores, score, seqName (dbLib));
\r
192 if (minScore >= threshold) {
\r
193 threshold = minScore;
\r
197 dbSeq = nextSeq (dbLib, &dbLen);
\r
202 swStripedComplete(void *pSwData)
\r
204 SwStripedData *pStripedData = (SwStripedData *) pSwData;
\r
206 free (pStripedData->pData);
\r
207 free (pStripedData);
\r
211 swStripedWord(unsigned char *querySeq,
\r
213 unsigned char *dbSeq,
\r
215 unsigned short gapOpen,
\r
216 unsigned short gapExtend,
\r
217 __m128i *pvQueryProf,
\r
226 int iter = (queryLength + 7) / 8;
\r
230 __m128i vE, vF, vH;
\r
234 __m128i vGapExtend;
\r
242 /* remove unreferenced warning */
\r
245 /* Load gap opening penalty to all elements of a constant */
\r
246 vGapOpen = _mm_insert_epi16 (vGapOpen, gapOpen, 0);
\r
247 vGapOpen = _mm_shufflelo_epi16 (vGapOpen, 0);
\r
248 vGapOpen = _mm_shuffle_epi32 (vGapOpen, 0);
\r
250 /* Load gap extension penalty to all elements of a constant */
\r
251 vGapExtend = _mm_insert_epi16 (vGapExtend, gapExtend, 0);
\r
252 vGapExtend = _mm_shufflelo_epi16 (vGapExtend, 0);
\r
253 vGapExtend = _mm_shuffle_epi32 (vGapExtend, 0);
\r
255 /* load vMaxScore with the zeros. since we are using signed */
\r
256 /* math, we will bias the maxscore to -32768 so we have the */
\r
257 /* full range of the short. */
\r
258 vMaxScore = _mm_cmpeq_epi16 (vMaxScore, vMaxScore);
\r
259 vMaxScore = _mm_slli_epi16 (vMaxScore, 15);
\r
261 vMinimums = _mm_shuffle_epi32 (vMaxScore, 0);
\r
263 vMin = _mm_shuffle_epi32 (vMaxScore, 0);
\r
264 vMin = _mm_srli_si128 (vMin, 14);
\r
266 /* Zero out the storage vector */
\r
267 for (i = 0; i < iter; i++)
\r
269 _mm_store_si128 (pvE + i, vMaxScore);
\r
270 _mm_store_si128 (pvHStore + i, vMaxScore);
\r
273 for (i = 0; i < dbLength; ++i)
\r
275 /* fetch first data asap. */
\r
276 pvScore = pvQueryProf + dbSeq[i] * iter;
\r
279 vF = _mm_cmpeq_epi16 (vF, vF);
\r
280 vF = _mm_slli_epi16 (vF, 15);
\r
282 /* load the next h value */
\r
283 vH = _mm_load_si128 (pvHStore + iter - 1);
\r
284 vH = _mm_slli_si128 (vH, 2);
\r
285 vH = _mm_or_si128 (vH, vMin);
\r
288 pvHLoad = pvHStore;
\r
291 for (j = 0; j < iter; j++)
\r
293 /* load values of vF and vH from previous row (one unit up) */
\r
294 vE = _mm_load_si128 (pvE + j);
\r
296 /* add score to vH */
\r
297 vH = _mm_adds_epi16 (vH, *pvScore++);
\r
299 /* Update highest score encountered this far */
\r
300 vMaxScore = _mm_max_epi16 (vMaxScore, vH);
\r
302 /* get max from vH, vE and vF */
\r
303 vH = _mm_max_epi16 (vH, vE);
\r
304 vH = _mm_max_epi16 (vH, vF);
\r
306 /* save vH values */
\r
307 _mm_store_si128 (pvHStore + j, vH);
\r
309 /* update vE value */
\r
310 vH = _mm_subs_epi16 (vH, vGapOpen);
\r
311 vE = _mm_subs_epi16 (vE, vGapExtend);
\r
312 vE = _mm_max_epi16 (vE, vH);
\r
314 /* update vF value */
\r
315 vF = _mm_subs_epi16 (vF, vGapExtend);
\r
316 vF = _mm_max_epi16 (vF, vH);
\r
318 /* save vE values */
\r
319 _mm_store_si128 (pvE + j, vE);
\r
321 /* load the next h value */
\r
322 vH = _mm_load_si128 (pvHLoad + j);
\r
325 /* reset pointers to the start of the saved data */
\r
327 vH = _mm_load_si128 (pvHStore + j);
\r
329 /* the computed vF value is for the given column. since */
\r
330 /* we are at the end, we need to shift the vF value over */
\r
331 /* to the next column. */
\r
332 vF = _mm_slli_si128 (vF, 2);
\r
333 vF = _mm_or_si128 (vF, vMin);
\r
334 vTemp = _mm_subs_epi16 (vH, vGapOpen);
\r
335 vTemp = _mm_cmpgt_epi16 (vF, vTemp);
\r
336 cmp = _mm_movemask_epi8 (vTemp);
\r
337 while (cmp != 0x0000)
\r
339 vE = _mm_load_si128 (pvE + j);
\r
341 vH = _mm_max_epi16 (vH, vF);
\r
343 /* save vH values */
\r
344 _mm_store_si128 (pvHStore + j, vH);
\r
346 /* update vE incase the new vH value would change it */
\r
347 vH = _mm_subs_epi16 (vH, vGapOpen);
\r
348 vE = _mm_max_epi16 (vE, vH);
\r
349 _mm_store_si128 (pvE + j, vE);
\r
351 /* update vF value */
\r
352 vF = _mm_subs_epi16 (vF, vGapExtend);
\r
358 vF = _mm_slli_si128 (vF, 2);
\r
359 vF = _mm_or_si128 (vF, vMin);
\r
362 vH = _mm_load_si128 (pvHStore + j);
\r
364 vTemp = _mm_subs_epi16 (vH, vGapOpen);
\r
365 vTemp = _mm_cmpgt_epi16 (vF, vTemp);
\r
366 cmp = _mm_movemask_epi8 (vTemp);
\r
370 /* find largest score in the vMaxScore vector */
\r
371 vTemp = _mm_srli_si128 (vMaxScore, 8);
\r
372 vMaxScore = _mm_max_epi16 (vMaxScore, vTemp);
\r
373 vTemp = _mm_srli_si128 (vMaxScore, 4);
\r
374 vMaxScore = _mm_max_epi16 (vMaxScore, vTemp);
\r
375 vTemp = _mm_srli_si128 (vMaxScore, 2);
\r
376 vMaxScore = _mm_max_epi16 (vMaxScore, vTemp);
\r
378 /* store in temporary variable */
\r
379 score = (short) _mm_extract_epi16 (vMaxScore, 0);
\r
381 /* return largest score */
\r
382 return score + SHORT_BIAS;
\r
389 swStripedByte(unsigned char *querySeq,
\r
391 unsigned char *dbSeq,
\r
393 unsigned short gapOpen,
\r
394 unsigned short gapExtend,
\r
395 __m128i *pvQueryProf,
\r
399 unsigned short bias)
\r
406 int iter = (queryLength + 15) / 16;
\r
410 __m128i vE, vF, vH;
\r
415 __m128i vGapExtend;
\r
422 /* remove unreferenced warning */
\r
425 /* Load the bias to all elements of a constant */
\r
426 dup = (bias << 8) | (bias & 0x00ff);
\r
427 vBias = _mm_insert_epi16 (vBias, dup, 0);
\r
428 vBias = _mm_shufflelo_epi16 (vBias, 0);
\r
429 vBias = _mm_shuffle_epi32 (vBias, 0);
\r
431 /* Load gap opening penalty to all elements of a constant */
\r
432 dup = (gapOpen << 8) | (gapOpen & 0x00ff);
\r
433 vGapOpen = _mm_insert_epi16 (vGapOpen, dup, 0);
\r
434 vGapOpen = _mm_shufflelo_epi16 (vGapOpen, 0);
\r
435 vGapOpen = _mm_shuffle_epi32 (vGapOpen, 0);
\r
437 /* Load gap extension penalty to all elements of a constant */
\r
438 dup = (gapExtend << 8) | (gapExtend & 0x00ff);
\r
439 vGapExtend = _mm_insert_epi16 (vGapExtend, dup, 0);
\r
440 vGapExtend = _mm_shufflelo_epi16 (vGapExtend, 0);
\r
441 vGapExtend = _mm_shuffle_epi32 (vGapExtend, 0);
\r
443 vMaxScore = _mm_xor_si128 (vMaxScore, vMaxScore);
\r
445 vZero = _mm_xor_si128 (vZero, vZero);
\r
447 /* Zero out the storage vector */
\r
448 for (i = 0; i < iter; i++)
\r
450 _mm_store_si128 (pvE + i, vMaxScore);
\r
451 _mm_store_si128 (pvHStore + i, vMaxScore);
\r
454 for (i = 0; i < dbLength; ++i)
\r
456 /* fetch first data asap. */
\r
457 pvScore = pvQueryProf + dbSeq[i] * iter;
\r
460 vF = _mm_xor_si128 (vF, vF);
\r
462 /* load the next h value */
\r
463 vH = _mm_load_si128 (pvHStore + iter - 1);
\r
464 vH = _mm_slli_si128 (vH, 1);
\r
467 pvHLoad = pvHStore;
\r
470 for (j = 0; j < iter; j++)
\r
472 /* load values of vF and vH from previous row (one unit up) */
\r
473 vE = _mm_load_si128 (pvE + j);
\r
475 /* add score to vH */
\r
476 vH = _mm_adds_epu8 (vH, *(pvScore + j));
\r
477 vH = _mm_subs_epu8 (vH, vBias);
\r
479 /* Update highest score encountered this far */
\r
480 vMaxScore = _mm_max_epu8 (vMaxScore, vH);
\r
482 /* get max from vH, vE and vF */
\r
483 vH = _mm_max_epu8 (vH, vE);
\r
484 vH = _mm_max_epu8 (vH, vF);
\r
486 /* save vH values */
\r
487 _mm_store_si128 (pvHStore + j, vH);
\r
489 /* update vE value */
\r
490 vH = _mm_subs_epu8 (vH, vGapOpen);
\r
491 vE = _mm_subs_epu8 (vE, vGapExtend);
\r
492 vE = _mm_max_epu8 (vE, vH);
\r
494 /* update vF value */
\r
495 vF = _mm_subs_epu8 (vF, vGapExtend);
\r
496 vF = _mm_max_epu8 (vF, vH);
\r
498 /* save vE values */
\r
499 _mm_store_si128 (pvE + j, vE);
\r
501 /* load the next h value */
\r
502 vH = _mm_load_si128 (pvHLoad + j);
\r
505 /* reset pointers to the start of the saved data */
\r
507 vH = _mm_load_si128 (pvHStore + j);
\r
509 /* the computed vF value is for the given column. since */
\r
510 /* we are at the end, we need to shift the vF value over */
\r
511 /* to the next column. */
\r
512 vF = _mm_slli_si128 (vF, 1);
\r
513 vTemp = _mm_subs_epu8 (vH, vGapOpen);
\r
514 vTemp = _mm_subs_epu8 (vF, vTemp);
\r
515 vTemp = _mm_cmpeq_epi8 (vTemp, vZero);
\r
516 cmp = _mm_movemask_epi8 (vTemp);
\r
518 while (cmp != 0xffff)
\r
520 vE = _mm_load_si128 (pvE + j);
\r
522 vH = _mm_max_epu8 (vH, vF);
\r
524 /* save vH values */
\r
525 _mm_store_si128 (pvHStore + j, vH);
\r
527 /* update vE incase the new vH value would change it */
\r
528 vH = _mm_subs_epu8 (vH, vGapOpen);
\r
529 vE = _mm_max_epu8 (vE, vH);
\r
530 _mm_store_si128 (pvE + j, vE);
\r
532 /* update vF value */
\r
533 vF = _mm_subs_epu8 (vF, vGapExtend);
\r
539 vF = _mm_slli_si128 (vF, 1);
\r
542 vH = _mm_load_si128 (pvHStore + j);
\r
544 vTemp = _mm_subs_epu8 (vH, vGapOpen);
\r
545 vTemp = _mm_subs_epu8 (vF, vTemp);
\r
546 vTemp = _mm_cmpeq_epi8 (vTemp, vZero);
\r
547 cmp = _mm_movemask_epi8 (vTemp);
\r
551 /* find largest score in the vMaxScore vector */
\r
552 vTemp = _mm_srli_si128 (vMaxScore, 8);
\r
553 vMaxScore = _mm_max_epu8 (vMaxScore, vTemp);
\r
554 vTemp = _mm_srli_si128 (vMaxScore, 4);
\r
555 vMaxScore = _mm_max_epu8 (vMaxScore, vTemp);
\r
556 vTemp = _mm_srli_si128 (vMaxScore, 2);
\r
557 vMaxScore = _mm_max_epu8 (vMaxScore, vTemp);
\r
558 vTemp = _mm_srli_si128 (vMaxScore, 1);
\r
559 vMaxScore = _mm_max_epu8 (vMaxScore, vTemp);
\r
561 /* store in temporary variable */
\r
562 score = _mm_extract_epi16 (vMaxScore, 0);
\r
563 score = score & 0x00ff;
\r
565 /* check if we might have overflowed */
\r
566 if (score + bias >= 255)
\r
571 /* return largest score */
\r