1 /******************************************************************
\r
2 Copyright 2006 by Michael Farrar. All rights reserved.
\r
3 This program may not be sold or incorporated into a commercial product,
\r
4 in whole or in part, without written consent of Michael Farrar. For
\r
5 further information regarding permission for use or reproduction, please
\r
6 contact: Michael Farrar at farrar.michael@gmail.com.
\r
7 *******************************************************************/
\r
9 /* Written by Michael Farrar, 2006.
\r
10 Please send bug reports and/or suggestions to farrar.michael@gmail.com.
\r
13 /* Implementation of the Wozniak "vertical" vectorization
\r
14 strategy for Smith-Waterman comparison, Wozniak (1997) Comp.
\r
15 Appl. Biosci. 13:145-150
\r
20 #include <emmintrin.h>
\r
23 #include "swwozniak.h"
\r
/* Number of entries in one row of a score table; rows are padded to this width. */
25 #define MATRIX_ROW_SIZE 32
\r
/* Entries in one score table: ALPHA_SIZE rows plus one extra row. */
26 #define MATRIX_SIZE (MATRIX_ROW_SIZE * (ALPHA_SIZE + 1))
\r
33 unsigned char *pData;
\r
34 unsigned short bias;
\r
38 swWozniakWord (unsigned char *querySeq,
\r
40 unsigned char *dbSeq,
\r
42 unsigned short gapOpen,
\r
43 unsigned short gapExtend,
\r
49 swWozniakByte (unsigned char *querySeq,
\r
51 unsigned char *dbSeq,
\r
53 unsigned short gapOpen,
\r
54 unsigned short gapExtend,
\r
58 unsigned short bias);
\r
/* Build the per-query working state used by the scan: a SwWozniakData */
/* record plus one zero-filled buffer that holds the byte score table, */
/* the short score table and the 16-byte aligned vH / vE row stores. */
61 swWozniakInit(unsigned char *querySeq,
\r
63 signed char *matrix)
\r
79 signed char *matrixRow;
\r
83 SwWozniakData *pSwData;
\r
/* query length in whole 16-lane (byte) and 8-lane (short) vectors, plus two */
85 lenQryByte = (queryLength + 15) / 16 + 2;
\r
86 lenQryShort = (queryLength + 7) / 8 + 2;
\r
88 pSwData = (SwWozniakData *) malloc (sizeof (SwWozniakData));
\r
90 fprintf (stderr, "Unable to allocate memory for SW data\n");
\r
/* buffer size, counted in __m128i units (see the calloc below) */
94 nCount = 64 + /* slack bytes */
\r
95 4 * (ALPHA_SIZE + 1) + /* byte matrix */
\r
96 4 * (ALPHA_SIZE + 1) + /* short matrix */
\r
97 ((queryLength + 16) * 2); /* vH and vE */
\r
100 pSwData->pData = (unsigned char *) calloc (nCount, sizeof (__m128i));
\r
101 if (!pSwData->pData) {
\r
102 fprintf (stderr, "Unable to allocate memory for SW data buffers\n");
\r
/* byte table sits at the start of the buffer, short table MATRIX_SIZE bytes later */
106 pSwData->pbMatrix = (char *) pSwData->pData;
\r
107 pSwData->psMatrix = (short *) (pSwData->pbMatrix + MATRIX_SIZE);
\r
109 /* since we might port this to another platform, let's align the data */
\r
110 /* to 16 byte boundaries ourselves */
\r
/* psMatrix is a short *, so this steps past MATRIX_SIZE shorts */
111 aligned = (size_t) (pSwData->psMatrix + MATRIX_SIZE);
\r
/* round up to the next multiple of 16 */
112 aligned = (aligned + 15) & ~(0x0f);
\r
/* the vE store starts queryLength + 16 vectors after the vH store */
114 pSwData->pvHStore = (__m128i *) aligned;
\r
115 pSwData->pvEStore = pSwData->pvHStore + queryLength + 16;
\r
117 /* Find the bias to use in the substitution matrix */
\r
/* NOTE(review): presumably keeps the smallest matrix entry in bias - confirm */
119 for (i = 0; i < ALPHA_SIZE * ALPHA_SIZE; i++) {
\r
120 if (matrix[i] < bias) {
\r
/* write cursors into the byte and short tables */
128 pc = pSwData->pbMatrix;
\r
129 ps = pSwData->psMatrix;
\r
131 for (i = 0; i < ALPHA_SIZE; i++) {
\r
132 matrixRow = matrix + i * ALPHA_SIZE;
\r
134 for (j = 0; j < ALPHA_SIZE; j++) {
\r
135 weight = matrixRow[j];
\r
/* the byte table stores each weight with the bias subtracted */
136 *pc++ = weight - bias;
\r
/* pad the remainder of the row out to MATRIX_ROW_SIZE entries */
140 for ( ; j < MATRIX_ROW_SIZE; j++) {
\r
146 /* add the weights for the NULL rows */
\r
147 for (j = 0; j < MATRIX_ROW_SIZE; j++) {
\r
/* the negated bias is what swWozniakScan later hands to swWozniakByte */
152 pSwData->bias = (unsigned short) -bias;
\r
/* Score the query against every sequence handed back by nextSeq() and */
/* pass each score that reaches the current threshold to insertList(). */
157 void swWozniakScan (unsigned char *querySeq,
\r
161 SEARCH_OPTIONS *options,
\r
162 SCORE_LIST *scores)
\r
166 int threshold = options->threshold;
\r
168 unsigned char *dbSeq;
\r
/* the option values are negated here; the opening cost also includes one extension */
171 int gapInit = -(options->gapInit + options->gapExt);
\r
172 int gapExt = -options->gapExt;
\r
174 SwWozniakData *wozniakData = (SwWozniakData *) swData;
\r
/* the loop ends when nextSeq() reports a length that is not positive */
176 dbSeq = nextSeq (dbLib, &dbLen);
\r
177 while (dbLen > 0) {
\r
/* first pass: 8-bit kernel with the byte table and the stored bias */
179 score = swWozniakByte (querySeq, queryLength,
\r
182 wozniakData->pbMatrix,
\r
183 wozniakData->pvHStore,
\r
184 wozniakData->pvEStore,
\r
185 wozniakData->bias);
\r
187 /* check if needs a run with higher precision */
\r
/* 255 is the largest value a byte lane can hold, so redo with the 16-bit kernel */
188 if (score >= 255) {
\r
189 score = swWozniakWord (querySeq, queryLength,
\r
192 wozniakData->psMatrix,
\r
193 wozniakData->pvHStore,
\r
194 wozniakData->pvEStore);
\r
197 if (score >= threshold) {
\r
/* NOTE(review): minScore is presumably the lowest score still kept in the list - confirm */
198 int minScore = insertList (scores, score, seqName (dbLib));
\r
/* the threshold is only ever raised, never lowered */
199 if (minScore >= threshold) {
\r
200 threshold = minScore;
\r
204 dbSeq = nextSeq (dbLib, &dbLen);
\r
/* Release the working state: the shared data buffer first, then the */
/* SwWozniakData record itself. pSwData is treated as a SwWozniakData *. */
209 swWozniakComplete(void *pSwData)
\r
211 SwWozniakData *pWozniakData = (SwWozniakData *) pSwData;
\r
213 free (pWozniakData->pData);
\r
214 free (pWozniakData);
\r
/* 16-bit kernel: processes the database sequence eight residues at a */
/* time, one residue per 16-bit lane, using signed saturating arithmetic */
/* on values offset so that -32768 stands for a score of zero. Returns */
/* the largest lane value seen, with SHORT_BIAS added. */
219 swWozniakWord (unsigned char *querySeq,
\r
221 unsigned char *dbSeq,
\r
223 unsigned short gapOpen,
\r
224 unsigned short gapExtend,
\r
234 __m128i vE, vF, vH;
\r
235 __m128i vEUp, vHUp1, vHUp2;
\r
239 __m128i vGapExtend;
\r
246 /* remove unreferenced warning */
\r
249 /* Load gap opening penalty to all elements of a constant */
\r
/* put the value in lane 0, then replicate lane 0 across the whole vector */
250 vGapOpen = _mm_insert_epi16 (vGapOpen, gapOpen, 0);
\r
251 vGapOpen = _mm_shufflelo_epi16 (vGapOpen, 0);
\r
252 vGapOpen = _mm_shuffle_epi32 (vGapOpen, 0);
\r
254 /* Load gap extension penalty to all elements of a constant */
\r
255 vGapExtend = _mm_insert_epi16 (vGapExtend, gapExtend, 0);
\r
256 vGapExtend = _mm_shufflelo_epi16 (vGapExtend, 0);
\r
257 vGapExtend = _mm_shuffle_epi32 (vGapExtend, 0);
\r
259 /* set every lane of vMaxScore to 0x8000 (-32768). since we are */
\r
260 /* using signed math, zero is represented as -32768 so we have */
\r
261 /* the full range of the short. */
\r
/* all ones from the compare, then shifted left so only the top bit remains */
262 vMaxScore = _mm_cmpeq_epi16 (vMaxScore, vMaxScore);
\r
263 vMaxScore = _mm_slli_epi16 (vMaxScore, 15);
\r
/* vMinimums: the biased zero in every lane */
265 vMinimums = _mm_shuffle_epi32 (vMaxScore, 0);
\r
/* vMin: the biased zero in lane 0 only, all other lanes cleared */
267 vMin = _mm_shuffle_epi32 (vMaxScore, 0);
\r
268 vMin = _mm_srli_si128 (vMin, 14);
\r
/* fill the first queryLength + 8 entries of both row stores with the biased zero */
270 for (i = 0; i < queryLength + 8; i++)
\r
272 _mm_store_si128 (pvEStore + i, vMaxScore);
\r
273 _mm_store_si128 (pvHStore + i, vMaxScore);
\r
/* pScore gives per-lane write access to vScore */
276 pScore = (short *) &vScore;
\r
/* one pass per group of eight database residues */
278 for (i = 0; i < dbLength; i += 8, dbSeq += 8)
\r
280 /* reset the running vectors to the biased zero */
\r
281 vE = _mm_shuffle_epi32 (vMinimums, 0);
\r
282 vF = _mm_shuffle_epi32 (vMinimums, 0);
\r
283 vH = _mm_shuffle_epi32 (vMinimums, 0);
\r
284 vHUp2 = _mm_shuffle_epi32 (vMinimums, 0);
\r
286 vScore = _mm_xor_si128 (vScore, vScore);
\r
/* first eight steps: at step j only lanes 0..j read a table score */
288 for (j = 0; j < 8; ++j)
\r
290 for (k = 0; k <= j; ++k) {
\r
/* table row is picked by the database residue, column by the query residue */
291 int matrixOffset = *(dbSeq + k) * MATRIX_ROW_SIZE;
\r
292 pScore[k] = *(pMatrix + matrixOffset + *(querySeq + j - k));
\r
294 for ( ; k < 8; ++k) {
\r
298 /* load values of vE and vH from previous row (one unit up) */
\r
299 vEUp = _mm_load_si128 (pvEStore + j);
\r
300 vHUp1 = _mm_load_si128 (pvHStore + j);
\r
302 /* shift into place so we have complete vE and vH vectors */
\r
303 /* that refer to the values one unit up from each cell */
\r
304 /* that we are currently working on. */
\r
/* lane 0 takes the top lane of the stored row, lanes 1..7 take vE shifted up one lane */
305 vTemp = _mm_slli_si128 (vE, 2);
\r
306 vEUp = _mm_srli_si128 (vEUp, 14);
\r
307 vEUp = _mm_or_si128 (vEUp, vTemp);
\r
309 vTemp = _mm_slli_si128 (vH, 2);
\r
310 vHUp1 = _mm_srli_si128 (vHUp1, 14);
\r
311 vHUp1 = _mm_or_si128 (vHUp1, vTemp);
\r
313 /* do the dynamic programming */
\r
315 /* update vE value */
\r
316 vE = _mm_subs_epi16 (vEUp, vGapExtend);
\r
317 vTemp = _mm_subs_epi16 (vHUp1, vGapOpen);
\r
318 vE = _mm_max_epi16 (vE, vTemp);
\r
320 /* update vF value */
\r
321 vF = _mm_subs_epi16 (vF, vGapExtend);
\r
322 vTemp = _mm_subs_epi16 (vH, vGapOpen);
\r
323 vF = _mm_max_epi16 (vF, vTemp);
\r
325 /* add score to vH */
\r
326 vH = _mm_adds_epi16 (vHUp2, vScore);
\r
328 /* set vH to max of vH, vE, vF */
\r
329 vH = _mm_max_epi16 (vH, vE);
\r
330 vH = _mm_max_epi16 (vH, vF);
\r
332 /* Save value to use for next diagonal vH */
\r
335 /* Update highest score encountered this far */
\r
336 vMaxScore = _mm_max_epi16 (vMaxScore, vH);
\r
/* main steps: j carries on from the loop above, l restarts at 0, */
/* so entry j is read while entry l is overwritten */
339 for (l = 0; j < queryLength; ++j, ++l)
\r
341 for (k = 0; k < 8; ++k) {
\r
342 int matrixOffset = *(dbSeq + k) * MATRIX_ROW_SIZE;
\r
343 pScore[k] = *(pMatrix + matrixOffset + *(querySeq + j - k));
\r
346 /* load values of vE and vH from previous row (one unit up) */
\r
347 vEUp = _mm_load_si128 (pvEStore + j);
\r
348 vHUp1 = _mm_load_si128 (pvHStore + j);
\r
350 /* save old values of vE and vH to use on next row */
\r
351 _mm_store_si128 (pvEStore + l, vE);
\r
352 _mm_store_si128 (pvHStore + l, vH);
\r
354 /* shift into place so we have complete vE and vH vectors */
\r
355 /* that refer to the values one unit up from each cell */
\r
356 /* that we are currently working on. */
\r
357 vTemp = _mm_slli_si128 (vE, 2);
\r
358 vEUp = _mm_srli_si128 (vEUp, 14);
\r
359 vEUp = _mm_or_si128 (vEUp, vTemp);
\r
361 vTemp = _mm_slli_si128 (vH, 2);
\r
362 vHUp1 = _mm_srli_si128 (vHUp1, 14);
\r
363 vHUp1 = _mm_or_si128 (vHUp1, vTemp);
\r
365 /* do the dynamic programming */
\r
367 /* update vE value */
\r
368 vE = _mm_subs_epi16 (vEUp, vGapExtend);
\r
369 vTemp = _mm_subs_epi16 (vHUp1, vGapOpen);
\r
370 vE = _mm_max_epi16 (vE, vTemp);
\r
372 /* update vF value */
\r
373 vF = _mm_subs_epi16 (vF, vGapExtend);
\r
374 vTemp = _mm_subs_epi16 (vH, vGapOpen);
\r
375 vF = _mm_max_epi16 (vF, vTemp);
\r
377 /* add score to vH */
\r
378 vH = _mm_adds_epi16(vHUp2, vScore);
\r
380 /* set vH to max of vH, vE, vF */
\r
381 vH = _mm_max_epi16 (vH, vE);
\r
382 vH = _mm_max_epi16 (vH, vF);
\r
384 /* Save value to use for next diagonal vH */
\r
387 /* Update highest score encountered this far */
\r
388 vMaxScore = _mm_max_epi16 (vMaxScore, vH);
\r
/* last seven steps: at step m only lanes m+1..7 still read a table score */
391 for (m = 0 ; m < 7; ++j, ++l, ++m)
\r
393 for (k = 0; k <= m; ++k) {
\r
396 for ( ; k < 8; ++k) {
\r
397 int matrixOffset = *(dbSeq + k) * MATRIX_ROW_SIZE;
\r
398 pScore[k] = *(pMatrix + matrixOffset + *(querySeq + j - k));
\r
401 /* save old values of vE and vH to use on next row */
\r
402 _mm_store_si128 (pvEStore + l, vE);
\r
403 _mm_store_si128 (pvHStore + l, vH);
\r
405 /* no stored row is loaded here: lane 0 comes from vMin (biased zero) */
\r
406 vTemp = _mm_slli_si128 (vE, 2);
\r
407 vEUp = _mm_or_si128 (vMin, vTemp);
\r
408 vTemp = _mm_slli_si128 (vH, 2);
\r
409 vHUp1 = _mm_or_si128 (vMin, vTemp);
\r
411 /* do the dynamic programming */
\r
413 /* update vE value */
\r
414 vE = _mm_subs_epi16 (vEUp, vGapExtend);
\r
415 vTemp = _mm_subs_epi16 (vHUp1, vGapOpen);
\r
416 vE = _mm_max_epi16 (vE, vTemp);
\r
418 /* update vF value */
\r
419 vF = _mm_subs_epi16 (vF, vGapExtend);
\r
420 vTemp = _mm_subs_epi16 (vH, vGapOpen);
\r
421 vF = _mm_max_epi16 (vF, vTemp);
\r
423 /* add score to vH */
\r
424 vH = _mm_adds_epi16 (vHUp2, vScore);
\r
426 /* set vH to max of vH, vE, vF */
\r
427 vH = _mm_max_epi16 (vH, vE);
\r
428 vH = _mm_max_epi16 (vH, vF);
\r
430 /* Save value to use for next diagonal vH */
\r
433 /* Update highest score encountered this far */
\r
434 vMaxScore = _mm_max_epi16(vMaxScore,vH);
\r
/* store the vectors left over from the final step */
437 _mm_store_si128 (pvEStore + l, vE);
\r
438 _mm_store_si128 (pvHStore + l, vH);
\r
441 /* find largest score in the vMaxScore vector */
\r
/* fold the upper half onto the lower half three times: 8, 4, then 2 bytes */
442 vTemp = _mm_srli_si128 (vMaxScore, 8);
\r
443 vMaxScore = _mm_max_epi16 (vMaxScore, vTemp);
\r
444 vTemp = _mm_srli_si128 (vMaxScore, 4);
\r
445 vMaxScore = _mm_max_epi16 (vMaxScore, vTemp);
\r
446 vTemp = _mm_srli_si128 (vMaxScore, 2);
\r
447 vMaxScore = _mm_max_epi16 (vMaxScore, vTemp);
\r
449 /* store in temporary variable */
\r
450 score = (short) _mm_extract_epi16 (vMaxScore, 0);
\r
452 /* return largest score */
\r
/* NOTE(review): SHORT_BIAS is defined elsewhere; presumably it undoes the -32768 offset - confirm */
453 return score + SHORT_BIAS;
\r
// 8-bit kernel: processes the database sequence sixteen residues at a
// time, one residue per byte lane, using unsigned saturating arithmetic.
// The bias is subtracted again after each table score is added.
460 swWozniakByte (unsigned char *querySeq,
\r
462 unsigned char *dbSeq,
\r
464 unsigned short gapOpen,
\r
465 unsigned short gapExtend,
\r
469 unsigned short bias)
\r
478 __m128i vE, vF, vH;
\r
479 __m128i vEUp, vHUp1, vHUp2;
\r
484 __m128i vGapExtend;
\r
489 /* remove unreferenced warning */
\r
492 /* Load the bias to all elements of a constant */
\r
// copy the low byte into both halves of a 16-bit value, then replicate that value
493 dup = (bias << 8) | (bias & 0x00ff);
\r
494 vBias = _mm_insert_epi16 (vBias, dup, 0);
\r
495 vBias = _mm_shufflelo_epi16 (vBias, 0);
\r
496 vBias = _mm_shuffle_epi32 (vBias, 0);
\r
498 /* Load gap opening penalty to all elements of a constant */
\r
499 dup = (gapOpen << 8) | (gapOpen & 0x00ff);
\r
500 vGapOpen = _mm_insert_epi16 (vGapOpen, dup, 0);
\r
501 vGapOpen = _mm_shufflelo_epi16 (vGapOpen, 0);
\r
502 vGapOpen = _mm_shuffle_epi32 (vGapOpen, 0);
\r
504 /* Load gap extension penalty to all elements of a constant */
\r
505 dup = (gapExtend << 8) | (gapExtend & 0x00ff);
\r
506 vGapExtend = _mm_insert_epi16 (vGapExtend, dup, 0);
\r
507 vGapExtend = _mm_shufflelo_epi16 (vGapExtend, 0);
\r
508 vGapExtend = _mm_shuffle_epi32 (vGapExtend, 0);
\r
510 vScore = _mm_xor_si128 (vScore, vScore);
\r
511 vMaxScore = _mm_xor_si128 (vMaxScore, vMaxScore);
\r
// zero the first queryLength + 16 entries of both row stores
513 for (i = 0; i < queryLength + 16; i++)
\r
515 _mm_store_si128 (pvEStore + i, vMaxScore);
\r
516 _mm_store_si128 (pvHStore + i, vMaxScore);
\r
// pScore gives per-lane write access to vScore
519 pScore = (char *) &vScore;
\r
// one pass per group of sixteen database residues
521 for (i = 0; i < dbLength; i += 16, dbSeq += 16)
\r
523 // zero lots of stuff.
\r
524 vE = _mm_xor_si128 (vE, vE);
\r
525 vF = _mm_xor_si128 (vF, vF);
\r
526 vH = _mm_xor_si128 (vH, vH);
\r
527 vHUp2 = _mm_xor_si128 (vHUp2, vHUp2);
\r
529 vScore = _mm_xor_si128 (vScore, vScore);
\r
// first sixteen steps: at step j only lanes 0..j read a table score
531 for (j = 0; j < 16; ++j)
\r
533 for (k = 0; k <= j; ++k) {
\r
// table row is picked by the database residue, column by the query residue
534 int matrixOffset = *(dbSeq + k) * MATRIX_ROW_SIZE;
\r
535 pScore[k] = *(pMatrix + matrixOffset + *(querySeq + j - k));
\r
// the remaining lanes get the bias itself, which the later subtraction cancels
537 for ( ; k < 16; ++k) {
\r
538 pScore[k] = (char) bias;
\r
541 // load values of vE and vH from previous row (one unit up)
\r
542 vEUp = _mm_load_si128 (pvEStore + j);
\r
543 vHUp1 = _mm_load_si128 (pvHStore + j);
\r
545 // shift into place so we have complete vE and vH vectors
\r
546 // that refer to the values one unit up from each cell
\r
547 // that we are currently working on.
\r
// lane 0 takes the top byte of the stored row, lanes 1..15 take vE shifted up one lane
548 vTemp = _mm_slli_si128 (vE, 1);
\r
549 vEUp = _mm_srli_si128 (vEUp, 15);
\r
550 vEUp = _mm_or_si128 (vEUp, vTemp);
\r
552 vTemp = _mm_slli_si128 (vH, 1);
\r
553 vHUp1 = _mm_srli_si128 (vHUp1, 15);
\r
554 vHUp1 = _mm_or_si128 (vHUp1, vTemp);
\r
556 // do the dynamic programming
\r
559 vE = _mm_subs_epu8 (vEUp, vGapExtend);
\r
560 vTemp = _mm_subs_epu8 (vHUp1, vGapOpen);
\r
561 vE = _mm_max_epu8 (vE, vTemp);
\r
564 vF = _mm_subs_epu8 (vF, vGapExtend);
\r
565 vTemp = _mm_subs_epu8 (vH, vGapOpen);
\r
566 vF = _mm_max_epu8 (vF, vTemp);
\r
// pScore points at vScore, so this reads the same vector the other two loops name directly
569 vH = _mm_adds_epu8 (vHUp2, *((__m128i *) pScore));
\r
570 vH = _mm_subs_epu8 (vH, vBias);
\r
572 // set vH to max of vH, vE, vF
\r
573 vH = _mm_max_epu8 (vH, vE);
\r
574 vH = _mm_max_epu8 (vH, vF);
\r
576 // Save value to use for next diagonal vH
\r
579 // Update highest score encountered this far
\r
580 vMaxScore = _mm_max_epu8 (vMaxScore, vH);
\r
// main steps: j carries on from the loop above, l restarts at 0,
// so entry j is read while entry l is overwritten
583 for (l = 0; j < queryLength; ++j, ++l)
\r
585 for (k = 0; k < 16; ++k) {
\r
586 int matrixOffset = *(dbSeq + k) * MATRIX_ROW_SIZE;
\r
587 pScore[k] = *(pMatrix + matrixOffset + *(querySeq + j - k));
\r
590 // load values of vE and vH from previous row (one unit up)
\r
591 vEUp = _mm_load_si128 (pvEStore + j);
\r
592 vHUp1 = _mm_load_si128 (pvHStore + j);
\r
594 // save old values of vE and vH to use on next row
\r
595 _mm_store_si128 (pvEStore + l, vE);
\r
596 _mm_store_si128 (pvHStore + l, vH);
\r
598 // shift into place so we have complete vE and vH vectors
\r
599 // that refer to the values one unit up from each cell
\r
600 // that we are currently working on.
\r
601 vTemp = _mm_slli_si128 (vE, 1);
\r
602 vEUp = _mm_srli_si128 (vEUp, 15);
\r
603 vEUp = _mm_or_si128 (vEUp, vTemp);
\r
605 vTemp = _mm_slli_si128 (vH, 1);
\r
606 vHUp1 = _mm_srli_si128 (vHUp1, 15);
\r
607 vHUp1 = _mm_or_si128 (vHUp1, vTemp);
\r
609 // do the dynamic programming
\r
612 vE = _mm_subs_epu8 (vEUp, vGapExtend);
\r
613 vTemp = _mm_subs_epu8 (vHUp1, vGapOpen);
\r
614 vE = _mm_max_epu8 (vE, vTemp);
\r
617 vF = _mm_subs_epu8 (vF, vGapExtend);
\r
618 vTemp = _mm_subs_epu8 (vH, vGapOpen);
\r
619 vF = _mm_max_epu8 (vF, vTemp);
\r
622 vH = _mm_adds_epu8(vHUp2, vScore);
\r
623 vH = _mm_subs_epu8 (vH, vBias);
\r
625 // set vH to max of vH, vE, vF
\r
626 vH = _mm_max_epu8 (vH, vE);
\r
627 vH = _mm_max_epu8 (vH, vF);
\r
629 // Save value to use for next diagonal vH
\r
632 // Update highest score encountered this far
\r
633 vMaxScore = _mm_max_epu8 (vMaxScore, vH);
\r
// last fifteen steps: at step m lanes 0..m get the bias, lanes m+1..15 a table score
636 for (m = 0 ; m < 15; ++j, ++l, ++m)
\r
638 for (k = 0; k <= m; ++k) {
\r
639 pScore[k] = (char) bias;
\r
641 for ( ; k < 16; ++k) {
\r
642 int matrixOffset = *(dbSeq + k) * MATRIX_ROW_SIZE;
\r
643 pScore[k] = *(pMatrix + matrixOffset + *(querySeq + j - k));
\r
646 // save old values of vE and vH to use on next row
\r
647 _mm_store_si128 (pvEStore + l, vE);
\r
648 _mm_store_si128 (pvHStore + l, vH);
\r
650 // no stored row is loaded here: the byte shifted into lane 0 is zero
\r
651 vEUp = _mm_slli_si128 (vE, 1);
\r
652 vHUp1 = _mm_slli_si128 (vH, 1);
\r
654 // do the dynamic programming
\r
657 vE = _mm_subs_epu8 (vEUp, vGapExtend);
\r
658 vTemp = _mm_subs_epu8 (vHUp1, vGapOpen);
\r
659 vE = _mm_max_epu8 (vE, vTemp);
\r
662 vF = _mm_subs_epu8 (vF, vGapExtend);
\r
663 vTemp = _mm_subs_epu8 (vH, vGapOpen);
\r
664 vF = _mm_max_epu8 (vF, vTemp);
\r
667 vH = _mm_adds_epu8 (vHUp2, vScore);
\r
668 vH = _mm_subs_epu8 (vH, vBias);
\r
670 // set vH to max of vH, vE, vF
\r
671 vH = _mm_max_epu8 (vH, vE);
\r
672 vH = _mm_max_epu8 (vH, vF);
\r
674 // Save value to use for next diagonal vH
\r
677 // Update highest score encountered this far
\r
678 vMaxScore = _mm_max_epu8(vMaxScore,vH);
\r
// store the vectors left over from the final step
681 _mm_store_si128 (pvEStore + l, vE);
\r
682 _mm_store_si128 (pvHStore + l, vH);
\r
685 // find largest score in the vMaxScore vector
\r
// fold the upper half onto the lower half four times: 8, 4, 2, then 1 byte
686 vTemp = _mm_srli_si128 (vMaxScore, 8);
\r
687 vMaxScore = _mm_max_epu8 (vMaxScore, vTemp);
\r
688 vTemp = _mm_srli_si128 (vMaxScore, 4);
\r
689 vMaxScore = _mm_max_epu8 (vMaxScore, vTemp);
\r
690 vTemp = _mm_srli_si128 (vMaxScore, 2);
\r
691 vMaxScore = _mm_max_epu8 (vMaxScore, vTemp);
\r
692 vTemp = _mm_srli_si128 (vMaxScore, 1);
\r
693 vMaxScore = _mm_max_epu8 (vMaxScore, vTemp);
\r
695 // store in temporary variable
\r
// the extract returns 16 bits; only the low byte holds the maximum
696 score = (short) _mm_extract_epi16 (vMaxScore, 0);
\r
697 score = score & 0x00ff;
\r
699 // check if we might have overflowed
\r
// NOTE(review): presumably the result is pinned to 255 so the caller reruns in 16 bits - confirm
700 if (score + bias >= 255)
\r
706 // return largest score
\r