// When VMAC assembly is explicitly disabled, remove every ASM-availability macro
// so that the code guarded by those macros later in this file is not compiled.
12 #if defined(CRYPTOPP_DISABLE_VMAC_ASM)
13 # undef CRYPTOPP_X86_ASM_AVAILABLE
14 # undef CRYPTOPP_X32_ASM_AVAILABLE
15 # undef CRYPTOPP_X64_ASM_AVAILABLE
16 # undef CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
// MSVC warning 4731: frame pointer register 'ebp' modified by inline assembly code.
19 #if CRYPTOPP_MSC_VERSION
20 # pragma warning(disable: 4731)
25 #if defined(_MSC_VER) && !CRYPTOPP_BOOL_SLOW_WORD64
// VMAC_BOOL_WORD128 is 1 when a native 128-bit word exists and the x64 ASM path is not used.
// NOTE(review): the 0 definition presumably sits under an #else that this excerpt does not show.
29 #if defined(CRYPTOPP_WORD128_AVAILABLE) && !defined(CRYPTOPP_X64_ASM_AVAILABLE)
30 # define VMAC_BOOL_WORD128 1
32 # define VMAC_BOOL_WORD128 0
36 #define const // Turbo C++ 2006 workaround
// p64 = 2^64 - 257; m62/m63/m64 are masks of the low 62/63/64 bits.
38 static const word64 p64 = W64LIT(0xfffffffffffffeff);
39 static const word64 m62 = W64LIT(0x3fffffffffffffff);
40 static const word64 m63 = W64LIT(0x7fffffffffffffff);
41 static const word64 m64 = W64LIT(0xffffffffffffffff);
// mpoly keeps the low 29 bits of each 32-bit half of a 64-bit word.
42 static const word64 mpoly = W64LIT(0x1fffffff1fffffff);
// m126 is a mask of the low 126 bits, provided either as a macro or as a constant.
// NOTE(review): the condition choosing between the two forms is not visible here.
49 #define m126 ((word128(m62)<<64)|m64)
51 static const word128 m126 = (word128(m62)<<64)|m64;
// NOTE(review): the enclosing function header is outside this excerpt. The visible
// lines validate the digest and L1 key lengths, key the cipher, and post-process key material.
// Only digest lengths of 8 and 16 bytes pass this check.
58 if (digestLength != 8 && digestLength != 16)
// A 16-byte digest selects the 128-bit tag mode.
60 m_is128 = digestLength == 16;
// Reject an L1 key length that is non-positive or not a multiple of 128.
63 if (m_L1KeyLength <= 0 || m_L1KeyLength % 128 != 0)
64 throw InvalidArgument(
"VMAC: L1KeyLength must be a positive multiple of 128");
// Key the block cipher, then express its block size in 64-bit words.
69 cipher.
SetKey(userKey, keylength, params);
70 const unsigned int blockSize = cipher.
BlockSize();
71 const unsigned int blockSizeInWords = blockSize /
sizeof(word64);
// Convert the NH key in place from big-endian byte order to native 64-bit words.
80 ConditionalByteReverse<word64>(
BIG_ENDIAN_ORDER, m_nhKey(), m_nhKey(), m_nhKeySize()*
sizeof(word64));
// Loops bounded by m_is128 run once for a 64-bit tag and twice for a 128-bit tag.
85 for (i = 0; i <= (size_t)m_is128; i++)
96 word64 *l3Key = m_l3Key();
99 for (i = 0; i <= (size_t)m_is128; i++)
// Repeat the (elided) loop body until both L3 key words of this pair are below p64.
106 }
while ((l3Key[i*2+0] >= p64) || (l3Key[i*2+1] >= p64));
// NOTE(review): the enclosing function headers are outside this excerpt. The visible
// lines fetch and validate the nonce and store it right-aligned in the nonce buffer.
110 const byte *nonce = GetIVAndThrowIfInvalid(params, nonceLength);
// ThrowIfInvalidIVLength validates len and yields the length used below.
122 size_t length = ThrowIfInvalidIVLength(len);
124 byte *storedNonce = m_nonce();
// Zero the leading s-length bytes and copy the nonce into the trailing bytes
// (s is defined on a line not shown here).
128 memset(storedNonce, 0, s-length);
129 memcpy(storedNonce+s-length, nonce, length);
// The cached pad is considered only if the last nonce byte matches the stored one
// when the lowest bit of each is ignored.
134 if (m_padCached && (storedNonce[s-1] | 1) == (nonce[length-1] | 1))
// The cache stays valid only while every leading padding byte of the stored nonce is zero.
137 for (
size_t i=0; m_padCached && i<s-length; i++)
138 m_padCached = (storedNonce[i] == 0);
// Store the nonce with the lowest bit of its final byte cleared.
142 memset(storedNonce, 0, s-length);
143 memcpy(storedNonce+s-length, nonce, length-1);
144 storedNonce[s-1] = nonce[length-1] & 0xfe;
// Put the unmodified final nonce byte back.
148 storedNonce[s-1] = nonce[length-1];
// Mark that no block has been hashed yet for this message.
150 m_isFirstBlock =
true;
// Not supported for VMAC: the argument is ignored and NotImplemented is thrown.
154 void VMAC_Base::HashEndianCorrectedBlock(
const word64 *data)
// Silence the unused-parameter warning.
156 CRYPTOPP_UNUSED(data);
158 throw NotImplemented(
"VMAC: HashEndianCorrectedBlock is not implemented");
// Inline-assembly VHASH update using SSE2 instructions on MMX registers, compiled
// only for x86/x32 builds with SSE2 ASM available (see the guards below).
// NOTE(review): many instructions are elided in this excerpt; comments describe only visible lines.
164 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
170 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && (CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32)
171 #if CRYPTOPP_MSC_VERSION
172 # pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code
176 __attribute__ ((noinline))
// data: input words; blocksRemainingInWord64: count of 64-bit words to process;
// tagPart: offsets the key pointer by 2 words and the state pointer by 4 words (see the operand list).
178 VMAC_Base::VHASH_Update_SSE2(
const word64 *data,
size_t blocksRemainingInWord64,
int tagPart)
// Local copies of the NH key pointer, polynomial state pointer and L1 key length.
183 const word64 *nhK = m_nhKey();
184 word64 *polyS = (word64*)(
void*)m_polyState();
185 word32 L1KeyLength = m_L1KeyLength;
// Avoid unused-variable warnings in build configurations that do not reference these names.
188 CRYPTOPP_UNUSED(data); CRYPTOPP_UNUSED(tagPart); CRYPTOPP_UNUSED(L1KeyLength);
189 CRYPTOPP_UNUSED(blocksRemainingInWord64);
// Old MSVC and the Intel compiler load the L1 key length and first-block flag from locals;
// the other form addresses the members relative to ecx (presumably `this` - confirm).
199 #
if _MSC_VER < 1300 || defined(__INTEL_COMPILER)
200 char isFirstBlock = m_isFirstBlock;
201 AS2( mov ebx, [L1KeyLength])
202 AS2( mov dl, [isFirstBlock])
205 AS2( mov ebx, [ecx+m_L1KeyLength])
206 AS2( mov dl, [ecx+m_isFirstBlock])
208 AS2( mov eax, tagPart)
216 AS2( mov ecx, blocksRemainingInWord64)
220 #if CRYPTOPP_BOOL_X32
232 AS2( lea ebp, [edi+8*ebp])
// Load two 64-bit words from [esi] and add the words at [edi] to them
// (the operand list at the end binds esi to data and edi to the key).
233 AS2( movq mm6, [esi])
234 AS2( paddq mm6, [edi])
235 AS2( movq mm5, [esi+8])
236 AS2( paddq mm5, [edi+8])
// 32x32->64 partial products via pmuludq on the original and shuffled operands.
240 ASS( pshufw mm2, mm6, 1, 0, 3, 2)
241 AS2( pmuludq mm6, mm5)
242 ASS( pshufw mm3, mm5, 1, 0, 3, 2)
243 AS2( pmuludq mm5, mm2)
244 AS2( pmuludq mm2, mm3)
245 AS2( pmuludq mm3, mm4)
// Spill 32-bit pieces to stack slots; the X32 variants use offsets 8/16 where x86 uses 4/8.
247 AS2( movd [esp], mm6)
249 #if CRYPTOPP_BOOL_X32
250 AS2( movd [esp+8], mm5)
252 AS2( movd [esp+4], mm5)
258 AS2( movq mm0, [esi])
259 AS2( paddq mm0, [edi])
260 AS2( movq mm1, [esi+8])
261 AS2( paddq mm1, [edi+8])
266 ASS( pshufw mm2, mm0, 1, 0, 3, 2)
267 AS2( pmuludq mm0, mm1)
268 #if CRYPTOPP_BOOL_X32
269 AS2( movd [esp+16], mm3)
271 AS2( movd [esp+8], mm3)
275 ASS( pshufw mm3, mm1, 1, 0, 3, 2)
276 AS2( pmuludq mm1, mm2)
277 AS2( pmuludq mm2, mm3)
278 AS2( pmuludq mm3, mm4)
// Reload previously spilled pieces from the stack slots.
279 AS2( movd mm4, [esp])
281 #if CRYPTOPP_BOOL_X32
282 AS2( movd mm4, [esp+8])
284 AS2( movd mm4, [esp+16])
286 AS2( movd mm4, [esp+4])
288 AS2( movd mm4, [esp+8])
291 AS2( movd [esp], mm0)
294 #if CRYPTOPP_BOOL_X32
295 AS2( movd [esp+8], mm1)
297 AS2( movd [esp+4], mm1)
305 #if CRYPTOPP_BOOL_X32
306 AS2( movd [esp+16], mm3)
308 AS2( movd [esp+8], mm3)
312 AS2( movd mm4, [esp])
314 #if CRYPTOPP_BOOL_X32
315 AS2( movd mm4, [esp+8])
317 AS2( movd mm4, [esp+16])
319 AS2( movd mm4, [esp+4])
321 AS2( movd mm4, [esp+8])
// ebp = 8 * ebx, where ebx was loaded with the L1 key length above.
324 AS2( lea ebp, [8*ebx])
327 AS2( movd [esp], mm7)
330 #if CRYPTOPP_BOOL_X32
331 AS2( movd [esp+8], mm6)
333 AS2( movd [esp+4], mm6)
// k0..k3 name the four 32-bit words that start 16 bytes past eax
// (the operand list binds eax to polyS+tagPart*4).
344 #define k0 [eax+2*8+2*4]
345 #define k1 [eax+2*8+3*4]
346 #define k2 [eax+2*8+0*4]
347 #define k3 [eax+2*8+1*4]
351 AS2( movd mm0, [esp])
356 #if CRYPTOPP_BOOL_X32
357 AS2( movd mm2, [esp+8])
359 AS2( movd mm2, [esp+4])
// Multiplications of 32-bit pieces by the key words k2/k3 and by register operands.
373 AS2( pmuludq mm0, k3)
375 AS2( pmuludq mm1, k2)
378 AS2( pmuludq mm2, mm6)
384 AS2( pmuludq mm3, mm7)
385 AS2( pmuludq mm4, mm7)
386 AS2( pmuludq mm5, mm6)
391 AS2( pmuludq mm1, k2)
396 AS2( pmuludq mm2, k3)
397 AS2( pmuludq mm3, mm7)
398 #if CRYPTOPP_BOOL_X32
399 AS2( movd [esp+16], mm0)
401 AS2( movd [esp+8], mm0)
404 AS2( pmuludq mm7, mm5)
405 AS2( pmuludq mm5, k3)
408 AS2( pmuludq mm1, k2)
413 AS2( pmuludq mm2, mm6)
414 AS2( pmuludq mm6, a0)
417 AS2( movd mm3, [esp])
420 AS2( pmuludq mm3, k3)
423 AS2( pmuludq mm1, k2)
425 #if CRYPTOPP_BOOL_X32
426 AS2( movd mm2, [esp+8])
428 AS2( movd mm2, [esp+4])
436 #if CRYPTOPP_BOOL_X32
437 AS2( movd mm7, [esp+16])
439 AS2( movd mm7, [esp+8])
464 #if CRYPTOPP_BOOL_X32
// Input operands of the extended-asm form: L1 key length in memory, word count in ecx,
// data in esi, key (offset by tagPart*2 words) in edi, first-block flag in edx,
// polynomial state (offset by tagPart*4 words) in eax.
475 :
"m" (L1KeyLength),
"c" (blocksRemainingInWord64),
"S" (data),
"D" (nhK+tagPart*2),
"d" (m_isFirstBlock),
"a" (polyS+tagPart*4)
// Arithmetic helper macros, selected by platform capability.
// NOTE(review): the #else/#endif lines and parts of some macro bodies are elided in this excerpt.
482 #if VMAC_BOOL_WORD128
// Native 128-bit variant: one word128 accumulator; MUL64 splits a 64x64 product into
// high (rh) and low (rl) halves; Multiply128 multiplies the low 64 bits of each argument.
483 #define DeclareNH(a) word128 a=0
484 #define MUL64(rh,rl,i1,i2) {word128 p = word128(i1)*(i2); rh = word64(p>>64); rl = word64(p);}
485 #define AccumulateNH(a, b, c) a += word128(b)*(c)
486 #define Multiply128(r, i1, i2) r = word128(word64(i1)) * word64(i2)
// MUL32: 32x32->64 multiply; uses the __emulu intrinsic on MSVC >= 1400 (not Intel, not ARM),
// otherwise a plain widened multiplication.
488 #if _MSC_VER >= 1400 && !defined(__INTEL_COMPILER) && !defined(_M_ARM)
489 #define MUL32(a, b) __emulu(word32(a), word32(b))
491 #define MUL32(a, b) ((word64)((word32)(a)) * (word32)(b))
// x64 inline-asm variant: the accumulator is split into two 64-bit words (a##0 low, a##1 high)
// and products come from the mulq instruction.
493 #if defined(CRYPTOPP_X64_ASM_AVAILABLE)
494 #define DeclareNH(a) word64 a##0=0, a##1=0
495 #define MUL64(rh,rl,i1,i2) asm ("mulq %3" : "=a"(rl), "=d"(rh) : "a"(i1), "g"(i2) : "cc");
496 #define AccumulateNH(a, b, c) asm ("mulq %3; addq %%rax, %0; adcq %%rdx, %1" : "+r"(a##0), "+r"(a##1) : "a"(b), "g"(c) : "%rdx", "cc");
497 #define ADD128(rh,rl,ih,il) asm ("addq %3, %1; adcq %2, %0" : "+r"(rh),"+r"(rl) : "r"(ih),"r"(il) : "cc");
// MSVC variant: 64x64->128 products via the _umul128 intrinsic.
498 #elif defined(_MSC_VER) && !CRYPTOPP_BOOL_SLOW_WORD64
499 #define DeclareNH(a) word64 a##0=0, a##1=0
500 #define MUL64(rh,rl,i1,i2) (rl) = _umul128(i1,i2,&(rh));
// Adds the 128-bit product of b and c into (a##1:a##0); (a##0 < pl) supplies the carry.
501 #define AccumulateNH(a, b, c) {\
503 pl = _umul128(b,c,&ph);\
505 a##1 += ph + (a##0 < pl);}
// Portable fallback built from 32-bit multiplies: three accumulator words, and MUL64
// assembled from four MUL32 partial products combined with ADD128.
507 #define VMAC_BOOL_32BIT 1
508 #define DeclareNH(a) word64 a##0=0, a##1=0, a##2=0
509 #define MUL64(rh,rl,i1,i2) \
510 { word64 _i1 = (i1), _i2 = (i2); \
511 word64 m1= MUL32(_i1,_i2>>32); \
512 word64 m2= MUL32(_i1>>32,_i2); \
513 rh = MUL32(_i1>>32,_i2>>32); \
514 rl = MUL32(_i1,_i2); \
515 ADD128(rh,rl,(m1 >> 32),(m1 << 32)); \
516 ADD128(rh,rl,(m2 >> 32),(m2 << 32)); \
518 #define AccumulateNH(a, b, c) {\
519 word64 p = MUL32(b, c);\
520 a##1 += word32((p)>>32);\
522 p = MUL32((b)>>32, c);\
523 a##2 += word32((p)>>32);\
525 p = MUL32((b)>>32, (c)>>32);\
527 p = MUL32(b, (c)>>32);\
529 a##2 += word32(p>>32);}
// Default VMAC_BOOL_32BIT to 0 when the 32-bit fallback was not selected.
532 #ifndef VMAC_BOOL_32BIT
533 #define VMAC_BOOL_32BIT 0
// Generic 128-bit addition of (ih:il) into (rh:rl); (rl < _il) detects the carry out of
// the low word (the low-word addition itself is on an elided line).
536 #define ADD128(rh,rl,ih,il) \
537 { word64 _il = (il); \
539 (rh) += (ih) + ((rl) < (_il)); \
543 #if !(defined(_MSC_VER) && _MSC_VER < 1300)
// C++ VHASH update: accumulates NH sums over the input words and folds them into the
// polynomial state held in m_polyState(). T_128BitTag selects whether the second set of
// accumulators (nhB / a2) is maintained in addition to the first (nhA / a1).
// NOTE(review): many lines are elided in this excerpt; comments describe only visible lines.
544 template <
bool T_128BitTag>
546 void VMAC_Base::VHASH_Update_Template(
const word64 *data,
size_t blocksRemainingInWord64)
// One NH step: read two little-endian data words, add key words, and accumulate the
// product into nhA; nhB uses the same data with the key offset by two words.
551 #define INNER_LOOP_ITERATION(j) {\
552 word64 d0 = ConditionalByteReverse(LITTLE_ENDIAN_ORDER, data[i+2*j+0]);\
553 word64 d1 = ConditionalByteReverse(LITTLE_ENDIAN_ORDER, data[i+2*j+1]);\
554 AccumulateNH(nhA, d0+nhK[i+2*j+0], d1+nhK[i+2*j+1]);\
556 AccumulateNH(nhB, d0+nhK[i+2*j+2], d1+nhK[i+2*j+3]);\
559 #if (defined(_MSC_VER) && _MSC_VER < 1300)
// Old MSVC builds take the tag width from the member instead of a template argument.
560 bool T_128BitTag = m_is128;
// L1 key length converted from bytes to 64-bit words.
562 size_t L1KeyLengthInWord64 = m_L1KeyLength / 8;
563 size_t innerLoopEnd = L1KeyLengthInWord64;
564 const word64 *nhK = m_nhKey();
565 word64 *polyS = (word64*)(
void*)m_polyState();
566 bool isFirstBlock =
true;
570 #if VMAC_BOOL_WORD128
573 word64 ah1=0, al1=0, ah2=0, al2=0;
// Polynomial keys are read from words 2 and 3 of each 4-word group of polyS.
575 word64 kh1, kl1, kh2, kl2;
576 kh1=(polyS+0*4+2)[0]; kl1=(polyS+0*4+2)[1];
579 kh2=(polyS+1*4+2)[0]; kl2=(polyS+1*4+2)[1];
// Short final chunk: handle the (count % 8) leading words two at a time, then let the
// unrolled loop below consume the rest up to the remaining word count.
589 if (blocksRemainingInWord64 < L1KeyLengthInWord64)
591 if (blocksRemainingInWord64 % 8)
593 innerLoopEnd = blocksRemainingInWord64 % 8;
594 for (; i<innerLoopEnd; i+=2)
595 INNER_LOOP_ITERATION(0);
597 innerLoopEnd = blocksRemainingInWord64;
// Main NH loop, unrolled four times (8 words per pass).
599 for (; i<innerLoopEnd; i+=8)
601 INNER_LOOP_ITERATION(0);
602 INNER_LOOP_ITERATION(1);
603 INNER_LOOP_ITERATION(2);
604 INNER_LOOP_ITERATION(3);
// Advance past the words just consumed.
606 blocksRemainingInWord64 -= innerLoopEnd;
607 data += innerLoopEnd;
// 32-bit path: split each NH accumulator into 32-bit limbs, propagating the upper
// halves upward, and mask the top limb with m62.
610 word32 nh0[2], nh1[2];
613 nh0[0] = word32(nhA0);
614 nhA1 += (nhA0 >> 32);
615 nh1[0] = word32(nhA1);
616 nh2[0] = (nhA2 + (nhA1 >> 32)) & m62;
620 nh0[1] = word32(nhB0);
621 nhB1 += (nhB0 >> 32);
622 nh1[1] = word32(nhB1);
623 nh2[1] = (nhB2 + (nhB1 >> 32)) & m62;
// Accessors for the 32-bit pieces of the accumulator (a*) and key (k*) of tag half i;
// NativeByteOrder::ToEnum() adjusts the index for the platform's byte order.
626 #define a0 (((word32 *)(polyS+i*4))[2+NativeByteOrder::ToEnum()])
627 #define a1 (*(((word32 *)(polyS+i*4))+3-NativeByteOrder::ToEnum())) // workaround for GCC 3.2
628 #define a2 (((word32 *)(polyS+i*4))[0+NativeByteOrder::ToEnum()])
629 #define a3 (*(((word32 *)(polyS+i*4))+1-NativeByteOrder::ToEnum()))
630 #define aHi ((polyS+i*4)[0])
631 #define k0 (((word32 *)(polyS+i*4+2))[2+NativeByteOrder::ToEnum()])
632 #define k1 (*(((word32 *)(polyS+i*4+2))+3-NativeByteOrder::ToEnum()))
633 #define k2 (((word32 *)(polyS+i*4+2))[0+NativeByteOrder::ToEnum()])
634 #define k3 (*(((word32 *)(polyS+i*4+2))+1-NativeByteOrder::ToEnum()))
635 #define kHi ((polyS+i*4+2)[0])
// Clear both the local and the member first-block flags.
639 isFirstBlock =
false;
642 m_isFirstBlock =
false;
// Limb-wise addition of the NH limbs and key words, carrying the upper 32 bits of each
// partial sum into the next limb.
643 for (i=0; i<=(size_t)T_128BitTag; i++)
645 word64 t = (word64)nh0[i] + k0;
647 t = (t >> 32) + nh1[i] + k1;
649 aHi = (t >> 32) + nh2[i] + kHi;
// Polynomial step for each tag half using 32x32 multiplies; several products use doubled key words.
654 for (i=0; i<=(size_t)T_128BitTag; i++)
670 t = (word64(word32(p) & 0x7fffffff) << 32) | t2;
674 p += MUL32(a1, 2*k3);
675 p += MUL32(a2, 2*k2);
676 p += MUL32(a3, 2*k1);
682 p += MUL32(a2, 2*k3);
683 p += MUL32(a3, 2*k2);
699 #else // #if VMAC_BOOL_32BIT
702 isFirstBlock =
false;
705 m_isFirstBlock =
false;
// first_poly_step: accumulator = (NH result masked to 126 bits) + key.
706 #if VMAC_BOOL_WORD128
707 #define first_poly_step(a, kh, kl, m) a = (m & m126) + ((word128(kh) << 64) | kl)
709 first_poly_step(a1, kh1, kl1, nhA);
711 first_poly_step(a2, kh2, kl2, nhB);
// Two-word form of first_poly_step, built on ADD128.
713 #define first_poly_step(ah, al, kh, kl, mh, ml) {\
715 ADD128(mh, ml, kh, kl); \
718 first_poly_step(ah1, al1, kh1, kl1, nhA1, nhA0);
720 first_poly_step(ah2, al2, kh2, kl2, nhB1, nhB0);
// Load the running accumulators from words 0 and 1 of each 4-word group of polyS.
726 #if VMAC_BOOL_WORD128
727 a1 = (word128((polyS+0*4)[0]) << 64) | (polyS+0*4)[1];
729 ah1=(polyS+0*4)[0]; al1=(polyS+0*4)[1];
733 #if VMAC_BOOL_WORD128
734 a2 = (word128((polyS+1*4)[0]) << 64) | (polyS+1*4)[1];
736 ah2=(polyS+1*4)[0]; al2=(polyS+1*4)[1];
// poly_step (word128 form): forms the four cross products of the accumulator halves with
// the key halves (one using 2*kh) and recombines them with an m63 mask.
// NOTE(review): presumably a multiply-and-reduce modulo 2^127-1 followed by adding m - confirm against elided lines.
742 #if VMAC_BOOL_WORD128
743 #define poly_step(a, kh, kl, m) \
744 { word128 t1, t2, t3, t4;\
745 Multiply128(t2, a>>64, kl);\
746 Multiply128(t3, a, kh);\
747 Multiply128(t1, a, kl);\
748 Multiply128(t4, a>>64, 2*kh);\
752 a = (word128(word64(t2)&m63) << 64) | word64(t4);\
757 poly_step(a1, kh1, kl1, nhA);
759 poly_step(a2, kh2, kl2, nhB);
// poly_step (two-word form): the same cross products via MUL64, combined with ADD128,
// then the message words (mh:ml) are added to the accumulator.
761 #define poly_step(ah, al, kh, kl, mh, ml) \
762 { word64 t1h, t1l, t2h, t2l, t3h, t3l, z=0; \
764 MUL64(t2h,t2l,ah,kl); \
765 MUL64(t3h,t3l,al,kh); \
766 MUL64(t1h,t1l,ah,2*kh); \
767 MUL64(ah,al,al,kl); \
769 ADD128(t2h,t2l,t3h,t3l); \
771 ADD128(ah,al,t1h,t1l); \
774 ADD128(t2h,ah,z,t2l); \
776 t2h += t2h + (ah >> 63); \
780 ADD128(ah,al,mh,ml); \
781 ADD128(ah,al,z,t2h); \
784 poly_step(ah1, al1, kh1, kl1, nhA1, nhA0);
786 poly_step(ah2, al2, kh2, kl2, nhB1, nhB0);
788 #endif // #if VMAC_BOOL_32BIT
// Keep processing until every input word has been consumed.
789 }
while (blocksRemainingInWord64);
// Write the running accumulators back to words 0 and 1 of each 4-word group of polyS.
791 #if VMAC_BOOL_WORD128
792 (polyS+0*4)[0]=word64(a1>>64); (polyS+0*4)[1]=word64(a1);
795 (polyS+1*4)[0]=word64(a2>>64); (polyS+1*4)[1]=word64(a2);
797 #elif !VMAC_BOOL_32BIT
798 (polyS+0*4)[0]=ah1; (polyS+0*4)[1]=al1;
801 (polyS+1*4)[0]=ah2; (polyS+1*4)[1]=al2;
// Dispatches a VHASH update to the SSE2 assembly routine or to the C++ template.
// data: input words; blocksRemainingInWord64: number of 64-bit words to hash.
// NOTE(review): the runtime conditions selecting each call are on lines not shown here.
806 inline void VMAC_Base::VHASH_Update(
const word64 *data,
size_t blocksRemainingInWord64)
808 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && (CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X32)
// The assembly routine handles one tag part per call (0, then 1); the first-block
// flag is cleared here afterwards.
811 VHASH_Update_SSE2(data, blocksRemainingInWord64, 0);
813 VHASH_Update_SSE2(data, blocksRemainingInWord64, 1);
814 m_isFirstBlock =
false;
// Old MSVC uses the non-template form; other compilers pick the <true>/<false> instantiation.
819 #if defined(_MSC_VER) && _MSC_VER < 1300
820 VHASH_Update_Template(data, blocksRemainingInWord64);
823 VHASH_Update_Template<true>(data, blocksRemainingInWord64);
825 VHASH_Update_Template<false>(data, blocksRemainingInWord64);
// Hashes the largest prefix of the input that is a multiple of m_L1KeyLength bytes.
// data: input words; length: input size in bytes.
// NOTE(review): the return statement is not visible; presumably the leftover byte count is returned - confirm.
830 size_t VMAC_Base::HashMultipleBlocks(
const word64 *data,
size_t length)
// Bytes left over after the last whole L1 block.
832 size_t remaining =
ModPowerOf2(length, m_L1KeyLength);
// Convert the processed byte count to 64-bit words.
833 VHASH_Update(data, (length-remaining)/8);
// Final hashing stage: reduces a two-word polynomial result, a two-word key and the
// message length to a single 64-bit value.
// input: two-word accumulator; l3Key: two-word key; len: length added into the accumulator.
// NOTE(review): several arithmetic lines are elided; comments describe only visible lines.
837 static word64 L3Hash(
const word64 *input,
const word64 *l3Key,
size_t len)
839 word64 rh, rl, t, z=0;
840 word64 p1 = input[0], p2 = input[1];
841 word64 k1 = l3Key[0], k2 = l3Key[1];
// 128-bit addition of (len:t) into (p1:p2).
846 ADD128(p1, p2, len, t);
// t becomes 1 when p1 exceeds m63 or when (p1,p2) equals (m63,m64); it is then added in.
848 t = (p1 > m63) + ((p1 == m63) & (p2 == m64));
849 ADD128(p1, p2, z, t);
// Bump t by one when its low 32 bits equal 0xffffffff.
855 t += (word32)t > 0xfffffffeU;
// Add 257 when the value is smaller than the key word
// (presumably correcting a wrapped addition on an elided line - confirm).
861 p1 += (0 - (p1 < k1)) & 257;
863 p2 += (0 - (p2 < k2)) & 257;
// 64x64->128 product of the two adjusted words.
866 MUL64(rh, rl, p1, p2);
868 ADD128(t, rl, z, rh);
870 ADD128(t, rl, z, rh);
// Add 257 if rl is below t, then again if rl is at least p64.
873 rl += (0 - (rl < t)) & 257;
874 rl += (0 - (rl > p64-1)) & 257;
// NOTE(review): the enclosing function header is outside this excerpt. The visible lines
// hash any buffered tail, run L3Hash on the polynomial state and copy the tag to `mac`.
// Buffered byte count, reduced modulo the L1 key length.
882 size_t len =
ModPowerOf2(GetBitCountLo()/8, m_L1KeyLength);
// Zero-pad the buffer up to the next multiple of 16 bytes and hash it as 64-bit words
// (two words per 16-byte unit).
886 memset(m_data()+len, 0, (0-len)%16);
887 VHASH_Update(DataBuf(), ((len+15)/16)*2);
// Nothing hashed yet: copy words 2-3 of each 4-word state group (where the update code
// reads the polynomial key) into words 0-1 (where it keeps the accumulator).
890 else if (m_isFirstBlock)
893 m_polyState()[0] = m_polyState()[2];
894 m_polyState()[1] = m_polyState()[3];
897 m_polyState()[4] = m_polyState()[6];
898 m_polyState()[5] = m_polyState()[7];
// Two-word tag: L3Hash of each state group plus the matching big-endian 64-bit word of the pad.
905 t[0] = L3Hash(m_polyState(), m_l3Key(), len) + GetWord<word64>(
true,
BIG_ENDIAN_ORDER, m_pad());
906 t[1] = L3Hash(m_polyState()+4, m_l3Key()+2, len) + GetWord<word64>(
true,
BIG_ENDIAN_ORDER, m_pad()+8);
// Copy the first `size` bytes of the tag to the output.
916 memcpy(mac, t, size);
// Single-word tag: only the first state group and key pair are used.
921 word64 t = L3Hash(m_polyState(), m_l3Key(), len);
928 memcpy(mac, &t, size);