Crypto++ 8.5
Free C++ class library of cryptographic schemes
sha_simd.cpp
// sha_simd.cpp - written and placed in the public domain by
// Jeffrey Walton, Uri Blumenthal and Marcel Raad.
//
// This source file uses intrinsics to gain access to SHA-NI and
// ARMv8a SHA instructions. A separate source file is needed
// because additional CXXFLAGS are required to enable the
// appropriate instruction sets in some build configurations.
8
9#include "pch.h"
10#include "config.h"
11#include "sha.h"
12#include "misc.h"
13
14#if defined(CRYPTOPP_DISABLE_SHA_ASM)
15# undef CRYPTOPP_X86_ASM_AVAILABLE
16# undef CRYPTOPP_X32_ASM_AVAILABLE
17# undef CRYPTOPP_X64_ASM_AVAILABLE
18# undef CRYPTOPP_SSE2_ASM_AVAILABLE
19#endif
20
21#if (CRYPTOPP_SHANI_AVAILABLE)
22# include <nmmintrin.h>
23# include <immintrin.h>
24#endif
25
26#if (CRYPTOPP_ARM_NEON_HEADER)
27# include <arm_neon.h>
28#endif
29
30#if (CRYPTOPP_ARM_ACLE_HEADER)
31# include <stdint.h>
32# include <arm_acle.h>
33#endif
34
35#if CRYPTOPP_POWER8_SHA_AVAILABLE
36# include "ppc_simd.h"
37#endif
38
39#ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
40# include <signal.h>
41# include <setjmp.h>
42#endif
43
44#ifndef EXCEPTION_EXECUTE_HANDLER
45# define EXCEPTION_EXECUTE_HANDLER 1
46#endif
47
48// Clang intrinsic casts
49#define M128_CAST(x) ((__m128i *)(void *)(x))
50#define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
51
52// Squash MS LNK4221 and libtool warnings
53extern const char SHA_SIMD_FNAME[] = __FILE__;
54
55NAMESPACE_BEGIN(CryptoPP)
56
57// ***************** SHA key tables ********************
58
59extern const word32 SHA256_K[64];
60extern const word64 SHA512_K[80];
61
62// ***************** SIGILL probes ********************
63
#ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
// Support code for the signal-based CPU feature probes in this file.
// Everything here has C linkage.
extern "C"
{
    // Handler signature as passed to, and returned from, signal().
    typedef void (*SigHandler)(int);

    // Jump target that a probe arms with setjmp() before it executes
    // an instruction that may not exist on the running CPU.
    static jmp_buf s_jmpSIGILL;

    // SIGILL handler. It does not return normally; instead it jumps
    // back to the armed setjmp() site, which then observes the value 1.
    static void SigIllHandler(int /*signum*/)
    {
        longjmp(s_jmpSIGILL, 1);
    }
}
#endif  // CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
75
76#if (CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARMV8)
// Runtime probe for the ARM SHA-1 instructions.
//
// Returns false outright when probes are disabled
// (CRYPTOPP_NO_CPU_FEATURE_PROBES) or when ARM SHA-1 support is not
// compiled in (CRYPTOPP_ARM_SHA1_AVAILABLE is zero). Otherwise each SHA-1
// intrinsic is executed once on dummy data. A fault raised while doing so
// is caught (structured exception handling on MS-style toolchains, a
// temporary SIGILL handler plus setjmp/longjmp elsewhere) and reported
// as false.
bool CPU_ProbeSHA1()
{
#if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES)
    return false;
#elif (CRYPTOPP_ARM_SHA1_AVAILABLE)
# if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY)
    volatile bool supported = true;
    __try
    {
        unsigned int seed[] = {1,2,3,4, 5,6,7,8, 9,10,11,12};
        uint32x4_t va = vld1q_u32(seed+0);
        uint32x4_t vb = vld1q_u32(seed+4);
        uint32x4_t vc = vld1q_u32(seed+8);

        uint32x4_t outC   = vsha1cq_u32 (va, 0, vb);
        uint32x4_t outM   = vsha1mq_u32 (va, 0, vb);
        uint32x4_t outP   = vsha1pq_u32 (va, 0, vb);
        uint32x4_t outSu0 = vsha1su0q_u32 (va, vb, vc);
        uint32x4_t outSu1 = vsha1su1q_u32 (va, vb);

        // Fold one lane of every result into the return value so that
        // each intrinsic's output is actually consumed.
        supported = !!(vgetq_lane_u32(outC,0) | vgetq_lane_u32(outM,1) | vgetq_lane_u32(outP,2) | vgetq_lane_u32(outSu0,3) | vgetq_lane_u32(outSu1,0));
    }
    __except (EXCEPTION_EXECUTE_HANDLER)
    {
        return false;
    }
    return supported;
# else

    // The locals below stay volatile because of longjmp and the
    // compiler's clobber warnings. See
    // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854
    volatile bool supported = true;

    // Install the SIGILL handler, remembering the previous one.
    volatile SigHandler prevHandler = signal(SIGILL, SigIllHandler);
    if (prevHandler == SIG_ERR)
        return false;

    // Record the current signal mask so it can be put back afterwards.
    volatile sigset_t prevMask;
    if (sigprocmask(0, NULLPTR, (sigset_t*)&prevMask) != 0)
    {
        signal(SIGILL, prevHandler);
        return false;
    }

    if (setjmp(s_jmpSIGILL) == 0)
    {
        unsigned int seed[] = {1,2,3,4, 5,6,7,8, 9,10,11,12};
        uint32x4_t va = vld1q_u32(seed+0);
        uint32x4_t vb = vld1q_u32(seed+4);
        uint32x4_t vc = vld1q_u32(seed+8);

        uint32x4_t outC   = vsha1cq_u32 (va, 0, vb);
        uint32x4_t outM   = vsha1mq_u32 (va, 0, vb);
        uint32x4_t outP   = vsha1pq_u32 (va, 0, vb);
        uint32x4_t outSu0 = vsha1su0q_u32 (va, vb, vc);
        uint32x4_t outSu1 = vsha1su1q_u32 (va, vb);

        supported = !!(vgetq_lane_u32(outC,0) | vgetq_lane_u32(outM,1) | vgetq_lane_u32(outP,2) | vgetq_lane_u32(outSu0,3) | vgetq_lane_u32(outSu1,0));
    }
    else
    {
        // Reached through longjmp from the SIGILL handler.
        supported = false;
    }

    // Put the saved mask and the previous handler back.
    sigprocmask(SIG_SETMASK, (sigset_t*)&prevMask, NULLPTR);
    signal(SIGILL, prevHandler);
    return supported;
# endif
#else
    return false;
#endif  // CRYPTOPP_ARM_SHA1_AVAILABLE
}
147
// Runtime probe for the ARM SHA-256 instructions.
//
// Returns false outright when probes are disabled
// (CRYPTOPP_NO_CPU_FEATURE_PROBES) or when ARM SHA-2 support is not
// compiled in (CRYPTOPP_ARM_SHA2_AVAILABLE is zero). Otherwise each
// SHA-256 intrinsic is executed once on dummy data. A fault raised while
// doing so is caught (structured exception handling on MS-style
// toolchains, a temporary SIGILL handler plus setjmp/longjmp elsewhere)
// and reported as false.
bool CPU_ProbeSHA256()
{
#if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES)
    return false;
#elif (CRYPTOPP_ARM_SHA2_AVAILABLE)
# if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY)
    volatile bool supported = true;
    __try
    {
        unsigned int seed[] = {1,2,3,4, 5,6,7,8, 9,10,11,12};
        uint32x4_t va = vld1q_u32(seed+0);
        uint32x4_t vb = vld1q_u32(seed+4);
        uint32x4_t vc = vld1q_u32(seed+8);

        uint32x4_t outH   = vsha256hq_u32 (va, vb, vc);
        uint32x4_t outH2  = vsha256h2q_u32 (va, vb, vc);
        uint32x4_t outSu0 = vsha256su0q_u32 (va, vb);
        uint32x4_t outSu1 = vsha256su1q_u32 (va, vb, vc);

        // Fold one lane of every result into the return value so that
        // each intrinsic's output is actually consumed.
        supported = !!(vgetq_lane_u32(outH,0) | vgetq_lane_u32(outH2,1) | vgetq_lane_u32(outSu0,2) | vgetq_lane_u32(outSu1,3));
    }
    __except (EXCEPTION_EXECUTE_HANDLER)
    {
        return false;
    }
    return supported;
# else

    // The locals below stay volatile because of longjmp and the
    // compiler's clobber warnings. See
    // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854
    volatile bool supported = true;

    // Install the SIGILL handler, remembering the previous one.
    volatile SigHandler prevHandler = signal(SIGILL, SigIllHandler);
    if (prevHandler == SIG_ERR)
        return false;

    // Record the current signal mask so it can be put back afterwards.
    volatile sigset_t prevMask;
    if (sigprocmask(0, NULLPTR, (sigset_t*)&prevMask) != 0)
    {
        signal(SIGILL, prevHandler);
        return false;
    }

    if (setjmp(s_jmpSIGILL) == 0)
    {
        unsigned int seed[] = {1,2,3,4, 5,6,7,8, 9,10,11,12};
        uint32x4_t va = vld1q_u32(seed+0);
        uint32x4_t vb = vld1q_u32(seed+4);
        uint32x4_t vc = vld1q_u32(seed+8);

        uint32x4_t outH   = vsha256hq_u32 (va, vb, vc);
        uint32x4_t outH2  = vsha256h2q_u32 (va, vb, vc);
        uint32x4_t outSu0 = vsha256su0q_u32 (va, vb);
        uint32x4_t outSu1 = vsha256su1q_u32 (va, vb, vc);

        supported = !!(vgetq_lane_u32(outH,0) | vgetq_lane_u32(outH2,1) | vgetq_lane_u32(outSu0,2) | vgetq_lane_u32(outSu1,3));
    }
    else
    {
        // Reached through longjmp from the SIGILL handler.
        supported = false;
    }

    // Put the saved mask and the previous handler back.
    sigprocmask(SIG_SETMASK, (sigset_t*)&prevMask, NULLPTR);
    signal(SIGILL, prevHandler);
    return supported;
# endif
#else
    return false;
#endif  // CRYPTOPP_ARM_SHA2_AVAILABLE
}
216#endif // ARM32 or ARM64
217
218// ***************** Intel x86 SHA ********************
219
220/////////////////////////////////////
221// start of Walton and Gulley code //
222/////////////////////////////////////
223
224#if CRYPTOPP_SHANI_AVAILABLE
225// Based on http://software.intel.com/en-us/articles/intel-sha-extensions and code by Sean Gulley.
226void SHA1_HashMultipleBlocks_SHANI(word32 *state, const word32 *data, size_t length, ByteOrder order)
227{
228 CRYPTOPP_ASSERT(state);
229 CRYPTOPP_ASSERT(data);
230 CRYPTOPP_ASSERT(length >= SHA1::BLOCKSIZE);
231
232 __m128i ABCD, ABCD_SAVE, E0, E0_SAVE, E1;
233 __m128i MASK, MSG0, MSG1, MSG2, MSG3;
234
235 // Load initial values
236 ABCD = _mm_loadu_si128(CONST_M128_CAST(state));
237 E0 = _mm_set_epi32(state[4], 0, 0, 0);
238 ABCD = _mm_shuffle_epi32(ABCD, 0x1B);
239
240 // IA-32 SHA is little endian, SHA::Transform is big endian,
241 // and SHA::HashMultipleBlocks can be either. ByteOrder
242 // allows us to avoid extra endian reversals. It saves 1.0 cpb.
243 MASK = order == BIG_ENDIAN_ORDER ? // Data arrangement
244 _mm_set_epi8(0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15) :
245 _mm_set_epi8(3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12) ;
246
247 while (length >= SHA1::BLOCKSIZE)
248 {
249 // Save current hash
250 ABCD_SAVE = ABCD;
251 E0_SAVE = E0;
252
253 // Rounds 0-3
254 MSG0 = _mm_loadu_si128(CONST_M128_CAST(data+0));
255 MSG0 = _mm_shuffle_epi8(MSG0, MASK);
256 E0 = _mm_add_epi32(E0, MSG0);
257 E1 = ABCD;
258 ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
259
260 // Rounds 4-7
261 MSG1 = _mm_loadu_si128(CONST_M128_CAST(data+4));
262 MSG1 = _mm_shuffle_epi8(MSG1, MASK);
263 E1 = _mm_sha1nexte_epu32(E1, MSG1);
264 E0 = ABCD;
265 ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
266 MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
267
268 // Rounds 8-11
269 MSG2 = _mm_loadu_si128(CONST_M128_CAST(data+8));
270 MSG2 = _mm_shuffle_epi8(MSG2, MASK);
271 E0 = _mm_sha1nexte_epu32(E0, MSG2);
272 E1 = ABCD;
273 ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
274 MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
275 MSG0 = _mm_xor_si128(MSG0, MSG2);
276
277 // Rounds 12-15
278 MSG3 = _mm_loadu_si128(CONST_M128_CAST(data+12));
279 MSG3 = _mm_shuffle_epi8(MSG3, MASK);
280 E1 = _mm_sha1nexte_epu32(E1, MSG3);
281 E0 = ABCD;
282 MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
283 ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
284 MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
285 MSG1 = _mm_xor_si128(MSG1, MSG3);
286
287 // Rounds 16-19
288 E0 = _mm_sha1nexte_epu32(E0, MSG0);
289 E1 = ABCD;
290 MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
291 ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
292 MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
293 MSG2 = _mm_xor_si128(MSG2, MSG0);
294
295 // Rounds 20-23
296 E1 = _mm_sha1nexte_epu32(E1, MSG1);
297 E0 = ABCD;
298 MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
299 ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
300 MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
301 MSG3 = _mm_xor_si128(MSG3, MSG1);
302
303 // Rounds 24-27
304 E0 = _mm_sha1nexte_epu32(E0, MSG2);
305 E1 = ABCD;
306 MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
307 ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
308 MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
309 MSG0 = _mm_xor_si128(MSG0, MSG2);
310
311 // Rounds 28-31
312 E1 = _mm_sha1nexte_epu32(E1, MSG3);
313 E0 = ABCD;
314 MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
315 ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
316 MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
317 MSG1 = _mm_xor_si128(MSG1, MSG3);
318
319 // Rounds 32-35
320 E0 = _mm_sha1nexte_epu32(E0, MSG0);
321 E1 = ABCD;
322 MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
323 ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
324 MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
325 MSG2 = _mm_xor_si128(MSG2, MSG0);
326
327 // Rounds 36-39
328 E1 = _mm_sha1nexte_epu32(E1, MSG1);
329 E0 = ABCD;
330 MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
331 ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
332 MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
333 MSG3 = _mm_xor_si128(MSG3, MSG1);
334
335 // Rounds 40-43
336 E0 = _mm_sha1nexte_epu32(E0, MSG2);
337 E1 = ABCD;
338 MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
339 ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
340 MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
341 MSG0 = _mm_xor_si128(MSG0, MSG2);
342
343 // Rounds 44-47
344 E1 = _mm_sha1nexte_epu32(E1, MSG3);
345 E0 = ABCD;
346 MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
347 ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
348 MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
349 MSG1 = _mm_xor_si128(MSG1, MSG3);
350
351 // Rounds 48-51
352 E0 = _mm_sha1nexte_epu32(E0, MSG0);
353 E1 = ABCD;
354 MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
355 ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
356 MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
357 MSG2 = _mm_xor_si128(MSG2, MSG0);
358
359 // Rounds 52-55
360 E1 = _mm_sha1nexte_epu32(E1, MSG1);
361 E0 = ABCD;
362 MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
363 ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
364 MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
365 MSG3 = _mm_xor_si128(MSG3, MSG1);
366
367 // Rounds 56-59
368 E0 = _mm_sha1nexte_epu32(E0, MSG2);
369 E1 = ABCD;
370 MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
371 ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
372 MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
373 MSG0 = _mm_xor_si128(MSG0, MSG2);
374
375 // Rounds 60-63
376 E1 = _mm_sha1nexte_epu32(E1, MSG3);
377 E0 = ABCD;
378 MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
379 ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
380 MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
381 MSG1 = _mm_xor_si128(MSG1, MSG3);
382
383 // Rounds 64-67
384 E0 = _mm_sha1nexte_epu32(E0, MSG0);
385 E1 = ABCD;
386 MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
387 ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
388 MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
389 MSG2 = _mm_xor_si128(MSG2, MSG0);
390
391 // Rounds 68-71
392 E1 = _mm_sha1nexte_epu32(E1, MSG1);
393 E0 = ABCD;
394 MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
395 ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
396 MSG3 = _mm_xor_si128(MSG3, MSG1);
397
398 // Rounds 72-75
399 E0 = _mm_sha1nexte_epu32(E0, MSG2);
400 E1 = ABCD;
401 MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
402 ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
403
404 // Rounds 76-79
405 E1 = _mm_sha1nexte_epu32(E1, MSG3);
406 E0 = ABCD;
407 ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
408
409 // Add values back to state
410 E0 = _mm_sha1nexte_epu32(E0, E0_SAVE);
411 ABCD = _mm_add_epi32(ABCD, ABCD_SAVE);
412
413 data += SHA1::BLOCKSIZE/sizeof(word32);
414 length -= SHA1::BLOCKSIZE;
415 }
416
417 // Save state
418 ABCD = _mm_shuffle_epi32(ABCD, 0x1B);
419 _mm_storeu_si128(M128_CAST(state), ABCD);
420 state[4] = _mm_extract_epi32(E0, 3);
421}
422
423// Based on http://software.intel.com/en-us/articles/intel-sha-extensions and code by Sean Gulley.
424void SHA256_HashMultipleBlocks_SHANI(word32 *state, const word32 *data, size_t length, ByteOrder order)
425{
426 CRYPTOPP_ASSERT(state);
427 CRYPTOPP_ASSERT(data);
428 CRYPTOPP_ASSERT(length >= SHA256::BLOCKSIZE);
429
430 __m128i STATE0, STATE1;
431 __m128i MSG, TMP, MASK;
432 __m128i TMSG0, TMSG1, TMSG2, TMSG3;
433 __m128i ABEF_SAVE, CDGH_SAVE;
434
435 // Load initial values
436 TMP = _mm_loadu_si128(M128_CAST(&state[0]));
437 STATE1 = _mm_loadu_si128(M128_CAST(&state[4]));
438
439 // IA-32 SHA is little endian, SHA::Transform is big endian,
440 // and SHA::HashMultipleBlocks can be either. ByteOrder
441 // allows us to avoid extra endian reversals. It saves 1.0 cpb.
442 MASK = order == BIG_ENDIAN_ORDER ? // Data arrangement
443 _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3) :
444 _mm_set_epi8(15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0) ;
445
446 TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB
447 STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH
448 STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); // ABEF
449 STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH
450
451 while (length >= SHA256::BLOCKSIZE)
452 {
453 // Save current hash
454 ABEF_SAVE = STATE0;
455 CDGH_SAVE = STATE1;
456
457 // Rounds 0-3
458 MSG = _mm_loadu_si128(CONST_M128_CAST(data+0));
459 TMSG0 = _mm_shuffle_epi8(MSG, MASK);
460 MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0xE9B5DBA5B5C0FBCF), W64LIT(0x71374491428A2F98)));
461 STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
462 MSG = _mm_shuffle_epi32(MSG, 0x0E);
463 STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
464
465 // Rounds 4-7
466 TMSG1 = _mm_loadu_si128(CONST_M128_CAST(data+4));
467 TMSG1 = _mm_shuffle_epi8(TMSG1, MASK);
468 MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0xAB1C5ED5923F82A4), W64LIT(0x59F111F13956C25B)));
469 STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
470 MSG = _mm_shuffle_epi32(MSG, 0x0E);
471 STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
472 TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
473
474 // Rounds 8-11
475 TMSG2 = _mm_loadu_si128(CONST_M128_CAST(data+8));
476 TMSG2 = _mm_shuffle_epi8(TMSG2, MASK);
477 MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0x550C7DC3243185BE), W64LIT(0x12835B01D807AA98)));
478 STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
479 MSG = _mm_shuffle_epi32(MSG, 0x0E);
480 STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
481 TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
482
483 // Rounds 12-15
484 TMSG3 = _mm_loadu_si128(CONST_M128_CAST(data+12));
485 TMSG3 = _mm_shuffle_epi8(TMSG3, MASK);
486 MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0xC19BF1749BDC06A7), W64LIT(0x80DEB1FE72BE5D74)));
487 STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
488 TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
489 TMSG0 = _mm_add_epi32(TMSG0, TMP);
490 TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
491 MSG = _mm_shuffle_epi32(MSG, 0x0E);
492 STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
493 TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
494
495 // Rounds 16-19
496 MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x240CA1CC0FC19DC6), W64LIT(0xEFBE4786E49B69C1)));
497 STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
498 TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
499 TMSG1 = _mm_add_epi32(TMSG1, TMP);
500 TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
501 MSG = _mm_shuffle_epi32(MSG, 0x0E);
502 STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
503 TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
504
505 // Rounds 20-23
506 MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x76F988DA5CB0A9DC), W64LIT(0x4A7484AA2DE92C6F)));
507 STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
508 TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
509 TMSG2 = _mm_add_epi32(TMSG2, TMP);
510 TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
511 MSG = _mm_shuffle_epi32(MSG, 0x0E);
512 STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
513 TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
514
515 // Rounds 24-27
516 MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0xBF597FC7B00327C8), W64LIT(0xA831C66D983E5152)));
517 STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
518 TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
519 TMSG3 = _mm_add_epi32(TMSG3, TMP);
520 TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
521 MSG = _mm_shuffle_epi32(MSG, 0x0E);
522 STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
523 TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
524
525 // Rounds 28-31
526 MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0x1429296706CA6351), W64LIT(0xD5A79147C6E00BF3)));
527 STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
528 TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
529 TMSG0 = _mm_add_epi32(TMSG0, TMP);
530 TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
531 MSG = _mm_shuffle_epi32(MSG, 0x0E);
532 STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
533 TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
534
535 // Rounds 32-35
536 MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x53380D134D2C6DFC), W64LIT(0x2E1B213827B70A85)));
537 STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
538 TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
539 TMSG1 = _mm_add_epi32(TMSG1, TMP);
540 TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
541 MSG = _mm_shuffle_epi32(MSG, 0x0E);
542 STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
543 TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
544
545 // Rounds 36-39
546 MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x92722C8581C2C92E), W64LIT(0x766A0ABB650A7354)));
547 STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
548 TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
549 TMSG2 = _mm_add_epi32(TMSG2, TMP);
550 TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
551 MSG = _mm_shuffle_epi32(MSG, 0x0E);
552 STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
553 TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
554
555 // Rounds 40-43
556 MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0xC76C51A3C24B8B70), W64LIT(0xA81A664BA2BFE8A1)));
557 STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
558 TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
559 TMSG3 = _mm_add_epi32(TMSG3, TMP);
560 TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
561 MSG = _mm_shuffle_epi32(MSG, 0x0E);
562 STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
563 TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
564
565 // Rounds 44-47
566 MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0x106AA070F40E3585), W64LIT(0xD6990624D192E819)));
567 STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
568 TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
569 TMSG0 = _mm_add_epi32(TMSG0, TMP);
570 TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
571 MSG = _mm_shuffle_epi32(MSG, 0x0E);
572 STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
573 TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
574
575 // Rounds 48-51
576 MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x34B0BCB52748774C), W64LIT(0x1E376C0819A4C116)));
577 STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
578 TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
579 TMSG1 = _mm_add_epi32(TMSG1, TMP);
580 TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
581 MSG = _mm_shuffle_epi32(MSG, 0x0E);
582 STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
583 TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
584
585 // Rounds 52-55
586 MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x682E6FF35B9CCA4F), W64LIT(0x4ED8AA4A391C0CB3)));
587 STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
588 TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
589 TMSG2 = _mm_add_epi32(TMSG2, TMP);
590 TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
591 MSG = _mm_shuffle_epi32(MSG, 0x0E);
592 STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
593
594 // Rounds 56-59
595 MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0x8CC7020884C87814), W64LIT(0x78A5636F748F82EE)));
596 STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
597 TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
598 TMSG3 = _mm_add_epi32(TMSG3, TMP);
599 TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
600 MSG = _mm_shuffle_epi32(MSG, 0x0E);
601 STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
602
603 // Rounds 60-63
604 MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0xC67178F2BEF9A3F7), W64LIT(0xA4506CEB90BEFFFA)));
605 STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
606 MSG = _mm_shuffle_epi32(MSG, 0x0E);
607 STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
608
609 // Add values back to state
610 STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE);
611 STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE);
612
613 data += SHA256::BLOCKSIZE/sizeof(word32);
614 length -= SHA256::BLOCKSIZE;
615 }
616
617 TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA
618 STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); // DCHG
619 STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA
620 STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // ABEF
621
622 // Save state
623 _mm_storeu_si128(M128_CAST(&state[0]), STATE0);
624 _mm_storeu_si128(M128_CAST(&state[4]), STATE1);
625}
626#endif // CRYPTOPP_SHANI_AVAILABLE
627
628///////////////////////////////////
629// end of Walton and Gulley code //
630///////////////////////////////////
631
632// ***************** ARMV8 SHA ********************
633
634/////////////////////////////////////////////////////////////
635// start of Walton, Schneiders, O'Rourke and Hovsmith code //
636/////////////////////////////////////////////////////////////
637
638#if CRYPTOPP_ARM_SHA1_AVAILABLE
639void SHA1_HashMultipleBlocks_ARMV8(word32 *state, const word32 *data, size_t length, ByteOrder order)
640{
641 CRYPTOPP_ASSERT(state);
642 CRYPTOPP_ASSERT(data);
643 CRYPTOPP_ASSERT(length >= SHA1::BLOCKSIZE);
644
645 uint32x4_t C0, C1, C2, C3;
646 uint32x4_t ABCD, ABCD_SAVED;
647 uint32x4_t MSG0, MSG1, MSG2, MSG3;
648 uint32x4_t TMP0, TMP1;
649 uint32_t E0, E0_SAVED, E1;
650
651 // Load initial values
652 C0 = vdupq_n_u32(0x5A827999);
653 C1 = vdupq_n_u32(0x6ED9EBA1);
654 C2 = vdupq_n_u32(0x8F1BBCDC);
655 C3 = vdupq_n_u32(0xCA62C1D6);
656
657 ABCD = vld1q_u32(&state[0]);
658 E0 = state[4];
659
660 while (length >= SHA1::BLOCKSIZE)
661 {
662 // Save current hash
663 ABCD_SAVED = ABCD;
664 E0_SAVED = E0;
665
666 MSG0 = vld1q_u32(data + 0);
667 MSG1 = vld1q_u32(data + 4);
668 MSG2 = vld1q_u32(data + 8);
669 MSG3 = vld1q_u32(data + 12);
670
671 if (order == BIG_ENDIAN_ORDER) // Data arrangement
672 {
673 MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0)));
674 MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1)));
675 MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2)));
676 MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3)));
677 }
678
679 TMP0 = vaddq_u32(MSG0, C0);
680 TMP1 = vaddq_u32(MSG1, C0);
681
682 // Rounds 0-3
683 E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
684 ABCD = vsha1cq_u32(ABCD, E0, TMP0);
685 TMP0 = vaddq_u32(MSG2, C0);
686 MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
687
688 // Rounds 4-7
689 E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
690 ABCD = vsha1cq_u32(ABCD, E1, TMP1);
691 TMP1 = vaddq_u32(MSG3, C0);
692 MSG0 = vsha1su1q_u32(MSG0, MSG3);
693 MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
694
695 // Rounds 8-11
696 E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
697 ABCD = vsha1cq_u32(ABCD, E0, TMP0);
698 TMP0 = vaddq_u32(MSG0, C0);
699 MSG1 = vsha1su1q_u32(MSG1, MSG0);
700 MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
701
702 // Rounds 12-15
703 E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
704 ABCD = vsha1cq_u32(ABCD, E1, TMP1);
705 TMP1 = vaddq_u32(MSG1, C1);
706 MSG2 = vsha1su1q_u32(MSG2, MSG1);
707 MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
708
709 // Rounds 16-19
710 E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
711 ABCD = vsha1cq_u32(ABCD, E0, TMP0);
712 TMP0 = vaddq_u32(MSG2, C1);
713 MSG3 = vsha1su1q_u32(MSG3, MSG2);
714 MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
715
716 // Rounds 20-23
717 E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
718 ABCD = vsha1pq_u32(ABCD, E1, TMP1);
719 TMP1 = vaddq_u32(MSG3, C1);
720 MSG0 = vsha1su1q_u32(MSG0, MSG3);
721 MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
722
723 // Rounds 24-27
724 E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
725 ABCD = vsha1pq_u32(ABCD, E0, TMP0);
726 TMP0 = vaddq_u32(MSG0, C1);
727 MSG1 = vsha1su1q_u32(MSG1, MSG0);
728 MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
729
730 // Rounds 28-31
731 E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
732 ABCD = vsha1pq_u32(ABCD, E1, TMP1);
733 TMP1 = vaddq_u32(MSG1, C1);
734 MSG2 = vsha1su1q_u32(MSG2, MSG1);
735 MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
736
737 // Rounds 32-35
738 E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
739 ABCD = vsha1pq_u32(ABCD, E0, TMP0);
740 TMP0 = vaddq_u32(MSG2, C2);
741 MSG3 = vsha1su1q_u32(MSG3, MSG2);
742 MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
743
744 // Rounds 36-39
745 E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
746 ABCD = vsha1pq_u32(ABCD, E1, TMP1);
747 TMP1 = vaddq_u32(MSG3, C2);
748 MSG0 = vsha1su1q_u32(MSG0, MSG3);
749 MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
750
751 // Rounds 40-43
752 E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
753 ABCD = vsha1mq_u32(ABCD, E0, TMP0);
754 TMP0 = vaddq_u32(MSG0, C2);
755 MSG1 = vsha1su1q_u32(MSG1, MSG0);
756 MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
757
758 // Rounds 44-47
759 E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
760 ABCD = vsha1mq_u32(ABCD, E1, TMP1);
761 TMP1 = vaddq_u32(MSG1, C2);
762 MSG2 = vsha1su1q_u32(MSG2, MSG1);
763 MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
764
765 // Rounds 48-51
766 E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
767 ABCD = vsha1mq_u32(ABCD, E0, TMP0);
768 TMP0 = vaddq_u32(MSG2, C2);
769 MSG3 = vsha1su1q_u32(MSG3, MSG2);
770 MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
771
772 // Rounds 52-55
773 E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
774 ABCD = vsha1mq_u32(ABCD, E1, TMP1);
775 TMP1 = vaddq_u32(MSG3, C3);
776 MSG0 = vsha1su1q_u32(MSG0, MSG3);
777 MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
778
779 // Rounds 56-59
780 E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
781 ABCD = vsha1mq_u32(ABCD, E0, TMP0);
782 TMP0 = vaddq_u32(MSG0, C3);
783 MSG1 = vsha1su1q_u32(MSG1, MSG0);
784 MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
785
786 // Rounds 60-63
787 E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
788 ABCD = vsha1pq_u32(ABCD, E1, TMP1);
789 TMP1 = vaddq_u32(MSG1, C3);
790 MSG2 = vsha1su1q_u32(MSG2, MSG1);
791 MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
792
793 // Rounds 64-67
794 E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
795 ABCD = vsha1pq_u32(ABCD, E0, TMP0);
796 TMP0 = vaddq_u32(MSG2, C3);
797 MSG3 = vsha1su1q_u32(MSG3, MSG2);
798 MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
799
800 // Rounds 68-71
801 E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
802 ABCD = vsha1pq_u32(ABCD, E1, TMP1);
803 TMP1 = vaddq_u32(MSG3, C3);
804 MSG0 = vsha1su1q_u32(MSG0, MSG3);
805
806 // Rounds 72-75
807 E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
808 ABCD = vsha1pq_u32(ABCD, E0, TMP0);
809
810 // Rounds 76-79
811 E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
812 ABCD = vsha1pq_u32(ABCD, E1, TMP1);
813
814 E0 += E0_SAVED;
815 ABCD = vaddq_u32(ABCD_SAVED, ABCD);
816
817 data += SHA1::BLOCKSIZE/sizeof(word32);
818 length -= SHA1::BLOCKSIZE;
819 }
820
821 // Save state
822 vst1q_u32(&state[0], ABCD);
823 state[4] = E0;
824}
825#endif // CRYPTOPP_ARM_SHA1_AVAILABLE
826
827#if CRYPTOPP_ARM_SHA2_AVAILABLE
828void SHA256_HashMultipleBlocks_ARMV8(word32 *state, const word32 *data, size_t length, ByteOrder order)
829{
830 CRYPTOPP_ASSERT(state);
831 CRYPTOPP_ASSERT(data);
832 CRYPTOPP_ASSERT(length >= SHA256::BLOCKSIZE);
833
834 uint32x4_t STATE0, STATE1, ABEF_SAVE, CDGH_SAVE;
835 uint32x4_t MSG0, MSG1, MSG2, MSG3;
836 uint32x4_t TMP0, TMP1, TMP2;
837
838 // Load initial values
839 STATE0 = vld1q_u32(&state[0]);
840 STATE1 = vld1q_u32(&state[4]);
841
842 while (length >= SHA256::BLOCKSIZE)
843 {
844 // Save current hash
845 ABEF_SAVE = STATE0;
846 CDGH_SAVE = STATE1;
847
848 // Load message
849 MSG0 = vld1q_u32(data + 0);
850 MSG1 = vld1q_u32(data + 4);
851 MSG2 = vld1q_u32(data + 8);
852 MSG3 = vld1q_u32(data + 12);
853
854 if (order == BIG_ENDIAN_ORDER) // Data arrangement
855 {
856 MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0)));
857 MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1)));
858 MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2)));
859 MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3)));
860 }
861
862 TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x00]));
863
864 // Rounds 0-3
865 MSG0 = vsha256su0q_u32(MSG0, MSG1);
866 TMP2 = STATE0;
867 TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x04]));
868 STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
869 STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
870 MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
871
872 // Rounds 4-7
873 MSG1 = vsha256su0q_u32(MSG1, MSG2);
874 TMP2 = STATE0;
875 TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x08]));
876 STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
877 STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
878 MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
879
880 // Rounds 8-11
881 MSG2 = vsha256su0q_u32(MSG2, MSG3);
882 TMP2 = STATE0;
883 TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x0c]));
884 STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
885 STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
886 MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
887
888 // Rounds 12-15
889 MSG3 = vsha256su0q_u32(MSG3, MSG0);
890 TMP2 = STATE0;
891 TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x10]));
892 STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
893 STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
894 MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
895
896 // Rounds 16-19
897 MSG0 = vsha256su0q_u32(MSG0, MSG1);
898 TMP2 = STATE0;
899 TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x14]));
900 STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
901 STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
902 MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
903
904 // Rounds 20-23
905 MSG1 = vsha256su0q_u32(MSG1, MSG2);
906 TMP2 = STATE0;
907 TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x18]));
908 STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
909 STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
910 MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
911
912 // Rounds 24-27
913 MSG2 = vsha256su0q_u32(MSG2, MSG3);
914 TMP2 = STATE0;
915 TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x1c]));
916 STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
917 STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
918 MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
919
920 // Rounds 28-31
921 MSG3 = vsha256su0q_u32(MSG3, MSG0);
922 TMP2 = STATE0;
923 TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x20]));
924 STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
925 STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
926 MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
927
928 // Rounds 32-35
929 MSG0 = vsha256su0q_u32(MSG0, MSG1);
930 TMP2 = STATE0;
931 TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x24]));
932 STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
933 STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
934 MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
935
936 // Rounds 36-39
937 MSG1 = vsha256su0q_u32(MSG1, MSG2);
938 TMP2 = STATE0;
939 TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x28]));
940 STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
941 STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
942 MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
943
944 // Rounds 40-43
945 MSG2 = vsha256su0q_u32(MSG2, MSG3);
946 TMP2 = STATE0;
947 TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x2c]));
948 STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
949 STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
950 MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
951
952 // Rounds 44-47
953 MSG3 = vsha256su0q_u32(MSG3, MSG0);
954 TMP2 = STATE0;
955 TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x30]));
956 STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
957 STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
958 MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
959
960 // Rounds 48-51
961 TMP2 = STATE0;
962 TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x34]));
963 STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
964 STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
965
966 // Rounds 52-55
967 TMP2 = STATE0;
968 TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x38]));
969 STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
970 STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
971
972 // Rounds 56-59
973 TMP2 = STATE0;
974 TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x3c]));
975 STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
976 STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
977
978 // Rounds 60-63
979 TMP2 = STATE0;
980 STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
981 STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
982
983 // Add back to state
984 STATE0 = vaddq_u32(STATE0, ABEF_SAVE);
985 STATE1 = vaddq_u32(STATE1, CDGH_SAVE);
986
987 data += SHA256::BLOCKSIZE/sizeof(word32);
988 length -= SHA256::BLOCKSIZE;
989 }
990
991 // Save state
992 vst1q_u32(&state[0], STATE0);
993 vst1q_u32(&state[4], STATE1);
994}
995#endif // CRYPTOPP_ARM_SHA2_AVAILABLE
996
997///////////////////////////////////////////////////////////
998// end of Walton, Schneiders, O'Rourke and Hovsmith code //
999///////////////////////////////////////////////////////////
1000
1001// ***************** Power8 SHA ********************
1002
1003//////////////////////////////////////////////////
1004// start Gustavo, Serra, Scalet and Walton code //
1005//////////////////////////////////////////////////
1006
1007#if CRYPTOPP_POWER8_SHA_AVAILABLE
1008
// Positions of the eight working variables inside the S[] arrays that
// the SHA-256 and SHA-512 round functions below operate on.
enum {A=0, B=1, C=2, D=3, E=4, F=5, G=6, H=7};
1011
1012inline
1013uint32x4_p VecLoad32(const word32* data, int offset)
1014{
1015#if (CRYPTOPP_LITTLE_ENDIAN)
1016 const uint8x16_p mask = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
1017 const uint32x4_p val = VecLoad(offset, data);
1018 return (uint32x4_p)VecPermute(val, val, mask);
1019#else
1020 return VecLoad(offset, data);
1021#endif
1022}
1023
1024template<class T> inline
1025void VecStore32(const T data, word32 dest[4])
1026{
1027 VecStore(data, dest);
1028}
1029
1030inline
1031uint32x4_p VectorCh(const uint32x4_p x, const uint32x4_p y, const uint32x4_p z)
1032{
1033 // The trick below is due to Andy Polyakov and Jack Lloyd
1034 return vec_sel(z,y,x);
1035}
1036
1037inline
1038uint32x4_p VectorMaj(const uint32x4_p x, const uint32x4_p y, const uint32x4_p z)
1039{
1040 // The trick below is due to Andy Polyakov and Jack Lloyd
1041 return vec_sel(y, z, VecXor(x, y));
1042}
1043
1044inline
1045uint32x4_p Vector_sigma0(const uint32x4_p val)
1046{
1047 return VecSHA256<0,0>(val);
1048}
1049
1050inline
1051uint32x4_p Vector_sigma1(const uint32x4_p val)
1052{
1053 return VecSHA256<0,0xf>(val);
1054}
1055
1056inline
1057uint32x4_p VectorSigma0(const uint32x4_p val)
1058{
1059 return VecSHA256<1,0>(val);
1060}
1061
1062inline
1063uint32x4_p VectorSigma1(const uint32x4_p val)
1064{
1065 return VecSHA256<1,0xf>(val);
1066}
1067
1068inline
1069uint32x4_p VectorPack(const uint32x4_p a, const uint32x4_p b,
1070 const uint32x4_p c, const uint32x4_p d)
1071{
1072 const uint8x16_p m1 = {0,1,2,3, 16,17,18,19, 0,0,0,0, 0,0,0,0};
1073 const uint8x16_p m2 = {0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
1074 return VecPermute(VecPermute(a,b,m1), VecPermute(c,d,m1), m2);
1075}
1076
1077template <unsigned int R> inline
1078void SHA256_ROUND1(uint32x4_p W[16], uint32x4_p S[8], const uint32x4_p K, const uint32x4_p M)
1079{
1080 uint32x4_p T1, T2;
1081
1082 W[R] = M;
1083 T1 = S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K + M;
1084 T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]);
1085
1086 S[H] = S[G]; S[G] = S[F]; S[F] = S[E];
1087 S[E] = S[D] + T1;
1088 S[D] = S[C]; S[C] = S[B]; S[B] = S[A];
1089 S[A] = T1 + T2;
1090}
1091
1092template <unsigned int R> inline
1093void SHA256_ROUND2(uint32x4_p W[16], uint32x4_p S[8], const uint32x4_p K)
1094{
1095 // Indexes into the W[] array
1096 enum {IDX0=(R+0)&0xf, IDX1=(R+1)&0xf, IDX9=(R+9)&0xf, IDX14=(R+14)&0xf};
1097
1098 const uint32x4_p s0 = Vector_sigma0(W[IDX1]);
1099 const uint32x4_p s1 = Vector_sigma1(W[IDX14]);
1100
1101 uint32x4_p T1 = (W[IDX0] += s0 + s1 + W[IDX9]);
1102 T1 += S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K;
1103 uint32x4_p T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]);
1104
1105 S[H] = S[G]; S[G] = S[F]; S[F] = S[E];
1106 S[E] = S[D] + T1;
1107 S[D] = S[C]; S[C] = S[B]; S[B] = S[A];
1108 S[A] = T1 + T2;
1109}
1110
1111void SHA256_HashMultipleBlocks_POWER8(word32 *state, const word32 *data, size_t length, ByteOrder order)
1112{
1113 CRYPTOPP_ASSERT(state); CRYPTOPP_ASSERT(data);
1114 CRYPTOPP_ASSERT(length >= SHA256::BLOCKSIZE);
1115 CRYPTOPP_UNUSED(order);
1116
1117 const uint32_t* k = reinterpret_cast<const uint32_t*>(SHA256_K);
1118 const uint32_t* m = reinterpret_cast<const uint32_t*>(data);
1119
1120 uint32x4_p abcd = VecLoad(state+0);
1121 uint32x4_p efgh = VecLoad(state+4);
1122 uint32x4_p W[16], S[8], vm, vk;
1123
1124 size_t blocks = length / SHA256::BLOCKSIZE;
1125 while (blocks--)
1126 {
1127 unsigned int offset=0;
1128
1129 S[A] = abcd; S[E] = efgh;
1130 S[B] = VecShiftLeftOctet<4>(S[A]);
1131 S[F] = VecShiftLeftOctet<4>(S[E]);
1132 S[C] = VecShiftLeftOctet<4>(S[B]);
1133 S[G] = VecShiftLeftOctet<4>(S[F]);
1134 S[D] = VecShiftLeftOctet<4>(S[C]);
1135 S[H] = VecShiftLeftOctet<4>(S[G]);
1136
1137 // Rounds 0-16
1138 vk = VecLoad(offset, k);
1139 vm = VecLoad32(m, offset);
1140 SHA256_ROUND1<0>(W,S, vk,vm);
1141 offset+=16;
1142
1143 vk = VecShiftLeftOctet<4>(vk);
1144 vm = VecShiftLeftOctet<4>(vm);
1145 SHA256_ROUND1<1>(W,S, vk,vm);
1146
1147 vk = VecShiftLeftOctet<4>(vk);
1148 vm = VecShiftLeftOctet<4>(vm);
1149 SHA256_ROUND1<2>(W,S, vk,vm);
1150
1151 vk = VecShiftLeftOctet<4>(vk);
1152 vm = VecShiftLeftOctet<4>(vm);
1153 SHA256_ROUND1<3>(W,S, vk,vm);
1154
1155 vk = VecLoad(offset, k);
1156 vm = VecLoad32(m, offset);
1157 SHA256_ROUND1<4>(W,S, vk,vm);
1158 offset+=16;
1159
1160 vk = VecShiftLeftOctet<4>(vk);
1161 vm = VecShiftLeftOctet<4>(vm);
1162 SHA256_ROUND1<5>(W,S, vk,vm);
1163
1164 vk = VecShiftLeftOctet<4>(vk);
1165 vm = VecShiftLeftOctet<4>(vm);
1166 SHA256_ROUND1<6>(W,S, vk,vm);
1167
1168 vk = VecShiftLeftOctet<4>(vk);
1169 vm = VecShiftLeftOctet<4>(vm);
1170 SHA256_ROUND1<7>(W,S, vk,vm);
1171
1172 vk = VecLoad(offset, k);
1173 vm = VecLoad32(m, offset);
1174 SHA256_ROUND1<8>(W,S, vk,vm);
1175 offset+=16;
1176
1177 vk = VecShiftLeftOctet<4>(vk);
1178 vm = VecShiftLeftOctet<4>(vm);
1179 SHA256_ROUND1<9>(W,S, vk,vm);
1180
1181 vk = VecShiftLeftOctet<4>(vk);
1182 vm = VecShiftLeftOctet<4>(vm);
1183 SHA256_ROUND1<10>(W,S, vk,vm);
1184
1185 vk = VecShiftLeftOctet<4>(vk);
1186 vm = VecShiftLeftOctet<4>(vm);
1187 SHA256_ROUND1<11>(W,S, vk,vm);
1188
1189 vk = VecLoad(offset, k);
1190 vm = VecLoad32(m, offset);
1191 SHA256_ROUND1<12>(W,S, vk,vm);
1192 offset+=16;
1193
1194 vk = VecShiftLeftOctet<4>(vk);
1195 vm = VecShiftLeftOctet<4>(vm);
1196 SHA256_ROUND1<13>(W,S, vk,vm);
1197
1198 vk = VecShiftLeftOctet<4>(vk);
1199 vm = VecShiftLeftOctet<4>(vm);
1200 SHA256_ROUND1<14>(W,S, vk,vm);
1201
1202 vk = VecShiftLeftOctet<4>(vk);
1203 vm = VecShiftLeftOctet<4>(vm);
1204 SHA256_ROUND1<15>(W,S, vk,vm);
1205
1206 m += 16; // 32-bit words, not bytes
1207
1208 // Rounds 16-64
1209 for (unsigned int i=16; i<64; i+=16)
1210 {
1211 vk = VecLoad(offset, k);
1212 SHA256_ROUND2<0>(W,S, vk);
1213 SHA256_ROUND2<1>(W,S, VecShiftLeftOctet<4>(vk));
1214 SHA256_ROUND2<2>(W,S, VecShiftLeftOctet<8>(vk));
1215 SHA256_ROUND2<3>(W,S, VecShiftLeftOctet<12>(vk));
1216 offset+=16;
1217
1218 vk = VecLoad(offset, k);
1219 SHA256_ROUND2<4>(W,S, vk);
1220 SHA256_ROUND2<5>(W,S, VecShiftLeftOctet<4>(vk));
1221 SHA256_ROUND2<6>(W,S, VecShiftLeftOctet<8>(vk));
1222 SHA256_ROUND2<7>(W,S, VecShiftLeftOctet<12>(vk));
1223 offset+=16;
1224
1225 vk = VecLoad(offset, k);
1226 SHA256_ROUND2<8>(W,S, vk);
1227 SHA256_ROUND2<9>(W,S, VecShiftLeftOctet<4>(vk));
1228 SHA256_ROUND2<10>(W,S, VecShiftLeftOctet<8>(vk));
1229 SHA256_ROUND2<11>(W,S, VecShiftLeftOctet<12>(vk));
1230 offset+=16;
1231
1232 vk = VecLoad(offset, k);
1233 SHA256_ROUND2<12>(W,S, vk);
1234 SHA256_ROUND2<13>(W,S, VecShiftLeftOctet<4>(vk));
1235 SHA256_ROUND2<14>(W,S, VecShiftLeftOctet<8>(vk));
1236 SHA256_ROUND2<15>(W,S, VecShiftLeftOctet<12>(vk));
1237 offset+=16;
1238 }
1239
1240 abcd += VectorPack(S[A],S[B],S[C],S[D]);
1241 efgh += VectorPack(S[E],S[F],S[G],S[H]);
1242 }
1243
1244 VecStore32(abcd, state+0);
1245 VecStore32(efgh, state+4);
1246}
1247
1248inline
1249void VecStore64(const uint64x2_p val, word64* data)
1250{
1251 VecStore(val, data);
1252}
1253
1254inline
1255uint64x2_p VecLoad64(const word64* data, int offset)
1256{
1257#if (CRYPTOPP_LITTLE_ENDIAN)
1258 const uint8x16_p mask = {0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15};
1259 return VecPermute(VecLoad(offset, data), mask);
1260#else
1261 return VecLoad(offset, data);
1262#endif
1263}
1264
1265inline
1266uint64x2_p VectorCh(const uint64x2_p x, const uint64x2_p y, const uint64x2_p z)
1267{
1268 // The trick below is due to Andy Polyakov and Jack Lloyd
1269 return vec_sel(z,y,x);
1270}
1271
1272inline
1273uint64x2_p VectorMaj(const uint64x2_p x, const uint64x2_p y, const uint64x2_p z)
1274{
1275 // The trick below is due to Andy Polyakov and Jack Lloyd
1276 return vec_sel(y, z, VecXor(x, y));
1277}
1278
1279inline
1280uint64x2_p Vector_sigma0(const uint64x2_p val)
1281{
1282 return VecSHA512<0,0>(val);
1283}
1284
1285inline
1286uint64x2_p Vector_sigma1(const uint64x2_p val)
1287{
1288 return VecSHA512<0,0xf>(val);
1289}
1290
1291inline
1292uint64x2_p VectorSigma0(const uint64x2_p val)
1293{
1294 return VecSHA512<1,0>(val);
1295}
1296
1297inline
1298uint64x2_p VectorSigma1(const uint64x2_p val)
1299{
1300 return VecSHA512<1,0xf>(val);
1301}
1302
1303inline
1304uint64x2_p VectorPack(const uint64x2_p x, const uint64x2_p y)
1305{
1306 const uint8x16_p m = {0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
1307 return VecPermute(x,y,m);
1308}
1309
1310template <unsigned int R> inline
1311void SHA512_ROUND1(uint64x2_p W[16], uint64x2_p S[8], const uint64x2_p K, const uint64x2_p M)
1312{
1313 uint64x2_p T1, T2;
1314
1315 W[R] = M;
1316 T1 = S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K + M;
1317 T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]);
1318
1319 S[H] = S[G]; S[G] = S[F]; S[F] = S[E];
1320 S[E] = S[D] + T1;
1321 S[D] = S[C]; S[C] = S[B]; S[B] = S[A];
1322 S[A] = T1 + T2;
1323}
1324
1325template <unsigned int R> inline
1326void SHA512_ROUND2(uint64x2_p W[16], uint64x2_p S[8], const uint64x2_p K)
1327{
1328 // Indexes into the W[] array
1329 enum {IDX0=(R+0)&0xf, IDX1=(R+1)&0xf, IDX9=(R+9)&0xf, IDX14=(R+14)&0xf};
1330
1331 const uint64x2_p s0 = Vector_sigma0(W[IDX1]);
1332 const uint64x2_p s1 = Vector_sigma1(W[IDX14]);
1333
1334 uint64x2_p T1 = (W[IDX0] += s0 + s1 + W[IDX9]);
1335 T1 += S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K;
1336 uint64x2_p T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]);
1337
1338 S[H] = S[G]; S[G] = S[F]; S[F] = S[E];
1339 S[E] = S[D] + T1;
1340 S[D] = S[C]; S[C] = S[B]; S[B] = S[A];
1341 S[A] = T1 + T2;
1342}
1343
1344void SHA512_HashMultipleBlocks_POWER8(word64 *state, const word64 *data, size_t length, ByteOrder order)
1345{
1346 CRYPTOPP_ASSERT(state); CRYPTOPP_ASSERT(data);
1347 CRYPTOPP_ASSERT(length >= SHA512::BLOCKSIZE);
1348 CRYPTOPP_UNUSED(order);
1349
1350 const uint64_t* k = reinterpret_cast<const uint64_t*>(SHA512_K);
1351 const uint64_t* m = reinterpret_cast<const uint64_t*>(data);
1352
1353 uint64x2_p ab = VecLoad(state+0);
1354 uint64x2_p cd = VecLoad(state+2);
1355 uint64x2_p ef = VecLoad(state+4);
1356 uint64x2_p gh = VecLoad(state+6);
1357 uint64x2_p W[16], S[8], vm, vk;
1358
1359 size_t blocks = length / SHA512::BLOCKSIZE;
1360 while (blocks--)
1361 {
1362 unsigned int offset=0;
1363
1364 S[A] = ab; S[C] = cd;
1365 S[E] = ef; S[G] = gh;
1366 S[B] = VecShiftLeftOctet<8>(S[A]);
1367 S[D] = VecShiftLeftOctet<8>(S[C]);
1368 S[F] = VecShiftLeftOctet<8>(S[E]);
1369 S[H] = VecShiftLeftOctet<8>(S[G]);
1370
1371 // Rounds 0-16
1372 vk = VecLoad(offset, k);
1373 vm = VecLoad64(m, offset);
1374 SHA512_ROUND1<0>(W,S, vk,vm);
1375 offset+=16;
1376
1377 vk = VecShiftLeftOctet<8>(vk);
1378 vm = VecShiftLeftOctet<8>(vm);
1379 SHA512_ROUND1<1>(W,S, vk,vm);
1380
1381 vk = VecLoad(offset, k);
1382 vm = VecLoad64(m, offset);
1383 SHA512_ROUND1<2>(W,S, vk,vm);
1384 offset+=16;
1385
1386 vk = VecShiftLeftOctet<8>(vk);
1387 vm = VecShiftLeftOctet<8>(vm);
1388 SHA512_ROUND1<3>(W,S, vk,vm);
1389
1390 vk = VecLoad(offset, k);
1391 vm = VecLoad64(m, offset);
1392 SHA512_ROUND1<4>(W,S, vk,vm);
1393 offset+=16;
1394
1395 vk = VecShiftLeftOctet<8>(vk);
1396 vm = VecShiftLeftOctet<8>(vm);
1397 SHA512_ROUND1<5>(W,S, vk,vm);
1398
1399 vk = VecLoad(offset, k);
1400 vm = VecLoad64(m, offset);
1401 SHA512_ROUND1<6>(W,S, vk,vm);
1402 offset+=16;
1403
1404 vk = VecShiftLeftOctet<8>(vk);
1405 vm = VecShiftLeftOctet<8>(vm);
1406 SHA512_ROUND1<7>(W,S, vk,vm);
1407
1408 vk = VecLoad(offset, k);
1409 vm = VecLoad64(m, offset);
1410 SHA512_ROUND1<8>(W,S, vk,vm);
1411 offset+=16;
1412
1413 vk = VecShiftLeftOctet<8>(vk);
1414 vm = VecShiftLeftOctet<8>(vm);
1415 SHA512_ROUND1<9>(W,S, vk,vm);
1416
1417 vk = VecLoad(offset, k);
1418 vm = VecLoad64(m, offset);
1419 SHA512_ROUND1<10>(W,S, vk,vm);
1420 offset+=16;
1421
1422 vk = VecShiftLeftOctet<8>(vk);
1423 vm = VecShiftLeftOctet<8>(vm);
1424 SHA512_ROUND1<11>(W,S, vk,vm);
1425
1426 vk = VecLoad(offset, k);
1427 vm = VecLoad64(m, offset);
1428 SHA512_ROUND1<12>(W,S, vk,vm);
1429 offset+=16;
1430
1431 vk = VecShiftLeftOctet<8>(vk);
1432 vm = VecShiftLeftOctet<8>(vm);
1433 SHA512_ROUND1<13>(W,S, vk,vm);
1434
1435 vk = VecLoad(offset, k);
1436 vm = VecLoad64(m, offset);
1437 SHA512_ROUND1<14>(W,S, vk,vm);
1438 offset+=16;
1439
1440 vk = VecShiftLeftOctet<8>(vk);
1441 vm = VecShiftLeftOctet<8>(vm);
1442 SHA512_ROUND1<15>(W,S, vk,vm);
1443
1444 m += 16; // 64-bit words, not bytes
1445
1446 // Rounds 16-80
1447 for (unsigned int i=16; i<80; i+=16)
1448 {
1449 vk = VecLoad(offset, k);
1450 SHA512_ROUND2<0>(W,S, vk);
1451 SHA512_ROUND2<1>(W,S, VecShiftLeftOctet<8>(vk));
1452 offset+=16;
1453
1454 vk = VecLoad(offset, k);
1455 SHA512_ROUND2<2>(W,S, vk);
1456 SHA512_ROUND2<3>(W,S, VecShiftLeftOctet<8>(vk));
1457 offset+=16;
1458
1459 vk = VecLoad(offset, k);
1460 SHA512_ROUND2<4>(W,S, vk);
1461 SHA512_ROUND2<5>(W,S, VecShiftLeftOctet<8>(vk));
1462 offset+=16;
1463
1464 vk = VecLoad(offset, k);
1465 SHA512_ROUND2<6>(W,S, vk);
1466 SHA512_ROUND2<7>(W,S, VecShiftLeftOctet<8>(vk));
1467 offset+=16;
1468
1469 vk = VecLoad(offset, k);
1470 SHA512_ROUND2<8>(W,S, vk);
1471 SHA512_ROUND2<9>(W,S, VecShiftLeftOctet<8>(vk));
1472 offset+=16;
1473
1474 vk = VecLoad(offset, k);
1475 SHA512_ROUND2<10>(W,S, vk);
1476 SHA512_ROUND2<11>(W,S, VecShiftLeftOctet<8>(vk));
1477 offset+=16;
1478
1479 vk = VecLoad(offset, k);
1480 SHA512_ROUND2<12>(W,S, vk);
1481 SHA512_ROUND2<13>(W,S, VecShiftLeftOctet<8>(vk));
1482 offset+=16;
1483
1484 vk = VecLoad(offset, k);
1485 SHA512_ROUND2<14>(W,S, vk);
1486 SHA512_ROUND2<15>(W,S, VecShiftLeftOctet<8>(vk));
1487 offset+=16;
1488 }
1489
1490 ab += VectorPack(S[A],S[B]);
1491 cd += VectorPack(S[C],S[D]);
1492 ef += VectorPack(S[E],S[F]);
1493 gh += VectorPack(S[G],S[H]);
1494 }
1495
1496 VecStore64(ab, state+0);
1497 VecStore64(cd, state+2);
1498 VecStore64(ef, state+4);
1499 VecStore64(gh, state+6);
1500}
1501
1502#endif // CRYPTOPP_POWER8_SHA_AVAILABLE
1503
1504////////////////////////////////////////////////
1505// end Gustavo, Serra, Scalet and Walton code //
1506////////////////////////////////////////////////
1507
1508NAMESPACE_END
#define M128_CAST(x)
Clang workaround.
Definition: adv_simd.h:609
#define CONST_M128_CAST(x)
Clang workaround.
Definition: adv_simd.h:614
Library configuration file.
#define W64LIT(x)
Declare an unsigned word64.
Definition: config_int.h:119
unsigned int word32
32-bit unsigned datatype
Definition: config_int.h:62
unsigned long long word64
64-bit unsigned datatype
Definition: config_int.h:91
ByteOrder
Provides the byte ordering.
Definition: cryptlib.h:143
@ BIG_ENDIAN_ORDER
byte order is big-endian
Definition: cryptlib.h:147
Utility functions for the Crypto++ library.
Crypto++ library namespace.
Precompiled header file.
Support functions for PowerPC and vector operations.
__vector unsigned int uint32x4_p
Vector of 32-bit elements.
Definition: ppc_simd.h:202
T1 VecPermute(const T1 vec, const T2 mask)
Permutes a vector.
Definition: ppc_simd.h:1478
__vector unsigned char uint8x16_p
Vector of 8-bit elements.
Definition: ppc_simd.h:192
T1 VecXor(const T1 vec1, const T2 vec2)
XOR two vectors.
Definition: ppc_simd.h:1414
__vector unsigned long long uint64x2_p
Vector of 64-bit elements.
Definition: ppc_simd.h:212
void VecStore(const T data, byte dest[16])
Stores a vector to a byte array.
Definition: ppc_simd.h:895
uint32x4_p VecLoad(const byte src[16])
Loads a vector from a byte array.
Definition: ppc_simd.h:369
Classes for SHA-1 and SHA-2 family of message digests.
#define CRYPTOPP_ASSERT(exp)
Debugging and diagnostic assertion.
Definition: trap.h:68