Crypto++ 8.5
Free C++ class library of cryptographic schemes
lea_simd.cpp
1// lea_simd.cpp - written and placed in the public domain by Jeffrey Walton
2//
3// This source file uses intrinsics and built-ins to gain access to
4// SSSE3, ARM NEON and ARMv8a, and Power8 Altivec instructions. A separate
5// source file is needed because additional CXXFLAGS are required to enable
// the appropriate instruction sets in some build configurations.
7
8#include "pch.h"
9#include "config.h"
10
11#include "lea.h"
12#include "misc.h"
13
14// Uncomment for benchmarking C++ against SSE or NEON.
// Do so in both lea.cpp and lea_simd.cpp.
16// #undef CRYPTOPP_SSSE3_AVAILABLE
17// #undef CRYPTOPP_ARM_NEON_AVAILABLE
18
19#if (CRYPTOPP_SSSE3_AVAILABLE)
20# include "adv_simd.h"
21# include <pmmintrin.h>
22# include <tmmintrin.h>
23#endif
24
25#if defined(__XOP__)
26# include <ammintrin.h>
27# if defined(__GNUC__)
28# include <x86intrin.h>
29# endif
30#endif
31
32#if defined(__AVX512F__)
33# define CRYPTOPP_AVX512_ROTATE 1
34# include <immintrin.h>
35#endif
36
37#if (CRYPTOPP_ARM_NEON_HEADER)
38# include "adv_simd.h"
39# include <arm_neon.h>
40#endif
41
42#if (CRYPTOPP_ARM_ACLE_HEADER)
43# include <stdint.h>
44# include <arm_acle.h>
45#endif
46
47#if defined(_M_ARM64)
48# include "adv_simd.h"
49#endif
50
51// Do not port this to POWER architecture. Naively we hoped
52// for a 2x to 3x speedup. The result was a 5x slow down.
53// The table below shows MiB/s and cpb.
54//
55// C++:
56// <TD>LEA-128(128)/CTR (128-bit key)<TD>C++<TD>207<TD>15.64
57// <TD>LEA-128(192)/CTR (192-bit key)<TD>C++<TD>186<TD>17.48
58// <TD>LEA-128(256)/CTR (256-bit key)<TD>C++<TD>124<TD>26.2
59//
60// Power8:
61// <TD>LEA-128(128)/CTR (128-bit key)<TD>Power8<TD>37<TD>88.7
62// <TD>LEA-128(192)/CTR (192-bit key)<TD>Power8<TD>40<TD>82.1
63// <TD>LEA-128(256)/CTR (256-bit key)<TD>Power8<TD>28<TD>116.0
64
65#undef CRYPTOPP_POWER8_AVAILABLE
66#if defined(CRYPTOPP_POWER8_AVAILABLE)
67# include "adv_simd.h"
68# include "ppc_simd.h"
69#endif
70
71// Squash MS LNK4221 and libtool warnings
72extern const char LEA_SIMD_FNAME[] = __FILE__;
73
74ANONYMOUS_NAMESPACE_BEGIN
75
77
78// *************************** ARM NEON ***************************//
79
80#if (CRYPTOPP_ARM_NEON_AVAILABLE)
81
82inline uint32x4_t Xor(const uint32x4_t& a, const uint32x4_t& b)
83{
84 return veorq_u32(a, b);
85}
86
87inline uint32x4_t Add(const uint32x4_t& a, const uint32x4_t& b)
88{
89 return vaddq_u32(a, b);
90}
91
92inline uint32x4_t Sub(const uint32x4_t& a, const uint32x4_t& b)
93{
94 return vsubq_u32(a, b);
95}
96
97template <unsigned int R>
98inline uint32x4_t RotateLeft(const uint32x4_t& val)
99{
100 const uint32x4_t a(vshlq_n_u32(val, R));
101 const uint32x4_t b(vshrq_n_u32(val, 32 - R));
102 return vorrq_u32(a, b);
103}
104
105template <unsigned int R>
106inline uint32x4_t RotateRight(const uint32x4_t& val)
107{
108 const uint32x4_t a(vshlq_n_u32(val, 32 - R));
109 const uint32x4_t b(vshrq_n_u32(val, R));
110 return vorrq_u32(a, b);
111}
112
113#if defined(__aarch32__) || defined(__aarch64__)
114template <>
115inline uint32x4_t RotateLeft<8>(const uint32x4_t& val)
116{
117#if (CRYPTOPP_BIG_ENDIAN)
118 const uint8_t maskb[16] = { 14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3 };
119 const uint8x16_t mask = vld1q_u8(maskb);
120#else
121 const uint8_t maskb[16] = { 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 };
122 const uint8x16_t mask = vld1q_u8(maskb);
123#endif
124
125 return vreinterpretq_u32_u8(
126 vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
127}
128
129template <>
130inline uint32x4_t RotateRight<8>(const uint32x4_t& val)
131{
132#if (CRYPTOPP_BIG_ENDIAN)
133 const uint8_t maskb[16] = { 12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1 };
134 const uint8x16_t mask = vld1q_u8(maskb);
135#else
136 const uint8_t maskb[16] = { 1,2,3,0, 5,6,7,4, 9,10,11,8, 13,14,14,12 };
137 const uint8x16_t mask = vld1q_u8(maskb);
138#endif
139
140 return vreinterpretq_u32_u8(
141 vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
142}
143#endif
144
145uint32x4_t UnpackLow32(uint32x4_t a, uint32x4_t b)
146{
147 uint32x2_t a1 = vget_low_u32(a);
148 uint32x2_t b1 = vget_low_u32(b);
149 uint32x2x2_t result = vzip_u32(a1, b1);
150 return vcombine_u32(result.val[0], result.val[1]);
151}
152
153uint32x4_t UnpackHigh32(uint32x4_t a, uint32x4_t b)
154{
155 uint32x2_t a1 = vget_high_u32(a);
156 uint32x2_t b1 = vget_high_u32(b);
157 uint32x2x2_t result = vzip_u32(a1, b1);
158 return vcombine_u32(result.val[0], result.val[1]);
159}
160
161uint32x4_t UnpackLow64(uint32x4_t a, uint32x4_t b)
162{
163 uint64x1_t a1 = vget_low_u64((uint64x2_t)a);
164 uint64x1_t b1 = vget_low_u64((uint64x2_t)b);
165 return (uint32x4_t)vcombine_u64(a1, b1);
166}
167
168uint32x4_t UnpackHigh64(uint32x4_t a, uint32x4_t b)
169{
170 uint64x1_t a1 = vget_high_u64((uint64x2_t)a);
171 uint64x1_t b1 = vget_high_u64((uint64x2_t)b);
172 return (uint32x4_t)vcombine_u64(a1, b1);
173}
174
175template <unsigned int IDX>
176inline uint32x4_t LoadKey(const word32 rkey[])
177{
178 return vdupq_n_u32(rkey[IDX]);
179}
180
181template <unsigned int IDX>
182inline uint32x4_t UnpackNEON(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d)
183{
184 // Should not be instantiated
186
187 CRYPTOPP_UNUSED(a); CRYPTOPP_UNUSED(b);
188 CRYPTOPP_UNUSED(c); CRYPTOPP_UNUSED(d);
189 return vmovq_n_u32(0);
190}
191
192template <>
193inline uint32x4_t UnpackNEON<0>(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d)
194{
195 const uint32x4_t r1 = UnpackLow32(a, b);
196 const uint32x4_t r2 = UnpackLow32(c, d);
197 return UnpackLow64(r1, r2);
198}
199
200template <>
201inline uint32x4_t UnpackNEON<1>(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d)
202{
203 const uint32x4_t r1 = UnpackLow32(a, b);
204 const uint32x4_t r2 = UnpackLow32(c, d);
205 return UnpackHigh64(r1, r2);
206}
207
208template <>
209inline uint32x4_t UnpackNEON<2>(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d)
210{
211 const uint32x4_t r1 = UnpackHigh32(a, b);
212 const uint32x4_t r2 = UnpackHigh32(c, d);
213 return UnpackLow64(r1, r2);
214}
215
216template <>
217inline uint32x4_t UnpackNEON<3>(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d)
218{
219 const uint32x4_t r1 = UnpackHigh32(a, b);
220 const uint32x4_t r2 = UnpackHigh32(c, d);
221 return UnpackHigh64(r1, r2);
222}
223
224template <unsigned int IDX>
225inline uint32x4_t UnpackNEON(const uint32x4_t& v)
226{
227 // Should not be instantiated
229
230 CRYPTOPP_UNUSED(v);
231 return vmovq_n_u32(0);
232}
233
234template <>
235inline uint32x4_t UnpackNEON<0>(const uint32x4_t& v)
236{
237 // Splat to all lanes
238 return vdupq_n_u32(vgetq_lane_u32(v, 0));
239}
240
241template <>
242inline uint32x4_t UnpackNEON<1>(const uint32x4_t& v)
243{
244 // Splat to all lanes
245 return vdupq_n_u32(vgetq_lane_u32(v, 1));
246}
247
248template <>
249inline uint32x4_t UnpackNEON<2>(const uint32x4_t& v)
250{
251 // Splat to all lanes
252 return vdupq_n_u32(vgetq_lane_u32(v, 2));
253}
254
255template <>
256inline uint32x4_t UnpackNEON<3>(const uint32x4_t& v)
257{
258 // Splat to all lanes
259 return vdupq_n_u32(vgetq_lane_u32(v, 3));
260}
261
262template <unsigned int IDX>
263inline uint32x4_t RepackNEON(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d)
264{
265 return UnpackNEON<IDX>(a, b, c, d);
266}
267
268template <unsigned int IDX>
269inline uint32x4_t RepackNEON(const uint32x4_t& v)
270{
271 return UnpackNEON<IDX>(v);
272}
273
274#endif // CRYPTOPP_ARM_NEON_AVAILABLE
275
276// *************************** IA-32 ***************************//
277
278#if (CRYPTOPP_SSSE3_AVAILABLE)
279
280inline __m128i Xor(const __m128i& a, const __m128i& b)
281{
282 return _mm_xor_si128(a, b);
283}
284
285inline __m128i Add(const __m128i& a, const __m128i& b)
286{
287 return _mm_add_epi32(a, b);
288}
289
290inline __m128i Sub(const __m128i& a, const __m128i& b)
291{
292 return _mm_sub_epi32(a, b);
293}
294
295template <unsigned int R>
296inline __m128i RotateLeft(const __m128i& val)
297{
298#if defined(__XOP__)
299 return _mm_roti_epi32(val, R);
300#else
301 return _mm_or_si128(
302 _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
303#endif
304}
305
306template <unsigned int R>
307inline __m128i RotateRight(const __m128i& val)
308{
309#if defined(__XOP__)
310 return _mm_roti_epi32(val, 32-R);
311#else
312 return _mm_or_si128(
313 _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
314#endif
315}
316
317// Faster than two Shifts and an Or.
318template <>
319inline __m128i RotateLeft<8>(const __m128i& val)
320{
321#if defined(__XOP__)
322 return _mm_roti_epi32(val, 8);
323#else
324 const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
325 return _mm_shuffle_epi8(val, mask);
326#endif
327}
328
329// Faster than two Shifts and an Or.
330template <>
331inline __m128i RotateRight<8>(const __m128i& val)
332{
333#if defined(__XOP__)
334 return _mm_roti_epi32(val, 32-8);
335#else
336 const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1);
337 return _mm_shuffle_epi8(val, mask);
338#endif
339}
340
341template <unsigned int IDX>
342inline __m128i LoadKey(const word32 rkey[])
343{
344 float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk));
345 return _mm_castps_si128(_mm_load_ps1(&rk));
346}
347
348template <unsigned int IDX>
349inline __m128i UnpackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
350{
351 // Should not be instantiated
352 CRYPTOPP_UNUSED(a); CRYPTOPP_UNUSED(b);
353 CRYPTOPP_UNUSED(c); CRYPTOPP_UNUSED(d);
355 return _mm_setzero_si128();
356}
357
358template <>
359inline __m128i UnpackXMM<0>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
360{
361 // LEA is little-endian oriented, so there is no need for a separate shuffle.
362 const __m128i r1 = _mm_unpacklo_epi32(a, b);
363 const __m128i r2 = _mm_unpacklo_epi32(c, d);
364 return _mm_unpacklo_epi64(r1, r2);
365}
366
367template <>
368inline __m128i UnpackXMM<1>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
369{
370 // LEA is little-endian oriented, so there is no need for a separate shuffle.
371 const __m128i r1 = _mm_unpacklo_epi32(a, b);
372 const __m128i r2 = _mm_unpacklo_epi32(c, d);
373 return _mm_unpackhi_epi64(r1, r2);
374}
375
376template <>
377inline __m128i UnpackXMM<2>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
378{
379 // LEA is little-endian oriented, so there is no need for a separate shuffle.
380 const __m128i r1 = _mm_unpackhi_epi32(a, b);
381 const __m128i r2 = _mm_unpackhi_epi32(c, d);
382 return _mm_unpacklo_epi64(r1, r2);
383}
384
385template <>
386inline __m128i UnpackXMM<3>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
387{
388 // LEA is little-endian oriented, so there is no need for a separate shuffle.
389 const __m128i r1 = _mm_unpackhi_epi32(a, b);
390 const __m128i r2 = _mm_unpackhi_epi32(c, d);
391 return _mm_unpackhi_epi64(r1, r2);
392}
393
394template <unsigned int IDX>
395inline __m128i UnpackXMM(const __m128i& v)
396{
397 // Should not be instantiated
398 CRYPTOPP_UNUSED(v); CRYPTOPP_ASSERT(0);
399 return _mm_setzero_si128();
400}
401
402template <>
403inline __m128i UnpackXMM<0>(const __m128i& v)
404{
405 // Splat to all lanes
406 return _mm_shuffle_epi8(v, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
407}
408
409template <>
410inline __m128i UnpackXMM<1>(const __m128i& v)
411{
412 // Splat to all lanes
413 return _mm_shuffle_epi8(v, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
414}
415
416template <>
417inline __m128i UnpackXMM<2>(const __m128i& v)
418{
419 // Splat to all lanes
420 return _mm_shuffle_epi8(v, _mm_set_epi8(11,10,9,8, 11,10,9,8, 11,10,9,8, 11,10,9,8));
421}
422
423template <>
424inline __m128i UnpackXMM<3>(const __m128i& v)
425{
426 // Splat to all lanes
427 return _mm_shuffle_epi8(v, _mm_set_epi8(15,14,13,12, 15,14,13,12, 15,14,13,12, 15,14,13,12));
428}
429
430template <unsigned int IDX>
431inline __m128i RepackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
432{
433 return UnpackXMM<IDX>(a, b, c, d);
434}
435
436template <unsigned int IDX>
437inline __m128i RepackXMM(const __m128i& v)
438{
439 return UnpackXMM<IDX>(v);
440}
441
442#endif // CRYPTOPP_SSSE3_AVAILABLE
443
444// *************************** Power8 ***************************//
445
446#if (CRYPTOPP_POWER8_AVAILABLE)
447
451
452inline uint32x4_p Xor(const uint32x4_p& a, const uint32x4_p& b)
453{
454 return VecXor(a, b);
455}
456
457inline uint32x4_p Add(const uint32x4_p& a, const uint32x4_p& b)
458{
459 return VecAdd(a, b);
460}
461
462inline uint32x4_p Sub(const uint32x4_p& a, const uint32x4_p& b)
463{
464 return VecSub(a, b);
465}
466
467template <unsigned int R>
468inline uint32x4_p RotateLeft(const uint32x4_p& val)
469{
470 const uint32x4_p m = {R, R, R, R};
471 return vec_rl(val, m);
472}
473
474template <unsigned int R>
475inline uint32x4_p RotateRight(const uint32x4_p& val)
476{
477 const uint32x4_p m = {32-R, 32-R, 32-R, 32-R};
478 return vec_rl(val, m);
479}
480
481template <unsigned int IDX>
482inline uint32x4_p LoadKey(const word32 rkey[])
483{
484 return vec_splats(rkey[IDX]);
485}
486
487template <unsigned int IDX>
488inline uint32x4_p UnpackSIMD(const uint32x4_p& a, const uint32x4_p& b, const uint32x4_p& c, const uint32x4_p& d)
489{
490 // Should not be instantiated
491 CRYPTOPP_UNUSED(a); CRYPTOPP_UNUSED(b);
492 CRYPTOPP_UNUSED(c); CRYPTOPP_UNUSED(d);
494 return VecXor(a, a);
495}
496
497template <>
498inline uint32x4_p UnpackSIMD<0>(const uint32x4_p& a, const uint32x4_p& b, const uint32x4_p& c, const uint32x4_p& d)
499{
500 const uint64x2_p r1 = (uint64x2_p)vec_mergel(a, b);
501 const uint64x2_p r2 = (uint64x2_p)vec_mergel(c, d);
502 return (uint32x4_p)vec_mergel(r1, r2);
503}
504
505template <>
506inline uint32x4_p UnpackSIMD<1>(const uint32x4_p& a, const uint32x4_p& b, const uint32x4_p& c, const uint32x4_p& d)
507{
508 const uint64x2_p r1 = (uint64x2_p)vec_mergel(a, b);
509 const uint64x2_p r2 = (uint64x2_p)vec_mergel(c, d);
510 return (uint32x4_p)vec_mergeh(r1, r2);
511}
512
513template <>
514inline uint32x4_p UnpackSIMD<2>(const uint32x4_p& a, const uint32x4_p& b, const uint32x4_p& c, const uint32x4_p& d)
515{
516 const uint64x2_p r1 = (uint64x2_p)vec_mergeh(a, b);
517 const uint64x2_p r2 = (uint64x2_p)vec_mergeh(c, d);
518 return (uint32x4_p)vec_mergel(r1, r2);
519}
520
521template <>
522inline uint32x4_p UnpackSIMD<3>(const uint32x4_p& a, const uint32x4_p& b, const uint32x4_p& c, const uint32x4_p& d)
523{
524 const uint64x2_p r1 = (uint64x2_p)vec_mergeh(a, b);
525 const uint64x2_p r2 = (uint64x2_p)vec_mergeh(c, d);
526 return (uint32x4_p)vec_mergeh(r1, r2);
527}
528
529template <unsigned int IDX>
530inline uint32x4_p UnpackSIMD(const uint32x4_p& v)
531{
532 // Should not be instantiated
534 return VecXor(v, v);
535}
536
537template <>
538inline uint32x4_p UnpackSIMD<0>(const uint32x4_p& v)
539{
540 // Splat to all lanes
541 const uint8x16_p m = {3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0};
542 return (uint32x4_p)VecPermute(v, v, m);
543}
544
545template <>
546inline uint32x4_p UnpackSIMD<1>(const uint32x4_p& v)
547{
548 // Splat to all lanes
549 const uint8x16_p m = {7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4};
550 return (uint32x4_p)VecPermute(v, v, m);
551}
552
553template <>
554inline uint32x4_p UnpackSIMD<2>(const uint32x4_p& v)
555{
556 // Splat to all lanes
557 const uint8x16_p m = {11,10,9,8, 11,10,9,8, 11,10,9,8, 11,10,9,8};
558 return (uint32x4_p)VecPermute(v, v, m);
559}
560
561template <>
562inline uint32x4_p UnpackSIMD<3>(const uint32x4_p& v)
563{
564 // Splat to all lanes
565 const uint8x16_p m = {15,14,13,12, 15,14,13,12, 15,14,13,12, 15,14,13,12};
566 return (uint32x4_p)VecPermute(v, v, m);
567}
568
569template <unsigned int IDX>
570inline uint32x4_p RepackSIMD(const uint32x4_p& a, const uint32x4_p& b, const uint32x4_p& c, const uint32x4_p& d)
571{
572 return UnpackSIMD<IDX>(a, b, c, d);
573}
574
575template <unsigned int IDX>
576inline uint32x4_p RepackSIMD(const uint32x4_p& v)
577{
578 return UnpackSIMD<IDX>(v);
579}
580
581#endif // CRYPTOPP_POWER8_AVAILABLE
582
583// *************************** LEA Encryption ***************************//
584
585#if (CRYPTOPP_ARM_NEON_AVAILABLE || CRYPTOPP_SSSE3_AVAILABLE)
586
587template <class W>
588inline void LEA_Encryption(W temp[4], const word32 *subkeys, unsigned int rounds)
589{
590 temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<4>(subkeys)), Xor(temp[3], LoadKey<5>(subkeys))));
591 temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<2>(subkeys)), Xor(temp[2], LoadKey<3>(subkeys))));
592 temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<0>(subkeys)), Xor(temp[1], LoadKey<1>(subkeys))));
593 temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<10>(subkeys)), Xor(temp[0], LoadKey<11>(subkeys))));
594 temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<8>(subkeys)), Xor(temp[3], LoadKey<9>(subkeys))));
595 temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<6>(subkeys)), Xor(temp[2], LoadKey<7>(subkeys))));
596 temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<16>(subkeys)), Xor(temp[1], LoadKey<17>(subkeys))));
597 temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<14>(subkeys)), Xor(temp[0], LoadKey<15>(subkeys))));
598 temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<12>(subkeys)), Xor(temp[3], LoadKey<13>(subkeys))));
599 temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<22>(subkeys)), Xor(temp[2], LoadKey<23>(subkeys))));
600 temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<20>(subkeys)), Xor(temp[1], LoadKey<21>(subkeys))));
601 temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<18>(subkeys)), Xor(temp[0], LoadKey<19>(subkeys))));
602
603 temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<28>(subkeys)), Xor(temp[3], LoadKey<29>(subkeys))));
604 temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<26>(subkeys)), Xor(temp[2], LoadKey<27>(subkeys))));
605 temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<24>(subkeys)), Xor(temp[1], LoadKey<25>(subkeys))));
606 temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<34>(subkeys)), Xor(temp[0], LoadKey<35>(subkeys))));
607 temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<32>(subkeys)), Xor(temp[3], LoadKey<33>(subkeys))));
608 temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<30>(subkeys)), Xor(temp[2], LoadKey<31>(subkeys))));
609 temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<40>(subkeys)), Xor(temp[1], LoadKey<41>(subkeys))));
610 temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<38>(subkeys)), Xor(temp[0], LoadKey<39>(subkeys))));
611 temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<36>(subkeys)), Xor(temp[3], LoadKey<37>(subkeys))));
612 temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<46>(subkeys)), Xor(temp[2], LoadKey<47>(subkeys))));
613 temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<44>(subkeys)), Xor(temp[1], LoadKey<45>(subkeys))));
614 temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<42>(subkeys)), Xor(temp[0], LoadKey<43>(subkeys))));
615
616 temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<52>(subkeys)), Xor(temp[3], LoadKey<53>(subkeys))));
617 temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<50>(subkeys)), Xor(temp[2], LoadKey<51>(subkeys))));
618 temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<48>(subkeys)), Xor(temp[1], LoadKey<49>(subkeys))));
619 temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<58>(subkeys)), Xor(temp[0], LoadKey<59>(subkeys))));
620 temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<56>(subkeys)), Xor(temp[3], LoadKey<57>(subkeys))));
621 temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<54>(subkeys)), Xor(temp[2], LoadKey<55>(subkeys))));
622 temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<64>(subkeys)), Xor(temp[1], LoadKey<65>(subkeys))));
623 temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<62>(subkeys)), Xor(temp[0], LoadKey<63>(subkeys))));
624 temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<60>(subkeys)), Xor(temp[3], LoadKey<61>(subkeys))));
625 temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<70>(subkeys)), Xor(temp[2], LoadKey<71>(subkeys))));
626 temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<68>(subkeys)), Xor(temp[1], LoadKey<69>(subkeys))));
627 temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<66>(subkeys)), Xor(temp[0], LoadKey<67>(subkeys))));
628
629 temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<76>(subkeys)), Xor(temp[3], LoadKey<77>(subkeys))));
630 temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<74>(subkeys)), Xor(temp[2], LoadKey<75>(subkeys))));
631 temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<72>(subkeys)), Xor(temp[1], LoadKey<73>(subkeys))));
632 temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<82>(subkeys)), Xor(temp[0], LoadKey<83>(subkeys))));
633 temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<80>(subkeys)), Xor(temp[3], LoadKey<81>(subkeys))));
634 temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<78>(subkeys)), Xor(temp[2], LoadKey<79>(subkeys))));
635 temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<88>(subkeys)), Xor(temp[1], LoadKey<89>(subkeys))));
636 temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<86>(subkeys)), Xor(temp[0], LoadKey<87>(subkeys))));
637 temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<84>(subkeys)), Xor(temp[3], LoadKey<85>(subkeys))));
638 temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<94>(subkeys)), Xor(temp[2], LoadKey<95>(subkeys))));
639 temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<92>(subkeys)), Xor(temp[1], LoadKey<93>(subkeys))));
640 temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<90>(subkeys)), Xor(temp[0], LoadKey<91>(subkeys))));
641
642 temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<100>(subkeys)), Xor(temp[3], LoadKey<101>(subkeys))));
643 temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<98>(subkeys)), Xor(temp[2], LoadKey<99>(subkeys))));
644 temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<96>(subkeys)), Xor(temp[1], LoadKey<97>(subkeys))));
645 temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<106>(subkeys)), Xor(temp[0], LoadKey<107>(subkeys))));
646 temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<104>(subkeys)), Xor(temp[3], LoadKey<105>(subkeys))));
647 temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<102>(subkeys)), Xor(temp[2], LoadKey<103>(subkeys))));
648 temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<112>(subkeys)), Xor(temp[1], LoadKey<113>(subkeys))));
649 temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<110>(subkeys)), Xor(temp[0], LoadKey<111>(subkeys))));
650 temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<108>(subkeys)), Xor(temp[3], LoadKey<109>(subkeys))));
651 temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<118>(subkeys)), Xor(temp[2], LoadKey<119>(subkeys))));
652 temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<116>(subkeys)), Xor(temp[1], LoadKey<117>(subkeys))));
653 temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<114>(subkeys)), Xor(temp[0], LoadKey<115>(subkeys))));
654
655 temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<124>(subkeys)), Xor(temp[3], LoadKey<125>(subkeys))));
656 temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<122>(subkeys)), Xor(temp[2], LoadKey<123>(subkeys))));
657 temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<120>(subkeys)), Xor(temp[1], LoadKey<121>(subkeys))));
658 temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<130>(subkeys)), Xor(temp[0], LoadKey<131>(subkeys))));
659 temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<128>(subkeys)), Xor(temp[3], LoadKey<129>(subkeys))));
660 temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<126>(subkeys)), Xor(temp[2], LoadKey<127>(subkeys))));
661 temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<136>(subkeys)), Xor(temp[1], LoadKey<137>(subkeys))));
662 temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<134>(subkeys)), Xor(temp[0], LoadKey<135>(subkeys))));
663 temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<132>(subkeys)), Xor(temp[3], LoadKey<133>(subkeys))));
664 temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<142>(subkeys)), Xor(temp[2], LoadKey<143>(subkeys))));
665 temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<140>(subkeys)), Xor(temp[1], LoadKey<141>(subkeys))));
666 temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<138>(subkeys)), Xor(temp[0], LoadKey<139>(subkeys))));
667
668 if(rounds > 24)
669 {
670 temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<148>(subkeys)), Xor(temp[3], LoadKey<149>(subkeys))));
671 temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<146>(subkeys)), Xor(temp[2], LoadKey<147>(subkeys))));
672 temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<144>(subkeys)), Xor(temp[1], LoadKey<145>(subkeys))));
673 temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<154>(subkeys)), Xor(temp[0], LoadKey<155>(subkeys))));
674 temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<152>(subkeys)), Xor(temp[3], LoadKey<153>(subkeys))));
675 temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<150>(subkeys)), Xor(temp[2], LoadKey<151>(subkeys))));
676 temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<160>(subkeys)), Xor(temp[1], LoadKey<161>(subkeys))));
677 temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<158>(subkeys)), Xor(temp[0], LoadKey<159>(subkeys))));
678 temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<156>(subkeys)), Xor(temp[3], LoadKey<157>(subkeys))));
679 temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<166>(subkeys)), Xor(temp[2], LoadKey<167>(subkeys))));
680 temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<164>(subkeys)), Xor(temp[1], LoadKey<165>(subkeys))));
681 temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<162>(subkeys)), Xor(temp[0], LoadKey<163>(subkeys))));
682 }
683
684 if(rounds > 28)
685 {
686 temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<172>(subkeys)), Xor(temp[3], LoadKey<173>(subkeys))));
687 temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<170>(subkeys)), Xor(temp[2], LoadKey<171>(subkeys))));
688 temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<168>(subkeys)), Xor(temp[1], LoadKey<169>(subkeys))));
689 temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<178>(subkeys)), Xor(temp[0], LoadKey<179>(subkeys))));
690 temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<176>(subkeys)), Xor(temp[3], LoadKey<177>(subkeys))));
691 temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<174>(subkeys)), Xor(temp[2], LoadKey<175>(subkeys))));
692 temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<184>(subkeys)), Xor(temp[1], LoadKey<185>(subkeys))));
693 temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<182>(subkeys)), Xor(temp[0], LoadKey<183>(subkeys))));
694 temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<180>(subkeys)), Xor(temp[3], LoadKey<181>(subkeys))));
695 temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<190>(subkeys)), Xor(temp[2], LoadKey<191>(subkeys))));
696 temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<188>(subkeys)), Xor(temp[1], LoadKey<189>(subkeys))));
697 temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<186>(subkeys)), Xor(temp[0], LoadKey<187>(subkeys))));
698 }
699}
700
701// *************************** LEA Decryption ***************************//
702
703template <class W>
704inline void LEA_Decryption(W temp[4], const word32 *subkeys, unsigned int rounds)
705{
706 if(rounds > 28)
707 {
708 temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<186>(subkeys))), LoadKey<187>(subkeys));
709 temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<188>(subkeys))), LoadKey<189>(subkeys));
710 temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<190>(subkeys))), LoadKey<191>(subkeys));
711 temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<180>(subkeys))), LoadKey<181>(subkeys));
712 temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<182>(subkeys))), LoadKey<183>(subkeys));
713 temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<184>(subkeys))), LoadKey<185>(subkeys));
714 temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<174>(subkeys))), LoadKey<175>(subkeys));
715 temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<176>(subkeys))), LoadKey<177>(subkeys));
716 temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<178>(subkeys))), LoadKey<179>(subkeys));
717 temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<168>(subkeys))), LoadKey<169>(subkeys));
718 temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<170>(subkeys))), LoadKey<171>(subkeys));
719 temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<172>(subkeys))), LoadKey<173>(subkeys));
720 }
721
722 if(rounds > 24)
723 {
724 temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<162>(subkeys))), LoadKey<163>(subkeys));
725 temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<164>(subkeys))), LoadKey<165>(subkeys));
726 temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<166>(subkeys))), LoadKey<167>(subkeys));
727 temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<156>(subkeys))), LoadKey<157>(subkeys));
728 temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<158>(subkeys))), LoadKey<159>(subkeys));
729 temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<160>(subkeys))), LoadKey<161>(subkeys));
730 temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<150>(subkeys))), LoadKey<151>(subkeys));
731 temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<152>(subkeys))), LoadKey<153>(subkeys));
732 temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<154>(subkeys))), LoadKey<155>(subkeys));
733 temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<144>(subkeys))), LoadKey<145>(subkeys));
734 temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<146>(subkeys))), LoadKey<147>(subkeys));
735 temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<148>(subkeys))), LoadKey<149>(subkeys));
736 }
737
738 temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<138>(subkeys))), LoadKey<139>(subkeys));
739 temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<140>(subkeys))), LoadKey<141>(subkeys));
740 temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<142>(subkeys))), LoadKey<143>(subkeys));
741 temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<132>(subkeys))), LoadKey<133>(subkeys));
742 temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<134>(subkeys))), LoadKey<135>(subkeys));
743 temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<136>(subkeys))), LoadKey<137>(subkeys));
744 temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<126>(subkeys))), LoadKey<127>(subkeys));
745 temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<128>(subkeys))), LoadKey<129>(subkeys));
746 temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<130>(subkeys))), LoadKey<131>(subkeys));
747 temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<120>(subkeys))), LoadKey<121>(subkeys));
748 temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<122>(subkeys))), LoadKey<123>(subkeys));
749 temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<124>(subkeys))), LoadKey<125>(subkeys));
750
751 temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<114>(subkeys))), LoadKey<115>(subkeys));
752 temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<116>(subkeys))), LoadKey<117>(subkeys));
753 temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<118>(subkeys))), LoadKey<119>(subkeys));
754 temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<108>(subkeys))), LoadKey<109>(subkeys));
755 temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<110>(subkeys))), LoadKey<111>(subkeys));
756 temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<112>(subkeys))), LoadKey<113>(subkeys));
757 temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<102>(subkeys))), LoadKey<103>(subkeys));
758 temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<104>(subkeys))), LoadKey<105>(subkeys));
759 temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<106>(subkeys))), LoadKey<107>(subkeys));
760 temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<96>(subkeys))), LoadKey<97>(subkeys));
761 temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<98>(subkeys))), LoadKey<99>(subkeys));
762 temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<100>(subkeys))), LoadKey<101>(subkeys));
763
764 temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<90>(subkeys))), LoadKey<91>(subkeys));
765 temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<92>(subkeys))), LoadKey<93>(subkeys));
766 temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<94>(subkeys))), LoadKey<95>(subkeys));
767 temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<84>(subkeys))), LoadKey<85>(subkeys));
768 temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<86>(subkeys))), LoadKey<87>(subkeys));
769 temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<88>(subkeys))), LoadKey<89>(subkeys));
770 temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<78>(subkeys))), LoadKey<79>(subkeys));
771 temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<80>(subkeys))), LoadKey<81>(subkeys));
772 temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<82>(subkeys))), LoadKey<83>(subkeys));
773 temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<72>(subkeys))), LoadKey<73>(subkeys));
774 temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<74>(subkeys))), LoadKey<75>(subkeys));
775 temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<76>(subkeys))), LoadKey<77>(subkeys));
776
777 temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<66>(subkeys))), LoadKey<67>(subkeys));
778 temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<68>(subkeys))), LoadKey<69>(subkeys));
779 temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<70>(subkeys))), LoadKey<71>(subkeys));
780 temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<60>(subkeys))), LoadKey<61>(subkeys));
781 temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<62>(subkeys))), LoadKey<63>(subkeys));
782 temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<64>(subkeys))), LoadKey<65>(subkeys));
783 temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<54>(subkeys))), LoadKey<55>(subkeys));
784 temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<56>(subkeys))), LoadKey<57>(subkeys));
785 temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<58>(subkeys))), LoadKey<59>(subkeys));
786 temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<48>(subkeys))), LoadKey<49>(subkeys));
787 temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<50>(subkeys))), LoadKey<51>(subkeys));
788 temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<52>(subkeys))), LoadKey<53>(subkeys));
789
790 temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<42>(subkeys))), LoadKey<43>(subkeys));
791 temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<44>(subkeys))), LoadKey<45>(subkeys));
792 temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<46>(subkeys))), LoadKey<47>(subkeys));
793 temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<36>(subkeys))), LoadKey<37>(subkeys));
794 temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<38>(subkeys))), LoadKey<39>(subkeys));
795 temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<40>(subkeys))), LoadKey<41>(subkeys));
796 temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<30>(subkeys))), LoadKey<31>(subkeys));
797 temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<32>(subkeys))), LoadKey<33>(subkeys));
798 temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<34>(subkeys))), LoadKey<35>(subkeys));
799 temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<24>(subkeys))), LoadKey<25>(subkeys));
800 temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<26>(subkeys))), LoadKey<27>(subkeys));
801 temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<28>(subkeys))), LoadKey<29>(subkeys));
802
803 temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<18>(subkeys))), LoadKey<19>(subkeys));
804 temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<20>(subkeys))), LoadKey<21>(subkeys));
805 temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<22>(subkeys))), LoadKey<23>(subkeys));
806 temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<12>(subkeys))), LoadKey<13>(subkeys));
807 temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<14>(subkeys))), LoadKey<15>(subkeys));
808 temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<16>(subkeys))), LoadKey<17>(subkeys));
809 temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<6>(subkeys))), LoadKey<7>(subkeys));
810 temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<8>(subkeys))), LoadKey<9>(subkeys));
811 temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<10>(subkeys))), LoadKey<11>(subkeys));
812 temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<0>(subkeys))), LoadKey<1>(subkeys));
813 temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<2>(subkeys))), LoadKey<3>(subkeys));
814 temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<4>(subkeys))), LoadKey<5>(subkeys));
815}
816
817#endif // LEA Encryption and Decryption
818
819// *************************** ARM NEON ***************************//
820
821#if (CRYPTOPP_ARM_NEON_AVAILABLE)
822
823inline void LEA_Enc_Block(uint32x4_t &block0,
824 const word32 *subkeys, unsigned int rounds)
825{
826 uint32x4_t temp[4];
827 temp[0] = UnpackNEON<0>(block0);
828 temp[1] = UnpackNEON<1>(block0);
829 temp[2] = UnpackNEON<2>(block0);
830 temp[3] = UnpackNEON<3>(block0);
831
832 LEA_Encryption(temp, subkeys, rounds);
833
834 block0 = RepackNEON<0>(temp[0], temp[1], temp[2], temp[3]);
835}
836
837inline void LEA_Dec_Block(uint32x4_t &block0,
838 const word32 *subkeys, unsigned int rounds)
839{
840 uint32x4_t temp[4];
841 temp[0] = UnpackNEON<0>(block0);
842 temp[1] = UnpackNEON<1>(block0);
843 temp[2] = UnpackNEON<2>(block0);
844 temp[3] = UnpackNEON<3>(block0);
845
846 LEA_Decryption(temp, subkeys, rounds);
847
848 block0 = RepackNEON<0>(temp[0], temp[1], temp[2], temp[3]);
849}
850
851inline void LEA_Enc_4_Blocks(uint32x4_t &block0, uint32x4_t &block1,
852 uint32x4_t &block2, uint32x4_t &block3, const word32 *subkeys, unsigned int rounds)
853{
854 uint32x4_t temp[4];
855 temp[0] = UnpackNEON<0>(block0, block1, block2, block3);
856 temp[1] = UnpackNEON<1>(block0, block1, block2, block3);
857 temp[2] = UnpackNEON<2>(block0, block1, block2, block3);
858 temp[3] = UnpackNEON<3>(block0, block1, block2, block3);
859
860 LEA_Encryption(temp, subkeys, rounds);
861
862 block0 = RepackNEON<0>(temp[0], temp[1], temp[2], temp[3]);
863 block1 = RepackNEON<1>(temp[0], temp[1], temp[2], temp[3]);
864 block2 = RepackNEON<2>(temp[0], temp[1], temp[2], temp[3]);
865 block3 = RepackNEON<3>(temp[0], temp[1], temp[2], temp[3]);
866}
867
868inline void LEA_Dec_4_Blocks(uint32x4_t &block0, uint32x4_t &block1,
869 uint32x4_t &block2, uint32x4_t &block3, const word32 *subkeys, unsigned int rounds)
870{
871 uint32x4_t temp[4];
872 temp[0] = UnpackNEON<0>(block0, block1, block2, block3);
873 temp[1] = UnpackNEON<1>(block0, block1, block2, block3);
874 temp[2] = UnpackNEON<2>(block0, block1, block2, block3);
875 temp[3] = UnpackNEON<3>(block0, block1, block2, block3);
876
877 LEA_Decryption(temp, subkeys, rounds);
878
879 block0 = RepackNEON<0>(temp[0], temp[1], temp[2], temp[3]);
880 block1 = RepackNEON<1>(temp[0], temp[1], temp[2], temp[3]);
881 block2 = RepackNEON<2>(temp[0], temp[1], temp[2], temp[3]);
882 block3 = RepackNEON<3>(temp[0], temp[1], temp[2], temp[3]);
883}
884
885#endif // CRYPTOPP_ARM_NEON_AVAILABLE
886
887// *************************** IA-32 ***************************//
888
889#if (CRYPTOPP_SSSE3_AVAILABLE)
890
891inline void LEA_Enc_Block(__m128i &block0,
892 const word32 *subkeys, unsigned int rounds)
893{
894 __m128i temp[4];
895 temp[0] = UnpackXMM<0>(block0);
896 temp[1] = UnpackXMM<1>(block0);
897 temp[2] = UnpackXMM<2>(block0);
898 temp[3] = UnpackXMM<3>(block0);
899
900 LEA_Encryption(temp, subkeys, rounds);
901
902 block0 = RepackXMM<0>(temp[0], temp[1], temp[2], temp[3]);
903}
904
905inline void LEA_Dec_Block(__m128i &block0,
906 const word32 *subkeys, unsigned int rounds)
907{
908 __m128i temp[4];
909 temp[0] = UnpackXMM<0>(block0);
910 temp[1] = UnpackXMM<1>(block0);
911 temp[2] = UnpackXMM<2>(block0);
912 temp[3] = UnpackXMM<3>(block0);
913
914 LEA_Decryption(temp, subkeys, rounds);
915
916 block0 = RepackXMM<0>(temp[0], temp[1], temp[2], temp[3]);
917}
918
919inline void LEA_Enc_4_Blocks(__m128i &block0, __m128i &block1,
920 __m128i &block2, __m128i &block3, const word32 *subkeys, unsigned int rounds)
921{
922 __m128i temp[4];
923 temp[0] = UnpackXMM<0>(block0, block1, block2, block3);
924 temp[1] = UnpackXMM<1>(block0, block1, block2, block3);
925 temp[2] = UnpackXMM<2>(block0, block1, block2, block3);
926 temp[3] = UnpackXMM<3>(block0, block1, block2, block3);
927
928 LEA_Encryption(temp, subkeys, rounds);
929
930 block0 = RepackXMM<0>(temp[0], temp[1], temp[2], temp[3]);
931 block1 = RepackXMM<1>(temp[0], temp[1], temp[2], temp[3]);
932 block2 = RepackXMM<2>(temp[0], temp[1], temp[2], temp[3]);
933 block3 = RepackXMM<3>(temp[0], temp[1], temp[2], temp[3]);
934}
935
936inline void LEA_Dec_4_Blocks(__m128i &block0, __m128i &block1,
937 __m128i &block2, __m128i &block3, const word32 *subkeys, unsigned int rounds)
938{
939 __m128i temp[4];
940 temp[0] = UnpackXMM<0>(block0, block1, block2, block3);
941 temp[1] = UnpackXMM<1>(block0, block1, block2, block3);
942 temp[2] = UnpackXMM<2>(block0, block1, block2, block3);
943 temp[3] = UnpackXMM<3>(block0, block1, block2, block3);
944
945 LEA_Decryption(temp, subkeys, rounds);
946
947 block0 = RepackXMM<0>(temp[0], temp[1], temp[2], temp[3]);
948 block1 = RepackXMM<1>(temp[0], temp[1], temp[2], temp[3]);
949 block2 = RepackXMM<2>(temp[0], temp[1], temp[2], temp[3]);
950 block3 = RepackXMM<3>(temp[0], temp[1], temp[2], temp[3]);
951}
952
953#endif // CRYPTOPP_SSSE3_AVAILABLE
954
955// *************************** Power8 ***************************//
956
957#if (CRYPTOPP_POWER8_AVAILABLE)
958
959inline void LEA_Enc_Block(uint32x4_p &block0,
960 const word32 *subkeys, unsigned int rounds)
961{
962 uint32x4_p temp[4];
963 temp[0] = UnpackSIMD<0>(block0);
964 temp[1] = UnpackSIMD<1>(block0);
965 temp[2] = UnpackSIMD<2>(block0);
966 temp[3] = UnpackSIMD<3>(block0);
967
968 LEA_Encryption(temp, subkeys, rounds);
969
970 block0 = RepackSIMD<0>(temp[0], temp[1], temp[2], temp[3]);
971}
972
973inline void LEA_Dec_Block(uint32x4_p &block0,
974 const word32 *subkeys, unsigned int rounds)
975{
976 uint32x4_p temp[4];
977 temp[0] = UnpackSIMD<0>(block0);
978 temp[1] = UnpackSIMD<1>(block0);
979 temp[2] = UnpackSIMD<2>(block0);
980 temp[3] = UnpackSIMD<3>(block0);
981
982 LEA_Decryption(temp, subkeys, rounds);
983
984 block0 = RepackSIMD<0>(temp[0], temp[1], temp[2], temp[3]);
985}
986
987inline void LEA_Enc_4_Blocks(uint32x4_p &block0, uint32x4_p &block1,
988 uint32x4_p &block2, uint32x4_p &block3, const word32 *subkeys, unsigned int rounds)
989{
990 uint32x4_p temp[4];
991 temp[0] = UnpackSIMD<0>(block0, block1, block2, block3);
992 temp[1] = UnpackSIMD<1>(block0, block1, block2, block3);
993 temp[2] = UnpackSIMD<2>(block0, block1, block2, block3);
994 temp[3] = UnpackSIMD<3>(block0, block1, block2, block3);
995
996 LEA_Encryption(temp, subkeys, rounds);
997
998 block0 = RepackSIMD<0>(temp[0], temp[1], temp[2], temp[3]);
999 block1 = RepackSIMD<1>(temp[0], temp[1], temp[2], temp[3]);
1000 block2 = RepackSIMD<2>(temp[0], temp[1], temp[2], temp[3]);
1001 block3 = RepackSIMD<3>(temp[0], temp[1], temp[2], temp[3]);
1002}
1003
1004inline void LEA_Dec_4_Blocks(uint32x4_p &block0, uint32x4_p &block1,
1005 uint32x4_p &block2, uint32x4_p &block3, const word32 *subkeys, unsigned int rounds)
1006{
1007 uint32x4_p temp[4];
1008 temp[0] = UnpackSIMD<0>(block0, block1, block2, block3);
1009 temp[1] = UnpackSIMD<1>(block0, block1, block2, block3);
1010 temp[2] = UnpackSIMD<2>(block0, block1, block2, block3);
1011 temp[3] = UnpackSIMD<3>(block0, block1, block2, block3);
1012
1013 LEA_Decryption(temp, subkeys, rounds);
1014
1015 block0 = RepackSIMD<0>(temp[0], temp[1], temp[2], temp[3]);
1016 block1 = RepackSIMD<1>(temp[0], temp[1], temp[2], temp[3]);
1017 block2 = RepackSIMD<2>(temp[0], temp[1], temp[2], temp[3]);
1018 block3 = RepackSIMD<3>(temp[0], temp[1], temp[2], temp[3]);
1019}
1020
1021#endif // CRYPTOPP_POWER8_AVAILABLE
1022
1023ANONYMOUS_NAMESPACE_END
1024
1025// *************************** SIMD Templates ***************************//
1026
1027NAMESPACE_BEGIN(CryptoPP)
1028
1029#if defined(CRYPTOPP_SSSE3_AVAILABLE)
1030size_t LEA_Enc_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
1031 const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1032{
1033 return AdvancedProcessBlocks128_4x1_SSE(LEA_Enc_Block, LEA_Enc_4_Blocks,
1034 subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1035}
1036
1037size_t LEA_Dec_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
1038 const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1039{
1040 return AdvancedProcessBlocks128_4x1_SSE(LEA_Dec_Block, LEA_Dec_4_Blocks,
1041 subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1042}
1043#endif // CRYPTOPP_SSSE3_AVAILABLE
1044
1045#if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
1046size_t LEA_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
1047 const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1048{
1049 return AdvancedProcessBlocks128_4x1_NEON(LEA_Enc_Block, LEA_Enc_4_Blocks,
1050 subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1051}
1052
1053size_t LEA_Dec_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
1054 const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1055{
1056 return AdvancedProcessBlocks128_4x1_NEON(LEA_Dec_Block, LEA_Dec_4_Blocks,
1057 subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1058}
1059#endif // CRYPTOPP_ARM_NEON_AVAILABLE
1060
1061#if defined(CRYPTOPP_POWER8_AVAILABLE)
1062size_t LEA_Enc_AdvancedProcessBlocks_POWER8(const word32* subKeys, size_t rounds,
1063 const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1064{
1065 return AdvancedProcessBlocks128_4x1_ALTIVEC(LEA_Enc_Block, LEA_Enc_4_Blocks,
1066 subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1067}
1068
1069size_t LEA_Dec_AdvancedProcessBlocks_POWER8(const word32* subKeys, size_t rounds,
1070 const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1071{
1072 return AdvancedProcessBlocks128_4x1_ALTIVEC(LEA_Dec_Block, LEA_Dec_4_Blocks,
1073 subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1074}
1075#endif // CRYPTOPP_POWER8_AVAILABLE
1076
1077NAMESPACE_END
Template for AdvancedProcessBlocks and SIMD processing.
size_t AdvancedProcessBlocks128_4x1_NEON(F1 func1, F4 func4, const W *subKeys, size_t rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
AdvancedProcessBlocks for 1 and 4 blocks.
Definition: adv_simd.h:254
size_t AdvancedProcessBlocks128_4x1_ALTIVEC(F1 func1, F4 func4, const W *subKeys, size_t rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
AdvancedProcessBlocks for 1 and 4 blocks.
Definition: adv_simd.h:971
size_t AdvancedProcessBlocks128_4x1_SSE(F1 func1, F4 func4, const W *subKeys, size_t rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
AdvancedProcessBlocks for 1 and 4 blocks.
Definition: adv_simd.h:830
Library configuration file.
unsigned int word32
32-bit unsigned datatype
Definition: config_int.h:62
Classes for the LEA block cipher.
Utility functions for the Crypto++ library.
Crypto++ library namespace.
Precompiled header file.
Support functions for PowerPC and vector operations.
__vector unsigned int uint32x4_p
Vector of 32-bit elements.
Definition: ppc_simd.h:202
T1 VecPermute(const T1 vec, const T2 mask)
Permutes a vector.
Definition: ppc_simd.h:1478
__vector unsigned char uint8x16_p
Vector of 8-bit elements.
Definition: ppc_simd.h:192
T1 VecXor(const T1 vec1, const T2 vec2)
XOR two vectors.
Definition: ppc_simd.h:1414
__vector unsigned long long uint64x2_p
Vector of 64-bit elements.
Definition: ppc_simd.h:212
T1 VecSub(const T1 vec1, const T2 vec2)
Subtract two vectors.
Definition: ppc_simd.h:1456
T1 VecAdd(const T1 vec1, const T2 vec2)
Add two vectors.
Definition: ppc_simd.h:1438
#define CRYPTOPP_ASSERT(exp)
Debugging and diagnostic assertion.
Definition: trap.h:68