Crypto++ 8.5
Free C++ class library of cryptographic schemes
arm_simd.h
Go to the documentation of this file.
1// arm_simd.h - written and placed in public domain by Jeffrey Walton
2
3/// \file arm_simd.h
4/// \brief Support functions for ARM and vector operations
5
6#ifndef CRYPTOPP_ARM_SIMD_H
7#define CRYPTOPP_ARM_SIMD_H
8
9#include "config.h"
10
11#if (CRYPTOPP_ARM_NEON_HEADER)
12# include <arm_neon.h>
13#endif
14
15#if (CRYPTOPP_ARM_ACLE_HEADER)
16# include <stdint.h>
17# include <arm_acle.h>
18#endif
19
20#if (CRYPTOPP_ARM_PMULL_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
21
22/// \brief Polynomial multiplication
23/// \param a the first term
24/// \param b the second term
25/// \return vector product
26/// \details PMULL_00() performs polynomial multiplication and presents
27/// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x00)</tt>.
28/// The <tt>0x00</tt> indicates the low 64-bits of <tt>a</tt> and <tt>b</tt>
29/// are multiplied.
30/// \note An Intel XMM register is composed of 128-bits. The leftmost bit
31/// is MSB and numbered 127, while the rightmost bit is LSB and
32/// numbered 0.
33/// \since Crypto++ 8.0
34inline uint64x2_t PMULL_00(const uint64x2_t a, const uint64x2_t b)
35{
36#if defined(_MSC_VER)
37 const __n64 x = { vgetq_lane_u64(a, 0) };
38 const __n64 y = { vgetq_lane_u64(b, 0) };
39 return vmull_p64(x, y);
40#elif defined(__GNUC__)
41 uint64x2_t r;
42 __asm __volatile("pmull %0.1q, %1.1d, %2.1d \n\t"
43 :"=w" (r) : "w" (a), "w" (b) );
44 return r;
45#else
46 return (uint64x2_t)(vmull_p64(
47 vgetq_lane_u64(vreinterpretq_u64_u8(a),0),
48 vgetq_lane_u64(vreinterpretq_u64_u8(b),0)));
49#endif
50}
51
52/// \brief Polynomial multiplication
53/// \param a the first term
54/// \param b the second term
55/// \return vector product
56/// \details PMULL_01 performs() polynomial multiplication and presents
57/// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x01)</tt>.
58/// The <tt>0x01</tt> indicates the low 64-bits of <tt>a</tt> and high
59/// 64-bits of <tt>b</tt> are multiplied.
60/// \note An Intel XMM register is composed of 128-bits. The leftmost bit
61/// is MSB and numbered 127, while the rightmost bit is LSB and
62/// numbered 0.
63/// \since Crypto++ 8.0
64inline uint64x2_t PMULL_01(const uint64x2_t a, const uint64x2_t b)
65{
66#if defined(_MSC_VER)
67 const __n64 x = { vgetq_lane_u64(a, 0) };
68 const __n64 y = { vgetq_lane_u64(b, 1) };
69 return vmull_p64(x, y);
70#elif defined(__GNUC__)
71 uint64x2_t r;
72 __asm __volatile("pmull %0.1q, %1.1d, %2.1d \n\t"
73 :"=w" (r) : "w" (a), "w" (vget_high_u64(b)) );
74 return r;
75#else
76 return (uint64x2_t)(vmull_p64(
77 vgetq_lane_u64(vreinterpretq_u64_u8(a),0),
78 vgetq_lane_u64(vreinterpretq_u64_u8(b),1)));
79#endif
80}
81
82/// \brief Polynomial multiplication
83/// \param a the first term
84/// \param b the second term
85/// \return vector product
86/// \details PMULL_10() performs polynomial multiplication and presents
87/// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x10)</tt>.
88/// The <tt>0x10</tt> indicates the high 64-bits of <tt>a</tt> and low
89/// 64-bits of <tt>b</tt> are multiplied.
90/// \note An Intel XMM register is composed of 128-bits. The leftmost bit
91/// is MSB and numbered 127, while the rightmost bit is LSB and
92/// numbered 0.
93/// \since Crypto++ 8.0
94inline uint64x2_t PMULL_10(const uint64x2_t a, const uint64x2_t b)
95{
96#if defined(_MSC_VER)
97 const __n64 x = { vgetq_lane_u64(a, 1) };
98 const __n64 y = { vgetq_lane_u64(b, 0) };
99 return vmull_p64(x, y);
100#elif defined(__GNUC__)
101 uint64x2_t r;
102 __asm __volatile("pmull %0.1q, %1.1d, %2.1d \n\t"
103 :"=w" (r) : "w" (vget_high_u64(a)), "w" (b) );
104 return r;
105#else
106 return (uint64x2_t)(vmull_p64(
107 vgetq_lane_u64(vreinterpretq_u64_u8(a),1),
108 vgetq_lane_u64(vreinterpretq_u64_u8(b),0)));
109#endif
110}
111
112/// \brief Polynomial multiplication
113/// \param a the first term
114/// \param b the second term
115/// \return vector product
116/// \details PMULL_11() performs polynomial multiplication and presents
117/// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x11)</tt>.
118/// The <tt>0x11</tt> indicates the high 64-bits of <tt>a</tt> and <tt>b</tt>
119/// are multiplied.
120/// \note An Intel XMM register is composed of 128-bits. The leftmost bit
121/// is MSB and numbered 127, while the rightmost bit is LSB and
122/// numbered 0.
123/// \since Crypto++ 8.0
124inline uint64x2_t PMULL_11(const uint64x2_t a, const uint64x2_t b)
125{
126#if defined(_MSC_VER)
127 const __n64 x = { vgetq_lane_u64(a, 1) };
128 const __n64 y = { vgetq_lane_u64(b, 1) };
129 return vmull_p64(x, y);
130#elif defined(__GNUC__)
131 uint64x2_t r;
132 __asm __volatile("pmull2 %0.1q, %1.2d, %2.2d \n\t"
133 :"=w" (r) : "w" (a), "w" (b) );
134 return r;
135#else
136 return (uint64x2_t)(vmull_p64(
137 vgetq_lane_u64(vreinterpretq_u64_u8(a),1),
138 vgetq_lane_u64(vreinterpretq_u64_u8(b),1)));
139#endif
140}
141
142/// \brief Vector extraction
143/// \param a the first term
144/// \param b the second term
145/// \param c the byte count
146/// \return vector
147/// \details VEXT_U8() extracts the first <tt>c</tt> bytes of vector
148/// <tt>a</tt> and the remaining bytes in <tt>b</tt>.
149/// \since Crypto++ 8.0
150inline uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b, unsigned int c)
151{
152#if defined(_MSC_VER)
153 return (uint64x2_t)vextq_u8(
154 vreinterpretq_u8_u64(a), vreinterpretq_u8_u64(b), c);
155#else
156 uint64x2_t r;
157 __asm __volatile("ext %0.16b, %1.16b, %2.16b, %3 \n\t"
158 :"=w" (r) : "w" (a), "w" (b), "I" (c) );
159 return r;
160#endif
161}
162
163/// \brief Vector extraction
164/// \tparam C the byte count
165/// \param a the first term
166/// \param b the second term
167/// \return vector
168/// \details VEXT_U8() extracts the first <tt>C</tt> bytes of vector
169/// <tt>a</tt> and the remaining bytes in <tt>b</tt>.
170/// \since Crypto++ 8.0
171template <unsigned int C>
172inline uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b)
173{
174 // https://github.com/weidai11/cryptopp/issues/366
175#if defined(_MSC_VER)
176 return (uint64x2_t)vextq_u8(
177 vreinterpretq_u8_u64(a), vreinterpretq_u8_u64(b), C);
178#else
179 uint64x2_t r;
180 __asm __volatile("ext %0.16b, %1.16b, %2.16b, %3 \n\t"
181 :"=w" (r) : "w" (a), "w" (b), "I" (C) );
182 return r;
183#endif
184}
185
186#endif // CRYPTOPP_ARM_PMULL_AVAILABLE
187
188#endif // CRYPTOPP_ARM_SIMD_H
uint64x2_t PMULL_00(const uint64x2_t a, const uint64x2_t b)
Polynomial multiplication.
Definition: arm_simd.h:34
uint64x2_t PMULL_11(const uint64x2_t a, const uint64x2_t b)
Polynomial multiplication.
Definition: arm_simd.h:124
uint64x2_t PMULL_01(const uint64x2_t a, const uint64x2_t b)
Polynomial multiplication.
Definition: arm_simd.h:64
uint64x2_t PMULL_10(const uint64x2_t a, const uint64x2_t b)
Polynomial multiplication.
Definition: arm_simd.h:94
uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b, unsigned int c)
Vector extraction.
Definition: arm_simd.h:150
Library configuration file.