Gitly


1 /*
2  *  Armv8-A Cryptographic Extension support functions for Aarch64
3  *
4  *  Copyright The Mbed TLS Contributors
5  *  SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
6  */
7 
8 #if defined(__clang__) &&  (__clang_major__ >= 4)
9 
10 /* Ideally, we would simply use MBEDTLS_ARCH_IS_ARMV8_A in the following #if,
11  * but that is defined by build_info.h, and we need this block to happen first. */
12 #if defined(__ARM_ARCH)
13 #if __ARM_ARCH >= 8
14 #define MBEDTLS_AESCE_ARCH_IS_ARMV8_A
15 #endif
16 #endif
17 
18 #if defined(MBEDTLS_AESCE_ARCH_IS_ARMV8_A) && !defined(__ARM_FEATURE_CRYPTO)
19 /* The intrinsic declarations are guarded by predefined ACLE macros in clang:
20  * these are normally only enabled by the -march option on the command line.
21  * By defining the macros ourselves we gain access to those declarations without
22  * requiring -march on the command line.
23  *
24  * `arm_neon.h` is included by common.h, so we put these defines
25  * at the top of this file, before any includes. This is necessary with
26  * Clang <=15.x. With Clang 16.0 and above, these macro definitions are
27  * no longer required, but they're harmless. See
28  * https://reviews.llvm.org/D131064
29  */
30 #define __ARM_FEATURE_CRYPTO 1
31 /* See: https://arm-software.github.io/acle/main/acle.html#cryptographic-extensions
32  *
33  * `__ARM_FEATURE_CRYPTO` is deprecated, but we need to continue to specify it
34  * for older compilers.
35  */
36 #define __ARM_FEATURE_AES    1
37 #define MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG
38 #endif
39 
40 #endif /* defined(__clang__) &&  (__clang_major__ >= 4) */
41 
42 #include "common.h"
43 
44 #if defined(MBEDTLS_AESCE_C)
45 
46 #include <string.h>
47 
48 #include "aesce.h"
49 
50 #if defined(MBEDTLS_AESCE_HAVE_CODE)
51 
52 /* Compiler version checks. */
53 #if defined(__clang__)
54 #   if defined(MBEDTLS_ARCH_IS_ARM32) && (__clang_major__ < 11)
55 #       error "Minimum version of Clang for MBEDTLS_AESCE_C on 32-bit Arm or Thumb is 11.0."
56 #   elif defined(MBEDTLS_ARCH_IS_ARM64) && (__clang_major__ < 4)
57 #       error "Minimum version of Clang for MBEDTLS_AESCE_C on aarch64 is 4.0."
58 #   endif
59 #elif defined(__GNUC__)
60 #   if __GNUC__ < 6
61 #       error "Minimum version of GCC for MBEDTLS_AESCE_C is 6.0."
62 #   endif
63 #elif defined(_MSC_VER)
64 /* TODO: We haven't verified MSVC versions 1920 to 1928. If someone verifies them,
65  *       please update this check and the documentation of `MBEDTLS_AESCE_C` in
66  *       `mbedtls_config.h`. */
67 #   if _MSC_VER < 1929
68 #       error "Minimum version of MSVC for MBEDTLS_AESCE_C is 2019 version 16.11.2."
69 #   endif
70 #elif defined(__ARMCC_VERSION)
71 #    if defined(MBEDTLS_ARCH_IS_ARM32) && (__ARMCC_VERSION < 6200002)
72 /* TODO: We haven't verified armclang for 32-bit Arm/Thumb prior to 6.20.
73  * If someone verifies an earlier version, please update this check and the
74  * documentation of `MBEDTLS_AESCE_C` in `mbedtls_config.h`. */
75 #         error "Minimum version of armclang for MBEDTLS_AESCE_C on 32-bit Arm is 6.20."
76 #    elif defined(MBEDTLS_ARCH_IS_ARM64) && (__ARMCC_VERSION < 6060000)
77 #         error "Minimum version of armclang for MBEDTLS_AESCE_C on aarch64 is 6.6."
78 #    endif
79 #endif
80 
81 #if !(defined(__ARM_FEATURE_CRYPTO) || defined(__ARM_FEATURE_AES)) || \
82     defined(MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG)
83 #   if defined(__ARMCOMPILER_VERSION)
84 #       if __ARMCOMPILER_VERSION <= 6090000
85 #           error "Must use minimum -march=armv8-a+crypto for MBEDTLS_AESCE_C"
86 #       else
87 #           pragma clang attribute push (__attribute__((target("aes"))), apply_to=function)
88 #           define MBEDTLS_POP_TARGET_PRAGMA
89 #       endif
90 #   elif defined(__clang__)
91 #       if __clang_major__ < 7
92 #           pragma clang attribute push (__attribute__((target("crypto"))), apply_to=function)
93 #       else
94 #           pragma clang attribute push (__attribute__((target("aes"))), apply_to=function)
95 #       endif
96 #       define MBEDTLS_POP_TARGET_PRAGMA
97 #   elif defined(__GNUC__)
98 #       pragma GCC push_options
99 #       pragma GCC target ("+crypto")
100 #       define MBEDTLS_POP_TARGET_PRAGMA
101 #   elif defined(_MSC_VER)
102 #       error "Required feature(__ARM_FEATURE_AES) is not enabled."
103 #   endif
104 #endif /* !(__ARM_FEATURE_CRYPTO || __ARM_FEATURE_AES) ||
105           MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG */
106 
107 #if defined(__linux__) && !defined(MBEDTLS_AES_USE_HARDWARE_ONLY)
108 
109 #include <sys/auxv.h>
110 #if !defined(HWCAP_NEON)
111 #define HWCAP_NEON  (1 << 12)
112 #endif
113 #if !defined(HWCAP2_AES)
114 #define HWCAP2_AES  (1 << 0)
115 #endif
116 #if !defined(HWCAP_AES)
117 #define HWCAP_AES   (1 << 3)
118 #endif
119 #if !defined(HWCAP_ASIMD)
120 #define HWCAP_ASIMD (1 << 1)
121 #endif
122 
123 signed char mbedtls_aesce_has_support_result = -1;
124 
125 #if !defined(MBEDTLS_AES_USE_HARDWARE_ONLY)
126 /*
127  * AES instruction support detection routine
128  */
129 int mbedtls_aesce_has_support_impl(void)
130 {
131     /* To avoid many calls to getauxval, cache the result. This is
132      * thread-safe, because we store the result in a char so cannot
133      * be vulnerable to non-atomic updates.
134      * It is possible that we could end up setting result more than
135      * once, but that is harmless.
136      */
137     if (mbedtls_aesce_has_support_result == -1) {
138 #if defined(MBEDTLS_ARCH_IS_ARM32)
139         unsigned long auxval  = getauxval(AT_HWCAP);
140         unsigned long auxval2 = getauxval(AT_HWCAP2);
141         if (((auxval  & HWCAP_NEON) == HWCAP_NEON) &&
142             ((auxval2 & HWCAP2_AES) == HWCAP2_AES)) {
143             mbedtls_aesce_has_support_result = 1;
144         } else {
145             mbedtls_aesce_has_support_result = 0;
146         }
147 #else
148         unsigned long auxval = getauxval(AT_HWCAP);
149         if ((auxval & (HWCAP_ASIMD | HWCAP_AES)) ==
150             (HWCAP_ASIMD | HWCAP_AES)) {
151             mbedtls_aesce_has_support_result = 1;
152         } else {
153             mbedtls_aesce_has_support_result = 0;
154         }
155 #endif
156     }
157     return mbedtls_aesce_has_support_result;
158 }
159 #endif
160 
161 #endif /* defined(__linux__) && !defined(MBEDTLS_AES_USE_HARDWARE_ONLY) */
162 
163 /* Single round of AESCE encryption */
164 #define AESCE_ENCRYPT_ROUND                   \
165     block = vaeseq_u8(block, vld1q_u8(keys)); \
166     block = vaesmcq_u8(block);                \
167     keys += 16
168 /* Two rounds of AESCE encryption */
169 #define AESCE_ENCRYPT_ROUND_X2        AESCE_ENCRYPT_ROUND; AESCE_ENCRYPT_ROUND
170 
171 MBEDTLS_OPTIMIZE_FOR_PERFORMANCE
172 static uint8x16_t aesce_encrypt_block(uint8x16_t block,
173                                       unsigned char *keys,
174                                       int rounds)
175 {
176     /* 10, 12 or 14 rounds. Unroll loop. */
177     if (rounds == 10) {
178         goto rounds_10;
179     }
180     if (rounds == 12) {
181         goto rounds_12;
182     }
183     AESCE_ENCRYPT_ROUND_X2;
184 rounds_12:
185     AESCE_ENCRYPT_ROUND_X2;
186 rounds_10:
187     AESCE_ENCRYPT_ROUND_X2;
188     AESCE_ENCRYPT_ROUND_X2;
189     AESCE_ENCRYPT_ROUND_X2;
190     AESCE_ENCRYPT_ROUND_X2;
191     AESCE_ENCRYPT_ROUND;
192 
193     /* AES AddRoundKey for the previous round.
194      * SubBytes, ShiftRows for the final round.  */
195     block = vaeseq_u8(block, vld1q_u8(keys));
196     keys += 16;
197 
198     /* Final round: no MixColumns */
199 
200     /* Final AddRoundKey */
201     block = veorq_u8(block, vld1q_u8(keys));
202 
203     return block;
204 }
205 
206 /* Single round of AESCE decryption
207  *
208  * AES AddRoundKey, SubBytes, ShiftRows
209  *
210  *      block = vaesdq_u8(block, vld1q_u8(keys));
211  *
212  * AES inverse MixColumns for the next round.
213  *
214  * This means that we switch the order of the inverse AddRoundKey and
215  * inverse MixColumns operations. We have to do this as AddRoundKey is
216  * done in an atomic instruction together with the inverses of SubBytes
217  * and ShiftRows.
218  *
219  * It works because MixColumns is a linear operation over GF(2^8) and
220  * AddRoundKey is an exclusive or, which is equivalent to addition over
221  * GF(2^8). (The inverse of MixColumns needs to be applied to the
222  * affected round keys separately which has been done when the
223  * decryption round keys were calculated.)
224  *
225  *      block = vaesimcq_u8(block);
226  */
227 #define AESCE_DECRYPT_ROUND                   \
228     block = vaesdq_u8(block, vld1q_u8(keys)); \
229     block = vaesimcq_u8(block);               \
230     keys += 16
231 /* Two rounds of AESCE decryption */
232 #define AESCE_DECRYPT_ROUND_X2        AESCE_DECRYPT_ROUND; AESCE_DECRYPT_ROUND
233 
234 #if !defined(MBEDTLS_BLOCK_CIPHER_NO_DECRYPT)
235 static uint8x16_t aesce_decrypt_block(uint8x16_t block,
236                                       unsigned char *keys,
237                                       int rounds)
238 {
239     /* 10, 12 or 14 rounds. Unroll loop. */
240     if (rounds == 10) {
241         goto rounds_10;
242     }
243     if (rounds == 12) {
244         goto rounds_12;
245     }
246     AESCE_DECRYPT_ROUND_X2;
247 rounds_12:
248     AESCE_DECRYPT_ROUND_X2;
249 rounds_10:
250     AESCE_DECRYPT_ROUND_X2;
251     AESCE_DECRYPT_ROUND_X2;
252     AESCE_DECRYPT_ROUND_X2;
253     AESCE_DECRYPT_ROUND_X2;
254     AESCE_DECRYPT_ROUND;
255 
256     /* The inverses of AES AddRoundKey, SubBytes, ShiftRows finishing up the
257      * last full round. */
258     block = vaesdq_u8(block, vld1q_u8(keys));
259     keys += 16;
260 
261     /* Inverse AddRoundKey for inverting the initial round key addition. */
262     block = veorq_u8(block, vld1q_u8(keys));
263 
264     return block;
265 }
266 #endif
267 
268 /*
269  * AES-ECB block en(de)cryption
270  */
271 int mbedtls_aesce_crypt_ecb(mbedtls_aes_context *ctx,
272                             int mode,
273                             const unsigned char input[16],
274                             unsigned char output[16])
275 {
276     uint8x16_t block = vld1q_u8(&input[0]);
277     unsigned char *keys = (unsigned char *) (ctx->buf + ctx->rk_offset);
278 
279 #if !defined(MBEDTLS_BLOCK_CIPHER_NO_DECRYPT)
280     if (mode == MBEDTLS_AES_DECRYPT) {
281         block = aesce_decrypt_block(block, keys, ctx->nr);
282     } else
283 #else
284     (void) mode;
285 #endif
286     {
287         block = aesce_encrypt_block(block, keys, ctx->nr);
288     }
289     vst1q_u8(&output[0], block);
290 
291     return 0;
292 }
293 
294 /*
295  * Compute decryption round keys from encryption round keys
296  */
297 #if !defined(MBEDTLS_BLOCK_CIPHER_NO_DECRYPT)
298 void mbedtls_aesce_inverse_key(unsigned char *invkey,
299                                const unsigned char *fwdkey,
300                                int nr)
301 {
302     int i, j;
303     j = nr;
304     vst1q_u8(invkey, vld1q_u8(fwdkey + j * 16));
305     for (i = 1, j--; j > 0; i++, j--) {
306         vst1q_u8(invkey + i * 16,
307                  vaesimcq_u8(vld1q_u8(fwdkey + j * 16)));
308     }
309     vst1q_u8(invkey + i * 16, vld1q_u8(fwdkey + j * 16));
310 
311 }
312 #endif
313 
/* Rotate a 32-bit word right by 8 bits: the lowest byte wraps round to
 * become the highest byte. */
static inline uint32_t aes_rot_word(uint32_t word)
{
    const uint32_t upper_three = word >> 8;
    const uint32_t wrapped_low = word << 24;
    return wrapped_low | upper_three;
}
318 
319 static inline uint32_t aes_sub_word(uint32_t in)
320 {
321     uint8x16_t v = vreinterpretq_u8_u32(vdupq_n_u32(in));
322     uint8x16_t zero = vdupq_n_u8(0);
323 
324     /* vaeseq_u8 does both SubBytes and ShiftRows. Taking the first row yields
325      * the correct result as ShiftRows doesn't change the first row. */
326     v = vaeseq_u8(zero, v);
327     return vgetq_lane_u32(vreinterpretq_u32_u8(v), 0);
328 }
329 
/*
 * Key expansion: derive the full encryption key schedule from a raw key.
 *
 * rk              receives Nb*(Nr+1) 32-bit words of round keys
 * key             raw key, key_bit_length / 8 bytes
 * key_bit_length  128, 192 or 256 (checked by the caller-facing wrapper)
 *
 * See https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.197.pdf
 *   - Section 5, Nr = Nk + 6
 *   - Section 5.2, the length of round keys is Nb*(Nr+1)
 */
static void aesce_setkey_enc(unsigned char *rk,
                             const unsigned char *key,
                             const size_t key_bit_length)
{
    /* Round constants, one per expansion step. */
    static uint8_t const rcon[] = { 0x01, 0x02, 0x04, 0x08, 0x10,
                                    0x20, 0x40, 0x80, 0x1b, 0x36 };

    const size_t nk = key_bit_length / 32;          /* Nk: key length in words */
    const size_t nb = 4;                            /* Nb: words per round key */
    const size_t nr = nk + 6;                       /* Nr: number of rounds */
    const size_t total_words = nb * (nr + 1);       /* Nb*(Nr+1) */
    uint32_t *const words = (uint32_t *) rk;
    size_t step = 0;

    /* The first Nk words of the schedule are the key itself. */
    memcpy(rk, key, nk * 4);

    /* Each step derives the next Nk words from the previous Nk words. */
    for (size_t base = 0; base + nk < total_words; base += nk, step++) {
        const uint32_t *prev = words + base;
        uint32_t *next = words + base + nk;

        next[0] = aes_rot_word(aes_sub_word(prev[nk - 1])) ^ rcon[step] ^ prev[0];
        next[1] = next[0] ^ prev[1];
        next[2] = next[1] ^ prev[2];
        next[3] = next[2] ^ prev[3];

#if !defined(MBEDTLS_AES_ONLY_128_BIT_KEY_LENGTH)
        /* On the last step of the longer key sizes only the first four
         * words still fit in the schedule; do not write past its end. */
        if (base + 2 * nk > total_words) {
            continue;
        }

        if (key_bit_length == 192) {
            next[4] = next[3] ^ prev[4];
            next[5] = next[4] ^ prev[5];
        } else if (key_bit_length == 256) {
            /* 256-bit keys substitute once more half-way through. */
            next[4] = aes_sub_word(next[3]) ^ prev[4];
            next[5] = next[4] ^ prev[5];
            next[6] = next[5] ^ prev[6];
            next[7] = next[6] ^ prev[7];
        }
#endif /* !MBEDTLS_AES_ONLY_128_BIT_KEY_LENGTH */
    }
}
386 
387 /*
388  * Key expansion, wrapper
389  */
390 int mbedtls_aesce_setkey_enc(unsigned char *rk,
391                              const unsigned char *key,
392                              size_t bits)
393 {
394     switch (bits) {
395         case 128:
396         case 192:
397         case 256:
398             aesce_setkey_enc(rk, key, bits);
399             break;
400         default:
401             return MBEDTLS_ERR_AES_INVALID_KEY_LENGTH;
402     }
403 
404     return 0;
405 }
406 
407 #if defined(MBEDTLS_GCM_C)
408 
409 #if defined(MBEDTLS_ARCH_IS_ARM32)
410 
411 #if defined(__clang__)
412 /* On clang for A32/T32, work around some missing intrinsics and types which are listed in
413  * [ACLE](https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#polynomial-1)
414  * These are only required for GCM.
415  */
416 #define vreinterpretq_u64_p64(a) ((uint64x2_t) a)
417 
418 typedef uint8x16_t poly128_t;
419 
420 static inline poly128_t vmull_p64(poly64_t a, poly64_t b)
421 {
422     poly128_t r;
423     asm ("vmull.p64 %[r], %[a], %[b]" : [r] "=w" (r) : [a] "w" (a), [b] "w" (b) :);
424     return r;
425 }
426 
427 /* This is set to cause some more missing intrinsics to be defined below */
428 #define COMMON_MISSING_INTRINSICS
429 
430 static inline poly128_t vmull_high_p64(poly64x2_t a, poly64x2_t b)
431 {
432     return vmull_p64((poly64_t) (vget_high_u64((uint64x2_t) a)),
433                      (poly64_t) (vget_high_u64((uint64x2_t) b)));
434 }
435 
436 #endif /* defined(__clang__) */
437 
438 static inline uint8x16_t vrbitq_u8(uint8x16_t x)
439 {
440     /* There is no vrbitq_u8 instruction in A32/T32, so provide
441      * an equivalent non-Neon implementation. Reverse bit order in each
442      * byte with 4x rbit, rev. */
443     asm ("ldm  %[p], { r2-r5 } \n\t"
444          "rbit r2, r2          \n\t"
445          "rev  r2, r2          \n\t"
446          "rbit r3, r3          \n\t"
447          "rev  r3, r3          \n\t"
448          "rbit r4, r4          \n\t"
449          "rev  r4, r4          \n\t"
450          "rbit r5, r5          \n\t"
451          "rev  r5, r5          \n\t"
452          "stm  %[p], { r2-r5 } \n\t"
453          :
454          /* Output: 16 bytes of memory pointed to by &x */
455          "+m" (*(uint8_t(*)[16]) &x)
456          :
457          [p] "r" (&x)
458          :
459          "r2", "r3", "r4", "r5"
460          );
461     return x;
462 }
463 
464 #endif /* defined(MBEDTLS_ARCH_IS_ARM32) */
465 
466 #if defined(MBEDTLS_COMPILER_IS_GCC) && __GNUC__ == 5
467 /* Some intrinsics are not available for GCC 5.X. */
468 #define COMMON_MISSING_INTRINSICS
469 #endif /* MBEDTLS_COMPILER_IS_GCC && __GNUC__ == 5 */
470 
471 
472 #if defined(COMMON_MISSING_INTRINSICS)
473 
474 /* Missing intrinsics common to both GCC 5, and Clang on 32-bit */
475 
476 #define vreinterpretq_p64_u8(a)  ((poly64x2_t) a)
477 #define vreinterpretq_u8_p128(a) ((uint8x16_t) a)
478 
479 static inline poly64x1_t vget_low_p64(poly64x2_t a)
480 {
481     uint64x1_t r = vget_low_u64(vreinterpretq_u64_p64(a));
482     return (poly64x1_t) r;
483 
484 }
485 
486 #endif /* COMMON_MISSING_INTRINSICS */
487 
488 /* vmull_p64/vmull_high_p64 wrappers.
489  *
490  * Older compilers miss some intrinsic functions for `poly*_t`. We use
491  * uint8x16_t and uint8x16x3_t as input/output parameters.
492  */
493 #if defined(MBEDTLS_COMPILER_IS_GCC)
494 /* GCC reports an incompatible-type error without the cast: GCC treats poly64_t
495  * and poly64x1_t as different types, unlike MSVC and Clang. */
496 #define MBEDTLS_VMULL_P64(a, b) vmull_p64((poly64_t) a, (poly64_t) b)
497 #else
498 /* MSVC reports `error C2440: 'type cast'` with the cast. Clang reports no
499  * error with or without it; poly64_t and poly64x1_t appear to be the same
500  * type there, so no cast is used for Clang either. */
501 #define MBEDTLS_VMULL_P64(a, b) vmull_p64(a, b)
502 #endif /* MBEDTLS_COMPILER_IS_GCC */
503 
504 static inline uint8x16_t pmull_low(uint8x16_t a, uint8x16_t b)
505 {
506 
507     return vreinterpretq_u8_p128(
508         MBEDTLS_VMULL_P64(
509             (poly64_t) vget_low_p64(vreinterpretq_p64_u8(a)),
510             (poly64_t) vget_low_p64(vreinterpretq_p64_u8(b))
511             ));
512 }
513 
514 static inline uint8x16_t pmull_high(uint8x16_t a, uint8x16_t b)
515 {
516     return vreinterpretq_u8_p128(
517         vmull_high_p64(vreinterpretq_p64_u8(a),
518                        vreinterpretq_p64_u8(b)));
519 }
520 
521 /* GHASH does 128b polynomial multiplication on block in GF(2^128) defined by
522  * `x^128 + x^7 + x^2 + x + 1`.
523  *
524  * Arm64 only has 64b->128b polynomial multipliers, we need to do 4 64b
525  * multiplies to generate a 128b.
526  *
527  * `poly_mult_128` executes polynomial multiplication and outputs 256b that
528  * represented by 3 128b due to code size optimization.
529  *
530  * Output layout:
531  * |            |             |             |
532  * |------------|-------------|-------------|
533  * | ret.val[0] | h3:h2:00:00 | high   128b |
534  * | ret.val[1] |   :m2:m1:00 | middle 128b |
535  * | ret.val[2] |   :  :l1:l0 | low    128b |
536  */
537 static inline uint8x16x3_t poly_mult_128(uint8x16_t a, uint8x16_t b)
538 {
539     uint8x16x3_t ret;
540     uint8x16_t h, m, l; /* retval high/middle/low */
541     uint8x16_t c, d, e;
542 
543     h = pmull_high(a, b);                       /* h3:h2:00:00 = a1*b1 */
544     l = pmull_low(a, b);                        /*   :  :l1:l0 = a0*b0 */
545     c = vextq_u8(b, b, 8);                      /*      :c1:c0 = b0:b1 */
546     d = pmull_high(a, c);                       /*   :d2:d1:00 = a1*b0 */
547     e = pmull_low(a, c);                        /*   :e2:e1:00 = a0*b1 */
548     m = veorq_u8(d, e);                         /*   :m2:m1:00 = d + e */
549 
550     ret.val[0] = h;
551     ret.val[1] = m;
552     ret.val[2] = l;
553     return ret;
554 }
555 
556 /*
557  * Modulo reduction.
558  *
559  * See: https://www.researchgate.net/publication/285612706_Implementing_GCM_on_ARMv8
560  *
561  * Section 4.3
562  *
563  * Modular reduction is slightly more complex. Write the GCM modulus as f(z) =
564  * z^128 +r(z), where r(z) = z^7+z^2+z+ 1. The well known approach is to
565  * consider that z^128 ≡r(z) (mod z^128 +r(z)), allowing us to write the 256-bit
566  * operand to be reduced as a(z) = h(z)z^128 +l(z)≡h(z)r(z) + l(z). That is, we
567  * simply multiply the higher part of the operand by r(z) and add it to l(z). If
568  * the result is still larger than 128 bits, we reduce again.
569  */
570 static inline uint8x16_t poly_mult_reduce(uint8x16x3_t input)
571 {
572     uint8x16_t const ZERO = vdupq_n_u8(0);
573 
574     uint64x2_t r = vreinterpretq_u64_u8(vdupq_n_u8(0x87));
575 #if defined(__GNUC__)
576     /* use 'asm' as an optimisation barrier to prevent loading MODULO from
577      * memory. It is for GNUC compatible compilers.
578      */
579     asm volatile ("" : "+w" (r));
580 #endif
581     uint8x16_t const MODULO = vreinterpretq_u8_u64(vshrq_n_u64(r, 64 - 8));
582     uint8x16_t h, m, l; /* input high/middle/low 128b */
583     uint8x16_t c, d, e, f, g, n, o;
584     h = input.val[0];            /* h3:h2:00:00                          */
585     m = input.val[1];            /*   :m2:m1:00                          */
586     l = input.val[2];            /*   :  :l1:l0                          */
587     c = pmull_high(h, MODULO);   /*   :c2:c1:00 = reduction of h3        */
588     d = pmull_low(h, MODULO);    /*   :  :d1:d0 = reduction of h2        */
589     e = veorq_u8(c, m);          /*   :e2:e1:00 = m2:m1:00 + c2:c1:00    */
590     f = pmull_high(e, MODULO);   /*   :  :f1:f0 = reduction of e2        */
591     g = vextq_u8(ZERO, e, 8);    /*   :  :g1:00 = e1:00                  */
592     n = veorq_u8(d, l);          /*   :  :n1:n0 = d1:d0 + l1:l0          */
593     o = veorq_u8(n, f);          /*       o1:o0 = f1:f0 + n1:n0          */
594     return veorq_u8(o, g);       /*             = o1:o0 + g1:00          */
595 }
596 
597 /*
598  * GCM multiplication: c = a times b in GF(2^128)
599  */
600 void mbedtls_aesce_gcm_mult(unsigned char c[16],
601                             const unsigned char a[16],
602                             const unsigned char b[16])
603 {
604     uint8x16_t va, vb, vc;
605     va = vrbitq_u8(vld1q_u8(&a[0]));
606     vb = vrbitq_u8(vld1q_u8(&b[0]));
607     vc = vrbitq_u8(poly_mult_reduce(poly_mult_128(va, vb)));
608     vst1q_u8(&c[0], vc);
609 }
610 
611 #endif /* MBEDTLS_GCM_C */
612 
613 #if defined(MBEDTLS_POP_TARGET_PRAGMA)
614 #if defined(__clang__)
615 #pragma clang attribute pop
616 #elif defined(__GNUC__)
617 #pragma GCC pop_options
618 #endif
619 #undef MBEDTLS_POP_TARGET_PRAGMA
620 #endif
621 
622 #endif /* MBEDTLS_AESCE_HAVE_CODE */
623 
624 #endif /* MBEDTLS_AESCE_C */
625

1	/*
2	* Armv8-A Cryptographic Extension support functions for Aarch64
3	*
4	* Copyright The Mbed TLS Contributors
5	* SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
6	*/
7
8	#if defined(__clang__) && (__clang_major__ >= 4)
9
10	/* Ideally, we would simply use MBEDTLS_ARCH_IS_ARMV8_A in the following #if,
11	* but that is defined by build_info.h, and we need this block to happen first. */
12	#if defined(__ARM_ARCH)
13	#if __ARM_ARCH >= 8
14	#define MBEDTLS_AESCE_ARCH_IS_ARMV8_A
15	#endif
16	#endif
17
18	#if defined(MBEDTLS_AESCE_ARCH_IS_ARMV8_A) && !defined(__ARM_FEATURE_CRYPTO)
19	/* The intrinsic declarations are guarded by predefined ACLE macros in clang:
20	* these are normally only enabled by the -march option on the command line.
21	* By defining the macros ourselves we gain access to those declarations without
22	* requiring -march on the command line.
23	*
24	* `arm_neon.h` is included by common.h, so we put these defines
25	* at the top of this file, before any includes. This is necessary with
26	* Clang <=15.x. With Clang 16.0 and above, these macro definitions are
27	* no longer required, but they're harmless. See
28	* https://reviews.llvm.org/D131064
29	*/
30	#define __ARM_FEATURE_CRYPTO 1
31	/* See: https://arm-software.github.io/acle/main/acle.html#cryptographic-extensions
32	*
33	* `__ARM_FEATURE_CRYPTO` is deprecated, but we need to continue to specify it
34	* for older compilers.
35	*/
36	#define __ARM_FEATURE_AES 1
37	#define MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG
38	#endif
39
40	#endif / defined(__clang__) && (__clang_major__ >= 4) /
41
42	#include "common.h"
43
44	#if defined(MBEDTLS_AESCE_C)
45
46	#include <string.h>
47
48	#include "aesce.h"
49
50	#if defined(MBEDTLS_AESCE_HAVE_CODE)
51
52	/* Compiler version checks. */
53	#if defined(__clang__)
54	# if defined(MBEDTLS_ARCH_IS_ARM32) && (__clang_major__ < 11)
55	# error "Minimum version of Clang for MBEDTLS_AESCE_C on 32-bit Arm or Thumb is 11.0."
56	# elif defined(MBEDTLS_ARCH_IS_ARM64) && (__clang_major__ < 4)
57	# error "Minimum version of Clang for MBEDTLS_AESCE_C on aarch64 is 4.0."
58	# endif
59	#elif defined(__GNUC__)
60	# if __GNUC__ < 6
61	# error "Minimum version of GCC for MBEDTLS_AESCE_C is 6.0."
62	# endif
63	#elif defined(_MSC_VER)
64	/ TODO: We haven't verified MSVC from 1920 to 1928. If someone verified that,*
65	* please update this and document of `MBEDTLS_AESCE_C` in
66	* `mbedtls_config.h`. */
67	# if _MSC_VER < 1929
68	# error "Minimum version of MSVC for MBEDTLS_AESCE_C is 2019 version 16.11.2."
69	# endif
70	#elif defined(__ARMCC_VERSION)
71	# if defined(MBEDTLS_ARCH_IS_ARM32) && (__ARMCC_VERSION < 6200002)
72	/ TODO: We haven't verified armclang for 32-bit Arm/Thumb prior to 6.20.*
73	* If someone verified that, please update this and document of
74	* `MBEDTLS_AESCE_C` in `mbedtls_config.h`. */
75	# error "Minimum version of armclang for MBEDTLS_AESCE_C on 32-bit Arm is 6.20."
76	# elif defined(MBEDTLS_ARCH_IS_ARM64) && (__ARMCC_VERSION < 6060000)
77	# error "Minimum version of armclang for MBEDTLS_AESCE_C on aarch64 is 6.6."
78	# endif
79	#endif
80
81	#if !(defined(__ARM_FEATURE_CRYPTO) \|\| defined(__ARM_FEATURE_AES)) \|\| \
82	defined(MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG)
83	# if defined(__ARMCOMPILER_VERSION)
84	# if __ARMCOMPILER_VERSION <= 6090000
85	# error "Must use minimum -march=armv8-a+crypto for MBEDTLS_AESCE_C"
86	# else
87	# pragma clang attribute push (__attribute__((target("aes"))), apply_to=function)
88	# define MBEDTLS_POP_TARGET_PRAGMA
89	# endif
90	# elif defined(__clang__)
91	# if __clang_major__ < 7
92	# pragma clang attribute push (__attribute__((target("crypto"))), apply_to=function)
93	# else
94	# pragma clang attribute push (__attribute__((target("aes"))), apply_to=function)
95	# endif
96	# define MBEDTLS_POP_TARGET_PRAGMA
97	# elif defined(__GNUC__)
98	# pragma GCC push_options
99	# pragma GCC target ("+crypto")
100	# define MBEDTLS_POP_TARGET_PRAGMA
101	# elif defined(_MSC_VER)
102	# error "Required feature(__ARM_FEATURE_AES) is not enabled."
103	# endif
104	#endif / !(__ARM_FEATURE_CRYPTO \|\| __ARM_FEATURE_AES) \|\|*
105	MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG /*
106
107	#if defined(__linux__) && !defined(MBEDTLS_AES_USE_HARDWARE_ONLY)
108
109	#include <sys/auxv.h>
110	#if !defined(HWCAP_NEON)
111	#define HWCAP_NEON (1 << 12)
112	#endif
113	#if !defined(HWCAP2_AES)
114	#define HWCAP2_AES (1 << 0)
115	#endif
116	#if !defined(HWCAP_AES)
117	#define HWCAP_AES (1 << 3)
118	#endif
119	#if !defined(HWCAP_ASIMD)
120	#define HWCAP_ASIMD (1 << 1)
121	#endif
122
123	signed char mbedtls_aesce_has_support_result = -1;
124
125	#if !defined(MBEDTLS_AES_USE_HARDWARE_ONLY)
126	/*
127	* AES instruction support detection routine
128	*/
129	int mbedtls_aesce_has_support_impl(void)
130	{
131	/ To avoid many calls to getauxval, cache the result. This is*
132	* thread-safe, because we store the result in a char so cannot
133	* be vulnerable to non-atomic updates.
134	* It is possible that we could end up setting result more than
135	* once, but that is harmless.
136	*/
137	if (mbedtls_aesce_has_support_result == -1) {
138	#if defined(MBEDTLS_ARCH_IS_ARM32)
139	unsigned long auxval = getauxval(AT_HWCAP);
140	unsigned long auxval2 = getauxval(AT_HWCAP2);
141	if (((auxval & HWCAP_NEON) == HWCAP_NEON) &&
142	((auxval2 & HWCAP2_AES) == HWCAP2_AES)) {
143	mbedtls_aesce_has_support_result = 1;
144	} else {
145	mbedtls_aesce_has_support_result = 0;
146	}
147	#else
148	unsigned long auxval = getauxval(AT_HWCAP);
149	if ((auxval & (HWCAP_ASIMD \| HWCAP_AES)) ==
150	(HWCAP_ASIMD \| HWCAP_AES)) {
151	mbedtls_aesce_has_support_result = 1;
152	} else {
153	mbedtls_aesce_has_support_result = 0;
154	}
155	#endif
156	}
157	return mbedtls_aesce_has_support_result;
158	}
159	#endif
160
161	#endif / defined(__linux__) && !defined(MBEDTLS_AES_USE_HARDWARE_ONLY) /
162
163	/ Single round of AESCE encryption /
164	#define AESCE_ENCRYPT_ROUND \
165	block = vaeseq_u8(block, vld1q_u8(keys)); \
166	block = vaesmcq_u8(block); \
167	keys += 16
168	/ Two rounds of AESCE encryption /
169	#define AESCE_ENCRYPT_ROUND_X2 AESCE_ENCRYPT_ROUND; AESCE_ENCRYPT_ROUND
170
171	MBEDTLS_OPTIMIZE_FOR_PERFORMANCE
172	static uint8x16_t aesce_encrypt_block(uint8x16_t block,
173	unsigned char *keys,
174	int rounds)
175	{
176	/ 10, 12 or 14 rounds. Unroll loop. /
177	if (rounds == 10) {
178	goto rounds_10;
179	}
180	if (rounds == 12) {
181	goto rounds_12;
182	}
183	AESCE_ENCRYPT_ROUND_X2;
184	rounds_12:
185	AESCE_ENCRYPT_ROUND_X2;
186	rounds_10:
187	AESCE_ENCRYPT_ROUND_X2;
188	AESCE_ENCRYPT_ROUND_X2;
189	AESCE_ENCRYPT_ROUND_X2;
190	AESCE_ENCRYPT_ROUND_X2;
191	AESCE_ENCRYPT_ROUND;
192
193	/ AES AddRoundKey for the previous round.*
194	* SubBytes, ShiftRows for the final round. */
195	block = vaeseq_u8(block, vld1q_u8(keys));
196	keys += 16;
197
198	/ Final round: no MixColumns /
199
200	/ Final AddRoundKey /
201	block = veorq_u8(block, vld1q_u8(keys));
202
203	return block;
204	}
205
206	/ Single round of AESCE decryption*
207	*
208	* AES AddRoundKey, SubBytes, ShiftRows
209	*
210	* block = vaesdq_u8(block, vld1q_u8(keys));
211	*
212	* AES inverse MixColumns for the next round.
213	*
214	* This means that we switch the order of the inverse AddRoundKey and
215	* inverse MixColumns operations. We have to do this as AddRoundKey is
216	* done in an atomic instruction together with the inverses of SubBytes
217	* and ShiftRows.
218	*
219	* It works because MixColumns is a linear operation over GF(2^8) and
220	* AddRoundKey is an exclusive or, which is equivalent to addition over
221	* GF(2^8). (The inverse of MixColumns needs to be applied to the
222	* affected round keys separately which has been done when the
223	* decryption round keys were calculated.)
224	*
225	* block = vaesimcq_u8(block);
226	*/
227	#define AESCE_DECRYPT_ROUND \
228	block = vaesdq_u8(block, vld1q_u8(keys)); \
229	block = vaesimcq_u8(block); \
230	keys += 16
231	/ Two rounds of AESCE decryption /
232	#define AESCE_DECRYPT_ROUND_X2 AESCE_DECRYPT_ROUND; AESCE_DECRYPT_ROUND
233
234	#if !defined(MBEDTLS_BLOCK_CIPHER_NO_DECRYPT)
235	static uint8x16_t aesce_decrypt_block(uint8x16_t block,
236	unsigned char *keys,
237	int rounds)
238	{
239	/ 10, 12 or 14 rounds. Unroll loop. /
240	if (rounds == 10) {
241	goto rounds_10;
242	}
243	if (rounds == 12) {
244	goto rounds_12;
245	}
246	AESCE_DECRYPT_ROUND_X2;
247	rounds_12:
248	AESCE_DECRYPT_ROUND_X2;
249	rounds_10:
250	AESCE_DECRYPT_ROUND_X2;
251	AESCE_DECRYPT_ROUND_X2;
252	AESCE_DECRYPT_ROUND_X2;
253	AESCE_DECRYPT_ROUND_X2;
254	AESCE_DECRYPT_ROUND;
255
256	/ The inverses of AES AddRoundKey, SubBytes, ShiftRows finishing up the*
257	* last full round. */
258	block = vaesdq_u8(block, vld1q_u8(keys));
259	keys += 16;
260
261	/ Inverse AddRoundKey for inverting the initial round key addition. /
262	block = veorq_u8(block, vld1q_u8(keys));
263
264	return block;
265	}
266	#endif
267
268	/*
269	* AES-ECB block en(de)cryption
270	*/
271	int mbedtls_aesce_crypt_ecb(mbedtls_aes_context *ctx,
272	int mode,
273	const unsigned char input[16],
274	unsigned char output[16])
275	{
276	uint8x16_t block = vld1q_u8(&input[0]);
277	unsigned char keys = (unsigned* char *) (ctx->buf + ctx->rk_offset);
278
279	#if !defined(MBEDTLS_BLOCK_CIPHER_NO_DECRYPT)
280	if (mode == MBEDTLS_AES_DECRYPT) {
281	block = aesce_decrypt_block(block, keys, ctx->nr);
282	} else
283	#else
284	(void) mode;
285	#endif
286	{
287	block = aesce_encrypt_block(block, keys, ctx->nr);
288	}
289	vst1q_u8(&output[0], block);
290
291	return 0;
292	}
293
/*
 * Compute decryption round keys from encryption round keys
 *
 * The (nr + 1) 16-byte round keys in `fwdkey` are written to `invkey` in
 * reverse order. The first and last keys are copied unchanged; each key in
 * between additionally goes through vaesimcq_u8 (inverse MixColumns).
 */
#if !defined(MBEDTLS_BLOCK_CIPHER_NO_DECRYPT)
void mbedtls_aesce_inverse_key(unsigned char *invkey,
                               const unsigned char *fwdkey,
                               int nr)
{
    int i, j;
    j = nr;
    /* Last encryption round key becomes the first decryption round key. */
    vst1q_u8(invkey, vld1q_u8(fwdkey + j * 16));
    for (i = 1, j--; j > 0; i++, j--) {
        vst1q_u8(invkey + i * 16,
                 vaesimcq_u8(vld1q_u8(fwdkey + j * 16)));
    }
    /* Here i == nr and j == 0: first encryption key goes last, unchanged. */
    vst1q_u8(invkey + i * 16, vld1q_u8(fwdkey + j * 16));

}
#endif

/* Rotate a 32-bit word right by 8 bits: the lowest byte moves to the top. */
static inline uint32_t aes_rot_word(uint32_t word)
{
    return (word << (32 - 8)) | (word >> 8);
}

319	static inline uint32_t aes_sub_word(uint32_t in)
320	{
321	uint8x16_t v = vreinterpretq_u8_u32(vdupq_n_u32(in));
322	uint8x16_t zero = vdupq_n_u8(0);
323
324	/ vaeseq_u8 does both SubBytes and ShiftRows. Taking the first row yields*
325	* the correct result as ShiftRows doesn't change the first row. */
326	v = vaeseq_u8(zero, v);
327	return vgetq_lane_u32(vreinterpretq_u32_u8(v), 0);
328	}
329
/*
 * Key expansion function
 *
 * rk              Output buffer for the expanded key; Nb*(Nr+1) 32-bit words
 *                 are written, starting with a copy of `key`.
 * key             The cipher key, key_bit_length / 8 bytes.
 * key_bit_length  128, 192 or 256 (the caller validates this).
 */
static void aesce_setkey_enc(unsigned char *rk,
                             const unsigned char *key,
                             const size_t key_bit_length)
{
    static uint8_t const rcon[] = { 0x01, 0x02, 0x04, 0x08, 0x10,
                                    0x20, 0x40, 0x80, 0x1b, 0x36 };
    /* See https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.197.pdf
     *   - Section 5, Nr = Nk + 6
     *   - Section 5.2, the length of round keys is Nb*(Nr+1)
     */
    const size_t key_len_in_words = key_bit_length / 32;    /* Nk */
    const size_t round_key_len_in_words = 4;                /* Nb */
    const size_t rounds_needed = key_len_in_words + 6;      /* Nr */
    const size_t round_keys_len_in_words =
        round_key_len_in_words * (rounds_needed + 1);       /* Nb*(Nr+1) */
    const uint32_t *rko_end = (uint32_t *) rk + round_keys_len_in_words;

    memcpy(rk, key, key_len_in_words * 4);

    /* Each iteration derives the next Nk words (rko) from the previous
     * Nk words (rki). */
    for (uint32_t *rki = (uint32_t *) rk;
         rki + key_len_in_words < rko_end;
         rki += key_len_in_words) {

        size_t iteration = (size_t) (rki - (uint32_t *) rk) / key_len_in_words;
        uint32_t *rko;
        rko = rki + key_len_in_words;
        rko[0] = aes_rot_word(aes_sub_word(rki[key_len_in_words - 1]));
        rko[0] ^= rcon[iteration] ^ rki[0];
        rko[1] = rko[0] ^ rki[1];
        rko[2] = rko[1] ^ rki[2];
        rko[3] = rko[2] ^ rki[3];
        if (rko + key_len_in_words > rko_end) {
            /* Do not write overflow words.*/
            continue;
        }
#if !defined(MBEDTLS_AES_ONLY_128_BIT_KEY_LENGTH)
        switch (key_bit_length) {
            case 128:
                break;
            case 192:
                rko[4] = rko[3] ^ rki[4];
                rko[5] = rko[4] ^ rki[5];
                break;
            case 256:
                rko[4] = aes_sub_word(rko[3]) ^ rki[4];
                rko[5] = rko[4] ^ rki[5];
                rko[6] = rko[5] ^ rki[6];
                rko[7] = rko[6] ^ rki[7];
                break;
        }
#endif /* !MBEDTLS_AES_ONLY_128_BIT_KEY_LENGTH */
    }
}

387	/*
388	* Key expansion, wrapper
389	*/
390	int mbedtls_aesce_setkey_enc(unsigned char *rk,
391	const unsigned char *key,
392	size_t bits)
393	{
394	switch (bits) {
395	case 128:
396	case 192:
397	case 256:
398	aesce_setkey_enc(rk, key, bits);
399	break;
400	default:
401	return MBEDTLS_ERR_AES_INVALID_KEY_LENGTH;
402	}
403
404	return 0;
405	}
406
407	#if defined(MBEDTLS_GCM_C)
408
409	#if defined(MBEDTLS_ARCH_IS_ARM32)
410
411	#if defined(__clang__)
412	/ On clang for A32/T32, work around some missing intrinsics and types which are listed in*
413	* [ACLE](https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#polynomial-1)
414	* These are only required for GCM.
415	*/
416	#define vreinterpretq_u64_p64(a) ((uint64x2_t) a)
417
418	typedef uint8x16_t poly128_t;
419
420	static inline poly128_t vmull_p64(poly64_t a, poly64_t b)
421	{
422	poly128_t r;
423	asm ("vmull.p64 %[r], %[a], %[b]" : [r] "=w" (r) : [a] "w" (a), [b] "w" (b) :);
424	return r;
425	}
426
427	/ This is set to cause some more missing intrinsics to be defined below /
428	#define COMMON_MISSING_INTRINSICS
429
430	static inline poly128_t vmull_high_p64(poly64x2_t a, poly64x2_t b)
431	{
432	return vmull_p64((poly64_t) (vget_high_u64((uint64x2_t) a)),
433	(poly64_t) (vget_high_u64((uint64x2_t) b)));
434	}
435
436	#endif / defined(__clang__) /
437
438	static inline uint8x16_t vrbitq_u8(uint8x16_t x)
439	{
440	/ There is no vrbitq_u8 instruction in A32/T32, so provide*
441	* an equivalent non-Neon implementation. Reverse bit order in each
442	* byte with 4x rbit, rev. */
443	asm ("ldm %[p], { r2-r5 } \n\t"
444	"rbit r2, r2 \n\t"
445	"rev r2, r2 \n\t"
446	"rbit r3, r3 \n\t"
447	"rev r3, r3 \n\t"
448	"rbit r4, r4 \n\t"
449	"rev r4, r4 \n\t"
450	"rbit r5, r5 \n\t"
451	"rev r5, r5 \n\t"
452	"stm %[p], { r2-r5 } \n\t"
453	:
454	/ Output: 16 bytes of memory pointed to by &x /
455	"+m" ((uint8_t()[16]) &x)
456	:
457	[p] "r" (&x)
458	:
459	"r2", "r3", "r4", "r5"
460	);
461	return x;
462	}
463
464	#endif / defined(MBEDTLS_ARCH_IS_ARM32) /
465
466	#if defined(MBEDTLS_COMPILER_IS_GCC) && __GNUC__ == 5
467	/ Some intrinsics are not available for GCC 5.X. /
468	#define COMMON_MISSING_INTRINSICS
469	#endif / MBEDTLS_COMPILER_IS_GCC && __GNUC__ == 5 /
470
471
#if defined(COMMON_MISSING_INTRINSICS)

/* Missing intrinsics common to both GCC 5, and Clang on 32-bit */

#define vreinterpretq_p64_u8(a)  ((poly64x2_t) a)
#define vreinterpretq_u8_p128(a) ((uint8x16_t) a)

/* Low 64-bit half of a, obtained through the equivalent u64 intrinsic. */
static inline poly64x1_t vget_low_p64(poly64x2_t a)
{
    uint64x1_t r = vget_low_u64(vreinterpretq_u64_p64(a));
    return (poly64x1_t) r;

}

#endif /* COMMON_MISSING_INTRINSICS */

/* vmull_p64/vmull_high_p64 wrappers.
 *
 * Older compilers miss some intrinsic functions for `poly*_t`. We use
 * uint8x16_t and uint8x16x3_t as input/output parameters.
 */
#if defined(MBEDTLS_COMPILER_IS_GCC)
/* GCC reports incompatible type error without cast. GCC think poly64_t and
 * poly64x1_t are different, that is different with MSVC and Clang. */
#define MBEDTLS_VMULL_P64(a, b) vmull_p64((poly64_t) a, (poly64_t) b)
#else
/* MSVC reports `error C2440: 'type cast'` with cast. Clang does not report
 * error with/without cast. And I think poly64_t and poly64x1_t are same, no
 * cast for clang also. */
#define MBEDTLS_VMULL_P64(a, b) vmull_p64(a, b)
#endif /* MBEDTLS_COMPILER_IS_GCC */

504	static inline uint8x16_t pmull_low(uint8x16_t a, uint8x16_t b)
505	{
506
507	return vreinterpretq_u8_p128(
508	MBEDTLS_VMULL_P64(
509	(poly64_t) vget_low_p64(vreinterpretq_p64_u8(a)),
510	(poly64_t) vget_low_p64(vreinterpretq_p64_u8(b))
511	));
512	}
513
514	static inline uint8x16_t pmull_high(uint8x16_t a, uint8x16_t b)
515	{
516	return vreinterpretq_u8_p128(
517	vmull_high_p64(vreinterpretq_p64_u8(a),
518	vreinterpretq_p64_u8(b)));
519	}
520
521	/ GHASH does 128b polynomial multiplication on block in GF(2^128) defined by*
522	* `x^128 + x^7 + x^2 + x + 1`.
523	*
524	* Arm64 only has 64b->128b polynomial multipliers, we need to do 4 64b
525	* multiplies to generate a 128b.
526	*
527	* `poly_mult_128` executes polynomial multiplication and outputs 256b that
528	* represented by 3 128b due to code size optimization.
529	*
530	* Output layout:
531	* \| \| \| \|
532	* \|------------\|-------------\|-------------\|
533	* \| ret.val[0] \| h3:h2:00:00 \| high 128b \|
534	* \| ret.val[1] \| :m2:m1:00 \| middle 128b \|
535	* \| ret.val[2] \| : :l1:l0 \| low 128b \|
536	*/
537	static inline uint8x16x3_t poly_mult_128(uint8x16_t a, uint8x16_t b)
538	{
539	uint8x16x3_t ret;
540	uint8x16_t h, m, l; / retval high/middle/low /
541	uint8x16_t c, d, e;
542
543	h = pmull_high(a, b); / h3:h2:00:00 = a1b1 /*
544	l = pmull_low(a, b); / : :l1:l0 = a0b0 /*
545	c = vextq_u8(b, b, 8); / :c1:c0 = b0:b1 /
546	d = pmull_high(a, c); / :d2:d1:00 = a1b0 /*
547	e = pmull_low(a, c); / :e2:e1:00 = a0b1 /*
548	m = veorq_u8(d, e); / :m2:m1:00 = d + e /
549
550	ret.val[0] = h;
551	ret.val[1] = m;
552	ret.val[2] = l;
553	return ret;
554	}
555
556	/*
557	* Modulo reduction.
558	*
559	* See: https://www.researchgate.net/publication/285612706_Implementing_GCM_on_ARMv8
560	*
561	* Section 4.3
562	*
563	* Modular reduction is slightly more complex. Write the GCM modulus as f(z) =
564	* z^128 +r(z), where r(z) = z^7+z^2+z+ 1. The well known approach is to
565	* consider that z^128 ≡r(z) (mod z^128 +r(z)), allowing us to write the 256-bit
566	* operand to be reduced as a(z) = h(z)z^128 +l(z)≡h(z)r(z) + l(z). That is, we
567	* simply multiply the higher part of the operand by r(z) and add it to l(z). If
568	* the result is still larger than 128 bits, we reduce again.
569	*/
570	static inline uint8x16_t poly_mult_reduce(uint8x16x3_t input)
571	{
572	uint8x16_t const ZERO = vdupq_n_u8(0);
573
574	uint64x2_t r = vreinterpretq_u64_u8(vdupq_n_u8(0x87));
575	#if defined(__GNUC__)
576	/ use 'asm' as an optimisation barrier to prevent loading MODULO from*
577	* memory. It is for GNUC compatible compilers.
578	*/
579	asm volatile ("" : "+w" (r));
580	#endif
581	uint8x16_t const MODULO = vreinterpretq_u8_u64(vshrq_n_u64(r, 64 - 8));
582	uint8x16_t h, m, l; / input high/middle/low 128b /
583	uint8x16_t c, d, e, f, g, n, o;
584	h = input.val[0]; / h3:h2:00:00 /
585	m = input.val[1]; / :m2:m1:00 /
586	l = input.val[2]; / : :l1:l0 /
587	c = pmull_high(h, MODULO); / :c2:c1:00 = reduction of h3 /
588	d = pmull_low(h, MODULO); / : :d1:d0 = reduction of h2 /
589	e = veorq_u8(c, m); / :e2:e1:00 = m2:m1:00 + c2:c1:00 /
590	f = pmull_high(e, MODULO); / : :f1:f0 = reduction of e2 /
591	g = vextq_u8(ZERO, e, 8); / : :g1:00 = e1:00 /
592	n = veorq_u8(d, l); / : :n1:n0 = d1:d0 + l1:l0 /
593	o = veorq_u8(n, f); / o1:o0 = f1:f0 + n1:n0 /
594	return veorq_u8(o, g); / = o1:o0 + g1:00 /
595	}
596
597	/*
598	* GCM multiplication: c = a times b in GF(2^128)
599	*/
600	void mbedtls_aesce_gcm_mult(unsigned char c[16],
601	const unsigned char a[16],
602	const unsigned char b[16])
603	{
604	uint8x16_t va, vb, vc;
605	va = vrbitq_u8(vld1q_u8(&a[0]));
606	vb = vrbitq_u8(vld1q_u8(&b[0]));
607	vc = vrbitq_u8(poly_mult_reduce(poly_mult_128(va, vb)));
608	vst1q_u8(&c[0], vc);
609	}
610
611	#endif / MBEDTLS_GCM_C /
612
613	#if defined(MBEDTLS_POP_TARGET_PRAGMA)
614	#if defined(__clang__)
615	#pragma clang attribute pop
616	#elif defined(__GNUC__)
617	#pragma GCC pop_options
618	#endif
619	#undef MBEDTLS_POP_TARGET_PRAGMA
620	#endif
621
622	#endif / MBEDTLS_AESCE_HAVE_CODE /
623
624	#endif / MBEDTLS_AESCE_C /
625