v / thirdparty / mbedtls / library / aesce.c
624 lines · 551 sloc · 20.17 KB · 3d9911f887ecec942f9ae2a5be02d064f233b729
Raw
1/*
2 * Armv8-A Cryptographic Extension support functions for Aarch64
3 *
4 * Copyright The Mbed TLS Contributors
5 * SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
6 */
7
8#if defined(__clang__) && (__clang_major__ >= 4)
9
10/* Ideally, we would simply use MBEDTLS_ARCH_IS_ARMV8_A in the following #if,
11 * but that is defined by build_info.h, and we need this block to happen first. */
12#if defined(__ARM_ARCH)
13#if __ARM_ARCH >= 8
14#define MBEDTLS_AESCE_ARCH_IS_ARMV8_A
15#endif
16#endif
17
18#if defined(MBEDTLS_AESCE_ARCH_IS_ARMV8_A) && !defined(__ARM_FEATURE_CRYPTO)
/* The intrinsic declarations are guarded by predefined ACLE macros in clang:
 * these are normally only enabled by the -march option on the command line.
 * By defining the macros ourselves we gain access to those declarations without
 * requiring -march on the command line.
 *
 * `arm_neon.h` is included by common.h, so we put these defines
 * at the top of this file, before any includes. This is necessary with
 * Clang <=15.x. With Clang 16.0 and above, these macro definitions are
 * no longer required, but they're harmless. See
 * https://reviews.llvm.org/D131064
 */
30#define __ARM_FEATURE_CRYPTO 1
31/* See: https://arm-software.github.io/acle/main/acle.html#cryptographic-extensions
32 *
33 * `__ARM_FEATURE_CRYPTO` is deprecated, but we need to continue to specify it
34 * for older compilers.
35 */
36#define __ARM_FEATURE_AES 1
37#define MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG
38#endif
39
40#endif /* defined(__clang__) && (__clang_major__ >= 4) */
41
42#include "common.h"
43
44#if defined(MBEDTLS_AESCE_C)
45
46#include <string.h>
47
48#include "aesce.h"
49
50#if defined(MBEDTLS_AESCE_HAVE_CODE)
51
52/* Compiler version checks. */
53#if defined(__clang__)
54# if defined(MBEDTLS_ARCH_IS_ARM32) && (__clang_major__ < 11)
55# error "Minimum version of Clang for MBEDTLS_AESCE_C on 32-bit Arm or Thumb is 11.0."
56# elif defined(MBEDTLS_ARCH_IS_ARM64) && (__clang_major__ < 4)
57# error "Minimum version of Clang for MBEDTLS_AESCE_C on aarch64 is 4.0."
58# endif
59#elif defined(__GNUC__)
60# if __GNUC__ < 6
61# error "Minimum version of GCC for MBEDTLS_AESCE_C is 6.0."
62# endif
63#elif defined(_MSC_VER)
/* TODO: We haven't verified MSVC versions 1920 to 1928. If someone verifies
 * them, please update this check and the documentation of `MBEDTLS_AESCE_C`
 * in `mbedtls_config.h`. */
67# if _MSC_VER < 1929
68# error "Minimum version of MSVC for MBEDTLS_AESCE_C is 2019 version 16.11.2."
69# endif
70#elif defined(__ARMCC_VERSION)
71# if defined(MBEDTLS_ARCH_IS_ARM32) && (__ARMCC_VERSION < 6200002)
/* TODO: We haven't verified armclang for 32-bit Arm/Thumb prior to 6.20.
 * If someone verifies it, please update this check and the documentation of
 * `MBEDTLS_AESCE_C` in `mbedtls_config.h`. */
75# error "Minimum version of armclang for MBEDTLS_AESCE_C on 32-bit Arm is 6.20."
76# elif defined(MBEDTLS_ARCH_IS_ARM64) && (__ARMCC_VERSION < 6060000)
77# error "Minimum version of armclang for MBEDTLS_AESCE_C on aarch64 is 6.6."
78# endif
79#endif
80
81#if !(defined(__ARM_FEATURE_CRYPTO) || defined(__ARM_FEATURE_AES)) || \
82 defined(MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG)
83# if defined(__ARMCOMPILER_VERSION)
84# if __ARMCOMPILER_VERSION <= 6090000
85# error "Must use minimum -march=armv8-a+crypto for MBEDTLS_AESCE_C"
86# else
87# pragma clang attribute push (__attribute__((target("aes"))), apply_to=function)
88# define MBEDTLS_POP_TARGET_PRAGMA
89# endif
90# elif defined(__clang__)
91# if __clang_major__ < 7
92# pragma clang attribute push (__attribute__((target("crypto"))), apply_to=function)
93# else
94# pragma clang attribute push (__attribute__((target("aes"))), apply_to=function)
95# endif
96# define MBEDTLS_POP_TARGET_PRAGMA
97# elif defined(__GNUC__)
98# pragma GCC push_options
99# pragma GCC target ("+crypto")
100# define MBEDTLS_POP_TARGET_PRAGMA
101# elif defined(_MSC_VER)
102# error "Required feature(__ARM_FEATURE_AES) is not enabled."
103# endif
104#endif /* !(__ARM_FEATURE_CRYPTO || __ARM_FEATURE_AES) ||
105 MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG */
106
107#if defined(__linux__) && !defined(MBEDTLS_AES_USE_HARDWARE_ONLY)
108
109#include <sys/auxv.h>
110#if !defined(HWCAP_NEON)
111#define HWCAP_NEON (1 << 12)
112#endif
113#if !defined(HWCAP2_AES)
114#define HWCAP2_AES (1 << 0)
115#endif
116#if !defined(HWCAP_AES)
117#define HWCAP_AES (1 << 3)
118#endif
119#if !defined(HWCAP_ASIMD)
120#define HWCAP_ASIMD (1 << 1)
121#endif
122
123signed char mbedtls_aesce_has_support_result = -1;
124
125#if !defined(MBEDTLS_AES_USE_HARDWARE_ONLY)
126/*
127 * AES instruction support detection routine
128 */
129int mbedtls_aesce_has_support_impl(void)
130{
131 /* To avoid many calls to getauxval, cache the result. This is
132 * thread-safe, because we store the result in a char so cannot
133 * be vulnerable to non-atomic updates.
134 * It is possible that we could end up setting result more than
135 * once, but that is harmless.
136 */
137 if (mbedtls_aesce_has_support_result == -1) {
138#if defined(MBEDTLS_ARCH_IS_ARM32)
139 unsigned long auxval = getauxval(AT_HWCAP);
140 unsigned long auxval2 = getauxval(AT_HWCAP2);
141 if (((auxval & HWCAP_NEON) == HWCAP_NEON) &&
142 ((auxval2 & HWCAP2_AES) == HWCAP2_AES)) {
143 mbedtls_aesce_has_support_result = 1;
144 } else {
145 mbedtls_aesce_has_support_result = 0;
146 }
147#else
148 unsigned long auxval = getauxval(AT_HWCAP);
149 if ((auxval & (HWCAP_ASIMD | HWCAP_AES)) ==
150 (HWCAP_ASIMD | HWCAP_AES)) {
151 mbedtls_aesce_has_support_result = 1;
152 } else {
153 mbedtls_aesce_has_support_result = 0;
154 }
155#endif
156 }
157 return mbedtls_aesce_has_support_result;
158}
159#endif
160
161#endif /* defined(__linux__) && !defined(MBEDTLS_AES_USE_HARDWARE_ONLY) */
162
163/* Single round of AESCE encryption */
164#define AESCE_ENCRYPT_ROUND \
165 block = vaeseq_u8(block, vld1q_u8(keys)); \
166 block = vaesmcq_u8(block); \
167 keys += 16
168/* Two rounds of AESCE encryption */
169#define AESCE_ENCRYPT_ROUND_X2 AESCE_ENCRYPT_ROUND; AESCE_ENCRYPT_ROUND
170
171MBEDTLS_OPTIMIZE_FOR_PERFORMANCE
172static uint8x16_t aesce_encrypt_block(uint8x16_t block,
173 unsigned char *keys,
174 int rounds)
175{
176 /* 10, 12 or 14 rounds. Unroll loop. */
177 if (rounds == 10) {
178 goto rounds_10;
179 }
180 if (rounds == 12) {
181 goto rounds_12;
182 }
183 AESCE_ENCRYPT_ROUND_X2;
184rounds_12:
185 AESCE_ENCRYPT_ROUND_X2;
186rounds_10:
187 AESCE_ENCRYPT_ROUND_X2;
188 AESCE_ENCRYPT_ROUND_X2;
189 AESCE_ENCRYPT_ROUND_X2;
190 AESCE_ENCRYPT_ROUND_X2;
191 AESCE_ENCRYPT_ROUND;
192
193 /* AES AddRoundKey for the previous round.
194 * SubBytes, ShiftRows for the final round. */
195 block = vaeseq_u8(block, vld1q_u8(keys));
196 keys += 16;
197
198 /* Final round: no MixColumns */
199
200 /* Final AddRoundKey */
201 block = veorq_u8(block, vld1q_u8(keys));
202
203 return block;
204}
205
206/* Single round of AESCE decryption
207 *
208 * AES AddRoundKey, SubBytes, ShiftRows
209 *
210 * block = vaesdq_u8(block, vld1q_u8(keys));
211 *
212 * AES inverse MixColumns for the next round.
213 *
214 * This means that we switch the order of the inverse AddRoundKey and
215 * inverse MixColumns operations. We have to do this as AddRoundKey is
216 * done in an atomic instruction together with the inverses of SubBytes
217 * and ShiftRows.
218 *
219 * It works because MixColumns is a linear operation over GF(2^8) and
220 * AddRoundKey is an exclusive or, which is equivalent to addition over
221 * GF(2^8). (The inverse of MixColumns needs to be applied to the
222 * affected round keys separately which has been done when the
223 * decryption round keys were calculated.)
224 *
225 * block = vaesimcq_u8(block);
226 */
227#define AESCE_DECRYPT_ROUND \
228 block = vaesdq_u8(block, vld1q_u8(keys)); \
229 block = vaesimcq_u8(block); \
230 keys += 16
231/* Two rounds of AESCE decryption */
232#define AESCE_DECRYPT_ROUND_X2 AESCE_DECRYPT_ROUND; AESCE_DECRYPT_ROUND
233
234#if !defined(MBEDTLS_BLOCK_CIPHER_NO_DECRYPT)
235static uint8x16_t aesce_decrypt_block(uint8x16_t block,
236 unsigned char *keys,
237 int rounds)
238{
239 /* 10, 12 or 14 rounds. Unroll loop. */
240 if (rounds == 10) {
241 goto rounds_10;
242 }
243 if (rounds == 12) {
244 goto rounds_12;
245 }
246 AESCE_DECRYPT_ROUND_X2;
247rounds_12:
248 AESCE_DECRYPT_ROUND_X2;
249rounds_10:
250 AESCE_DECRYPT_ROUND_X2;
251 AESCE_DECRYPT_ROUND_X2;
252 AESCE_DECRYPT_ROUND_X2;
253 AESCE_DECRYPT_ROUND_X2;
254 AESCE_DECRYPT_ROUND;
255
256 /* The inverses of AES AddRoundKey, SubBytes, ShiftRows finishing up the
257 * last full round. */
258 block = vaesdq_u8(block, vld1q_u8(keys));
259 keys += 16;
260
261 /* Inverse AddRoundKey for inverting the initial round key addition. */
262 block = veorq_u8(block, vld1q_u8(keys));
263
264 return block;
265}
266#endif
267
268/*
269 * AES-ECB block en(de)cryption
270 */
271int mbedtls_aesce_crypt_ecb(mbedtls_aes_context *ctx,
272 int mode,
273 const unsigned char input[16],
274 unsigned char output[16])
275{
276 uint8x16_t block = vld1q_u8(&input[0]);
277 unsigned char *keys = (unsigned char *) (ctx->buf + ctx->rk_offset);
278
279#if !defined(MBEDTLS_BLOCK_CIPHER_NO_DECRYPT)
280 if (mode == MBEDTLS_AES_DECRYPT) {
281 block = aesce_decrypt_block(block, keys, ctx->nr);
282 } else
283#else
284 (void) mode;
285#endif
286 {
287 block = aesce_encrypt_block(block, keys, ctx->nr);
288 }
289 vst1q_u8(&output[0], block);
290
291 return 0;
292}
293
294/*
295 * Compute decryption round keys from encryption round keys
296 */
297#if !defined(MBEDTLS_BLOCK_CIPHER_NO_DECRYPT)
298void mbedtls_aesce_inverse_key(unsigned char *invkey,
299 const unsigned char *fwdkey,
300 int nr)
301{
302 int i, j;
303 j = nr;
304 vst1q_u8(invkey, vld1q_u8(fwdkey + j * 16));
305 for (i = 1, j--; j > 0; i++, j--) {
306 vst1q_u8(invkey + i * 16,
307 vaesimcq_u8(vld1q_u8(fwdkey + j * 16)));
308 }
309 vst1q_u8(invkey + i * 16, vld1q_u8(fwdkey + j * 16));
310
311}
312#endif
313
/* Rotate a 32-bit word right by 8 bits: the lowest byte wraps around to
 * become the highest byte. */
static inline uint32_t aes_rot_word(uint32_t word)
{
    const uint32_t wrapped_low_byte = word << 24;
    const uint32_t shifted_down = word >> 8;

    return shifted_down | wrapped_low_byte;
}
318
319static inline uint32_t aes_sub_word(uint32_t in)
320{
321 uint8x16_t v = vreinterpretq_u8_u32(vdupq_n_u32(in));
322 uint8x16_t zero = vdupq_n_u8(0);
323
324 /* vaeseq_u8 does both SubBytes and ShiftRows. Taking the first row yields
325 * the correct result as ShiftRows doesn't change the first row. */
326 v = vaeseq_u8(zero, v);
327 return vgetq_lane_u32(vreinterpretq_u32_u8(v), 0);
328}
329
/*
 * AES key expansion.
 *
 * Copies the cipher key into the start of `rk` and then generates the
 * remaining round-key words, one group of Nk words per loop iteration.
 * `key_bit_length` is expected to be 128, 192 or 256 (the public wrapper
 * validates this).
 */
static void aesce_setkey_enc(unsigned char *rk,
                             const unsigned char *key,
                             const size_t key_bit_length)
{
    static uint8_t const rcon[] = { 0x01, 0x02, 0x04, 0x08, 0x10,
                                    0x20, 0x40, 0x80, 0x1b, 0x36 };
    /* See https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.197.pdf
     *   - Section 5: Nr = Nk + 6
     *   - Section 5.2: the round keys occupy Nb*(Nr+1) words
     */
    const size_t nk = key_bit_length / 32;          /* Nk: key length in words */
    const size_t nb = 4;                            /* Nb: round key length in words */
    const size_t nr = nk + 6;                       /* Nr: number of rounds */
    const size_t total_words = nb * (nr + 1);       /* Nb*(Nr+1) */
    uint32_t *const words = (uint32_t *) rk;

    memcpy(rk, key, nk * 4);

    /* `base` is the word index of the previous Nk-word group and `group`
     * counts iterations, selecting the round constant. */
    for (size_t group = 0, base = 0;
         base + nk < total_words;
         group++, base += nk) {

        const uint32_t *prev = words + base;
        uint32_t *next = words + base + nk;

        next[0] = aes_rot_word(aes_sub_word(prev[nk - 1])) ^ rcon[group] ^ prev[0];
        next[1] = next[0] ^ prev[1];
        next[2] = next[1] ^ prev[2];
        next[3] = next[2] ^ prev[3];

        if (base + 2 * nk > total_words) {
            /* A full group would run past the end of the round keys: only
             * the first four words are needed. */
            continue;
        }
#if !defined(MBEDTLS_AES_ONLY_128_BIT_KEY_LENGTH)
        if (key_bit_length == 192) {
            next[4] = next[3] ^ prev[4];
            next[5] = next[4] ^ prev[5];
        } else if (key_bit_length == 256) {
            /* 256-bit keys apply an extra S-box step in mid-group. */
            next[4] = aes_sub_word(next[3]) ^ prev[4];
            next[5] = next[4] ^ prev[5];
            next[6] = next[5] ^ prev[6];
            next[7] = next[6] ^ prev[7];
        }
#endif /* !MBEDTLS_AES_ONLY_128_BIT_KEY_LENGTH */
    }
}
386
387/*
388 * Key expansion, wrapper
389 */
390int mbedtls_aesce_setkey_enc(unsigned char *rk,
391 const unsigned char *key,
392 size_t bits)
393{
394 switch (bits) {
395 case 128:
396 case 192:
397 case 256:
398 aesce_setkey_enc(rk, key, bits);
399 break;
400 default:
401 return MBEDTLS_ERR_AES_INVALID_KEY_LENGTH;
402 }
403
404 return 0;
405}
406
407#if defined(MBEDTLS_GCM_C)
408
409#if defined(MBEDTLS_ARCH_IS_ARM32)
410
#if defined(__clang__)
/* Clang targeting A32/T32 lacks some of the polynomial intrinsics and types
 * listed in
 * [ACLE](https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#polynomial-1).
 * Provide substitutes here. They are only needed for GCM.
 */
#define vreinterpretq_u64_p64(a) ((uint64x2_t) a)

typedef uint8x16_t poly128_t;

/* 64x64 -> 128-bit polynomial multiply, issued directly as `vmull.p64`. */
static inline poly128_t vmull_p64(poly64_t a, poly64_t b)
{
    poly128_t product;
    asm ("vmull.p64 %[out], %[lhs], %[rhs]"
         : [out] "=w" (product)
         : [lhs] "w" (a), [rhs] "w" (b)
         :);
    return product;
}

/* Setting this causes further missing intrinsics to be defined below */
#define COMMON_MISSING_INTRINSICS

/* Polynomial multiply of the high 64-bit halves of `a` and `b`. */
static inline poly128_t vmull_high_p64(poly64x2_t a, poly64x2_t b)
{
    const poly64_t a_high = (poly64_t) (vget_high_u64((uint64x2_t) a));
    const poly64_t b_high = (poly64_t) (vget_high_u64((uint64x2_t) b));

    return vmull_p64(a_high, b_high);
}

#endif /* defined(__clang__) */
437
438static inline uint8x16_t vrbitq_u8(uint8x16_t x)
439{
440 /* There is no vrbitq_u8 instruction in A32/T32, so provide
441 * an equivalent non-Neon implementation. Reverse bit order in each
442 * byte with 4x rbit, rev. */
443 asm ("ldm %[p], { r2-r5 } \n\t"
444 "rbit r2, r2 \n\t"
445 "rev r2, r2 \n\t"
446 "rbit r3, r3 \n\t"
447 "rev r3, r3 \n\t"
448 "rbit r4, r4 \n\t"
449 "rev r4, r4 \n\t"
450 "rbit r5, r5 \n\t"
451 "rev r5, r5 \n\t"
452 "stm %[p], { r2-r5 } \n\t"
453 :
454 /* Output: 16 bytes of memory pointed to by &x */
455 "+m" (*(uint8_t(*)[16]) &x)
456 :
457 [p] "r" (&x)
458 :
459 "r2", "r3", "r4", "r5"
460 );
461 return x;
462}
463
464#endif /* defined(MBEDTLS_ARCH_IS_ARM32) */
465
466#if defined(MBEDTLS_COMPILER_IS_GCC) && __GNUC__ == 5
467/* Some intrinsics are not available for GCC 5.X. */
468#define COMMON_MISSING_INTRINSICS
469#endif /* MBEDTLS_COMPILER_IS_GCC && __GNUC__ == 5 */
470
471
#if defined(COMMON_MISSING_INTRINSICS)

/* Substitutes for intrinsics missing from both GCC 5 and Clang on 32-bit */

#define vreinterpretq_p64_u8(a)  ((poly64x2_t) a)
#define vreinterpretq_u8_p128(a) ((uint8x16_t) a)

/* Low 64-bit half of a 128-bit polynomial vector, obtained through the
 * equivalent unsigned-integer intrinsic. */
static inline poly64x1_t vget_low_p64(poly64x2_t a)
{
    return (poly64x1_t) vget_low_u64(vreinterpretq_u64_p64(a));
}

#endif /* COMMON_MISSING_INTRINSICS */
487
488/* vmull_p64/vmull_high_p64 wrappers.
489 *
490 * Older compilers miss some intrinsic functions for `poly*_t`. We use
491 * uint8x16_t and uint8x16x3_t as input/output parameters.
492 */
493#if defined(MBEDTLS_COMPILER_IS_GCC)
/* GCC reports an incompatible-type error without the cast: GCC treats
 * poly64_t and poly64x1_t as different types, unlike MSVC and Clang. */
496#define MBEDTLS_VMULL_P64(a, b) vmull_p64((poly64_t) a, (poly64_t) b)
497#else
/* MSVC reports `error C2440: 'type cast'` with the cast. Clang reports no
 * error with or without the cast, which suggests that poly64_t and
 * poly64x1_t are the same type there, so no cast is used for Clang either. */
501#define MBEDTLS_VMULL_P64(a, b) vmull_p64(a, b)
502#endif /* MBEDTLS_COMPILER_IS_GCC */
503
504static inline uint8x16_t pmull_low(uint8x16_t a, uint8x16_t b)
505{
506
507 return vreinterpretq_u8_p128(
508 MBEDTLS_VMULL_P64(
509 (poly64_t) vget_low_p64(vreinterpretq_p64_u8(a)),
510 (poly64_t) vget_low_p64(vreinterpretq_p64_u8(b))
511 ));
512}
513
514static inline uint8x16_t pmull_high(uint8x16_t a, uint8x16_t b)
515{
516 return vreinterpretq_u8_p128(
517 vmull_high_p64(vreinterpretq_p64_u8(a),
518 vreinterpretq_p64_u8(b)));
519}
520
521/* GHASH does 128b polynomial multiplication on block in GF(2^128) defined by
522 * `x^128 + x^7 + x^2 + x + 1`.
523 *
524 * Arm64 only has 64b->128b polynomial multipliers, we need to do 4 64b
525 * multiplies to generate a 128b.
526 *
527 * `poly_mult_128` executes polynomial multiplication and outputs 256b that
528 * represented by 3 128b due to code size optimization.
529 *
530 * Output layout:
531 * | | | |
532 * |------------|-------------|-------------|
533 * | ret.val[0] | h3:h2:00:00 | high 128b |
534 * | ret.val[1] | :m2:m1:00 | middle 128b |
535 * | ret.val[2] | : :l1:l0 | low 128b |
536 */
537static inline uint8x16x3_t poly_mult_128(uint8x16_t a, uint8x16_t b)
538{
539 uint8x16x3_t ret;
540 uint8x16_t h, m, l; /* retval high/middle/low */
541 uint8x16_t c, d, e;
542
543 h = pmull_high(a, b); /* h3:h2:00:00 = a1*b1 */
544 l = pmull_low(a, b); /* : :l1:l0 = a0*b0 */
545 c = vextq_u8(b, b, 8); /* :c1:c0 = b0:b1 */
546 d = pmull_high(a, c); /* :d2:d1:00 = a1*b0 */
547 e = pmull_low(a, c); /* :e2:e1:00 = a0*b1 */
548 m = veorq_u8(d, e); /* :m2:m1:00 = d + e */
549
550 ret.val[0] = h;
551 ret.val[1] = m;
552 ret.val[2] = l;
553 return ret;
554}
555
556/*
557 * Modulo reduction.
558 *
559 * See: https://www.researchgate.net/publication/285612706_Implementing_GCM_on_ARMv8
560 *
561 * Section 4.3
562 *
563 * Modular reduction is slightly more complex. Write the GCM modulus as f(z) =
564 * z^128 +r(z), where r(z) = z^7+z^2+z+ 1. The well known approach is to
565 * consider that z^128 ≡r(z) (mod z^128 +r(z)), allowing us to write the 256-bit
566 * operand to be reduced as a(z) = h(z)z^128 +l(z)≡h(z)r(z) + l(z). That is, we
567 * simply multiply the higher part of the operand by r(z) and add it to l(z). If
568 * the result is still larger than 128 bits, we reduce again.
569 */
570static inline uint8x16_t poly_mult_reduce(uint8x16x3_t input)
571{
572 uint8x16_t const ZERO = vdupq_n_u8(0);
573
574 uint64x2_t r = vreinterpretq_u64_u8(vdupq_n_u8(0x87));
575#if defined(__GNUC__)
576 /* use 'asm' as an optimisation barrier to prevent loading MODULO from
577 * memory. It is for GNUC compatible compilers.
578 */
579 asm volatile ("" : "+w" (r));
580#endif
581 uint8x16_t const MODULO = vreinterpretq_u8_u64(vshrq_n_u64(r, 64 - 8));
582 uint8x16_t h, m, l; /* input high/middle/low 128b */
583 uint8x16_t c, d, e, f, g, n, o;
584 h = input.val[0]; /* h3:h2:00:00 */
585 m = input.val[1]; /* :m2:m1:00 */
586 l = input.val[2]; /* : :l1:l0 */
587 c = pmull_high(h, MODULO); /* :c2:c1:00 = reduction of h3 */
588 d = pmull_low(h, MODULO); /* : :d1:d0 = reduction of h2 */
589 e = veorq_u8(c, m); /* :e2:e1:00 = m2:m1:00 + c2:c1:00 */
590 f = pmull_high(e, MODULO); /* : :f1:f0 = reduction of e2 */
591 g = vextq_u8(ZERO, e, 8); /* : :g1:00 = e1:00 */
592 n = veorq_u8(d, l); /* : :n1:n0 = d1:d0 + l1:l0 */
593 o = veorq_u8(n, f); /* o1:o0 = f1:f0 + n1:n0 */
594 return veorq_u8(o, g); /* = o1:o0 + g1:00 */
595}
596
597/*
598 * GCM multiplication: c = a times b in GF(2^128)
599 */
600void mbedtls_aesce_gcm_mult(unsigned char c[16],
601 const unsigned char a[16],
602 const unsigned char b[16])
603{
604 uint8x16_t va, vb, vc;
605 va = vrbitq_u8(vld1q_u8(&a[0]));
606 vb = vrbitq_u8(vld1q_u8(&b[0]));
607 vc = vrbitq_u8(poly_mult_reduce(poly_mult_128(va, vb)));
608 vst1q_u8(&c[0], vc);
609}
610
611#endif /* MBEDTLS_GCM_C */
612
613#if defined(MBEDTLS_POP_TARGET_PRAGMA)
614#if defined(__clang__)
615#pragma clang attribute pop
616#elif defined(__GNUC__)
617#pragma GCC pop_options
618#endif
619#undef MBEDTLS_POP_TARGET_PRAGMA
620#endif
621
622#endif /* MBEDTLS_AESCE_HAVE_CODE */
623
624#endif /* MBEDTLS_AESCE_C */
625