/*	$NetBSD: aes_armv8_64.S,v 1.15 2020/09/08 23:58:09 riastradh Exp $	*/

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <aarch64/asm.h>

RCSID("$NetBSD: aes_armv8_64.S,v 1.15 2020/09/08 23:58:09 riastradh Exp $")

	.arch_extension	aes

/*
 * uint32_t rcon[10]
 *
 *	Table mapping n ---> x^n mod (x^8 + x^4 + x^3 + x + 1) in GF(2).
 *	Such elements of GF(2^8) need only eight bits to be represented,
 *	but we store them in 4-byte units so we can copy one into all
 *	four 4-byte lanes of a vector register with a single LD1R.  The
 *	access pattern is fixed, so indices into this table are never
 *	secret.
 */
	.section .rodata
	.p2align 2
	.type	rcon,@object
rcon:
	.long	0x01
	.long	0x02
	.long	0x04
	.long	0x08
	.long	0x10
	.long	0x20
	.long	0x40
	.long	0x80
	.long	0x1b
	.long	0x36
END(rcon)

/*
 * uint128_t unshiftrows_rotword_1
 *
 *	Table for TBL instruction to undo ShiftRows, and then do
 *	RotWord on word 1, and then copy it into all the other words.
 */
	.section .rodata
	.p2align 4
	.type	unshiftrows_rotword_1,@object
unshiftrows_rotword_1:
	.byte	0x01,0x0e,0x0b,0x04
	.byte	0x01,0x0e,0x0b,0x04
	.byte	0x01,0x0e,0x0b,0x04
	.byte	0x01,0x0e,0x0b,0x04
END(unshiftrows_rotword_1)

/*
 * uint128_t unshiftrows_3
 *
 *	Table for TBL instruction to undo ShiftRows, and then copy word
 *	3 into all the other words.
 */
	.section .rodata
	.p2align 4
	.type	unshiftrows_3,@object
unshiftrows_3:
	.byte	0x0c,0x09,0x06,0x03
	.byte	0x0c,0x09,0x06,0x03
	.byte	0x0c,0x09,0x06,0x03
	.byte	0x0c,0x09,0x06,0x03
END(unshiftrows_3)

/*
 * uint128_t unshiftrows_rotword_3
 *
 *	Table for TBL instruction to undo ShiftRows, and then do
 *	RotWord on word 3, and then copy it into all the other words.
 */
	.section .rodata
	.p2align 4
	.type	unshiftrows_rotword_3,@object
unshiftrows_rotword_3:
	.byte	0x09,0x06,0x03,0x0c
	.byte	0x09,0x06,0x03,0x0c
	.byte	0x09,0x06,0x03,0x0c
	.byte	0x09,0x06,0x03,0x0c
END(unshiftrows_rotword_3)

/*
 * aesarmv8_setenckey128(struct aesenc *enckey@x0, const uint8_t key[16] @x1)
 *
 *	Expand a 16-byte AES-128 key into 10 round keys.
 *
 *	Standard ABI calling convention.
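 *
 *	For reference, a C sketch of the word-oriented recurrence this
 *	routine implements (illustrative only, not part of the build;
 *	sub_word/rot_word stand for the usual AES SubWord/RotWord
 *	helpers and are assumed here, not defined):
 *
 *		uint32_t rk[4*(10 + 1)];
 *		memcpy(rk, key, 16);
 *		for (unsigned i = 4; i < 4*(10 + 1); i++) {
 *			uint32_t t = rk[i - 1];
 *			if (i % 4 == 0)
 *				t = sub_word(rot_word(t)) ^ rcon[i/4 - 1];
 *			rk[i] = rk[i - 4] ^ t;
 *		}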
 */
ENTRY(aesarmv8_setenckey128)
	ld1	{v1.16b}, [x1]	/* q1 := master key */

	adrl	x4, unshiftrows_rotword_3
	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
	ld1	{v16.16b}, [x4]	/* q16 := unshiftrows_rotword_3 table */

	str	q1, [x0], #0x10	/* store master key as first round key */
	mov	x2, #10		/* round count */
	adrl	x3, rcon	/* round constant */

1:	/*
	 * q0 = 0
	 * v1.4s = (prk[0], prk[1], prk[2], prk[3])
	 * x0 = pointer to round key to compute
	 * x2 = round count
	 * x3 = rcon pointer
	 */

	/* q3 := ShiftRows(SubBytes(q1)) */
	mov	v3.16b, v1.16b
	aese	v3.16b, v0.16b

	/* v3.4s[i] := RotWords(SubBytes(prk[3])) ^ RCON */
	ld1r	{v4.4s}, [x3], #4
	tbl	v3.16b, {v3.16b}, v16.16b
	eor	v3.16b, v3.16b, v4.16b

	/*
	 * v5.4s := (0,prk[0],prk[1],prk[2])
	 * v6.4s := (0,0,prk[0],prk[1])
	 * v7.4s := (0,0,0,prk[0])
	 */
	ext	v5.16b, v0.16b, v1.16b, #12
	ext	v6.16b, v0.16b, v1.16b, #8
	ext	v7.16b, v0.16b, v1.16b, #4

	/* v1.4s := (rk[0], rk[1], rk[2], rk[3]) */
	eor	v1.16b, v1.16b, v3.16b
	eor	v1.16b, v1.16b, v5.16b
	eor	v1.16b, v1.16b, v6.16b
	eor	v1.16b, v1.16b, v7.16b

	subs	x2, x2, #1	/* count down rounds */
	str	q1, [x0], #0x10	/* store round key */
	b.ne	1b

	ret
END(aesarmv8_setenckey128)

/*
 * aesarmv8_setenckey192(struct aesenc *enckey@x0, const uint8_t key[24] @x1)
 *
 *	Expand a 24-byte AES-192 key into 12 round keys.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_setenckey192)
	ld1	{v1.16b}, [x1], #0x10	/* q1 := master key[0:128) */
	ld1	{v2.8b}, [x1]		/* d2 := master key[128:192) */

	adrl	x4, unshiftrows_rotword_1
	adrl	x5, unshiftrows_rotword_3
	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
	ld1	{v16.16b}, [x4]	/* q16 := unshiftrows_rotword_1 */
	ld1	{v17.16b}, [x5]	/* q17 := unshiftrows_rotword_3 */

	str	q1, [x0], #0x10	/* store master key[0:128) as round key */
	mov	x2, #12		/* round count */
	adrl	x3, rcon	/* round constant */

1:	/*
	 * q0 = 0
	 * v1.4s = (prk[0], prk[1], prk[2], prk[3])
	 * v2.4s = (rklo[0], rklo[1], xxx, xxx)
	 * x0 = pointer to three round keys to compute
	 * x2 = round count
	 * x3 = rcon pointer
	 */

	/* q3 := ShiftRows(SubBytes(q2)) */
	mov	v3.16b, v2.16b
	aese	v3.16b, v0.16b

	/* v3.4s[i] := RotWords(SubBytes(rklo[1])) ^ RCON */
	ld1r	{v4.4s}, [x3], #4
	tbl	v3.16b, {v3.16b}, v16.16b
	eor	v3.16b, v3.16b, v4.16b

	/*
	 * We need to compute:
	 *
	 * rk[0] := rklo[0]
	 * rk[1] := rklo[1]
	 * rk[2] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0]
	 * rk[3] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ prk[1]
	 * nrk[0] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ prk[1] ^ prk[2]
	 * nrk[1] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3]
	 * nrk[2] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] ^ rklo[0]
	 * nrk[3] := Rot(Sub(rklo[1])) ^ RCON ^ prk[0] ^ ... ^ prk[3] ^ rklo[0]
	 *     ^ rklo[1]
	 */

	/*
	 * v5.4s := (0,prk[0],prk[1],prk[2])
	 * v6.4s := (0,0,prk[0],prk[1])
	 * v7.4s := (0,0,0,prk[0])
	 */
	ext	v5.16b, v0.16b, v1.16b, #12
	ext	v6.16b, v0.16b, v1.16b, #8
	ext	v7.16b, v0.16b, v1.16b, #4

	/* v5.4s := (rk[2], rk[3], nrk[0], nrk[1]) */
	eor	v5.16b, v5.16b, v1.16b
	eor	v5.16b, v5.16b, v3.16b
	eor	v5.16b, v5.16b, v6.16b
	eor	v5.16b, v5.16b, v7.16b

	/*
	 * At this point, rk is split across v2.4s = (rk[0],rk[1],...)
	 * and v5.4s = (rk[2],rk[3],...); nrk is in v5.4s =
	 * (...,nrk[0],nrk[1]); and we have yet to compute nrk[2] or
	 * nrk[3], which requires rklo[0] and rklo[1] in v2.4s =
	 * (rklo[0],rklo[1],...).
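	 *
	 * For reference, the flat-array view of the AES-192 schedule
	 * that this register shuffling implements (illustrative C only,
	 * not part of the build; sub_word/rot_word as in the AES-128
	 * sketch above):
	 *
	 *	uint32_t rk[4*(12 + 1)];
	 *	memcpy(rk, key, 24);
	 *	for (unsigned i = 6; i < 4*(12 + 1); i++) {
	 *		uint32_t t = rk[i - 1];
	 *		if (i % 6 == 0)
	 *			t = sub_word(rot_word(t)) ^ rcon[i/6 - 1];
	 *		rk[i] = rk[i - 6] ^ t;
	 *	}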
	 */

	/* v1.4s := (nrk[0], nrk[1], nrk[1], nrk[1]) */
	dup	v1.4s, v5.s[3]
	mov	v1.s[0], v5.s[2]

	/*
	 * v6.4s := (0, 0, rklo[0], rklo[1])
	 * v7.4s := (0, 0, 0, rklo[0])
	 */
	ext	v6.16b, v0.16b, v2.16b, #8
	ext	v7.16b, v0.16b, v2.16b, #4

	/* v3.4s := (nrk[0], nrk[1], nrk[2], nrk[3]) */
	eor	v3.16b, v1.16b, v6.16b
	eor	v3.16b, v3.16b, v7.16b

	/*
	 * Recall v2.4s = (rk[0], rk[1], xxx, xxx)
	 * and v5.4s = (rk[2], rk[3], xxx, xxx).  Set
	 * v2.4s := (rk[0], rk[1], rk[2], rk[3])
	 */
	mov	v2.d[1], v5.d[0]

	/* store two round keys */
	stp	q2, q3, [x0], #0x20

	/*
	 * Live vector registers at this point:
	 *
	 *	q0 = zero
	 *	q2 = rk
	 *	q3 = nrk
	 *	v5.4s = (rk[2], rk[3], nrk[0], nrk[1])
	 *	q16 = unshiftrows_rotword_1
	 *	q17 = unshiftrows_rotword_3
	 *
	 * We have to compute, in q1:
	 *
	 * nnrk[0] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2]
	 * nnrk[1] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3]
	 * nnrk[2] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
	 * nnrk[3] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
	 *     ^ nrk[1]
	 *
	 * And, if there's any more afterward, in q2:
	 *
	 * nnnrklo[0] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
	 *     ^ nrk[1] ^ nrk[2]
	 * nnnrklo[1] := Rot(Sub(nrk[3])) ^ RCON' ^ rk[2] ^ rk[3] ^ nrk[0]
	 *     ^ nrk[1] ^ nrk[2] ^ nrk[3]
	 */

	/* q1 := RotWords(SubBytes(q3)) */
	mov	v1.16b, v3.16b
	aese	v1.16b, v0.16b

	/* v1.4s[i] := RotWords(SubBytes(nrk[3])) ^ RCON' */
	ld1r	{v4.4s}, [x3], #4
	tbl	v1.16b, {v1.16b}, v17.16b
	eor	v1.16b, v1.16b, v4.16b

	/*
	 * v5.4s := (rk[2], rk[3], nrk[0], nrk[1])	[already]
	 * v4.4s := (0, rk[2], rk[3], nrk[0])
	 * v6.4s := (0, 0, rk[2], rk[3])
	 * v7.4s := (0, 0, 0, rk[2])
	 */
	ext	v4.16b, v0.16b, v5.16b, #12
	ext	v6.16b, v0.16b, v5.16b, #8
	ext	v7.16b, v0.16b, v5.16b, #4

	/* v1.4s := (nnrk[0], nnrk[1], nnrk[2], nnrk[3]) */
	eor	v1.16b, v1.16b, v5.16b
	eor	v1.16b, v1.16b, v4.16b
	eor	v1.16b, v1.16b, v6.16b
	eor	v1.16b, v1.16b, v7.16b

	subs	x2, x2, #3	/* count down three rounds */
	str	q1, [x0], #0x10	/* store third round key */
	b.eq	2f

	/*
	 * v4.4s := (nrk[2], nrk[3], xxx, xxx)
	 * v5.4s := (0, nrk[2], xxx, xxx)
	 */
	ext	v4.16b, v3.16b, v0.16b, #8
	ext	v5.16b, v0.16b, v4.16b, #12

	/* v2.4s := (nnrk[3], nnrk[3], xxx, xxx) */
	dup	v2.4s, v1.s[3]

	/*
	 * v2.4s := (nnnrklo[0] = nnrk[3] ^ nrk[2],
	 *     nnnrklo[1] = nnrk[3] ^ nrk[2] ^ nrk[3],
	 *     xxx, xxx)
	 */
	eor	v2.16b, v2.16b, v4.16b
	eor	v2.16b, v2.16b, v5.16b

	b	1b

2:	ret
END(aesarmv8_setenckey192)

/*
 * aesarmv8_setenckey256(struct aesenc *enckey@x0, const uint8_t key[32] @x1)
 *
 *	Expand a 32-byte AES-256 key into 14 round keys.
 *
 *	Standard ABI calling convention.
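 *
 *	For reference, a C sketch of the AES-256 schedule computed
 *	below (illustrative only, not part of the build; sub_word/
 *	rot_word as above; words at i % 8 == 4 get SubWord without
 *	RotWord, which is what the unshiftrows_3 table implements):
 *
 *		uint32_t rk[4*(14 + 1)];
 *		memcpy(rk, key, 32);
 *		for (unsigned i = 8; i < 4*(14 + 1); i++) {
 *			uint32_t t = rk[i - 1];
 *			if (i % 8 == 0)
 *				t = sub_word(rot_word(t)) ^ rcon[i/8 - 1];
 *			else if (i % 8 == 4)
 *				t = sub_word(t);
 *			rk[i] = rk[i - 8] ^ t;
 *		}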
 */
ENTRY(aesarmv8_setenckey256)
	/* q1 := key[0:128), q2 := key[128:256) */
	ld1	{v1.16b-v2.16b}, [x1], #0x20

	adrl	x4, unshiftrows_rotword_3
	adrl	x5, unshiftrows_3
	eor	v0.16b, v0.16b, v0.16b	/* q0 := 0 */
	ld1	{v16.16b}, [x4]	/* q16 := unshiftrows_rotword_3 */
	ld1	{v17.16b}, [x5]	/* q17 := unshiftrows_3 */

	/* store master key as first two round keys */
	stp	q1, q2, [x0], #0x20
	mov	x2, #14		/* round count */
	adrl	x3, rcon	/* round constant */

1:	/*
	 * q0 = 0
	 * v1.4s = (pprk[0], pprk[1], pprk[2], pprk[3])
	 * v2.4s = (prk[0], prk[1], prk[2], prk[3])
	 * x2 = round count
	 * x3 = rcon pointer
	 */

	/* q3 := ShiftRows(SubBytes(q2)) */
	mov	v3.16b, v2.16b
	aese	v3.16b, v0.16b

	/* v3.4s[i] := RotWords(SubBytes(prk[3])) ^ RCON */
	ld1r	{v4.4s}, [x3], #4
	tbl	v3.16b, {v3.16b}, v16.16b
	eor	v3.16b, v3.16b, v4.16b

	/*
	 * v5.4s := (0,pprk[0],pprk[1],pprk[2])
	 * v6.4s := (0,0,pprk[0],pprk[1])
	 * v7.4s := (0,0,0,pprk[0])
	 */
	ext	v5.16b, v0.16b, v1.16b, #12
	ext	v6.16b, v0.16b, v1.16b, #8
	ext	v7.16b, v0.16b, v1.16b, #4

	/* v1.4s := (rk[0], rk[1], rk[2], rk[3]) */
	eor	v1.16b, v1.16b, v3.16b
	eor	v1.16b, v1.16b, v5.16b
	eor	v1.16b, v1.16b, v6.16b
	eor	v1.16b, v1.16b, v7.16b

	subs	x2, x2, #2	/* count down two rounds */
	b.eq	2f		/* stop if this is the last one */

	/* q3 := ShiftRows(SubBytes(q1)) */
	mov	v3.16b, v1.16b
	aese	v3.16b, v0.16b

	/* v3.4s[i] := SubBytes(rk[3]) */
	tbl	v3.16b, {v3.16b}, v17.16b

	/*
	 * v5.4s := (0,prk[0],prk[1],prk[2])
	 * v6.4s := (0,0,prk[0],prk[1])
	 * v7.4s := (0,0,0,prk[0])
	 */
	ext	v5.16b, v0.16b, v2.16b, #12
	ext	v6.16b, v0.16b, v2.16b, #8
	ext	v7.16b, v0.16b, v2.16b, #4

	/* v2.4s := (nrk[0], nrk[1], nrk[2], nrk[3]) */
	eor	v2.16b, v2.16b, v3.16b
	eor	v2.16b, v2.16b, v5.16b
	eor	v2.16b, v2.16b, v6.16b
	eor	v2.16b, v2.16b, v7.16b

	stp	q1, q2, [x0], #0x20	/* store two round keys */
	b	1b

2:	str	q1, [x0]	/* store last round key */
	ret
END(aesarmv8_setenckey256)

/*
 * aesarmv8_enctodec(const struct aesenc *enckey@x0, struct aesdec *deckey@x1,
 *	uint32_t nrounds@x2)
 *
 *	Convert AES encryption round keys to AES decryption round keys.
 *	`nrounds' must be between 10 and 14.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_enctodec)
	ldr	q0, [x0, x2, lsl #4]	/* load last round key */
	b	2f
	_ALIGN_TEXT
1:	aesimc	v0.16b, v0.16b	/* convert encryption to decryption */
2:	str	q0, [x1], #0x10	/* store round key */
	subs	x2, x2, #1	/* count down round */
	ldr	q0, [x0, x2, lsl #4]	/* load previous round key */
	b.ne	1b		/* repeat if there's more */
	str	q0, [x1]	/* store first round key verbatim */
	ret
END(aesarmv8_enctodec)

/*
 * aesarmv8_enc(const struct aesenc *enckey@x0, const uint8_t in[16] @x1,
 *	uint8_t out[16] @x2, uint32_t nrounds@x3)
 *
 *	Encrypt a single block.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_enc)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ld1	{v0.16b}, [x1]	/* q0 := ptxt */
	bl	aesarmv8_enc1	/* q0 := ctxt; trash x0/x3/q16 */
	st1	{v0.16b}, [x2]	/* store ctxt */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_enc)

/*
 * aesarmv8_dec(const struct aesdec *deckey@x0, const uint8_t in[16] @x1,
 *	uint8_t out[16] @x2, uint32_t nrounds@x3)
 *
 *	Decrypt a single block.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_dec)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ld1	{v0.16b}, [x1]	/* q0 := ctxt */
	bl	aesarmv8_dec1	/* q0 := ptxt; trash x0/x3/q16 */
	st1	{v0.16b}, [x2]	/* store ptxt */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_dec)

/*
 * aesarmv8_cbc_enc(const struct aesenc *enckey@x0, const uint8_t *in@x1,
 *	uint8_t *out@x2, size_t nbytes@x3, uint8_t iv[16] @x4,
 *	uint32_t nrounds@x5)
 *
 *	Encrypt a contiguous sequence of blocks with AES-CBC.
 *
 *	nbytes must be an integral multiple of 16.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_cbc_enc)
	cbz	x3, 2f		/* stop if nothing to do */
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	mov	x9, x0		/* x9 := enckey */
	mov	x10, x3		/* x10 := nbytes */
	ld1	{v0.16b}, [x4]	/* q0 := chaining value */
	_ALIGN_TEXT
1:	ld1	{v1.16b}, [x1], #0x10	/* q1 := plaintext block */
	eor	v0.16b, v0.16b, v1.16b	/* q0 := cv ^ ptxt */
	mov	x0, x9		/* x0 := enckey */
	mov	x3, x5		/* x3 := nrounds */
	bl	aesarmv8_enc1	/* q0 := ctxt; trash x0/x3/q16 */
	subs	x10, x10, #0x10	/* count down nbytes */
	st1	{v0.16b}, [x2], #0x10	/* store ciphertext block */
	b.ne	1b		/* repeat if x10 is nonzero */
	st1	{v0.16b}, [x4]	/* store chaining value */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
2:	ret
END(aesarmv8_cbc_enc)

/*
 * aesarmv8_cbc_dec1(const struct aesdec *deckey@x0, const uint8_t *in@x1,
 *	uint8_t *out@x2, size_t nbytes@x3, const uint8_t iv[16] @x4,
 *	uint32_t nrounds@x5)
 *
 *	Decrypt a contiguous sequence of blocks with AES-CBC.
 *
 *	nbytes must be a positive integral multiple of 16.  This routine
 *	is not vectorized; use aesarmv8_cbc_dec8 for >=8 blocks at once.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_cbc_dec1)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ld1	{v24.16b}, [x4]	/* q24 := iv */
	mov	x9, x0		/* x9 := deckey */
	mov	x10, x3		/* x10 := nbytes */
	add	x1, x1, x3	/* x1 := pointer past end of in */
	add	x2, x2, x3	/* x2 := pointer past end of out */
	sub	x1, x1, #0x10
	ld1	{v0.16b}, [x1]	/* q0 := last ciphertext block */
	st1	{v0.16b}, [x4]	/* update iv */
	b	2f
	_ALIGN_TEXT
1:	sub	x1, x1, #0x10
	ld1	{v31.16b}, [x1]	/* q31 := chaining value */
	sub	x2, x2, #0x10
	eor	v0.16b, v0.16b, v31.16b	/* q0 := plaintext block */
	st1	{v0.16b}, [x2]	/* store plaintext block */
	mov	v0.16b, v31.16b	/* move cv = ciphertext block */
2:	mov	x0, x9		/* x0 := deckey */
	mov	x3, x5		/* x3 := nrounds */
	bl	aesarmv8_dec1	/* q0 := cv ^ ptxt; trash x0/x3/q16 */
	subs	x10, x10, #0x10	/* count down nbytes */
	b.ne	1b		/* repeat if more blocks */
	eor	v0.16b, v0.16b, v24.16b	/* q0 := first plaintext block */
	sub	x2, x2, #0x10	/* store first plaintext block */
	st1	{v0.16b}, [x2]
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_cbc_dec1)

/*
 * aesarmv8_cbc_dec8(const struct aesdec *deckey@x0, const uint8_t *in@x1,
 *	uint8_t *out@x2, size_t nbytes@x3, const uint8_t iv[16] @x4,
 *	uint32_t nrounds@x5)
 *
 *	Decrypt a contiguous sequence of 8-block units with AES-CBC.
 *
 *	nbytes must be a positive integral multiple of 128.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_cbc_dec8)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ld1	{v24.16b}, [x4]	/* q24 := iv */
	mov	x9, x0		/* x9 := deckey */
	mov	x10, x3		/* x10 := nbytes */
	add	x1, x1, x3	/* x1 := pointer past end of in */
	add	x2, x2, x3	/* x2 := pointer past end of out */
	sub	x1, x1, #0x20
	ld1	{v6.16b, v7.16b}, [x1]	/* q6, q7 := last ciphertext blocks */
	st1	{v7.16b}, [x4]	/* update iv */
	b	2f
	_ALIGN_TEXT
1:	sub	x1, x1, #0x20
	ld1	{v6.16b, v7.16b}, [x1]	/* q6, q7 := last ciphertext blocks */
	eor	v0.16b, v0.16b, v7.16b	/* q0 := pt0 */
	sub	x2, x2, #0x20
	st1	{v0.16b, v1.16b}, [x2]	/* store first two plaintext blocks */
2:	sub	x1, x1, #0x20
	ld1	{v4.16b-v5.16b}, [x1]	/* q4, q5 := ciphertext blocks 4, 5 */
	sub	x1, x1, #0x40
	ld1	{v0.16b-v3.16b}, [x1]	/* q0-q3 := ciphertext blocks 0-3 */

	mov	v31.16b, v6.16b		/* q[24+i] := cv[i], 0<i<8 */
	mov	v30.16b, v5.16b
	mov	v29.16b, v4.16b
	mov	v28.16b, v3.16b
	mov	v27.16b, v2.16b
	mov	v26.16b, v1.16b
	mov	v25.16b, v0.16b

	mov	x0, x9		/* x0 := deckey */
	mov	x3, x5		/* x3 := nrounds */
	bl	aesarmv8_dec8	/* q[i] := cv[i] ^ pt[i]; trash x0/x3/q16 */

	eor	v7.16b, v7.16b, v31.16b	/* q[i] := pt[i], 0<i<8 */
	eor	v6.16b, v6.16b, v30.16b
	eor	v5.16b, v5.16b, v29.16b
	eor	v4.16b, v4.16b, v28.16b
	eor	v3.16b, v3.16b, v27.16b
	eor	v2.16b, v2.16b, v26.16b
	eor	v1.16b, v1.16b, v25.16b

	subs	x10, x10, #0x80	/* count down nbytes */
	sub	x2, x2, #0x20	/* store plaintext blocks 6, 7 */
	st1	{v6.16b-v7.16b}, [x2]
	sub	x2, x2, #0x40	/* store plaintext blocks 2-5 */
	st1	{v2.16b-v5.16b}, [x2]
	b.ne	1b		/* repeat if there's more */

	eor	v0.16b, v0.16b, v24.16b	/* q0 := first plaintext block */
	sub	x2, x2, #0x20	/* store first two plaintext blocks */
	st1	{v0.16b, v1.16b}, [x2]

	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_cbc_dec8)

/*
 * aesarmv8_xts_enc1(const struct aesenc *enckey@x0, const uint8_t *in@x1,
 *	uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
 *	uint32_t nrounds@x5)
 *
 *	Encrypt a contiguous sequence of blocks with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 16.  This routine
 *	is not vectorized; use aesarmv8_xts_enc8 for >=8 blocks at once.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_xts_enc1)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	mov	x9, x0		/* x9 := enckey */
	mov	x10, x3		/* x10 := nbytes */
	ld1	{v31.16b}, [x4]	/* q31 := tweak */
	_ALIGN_TEXT
1:	ld1	{v0.16b}, [x1], #0x10	/* q0 := ptxt */
	mov	x0, x9		/* x0 := enckey */
	mov	x3, x5		/* x3 := nrounds */
	eor	v0.16b, v0.16b, v31.16b	/* q0 := ptxt ^ tweak */
	bl	aesarmv8_enc1	/* q0 := AES(...); trash x0/x3/q16 */
	eor	v0.16b, v0.16b, v31.16b	/* q0 := AES(ptxt ^ tweak) ^ tweak */
	st1	{v0.16b}, [x2], #0x10	/* store ciphertext block */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	subs	x10, x10, #0x10	/* count down nbytes */
	b.ne	1b		/* repeat if more blocks */
	st1	{v31.16b}, [x4]	/* update tweak */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_xts_enc1)

/*
 * aesarmv8_xts_enc8(const struct aesenc *enckey@x0, const uint8_t *in@x1,
 *	uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
 *	uint32_t nrounds@x5)
 *
 *	Encrypt a contiguous sequence of blocks with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 128.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_xts_enc8)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	mov	x9, x0		/* x9 := enckey */
	mov	x10, x3		/* x10 := nbytes */
	ld1	{v31.16b}, [x4]	/* q31 := tweak */
	_ALIGN_TEXT
1:	mov	v24.16b, v31.16b	/* q24 := tweak[0] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v25.16b, v31.16b	/* q25 := tweak[1] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v26.16b, v31.16b	/* q26 := tweak[2] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v27.16b, v31.16b	/* q27 := tweak[3] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v28.16b, v31.16b	/* q28 := tweak[4] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v29.16b, v31.16b	/* q29 := tweak[5] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v30.16b, v31.16b	/* q30 := tweak[6] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	/* q31 := tweak[7] */
	ld1	{v0.16b-v3.16b}, [x1], #0x40	/* q[i] := ptxt[i] */
	ld1	{v4.16b-v7.16b}, [x1], #0x40
	eor	v0.16b, v0.16b, v24.16b	/* q[i] := ptxt[i] ^ tweak[i] */
	eor	v1.16b, v1.16b, v25.16b
	eor	v2.16b, v2.16b, v26.16b
	eor	v3.16b, v3.16b, v27.16b
	eor	v4.16b, v4.16b, v28.16b
	eor	v5.16b, v5.16b, v29.16b
	eor	v6.16b, v6.16b, v30.16b
	eor	v7.16b, v7.16b, v31.16b
	mov	x0, x9		/* x0 := enckey */
	mov	x3, x5		/* x3 := nrounds */
	bl	aesarmv8_enc8	/* encrypt q0-q7; trash x0/x3/q16 */
	eor	v0.16b, v0.16b, v24.16b	/* q[i] := AES(...) ^ tweak[i] */
	eor	v1.16b, v1.16b, v25.16b
	eor	v2.16b, v2.16b, v26.16b
	eor	v3.16b, v3.16b, v27.16b
	eor	v4.16b, v4.16b, v28.16b
	eor	v5.16b, v5.16b, v29.16b
	eor	v6.16b, v6.16b, v30.16b
	eor	v7.16b, v7.16b, v31.16b
	st1	{v0.16b-v3.16b}, [x2], #0x40	/* store ciphertext blocks */
	st1	{v4.16b-v7.16b}, [x2], #0x40
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	subs	x10, x10, #0x80	/* count down nbytes */
	b.ne	1b		/* repeat if more block groups */
	st1	{v31.16b}, [x4]	/* update tweak */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_xts_enc8)

/*
 * aesarmv8_xts_dec1(const struct aesdec *deckey@x0, const uint8_t *in@x1,
 *	uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
 *	uint32_t nrounds@x5)
 *
 *	Decrypt a contiguous sequence of blocks with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 16.  This routine
 *	is not vectorized; use aesarmv8_xts_dec8 for >=8 blocks at once.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_xts_dec1)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	mov	x9, x0		/* x9 := deckey */
	mov	x10, x3		/* x10 := nbytes */
	ld1	{v31.16b}, [x4]	/* q31 := tweak */
	_ALIGN_TEXT
1:	ld1	{v0.16b}, [x1], #0x10	/* q0 := ctxt */
	mov	x0, x9		/* x0 := deckey */
	mov	x3, x5		/* x3 := nrounds */
	eor	v0.16b, v0.16b, v31.16b	/* q0 := ctxt ^ tweak */
	bl	aesarmv8_dec1	/* q0 := AES(...); trash x0/x3/q16 */
	eor	v0.16b, v0.16b, v31.16b	/* q0 := AES(ctxt ^ tweak) ^ tweak */
	st1	{v0.16b}, [x2], #0x10	/* store plaintext block */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	subs	x10, x10, #0x10	/* count down nbytes */
	b.ne	1b		/* repeat if more blocks */
	st1	{v31.16b}, [x4]	/* update tweak */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_xts_dec1)

/*
 * aesarmv8_xts_dec8(const struct aesdec *deckey@x0, const uint8_t *in@x1,
 *	uint8_t *out@x2, size_t nbytes@x3, uint8_t tweak[16] @x4,
 *	uint32_t nrounds@x5)
 *
 *	Decrypt a contiguous sequence of blocks with AES-XTS.
 *
 *	nbytes must be a positive integral multiple of 128.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_xts_dec8)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	mov	x9, x0		/* x9 := deckey */
	mov	x10, x3		/* x10 := nbytes */
	ld1	{v31.16b}, [x4]	/* q31 := tweak */
	_ALIGN_TEXT
1:	mov	v24.16b, v31.16b	/* q24 := tweak[0] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v25.16b, v31.16b	/* q25 := tweak[1] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v26.16b, v31.16b	/* q26 := tweak[2] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v27.16b, v31.16b	/* q27 := tweak[3] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v28.16b, v31.16b	/* q28 := tweak[4] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v29.16b, v31.16b	/* q29 := tweak[5] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	mov	v30.16b, v31.16b	/* q30 := tweak[6] */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	/* q31 := tweak[7] */
	ld1	{v0.16b-v3.16b}, [x1], #0x40	/* q[i] := ctxt[i] */
	ld1	{v4.16b-v7.16b}, [x1], #0x40
	eor	v0.16b, v0.16b, v24.16b	/* q[i] := ctxt[i] ^ tweak[i] */
	eor	v1.16b, v1.16b, v25.16b
	eor	v2.16b, v2.16b, v26.16b
	eor	v3.16b, v3.16b, v27.16b
	eor	v4.16b, v4.16b, v28.16b
	eor	v5.16b, v5.16b, v29.16b
	eor	v6.16b, v6.16b, v30.16b
	eor	v7.16b, v7.16b, v31.16b
	mov	x0, x9		/* x0 := deckey */
	mov	x3, x5		/* x3 := nrounds */
	bl	aesarmv8_dec8	/* decrypt q0-q7; trash x0/x3/q16 */
	eor	v0.16b, v0.16b, v24.16b	/* q[i] := AES(...) ^ tweak[i] */
	eor	v1.16b, v1.16b, v25.16b
	eor	v2.16b, v2.16b, v26.16b
	eor	v3.16b, v3.16b, v27.16b
	eor	v4.16b, v4.16b, v28.16b
	eor	v5.16b, v5.16b, v29.16b
	eor	v6.16b, v6.16b, v30.16b
	eor	v7.16b, v7.16b, v31.16b
	st1	{v0.16b-v3.16b}, [x2], #0x40	/* store plaintext blocks */
	st1	{v4.16b-v7.16b}, [x2], #0x40
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	subs	x10, x10, #0x80	/* count down nbytes */
	b.ne	1b		/* repeat if more block groups */
	st1	{v31.16b}, [x4]	/* update tweak */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_xts_dec8)

/*
 * aesarmv8_xts_mulx(tweak@q31)
 *
 *	Multiply q31 by x, modulo x^128 + x^7 + x^2 + x + 1, in place.
 *	Uses x0 and q0/q1 as temporaries.
 */
	.text
	_ALIGN_TEXT
	.type	aesarmv8_xts_mulx,@function
aesarmv8_xts_mulx:
	/*
	 * Simultaneously determine
	 * (a) whether the high bit of the low half must be
	 *     shifted into the low bit of the high half, and
	 * (b) whether the high bit of the high half must be
	 *     carried into x^128 = x^7 + x^2 + x + 1.
	 */
	adrl	x0, xtscarry
	cmlt	v1.2d, v31.2d, #0	/* v1.2d[i] := -1 if v31.2d[i] < 0, else 0 */
	ld1	{v0.16b}, [x0]	/* q0 := xtscarry */
	ext	v1.16b, v1.16b, v1.16b, #8	/* swap halves of q1 */
	shl	v31.2d, v31.2d, #1	/* shift */
	and	v0.16b, v0.16b, v1.16b	/* copy xtscarry according to mask */
	eor	v31.16b, v31.16b, v0.16b	/* incorporate (a) and (b) */
	ret
END(aesarmv8_xts_mulx)

	.section .rodata
	.p2align 4
	.type	xtscarry,@object
xtscarry:
	.byte	0x87,0,0,0, 0,0,0,0, 1,0,0,0, 0,0,0,0
END(xtscarry)

/*
 * aesarmv8_xts_update(const uint8_t in[16] @x0, uint8_t out[16] @x1)
 *
 *	Update an AES-XTS tweak.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_xts_update)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ld1	{v31.16b}, [x0]	/* load tweak */
	bl	aesarmv8_xts_mulx	/* q31 *= x; trash x0/q0/q1 */
	st1	{v31.16b}, [x1]	/* store tweak */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_xts_update)

/*
 * aesarmv8_cbcmac_update1(const struct aesenc *enckey@x0,
 *	const uint8_t *in@x1, size_t nbytes@x2, uint8_t auth[16] @x3,
 *	uint32_t nrounds@x4)
 *
 *	Update CBC-MAC.
 *
 *	nbytes must be a positive integral multiple of 16.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_cbcmac_update1)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ld1	{v0.16b}, [x3]	/* q0 := initial authenticator */
	mov	x9, x0		/* x9 := enckey */
	mov	x5, x3		/* x5 := &auth (enc1 trashes x3) */
	_ALIGN_TEXT
1:	ld1	{v1.16b}, [x1], #0x10	/* q1 := plaintext block */
	mov	x0, x9		/* x0 := enckey */
	mov	x3, x4		/* x3 := nrounds */
	eor	v0.16b, v0.16b, v1.16b	/* q0 := auth ^ ptxt */
	bl	aesarmv8_enc1	/* q0 := auth'; trash x0/x3/q16 */
	subs	x2, x2, #0x10	/* count down nbytes */
	b.ne	1b		/* repeat if x2 is nonzero */
	st1	{v0.16b}, [x5]	/* store updated authenticator */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_cbcmac_update1)

/*
 * aesarmv8_ccm_enc1(const struct aesenc *enckey@x0, const uint8_t *in@x1,
 *	uint8_t *out@x2, size_t nbytes@x3, uint8_t authctr[32] @x4,
 *	uint32_t nrounds@x5)
 *
 *	Update CCM encryption.
 *
 *	nbytes must be a positive integral multiple of 16.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_ccm_enc1)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ld1	{v0.16b-v1.16b}, [x4]	/* q0 := auth, q1 := ctr (be) */
	adrl	x11, ctr32_inc	/* x11 := &ctr32_inc */
	ld1	{v5.4s}, [x11]	/* q5 := (0,0,0,1) (host-endian) */
	mov	x9, x0		/* x9 := enckey */
	mov	x10, x3		/* x10 := nbytes */
	rev32	v2.16b, v1.16b	/* q2 := ctr (host-endian) */
	_ALIGN_TEXT
1:	ld1	{v3.16b}, [x1], #0x10	/* q3 := plaintext block */
	add	v2.4s, v2.4s, v5.4s	/* increment ctr (32-bit) */
	mov	x0, x9		/* x0 := enckey */
	mov	x3, x5		/* x3 := nrounds */
	rev32	v1.16b, v2.16b	/* q1 := ctr (big-endian) */
	eor	v0.16b, v0.16b, v3.16b	/* q0 := auth ^ ptxt */
	bl	aesarmv8_enc2	/* q0 := auth', q1 := pad;
				 * trash x0/x3/q16 */
	eor	v3.16b, v1.16b, v3.16b	/* q3 := ciphertext block */
	subs	x10, x10, #0x10	/* count down bytes */
	st1	{v3.16b}, [x2], #0x10	/* store ciphertext block */
	b.ne	1b		/* repeat if more blocks */
	rev32	v1.16b, v2.16b	/* q1 := ctr (big-endian) */
	st1	{v0.16b-v1.16b}, [x4]	/* store updated auth/ctr */
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_ccm_enc1)

/*
 * aesarmv8_ccm_dec1(const struct aesenc *enckey@x0, const uint8_t *in@x1,
 *	uint8_t *out@x2, size_t nbytes@x3, uint8_t authctr[32] @x4,
 *	uint32_t nrounds@x5)
 *
 *	Update CCM decryption.
 *
 *	nbytes must be a positive integral multiple of 16.
 *
 *	Standard ABI calling convention.
 */
ENTRY(aesarmv8_ccm_dec1)
	stp	fp, lr, [sp, #-16]!	/* push stack frame */
	mov	fp, sp
	ld1	{v1.16b, v2.16b}, [x4]	/* q1 := auth, q2 := ctr (be) */
	adrl	x11, ctr32_inc	/* x11 := &ctr32_inc */
	ld1	{v5.4s}, [x11]	/* q5 := (0,0,0,1) (host-endian) */
	mov	x9, x0		/* x9 := enckey */
	mov	x10, x3		/* x10 := nbytes */
	rev32	v2.16b, v2.16b	/* q2 := ctr (host-endian) */

	/* Decrypt the first block. */
	add	v2.4s, v2.4s, v5.4s	/* increment ctr (32-bit) */
	mov	x3, x5		/* x3 := nrounds */
	rev32	v0.16b, v2.16b	/* q0 := ctr (big-endian) */
	ld1	{v3.16b}, [x1], #0x10	/* q3 := ctxt */
	bl	aesarmv8_enc1	/* q0 := pad; trash x0/x3/q16 */
	b	2f

	_ALIGN_TEXT
1:	/*
	 * Authenticate the last block and decrypt the next block
	 * simultaneously.
	 *
	 *	q1 = auth ^ ptxt[-1]
	 *	q2 = ctr[-1] (le)
	 */
	add	v2.4s, v2.4s, v5.4s	/* increment ctr (32-bit) */
	mov	x0, x9		/* x0 := enckey */
	mov	x3, x5		/* x3 := nrounds */
	rev32	v0.16b, v2.16b	/* q0 := ctr (big-endian) */
	ld1	{v3.16b}, [x1], #0x10	/* q3 := ctxt */
	bl	aesarmv8_enc2	/* q0 := pad, q1 := auth';
				 * trash x0/x3/q16 */
2:	eor	v3.16b, v0.16b, v3.16b	/* q3 := plaintext block */
	subs	x10, x10, #0x10
	st1	{v3.16b}, [x2], #0x10	/* store plaintext */
	eor	v1.16b, v1.16b, v3.16b	/* q1 := auth ^ ptxt */
	b.ne	1b

	rev32	v2.16b, v2.16b	/* q2 := ctr (big-endian) */

	/* Authenticate the last block. */
	mov	x0, x9		/* x0 := enckey */
	mov	x3, x5		/* x3 := nrounds */
	mov	v0.16b, v1.16b	/* q0 := auth ^ ptxt */
	bl	aesarmv8_enc1	/* q0 := auth'; trash x0/x3/q16 */

	mov	v1.16b, v2.16b	/* store updated auth/ctr */
	st1	{v0.16b-v1.16b}, [x4]
	ldp	fp, lr, [sp], #16	/* pop stack frame */
	ret
END(aesarmv8_ccm_dec1)

	.section .rodata
	.p2align 4
	.type	ctr32_inc,@object
ctr32_inc:
	.int	0, 0, 0, 1
END(ctr32_inc)

/*
 * aesarmv8_enc1(const struct aesenc *enckey@x0,
 *	uint128_t block@q0, uint32_t nrounds@x3)
 *
 *	Encrypt a single AES block in q0.
 *
 *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
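 *
 *	For reference, the round structure in the AESE/AESMC
 *	formulation (illustrative pseudo-C; aese()/aesmc() name the
 *	per-block effect of the instructions used below):
 *
 *		for (i = 0; i < nrounds - 1; i++)
 *			q0 = aesmc(aese(q0, rk[i]));
 *		q0 = aese(q0, rk[nrounds - 1]) ^ rk[nrounds];
 *
 *	where aese(x, k) = SubBytes(ShiftRows(x ^ k)) and aesmc is
 *	MixColumns, so the final round omits MixColumns as in FIPS 197.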
 */
	.text
	_ALIGN_TEXT
	.type	aesarmv8_enc1,@function
aesarmv8_enc1:
	ldr	q16, [x0], #0x10	/* load round key */
	sub	x3, x3, #1
	_ALIGN_TEXT
1:	/* q0 := MixColumns(ShiftRows(SubBytes(AddRoundKey_q16(q0)))) */
	aese	v0.16b, v16.16b
	aesmc	v0.16b, v0.16b
	ldr	q16, [x0], #0x10
	subs	x3, x3, #1
	b.ne	1b
	/* q0 := ShiftRows(SubBytes(AddRoundKey_q16(q0))) */
	aese	v0.16b, v16.16b
	ldr	q16, [x0]	/* load last round key */
	/* q0 := AddRoundKey_q16(q0) */
	eor	v0.16b, v0.16b, v16.16b
	ret
END(aesarmv8_enc1)

/*
 * aesarmv8_enc2(const struct aesenc *enckey@x0,
 *	uint128_t block@q0, uint128_t block@q1, uint32_t nrounds@x3)
 *
 *	Encrypt two AES blocks in q0 and q1.
 *
 *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
 */
	.text
	_ALIGN_TEXT
	.type	aesarmv8_enc2,@function
aesarmv8_enc2:
	ldr	q16, [x0], #0x10	/* load round key */
	sub	x3, x3, #1
	_ALIGN_TEXT
1:	/* q[i] := MixColumns(ShiftRows(SubBytes(AddRoundKey_q16(q[i])))) */
	aese	v0.16b, v16.16b
	aesmc	v0.16b, v0.16b
	aese	v1.16b, v16.16b
	aesmc	v1.16b, v1.16b
	ldr	q16, [x0], #0x10	/* load next round key */
	subs	x3, x3, #1
	b.ne	1b
	/* q[i] := ShiftRows(SubBytes(AddRoundKey_q16(q[i]))) */
	aese	v0.16b, v16.16b
	aese	v1.16b, v16.16b
	ldr	q16, [x0]	/* load last round key */
	/* q[i] := AddRoundKey_q16(q[i]) */
	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v16.16b
	ret
END(aesarmv8_enc2)

/*
 * aesarmv8_enc8(const struct aesenc *enckey@x0,
 *	uint128_t block0@q0, ..., uint128_t block7@q7,
 *	uint32_t nrounds@x3)
 *
 *	Encrypt eight AES blocks in q0 through q7 in parallel.
 *
 *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
 */
	.text
	_ALIGN_TEXT
	.type	aesarmv8_enc8,@function
aesarmv8_enc8:
	ldr	q16, [x0], #0x10	/* load round key */
	sub	x3, x3, #1
	_ALIGN_TEXT
1:	/* q[i] := MixColumns(ShiftRows(SubBytes(AddRoundKey_q16(q[i])))) */
	aese	v0.16b, v16.16b
	aesmc	v0.16b, v0.16b
	aese	v1.16b, v16.16b
	aesmc	v1.16b, v1.16b
	aese	v2.16b, v16.16b
	aesmc	v2.16b, v2.16b
	aese	v3.16b, v16.16b
	aesmc	v3.16b, v3.16b
	aese	v4.16b, v16.16b
	aesmc	v4.16b, v4.16b
	aese	v5.16b, v16.16b
	aesmc	v5.16b, v5.16b
	aese	v6.16b, v16.16b
	aesmc	v6.16b, v6.16b
	aese	v7.16b, v16.16b
	aesmc	v7.16b, v7.16b
	ldr	q16, [x0], #0x10	/* load next round key */
	subs	x3, x3, #1
	b.ne	1b
	/* q[i] := ShiftRows(SubBytes(AddRoundKey_q16(q[i]))) */
	aese	v0.16b, v16.16b
	aese	v1.16b, v16.16b
	aese	v2.16b, v16.16b
	aese	v3.16b, v16.16b
	aese	v4.16b, v16.16b
	aese	v5.16b, v16.16b
	aese	v6.16b, v16.16b
	aese	v7.16b, v16.16b
	ldr	q16, [x0]	/* load last round key */
	/* q[i] := AddRoundKey_q16(q[i]) */
	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v16.16b
	eor	v2.16b, v2.16b, v16.16b
	eor	v3.16b, v3.16b, v16.16b
	eor	v4.16b, v4.16b, v16.16b
	eor	v5.16b, v5.16b, v16.16b
	eor	v6.16b, v6.16b, v16.16b
	eor	v7.16b, v7.16b, v16.16b
	ret
END(aesarmv8_enc8)

/*
 * aesarmv8_dec1(const struct aesdec *deckey@x0,
 *	uint128_t block@q0, uint32_t nrounds@x3)
 *
 *	Decrypt a single AES block in q0.
 *
 *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
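 *
 *	For reference, the equivalent-inverse-cipher structure in the
 *	AESD/AESIMC formulation (illustrative pseudo-C; aesd()/aesimc()
 *	name the per-block effect of the instructions used below, and
 *	drk[] is the InvMixColumns'd key schedule prepared by
 *	aesarmv8_enctodec):
 *
 *		for (i = 0; i < nrounds - 1; i++)
 *			q0 = aesimc(aesd(q0, drk[i]));
 *		q0 = aesd(q0, drk[nrounds - 1]) ^ drk[nrounds];
 *
 *	where aesd(x, k) = InvSubBytes(InvShiftRows(x ^ k)) and aesimc
 *	is InvMixColumns.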
 */
	.text
	_ALIGN_TEXT
	.type	aesarmv8_dec1,@function
aesarmv8_dec1:
	ldr	q16, [x0], #0x10	/* load round key */
	sub	x3, x3, #1
	_ALIGN_TEXT
1:	/* q0 := InSubBytes(InShiftRows(AddRoundKey_q16(q0))) */
	aesd	v0.16b, v16.16b
	/* q0 := InMixColumns(q0) */
	aesimc	v0.16b, v0.16b
	ldr	q16, [x0], #0x10	/* load next round key */
	subs	x3, x3, #1
	b.ne	1b
	/* q0 := InSubBytes(InShiftRows(AddRoundKey_q16(q0))) */
	aesd	v0.16b, v16.16b
	ldr	q16, [x0]	/* load last round key */
	/* q0 := AddRoundKey_q16(q0) */
	eor	v0.16b, v0.16b, v16.16b
	ret
END(aesarmv8_dec1)

/*
 * aesarmv8_dec8(const struct aesdec *deckey@x0,
 *	uint128_t block0@q0, ..., uint128_t block7@q7,
 *	uint32_t nrounds@x3)
 *
 *	Decrypt eight AES blocks in q0 through q7 in parallel.
 *
 *	Internal ABI.  Uses q16 as temporary.  Destroys x0 and x3.
 */
	.text
	_ALIGN_TEXT
	.type	aesarmv8_dec8,@function
aesarmv8_dec8:
	ldr	q16, [x0], #0x10	/* load round key */
	sub	x3, x3, #1
	_ALIGN_TEXT
1:	/* q[i] := InSubBytes(InShiftRows(AddRoundKey_q16(q[i]))) */
	aesd	v0.16b, v16.16b
	/* q[i] := InMixColumns(q[i]) */
	aesimc	v0.16b, v0.16b
	aesd	v1.16b, v16.16b
	aesimc	v1.16b, v1.16b
	aesd	v2.16b, v16.16b
	aesimc	v2.16b, v2.16b
	aesd	v3.16b, v16.16b
	aesimc	v3.16b, v3.16b
	aesd	v4.16b, v16.16b
	aesimc	v4.16b, v4.16b
	aesd	v5.16b, v16.16b
	aesimc	v5.16b, v5.16b
	aesd	v6.16b, v16.16b
	aesimc	v6.16b, v6.16b
	aesd	v7.16b, v16.16b
	aesimc	v7.16b, v7.16b
	ldr	q16, [x0], #0x10	/* load next round key */
	subs	x3, x3, #1
	b.ne	1b
	/* q[i] := InSubBytes(InShiftRows(AddRoundKey_q16(q[i]))) */
	aesd	v0.16b, v16.16b
	aesd	v1.16b, v16.16b
	aesd	v2.16b, v16.16b
	aesd	v3.16b, v16.16b
	aesd	v4.16b, v16.16b
	aesd	v5.16b, v16.16b
	aesd	v6.16b, v16.16b
	aesd	v7.16b, v16.16b
	ldr	q16, [x0]	/* load last round key */
	/* q[i] := AddRoundKey_q16(q[i]) */
	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v16.16b
	eor	v2.16b, v2.16b, v16.16b
	eor	v3.16b, v3.16b, v16.16b
	eor	v4.16b, v4.16b, v16.16b
	eor	v5.16b, v5.16b, v16.16b
	eor	v6.16b, v6.16b, v16.16b
	eor	v7.16b, v7.16b, v16.16b
	ret
END(aesarmv8_dec8)