[PATCH 03/10] Enable AMD64 Camellia implementations on WIN64
Jussi Kivilinna
jussi.kivilinna at iki.fi
Thu May 14 13:11:13 CEST 2015
* cipher/camellia-aesni-avx-amd64.S: Enable when
HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS defined.
(ELF): New macro to mask lines with ELF specific commands.
* cipher/camellia-aesni-avx2-amd64.S: Enable when
HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS defined.
(ELF): New macro to mask lines with ELF specific commands.
* cipher/camellia-glue.c (USE_AESNI_AVX, USE_AESNI_AVX2): Enable when
HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS defined.
[USE_AESNI_AVX || USE_AESNI_AVX2] (ASM_FUNC_ABI, ASM_EXTRA_STACK): New.
(_gcry_camellia_aesni_avx_ctr_enc, _gcry_camellia_aesni_avx_cbc_dec)
(_gcry_camellia_aesni_avx_cfb_dec, _gcry_camellia_aesni_avx_keygen)
(_gcry_camellia_aesni_avx2_ctr_enc, _gcry_camellia_aesni_avx2_cbc_dec)
(_gcry_camellia_aesni_avx2_cfb_dec): Add ASM_FUNC_ABI.
--
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
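Note for reviewers: the ELF() macro added to both assembly files simply hides the
ELF-only symbol directives (.type, .size) from the Win64/COFF assembler, which does
not accept them. A minimal sketch of the pattern, using one of the entry points from
this patch:

    #ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
    # define ELF(...) __VA_ARGS__   /* ELF target: emit the directive as-is */
    #else
    # define ELF(...) /*_*/         /* Win64/COFF: the directive is dropped */
    #endif

    /* The symbol-type annotation then only appears in ELF builds: */
    ELF(.type _gcry_camellia_aesni_avx_ctr_enc,@function;)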
cipher/camellia-aesni-avx-amd64.S | 41 ++++++++++++++----------
cipher/camellia-aesni-avx2-amd64.S | 29 +++++++++++------
cipher/camellia-glue.c | 61 +++++++++++++++++++++++++-----------
3 files changed, 85 insertions(+), 46 deletions(-)
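The glue changes below keep the assembly in the SysV calling convention rather than
porting it to the Microsoft x64 ABI: on Win64 the C prototypes are tagged with gcc's
sysv_abi attribute, and the stack-burn estimates grow by the room the ABI transition
needs to save XMM6-XMM15 (callee-saved on Win64, clobbered under SysV). Condensed
sketch of the definitions added in camellia-glue.c:

    #ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
    # define ASM_FUNC_ABI __attribute__((sysv_abi))   /* call through the SysV ABI */
    # define ASM_EXTRA_STACK (10 * 16)                /* XMM6..XMM15, 16 bytes each */
    #else
    # define ASM_FUNC_ABI                             /* native SysV, nothing to do */
    # define ASM_EXTRA_STACK 0
    #endif

    /* Every assembly prototype gains the attribute, e.g.: */
    extern void _gcry_camellia_aesni_avx_ctr_enc(CAMELLIA_context *ctx,
                                                 unsigned char *out,
                                                 const unsigned char *in,
                                                 unsigned char *ctr) ASM_FUNC_ABI;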
diff --git a/cipher/camellia-aesni-avx-amd64.S b/cipher/camellia-aesni-avx-amd64.S
index 6d157a7..c047a21 100644
--- a/cipher/camellia-aesni-avx-amd64.S
+++ b/cipher/camellia-aesni-avx-amd64.S
@@ -20,7 +20,8 @@
#ifdef __x86_64
#include <config.h>
-#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT)
#ifdef __PIC__
@@ -29,6 +30,12 @@
# define RIP
#endif
+#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
+# define ELF(...) __VA_ARGS__
+#else
+# define ELF(...) /*_*/
+#endif
+
#define CAMELLIA_TABLE_BYTE_LEN 272
/* struct CAMELLIA_context: */
@@ -769,7 +776,7 @@
.text
.align 8
-.type __camellia_enc_blk16,@function;
+ELF(.type __camellia_enc_blk16,@function;)
__camellia_enc_blk16:
/* input:
@@ -853,10 +860,10 @@ __camellia_enc_blk16:
%xmm15, %rax, %rcx, 24);
jmp .Lenc_done;
-.size __camellia_enc_blk16,.-__camellia_enc_blk16;
+ELF(.size __camellia_enc_blk16,.-__camellia_enc_blk16;)
.align 8
-.type __camellia_dec_blk16,@function;
+ELF(.type __camellia_dec_blk16,@function;)
__camellia_dec_blk16:
/* input:
@@ -938,7 +945,7 @@ __camellia_dec_blk16:
((key_table + (24) * 8) + 4)(CTX));
jmp .Ldec_max24;
-.size __camellia_dec_blk16,.-__camellia_dec_blk16;
+ELF(.size __camellia_dec_blk16,.-__camellia_dec_blk16;)
#define inc_le128(x, minus_one, tmp) \
vpcmpeqq minus_one, x, tmp; \
@@ -948,7 +955,7 @@ __camellia_dec_blk16:
.align 8
.globl _gcry_camellia_aesni_avx_ctr_enc
-.type _gcry_camellia_aesni_avx_ctr_enc,@function;
+ELF(.type _gcry_camellia_aesni_avx_ctr_enc,@function;)
_gcry_camellia_aesni_avx_ctr_enc:
/* input:
@@ -1062,11 +1069,11 @@ _gcry_camellia_aesni_avx_ctr_enc:
leave;
ret;
-.size _gcry_camellia_aesni_avx_ctr_enc,.-_gcry_camellia_aesni_avx_ctr_enc;
+ELF(.size _gcry_camellia_aesni_avx_ctr_enc,.-_gcry_camellia_aesni_avx_ctr_enc;)
.align 8
.globl _gcry_camellia_aesni_avx_cbc_dec
-.type _gcry_camellia_aesni_avx_cbc_dec,@function;
+ELF(.type _gcry_camellia_aesni_avx_cbc_dec,@function;)
_gcry_camellia_aesni_avx_cbc_dec:
/* input:
@@ -1130,11 +1137,11 @@ _gcry_camellia_aesni_avx_cbc_dec:
leave;
ret;
-.size _gcry_camellia_aesni_avx_cbc_dec,.-_gcry_camellia_aesni_avx_cbc_dec;
+ELF(.size _gcry_camellia_aesni_avx_cbc_dec,.-_gcry_camellia_aesni_avx_cbc_dec;)
.align 8
.globl _gcry_camellia_aesni_avx_cfb_dec
-.type _gcry_camellia_aesni_avx_cfb_dec,@function;
+ELF(.type _gcry_camellia_aesni_avx_cfb_dec,@function;)
_gcry_camellia_aesni_avx_cfb_dec:
/* input:
@@ -1202,7 +1209,7 @@ _gcry_camellia_aesni_avx_cfb_dec:
leave;
ret;
-.size _gcry_camellia_aesni_avx_cfb_dec,.-_gcry_camellia_aesni_avx_cfb_dec;
+ELF(.size _gcry_camellia_aesni_avx_cfb_dec,.-_gcry_camellia_aesni_avx_cfb_dec;)
/*
* IN:
@@ -1309,7 +1316,7 @@ _gcry_camellia_aesni_avx_cfb_dec:
.text
.align 8
-.type __camellia_avx_setup128,@function;
+ELF(.type __camellia_avx_setup128,@function;)
__camellia_avx_setup128:
/* input:
* %rdi: ctx, CTX; subkey storage at key_table(CTX)
@@ -1650,10 +1657,10 @@ __camellia_avx_setup128:
vzeroall;
ret;
-.size __camellia_avx_setup128,.-__camellia_avx_setup128;
+ELF(.size __camellia_avx_setup128,.-__camellia_avx_setup128;)
.align 8
-.type __camellia_avx_setup256,@function;
+ELF(.type __camellia_avx_setup256,@function;)
__camellia_avx_setup256:
/* input:
@@ -2127,11 +2134,11 @@ __camellia_avx_setup256:
vzeroall;
ret;
-.size __camellia_avx_setup256,.-__camellia_avx_setup256;
+ELF(.size __camellia_avx_setup256,.-__camellia_avx_setup256;)
.align 8
.globl _gcry_camellia_aesni_avx_keygen
-.type _gcry_camellia_aesni_avx_keygen,@function;
+ELF(.type _gcry_camellia_aesni_avx_keygen,@function;)
_gcry_camellia_aesni_avx_keygen:
/* input:
@@ -2159,7 +2166,7 @@ _gcry_camellia_aesni_avx_keygen:
vpor %xmm2, %xmm1, %xmm1;
jmp __camellia_avx_setup256;
-.size _gcry_camellia_aesni_avx_keygen,.-_gcry_camellia_aesni_avx_keygen;
+ELF(.size _gcry_camellia_aesni_avx_keygen,.-_gcry_camellia_aesni_avx_keygen;)
#endif /*defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT)*/
#endif /*__x86_64*/
diff --git a/cipher/camellia-aesni-avx2-amd64.S b/cipher/camellia-aesni-avx2-amd64.S
index 25f48bc..a3fa229 100644
--- a/cipher/camellia-aesni-avx2-amd64.S
+++ b/cipher/camellia-aesni-avx2-amd64.S
@@ -20,7 +20,8 @@
#ifdef __x86_64
#include <config.h>
-#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT)
#ifdef __PIC__
@@ -29,6 +30,12 @@
# define RIP
#endif
+#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
+# define ELF(...) __VA_ARGS__
+#else
+# define ELF(...) /*_*/
+#endif
+
#define CAMELLIA_TABLE_BYTE_LEN 272
/* struct CAMELLIA_context: */
@@ -748,7 +755,7 @@
.text
.align 8
-.type __camellia_enc_blk32,@function;
+ELF(.type __camellia_enc_blk32,@function;)
__camellia_enc_blk32:
/* input:
@@ -832,10 +839,10 @@ __camellia_enc_blk32:
%ymm15, %rax, %rcx, 24);
jmp .Lenc_done;
-.size __camellia_enc_blk32,.-__camellia_enc_blk32;
+ELF(.size __camellia_enc_blk32,.-__camellia_enc_blk32;)
.align 8
-.type __camellia_dec_blk32,@function;
+ELF(.type __camellia_dec_blk32,@function;)
__camellia_dec_blk32:
/* input:
@@ -917,7 +924,7 @@ __camellia_dec_blk32:
((key_table + (24) * 8) + 4)(CTX));
jmp .Ldec_max24;
-.size __camellia_dec_blk32,.-__camellia_dec_blk32;
+ELF(.size __camellia_dec_blk32,.-__camellia_dec_blk32;)
#define inc_le128(x, minus_one, tmp) \
vpcmpeqq minus_one, x, tmp; \
@@ -927,7 +934,7 @@ __camellia_dec_blk32:
.align 8
.globl _gcry_camellia_aesni_avx2_ctr_enc
-.type _gcry_camellia_aesni_avx2_ctr_enc,@function;
+ELF(.type _gcry_camellia_aesni_avx2_ctr_enc,@function;)
_gcry_camellia_aesni_avx2_ctr_enc:
/* input:
@@ -1111,11 +1118,11 @@ _gcry_camellia_aesni_avx2_ctr_enc:
leave;
ret;
-.size _gcry_camellia_aesni_avx2_ctr_enc,.-_gcry_camellia_aesni_avx2_ctr_enc;
+ELF(.size _gcry_camellia_aesni_avx2_ctr_enc,.-_gcry_camellia_aesni_avx2_ctr_enc;)
.align 8
.globl _gcry_camellia_aesni_avx2_cbc_dec
-.type _gcry_camellia_aesni_avx2_cbc_dec,@function;
+ELF(.type _gcry_camellia_aesni_avx2_cbc_dec,@function;)
_gcry_camellia_aesni_avx2_cbc_dec:
/* input:
@@ -1183,11 +1190,11 @@ _gcry_camellia_aesni_avx2_cbc_dec:
leave;
ret;
-.size _gcry_camellia_aesni_avx2_cbc_dec,.-_gcry_camellia_aesni_avx2_cbc_dec;
+ELF(.size _gcry_camellia_aesni_avx2_cbc_dec,.-_gcry_camellia_aesni_avx2_cbc_dec;)
.align 8
.globl _gcry_camellia_aesni_avx2_cfb_dec
-.type _gcry_camellia_aesni_avx2_cfb_dec,@function;
+ELF(.type _gcry_camellia_aesni_avx2_cfb_dec,@function;)
_gcry_camellia_aesni_avx2_cfb_dec:
/* input:
@@ -1257,7 +1264,7 @@ _gcry_camellia_aesni_avx2_cfb_dec:
leave;
ret;
-.size _gcry_camellia_aesni_avx2_cfb_dec,.-_gcry_camellia_aesni_avx2_cfb_dec;
+ELF(.size _gcry_camellia_aesni_avx2_cfb_dec,.-_gcry_camellia_aesni_avx2_cfb_dec;)
#endif /*defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT)*/
#endif /*__x86_64*/
diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c
index f18d135..5032321 100644
--- a/cipher/camellia-glue.c
+++ b/cipher/camellia-glue.c
@@ -75,7 +75,8 @@
/* USE_AESNI_AVX indicates whether to compile with Intel AES-NI/AVX code. */
#undef USE_AESNI_AVX
#if defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT)
-# if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)
+# if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
# define USE_AESNI_AVX 1
# endif
#endif
@@ -83,7 +84,8 @@
/* USE_AESNI_AVX2 indicates whether to compile with Intel AES-NI/AVX2 code. */
#undef USE_AESNI_AVX2
#if defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT)
-# if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)
+# if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
# define USE_AESNI_AVX2 1
# endif
#endif
@@ -100,6 +102,20 @@ typedef struct
#endif /*USE_AESNI_AVX2*/
} CAMELLIA_context;
+/* Assembly implementations use the SystemV ABI; on Win64, an ABI conversion and
+ * additional stack space to store XMM6-XMM15 are needed. */
+#undef ASM_FUNC_ABI
+#undef ASM_EXTRA_STACK
+#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2)
+# ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+# define ASM_FUNC_ABI __attribute__((sysv_abi))
+# define ASM_EXTRA_STACK (10 * 16)
+# else
+# define ASM_FUNC_ABI
+# define ASM_EXTRA_STACK 0
+# endif
+#endif
+
#ifdef USE_AESNI_AVX
/* Assembler implementations of Camellia using AES-NI and AVX. Process data
in 16 blocks at the same time.
@@ -107,21 +123,21 @@ typedef struct
extern void _gcry_camellia_aesni_avx_ctr_enc(CAMELLIA_context *ctx,
unsigned char *out,
const unsigned char *in,
- unsigned char *ctr);
+ unsigned char *ctr) ASM_FUNC_ABI;
extern void _gcry_camellia_aesni_avx_cbc_dec(CAMELLIA_context *ctx,
unsigned char *out,
const unsigned char *in,
- unsigned char *iv);
+ unsigned char *iv) ASM_FUNC_ABI;
extern void _gcry_camellia_aesni_avx_cfb_dec(CAMELLIA_context *ctx,
unsigned char *out,
const unsigned char *in,
- unsigned char *iv);
+ unsigned char *iv) ASM_FUNC_ABI;
extern void _gcry_camellia_aesni_avx_keygen(CAMELLIA_context *ctx,
const unsigned char *key,
- unsigned int keylen);
+ unsigned int keylen) ASM_FUNC_ABI;
#endif
#ifdef USE_AESNI_AVX2
@@ -131,17 +147,17 @@ extern void _gcry_camellia_aesni_avx_keygen(CAMELLIA_context *ctx,
extern void _gcry_camellia_aesni_avx2_ctr_enc(CAMELLIA_context *ctx,
unsigned char *out,
const unsigned char *in,
- unsigned char *ctr);
+ unsigned char *ctr) ASM_FUNC_ABI;
extern void _gcry_camellia_aesni_avx2_cbc_dec(CAMELLIA_context *ctx,
unsigned char *out,
const unsigned char *in,
- unsigned char *iv);
+ unsigned char *iv) ASM_FUNC_ABI;
extern void _gcry_camellia_aesni_avx2_cfb_dec(CAMELLIA_context *ctx,
unsigned char *out,
const unsigned char *in,
- unsigned char *iv);
+ unsigned char *iv) ASM_FUNC_ABI;
#endif
static const char *selftest(void);
@@ -318,7 +334,7 @@ _gcry_camellia_ctr_enc(void *context, unsigned char *ctr,
if (did_use_aesni_avx2)
{
int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE + 16 +
- 2 * sizeof(void *);
+ 2 * sizeof(void *) + ASM_EXTRA_STACK;
if (burn_stack_depth < avx2_burn_stack_depth)
burn_stack_depth = avx2_burn_stack_depth;
@@ -347,8 +363,11 @@ _gcry_camellia_ctr_enc(void *context, unsigned char *ctr,
if (did_use_aesni_avx)
{
- if (burn_stack_depth < 16 * CAMELLIA_BLOCK_SIZE + 2 * sizeof(void *))
- burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE + 2 * sizeof(void *);
+ int avx_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE +
+ 2 * sizeof(void *) + ASM_EXTRA_STACK;
+
+ if (burn_stack_depth < avx_burn_stack_depth)
+ burn_stack_depth = avx_burn_stack_depth;
}
/* Use generic code to handle smaller chunks... */
@@ -409,7 +428,7 @@ _gcry_camellia_cbc_dec(void *context, unsigned char *iv,
if (did_use_aesni_avx2)
{
int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE + 16 +
- 2 * sizeof(void *);
+ 2 * sizeof(void *) + ASM_EXTRA_STACK;
if (burn_stack_depth < avx2_burn_stack_depth)
burn_stack_depth = avx2_burn_stack_depth;
@@ -437,8 +456,11 @@ _gcry_camellia_cbc_dec(void *context, unsigned char *iv,
if (did_use_aesni_avx)
{
- if (burn_stack_depth < 16 * CAMELLIA_BLOCK_SIZE + 2 * sizeof(void *))
- burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE + 2 * sizeof(void *);
+ int avx_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE +
+ 2 * sizeof(void *) + ASM_EXTRA_STACK;
+
+ if (burn_stack_depth < avx_burn_stack_depth)
+ burn_stack_depth = avx_burn_stack_depth;
}
/* Use generic code to handle smaller chunks... */
@@ -491,7 +513,7 @@ _gcry_camellia_cfb_dec(void *context, unsigned char *iv,
if (did_use_aesni_avx2)
{
int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE + 16 +
- 2 * sizeof(void *);
+ 2 * sizeof(void *) + ASM_EXTRA_STACK;
if (burn_stack_depth < avx2_burn_stack_depth)
burn_stack_depth = avx2_burn_stack_depth;
@@ -519,8 +541,11 @@ _gcry_camellia_cfb_dec(void *context, unsigned char *iv,
if (did_use_aesni_avx)
{
- if (burn_stack_depth < 16 * CAMELLIA_BLOCK_SIZE + 2 * sizeof(void *))
- burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE + 2 * sizeof(void *);
+ int avx_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE +
+ 2 * sizeof(void *) + ASM_EXTRA_STACK;
+
+ if (burn_stack_depth < avx_burn_stack_depth)
+ burn_stack_depth = avx_burn_stack_depth;
}
/* Use generic code to handle smaller chunks... */
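As a sanity check of the burn-stack bookkeeping above, here is the AVX2 figure worked
out for the Win64 case (CAMELLIA_BLOCK_SIZE is 16, pointers are 8 bytes, and
ASM_EXTRA_STACK is 10 * 16 as defined earlier); the variable name comes from the
patch, the numbers are only illustrative, and on non-Windows builds ASM_EXTRA_STACK
is 0 so the total stays at 544:

    avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE  /* 512: 32 blocks of 16 bytes      */
                          + 16                        /*  16: extra slack in the estimate */
                          + 2 * sizeof(void *)        /*  16: two saved pointers          */
                          + ASM_EXTRA_STACK;          /* 160: XMM6-XMM15 spill on Win64   */
    /* = 704 bytes handed to _gcry_burn_stack() once the AVX2 path has been used. */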