[PATCH 03/10] Enable AMD64 Camellia implementations on WIN64

Jussi Kivilinna jussi.kivilinna at iki.fi
Thu May 14 13:11:13 CEST 2015


* cipher/camellia-aesni-avx-amd64.S: Enable when
HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS defined.
(ELF): New macro to mask lines with ELF specific commands.
* cipher/camellia-aesni-avx2-amd64.S: Enable when
HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS defined.
(ELF): New macro to mask lines with ELF specific commands.
* cipher/camellia-glue.c (USE_AESNI_AVX, USE_AESNI_AVX2): Enable when
HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS defined.
[USE_AESNI_AVX || USE_AESNI_AVX2] (ASM_FUNC_ABI, ASM_EXTRA_STACK): New.
(_gcry_camellia_aesni_avx_ctr_enc, _gcry_camellia_aesni_avx_cbc_dec)
(_gcry_camellia_aesni_avx_cfb_dec, _gcry_camellia_aesni_avx_keygen)
(_gcry_camellia_aesni_avx2_ctr_enc, _gcry_camellia_aesni_avx2_cbc_dec)
(_gcry_camellia_aesni_avx2_cfb_dec): Add ASM_FUNC_ABI.
--

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 cipher/camellia-aesni-avx-amd64.S  |   41 ++++++++++++++----------
 cipher/camellia-aesni-avx2-amd64.S |   29 +++++++++++------
 cipher/camellia-glue.c             |   61 +++++++++++++++++++++++++-----------
 3 files changed, 85 insertions(+), 46 deletions(-)

diff --git a/cipher/camellia-aesni-avx-amd64.S b/cipher/camellia-aesni-avx-amd64.S
index 6d157a7..c047a21 100644
--- a/cipher/camellia-aesni-avx-amd64.S
+++ b/cipher/camellia-aesni-avx-amd64.S
@@ -20,7 +20,8 @@
 
 #ifdef __x86_64
 #include <config.h>
-#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
     defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT)
 
 #ifdef __PIC__
@@ -29,6 +30,12 @@
 #  define RIP
 #endif
 
+#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
+# define ELF(...) __VA_ARGS__
+#else
+# define ELF(...) /*_*/
+#endif
+
 #define CAMELLIA_TABLE_BYTE_LEN 272
 
 /* struct CAMELLIA_context: */
@@ -769,7 +776,7 @@
 .text
 
 .align 8
-.type   __camellia_enc_blk16,@function;
+ELF(.type   __camellia_enc_blk16,@function;)
 
 __camellia_enc_blk16:
 	/* input:
@@ -853,10 +860,10 @@ __camellia_enc_blk16:
 		     %xmm15, %rax, %rcx, 24);
 
 	jmp .Lenc_done;
-.size __camellia_enc_blk16,.-__camellia_enc_blk16;
+ELF(.size __camellia_enc_blk16,.-__camellia_enc_blk16;)
 
 .align 8
-.type   __camellia_dec_blk16,@function;
+ELF(.type   __camellia_dec_blk16,@function;)
 
 __camellia_dec_blk16:
 	/* input:
@@ -938,7 +945,7 @@ __camellia_dec_blk16:
 	      ((key_table + (24) * 8) + 4)(CTX));
 
 	jmp .Ldec_max24;
-.size __camellia_dec_blk16,.-__camellia_dec_blk16;
+ELF(.size __camellia_dec_blk16,.-__camellia_dec_blk16;)
 
 #define inc_le128(x, minus_one, tmp) \
 	vpcmpeqq minus_one, x, tmp; \
@@ -948,7 +955,7 @@ __camellia_dec_blk16:
 
 .align 8
 .globl _gcry_camellia_aesni_avx_ctr_enc
-.type   _gcry_camellia_aesni_avx_ctr_enc,@function;
+ELF(.type   _gcry_camellia_aesni_avx_ctr_enc,@function;)
 
 _gcry_camellia_aesni_avx_ctr_enc:
 	/* input:
@@ -1062,11 +1069,11 @@ _gcry_camellia_aesni_avx_ctr_enc:
 
 	leave;
 	ret;
-.size _gcry_camellia_aesni_avx_ctr_enc,.-_gcry_camellia_aesni_avx_ctr_enc;
+ELF(.size _gcry_camellia_aesni_avx_ctr_enc,.-_gcry_camellia_aesni_avx_ctr_enc;)
 
 .align 8
 .globl _gcry_camellia_aesni_avx_cbc_dec
-.type   _gcry_camellia_aesni_avx_cbc_dec,@function;
+ELF(.type   _gcry_camellia_aesni_avx_cbc_dec,@function;)
 
 _gcry_camellia_aesni_avx_cbc_dec:
 	/* input:
@@ -1130,11 +1137,11 @@ _gcry_camellia_aesni_avx_cbc_dec:
 
 	leave;
 	ret;
-.size _gcry_camellia_aesni_avx_cbc_dec,.-_gcry_camellia_aesni_avx_cbc_dec;
+ELF(.size _gcry_camellia_aesni_avx_cbc_dec,.-_gcry_camellia_aesni_avx_cbc_dec;)
 
 .align 8
 .globl _gcry_camellia_aesni_avx_cfb_dec
-.type   _gcry_camellia_aesni_avx_cfb_dec,@function;
+ELF(.type   _gcry_camellia_aesni_avx_cfb_dec,@function;)
 
 _gcry_camellia_aesni_avx_cfb_dec:
 	/* input:
@@ -1202,7 +1209,7 @@ _gcry_camellia_aesni_avx_cfb_dec:
 
 	leave;
 	ret;
-.size _gcry_camellia_aesni_avx_cfb_dec,.-_gcry_camellia_aesni_avx_cfb_dec;
+ELF(.size _gcry_camellia_aesni_avx_cfb_dec,.-_gcry_camellia_aesni_avx_cfb_dec;)
 
 /*
  * IN:
@@ -1309,7 +1316,7 @@ _gcry_camellia_aesni_avx_cfb_dec:
 .text
 
 .align 8
-.type  __camellia_avx_setup128,@function;
+ELF(.type  __camellia_avx_setup128,@function;)
 __camellia_avx_setup128:
 	/* input:
 	 *	%rdi: ctx, CTX; subkey storage at key_table(CTX)
@@ -1650,10 +1657,10 @@ __camellia_avx_setup128:
 	vzeroall;
 
 	ret;
-.size __camellia_avx_setup128,.-__camellia_avx_setup128;
+ELF(.size __camellia_avx_setup128,.-__camellia_avx_setup128;)
 
 .align 8
-.type  __camellia_avx_setup256,@function;
+ELF(.type  __camellia_avx_setup256,@function;)
 
 __camellia_avx_setup256:
 	/* input:
@@ -2127,11 +2134,11 @@ __camellia_avx_setup256:
 	vzeroall;
 
 	ret;
-.size __camellia_avx_setup256,.-__camellia_avx_setup256;
+ELF(.size __camellia_avx_setup256,.-__camellia_avx_setup256;)
 
 .align 8
 .globl _gcry_camellia_aesni_avx_keygen
-.type  _gcry_camellia_aesni_avx_keygen,@function;
+ELF(.type  _gcry_camellia_aesni_avx_keygen,@function;)
 
 _gcry_camellia_aesni_avx_keygen:
 	/* input:
@@ -2159,7 +2166,7 @@ _gcry_camellia_aesni_avx_keygen:
 	vpor %xmm2, %xmm1, %xmm1;
 
 	jmp __camellia_avx_setup256;
-.size _gcry_camellia_aesni_avx_keygen,.-_gcry_camellia_aesni_avx_keygen;
+ELF(.size _gcry_camellia_aesni_avx_keygen,.-_gcry_camellia_aesni_avx_keygen;)
 
 #endif /*defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT)*/
 #endif /*__x86_64*/
diff --git a/cipher/camellia-aesni-avx2-amd64.S b/cipher/camellia-aesni-avx2-amd64.S
index 25f48bc..a3fa229 100644
--- a/cipher/camellia-aesni-avx2-amd64.S
+++ b/cipher/camellia-aesni-avx2-amd64.S
@@ -20,7 +20,8 @@
 
 #ifdef __x86_64
 #include <config.h>
-#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
     defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT)
 
 #ifdef __PIC__
@@ -29,6 +30,12 @@
 #  define RIP
 #endif
 
+#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
+# define ELF(...) __VA_ARGS__
+#else
+# define ELF(...) /*_*/
+#endif
+
 #define CAMELLIA_TABLE_BYTE_LEN 272
 
 /* struct CAMELLIA_context: */
@@ -748,7 +755,7 @@
 .text
 
 .align 8
-.type   __camellia_enc_blk32,@function;
+ELF(.type   __camellia_enc_blk32,@function;)
 
 __camellia_enc_blk32:
 	/* input:
@@ -832,10 +839,10 @@ __camellia_enc_blk32:
 		     %ymm15, %rax, %rcx, 24);
 
 	jmp .Lenc_done;
-.size __camellia_enc_blk32,.-__camellia_enc_blk32;
+ELF(.size __camellia_enc_blk32,.-__camellia_enc_blk32;)
 
 .align 8
-.type   __camellia_dec_blk32,@function;
+ELF(.type   __camellia_dec_blk32,@function;)
 
 __camellia_dec_blk32:
 	/* input:
@@ -917,7 +924,7 @@ __camellia_dec_blk32:
 	      ((key_table + (24) * 8) + 4)(CTX));
 
 	jmp .Ldec_max24;
-.size __camellia_dec_blk32,.-__camellia_dec_blk32;
+ELF(.size __camellia_dec_blk32,.-__camellia_dec_blk32;)
 
 #define inc_le128(x, minus_one, tmp) \
 	vpcmpeqq minus_one, x, tmp; \
@@ -927,7 +934,7 @@ __camellia_dec_blk32:
 
 .align 8
 .globl _gcry_camellia_aesni_avx2_ctr_enc
-.type   _gcry_camellia_aesni_avx2_ctr_enc,@function;
+ELF(.type   _gcry_camellia_aesni_avx2_ctr_enc,@function;)
 
 _gcry_camellia_aesni_avx2_ctr_enc:
 	/* input:
@@ -1111,11 +1118,11 @@ _gcry_camellia_aesni_avx2_ctr_enc:
 
 	leave;
 	ret;
-.size _gcry_camellia_aesni_avx2_ctr_enc,.-_gcry_camellia_aesni_avx2_ctr_enc;
+ELF(.size _gcry_camellia_aesni_avx2_ctr_enc,.-_gcry_camellia_aesni_avx2_ctr_enc;)
 
 .align 8
 .globl _gcry_camellia_aesni_avx2_cbc_dec
-.type   _gcry_camellia_aesni_avx2_cbc_dec,@function;
+ELF(.type   _gcry_camellia_aesni_avx2_cbc_dec,@function;)
 
 _gcry_camellia_aesni_avx2_cbc_dec:
 	/* input:
@@ -1183,11 +1190,11 @@ _gcry_camellia_aesni_avx2_cbc_dec:
 
 	leave;
 	ret;
-.size _gcry_camellia_aesni_avx2_cbc_dec,.-_gcry_camellia_aesni_avx2_cbc_dec;
+ELF(.size _gcry_camellia_aesni_avx2_cbc_dec,.-_gcry_camellia_aesni_avx2_cbc_dec;)
 
 .align 8
 .globl _gcry_camellia_aesni_avx2_cfb_dec
-.type   _gcry_camellia_aesni_avx2_cfb_dec,@function;
+ELF(.type   _gcry_camellia_aesni_avx2_cfb_dec,@function;)
 
 _gcry_camellia_aesni_avx2_cfb_dec:
 	/* input:
@@ -1257,7 +1264,7 @@ _gcry_camellia_aesni_avx2_cfb_dec:
 
 	leave;
 	ret;
-.size _gcry_camellia_aesni_avx2_cfb_dec,.-_gcry_camellia_aesni_avx2_cfb_dec;
+ELF(.size _gcry_camellia_aesni_avx2_cfb_dec,.-_gcry_camellia_aesni_avx2_cfb_dec;)
 
 #endif /*defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT)*/
 #endif /*__x86_64*/
diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c
index f18d135..5032321 100644
--- a/cipher/camellia-glue.c
+++ b/cipher/camellia-glue.c
@@ -75,7 +75,8 @@
 /* USE_AESNI inidicates whether to compile with Intel AES-NI/AVX code. */
 #undef USE_AESNI_AVX
 #if defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT)
-# if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)
+# if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
 #  define USE_AESNI_AVX 1
 # endif
 #endif
@@ -83,7 +84,8 @@
 /* USE_AESNI_AVX2 inidicates whether to compile with Intel AES-NI/AVX2 code. */
 #undef USE_AESNI_AVX2
 #if defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT)
-# if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)
+# if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
 #  define USE_AESNI_AVX2 1
 # endif
 #endif
@@ -100,6 +102,20 @@ typedef struct
 #endif /*USE_AESNI_AVX2*/
 } CAMELLIA_context;
 
+/* Assembly implementations use the SystemV ABI; on Win64, ABI conversion and
+ * additional stack space to store XMM6-XMM15 are needed. */
+#undef ASM_FUNC_ABI
+#undef ASM_EXTRA_STACK
+#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2)
+# ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+#  define ASM_FUNC_ABI __attribute__((sysv_abi))
+#  define ASM_EXTRA_STACK (10 * 16)
+# else
+#  define ASM_FUNC_ABI
+#  define ASM_EXTRA_STACK 0
+# endif
+#endif
+
 #ifdef USE_AESNI_AVX
 /* Assembler implementations of Camellia using AES-NI and AVX.  Process data
    in 16 block same time.
@@ -107,21 +123,21 @@ typedef struct
 extern void _gcry_camellia_aesni_avx_ctr_enc(CAMELLIA_context *ctx,
 					     unsigned char *out,
 					     const unsigned char *in,
-					     unsigned char *ctr);
+					     unsigned char *ctr) ASM_FUNC_ABI;
 
 extern void _gcry_camellia_aesni_avx_cbc_dec(CAMELLIA_context *ctx,
 					     unsigned char *out,
 					     const unsigned char *in,
-					     unsigned char *iv);
+					     unsigned char *iv) ASM_FUNC_ABI;
 
 extern void _gcry_camellia_aesni_avx_cfb_dec(CAMELLIA_context *ctx,
 					     unsigned char *out,
 					     const unsigned char *in,
-					     unsigned char *iv);
+					     unsigned char *iv) ASM_FUNC_ABI;
 
 extern void _gcry_camellia_aesni_avx_keygen(CAMELLIA_context *ctx,
 					    const unsigned char *key,
-					    unsigned int keylen);
+					    unsigned int keylen) ASM_FUNC_ABI;
 #endif
 
 #ifdef USE_AESNI_AVX2
@@ -131,17 +147,17 @@ extern void _gcry_camellia_aesni_avx_keygen(CAMELLIA_context *ctx,
 extern void _gcry_camellia_aesni_avx2_ctr_enc(CAMELLIA_context *ctx,
 					      unsigned char *out,
 					      const unsigned char *in,
-					      unsigned char *ctr);
+					      unsigned char *ctr) ASM_FUNC_ABI;
 
 extern void _gcry_camellia_aesni_avx2_cbc_dec(CAMELLIA_context *ctx,
 					      unsigned char *out,
 					      const unsigned char *in,
-					      unsigned char *iv);
+					      unsigned char *iv) ASM_FUNC_ABI;
 
 extern void _gcry_camellia_aesni_avx2_cfb_dec(CAMELLIA_context *ctx,
 					      unsigned char *out,
 					      const unsigned char *in,
-					      unsigned char *iv);
+					      unsigned char *iv) ASM_FUNC_ABI;
 #endif
 
 static const char *selftest(void);
@@ -318,7 +334,7 @@ _gcry_camellia_ctr_enc(void *context, unsigned char *ctr,
       if (did_use_aesni_avx2)
         {
           int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE + 16 +
-                                        2 * sizeof(void *);
+                                        2 * sizeof(void *) + ASM_EXTRA_STACK;
 
           if (burn_stack_depth < avx2_burn_stack_depth)
             burn_stack_depth = avx2_burn_stack_depth;
@@ -347,8 +363,11 @@ _gcry_camellia_ctr_enc(void *context, unsigned char *ctr,
 
       if (did_use_aesni_avx)
         {
-          if (burn_stack_depth < 16 * CAMELLIA_BLOCK_SIZE + 2 * sizeof(void *))
-            burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE + 2 * sizeof(void *);
+          int avx_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE +
+                                       2 * sizeof(void *) + ASM_EXTRA_STACK;
+
+          if (burn_stack_depth < avx_burn_stack_depth)
+            burn_stack_depth = avx_burn_stack_depth;
         }
 
       /* Use generic code to handle smaller chunks... */
@@ -409,7 +428,7 @@ _gcry_camellia_cbc_dec(void *context, unsigned char *iv,
       if (did_use_aesni_avx2)
         {
           int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE + 16 +
-                                        2 * sizeof(void *);
+                                        2 * sizeof(void *) + ASM_EXTRA_STACK;
 
           if (burn_stack_depth < avx2_burn_stack_depth)
             burn_stack_depth = avx2_burn_stack_depth;
@@ -437,8 +456,11 @@ _gcry_camellia_cbc_dec(void *context, unsigned char *iv,
 
       if (did_use_aesni_avx)
         {
-          if (burn_stack_depth < 16 * CAMELLIA_BLOCK_SIZE + 2 * sizeof(void *))
-            burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE + 2 * sizeof(void *);
+          int avx_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE +
+                                       2 * sizeof(void *) + ASM_EXTRA_STACK;
+
+          if (burn_stack_depth < avx_burn_stack_depth)
+            burn_stack_depth = avx_burn_stack_depth;
         }
 
       /* Use generic code to handle smaller chunks... */
@@ -491,7 +513,7 @@ _gcry_camellia_cfb_dec(void *context, unsigned char *iv,
       if (did_use_aesni_avx2)
         {
           int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE + 16 +
-                                        2 * sizeof(void *);
+                                        2 * sizeof(void *) + ASM_EXTRA_STACK;
 
           if (burn_stack_depth < avx2_burn_stack_depth)
             burn_stack_depth = avx2_burn_stack_depth;
@@ -519,8 +541,11 @@ _gcry_camellia_cfb_dec(void *context, unsigned char *iv,
 
       if (did_use_aesni_avx)
         {
-          if (burn_stack_depth < 16 * CAMELLIA_BLOCK_SIZE + 2 * sizeof(void *))
-            burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE + 2 * sizeof(void *);
+          int avx_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE +
+                                       2 * sizeof(void *) + ASM_EXTRA_STACK;
+
+          if (burn_stack_depth < avx_burn_stack_depth)
+            burn_stack_depth = avx_burn_stack_depth;
         }
 
       /* Use generic code to handle smaller chunks... */




More information about the Gcrypt-devel mailing list