[PATCH 2/7] amd64-asm: move constant data to read-only section for hash/mac algos
Jussi Kivilinna
jussi.kivilinna@iki.fi
Tue Jan 17 20:17:36 CET 2023
* cipher/asm-common-amd64.h (SECTION_RODATA): New.
* cipher/blake2b-amd64-avx2.S: Use read-only section for constant
data; Align text section to 64 bytes.
* cipher/blake2b-amd64-avx512.S: Likewise.
* cipher/blake2s-amd64-avx.S: Likewise.
* cipher/blake2s-amd64-avx512.S: Likewise.
* cipher/poly1305-amd64-avx512.S: Likewise.
* cipher/sha1-avx-amd64.S: Likewise.
* cipher/sha1-avx-bmi2-amd64.S: Likewise.
* cipher/sha1-avx2-bmi2-amd64.S: Likewise.
* cipher/sha1-ssse3-amd64.S: Likewise.
* cipher/sha256-avx-amd64.S: Likewise.
* cipher/sha256-avx2-bmi2-amd64.S: Likewise.
* cipher/sha256-ssse3-amd64.S: Likewise.
* cipher/sha512-avx-amd64.S: Likewise.
* cipher/sha512-avx2-bmi2-amd64.S: Likewise.
* cipher/sha512-avx512-amd64.S: Likewise.
* cipher/sha512-ssse3-amd64.S: Likewise.
* cipher/sm3-avx-bmi2-amd64.S: Likewise.
* cipher/whirlpool-sse2-amd64.S: Likewise.
--
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
---
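Every file below follows the same conversion pattern: the constant blob is
moved out of the executable .text section into a read-only data section
selected through the new SECTION_RODATA macro (.rdata on Windows/COFF
targets, .rodata elsewhere), and .text is then restarted for the code.
An illustrative sketch, using a hypothetical _example_consts symbol rather
than one taken from the patch (and assuming asm-common-amd64.h is included
for the SECTION_RODATA and ELF macros):

    /* before: constants assembled into the executable section */
    .text
    .Lconst:
        .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f

    /* after: constants in a non-executable read-only section */
    SECTION_RODATA
    .align 16
    ELF(.type _example_consts,@object)
    _example_consts:
    .Lconst:
        .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f

    /* code section restarted after the data */
    .text

Data emitted into .text gets mapped with execute permission; placing it in
.rodata/.rdata drops that permission and keeps arbitrary byte patterns out
of the instruction stream.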
cipher/asm-common-amd64.h | 6 ++++++
cipher/blake2b-amd64-avx2.S | 7 ++++---
cipher/blake2b-amd64-avx512.S | 10 ++++++----
cipher/blake2s-amd64-avx.S | 9 ++++++---
cipher/blake2s-amd64-avx512.S | 10 ++++++----
cipher/poly1305-amd64-avx512.S | 7 +++++--
cipher/sha1-avx-amd64.S | 8 ++++++--
cipher/sha1-avx-bmi2-amd64.S | 9 +++++++--
cipher/sha1-avx2-bmi2-amd64.S | 9 +++++++--
cipher/sha1-ssse3-amd64.S | 9 +++++++--
cipher/sha256-avx-amd64.S | 7 ++++++-
cipher/sha256-avx2-bmi2-amd64.S | 8 +++++++-
cipher/sha256-ssse3-amd64.S | 7 ++++++-
cipher/sha512-avx-amd64.S | 7 ++++++-
cipher/sha512-avx2-bmi2-amd64.S | 7 ++++++-
cipher/sha512-avx512-amd64.S | 4 +++-
cipher/sha512-ssse3-amd64.S | 7 ++++++-
cipher/sm3-avx-bmi2-amd64.S | 6 ++++--
cipher/whirlpool-sse2-amd64.S | 2 +-
19 files changed, 105 insertions(+), 34 deletions(-)
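The .align bumps at the function entries (8/16/32 -> 64) that ride along in
most files pad each transform entry point to a cache-line boundary (64 bytes
on current x86-64 CPUs), so entry alignment no longer depends on how much
constant data happened to precede the code in .text. A minimal sketch, again
with a hypothetical _example_transform symbol:

    .text
    .align 64
    .globl _example_transform
    ELF(.type _example_transform,@function)
    _example_transform:
        ret    /* function body elided */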
diff --git a/cipher/asm-common-amd64.h b/cipher/asm-common-amd64.h
index d9bbc01b..870fef9a 100644
--- a/cipher/asm-common-amd64.h
+++ b/cipher/asm-common-amd64.h
@@ -29,6 +29,12 @@
# define ELF(...) /*_*/
#endif
+#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+# define SECTION_RODATA .section .rdata
+#else
+# define SECTION_RODATA .section .rodata
+#endif
+
#ifdef __PIC__
# define rRIP (%rip)
#else
diff --git a/cipher/blake2b-amd64-avx2.S b/cipher/blake2b-amd64-avx2.S
index 3601b65f..43c2cce1 100644
--- a/cipher/blake2b-amd64-avx2.S
+++ b/cipher/blake2b-amd64-avx2.S
@@ -31,8 +31,6 @@
#include "asm-common-amd64.h"
-.text
-
/* register macros */
#define RSTATE %rdi
#define RINBLKS %rsi
@@ -185,8 +183,10 @@
G2(ROW1, ROW2, ROW3, ROW4, m4); \
UNDIAGONALIZE(ROW1, ROW2, ROW3, ROW4);
-blake2b_data:
+SECTION_RODATA
.align 32
+ELF(.type _blake2b_avx2_data,@object;)
+_blake2b_avx2_data:
.Liv:
.quad 0x6a09e667f3bcc908, 0xbb67ae8584caa73b
.quad 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1
@@ -197,6 +197,7 @@ blake2b_data:
.Lshuf_ror24:
.byte 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10
+.text
.align 64
.globl _gcry_blake2b_transform_amd64_avx2
ELF(.type _gcry_blake2b_transform_amd64_avx2,@function;)
diff --git a/cipher/blake2b-amd64-avx512.S b/cipher/blake2b-amd64-avx512.S
index 18b0c3ad..fe938730 100644
--- a/cipher/blake2b-amd64-avx512.S
+++ b/cipher/blake2b-amd64-avx512.S
@@ -31,8 +31,6 @@
#include "asm-common-amd64.h"
-.text
-
/* register macros */
#define RSTATE %rdi
#define RINBLKS %rsi
@@ -180,9 +178,11 @@
G2(ROW1, ROW2, ROW3, ROW4, m4); \
UNDIAGONALIZE(ROW1, ROW2, ROW3, ROW4)
-ELF(.type blake2b_data,@object;)
-blake2b_data:
+SECTION_RODATA
+
.align 32
+ELF(.type _blake2b_avx512_data,@object;)
+_blake2b_avx512_data:
.Liv:
.quad 0x6a09e667f3bcc908, 0xbb67ae8584caa73b
.quad 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1
@@ -209,6 +209,8 @@ blake2b_data:
.Lgmask9:
GEN_GMASK(10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0)
+.text
+
.align 64
.globl _gcry_blake2b_transform_amd64_avx512
ELF(.type _gcry_blake2b_transform_amd64_avx512,@function;)
diff --git a/cipher/blake2s-amd64-avx.S b/cipher/blake2s-amd64-avx.S
index 5094b4c1..44b82ab2 100644
--- a/cipher/blake2s-amd64-avx.S
+++ b/cipher/blake2s-amd64-avx.S
@@ -31,8 +31,6 @@
#include "asm-common-amd64.h"
-.text
-
/* register macros */
#define RSTATE %rdi
#define RINBLKS %rsi
@@ -171,8 +169,11 @@
G2(ROW1, ROW2, ROW3, ROW4, m4); \
UNDIAGONALIZE(ROW1, ROW2, ROW3, ROW4);
-blake2s_data:
+SECTION_RODATA
+
.align 16
+ELF(.type _blake2s_avx_data,@object;)
+_blake2s_avx_data:
.Liv:
.long 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A
.long 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
@@ -181,6 +182,8 @@ blake2s_data:
.Lshuf_ror8:
.byte 1,2,3,0,5,6,7,4,9,10,11,8,13,14,15,12
+.text
+
.align 64
.globl _gcry_blake2s_transform_amd64_avx
ELF(.type _gcry_blake2s_transform_amd64_avx,@function;)
diff --git a/cipher/blake2s-amd64-avx512.S b/cipher/blake2s-amd64-avx512.S
index ddcdfd67..e2da2a18 100644
--- a/cipher/blake2s-amd64-avx512.S
+++ b/cipher/blake2s-amd64-avx512.S
@@ -31,8 +31,6 @@
#include "asm-common-amd64.h"
-.text
-
/* register macros */
#define RSTATE %rdi
#define RINBLKS %rsi
@@ -164,13 +162,17 @@
G2(ROW1, ROW2, ROW3, ROW4, m4); \
UNDIAGONALIZE(ROW1, ROW2, ROW3, ROW4);
-ELF(.type blake2s_data,@object;)
-blake2s_data:
+SECTION_RODATA
+
+ELF(.type _blake2s_avx512_data,@object;)
.align 16
+_blake2s_avx512_data:
.Liv:
.long 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A
.long 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
+.text
+
.align 64
.globl _gcry_blake2s_transform_amd64_avx512
ELF(.type _gcry_blake2s_transform_amd64_avx512,@function;)
diff --git a/cipher/poly1305-amd64-avx512.S b/cipher/poly1305-amd64-avx512.S
index 9beed8ad..cf176129 100644
--- a/cipher/poly1305-amd64-avx512.S
+++ b/cipher/poly1305-amd64-avx512.S
@@ -44,7 +44,7 @@
.intel_syntax noprefix
-.text
+SECTION_RODATA
ELF(.type _gcry_poly1305_avx512_consts,@object)
_gcry_poly1305_avx512_consts:
@@ -1575,7 +1575,10 @@ ELF(.size _gcry_poly1305_avx512_consts,.-_gcry_poly1305_avx512_consts)
;; arg3 - Input/output hash
;; arg4 - Poly1305 key
*/
-.align 32
+
+.text
+
+.align 64
.globl _gcry_poly1305_amd64_avx512_blocks
ELF(.type _gcry_poly1305_amd64_avx512_blocks,@function;)
_gcry_poly1305_amd64_avx512_blocks:
diff --git a/cipher/sha1-avx-amd64.S b/cipher/sha1-avx-amd64.S
index acada960..5b9e0500 100644
--- a/cipher/sha1-avx-amd64.S
+++ b/cipher/sha1-avx-amd64.S
@@ -47,7 +47,10 @@
/* Constants */
-.text
+SECTION_RODATA
+
+ELF(.type _sha1_avx_consts,@object)
+_sha1_avx_consts:
#define K1 0x5A827999
#define K2 0x6ED9EBA1
#define K3 0x8F1BBCDC
@@ -195,6 +198,7 @@
vpaddd (.LK_XMM + ((i)/20)*16) rRIP, W, tmp0; \
vmovdqa tmp0, WK((i)&~3);
+.text
/*
* Transform nblks*64 bytes (nblks*16 32-bit words) at DATA.
@@ -205,7 +209,7 @@
*/
.globl _gcry_sha1_transform_amd64_avx
ELF(.type _gcry_sha1_transform_amd64_avx,@function)
-.align 16
+.align 64
_gcry_sha1_transform_amd64_avx:
/* input:
* %rdi: ctx, CTX
diff --git a/cipher/sha1-avx-bmi2-amd64.S b/cipher/sha1-avx-bmi2-amd64.S
index 5f4b9e69..9df147c2 100644
--- a/cipher/sha1-avx-bmi2-amd64.S
+++ b/cipher/sha1-avx-bmi2-amd64.S
@@ -48,7 +48,11 @@
/* Constants */
-.text
+SECTION_RODATA
+
+ELF(.type _sha1_avx_bmi2_consts,@object)
+_sha1_avx_bmi2_consts:
+
.align 16
.Lbswap_shufb_ctl:
.long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f
@@ -194,6 +198,7 @@
vpaddd K, W, tmp0; \
vmovdqa tmp0, WK((i)&~3);
+.text
/*
* Transform nblks*64 bytes (nblks*16 32-bit words) at DATA.
@@ -204,7 +209,7 @@
*/
.globl _gcry_sha1_transform_amd64_avx_bmi2
ELF(.type _gcry_sha1_transform_amd64_avx_bmi2,@function)
-.align 16
+.align 64
_gcry_sha1_transform_amd64_avx_bmi2:
/* input:
* %rdi: ctx, CTX
diff --git a/cipher/sha1-avx2-bmi2-amd64.S b/cipher/sha1-avx2-bmi2-amd64.S
index ed52761b..0db1d9b9 100644
--- a/cipher/sha1-avx2-bmi2-amd64.S
+++ b/cipher/sha1-avx2-bmi2-amd64.S
@@ -48,9 +48,13 @@
/* Constants */
+SECTION_RODATA
+
#define WK_STACK_WORDS (80 * 2)
-.text
+ELF(.type _sha1_avx2_bmi2_consts,@object)
+_sha1_avx2_bmi2_consts:
+
.align 16
.Lbswap_shufb_ctl:
.long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f
@@ -200,6 +204,7 @@
vpaddd K, W, tmp0; \
vmovdqa tmp0, PRE_WK((i)&~3);
+.text
/*
* Transform 2*nblks*64 bytes (2*nblks*16 32-bit words) at DATA.
@@ -210,7 +215,7 @@
*/
.globl _gcry_sha1_transform_amd64_avx2_bmi2
ELF(.type _gcry_sha1_transform_amd64_avx2_bmi2,@function)
-.align 16
+.align 64
_gcry_sha1_transform_amd64_avx2_bmi2:
/* input:
* %rdi: ctx, CTX
diff --git a/cipher/sha1-ssse3-amd64.S b/cipher/sha1-ssse3-amd64.S
index f09b1de1..afea6501 100644
--- a/cipher/sha1-ssse3-amd64.S
+++ b/cipher/sha1-ssse3-amd64.S
@@ -47,7 +47,11 @@
/* Constants */
-.text
+SECTION_RODATA
+
+ELF(.type _sha1_ssse3_consts,@object)
+_sha1_ssse3_consts:
+
#define K1 0x5A827999
#define K2 0x6ED9EBA1
#define K3 0x8F1BBCDC
@@ -207,6 +211,7 @@
#define CLEAR_REG(reg) pxor reg, reg;
+.text
/*
* Transform nblks*64 bytes (nblks*16 32-bit words) at DATA.
@@ -217,7 +222,7 @@
*/
.globl _gcry_sha1_transform_amd64_ssse3
ELF(.type _gcry_sha1_transform_amd64_ssse3,@function)
-.align 16
+.align 64
_gcry_sha1_transform_amd64_ssse3:
/* input:
* %rdi: ctx, CTX
diff --git a/cipher/sha256-avx-amd64.S b/cipher/sha256-avx-amd64.S
index be8a799d..8b2cbfe8 100644
--- a/cipher/sha256-avx-amd64.S
+++ b/cipher/sha256-avx-amd64.S
@@ -342,7 +342,7 @@
.text
.globl _gcry_sha256_transform_amd64_avx
ELF(.type _gcry_sha256_transform_amd64_avx,@function;)
-.align 16
+.align 64
_gcry_sha256_transform_amd64_avx:
CFI_STARTPROC()
vzeroupper
@@ -475,6 +475,11 @@ _gcry_sha256_transform_amd64_avx:
CFI_ENDPROC()
+SECTION_RODATA
+
+ELF(.type _sha256_avx_consts,@object)
+_sha256_avx_consts:
+
.align 16
.LK256:
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
diff --git a/cipher/sha256-avx2-bmi2-amd64.S b/cipher/sha256-avx2-bmi2-amd64.S
index 60ad442c..93919ead 100644
--- a/cipher/sha256-avx2-bmi2-amd64.S
+++ b/cipher/sha256-avx2-bmi2-amd64.S
@@ -247,7 +247,7 @@
.text
.globl _gcry_sha256_transform_amd64_avx2
ELF(.type _gcry_sha256_transform_amd64_avx2,@function)
-.align 32
+.align 64
_gcry_sha256_transform_amd64_avx2:
CFI_STARTPROC()
xor eax, eax
@@ -477,6 +477,12 @@ _gcry_sha256_transform_amd64_avx2:
ret_spec_stop
CFI_ENDPROC()
+
+SECTION_RODATA
+
+ELF(.type _sha256_avx2_consts,@object)
+_sha256_avx2_consts:
+
.align 64
.LK256:
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
diff --git a/cipher/sha256-ssse3-amd64.S b/cipher/sha256-ssse3-amd64.S
index 401ff6f4..41c15420 100644
--- a/cipher/sha256-ssse3-amd64.S
+++ b/cipher/sha256-ssse3-amd64.S
@@ -349,7 +349,7 @@
.text
.globl _gcry_sha256_transform_amd64_ssse3
ELF(.type _gcry_sha256_transform_amd64_ssse3,@function;)
-.align 16
+.align 64
_gcry_sha256_transform_amd64_ssse3:
CFI_STARTPROC()
push rbx
@@ -497,6 +497,11 @@ _gcry_sha256_transform_amd64_ssse3:
CFI_ENDPROC()
+SECTION_RODATA
+
+ELF(.type _sha256_ssse3_consts,@object)
+_sha256_ssse3_consts:
+
.align 16
.LK256:
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
diff --git a/cipher/sha512-avx-amd64.S b/cipher/sha512-avx-amd64.S
index bfc4435d..e8663756 100644
--- a/cipher/sha512-avx-amd64.S
+++ b/cipher/sha512-avx-amd64.S
@@ -246,7 +246,7 @@
*/
.globl _gcry_sha512_transform_amd64_avx
ELF(.type _gcry_sha512_transform_amd64_avx,@function;)
-.align 16
+.align 64
_gcry_sha512_transform_amd64_avx:
CFI_STARTPROC()
xor eax, eax
@@ -408,6 +408,11 @@ _gcry_sha512_transform_amd64_avx:
;;; Binary Data
*/
+SECTION_RODATA
+
+ELF(.type _sha512_avx_consts,@object)
+_sha512_avx_consts:
+
.align 16
/* Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. */
diff --git a/cipher/sha512-avx2-bmi2-amd64.S b/cipher/sha512-avx2-bmi2-amd64.S
index a431e196..6e6e1e43 100644
--- a/cipher/sha512-avx2-bmi2-amd64.S
+++ b/cipher/sha512-avx2-bmi2-amd64.S
@@ -274,7 +274,7 @@
*/
.globl _gcry_sha512_transform_amd64_avx2
ELF(.type _gcry_sha512_transform_amd64_avx2,@function;)
-.align 16
+.align 64
_gcry_sha512_transform_amd64_avx2:
CFI_STARTPROC()
xor eax, eax
@@ -445,6 +445,11 @@ _gcry_sha512_transform_amd64_avx2:
/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */
/*;; Binary Data */
+SECTION_RODATA
+
+ELF(.type _sha512_avx2_consts,@object)
+_sha512_avx2_consts:
+
.align 64
/* K[t] used in SHA512 hashing */
.LK512:
diff --git a/cipher/sha512-avx512-amd64.S b/cipher/sha512-avx512-amd64.S
index 431fb3e9..f113824c 100644
--- a/cipher/sha512-avx512-amd64.S
+++ b/cipher/sha512-avx512-amd64.S
@@ -256,7 +256,7 @@
*/
.globl _gcry_sha512_transform_amd64_avx512
ELF(.type _gcry_sha512_transform_amd64_avx512,@function;)
-.align 16
+.align 64
_gcry_sha512_transform_amd64_avx512:
CFI_STARTPROC()
xor eax, eax
@@ -404,6 +404,8 @@ ELF(.size _gcry_sha512_transform_amd64_avx512,.-_gcry_sha512_transform_amd64_avx
/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */
/*;; Binary Data */
+SECTION_RODATA
+
ELF(.type _gcry_sha512_avx512_consts,@object)
_gcry_sha512_avx512_consts:
.align 64
diff --git a/cipher/sha512-ssse3-amd64.S b/cipher/sha512-ssse3-amd64.S
index 9cc30892..0a26f215 100644
--- a/cipher/sha512-ssse3-amd64.S
+++ b/cipher/sha512-ssse3-amd64.S
@@ -249,7 +249,7 @@
*/
.globl _gcry_sha512_transform_amd64_ssse3
ELF(.type _gcry_sha512_transform_amd64_ssse3,@function;)
-.align 16
+.align 64
_gcry_sha512_transform_amd64_ssse3:
CFI_STARTPROC()
xor eax, eax
@@ -414,6 +414,11 @@ _gcry_sha512_transform_amd64_ssse3:
;;; Binary Data
*/
+SECTION_RODATA
+
+ELF(.type _sha512_ssse3_consts,@object)
+_sha512_ssse3_consts:
+
.align 16
/* Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. */
diff --git a/cipher/sm3-avx-bmi2-amd64.S b/cipher/sm3-avx-bmi2-amd64.S
index d9b6206a..9066be33 100644
--- a/cipher/sm3-avx-bmi2-amd64.S
+++ b/cipher/sm3-avx-bmi2-amd64.S
@@ -41,7 +41,7 @@
/* Constants */
-.text
+SECTION_RODATA
.align 16
ELF(.type _gcry_sm3_avx2_consts,@object)
_gcry_sm3_avx2_consts:
@@ -334,6 +334,8 @@ ELF(.size _gcry_sm3_avx2_consts,.-_gcry_sm3_avx2_consts)
vpxor w0, XTMP4, XTMP1; \
vmovdqa XTMP1, XW_W1W2_ADDR((round), 0);
+.text
+
/*
* Transform nblks*64 bytes (nblks*16 32-bit words) at DATA.
*
@@ -343,7 +345,7 @@ ELF(.size _gcry_sm3_avx2_consts,.-_gcry_sm3_avx2_consts)
*/
.globl _gcry_sm3_transform_amd64_avx_bmi2
ELF(.type _gcry_sm3_transform_amd64_avx_bmi2,@function)
-.align 16
+.align 64
_gcry_sm3_transform_amd64_avx_bmi2:
/* input:
* %rdi: ctx, CTX
diff --git a/cipher/whirlpool-sse2-amd64.S b/cipher/whirlpool-sse2-amd64.S
index 37648faa..b26dfed2 100644
--- a/cipher/whirlpool-sse2-amd64.S
+++ b/cipher/whirlpool-sse2-amd64.S
@@ -152,7 +152,7 @@
#define RB_ADD6 RB6, RB7, RB0, RB1, RB2, RB3, RB4, RB5
#define RB_ADD7 RB7, RB0, RB1, RB2, RB3, RB4, RB5, RB6
-.align 8
+.align 64
.globl _gcry_whirlpool_transform_amd64
ELF(.type _gcry_whirlpool_transform_amd64,@function;)
--
2.37.2