[PATCH] mpi/amd64: use SSE2 for shifting instead of MMX
Jussi Kivilinna
jussi.kivilinna at iki.fi
Mon Nov 4 20:50:18 CET 2019
* mpi/amd64/mpih-lshift.S: Convert to SSE2.
* mpi/amd64/mpih-rshift.S: Ditto.
--
On current Intel processors, MMX instructions are slower than their SSE2
equivalents. Switch the lshift and rshift functions to use SSE2 registers
instead of MMX.
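
For reference, the operation both functions implement is the classic
multi-limb bit shift: each output limb combines bits from two adjacent
input limbs. Below is a plain-C sketch of the lshift case (illustrative
only; the name mpih_lshift_ref and the exact cnt constraint are mine,
but the high-to-low loop mirrors what the assembly computes for the
_gcry_mpih_lshift interface):

#include <stdint.h>

typedef uint64_t limb_t;

/* Plain-C sketch of what _gcry_mpih_lshift computes (not the patched
 * assembly itself): shift the usize-limb operand at 'up' left by cnt
 * bits, assuming 0 < cnt < 64, write the result to 'wp', and return
 * the bits shifted out of the most significant limb.  Walking from the
 * top limb down allows the wp >= up overlap the assembly also
 * supports. */
static limb_t
mpih_lshift_ref (limb_t *wp, const limb_t *up, long usize, unsigned cnt)
{
  unsigned tnc = 64 - cnt;          /* the "movl $64, %eax; subl" part */
  limb_t high = up[usize - 1];
  limb_t retval = high >> tnc;      /* bits shifted out at the top */
  long i;

  for (i = usize - 1; i > 0; i--)
    {
      limb_t low = up[i - 1];
      wp[i] = (high << cnt) | (low >> tnc);   /* psllq/psrlq/por */
      high = low;
    }
  wp[0] = high << cnt;
  return retval;
}

The assembly unrolls this loop two limbs per iteration, which is why the
diff below contains two nearly identical shift/or/store groups.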
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 2 files changed
diff --git a/mpi/amd64/mpih-lshift.S b/mpi/amd64/mpih-lshift.S
index 9e8979b10..009bece64 100644
--- a/mpi/amd64/mpih-lshift.S
+++ b/mpi/amd64/mpih-lshift.S
@@ -43,37 +43,37 @@
.globl C_SYMBOL_NAME(_gcry_mpih_lshift)
C_SYMBOL_NAME(_gcry_mpih_lshift:)
FUNC_ENTRY()
- movq -8(%rsi,%rdx,8), %mm7
- movd %ecx, %mm1
+ /* Note: %xmm6 and %xmm7 not used for WIN64 ABI compatibility. */
+ movq -8(%rsi,%rdx,8), %xmm4
+ movd %ecx, %xmm1
movl $64, %eax
subl %ecx, %eax
- movd %eax, %mm0
- movq %mm7, %mm3
- psrlq %mm0, %mm7
- movd %mm7, %rax
+ movd %eax, %xmm0
+ movq %xmm4, %xmm3
+ psrlq %xmm0, %xmm4
+ movd %xmm4, %rax
subq $2, %rdx
jl .Lendo
ALIGN(4) /* minimal alignment for claimed speed */
-.Loop: movq (%rsi,%rdx,8), %mm6
- movq %mm6, %mm2
- psrlq %mm0, %mm6
- psllq %mm1, %mm3
- por %mm6, %mm3
- movq %mm3, 8(%rdi,%rdx,8)
+.Loop: movq (%rsi,%rdx,8), %xmm5
+ movq %xmm5, %xmm2
+ psrlq %xmm0, %xmm5
+ psllq %xmm1, %xmm3
+ por %xmm5, %xmm3
+ movq %xmm3, 8(%rdi,%rdx,8)
je .Lende
- movq -8(%rsi,%rdx,8), %mm7
- movq %mm7, %mm3
- psrlq %mm0, %mm7
- psllq %mm1, %mm2
- por %mm7, %mm2
- movq %mm2, (%rdi,%rdx,8)
+ movq -8(%rsi,%rdx,8), %xmm4
+ movq %xmm4, %xmm3
+ psrlq %xmm0, %xmm4
+ psllq %xmm1, %xmm2
+ por %xmm4, %xmm2
+ movq %xmm2, (%rdi,%rdx,8)
subq $2, %rdx
jge .Loop
-.Lendo: movq %mm3, %mm2
-.Lende: psllq %mm1, %mm2
- movq %mm2, (%rdi)
- emms
+.Lendo: movq %xmm3, %xmm2
+.Lende: psllq %xmm1, %xmm2
+ movq %xmm2, (%rdi)
FUNC_EXIT()
ret
diff --git a/mpi/amd64/mpih-rshift.S b/mpi/amd64/mpih-rshift.S
index 7bd594216..db31060de 100644
--- a/mpi/amd64/mpih-rshift.S
+++ b/mpi/amd64/mpih-rshift.S
@@ -43,14 +43,15 @@
.globl C_SYMBOL_NAME(_gcry_mpih_rshift)
C_SYMBOL_NAME(_gcry_mpih_rshift:)
FUNC_ENTRY()
- movq (%rsi), %mm7
- movd %ecx, %mm1
+ /* Note: %xmm6 and %xmm7 not used for WIN64 ABI compatibility. */
+ movq (%rsi), %xmm4
+ movd %ecx, %xmm1
movl $64, %eax
subl %ecx, %eax
- movd %eax, %mm0
- movq %mm7, %mm3
- psllq %mm0, %mm7
- movd %mm7, %rax
+ movd %eax, %xmm0
+ movq %xmm4, %xmm3
+ psllq %xmm0, %xmm4
+ movd %xmm4, %rax
leaq (%rsi,%rdx,8), %rsi
leaq (%rdi,%rdx,8), %rdi
negq %rdx
@@ -58,25 +59,24 @@ C_SYMBOL_NAME(_gcry_mpih_rshift:)
jg .Lendo
ALIGN(4) /* minimal alignment for claimed speed */
-.Loop: movq -8(%rsi,%rdx,8), %mm6
- movq %mm6, %mm2
- psllq %mm0, %mm6
- psrlq %mm1, %mm3
- por %mm6, %mm3
- movq %mm3, -16(%rdi,%rdx,8)
+.Loop: movq -8(%rsi,%rdx,8), %xmm5
+ movq %xmm5, %xmm2
+ psllq %xmm0, %xmm5
+ psrlq %xmm1, %xmm3
+ por %xmm5, %xmm3
+ movq %xmm3, -16(%rdi,%rdx,8)
je .Lende
- movq (%rsi,%rdx,8), %mm7
- movq %mm7, %mm3
- psllq %mm0, %mm7
- psrlq %mm1, %mm2
- por %mm7, %mm2
- movq %mm2, -8(%rdi,%rdx,8)
+ movq (%rsi,%rdx,8), %xmm4
+ movq %xmm4, %xmm3
+ psllq %xmm0, %xmm4
+ psrlq %xmm1, %xmm2
+ por %xmm4, %xmm2
+ movq %xmm2, -8(%rdi,%rdx,8)
addq $2, %rdx
jle .Loop
-.Lendo: movq %mm3, %mm2
-.Lende: psrlq %mm1, %mm2
- movq %mm2, -8(%rdi)
- emms
+.Lendo: movq %xmm3, %xmm2
+.Lende: psrlq %xmm1, %xmm2
+ movq %xmm2, -8(%rdi)
FUNC_EXIT()
ret
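
Two details of the conversion may be worth spelling out. First, the emms
instruction disappears: MMX registers alias the x87 floating-point stack,
so MMX code must execute emms before any later FPU use, whereas SSE2
registers are independent state and need no such cleanup. Second, the new
code deliberately stays within %xmm0..%xmm5, because the Windows x64
calling convention treats %xmm6..%xmm15 as callee-saved; avoiding them
means the function needs no XMM save/restore. A minimal intrinsics sketch
of one shift/combine step, using the same SSE2 operations as the assembly
(illustrative only; the helper name lshift_step is mine):

#include <emmintrin.h>   /* SSE2 intrinsics */
#include <stdint.h>

/* One output limb of the left shift: psllq/psrlq with the shift count
 * held in an XMM register, then por.  Assumes 0 < cnt < 64. */
static uint64_t
lshift_step (uint64_t high, uint64_t low, unsigned cnt)
{
  __m128i vcnt = _mm_cvtsi32_si128 ((int) cnt);         /* movd %ecx, %xmm1 */
  __m128i vtnc = _mm_cvtsi32_si128 ((int) (64 - cnt));  /* movd %eax, %xmm0 */
  __m128i vhi  = _mm_cvtsi64_si128 ((int64_t) high);
  __m128i vlo  = _mm_cvtsi64_si128 ((int64_t) low);
  __m128i out  = _mm_or_si128 (_mm_sll_epi64 (vhi, vcnt),  /* psllq, por */
                               _mm_srl_epi64 (vlo, vtnc)); /* psrlq */
  return (uint64_t) _mm_cvtsi128_si64 (out);               /* movd %xmm, %rax */
}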