[PATCH 1/5] dilithium: Added optimized dilithium NTT support for ppc64le.
Danny Tsen
dtsen at us.ibm.com
Tue Feb 24 01:27:49 CET 2026
Optimized dilithium (ML-DSA) NTT algorithm for ppc64le (Power 8 and
above).
Signed-off-by: Danny Tsen <dtsen at us.ibm.com>
---
cipher/dilithium_ntt_p8le.S | 859 ++++++++++++++++++++++++++++++++++++
1 file changed, 859 insertions(+)
create mode 100644 cipher/dilithium_ntt_p8le.S
diff --git a/cipher/dilithium_ntt_p8le.S b/cipher/dilithium_ntt_p8le.S
new file mode 100644
index 00000000..8932d8e8
--- /dev/null
+++ b/cipher/dilithium_ntt_p8le.S
@@ -0,0 +1,859 @@
+/*
+ * This file was modified for use by Libgcrypt.
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * This file is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ * SPDX-License-Identifier: LGPL-2.1-or-later
+ *
+ * You can also use this file under the same license as the original code.
+ * SPDX-License-Identifier: CC0 OR Apache-2.0
+ *
+ */
+/*
+ *
+ * Copyright IBM Corp. 2025, 2026
+ *
+ * ===================================================================================
+ * Written by Danny Tsen <dtsen at us.ibm.com>
+ */
+
+/* Byte offsets into the mldsa_consts rodata block below:
+   the QINV broadcast vector lives at offset 16, the NTT zeta
+   table (mldsa_zetas) starts at offset 32.  */
+#define QINV_OFFSET 16
+#define ZETA_NTT_OFFSET 32
+
+/* ML-DSA modulus q = 2^23 - 2^13 + 1, and q^(-1) mod 2^32 as used by
+   the reference Dilithium montgomery_reduce().  */
+#define MLDSA_Q 8380417
+#define MLDSA_QINV 58728449
+
+/* Symbolic vector-register names for the constants and zetas.
+   V_ZETA aliases V_Z0 (v2): it is used when one zeta is broadcast
+   to all lanes, V_Z0..V_Z3 when four different zetas are loaded.  */
+#define QINV 0
+#define V_Q 1
+#define V_ZETA 2
+#define V_Z0 2
+#define V_Z1 3
+#define V_Z2 4
+#define V_Z3 5
+
+.machine "any"
+.text
+
+/*
+ * Allocate a 352-byte stack frame and save the non-volatile state used
+ * by mldsa_ntt_ppc: GPRs r14-r21 at 56(r1)..112(r1) and vector regs
+ * v20-v29 (as VSRs 52-61) at 128(r1)..272(r1).
+ * LR is kept live in r0 instead of being stored; this is safe because
+ * the function makes no calls before RESTORE_REGS (r0 stays untouched).
+ */
+.macro SAVE_REGS
+	stdu 1, -352(1)		/* push frame, save back chain */
+	mflr 0			/* keep LR in r0 until RESTORE_REGS */
+	std 14, 56(1)
+	std 15, 64(1)
+	std 16, 72(1)
+	std 17, 80(1)
+	std 18, 88(1)
+	std 19, 96(1)
+	std 20, 104(1)
+	std 21, 112(1)
+	li 10, 128
+	li 11, 144
+	li 12, 160
+	li 14, 176
+	li 15, 192
+	li 16, 208
+	stxvx 32+20, 10, 1	/* save v20-v25 */
+	stxvx 32+21, 11, 1
+	stxvx 32+22, 12, 1
+	stxvx 32+23, 14, 1
+	stxvx 32+24, 15, 1
+	stxvx 32+25, 16, 1
+	li 10, 224
+	li 11, 240
+	li 12, 256
+	li 14, 272
+	stxvx 32+26, 10, 1	/* save v26-v29 */
+	stxvx 32+27, 11, 1
+	stxvx 32+28, 12, 1
+	stxvx 32+29, 14, 1
+.endm
+
+/*
+ * Undo SAVE_REGS: reload v20-v29 and r14-r21 from the frame, restore
+ * LR from r0 (still live; no calls were made) and pop the 352-byte
+ * frame.  Offsets mirror SAVE_REGS exactly.
+ */
+.macro RESTORE_REGS
+	li 10, 128
+	li 11, 144
+	li 12, 160
+	li 14, 176
+	li 15, 192
+	li 16, 208
+	lxvx 32+20, 10, 1	/* restore v20-v25 */
+	lxvx 32+21, 11, 1
+	lxvx 32+22, 12, 1
+	lxvx 32+23, 14, 1
+	lxvx 32+24, 15, 1
+	lxvx 32+25, 16, 1
+	li 10, 224
+	li 11, 240
+	li 12, 256
+	li 14, 272
+	lxvx 32+26, 10, 1	/* restore v26-v29 */
+	lxvx 32+27, 11, 1
+	lxvx 32+28, 12, 1
+	lxvx 32+29, 14, 1
+	ld 14, 56(1)
+	ld 15, 64(1)
+	ld 16, 72(1)
+	ld 17, 80(1)
+	ld 18, 88(1)
+	ld 19, 96(1)
+	ld 20, 104(1)
+	ld 21, 112(1)
+
+	mtlr 0
+	addi 1, 1, 352
+.endm
+
+/*
+ * Init_Coeffs_offset: initial offset setup for the coefficient array.
+ *
+ * start: beginning of the offset to the coefficient array.
+ * next: Next offset.
+ * len: Index difference between coefficients.
+ *
+ * r7: len * 4, the byte distance between r[j] and r[j+len]
+ * (each coefficient is 32 bits, i.e. 4 bytes).
+ *
+ * registers used for offset to coefficients, r[j] and r[j+len]
+ * R9: offset to r0 = j
+ * R16: offset to r1 = r0 + next
+ * R18: offset to r2 = r1 + next
+ * R20: offset to r3 = r2 + next
+ *
+ * R10: offset to r'0 = r0 + len*4
+ * R17: offset to r'1 = r'0 + next
+ * R19: offset to r'2 = r'1 + next
+ * R21: offset to r'3 = r'2 + next
+ *
+ */
+.macro Init_Coeffs_offset start next
+	li 9, \start /* first offset to j */
+	add 10, 7, 9 /* j + len*4; r7 holds the byte stride len*4 */
+	addi 16, 9, \next
+	addi 17, 10, \next
+	addi 18, 16, \next
+	addi 19, 17, \next
+	addi 20, 18, \next
+	addi 21, 19, \next
+.endm
+
+/*
+ * For Len=1, load 1-1-1-1 layout
+ *
+ * Load Coefficients and setup vectors
+ * rj0, rjlen1, rj2, rjlen3
+ * rj4, rjlen5, rj6, rjlen7
+ *
+ * Each vmrgew and vmrgow will transpose vectors as,
+ *
+ * rj vector = (rj0, rj4, rj2, rj6)
+ * rjlen vector = (rjlen1, rjlen5, rjlen3, rjlen7)
+ *
+ * r' =r[j+len]: V6, V7, V8, V9
+ * r = r[j]: V26, V27, V28, V29
+ *
+ * In order to do the coefficients computation, zeta vector will arrange
+ * in the proper order to match the multiplication.
+ */
+.macro Load_41Coeffs
+	/* Load 8 vectors of interleaved coefficients from r5 and split
+	   them with vmrgew/vmrgow: even words -> r[j+len] (v6-v9),
+	   odd words -> r[j] (v26-v29).  Offsets r10/r11/r12 and
+	   r15/r16/r17/r18 are pre-set to 16..112 by the caller.  */
+	lxvd2x 32+10, 0, 5
+	lxvd2x 32+11, 10, 5
+	vmrgew 6, 10, 11
+	vmrgow 26, 10, 11
+	lxvd2x 32+12, 11, 5
+	lxvd2x 32+13, 12, 5
+	vmrgew 7, 12, 13
+	vmrgow 27, 12, 13
+	lxvd2x 32+10, 15, 5
+	lxvd2x 32+11, 16, 5
+	vmrgew 8, 10, 11
+	vmrgow 28, 10, 11
+	lxvd2x 32+12, 17, 5
+	lxvd2x 32+13, 18, 5
+	vmrgew 9, 12, 13
+	vmrgow 29, 12, 13
+.endm
+
+/*
+ * For Len=2, Load 2 - 2 - 2 - 2 layout
+ *
+ * Load Coefficients and setup vectors for 8 coefficients in the
+ * following order,
+ * rj0, rj1, rjlen2, rjlen3,
+ * rj4, rj5, rjlen6, rjlen7
+ * Each xxpermdi will transpose vectors as,
+ * r[j]= rj0, rj1, rj4, rj5
+ * r[j+len]= rjlen2, rjlen3, rjlen6, rjlen7
+ *
+ * r' = r[j+len]: V6, V7, V8, V9
+ * r = r[j]: V26, V27, V28, V29
+ *
+ * In order to do the coefficients computation, zeta vector will arrange
+ * in the proper order to match the multiplication.
+ */
+.macro Load_42Coeffs
+	/* Load 8 vectors from r5 and split each doubleword pair with
+	   xxpermdi: high halves -> r[j] (v26-v29), low halves ->
+	   r[j+len] (v6-v9).  Offset registers are pre-set by the
+	   caller (r10/r11/r12, r15-r18 = 16..112).  */
+	lxvd2x 1, 0, 5
+	lxvd2x 2, 10, 5
+	xxpermdi 32+6, 1, 2, 3
+	xxpermdi 32+26, 1, 2, 0
+	lxvd2x 3, 11, 5
+	lxvd2x 4, 12, 5
+	xxpermdi 32+7, 3, 4, 3
+	xxpermdi 32+27, 3, 4, 0
+	lxvd2x 1, 15, 5
+	lxvd2x 2, 16, 5
+	xxpermdi 32+8, 1, 2, 3
+	xxpermdi 32+28, 1, 2, 0
+	lxvd2x 3, 17, 5
+	lxvd2x 4, 18, 5
+	xxpermdi 32+9, 3, 4, 3
+	xxpermdi 32+29, 3, 4, 0
+.endm
+
+/*
+ * For Len=8,
+ * Load coefficient with 2 legs with 64 bytes apart in
+ * r[j+len] (r') vectors from offset, R10, R17, R19 and R21
+ * r[j+len]: V6, V7, V8, V9
+ */
+.macro Load_22Coeffs start next
+	/* Set up the r[j] offsets (r9/r16/r18/r20) and the r[j+len]
+	   offsets (r10/r17/r19/r21); the second pair of vectors starts
+	   64 bytes after \start.  Then load the four r[j+len] vectors
+	   into v6-v9 (r[j] is loaded later by Load_4Rj).  */
+	li 9, \start
+	add 10, 7, 9
+	addi 16, 9, \next
+	addi 17, 10, \next
+	li 18, \start+64
+	add 19, 7, 18
+	addi 20, 18, \next
+	addi 21, 19, \next
+	lxvd2x 32+6, 3, 10
+	lxvd2x 32+7, 3, 17
+	lxvd2x 32+8, 3, 19
+	lxvd2x 32+9, 3, 21
+.endm
+
+/*
+ * Load coefficient with 2 legs with len*2 bytes apart in
+ * r[j+len] (r') vectors from offset, R10, R17, R19 and R21
+ * r[j+len]: V6, V7, V8, V9
+ */
+.macro Load_4Coeffs start next
+	/* Compute both legs' offsets, then load the four r[j+len]
+	   vectors into v6-v9 (base pointer in r3).  */
+	Init_Coeffs_offset \start, \next
+
+	lxvd2x 32+6, 3, 10
+	lxvd2x 32+7, 3, 17
+	lxvd2x 32+8, 3, 19
+	lxvd2x 32+9, 3, 21
+.endm
+
+/*
+ * Load 4 r[j] (r) coefficient vectors:
+ * Load coefficient in vectors from offset, R9, R16, R18 and R20
+ * r[j]: V26, V27, V28, V29
+ */
+.macro Load_4Rj
+	/* Load the four r[j] vectors (base r3, offsets r9/r16/r18/r20)
+	   into v26-v29.  */
+	lxvd2x 32+26, 3, 9
+	lxvd2x 32+27, 3, 16
+	lxvd2x 32+28, 3, 18
+	lxvd2x 32+29, 3, 20
+.endm
+
+/*
+ * Compute the final r[j] and r[j+len]
+ * final r[j+len]: V18, V19, V20, V21
+ * final r[j]: V22, V23, V24, V25
+ */
+.macro Compute_4Coeff
+	/* Butterfly: with t = reduced zeta*r[j+len] in v10-v13 and
+	   r[j] in v26-v29, produce
+	   r[j+len] = r[j] - t  (v18-v21) and
+	   r[j]     = r[j] + t  (v22-v25).  */
+	vsubuwm 18, 26, 10
+	vadduwm 22, 26, 10
+
+	vsubuwm 19, 27, 11
+	vadduwm 23, 27, 11
+
+	vsubuwm 20, 28, 12
+	vadduwm 24, 28, 12
+
+	vsubuwm 21, 29, 13
+	vadduwm 25, 29, 13
+.endm
+
+.macro Write_One
+	/* Store the final r[j] vectors (v22-v25) at offsets
+	   r9/r16/r18/r20 and the final r[j+len] vectors (v18-v21)
+	   at offsets r10/r17/r19/r21 (base pointer r3).  */
+	stxvd2x 32+22, 3, 9
+	stxvd2x 32+18, 3, 10
+	stxvd2x 32+23, 3, 16
+	stxvd2x 32+19, 3, 17
+	stxvd2x 32+24, 3, 18
+	stxvd2x 32+20, 3, 19
+	stxvd2x 32+25, 3, 20
+	stxvd2x 32+21, 3, 21
+.endm
+
+/*
+ * Transpose the final coefficients of 2-2-2-2 layout to the original
+ * coefficient array order.
+ */
+.macro PermWrite42
+	/* Re-interleave the r[j] (v22-v25) and r[j+len] (v18-v21)
+	   doubleword halves back into the original 2-2-2-2 memory
+	   layout and store via r5 (inverse of Load_42Coeffs).  */
+	xxpermdi 32+10, 32+22, 32+18, 0
+	xxpermdi 32+14, 32+22, 32+18, 3
+	xxpermdi 32+11, 32+23, 32+19, 0
+	xxpermdi 32+15, 32+23, 32+19, 3
+	xxpermdi 32+12, 32+24, 32+20, 0
+	xxpermdi 32+16, 32+24, 32+20, 3
+	xxpermdi 32+13, 32+25, 32+21, 0
+	xxpermdi 32+17, 32+25, 32+21, 3
+	stxvd2x 32+10, 0, 5
+	stxvd2x 32+14, 10, 5
+	stxvd2x 32+11, 11, 5
+	stxvd2x 32+15, 12, 5
+	stxvd2x 32+12, 15, 5
+	stxvd2x 32+16, 16, 5
+	stxvd2x 32+13, 17, 5
+	stxvd2x 32+17, 18, 5
+.endm
+
+/*
+ * Transpose the final coefficients of 1-1-1-1 layout to the original
+ * coefficient array order.
+ */
+.macro PermWrite41
+	/* Re-interleave the r[j+len] (v18-v21) and r[j] (v22-v25)
+	   word lanes back into the original 1-1-1-1 memory layout and
+	   store via r5 (inverse of Load_41Coeffs).  */
+	vmrgew 10, 18, 22
+	vmrgow 11, 18, 22
+	vmrgew 12, 19, 23
+	vmrgow 13, 19, 23
+	vmrgew 14, 20, 24
+	vmrgow 15, 20, 24
+	vmrgew 16, 21, 25
+	vmrgow 17, 21, 25
+	stxvd2x 32+10, 0, 5
+	stxvd2x 32+11, 10, 5
+	stxvd2x 32+12, 11, 5
+	stxvd2x 32+13, 12, 5
+	stxvd2x 32+14, 15, 5
+	stxvd2x 32+15, 16, 5
+	stxvd2x 32+16, 17, 5
+	stxvd2x 32+17, 18, 5
+.endm
+
+.macro Load_next_4zetas
+	/* Load the next four zeta vectors into V_Z0-V_Z3 and advance
+	   the zeta pointer (r14) by 64 bytes.
+	   NOTE: clobbers r10/r11/r12 with 16/32/48 -- the same values
+	   the Load_41Coeffs/Load_42Coeffs offset setup uses, so calling
+	   this inside those loops is safe by design.  */
+	li 10, 16
+	li 11, 32
+	li 12, 48
+	lxvd2x 32+V_Z0, 0, 14
+	lxvd2x 32+V_Z1, 10, 14
+	lxvd2x 32+V_Z2, 11, 14
+	lxvd2x 32+V_Z3, 12, 14
+	addi 14, 14, 64
+.endm
+
+/*
+ * montgomery_reduce
+ * a = zeta * a[j+len]
+ * t = (int64_t)(int32_t)a*QINV;
+ * t = (a - (int64_t)t*Q) >> 32;
+ *
+ * -----------------------------------
+ * MREDUCE_4X(_vz0, _vz1, _vz2, _vz3)
+ */
+.macro MREDUCE_4x _vz0 _vz1 _vz2 _vz3
+	/* Signed 32x32 multiplies of r[j+len] (v6-v9) by the zetas;
+	   vmulesw/vmulosw produce the 64-bit products of the even/odd
+	   word lanes in doubleword lanes of v10-v17.  */
+	vmulesw 10, 6, \_vz0
+	vmulosw 11, 6, \_vz0
+	vmulesw 12, 7, \_vz1
+	vmulosw 13, 7, \_vz1
+	vmulesw 14, 8, \_vz2
+	vmulosw 15, 8, \_vz2
+	vmulesw 16, 9, \_vz3
+	vmulosw 17, 9, \_vz3
+
+	/* t = (int32_t)a * QINV: multiply the low word of each 64-bit
+	   product by q^(-1) mod 2^32.  */
+	vmulosw 18, 10, QINV
+	vmulosw 19, 11, QINV
+	vmulosw 20, 12, QINV
+	vmulosw 21, 13, QINV
+	vmulosw 22, 14, QINV
+	vmulosw 23, 15, QINV
+	vmulosw 24, 16, QINV
+	vmulosw 25, 17, QINV
+
+	/* t * Q (64-bit per lane) */
+	vmulosw 18, 18, V_Q
+	vmulosw 19, 19, V_Q
+	vmulosw 20, 20, V_Q
+	vmulosw 21, 21, V_Q
+	vmulosw 22, 22, V_Q
+	vmulosw 23, 23, V_Q
+	vmulosw 24, 24, V_Q
+	vmulosw 25, 25, V_Q
+
+	/* a - t*Q; the Montgomery result is the high 32 bits
+	   (i.e. the >> 32 of the reference montgomery_reduce).  */
+	vsubudm 18, 10, 18
+	vsubudm 19, 11, 19
+	vsubudm 20, 12, 20
+	vsubudm 21, 13, 21
+	vsubudm 22, 14, 22
+	vsubudm 23, 15, 23
+	vsubudm 24, 16, 24
+	vsubudm 25, 17, 25
+
+	/* Merge the high words of the even/odd halves back into four
+	   32-bit result vectors v10-v13.  */
+	vmrgew 10, 18, 19
+	vmrgew 11, 20, 21
+	vmrgew 12, 22, 23
+	vmrgew 13, 24, 25
+.endm
+
+/*
+ * For Len=1, layer with 1-1-1-1 layout.
+ */
+.macro NTT_MREDUCE_41x
+	/* One len=1 step: 4 zetas, 1-1-1-1 interleave, 32 coefficients;
+	   r5 advances by 128 bytes.  r[j] is produced by Load_41Coeffs,
+	   so no Load_4Rj is needed.  */
+	Load_next_4zetas
+	Load_41Coeffs
+	MREDUCE_4x V_Z0, V_Z1, V_Z2, V_Z3
+	Compute_4Coeff
+	PermWrite41
+	addi 5, 5, 128
+.endm
+
+/*
+ * For Len=2, layer with 2-2-2-2 layout.
+ */
+.macro NTT_MREDUCE_42x
+	/* One len=2 step: 4 zetas, 2-2-2-2 interleave, 32 coefficients;
+	   r5 advances by 128 bytes.  */
+	Load_next_4zetas
+	Load_42Coeffs
+	MREDUCE_4x V_Z0, V_Z1, V_Z2, V_Z3
+	Compute_4Coeff
+	PermWrite42
+	addi 5, 5, 128
+.endm
+
+/*
+ * For Len=8
+ */
+.macro NTT_MREDUCE_22x start next _vz0 _vz1 _vz2 _vz3
+	/* One len=8 step: two vector pairs share each zeta, second leg
+	   64 bytes after \start.  */
+	Load_22Coeffs \start, \next
+	MREDUCE_4x \_vz0, \_vz1, \_vz2, \_vz3
+	Load_4Rj
+	Compute_4Coeff
+	Write_One
+.endm
+
+/*
+ * For Len=128, 64, 32, 16 and 4.
+ */
+.macro NTT_MREDUCE_4x start next _vz0 _vz1 _vz2 _vz3
+	/* One generic step (len = 128, 64, 32, 16 or 4): load both
+	   legs, Montgomery-reduce zeta*r[j+len], butterfly, store.  */
+	Load_4Coeffs \start, \next
+	MREDUCE_4x \_vz0, \_vz1, \_vz2, \_vz3
+	Load_4Rj
+	Compute_4Coeff
+	Write_One
+.endm
+
+/*
+ * mldsa_ntt_ppc(int32_t *r)
+ * Compute forward NTT based on the following 8 layers -
+ * len = 128, 64, 32, 16, 8, 4, 2, 1.
+ *
+ * Each layer computes the coefficients in 2 legs, at byte offsets start
+ * and start + len*4.
+ *
+ * leg 1 leg 2
+ * ----- -----
+ * start start+len*4
+ * start+next start+len*4+next
+ * start+next+next start+len*4+next+next
+ * start+next+next+next start+len*4+next+next+next
+ *
+ * Each computation loads 8 vectors, 4 for each leg.
+ * The final coefficient (t) from each vector of leg1 and leg2 then do the
+ * add/sub operations to obtain the final results.
+ *
+ * -> leg1 = leg1 + t, leg2 = leg1 - t
+ *
+ * The resulting coefficients then store back to each leg's offset.
+ *
+ * Each vector has the same corresponding zeta except len=2.
+ *
+ * len=2 has 2-2-2-2 layout which means every 2 32-bit coefficients has the same zeta.
+ * e.g.
+ * coeff vector a1 a2 a3 a4 a5 a6 a7 a8
+ * zeta vector z1 z1 z2 z2 z3 z3 z4 z4
+ *
+ * For len=2, each vector will get permuted to leg1 and leg2. Zeta is
+ * pre-arranged for the leg1 and leg2. After the computation, each vector needs
+ * to transpose back to its original 2-2-2-2 layout.
+ *
+ */
+/*
+ * void mldsa_ntt_ppc(int32_t r[256])
+ *
+ * In:      r3 = pointer to the 256-entry ML-DSA coefficient array
+ * Out:     r[] transformed in place by the forward NTT
+ * Clobber: r0, r5, r7-r12, v0-v19, vs0-vs4, cr;
+ *          r14-r21 and v20-v29 are saved/restored by SAVE_REGS.
+ *
+ * NOTE(review): r2 (TOC pointer) is used without a global-entry
+ * prologue; this assumes ELFv2 intra-module calls where the caller's
+ * r2 is valid -- confirm against libgcrypt's PPC assembly conventions.
+ */
+.global mldsa_ntt_ppc
+.align 4
+mldsa_ntt_ppc:
+
+	SAVE_REGS
+
+	/* load the Q and QINV broadcast vectors from mldsa_consts */
+	addis 8,2,mldsa_consts@toc@ha
+	addi 8,8,mldsa_consts@toc@l
+	lvx V_Q, 0, 8
+	li 10, QINV_OFFSET
+	lvx QINV, 10, 8
+
+	/* set zetas array pointer (r14) */
+	addi 14, 8, ZETA_NTT_OFFSET
+
+	/*
+	 * 1. len = 128, start = 0
+	 *
+	 * Compute coefficients of the NTT based on 2 legs,
+	 * 0 - 128
+	 * 16 - 144
+	 * 32 - 160
+	 * ...
+	 * 112 - 240
+	 * These are indexes to the 32 bits array
+	 *
+	 * r7 is len * 4
+	 */
+	li 7, 512
+	lvx V_ZETA, 0, 14
+	addi 14, 14, 16
+
+	NTT_MREDUCE_4x 0, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA
+	NTT_MREDUCE_4x 64, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA
+	NTT_MREDUCE_4x 128, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA
+	NTT_MREDUCE_4x 192, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA
+	NTT_MREDUCE_4x 256, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA
+	NTT_MREDUCE_4x 320, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA
+	NTT_MREDUCE_4x 384, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA
+	NTT_MREDUCE_4x 448, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA
+
+.align 4
+	/*
+	 * 2. len = 64, start = 0, 128
+	 *
+	 * Compute coefficients of the NTT based on 2 legs,
+	 * 0 - 64
+	 * 16 - 80
+	 * 32 - 96
+	 * ...
+	 * 128 - 192
+	 * 144 - 208
+	 * 160 - 224
+	 * 176 - 240
+	 * These are indexes to the 32 bits array
+	 */
+	li 7, 256
+	lvx V_ZETA, 0, 14
+	addi 14, 14, 16
+	NTT_MREDUCE_4x 0, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA
+	NTT_MREDUCE_4x 64, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA
+	NTT_MREDUCE_4x 128, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA
+	NTT_MREDUCE_4x 192, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA
+
+	lvx V_ZETA, 0, 14
+	addi 14, 14, 16
+	NTT_MREDUCE_4x 512, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA
+	NTT_MREDUCE_4x 576, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA
+	NTT_MREDUCE_4x 640, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA
+	NTT_MREDUCE_4x 704, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA
+
+.align 4
+	/*
+	 * 3. len = 32, start = 0, 64, 128, 192
+	 *
+	 * Compute coefficients of the NTT based on 2 legs,
+	 * 0 - 32
+	 * ...
+	 * 64 - 96
+	 * ...
+	 * 128 - 160
+	 * ...
+	 * 192 - 224
+	 * ...
+	 *
+	 * These are indexes to the 32 bits array
+	 */
+	li 7, 128
+	lvx V_ZETA, 0, 14
+	addi 14, 14, 16
+	NTT_MREDUCE_4x 0, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA
+	NTT_MREDUCE_4x 64, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA
+
+	lvx V_ZETA, 0, 14
+	addi 14, 14, 16
+	NTT_MREDUCE_4x 256, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA
+	NTT_MREDUCE_4x 320, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA
+
+	lvx V_ZETA, 0, 14
+	addi 14, 14, 16
+	NTT_MREDUCE_4x 512, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA
+	NTT_MREDUCE_4x 576, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA
+
+	lvx V_ZETA, 0, 14
+	addi 14, 14, 16
+	NTT_MREDUCE_4x 768, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA
+	NTT_MREDUCE_4x 832, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA
+
+.align 4
+	/*
+	 * 4. len = 16, start = 0, 32, 64, 96, 128, 160, 192, 224
+	 *
+	 * Compute coefficients of the NTT based on 2 legs,
+	 * 0 - 16
+	 * 32 - 48
+	 * 64 - 80
+	 * ...
+	 * 192 - 208
+	 * 224 - 240
+	 *
+	 * These are indexes to the 32 bits array
+	 */
+	li 7, 64
+	lvx V_ZETA, 0, 14
+	addi 14, 14, 16
+	NTT_MREDUCE_4x 0, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA
+
+	lvx V_ZETA, 0, 14
+	addi 14, 14, 16
+	NTT_MREDUCE_4x 128, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA
+
+	lvx V_ZETA, 0, 14
+	addi 14, 14, 16
+	NTT_MREDUCE_4x 256, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA
+
+	lvx V_ZETA, 0, 14
+	addi 14, 14, 16
+	NTT_MREDUCE_4x 384, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA
+
+	lvx V_ZETA, 0, 14
+	addi 14, 14, 16
+	NTT_MREDUCE_4x 512, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA
+
+	lvx V_ZETA, 0, 14
+	addi 14, 14, 16
+	NTT_MREDUCE_4x 640, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA
+
+	lvx V_ZETA, 0, 14
+	addi 14, 14, 16
+	NTT_MREDUCE_4x 768, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA
+
+	lvx V_ZETA, 0, 14
+	addi 14, 14, 16
+	NTT_MREDUCE_4x 896, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA
+
+.align 4
+	/*
+	 * 5. len = 8, start = 0, 32, 64, 96, 128, 160, 192, 224
+	 *
+	 * Compute coefficients of the NTT based on 2 legs,
+	 * 0 - 8
+	 * 32 - 40
+	 * 64 - 72
+	 * ...
+	 * 192 - 200
+	 * 224 - 232
+	 *
+	 * These are indexes to the 32 bits array
+	 */
+
+	li 7, 32
+	Load_next_4zetas
+	NTT_MREDUCE_22x 0, 16, V_Z0, V_Z0, V_Z1, V_Z1
+	NTT_MREDUCE_22x 128, 16, V_Z2, V_Z2, V_Z3, V_Z3
+
+	Load_next_4zetas
+	NTT_MREDUCE_22x 256, 16, V_Z0, V_Z0, V_Z1, V_Z1
+	NTT_MREDUCE_22x 384, 16, V_Z2, V_Z2, V_Z3, V_Z3
+
+	Load_next_4zetas
+	NTT_MREDUCE_22x 512, 16, V_Z0, V_Z0, V_Z1, V_Z1
+	NTT_MREDUCE_22x 640, 16, V_Z2, V_Z2, V_Z3, V_Z3
+
+	Load_next_4zetas
+	NTT_MREDUCE_22x 768, 16, V_Z0, V_Z0, V_Z1, V_Z1
+	NTT_MREDUCE_22x 896, 16, V_Z2, V_Z2, V_Z3, V_Z3
+
+.align 4
+	/*
+	 * 6. len = 4, start = 0, 32, 64, 96, 128, 160, 192, 224
+	 *
+	 * Compute coefficients of the NTT based on 2 legs,
+	 * 0 - 4
+	 * 32 - 36
+	 * 64 - 68
+	 * ...
+	 * 192 - 196
+	 * 224 - 228
+	 *
+	 * These are indexes to the 32 bits array
+	 */
+
+	li 7, 16
+
+	Load_next_4zetas
+	NTT_MREDUCE_4x 0, 32, V_Z0, V_Z1, V_Z2, V_Z3
+
+	Load_next_4zetas
+	NTT_MREDUCE_4x 128, 32, V_Z0, V_Z1, V_Z2, V_Z3
+
+	Load_next_4zetas
+	NTT_MREDUCE_4x 256, 32, V_Z0, V_Z1, V_Z2, V_Z3
+
+	Load_next_4zetas
+	NTT_MREDUCE_4x 384, 32, V_Z0, V_Z1, V_Z2, V_Z3
+
+	Load_next_4zetas
+	NTT_MREDUCE_4x 512, 32, V_Z0, V_Z1, V_Z2, V_Z3
+
+	Load_next_4zetas
+	NTT_MREDUCE_4x 640, 32, V_Z0, V_Z1, V_Z2, V_Z3
+
+	Load_next_4zetas
+	NTT_MREDUCE_4x 768, 32, V_Z0, V_Z1, V_Z2, V_Z3
+
+	Load_next_4zetas
+	NTT_MREDUCE_4x 896, 32, V_Z0, V_Z1, V_Z2, V_Z3
+
+.align 4
+	/*
+	 * 7. len = 2, start = 0, 4, 8, 12,...244, 248, 252
+	 *
+	 * Compute coefficients of the NTT based on 2 legs,
+	 * 0 - 4
+	 * 8 - 12
+	 * 16 - 20
+	 * ...
+	 * 240 - 244
+	 * 248 - 252
+	 *
+	 * These are indexes to the 32 bits array
+	 */
+	mr 5, 3
+	li 7, 8
+
+	/* Fixed load/store offsets used by Load_42Coeffs/PermWrite42
+	   (and re-used by Load_41Coeffs/PermWrite41 in step 8).  */
+	li 10, 16
+	li 11, 32
+	li 12, 48
+	li 15, 64
+	li 16, 80
+	li 17, 96
+	li 18, 112
+
+	NTT_MREDUCE_42x
+	NTT_MREDUCE_42x
+	NTT_MREDUCE_42x
+	NTT_MREDUCE_42x
+	NTT_MREDUCE_42x
+	NTT_MREDUCE_42x
+	NTT_MREDUCE_42x
+	NTT_MREDUCE_42x
+
+.align 4
+	/*
+	 * 8. len = 1, start = 0, 2, 4, 6, 8, 10, 12,...254
+	 *
+	 * Compute coefficients of the NTT based on the following sequences,
+	 * 0, 1, 2, 3
+	 * 4, 5, 6, 7
+	 * 8, 9, 10, 11
+	 * 12, 13, 14, 15
+	 * ...
+	 * 240, 241, 242, 243
+	 * 244, 245, 246, 247
+	 * 248, 249, 250, 251
+	 * 252, 253, 254, 255
+	 *
+	 * These are indexes to the 32 bits array. Each loads 4 vectors.
+	 */
+	mr 5, 3
+	li 7, 4
+
+	NTT_MREDUCE_41x
+	NTT_MREDUCE_41x
+	NTT_MREDUCE_41x
+	NTT_MREDUCE_41x
+	NTT_MREDUCE_41x
+	NTT_MREDUCE_41x
+	NTT_MREDUCE_41x
+	NTT_MREDUCE_41x
+
+	RESTORE_REGS
+	blr
+.size mldsa_ntt_ppc,.-mldsa_ntt_ppc
+
+/* Read-only constants.  Use the explicit .section form: a bare
+   ".rodata" is not a standard GAS pseudo-op for PowerPC ELF targets.
+   Layout must match the offsets above: Q at 0, QINV at 16 (QINV_OFFSET)
+   and the zeta table at 32 (ZETA_NTT_OFFSET).  */
+.section .rodata
+.align 4
+mldsa_consts:
+.long MLDSA_Q, MLDSA_Q, MLDSA_Q, MLDSA_Q
+.long MLDSA_QINV, MLDSA_QINV, MLDSA_QINV, MLDSA_QINV
+
+/* zetas */
+mldsa_zetas:
+.long 25847, 25847, 25847, 25847, -2608894, -2608894, -2608894, -2608894
+.long -518909, -518909, -518909, -518909, 237124, 237124, 237124, 237124
+.long -777960, -777960, -777960, -777960, -876248, -876248, -876248, -876248
+.long 466468, 466468, 466468, 466468, 1826347, 1826347, 1826347, 1826347
+.long 2353451, 2353451, 2353451, 2353451, -359251, -359251, -359251, -359251
+.long -2091905, -2091905, -2091905, -2091905, 3119733, 3119733, 3119733, 3119733
+.long -2884855, -2884855, -2884855, -2884855, 3111497, 3111497, 3111497, 3111497
+.long 2680103, 2680103, 2680103, 2680103, 2725464, 2725464, 2725464, 2725464
+.long 1024112, 1024112, 1024112, 1024112, -1079900, -1079900, -1079900, -1079900
+.long 3585928, 3585928, 3585928, 3585928, -549488, -549488, -549488, -549488
+.long -1119584, -1119584, -1119584, -1119584, 2619752, 2619752, 2619752, 2619752
+.long -2108549, -2108549, -2108549, -2108549, -2118186, -2118186, -2118186, -2118186
+.long -3859737, -3859737, -3859737, -3859737, -1399561, -1399561, -1399561, -1399561
+.long -3277672, -3277672, -3277672, -3277672, 1757237, 1757237, 1757237, 1757237
+.long -19422, -19422, -19422, -19422, 4010497, 4010497, 4010497, 4010497
+.long 280005, 280005, 280005, 280005
+/*For Len=4 */
+.long 2706023, 2706023, 2706023, 2706023, 95776, 95776, 95776, 95776
+.long 3077325, 3077325, 3077325, 3077325, 3530437, 3530437, 3530437, 3530437
+.long -1661693, -1661693, -1661693, -1661693, -3592148, -3592148, -3592148, -3592148
+.long -2537516, -2537516, -2537516, -2537516, 3915439, 3915439, 3915439, 3915439
+.long -3861115, -3861115, -3861115, -3861115, -3043716, -3043716, -3043716, -3043716
+.long 3574422, 3574422, 3574422, 3574422, -2867647, -2867647, -2867647, -2867647
+.long 3539968, 3539968, 3539968, 3539968, -300467, -300467, -300467, -300467
+.long 2348700, 2348700, 2348700, 2348700, -539299, -539299, -539299, -539299
+.long -1699267, -1699267, -1699267, -1699267, -1643818, -1643818, -1643818, -1643818
+.long 3505694, 3505694, 3505694, 3505694, -3821735, -3821735, -3821735, -3821735
+.long 3507263, 3507263, 3507263, 3507263, -2140649, -2140649, -2140649, -2140649
+.long -1600420, -1600420, -1600420, -1600420, 3699596, 3699596, 3699596, 3699596
+.long 811944, 811944, 811944, 811944, 531354, 531354, 531354, 531354
+.long 954230, 954230, 954230, 954230, 3881043, 3881043, 3881043, 3881043
+.long 3900724, 3900724, 3900724, 3900724, -2556880, -2556880, -2556880, -2556880
+.long 2071892, 2071892, 2071892, 2071892, -2797779, -2797779, -2797779, -2797779
+/* For Len=2 */
+.long -3930395, -3930395, -1528703, -1528703, -3677745, -3677745, -3041255, -3041255
+.long -1452451, -1452451, 3475950, 3475950, 2176455, 2176455, -1585221, -1585221
+.long -1257611, -1257611, 1939314, 1939314, -4083598, -4083598, -1000202, -1000202
+.long -3190144, -3190144, -3157330, -3157330, -3632928, -3632928, 126922, 126922
+.long 3412210, 3412210, -983419, -983419, 2147896, 2147896, 2715295, 2715295
+.long -2967645, -2967645, -3693493, -3693493, -411027, -411027, -2477047, -2477047
+.long -671102, -671102, -1228525, -1228525, -22981, -22981, -1308169, -1308169
+.long -381987, -381987, 1349076, 1349076, 1852771, 1852771, -1430430, -1430430
+.long -3343383, -3343383, 264944, 264944, 508951, 508951, 3097992, 3097992
+.long 44288, 44288, -1100098, -1100098, 904516, 904516, 3958618, 3958618
+.long -3724342, -3724342, -8578, -8578, 1653064, 1653064, -3249728, -3249728
+.long 2389356, 2389356, -210977, -210977, 759969, 759969, -1316856, -1316856
+.long 189548, 189548, -3553272, -3553272, 3159746, 3159746, -1851402, -1851402
+.long -2409325, -2409325, -177440, -177440, 1315589, 1315589, 1341330, 1341330
+.long 1285669, 1285669, -1584928, -1584928, -812732, -812732, -1439742, -1439742
+.long -3019102, -3019102, -3881060, -3881060, -3628969, -3628969, 3839961, 3839961
+/* Setup zetas for Len=1 as (3, 2, 1, 4) order */
+.long 2316500, 2091667, 3817976, 3407706, -2446433, -3342478, -3562462, 2244091
+.long -1235728, 266997, 3513181, 2434439, -1197226, -3520352, -3193378, -3759364
+.long 909542, 900702, 819034, 1859098, -43260, 495491, -522500, -1613174
+.long 2031748, -655327, 3207046, -3122442, -768622, -3556995, -3595838, -525098
+.long -2437823, 342297, 4108315, 286988, 1735879, 3437287, 203044, -3342277
+.long -2590150, 2842341, 1265009, 2691481, 2486353, 4055324, 1595974, 1247620
+.long 2635921, -3767016, -3548272, 1250494, 1903435, -2994039, -1050970, 1869119
+.long -3318210, -1333058, -1430225, 1237275, 3306115, -451100, -1962642, 1312455
+.long -2546312, -1279661, -1374803, 1917081, 2235880, 1500165, 3406031, 777191
+.long -1671176, -542412, -1846953, -2831860, 594136, -2584293, -3776993, -3724270
+.long 2454455, -2013608, -164721, 2432395, 185531, 1957272, -1207385, 3369112
+.long 1616392, -3183426, 3014001, 162844, -3694233, 810149, -1799107, 1652634
+.long 3866901, -3038916, 269760, 3523897, 1717735, 2213111, 472078, -975884
+.long -1803090, -426683, 1910376, 1723600, -260646, -1667432, -3833893, -1104333
+.long -420899, -2939036, -2286327, -2235985, 1612842, 183443, -3545687, -976891
+.long -48306, -554416, -1362209, 3919660, -846154, 3937738, 1976782, 1400424
--
2.47.3
More information about the Gcrypt-devel
mailing list