2 *****************************************************************************
3 Assembly code snippets for F[p] finite field arithmetic, used by fp.tcc .
4 Specific to x86-64, and used only if USE_ASM is defined.
5 On other architectures or without USE_ASM, fp.tcc uses a portable
6 C++ implementation instead.
7 *****************************************************************************
8 * @author This file is part of libff, developed by SCIPR Lab
9 * and contributors (see AUTHORS).
10 * @copyright MIT license (see LICENSE file)
11 *****************************************************************************/
21 #define STR_HELPER(x) #x
22 #define STR(x) STR_HELPER(x)
24 /* addq is faster than adcq, even if preceded by clc */
25 #define ADD_FIRSTADD() \
26 "movq (%[B]), %%rax \n\t" \
27 "addq %%rax, (%[A]) \n\t"
29 #define ADD_NEXTADD(ofs) \
30 "movq " STR(ofs) "(%[B]), %%rax \n\t" \
31 "adcq %%rax, " STR(ofs) "(%[A]) \n\t"
33 #define ADD_CMP(ofs) \
34 "movq " STR(ofs) "(%[mod]), %%rax \n\t" \
35 "cmpq %%rax, " STR(ofs) "(%[A]) \n\t" \
39 #define ADD_FIRSTSUB() \
40 "movq (%[mod]), %%rax \n\t" \
41 "subq %%rax, (%[A]) \n\t"
43 #define ADD_FIRSTSUB() \
44 "movq (%[mod]), %%rax \n\t" \
45 "subq %%rax, (%[A]) \n\t"
47 #define ADD_NEXTSUB(ofs) \
48 "movq " STR(ofs) "(%[mod]), %%rax \n\t" \
49 "sbbq %%rax, " STR(ofs) "(%[A]) \n\t"
51 #define SUB_FIRSTSUB() \
52 "movq (%[B]), %%rax\n\t" \
53 "subq %%rax, (%[A])\n\t"
55 #define SUB_NEXTSUB(ofs) \
56 "movq " STR(ofs) "(%[B]), %%rax\n\t" \
57 "sbbq %%rax, " STR(ofs) "(%[A])\n\t"
59 #define SUB_FIRSTADD() \
60 "movq (%[mod]), %%rax\n\t" \
61 "addq %%rax, (%[A])\n\t"
63 #define SUB_NEXTADD(ofs) \
64 "movq " STR(ofs) "(%[mod]), %%rax\n\t" \
65 "adcq %%rax, " STR(ofs) "(%[A])\n\t"
67 #define MONT_CMP(ofs) \
68 "movq " STR(ofs) "(%[M]), %%rax \n\t" \
69 "cmpq %%rax, " STR(ofs) "(%[tmp]) \n\t" \
73 #define MONT_FIRSTSUB() \
74 "movq (%[M]), %%rax \n\t" \
75 "subq %%rax, (%[tmp]) \n\t"
77 #define MONT_NEXTSUB(ofs) \
78 "movq " STR(ofs) "(%[M]), %%rax \n\t" \
79 "sbbq %%rax, " STR(ofs) "(%[tmp]) \n\t"
82 The x86-64 Montgomery multiplication here is similar to Algorithm 2 (CIOS
83 method) in http://eprint.iacr.org/2012/140.pdf and the PowerPC pseudocode of
84 gmp-ecm library (c) Paul Zimmermann and Alexander Kruppa (see comments on top
85 of powerpc64/mulredc.m4).
88 #define MONT_PRECOMPUTE() \
89 "xorq %[cy], %[cy] \n\t" \
90 "movq 0(%[A]), %%rax \n\t" \
92 "movq %%rax, %[T0] \n\t" \
93 "movq %%rdx, %[T1] # T1:T0 <- A[0] * B[0] \n\t" \
95 "movq %%rax, %[u] # u <- T0 * inv \n\t" \
97 "addq %[T0], %%rax \n\t" \
98 "adcq %%rdx, %[T1] \n\t" \
99 "adcq $0, %[cy] # cy:T1 <- (M[0]*u + T1 * b + T0) / " \
102 #define MONT_FIRSTITER(j) \
103 "xorq %[T0], %[T0] \n\t" \
104 "movq 0(%[A]), %%rax \n\t" \
105 "mulq " STR((j * 8)) "(%[B]) \n\t" \
106 "addq %[T1], %%rax \n\t" \
107 "movq %%rax, " STR(((j - 1) * 8)) "(%[tmp]) \n\t" \
108 "adcq $0, %%rdx \n\t" \
109 "movq %%rdx, %[T1] # now T1:tmp[j-1] " \
110 "<-- X[0] * Y[j] + T1\n\t" \
111 "movq " STR((j * 8)) "(%[M]), %%rax \n\t" \
113 "addq %%rax, " STR(((j - 1) * 8)) "(%[tmp]) \n\t" \
114 "adcq %[cy], %%rdx \n\t" \
115 "adcq $0, %[T0] \n\t" \
116 "xorq %[cy], %[cy] \n\t" \
117 "addq %%rdx, %[T1] \n\t" \
118 "adcq %[T0], %[cy] # " \
119 "cy:T1:tmp[j-1] <---- (X[0] * Y[j] + " \
120 "T1) + (M[j] * u + cy * b) \n\t"
122 #define MONT_ITERFIRST(i) \
123 "xorq %[cy], %[cy] \n\t" \
124 "movq " STR((i * 8)) "(%[A]), %%rax \n\t" \
125 "mulq 0(%[B]) \n\t" \
126 "addq 0(%[tmp]), %%rax \n\t" \
127 "adcq 8(%[tmp]), %%rdx \n\t" \
128 "adcq $0, %[cy] \n\t" \
129 "movq %%rax, %[T0] \n\t" \
130 "movq %%rdx, %[T1] # cy:T1:T0 <- A[i] * B[0] " \
131 "+ tmp[1] * b + tmp[0]\n\t" \
133 "movq %%rax, %[u] # u <- T0 * inv\n\t" \
134 "mulq 0(%[M]) \n\t" \
135 "addq %[T0], %%rax \n\t" \
136 "adcq %%rdx, %[T1] \n\t" \
137 "adcq $0, %[cy] # cy:T1 <- (M[0]*u + cy * " \
138 "b * b + T1 * b + T0) / b\n\t"
140 #define MONT_ITERITER(i, j) \
141 "xorq %[T0], %[T0] \n\t" \
142 "movq " STR((i * 8)) "(%[A]), %%rax \n\t" \
143 "mulq " STR((j * 8)) "(%[B]) \n\t" \
144 "addq %[T1], %%rax \n\t" \
145 "movq %%rax, " STR(((j - 1) * 8)) "(%[tmp]) \n\t" \
146 "adcq $0, %%rdx \n\t" \
147 "movq %%rdx, %[T1] # now " \
148 "T1:tmp[j-1] <-- X[i] * Y[j] + T1 \n\t" \
149 "movq " STR((j * 8)) "(%[M]), %%rax \n\t" \
151 "addq %%rax, " STR(((j - 1) * 8)) "(%[tmp]) \n\t" \
152 "adcq %[cy], %%rdx \n\t" \
153 "adcq $0, %[T0] \n\t" \
154 "xorq %[cy], %[cy] \n\t" \
155 "addq %%rdx, %[T1] \n\t" \
156 "adcq %[T0], %[cy] # cy:T1:tmp[j-1] <-- " \
157 "(X[i] * Y[j] + T1) + M[j] * u + cy * b \n\t" \
158 "addq " STR(((j + 1) * 8)) "(%[tmp]), %[T1] \n\t" \
159 "adcq $0, %[cy] # cy:T1:tmp[j-1] <-- " \
160 "(X[i] * Y[j] + T1) + M[j] * u + (tmp[j+1] + cy) * b \n\t"
162 #define MONT_FINALIZE(j) \
163 "movq %[T1], " STR((j * 8)) "(%[tmp]) \n\t" \
164 "movq %[cy], " STR(((j + 1) * 8)) "(%[tmp]) \n\t"
167 Comba multiplication and squaring routines are based on the
168 public-domain tomsfastmath library by Tom St Denis
169 <http://www.libtom.org/>
170 <https://github.com/libtom/tomsfastmath/blob/master/src/sqr/fp_sqr_comba.c
171 <https://github.com/libtom/tomsfastmath/blob/master/src/mul/fp_mul_comba.c>
173 Compared to the above, we save 5-20% of cycles by using careful register
174 renaming to implement Comba forward operation.
177 #define COMBA_3_BY_3_MUL(c0_, c1_, c2_, res_, A_, B_) \
179 "movq 0(%[A]), %%rax \n\t" \
180 "mulq 0(%[B]) \n\t" \
181 "movq %%rax, 0(%[res]) \n\t" \
182 "movq %%rdx, %[c0] \n\t" \
184 "xorq %[c1], %[c1] \n\t" \
185 "movq 0(%[A]), %%rax \n\t" \
186 "mulq 8(%[B]) \n\t" \
187 "addq %%rax, %[c0] \n\t" \
188 "adcq %%rdx, %[c1] \n\t" \
190 "xorq %[c2], %[c2] \n\t" \
191 "movq 8(%[A]), %%rax \n\t" \
192 "mulq 0(%[B]) \n\t" \
193 "addq %%rax, %[c0] \n\t" \
194 "movq %[c0], 8(%[res]) \n\t" \
195 "adcq %%rdx, %[c1] \n\t" \
196 "adcq $0, %[c2] \n\t" \
198 "// register renaming (c1, c2, c0)\n\t" \
199 "xorq %[c0], %[c0] \n\t" \
200 "movq 0(%[A]), %%rax \n\t" \
201 "mulq 16(%[B]) \n\t" \
202 "addq %%rax, %[c1] \n\t" \
203 "adcq %%rdx, %[c2] \n\t" \
204 "adcq $0, %[c0] \n\t" \
206 "movq 8(%[A]), %%rax \n\t" \
207 "mulq 8(%[B]) \n\t" \
208 "addq %%rax, %[c1] \n\t" \
209 "adcq %%rdx, %[c2] \n\t" \
210 "adcq $0, %[c0] \n\t" \
212 "movq 16(%[A]), %%rax \n\t" \
213 "mulq 0(%[B]) \n\t" \
214 "addq %%rax, %[c1] \n\t" \
215 "movq %[c1], 16(%[res]) \n\t" \
216 "adcq %%rdx, %[c2] \n\t" \
217 "adcq $0, %[c0] \n\t" \
219 "// register renaming (c2, c0, c1)\n\t" \
220 "xorq %[c1], %[c1] \n\t" \
221 "movq 8(%[A]), %%rax \n\t" \
222 "mulq 16(%[B]) \n\t" \
223 "addq %%rax, %[c2] \n\t" \
224 "adcq %%rdx, %[c0] \n\t" \
225 "adcq $0, %[c1] \n\t" \
227 "movq 16(%[A]), %%rax \n\t" \
228 "mulq 8(%[B]) \n\t" \
229 "addq %%rax, %[c2] \n\t" \
230 "movq %[c2], 24(%[res]) \n\t" \
231 "adcq %%rdx, %[c0] \n\t" \
232 "adcq $0, %[c1] \n\t" \
234 "// register renaming (c0, c1, c2)\n\t" \
235 "xorq %[c2], %[c2] \n\t" \
236 "movq 16(%[A]), %%rax \n\t" \
237 "mulq 16(%[B]) \n\t" \
238 "addq %%rax, %[c0] \n\t" \
239 "movq %[c0], 32(%[res]) \n\t" \
240 "adcq %%rdx, %[c1] \n\t" \
241 "movq %[c1], 40(%[res]) \n\t" \
242 : [c0] "=&r"(c0_), [c1] "=&r"(c1_), [c2] "=&r"(c2_) \
243 : [res] "r"(res_), [A] "r"(A_), [B] "r"(B_) \
244 : "%rax", "%rdx", "cc", "memory")
246 #define COMBA_3_BY_3_SQR(c0_, c1_, c2_, res_, A_) \
248 "xorq %[c1], %[c1] \n\t" \
249 "xorq %[c2], %[c2] \n\t" \
250 "movq 0(%[A]), %%rax \n\t" \
252 "movq %%rax, 0(%[res]) \n\t" \
253 "movq %%rdx, %[c0] \n\t" \
255 "movq 0(%[A]), %%rax \n\t" \
256 "mulq 8(%[A]) \n\t" \
257 "addq %%rax, %[c0] \n\t" \
258 "adcq %%rdx, %[c1] \n\t" \
259 "addq %%rax, %[c0] \n\t" \
260 "movq %[c0], 8(%[res]) \n\t" \
261 "adcq %%rdx, %[c1] \n\t" \
262 "adcq $0, %[c2] \n\t" \
264 "// register renaming (c1, c2, c0)\n\t" \
265 "movq 0(%[A]), %%rax \n\t" \
266 "xorq %[c0], %[c0] \n\t" \
267 "mulq 16(%[A]) \n\t" \
268 "addq %%rax, %[c1] \n\t" \
269 "adcq %%rdx, %[c2] \n\t" \
270 "adcq $0, %[c0] \n\t" \
271 "addq %%rax, %[c1] \n\t" \
272 "adcq %%rdx, %[c2] \n\t" \
273 "adcq $0, %[c0] \n\t" \
275 "movq 8(%[A]), %%rax \n\t" \
277 "addq %%rax, %[c1] \n\t" \
278 "movq %[c1], 16(%[res]) \n\t" \
279 "adcq %%rdx, %[c2] \n\t" \
280 "adcq $0, %[c0] \n\t" \
282 "// register renaming (c2, c0, c1)\n\t" \
283 "movq 8(%[A]), %%rax \n\t" \
284 "xorq %[c1], %[c1] \n\t" \
285 "mulq 16(%[A]) \n\t" \
286 "addq %%rax, %[c2] \n\t" \
287 "adcq %%rdx, %[c0] \n\t" \
288 "adcq $0, %[c1] \n\t" \
289 "addq %%rax, %[c2] \n\t" \
290 "movq %[c2], 24(%[res]) \n\t" \
291 "adcq %%rdx, %[c0] \n\t" \
292 "adcq $0, %[c1] \n\t" \
294 "// register renaming (c0, c1, c2)\n\t" \
295 "movq 16(%[A]), %%rax \n\t" \
297 "addq %%rax, %[c0] \n\t" \
298 "movq %[c0], 32(%[res]) \n\t" \
299 "adcq %%rdx, %[c1] \n\t" \
300 "movq %[c1], 40(%[res]) \n\t" \
302 : [c0] "=&r"(c0_), [c1] "=&r"(c1_), [c2] "=&r"(c2_) \
303 : [res] "r"(res_), [A] "r"(A_) \
304 : "%rax", "%rdx", "cc", "memory")
307 The Montgomery reduction here is based on Algorithm 14.32 in
308 Handbook of Applied Cryptography
309 <http://cacr.uwaterloo.ca/hac/about/chap14.pdf>.
311 #define REDUCE_6_LIMB_PRODUCT(k_, tmp1_, tmp2_, tmp3_, inv_, res_, mod_) \
313 "///////////////////////////////////\n\t" \
314 "movq 0(%[res]), %%rax \n\t" \
315 "mulq %[modprime] \n\t" \
316 "movq %%rax, %[k] \n\t" \
318 "movq (%[mod]), %%rax \n\t" \
320 "movq %%rax, %[tmp1] \n\t" \
321 "movq %%rdx, %[tmp2] \n\t" \
323 "xorq %[tmp3], %[tmp3] \n\t" \
324 "movq 8(%[mod]), %%rax \n\t" \
326 "addq %[tmp1], 0(%[res]) \n\t" \
327 "adcq %%rax, %[tmp2] \n\t" \
328 "adcq %%rdx, %[tmp3] \n\t" \
330 "xorq %[tmp1], %[tmp1] \n\t" \
331 "movq 16(%[mod]), %%rax \n\t" \
333 "addq %[tmp2], 8(%[res]) \n\t" \
334 "adcq %%rax, %[tmp3] \n\t" \
335 "adcq %%rdx, %[tmp1] \n\t" \
337 "addq %[tmp3], 16(%[res]) \n\t" \
338 "adcq %[tmp1], 24(%[res]) \n\t" \
339 "adcq $0, 32(%[res]) \n\t" \
340 "adcq $0, 40(%[res]) \n\t" \
342 "///////////////////////////////////\n\t" \
343 "movq 8(%[res]), %%rax \n\t" \
344 "mulq %[modprime] \n\t" \
345 "movq %%rax, %[k] \n\t" \
347 "movq (%[mod]), %%rax \n\t" \
349 "movq %%rax, %[tmp1] \n\t" \
350 "movq %%rdx, %[tmp2] \n\t" \
352 "xorq %[tmp3], %[tmp3] \n\t" \
353 "movq 8(%[mod]), %%rax \n\t" \
355 "addq %[tmp1], 8(%[res]) \n\t" \
356 "adcq %%rax, %[tmp2] \n\t" \
357 "adcq %%rdx, %[tmp3] \n\t" \
359 "xorq %[tmp1], %[tmp1] \n\t" \
360 "movq 16(%[mod]), %%rax \n\t" \
362 "addq %[tmp2], 16(%[res]) \n\t" \
363 "adcq %%rax, %[tmp3] \n\t" \
364 "adcq %%rdx, %[tmp1] \n\t" \
366 "addq %[tmp3], 24(%[res]) \n\t" \
367 "adcq %[tmp1], 32(%[res]) \n\t" \
368 "adcq $0, 40(%[res]) \n\t" \
370 "///////////////////////////////////\n\t" \
371 "movq 16(%[res]), %%rax \n\t" \
372 "mulq %[modprime] \n\t" \
373 "movq %%rax, %[k] \n\t" \
375 "movq (%[mod]), %%rax \n\t" \
377 "movq %%rax, %[tmp1] \n\t" \
378 "movq %%rdx, %[tmp2] \n\t" \
380 "xorq %[tmp3], %[tmp3] \n\t" \
381 "movq 8(%[mod]), %%rax \n\t" \
383 "addq %[tmp1], 16(%[res]) \n\t" \
384 "adcq %%rax, %[tmp2] \n\t" \
385 "adcq %%rdx, %[tmp3] \n\t" \
387 "xorq %[tmp1], %[tmp1] \n\t" \
388 "movq 16(%[mod]), %%rax \n\t" \
390 "addq %[tmp2], 24(%[res]) \n\t" \
391 "adcq %%rax, %[tmp3] \n\t" \
392 "adcq %%rdx, %[tmp1] \n\t" \
394 "addq %[tmp3], 32(%[res]) \n\t" \
395 "adcq %[tmp1], 40(%[res]) \n\t" \
397 [tmp1] "=&r"(tmp1_), \
398 [tmp2] "=&r"(tmp2_), \
399 [tmp3] "=&r"(tmp3_) \
400 : [modprime] "r"(inv_), [res] "r"(res_), [mod] "r"(mod_) \
401 : "%rax", "%rdx", "cc", "memory")
407 #endif // FP_AUX_TCC_