Clearmatics Libff  0.1
C++ library for Finite Fields and Elliptic Curves
fp_aux.tcc
Go to the documentation of this file.
1 /** @file
2  *****************************************************************************
3  Assembly code snippets for F[p] finite field arithmetic, used by fp.tcc .
4  Specific to x86-64, and used only if USE_ASM is defined.
5  On other architectures or without USE_ASM, fp.tcc uses a portable
6  C++ implementation instead.
7  *****************************************************************************
8  * @author This file is part of libff, developed by SCIPR Lab
9  * and contributors (see AUTHORS).
10  * @copyright MIT license (see LICENSE file)
11  *****************************************************************************/
12 
13 #ifndef FP_AUX_TCC_
14 #define FP_AUX_TCC_
15 
16 namespace libff
17 {
18 
19 // clang-format off
20 
21 #define STR_HELPER(x) #x
22 #define STR(x) STR_HELPER(x)
23 
24 /* addq is faster than adcq, even if preceded by clc */
25 #define ADD_FIRSTADD() \
26  "movq (%[B]), %%rax \n\t" \
27  "addq %%rax, (%[A]) \n\t"
28 
29 #define ADD_NEXTADD(ofs) \
30  "movq " STR(ofs) "(%[B]), %%rax \n\t" \
31  "adcq %%rax, " STR(ofs) "(%[A]) \n\t"
32 
33 #define ADD_CMP(ofs) \
34  "movq " STR(ofs) "(%[mod]), %%rax \n\t" \
35  "cmpq %%rax, " STR(ofs) "(%[A]) \n\t" \
36  "jb done%= \n\t" \
37  "ja subtract%= \n\t"
38 
39 #define ADD_FIRSTSUB() \
40  "movq (%[mod]), %%rax \n\t" \
41  "subq %%rax, (%[A]) \n\t"
42 
43 #define ADD_FIRSTSUB() \
44  "movq (%[mod]), %%rax \n\t" \
45  "subq %%rax, (%[A]) \n\t"
46 
47 #define ADD_NEXTSUB(ofs) \
48  "movq " STR(ofs) "(%[mod]), %%rax \n\t" \
49  "sbbq %%rax, " STR(ofs) "(%[A]) \n\t"
50 
51 #define SUB_FIRSTSUB() \
52  "movq (%[B]), %%rax\n\t" \
53  "subq %%rax, (%[A])\n\t"
54 
55 #define SUB_NEXTSUB(ofs) \
56  "movq " STR(ofs) "(%[B]), %%rax\n\t" \
57  "sbbq %%rax, " STR(ofs) "(%[A])\n\t"
58 
59 #define SUB_FIRSTADD() \
60  "movq (%[mod]), %%rax\n\t" \
61  "addq %%rax, (%[A])\n\t"
62 
63 #define SUB_NEXTADD(ofs) \
64  "movq " STR(ofs) "(%[mod]), %%rax\n\t" \
65  "adcq %%rax, " STR(ofs) "(%[A])\n\t"
66 
67 #define MONT_CMP(ofs) \
68  "movq " STR(ofs) "(%[M]), %%rax \n\t" \
69  "cmpq %%rax, " STR(ofs) "(%[tmp]) \n\t" \
70  "jb done%= \n\t" \
71  "ja subtract%= \n\t"
72 
73 #define MONT_FIRSTSUB() \
74  "movq (%[M]), %%rax \n\t" \
75  "subq %%rax, (%[tmp]) \n\t"
76 
77 #define MONT_NEXTSUB(ofs) \
78  "movq " STR(ofs) "(%[M]), %%rax \n\t" \
79  "sbbq %%rax, " STR(ofs) "(%[tmp]) \n\t"
80 
81 /*
82  The x86-64 Montgomery multiplication here is similar to Algorithm 2 (CIOS
83  method) in http://eprint.iacr.org/2012/140.pdf and the PowerPC pseudocode of
84  gmp-ecm library (c) Paul Zimmermann and Alexander Kruppa (see comments on top
85  of powerpc64/mulredc.m4).
86 */
87 
88 #define MONT_PRECOMPUTE() \
89  "xorq %[cy], %[cy] \n\t" \
90  "movq 0(%[A]), %%rax \n\t" \
91  "mulq 0(%[B]) \n\t" \
92  "movq %%rax, %[T0] \n\t" \
93  "movq %%rdx, %[T1] # T1:T0 <- A[0] * B[0] \n\t" \
94  "mulq %[inv] \n\t" \
95  "movq %%rax, %[u] # u <- T0 * inv \n\t" \
96  "mulq 0(%[M]) \n\t" \
97  "addq %[T0], %%rax \n\t" \
98  "adcq %%rdx, %[T1] \n\t" \
99  "adcq $0, %[cy] # cy:T1 <- (M[0]*u + T1 * b + T0) / " \
100  "b\n\t"
101 
102 #define MONT_FIRSTITER(j) \
103  "xorq %[T0], %[T0] \n\t" \
104  "movq 0(%[A]), %%rax \n\t" \
105  "mulq " STR((j * 8)) "(%[B]) \n\t" \
106  "addq %[T1], %%rax \n\t" \
107  "movq %%rax, " STR(((j - 1) * 8)) "(%[tmp]) \n\t" \
108  "adcq $0, %%rdx \n\t" \
109  "movq %%rdx, %[T1] # now T1:tmp[j-1] " \
110  "<-- X[0] * Y[j] + T1\n\t" \
111  "movq " STR((j * 8)) "(%[M]), %%rax \n\t" \
112  "mulq %[u] \n\t" \
113  "addq %%rax, " STR(((j - 1) * 8)) "(%[tmp]) \n\t" \
114  "adcq %[cy], %%rdx \n\t" \
115  "adcq $0, %[T0] \n\t" \
116  "xorq %[cy], %[cy] \n\t" \
117  "addq %%rdx, %[T1] \n\t" \
118  "adcq %[T0], %[cy] # " \
119  "cy:T1:tmp[j-1] <---- (X[0] * Y[j] + " \
120  "T1) + (M[j] * u + cy * b) \n\t"
121 
122 #define MONT_ITERFIRST(i) \
123  "xorq %[cy], %[cy] \n\t" \
124  "movq " STR((i * 8)) "(%[A]), %%rax \n\t" \
125  "mulq 0(%[B]) \n\t" \
126  "addq 0(%[tmp]), %%rax \n\t" \
127  "adcq 8(%[tmp]), %%rdx \n\t" \
128  "adcq $0, %[cy] \n\t" \
129  "movq %%rax, %[T0] \n\t" \
130  "movq %%rdx, %[T1] # cy:T1:T0 <- A[i] * B[0] " \
131  "+ tmp[1] * b + tmp[0]\n\t" \
132  "mulq %[inv] \n\t" \
133  "movq %%rax, %[u] # u <- T0 * inv\n\t" \
134  "mulq 0(%[M]) \n\t" \
135  "addq %[T0], %%rax \n\t" \
136  "adcq %%rdx, %[T1] \n\t" \
137  "adcq $0, %[cy] # cy:T1 <- (M[0]*u + cy * " \
138  "b * b + T1 * b + T0) / b\n\t"
139 
140 #define MONT_ITERITER(i, j) \
141  "xorq %[T0], %[T0] \n\t" \
142  "movq " STR((i * 8)) "(%[A]), %%rax \n\t" \
143  "mulq " STR((j * 8)) "(%[B]) \n\t" \
144  "addq %[T1], %%rax \n\t" \
145  "movq %%rax, " STR(((j - 1) * 8)) "(%[tmp]) \n\t" \
146  "adcq $0, %%rdx \n\t" \
147  "movq %%rdx, %[T1] # now " \
148  "T1:tmp[j-1] <-- X[i] * Y[j] + T1 \n\t" \
149  "movq " STR((j * 8)) "(%[M]), %%rax \n\t" \
150  "mulq %[u] \n\t" \
151  "addq %%rax, " STR(((j - 1) * 8)) "(%[tmp]) \n\t" \
152  "adcq %[cy], %%rdx \n\t" \
153  "adcq $0, %[T0] \n\t" \
154  "xorq %[cy], %[cy] \n\t" \
155  "addq %%rdx, %[T1] \n\t" \
156  "adcq %[T0], %[cy] # cy:T1:tmp[j-1] <-- " \
157  "(X[i] * Y[j] + T1) + M[j] * u + cy * b \n\t" \
158  "addq " STR(((j + 1) * 8)) "(%[tmp]), %[T1] \n\t" \
159  "adcq $0, %[cy] # cy:T1:tmp[j-1] <-- " \
160  "(X[i] * Y[j] + T1) + M[j] * u + (tmp[j+1] + cy) * b \n\t"
161 
162 #define MONT_FINALIZE(j) \
163  "movq %[T1], " STR((j * 8)) "(%[tmp]) \n\t" \
164  "movq %[cy], " STR(((j + 1) * 8)) "(%[tmp]) \n\t"
165 
166 /*
167  Comba multiplication and squaring routines are based on the
168  public-domain tomsfastmath library by Tom St Denis
169  <http://www.libtom.org/>
170  <https://github.com/libtom/tomsfastmath/blob/master/src/sqr/fp_sqr_comba.c
171  <https://github.com/libtom/tomsfastmath/blob/master/src/mul/fp_mul_comba.c>
172 
173  Compared to the above, we save 5-20% of cycles by using careful register
174  renaming to implement Comba forward operation.
175  */
176 
177 #define COMBA_3_BY_3_MUL(c0_, c1_, c2_, res_, A_, B_) \
178  asm volatile( \
179  "movq 0(%[A]), %%rax \n\t" \
180  "mulq 0(%[B]) \n\t" \
181  "movq %%rax, 0(%[res]) \n\t" \
182  "movq %%rdx, %[c0] \n\t" \
183  \
184  "xorq %[c1], %[c1] \n\t" \
185  "movq 0(%[A]), %%rax \n\t" \
186  "mulq 8(%[B]) \n\t" \
187  "addq %%rax, %[c0] \n\t" \
188  "adcq %%rdx, %[c1] \n\t" \
189  \
190  "xorq %[c2], %[c2] \n\t" \
191  "movq 8(%[A]), %%rax \n\t" \
192  "mulq 0(%[B]) \n\t" \
193  "addq %%rax, %[c0] \n\t" \
194  "movq %[c0], 8(%[res]) \n\t" \
195  "adcq %%rdx, %[c1] \n\t" \
196  "adcq $0, %[c2] \n\t" \
197  \
198  "// register renaming (c1, c2, c0)\n\t" \
199  "xorq %[c0], %[c0] \n\t" \
200  "movq 0(%[A]), %%rax \n\t" \
201  "mulq 16(%[B]) \n\t" \
202  "addq %%rax, %[c1] \n\t" \
203  "adcq %%rdx, %[c2] \n\t" \
204  "adcq $0, %[c0] \n\t" \
205  \
206  "movq 8(%[A]), %%rax \n\t" \
207  "mulq 8(%[B]) \n\t" \
208  "addq %%rax, %[c1] \n\t" \
209  "adcq %%rdx, %[c2] \n\t" \
210  "adcq $0, %[c0] \n\t" \
211  \
212  "movq 16(%[A]), %%rax \n\t" \
213  "mulq 0(%[B]) \n\t" \
214  "addq %%rax, %[c1] \n\t" \
215  "movq %[c1], 16(%[res]) \n\t" \
216  "adcq %%rdx, %[c2] \n\t" \
217  "adcq $0, %[c0] \n\t" \
218  \
219  "// register renaming (c2, c0, c1)\n\t" \
220  "xorq %[c1], %[c1] \n\t" \
221  "movq 8(%[A]), %%rax \n\t" \
222  "mulq 16(%[B]) \n\t" \
223  "addq %%rax, %[c2] \n\t" \
224  "adcq %%rdx, %[c0] \n\t" \
225  "adcq $0, %[c1] \n\t" \
226  \
227  "movq 16(%[A]), %%rax \n\t" \
228  "mulq 8(%[B]) \n\t" \
229  "addq %%rax, %[c2] \n\t" \
230  "movq %[c2], 24(%[res]) \n\t" \
231  "adcq %%rdx, %[c0] \n\t" \
232  "adcq $0, %[c1] \n\t" \
233  \
234  "// register renaming (c0, c1, c2)\n\t" \
235  "xorq %[c2], %[c2] \n\t" \
236  "movq 16(%[A]), %%rax \n\t" \
237  "mulq 16(%[B]) \n\t" \
238  "addq %%rax, %[c0] \n\t" \
239  "movq %[c0], 32(%[res]) \n\t" \
240  "adcq %%rdx, %[c1] \n\t" \
241  "movq %[c1], 40(%[res]) \n\t" \
242  : [c0] "=&r"(c0_), [c1] "=&r"(c1_), [c2] "=&r"(c2_) \
243  : [res] "r"(res_), [A] "r"(A_), [B] "r"(B_) \
244  : "%rax", "%rdx", "cc", "memory")
245 
246 #define COMBA_3_BY_3_SQR(c0_, c1_, c2_, res_, A_) \
247  asm volatile( \
248  "xorq %[c1], %[c1] \n\t" \
249  "xorq %[c2], %[c2] \n\t" \
250  "movq 0(%[A]), %%rax \n\t" \
251  "mulq %%rax \n\t" \
252  "movq %%rax, 0(%[res]) \n\t" \
253  "movq %%rdx, %[c0] \n\t" \
254  \
255  "movq 0(%[A]), %%rax \n\t" \
256  "mulq 8(%[A]) \n\t" \
257  "addq %%rax, %[c0] \n\t" \
258  "adcq %%rdx, %[c1] \n\t" \
259  "addq %%rax, %[c0] \n\t" \
260  "movq %[c0], 8(%[res]) \n\t" \
261  "adcq %%rdx, %[c1] \n\t" \
262  "adcq $0, %[c2] \n\t" \
263  \
264  "// register renaming (c1, c2, c0)\n\t" \
265  "movq 0(%[A]), %%rax \n\t" \
266  "xorq %[c0], %[c0] \n\t" \
267  "mulq 16(%[A]) \n\t" \
268  "addq %%rax, %[c1] \n\t" \
269  "adcq %%rdx, %[c2] \n\t" \
270  "adcq $0, %[c0] \n\t" \
271  "addq %%rax, %[c1] \n\t" \
272  "adcq %%rdx, %[c2] \n\t" \
273  "adcq $0, %[c0] \n\t" \
274  \
275  "movq 8(%[A]), %%rax \n\t" \
276  "mulq %%rax \n\t" \
277  "addq %%rax, %[c1] \n\t" \
278  "movq %[c1], 16(%[res]) \n\t" \
279  "adcq %%rdx, %[c2] \n\t" \
280  "adcq $0, %[c0] \n\t" \
281  \
282  "// register renaming (c2, c0, c1)\n\t" \
283  "movq 8(%[A]), %%rax \n\t" \
284  "xorq %[c1], %[c1] \n\t" \
285  "mulq 16(%[A]) \n\t" \
286  "addq %%rax, %[c2] \n\t" \
287  "adcq %%rdx, %[c0] \n\t" \
288  "adcq $0, %[c1] \n\t" \
289  "addq %%rax, %[c2] \n\t" \
290  "movq %[c2], 24(%[res]) \n\t" \
291  "adcq %%rdx, %[c0] \n\t" \
292  "adcq $0, %[c1] \n\t" \
293  \
294  "// register renaming (c0, c1, c2)\n\t" \
295  "movq 16(%[A]), %%rax \n\t" \
296  "mulq %%rax \n\t" \
297  "addq %%rax, %[c0] \n\t" \
298  "movq %[c0], 32(%[res]) \n\t" \
299  "adcq %%rdx, %[c1] \n\t" \
300  "movq %[c1], 40(%[res]) \n\t" \
301  \
302  : [c0] "=&r"(c0_), [c1] "=&r"(c1_), [c2] "=&r"(c2_) \
303  : [res] "r"(res_), [A] "r"(A_) \
304  : "%rax", "%rdx", "cc", "memory")
305 
306 /*
307  The Montgomery reduction here is based on Algorithm 14.32 in
308  Handbook of Applied Cryptography
309  <http://cacr.uwaterloo.ca/hac/about/chap14.pdf>.
310  */
311 #define REDUCE_6_LIMB_PRODUCT(k_, tmp1_, tmp2_, tmp3_, inv_, res_, mod_) \
312  __asm__ volatile( \
313  "///////////////////////////////////\n\t" \
314  "movq 0(%[res]), %%rax \n\t" \
315  "mulq %[modprime] \n\t" \
316  "movq %%rax, %[k] \n\t" \
317  \
318  "movq (%[mod]), %%rax \n\t" \
319  "mulq %[k] \n\t" \
320  "movq %%rax, %[tmp1] \n\t" \
321  "movq %%rdx, %[tmp2] \n\t" \
322  \
323  "xorq %[tmp3], %[tmp3] \n\t" \
324  "movq 8(%[mod]), %%rax \n\t" \
325  "mulq %[k] \n\t" \
326  "addq %[tmp1], 0(%[res]) \n\t" \
327  "adcq %%rax, %[tmp2] \n\t" \
328  "adcq %%rdx, %[tmp3] \n\t" \
329  \
330  "xorq %[tmp1], %[tmp1] \n\t" \
331  "movq 16(%[mod]), %%rax \n\t" \
332  "mulq %[k] \n\t" \
333  "addq %[tmp2], 8(%[res]) \n\t" \
334  "adcq %%rax, %[tmp3] \n\t" \
335  "adcq %%rdx, %[tmp1] \n\t" \
336  \
337  "addq %[tmp3], 16(%[res]) \n\t" \
338  "adcq %[tmp1], 24(%[res]) \n\t" \
339  "adcq $0, 32(%[res]) \n\t" \
340  "adcq $0, 40(%[res]) \n\t" \
341  \
342  "///////////////////////////////////\n\t" \
343  "movq 8(%[res]), %%rax \n\t" \
344  "mulq %[modprime] \n\t" \
345  "movq %%rax, %[k] \n\t" \
346  \
347  "movq (%[mod]), %%rax \n\t" \
348  "mulq %[k] \n\t" \
349  "movq %%rax, %[tmp1] \n\t" \
350  "movq %%rdx, %[tmp2] \n\t" \
351  \
352  "xorq %[tmp3], %[tmp3] \n\t" \
353  "movq 8(%[mod]), %%rax \n\t" \
354  "mulq %[k] \n\t" \
355  "addq %[tmp1], 8(%[res]) \n\t" \
356  "adcq %%rax, %[tmp2] \n\t" \
357  "adcq %%rdx, %[tmp3] \n\t" \
358  \
359  "xorq %[tmp1], %[tmp1] \n\t" \
360  "movq 16(%[mod]), %%rax \n\t" \
361  "mulq %[k] \n\t" \
362  "addq %[tmp2], 16(%[res]) \n\t" \
363  "adcq %%rax, %[tmp3] \n\t" \
364  "adcq %%rdx, %[tmp1] \n\t" \
365  \
366  "addq %[tmp3], 24(%[res]) \n\t" \
367  "adcq %[tmp1], 32(%[res]) \n\t" \
368  "adcq $0, 40(%[res]) \n\t" \
369  \
370  "///////////////////////////////////\n\t" \
371  "movq 16(%[res]), %%rax \n\t" \
372  "mulq %[modprime] \n\t" \
373  "movq %%rax, %[k] \n\t" \
374  \
375  "movq (%[mod]), %%rax \n\t" \
376  "mulq %[k] \n\t" \
377  "movq %%rax, %[tmp1] \n\t" \
378  "movq %%rdx, %[tmp2] \n\t" \
379  \
380  "xorq %[tmp3], %[tmp3] \n\t" \
381  "movq 8(%[mod]), %%rax \n\t" \
382  "mulq %[k] \n\t" \
383  "addq %[tmp1], 16(%[res]) \n\t" \
384  "adcq %%rax, %[tmp2] \n\t" \
385  "adcq %%rdx, %[tmp3] \n\t" \
386  \
387  "xorq %[tmp1], %[tmp1] \n\t" \
388  "movq 16(%[mod]), %%rax \n\t" \
389  "mulq %[k] \n\t" \
390  "addq %[tmp2], 24(%[res]) \n\t" \
391  "adcq %%rax, %[tmp3] \n\t" \
392  "adcq %%rdx, %[tmp1] \n\t" \
393  \
394  "addq %[tmp3], 32(%[res]) \n\t" \
395  "adcq %[tmp1], 40(%[res]) \n\t" \
396  : [k] "=&r"(k_), \
397  [tmp1] "=&r"(tmp1_), \
398  [tmp2] "=&r"(tmp2_), \
399  [tmp3] "=&r"(tmp3_) \
400  : [modprime] "r"(inv_), [res] "r"(res_), [mod] "r"(mod_) \
401  : "%rax", "%rdx", "cc", "memory")
402 
403 // clang-format on
404 
405 } // namespace libff
406 
407 #endif // FP_AUX_TCC_