[openssl-dev] [openssl.org #3741] [PATCH] Better side-channel security for ECC table lookups

Wed Mar 11 07:17:37 UTC 2015

This patch relates to RT 3667 and the paper here (to appear at COSADE 2015):

https://eprint.iacr.org/2015/036

Instead of a direct table lookup for precomputed points in the generic
ECC multi-scalar multiplication routine, this patch computes the point by
traversing the entire table and selecting the entry with bit masks, so
the digit is never used as a table index. The motivation is better
side-channel security.

Benchmarks are below (evidently this code path is no longer executed for
nistp256). Costs are 6-10% for ECDSA sign, 2-6% for ECDSA verify
(where the defense is unnecessary, but AFAICT there is no way to
authoritatively distinguish at this depth whether the scalars need
protecting), and 1-5% for ECDH.

BBB

openssl speed ecdsa before:

                              sign    verify    sign/s verify/s
 160 bit ecdsa (secp160r1)  0.0000s   0.0002s  20818.1   5670.3
 192 bit ecdsa (nistp192)   0.0001s   0.0002s  17844.1   4570.7
 224 bit ecdsa (nistp224)   0.0001s   0.0003s  13452.7   3413.4
 256 bit ecdsa (nistp256)   0.0000s   0.0001s  26485.2  10577.4
 384 bit ecdsa (nistp384)   0.0002s   0.0007s   5961.0   1438.0
 521 bit ecdsa (nistp521)   0.0004s   0.0015s   2837.1    684.5

openssl speed ecdsa after:

                              sign    verify    sign/s verify/s
 160 bit ecdsa (secp160r1)  0.0001s   0.0002s  19056.1   5369.1
 192 bit ecdsa (nistp192)   0.0001s   0.0002s  16270.0   4490.4
 224 bit ecdsa (nistp224)   0.0001s   0.0003s  12150.9   3354.7
 256 bit ecdsa (nistp256)   0.0000s   0.0001s  26489.9  10554.9
 384 bit ecdsa (nistp384)   0.0002s   0.0007s   5470.9   1355.5
 521 bit ecdsa (nistp521)   0.0004s   0.0015s   2673.3    667.4

openssl speed ecdh before:

                              op      op/s
 160 bit ecdh (secp160r1)  0.0001s   7025.0
 192 bit ecdh (nistp192)   0.0002s   5786.0
 224 bit ecdh (nistp224)   0.0002s   4158.1
 256 bit ecdh (nistp256)   0.0001s  14899.7
 384 bit ecdh (nistp384)   0.0006s   1732.1
 521 bit ecdh (nistp521)   0.0012s    820.4

openssl speed ecdh after:

                              op      op/s
 160 bit ecdh (secp160r1)  0.0001s   6750.8
 192 bit ecdh (nistp192)   0.0002s   5477.6
 224 bit ecdh (nistp224)   0.0002s   4060.2
 256 bit ecdh (nistp256)   0.0001s  14886.2
 384 bit ecdh (nistp384)   0.0006s   1709.7
 521 bit ecdh (nistp521)   0.0012s    802.1

(I do not have root on the benchmarking machine to peg the clock. Sorry.)

$ cat /proc/cpuinfo
processor    : 0
vendor_id    : GenuineIntel
cpu family    : 6
model        : 60
model name    : Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz
stepping    : 3
microcode    : 26
cpu MHz        : 800.000
cache size    : 6144 KB
physical id    : 0
siblings    : 4
core id        : 0
cpu cores    : 4
apicid        : 0
initial apicid    : 0
fpu        : yes
fpu_exception    : yes
cpuid level    : 13
wp        : yes
flags        : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge
mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe
syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts
rep_good xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor
ds_cpl vmx smx est tm2 ssse3 fma cx16 xtpr pdcm pcid sse4_1 sse4_2
x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand
lahf_lm abm ida arat epb xsaveopt pln pts dts tpr_shadow vnmi
flexpriority ept vpid fsgsbase bmi1 hle avx2 smep bmi2 erms invpcid
rtm
bogomips    : 6385.38
clflush size    : 64
cache_alignment    : 64
address sizes    : 39 bits physical, 48 bits virtual
power management: SNIP

-------------- next part --------------
diff -rupN openssl-1.0.2_orig/crypto/ec/ec_mult.c openssl-1.0.2/crypto/ec/ec_mult.c

--- openssl-1.0.2_orig/crypto/ec/ec_mult.c	2015-01-22 16:58:32.000000000 +0200
+++ openssl-1.0.2/crypto/ec/ec_mult.c	2015-03-10 13:06:50.661644241 +0200
@@ -306,6 +306,66 @@ static signed char *compute_wNAF(const B
     return r;
 }
 
+/**
+ * Constant-time table lookup: reads every t[i] for 0 <= i < 2**(w-1) and
+ * leaves the value of t[digit >> 1] in r, selecting with bit masks instead of
+ * a secret-dependent index, as a side-channel defense.
+ * See https://eprint.iacr.org/2015/036
+ * r:     destination; X, Y, Z must already have room (bn_wexpand) for top
+ *        words and for the longest coordinate stored in t.
+ * digit: expected odd with 0 < digit < 2**w, else r is left all zero.
+ * w:     window size in bits; top: words of each r coordinate to clear.
+ * Only the value (words, top, neg, Z_is_one) is transferred to r.
+ * dmax and flags describe r's own buffer and are deliberately not touched:
+ * taking them from a table entry would misreport the size of r's allocation.
+ */
+void EC_POINT_select(EC_POINT *r, EC_POINT **t, int digit, int w, int top) {
+    int d, j, imask;
+    unsigned int eq;
+    BN_ULONG mask;
+    const EC_POINT *p;
+
+    w = (1 << w);
+
+    for(j=0; j<top; j++) {
+        r->X.d[j] = 0;
+        r->Y.d[j] = 0;
+        r->Z.d[j] = 0;
+    }
+    r->X.top    = 0;
+    r->X.neg    = 0;
+    r->Y.top    = 0;
+    r->Y.neg    = 0;
+    r->Z.top    = 0;
+    r->Z.neg    = 0;
+    r->Z_is_one = 0;
+
+    for(d=1; d<w; d+=2) {
+        p = t[d >> 1];
+        /* eq = 1 iff digit == d; unsigned avoids shifting a negative int */
+        eq = ((unsigned int)(digit ^ d) - 1U) >> (sizeof(unsigned int) * 8 - 1);
+        mask = (BN_ULONG)0 - (BN_ULONG)eq;  /* all ones or all zeros */
+        imask = -(int)eq;                   /* same mask for int fields */
+        for(j=0; j<p->X.top; j++) {
+            r->X.d[j] |= mask & p->X.d[j];
+        }
+        for(j=0; j<p->Y.top; j++) {
+            r->Y.d[j] |= mask & p->Y.d[j];
+        }
+        for(j=0; j<p->Z.top; j++) {
+            r->Z.d[j] |= mask & p->Z.d[j];
+        }
+        r->X.top    |= imask & p->X.top;
+        r->X.neg    |= imask & p->X.neg;
+        r->Y.top    |= imask & p->Y.top;
+        r->Y.neg    |= imask & p->Y.neg;
+        r->Z.top    |= imask & p->Z.top;
+        r->Z.neg    |= imask & p->Z.neg;
+        r->Z_is_one |= imask & p->Z_is_one;
+    }
+
+}
+
 /*
  * TODO: table should be optimised for the wNAF-based implementation,
  * sometimes smaller windows will give better performance (thus the
@@ -532,6 +592,7 @@ int ec_wNAF_mul(const EC_GROUP *group, E
                 tmp_points = pre_comp->points;
 
                 for (i = num; i < totalnum; i++) {
+                    wsize[i] = wsize[num];
                     if (i < totalnum - 1) {
                         wNAF_len[i] = blocksize;
                         if (tmp_len < blocksize) {
@@ -635,6 +696,11 @@ int ec_wNAF_mul(const EC_GROUP *group, E
         goto err;
 #endif
 
+    /* use tmp for selecting point addition arg */
+    bn_wexpand(&tmp->X, group->field.top);
+    bn_wexpand(&tmp->Y, group->field.top);
+    bn_wexpand(&tmp->Z, group->field.top);
+
     r_is_at_infinity = 1;
 
     for (k = max_len - 1; k >= 0; k--) {
@@ -665,12 +731,16 @@ int ec_wNAF_mul(const EC_GROUP *group, E
                     /* digit > 0 */
 
                     if (r_is_at_infinity) {
-                        if (!EC_POINT_copy(r, val_sub[i][digit >> 1]))
-                            goto err;
+                        bn_wexpand(&r->X, group->field.top);
+                        bn_wexpand(&r->Y, group->field.top);
+                        bn_wexpand(&r->Z, group->field.top);
+                        EC_POINT_select(r, val_sub[i], digit, wsize[i], group->field.top);
                         r_is_at_infinity = 0;
                     } else {
+                        /* prepare point addition argument */
+                        EC_POINT_select(tmp, val_sub[i], digit, wsize[i], group->field.top);
                         if (!EC_POINT_add
-                            (group, r, r, val_sub[i][digit >> 1], ctx))
+                            (group, r, r, tmp, ctx))
                             goto err;
                     }
                 }