Add code from BearSSL's ghash implementation.
Polyval (which we need for CGO) is very similar to ghash, and most of this code should be reusable with suitable adaptation.
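For context: RFC 8452 (AES-GCM-SIV) spells out the exact relationship between the two functions. POLYVAL and GHASH both work in GF(2^128), but with opposite bit orderings and different reduction polynomials, and

    POLYVAL(H, X_1, ..., X_n) =
        ByteReverse(GHASH(mulX_GHASH(ByteReverse(H)),
                          ByteReverse(X_1), ..., ByteReverse(X_n)))

so a Polyval built on this code mainly needs byte reversal of blocks plus a one-time key tweak. The sketch below only illustrates that identity on top of br_ghash_ctmul64() from this commit; the wrapper names are made up, full 16-byte blocks are assumed, and it is not the adaptation Tor will actually ship.

#include <stddef.h>
#include <stdint.h>

/* Portable constant-time GHASH added in this commit. */
void br_ghash_ctmul64(void *y, const void *h, const void *data, size_t len);

/* Reverse the 16 bytes of a block. */
static void
bytereverse16(uint8_t *dst, const uint8_t *src)
{
        int i;

        for (i = 0; i < 16; i ++) {
                dst[i] = src[15 - i];
        }
}

/* Multiply a field element (in GHASH's bit/byte convention) by x:
 * shift the 128-bit string right by one bit and, if a bit fell off,
 * fold it back in with the reduction constant 0xE1. */
static void
mulx_ghash(uint8_t v[16])
{
        uint8_t carry = v[15] & 1;
        int i;

        for (i = 15; i > 0; i --) {
                v[i] = (uint8_t)((v[i] >> 1) | (v[i - 1] << 7));
        }
        v[0] >>= 1;
        if (carry) {
                v[0] ^= 0xE1;
        }
}

/* Hypothetical POLYVAL over whole 16-byte blocks, expressed through
 * GHASH as in RFC 8452: acc is updated in place, key is the POLYVAL
 * key H, and len must be a multiple of 16. */
static void
polyval_via_ghash(uint8_t acc[16], const uint8_t key[16],
        const uint8_t *data, size_t len)
{
        uint8_t h[16], y[16], blk[16];
        size_t i;

        bytereverse16(h, key);
        mulx_ghash(h);          /* H_ghash = mulX_GHASH(ByteReverse(H)) */
        bytereverse16(y, acc);
        for (i = 0; i < len; i += 16) {
                bytereverse16(blk, data + i);
                br_ghash_ctmul64(y, h, blk, 16);
        }
        bytereverse16(acc, y);
}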
LICENSE
@@ -382,6 +382,33 @@ src/ext/mulodi4.c is distributed under this license:
names of the LLVM Team or the University of Illinois to endorse
or promote products derived from this Software.

===============================================================================
Parts of src/ext/polyval are based on Thomas Pornin's GHASH implementation in
BearSSL, and distributed under the following license:

Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


===============================================================================
If you got Tor as a static binary with OpenSSL included, then you should know:
"This product includes software developed by the OpenSSL Project
src/ext/polyval/README.tor
Normal file
@@ -0,0 +1,28 @@
This code is based on the constant-time GHASH implementations
from BearSSL, written by Thomas Pornin.

Up-to-date with BearSSL as of 3c040368f6791553610e362401db1efff4b4c5b8.

The license on that code is:

> Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
>
> Permission is hereby granted, free of charge, to any person obtaining
> a copy of this software and associated documentation files (the
> "Software"), to deal in the Software without restriction, including
> without limitation the rights to use, copy, modify, merge, publish,
> distribute, sublicense, and/or sell copies of the Software, and to
> permit persons to whom the Software is furnished to do so, subject to
> the following conditions:
>
> The above copyright notice and this permission notice shall be
> included in all copies or substantial portions of the Software.
>
> THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
> NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
> BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
> ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
> CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
> SOFTWARE.
src/ext/polyval/ghash_ctmul.c
Normal file
@@ -0,0 +1,345 @@
/*
 * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "inner.h"

/*
 * We compute "carryless multiplications" through normal integer
 * multiplications, masking out enough bits to create "holes" in which
 * carries may expand without altering our bits; we really use 8 data
 * bits per 32-bit word, spaced every fourth bit. Accumulated carries
 * may not exceed 8 in total, which fits in 4 bits.
 *
 * It would be possible to use a 3-bit spacing, allowing two operands,
 * one with 7 non-zero data bits, the other one with 10 or 11 non-zero
 * data bits; this asymmetric splitting makes the overall code more
 * complex with thresholds and exceptions, and does not appear to be
 * worth the effort.
 */

/*
 * We cannot really autodetect whether multiplications are "slow" or
 * not. A typical example is the ARM Cortex M0+, which exists in two
 * versions: one with a 1-cycle multiplication opcode, the other with
 * a 32-cycle multiplication opcode. They both use exactly the same
 * architecture and ABI, and cannot be distinguished from each other
 * at compile-time.
 *
 * Since most modern CPU (even embedded CPU) still have fast
 * multiplications, we use the "fast mul" code by default.
 */

#if BR_SLOW_MUL

/*
 * This implementation uses Karatsuba-like reduction to make fewer
 * integer multiplications (9 instead of 16), at the expense of extra
 * logical operations (XOR, shifts...). On modern x86 CPU that offer
 * fast, pipelined multiplications, this code is about twice slower than
 * the simpler code with 16 multiplications. This tendency may be
 * reversed on low-end platforms with expensive multiplications.
 */

#define MUL32(h, l, x, y) do { \
                uint64_t mul32tmp = MUL(x, y); \
                (h) = (uint32_t)(mul32tmp >> 32); \
                (l) = (uint32_t)mul32tmp; \
        } while (0)

static inline void
bmul(uint32_t *hi, uint32_t *lo, uint32_t x, uint32_t y)
{
        uint32_t x0, x1, x2, x3;
        uint32_t y0, y1, y2, y3;
        uint32_t a0, a1, a2, a3, a4, a5, a6, a7, a8;
        uint32_t b0, b1, b2, b3, b4, b5, b6, b7, b8;

        x0 = x & (uint32_t)0x11111111;
        x1 = x & (uint32_t)0x22222222;
        x2 = x & (uint32_t)0x44444444;
        x3 = x & (uint32_t)0x88888888;
        y0 = y & (uint32_t)0x11111111;
        y1 = y & (uint32_t)0x22222222;
        y2 = y & (uint32_t)0x44444444;
        y3 = y & (uint32_t)0x88888888;

        /*
         * (x0+W*x1)*(y0+W*y1) -> a0:b0
         * (x2+W*x3)*(y2+W*y3) -> a3:b3
         * ((x0+x2)+W*(x1+x3))*((y0+y2)+W*(y1+y3)) -> a6:b6
         */
        a0 = x0;
        b0 = y0;
        a1 = x1 >> 1;
        b1 = y1 >> 1;
        a2 = a0 ^ a1;
        b2 = b0 ^ b1;
        a3 = x2 >> 2;
        b3 = y2 >> 2;
        a4 = x3 >> 3;
        b4 = y3 >> 3;
        a5 = a3 ^ a4;
        b5 = b3 ^ b4;
        a6 = a0 ^ a3;
        b6 = b0 ^ b3;
        a7 = a1 ^ a4;
        b7 = b1 ^ b4;
        a8 = a6 ^ a7;
        b8 = b6 ^ b7;

        MUL32(b0, a0, b0, a0);
        MUL32(b1, a1, b1, a1);
        MUL32(b2, a2, b2, a2);
        MUL32(b3, a3, b3, a3);
        MUL32(b4, a4, b4, a4);
        MUL32(b5, a5, b5, a5);
        MUL32(b6, a6, b6, a6);
        MUL32(b7, a7, b7, a7);
        MUL32(b8, a8, b8, a8);

        a0 &= (uint32_t)0x11111111;
        a1 &= (uint32_t)0x11111111;
        a2 &= (uint32_t)0x11111111;
        a3 &= (uint32_t)0x11111111;
        a4 &= (uint32_t)0x11111111;
        a5 &= (uint32_t)0x11111111;
        a6 &= (uint32_t)0x11111111;
        a7 &= (uint32_t)0x11111111;
        a8 &= (uint32_t)0x11111111;
        b0 &= (uint32_t)0x11111111;
        b1 &= (uint32_t)0x11111111;
        b2 &= (uint32_t)0x11111111;
        b3 &= (uint32_t)0x11111111;
        b4 &= (uint32_t)0x11111111;
        b5 &= (uint32_t)0x11111111;
        b6 &= (uint32_t)0x11111111;
        b7 &= (uint32_t)0x11111111;
        b8 &= (uint32_t)0x11111111;

        a2 ^= a0 ^ a1;
        b2 ^= b0 ^ b1;
        a0 ^= (a2 << 1) ^ (a1 << 2);
        b0 ^= (b2 << 1) ^ (b1 << 2);
        a5 ^= a3 ^ a4;
        b5 ^= b3 ^ b4;
        a3 ^= (a5 << 1) ^ (a4 << 2);
        b3 ^= (b5 << 1) ^ (b4 << 2);
        a8 ^= a6 ^ a7;
        b8 ^= b6 ^ b7;
        a6 ^= (a8 << 1) ^ (a7 << 2);
        b6 ^= (b8 << 1) ^ (b7 << 2);
        a6 ^= a0 ^ a3;
        b6 ^= b0 ^ b3;
        *lo = a0 ^ (a6 << 2) ^ (a3 << 4);
        *hi = b0 ^ (b6 << 2) ^ (b3 << 4) ^ (a6 >> 30) ^ (a3 >> 28);
}

#else

/*
 * Simple multiplication in GF(2)[X], using 16 integer multiplications.
 */

static inline void
bmul(uint32_t *hi, uint32_t *lo, uint32_t x, uint32_t y)
{
        uint32_t x0, x1, x2, x3;
        uint32_t y0, y1, y2, y3;
        uint64_t z0, z1, z2, z3;
        uint64_t z;

        x0 = x & (uint32_t)0x11111111;
        x1 = x & (uint32_t)0x22222222;
        x2 = x & (uint32_t)0x44444444;
        x3 = x & (uint32_t)0x88888888;
        y0 = y & (uint32_t)0x11111111;
        y1 = y & (uint32_t)0x22222222;
        y2 = y & (uint32_t)0x44444444;
        y3 = y & (uint32_t)0x88888888;
        z0 = MUL(x0, y0) ^ MUL(x1, y3) ^ MUL(x2, y2) ^ MUL(x3, y1);
        z1 = MUL(x0, y1) ^ MUL(x1, y0) ^ MUL(x2, y3) ^ MUL(x3, y2);
        z2 = MUL(x0, y2) ^ MUL(x1, y1) ^ MUL(x2, y0) ^ MUL(x3, y3);
        z3 = MUL(x0, y3) ^ MUL(x1, y2) ^ MUL(x2, y1) ^ MUL(x3, y0);
        z0 &= (uint64_t)0x1111111111111111;
        z1 &= (uint64_t)0x2222222222222222;
        z2 &= (uint64_t)0x4444444444444444;
        z3 &= (uint64_t)0x8888888888888888;
        z = z0 | z1 | z2 | z3;
        *lo = (uint32_t)z;
        *hi = (uint32_t)(z >> 32);
}

#endif

/* see bearssl_hash.h */
void
br_ghash_ctmul(void *y, const void *h, const void *data, size_t len)
{
        const unsigned char *buf, *hb;
        unsigned char *yb;
        uint32_t yw[4];
        uint32_t hw[4];

        /*
         * Throughout the loop we handle the y and h values as arrays
         * of 32-bit words.
         */
        buf = data;
        yb = y;
        hb = h;
        yw[3] = br_dec32be(yb);
        yw[2] = br_dec32be(yb + 4);
        yw[1] = br_dec32be(yb + 8);
        yw[0] = br_dec32be(yb + 12);
        hw[3] = br_dec32be(hb);
        hw[2] = br_dec32be(hb + 4);
        hw[1] = br_dec32be(hb + 8);
        hw[0] = br_dec32be(hb + 12);
        while (len > 0) {
                const unsigned char *src;
                unsigned char tmp[16];
                int i;
                uint32_t a[9], b[9], zw[8];
                uint32_t c0, c1, c2, c3, d0, d1, d2, d3, e0, e1, e2, e3;

                /*
                 * Get the next 16-byte block (using zero-padding if
                 * necessary).
                 */
                if (len >= 16) {
                        src = buf;
                        buf += 16;
                        len -= 16;
                } else {
                        memcpy(tmp, buf, len);
                        memset(tmp + len, 0, (sizeof tmp) - len);
                        src = tmp;
                        len = 0;
                }

                /*
                 * Decode the block. The GHASH standard mandates
                 * big-endian encoding.
                 */
                yw[3] ^= br_dec32be(src);
                yw[2] ^= br_dec32be(src + 4);
                yw[1] ^= br_dec32be(src + 8);
                yw[0] ^= br_dec32be(src + 12);

                /*
                 * We multiply two 128-bit field elements. We use
                 * Karatsuba to turn that into three 64-bit
                 * multiplications, which are themselves done with a
                 * total of nine 32-bit multiplications.
                 */

                /*
                 * y[0,1]*h[0,1] -> 0..2
                 * y[2,3]*h[2,3] -> 3..5
                 * (y[0,1]+y[2,3])*(h[0,1]+h[2,3]) -> 6..8
                 */
                a[0] = yw[0];
                b[0] = hw[0];
                a[1] = yw[1];
                b[1] = hw[1];
                a[2] = a[0] ^ a[1];
                b[2] = b[0] ^ b[1];

                a[3] = yw[2];
                b[3] = hw[2];
                a[4] = yw[3];
                b[4] = hw[3];
                a[5] = a[3] ^ a[4];
                b[5] = b[3] ^ b[4];

                a[6] = a[0] ^ a[3];
                b[6] = b[0] ^ b[3];
                a[7] = a[1] ^ a[4];
                b[7] = b[1] ^ b[4];
                a[8] = a[6] ^ a[7];
                b[8] = b[6] ^ b[7];

                for (i = 0; i < 9; i ++) {
                        bmul(&b[i], &a[i], b[i], a[i]);
                }

                c0 = a[0];
                c1 = b[0] ^ a[2] ^ a[0] ^ a[1];
                c2 = a[1] ^ b[2] ^ b[0] ^ b[1];
                c3 = b[1];
                d0 = a[3];
                d1 = b[3] ^ a[5] ^ a[3] ^ a[4];
                d2 = a[4] ^ b[5] ^ b[3] ^ b[4];
                d3 = b[4];
                e0 = a[6];
                e1 = b[6] ^ a[8] ^ a[6] ^ a[7];
                e2 = a[7] ^ b[8] ^ b[6] ^ b[7];
                e3 = b[7];

                e0 ^= c0 ^ d0;
                e1 ^= c1 ^ d1;
                e2 ^= c2 ^ d2;
                e3 ^= c3 ^ d3;
                c2 ^= e0;
                c3 ^= e1;
                d0 ^= e2;
                d1 ^= e3;

                /*
                 * GHASH specification has the bits "reversed" (most
                 * significant is in fact least significant), which does
                 * not matter for a carryless multiplication, except that
                 * the 255-bit result must be shifted by 1 bit.
                 */
                zw[0] = c0 << 1;
                zw[1] = (c1 << 1) | (c0 >> 31);
                zw[2] = (c2 << 1) | (c1 >> 31);
                zw[3] = (c3 << 1) | (c2 >> 31);
                zw[4] = (d0 << 1) | (c3 >> 31);
                zw[5] = (d1 << 1) | (d0 >> 31);
                zw[6] = (d2 << 1) | (d1 >> 31);
                zw[7] = (d3 << 1) | (d2 >> 31);

                /*
                 * We now do the reduction modulo the field polynomial
                 * to get back to 128 bits.
                 */
                for (i = 0; i < 4; i ++) {
                        uint32_t lw;

                        lw = zw[i];
                        zw[i + 4] ^= lw ^ (lw >> 1) ^ (lw >> 2) ^ (lw >> 7);
                        zw[i + 3] ^= (lw << 31) ^ (lw << 30) ^ (lw << 25);
                }
                memcpy(yw, zw + 4, sizeof yw);
        }

        /*
         * Encode back the result.
         */
        br_enc32be(yb, yw[3]);
        br_enc32be(yb + 4, yw[2]);
        br_enc32be(yb + 8, yw[1]);
        br_enc32be(yb + 12, yw[0]);
}
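A note on the calling convention, which all three files in this directory share: y points to the 16-byte GHASH accumulator and is updated in place, h points to the 16-byte hash key, and data/len is the input, consumed in 16-byte blocks with a trailing partial block zero-padded. A minimal usage sketch (the helper name is made up and not part of the commit):

#include <stdint.h>
#include <string.h>

void br_ghash_ctmul(void *y, const void *h, const void *data, size_t len);

/* Raw GHASH polynomial evaluation of msg under key h, starting from an
 * all-zero accumulator; the 16-byte result is left in tag. */
static void
ghash_of(uint8_t tag[16], const uint8_t h[16], const void *msg, size_t msg_len)
{
        memset(tag, 0, 16);
        br_ghash_ctmul(tag, h, msg, msg_len);
}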
src/ext/polyval/ghash_ctmul64.c
Normal file
@@ -0,0 +1,154 @@
/*
 * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "inner.h"

/*
 * This is the 64-bit variant of br_ghash_ctmul32(), with 64-bit operands
 * and bit reversal of 64-bit words.
 */

static inline uint64_t
bmul64(uint64_t x, uint64_t y)
{
        uint64_t x0, x1, x2, x3;
        uint64_t y0, y1, y2, y3;
        uint64_t z0, z1, z2, z3;

        x0 = x & (uint64_t)0x1111111111111111;
        x1 = x & (uint64_t)0x2222222222222222;
        x2 = x & (uint64_t)0x4444444444444444;
        x3 = x & (uint64_t)0x8888888888888888;
        y0 = y & (uint64_t)0x1111111111111111;
        y1 = y & (uint64_t)0x2222222222222222;
        y2 = y & (uint64_t)0x4444444444444444;
        y3 = y & (uint64_t)0x8888888888888888;
        z0 = (x0 * y0) ^ (x1 * y3) ^ (x2 * y2) ^ (x3 * y1);
        z1 = (x0 * y1) ^ (x1 * y0) ^ (x2 * y3) ^ (x3 * y2);
        z2 = (x0 * y2) ^ (x1 * y1) ^ (x2 * y0) ^ (x3 * y3);
        z3 = (x0 * y3) ^ (x1 * y2) ^ (x2 * y1) ^ (x3 * y0);
        z0 &= (uint64_t)0x1111111111111111;
        z1 &= (uint64_t)0x2222222222222222;
        z2 &= (uint64_t)0x4444444444444444;
        z3 &= (uint64_t)0x8888888888888888;
        return z0 | z1 | z2 | z3;
}

static uint64_t
rev64(uint64_t x)
{
#define RMS(m, s) do { \
                x = ((x & (uint64_t)(m)) << (s)) \
                        | ((x >> (s)) & (uint64_t)(m)); \
        } while (0)

        RMS(0x5555555555555555, 1);
        RMS(0x3333333333333333, 2);
        RMS(0x0F0F0F0F0F0F0F0F, 4);
        RMS(0x00FF00FF00FF00FF, 8);
        RMS(0x0000FFFF0000FFFF, 16);
        return (x << 32) | (x >> 32);

#undef RMS
}

/* see bearssl_ghash.h */
void
br_ghash_ctmul64(void *y, const void *h, const void *data, size_t len)
{
        const unsigned char *buf, *hb;
        unsigned char *yb;
        uint64_t y0, y1;
        uint64_t h0, h1, h2, h0r, h1r, h2r;

        buf = data;
        yb = y;
        hb = h;
        y1 = br_dec64be(yb);
        y0 = br_dec64be(yb + 8);
        h1 = br_dec64be(hb);
        h0 = br_dec64be(hb + 8);
        h0r = rev64(h0);
        h1r = rev64(h1);
        h2 = h0 ^ h1;
        h2r = h0r ^ h1r;
        while (len > 0) {
                const unsigned char *src;
                unsigned char tmp[16];
                uint64_t y0r, y1r, y2, y2r;
                uint64_t z0, z1, z2, z0h, z1h, z2h;
                uint64_t v0, v1, v2, v3;

                if (len >= 16) {
                        src = buf;
                        buf += 16;
                        len -= 16;
                } else {
                        memcpy(tmp, buf, len);
                        memset(tmp + len, 0, (sizeof tmp) - len);
                        src = tmp;
                        len = 0;
                }
                y1 ^= br_dec64be(src);
                y0 ^= br_dec64be(src + 8);

                y0r = rev64(y0);
                y1r = rev64(y1);
                y2 = y0 ^ y1;
                y2r = y0r ^ y1r;

                z0 = bmul64(y0, h0);
                z1 = bmul64(y1, h1);
                z2 = bmul64(y2, h2);
                z0h = bmul64(y0r, h0r);
                z1h = bmul64(y1r, h1r);
                z2h = bmul64(y2r, h2r);
                z2 ^= z0 ^ z1;
                z2h ^= z0h ^ z1h;
                z0h = rev64(z0h) >> 1;
                z1h = rev64(z1h) >> 1;
                z2h = rev64(z2h) >> 1;

                v0 = z0;
                v1 = z0h ^ z2;
                v2 = z1 ^ z2h;
                v3 = z1h;

                v3 = (v3 << 1) | (v2 >> 63);
                v2 = (v2 << 1) | (v1 >> 63);
                v1 = (v1 << 1) | (v0 >> 63);
                v0 = (v0 << 1);

                v2 ^= v0 ^ (v0 >> 1) ^ (v0 >> 2) ^ (v0 >> 7);
                v1 ^= (v0 << 63) ^ (v0 << 62) ^ (v0 << 57);
                v3 ^= v1 ^ (v1 >> 1) ^ (v1 >> 2) ^ (v1 >> 7);
                v2 ^= (v1 << 63) ^ (v1 << 62) ^ (v1 << 57);

                y0 = v2;
                y1 = v3;
        }

        br_enc64be(yb, y1);
        br_enc64be(yb + 8, y0);
}
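Worth spelling out, since the main loop above relies on it: bmul64() yields only the low 64 bits of the 127-bit carryless product, and the high half is recovered by running the same multiplication on bit-reversed operands and reversing the result back (the rev64() and ">> 1" steps on z0h/z1h/z2h). A standalone sketch of that identity, reusing the two static helpers above (illustrative only, not part of the commit):

/* Full 64x64 -> 128 carryless multiply from bmul64() and rev64(): the
 * product of bit-reversed operands is the bit-reversal of the 127-bit
 * product, so its low half, reversed and shifted right by one bit, is
 * the missing high half. */
static void
clmul64(uint64_t *hi, uint64_t *lo, uint64_t x, uint64_t y)
{
        *lo = bmul64(x, y);
        *hi = rev64(bmul64(rev64(x), rev64(y))) >> 1;
}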
src/ext/polyval/ghash_pclmul.c
Normal file
@@ -0,0 +1,389 @@
/*
 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#define BR_ENABLE_INTRINSICS 1
#include "inner.h"

/*
 * This is the GHASH implementation that leverages the pclmulqdq opcode
 * (from the AES-NI instructions).
 */

#if BR_AES_X86NI

/*
 * Test CPU support for PCLMULQDQ.
 */
static inline int
pclmul_supported(void)
{
        /*
         * Bit mask for features in ECX:
         *    1   PCLMULQDQ support
         */
        return br_cpuid(0, 0, 0x00000002, 0);
}

/* see bearssl_hash.h */
br_ghash
br_ghash_pclmul_get(void)
{
        return pclmul_supported() ? &br_ghash_pclmul : 0;
}

BR_TARGETS_X86_UP

/*
 * GHASH is defined over elements of GF(2^128) with "full little-endian"
 * representation: leftmost byte is least significant, and, within each
 * byte, leftmost _bit_ is least significant. The natural ordering in
 * x86 is "mixed little-endian": bytes are ordered from least to most
 * significant, but bits within a byte are in most-to-least significant
 * order. Going to full little-endian representation would require
 * reversing bits within each byte, which is doable but expensive.
 *
 * Instead, we go to full big-endian representation, by swapping bytes
 * around, which is done with a single _mm_shuffle_epi8() opcode (it
 * comes with SSSE3; all CPU that offer pclmulqdq also have SSSE3). We
 * can use a full big-endian representation because in a carryless
 * multiplication, we have a nice bit reversal property:
 *
 *    rev_128(x) * rev_128(y) = rev_255(x * y)
 *
 * So by using full big-endian, we still get the right result, except
 * that it is right-shifted by 1 bit. The left-shift is relatively
 * inexpensive, and it can be mutualised.
 *
 *
 * Since SSE2 opcodes do not have facilities for shifting full 128-bit
 * values with bit precision, we have to break down values into 64-bit
 * chunks. We number chunks from 0 to 3 in left to right order.
 */

/*
 * Byte-swap a complete 128-bit value. This normally uses
 * _mm_shuffle_epi8(), which gets translated to pshufb (an SSSE3 opcode).
 * However, this crashes old Clang versions, so, for Clang before 3.8,
 * we use an alternate (and less efficient) version.
 */
#if BR_CLANG && !BR_CLANG_3_8
#define BYTESWAP_DECL
#define BYTESWAP_PREP (void)0
#define BYTESWAP(x) do { \
                __m128i byteswap1, byteswap2; \
                byteswap1 = (x); \
                byteswap2 = _mm_srli_epi16(byteswap1, 8); \
                byteswap1 = _mm_slli_epi16(byteswap1, 8); \
                byteswap1 = _mm_or_si128(byteswap1, byteswap2); \
                byteswap1 = _mm_shufflelo_epi16(byteswap1, 0x1B); \
                byteswap1 = _mm_shufflehi_epi16(byteswap1, 0x1B); \
                (x) = _mm_shuffle_epi32(byteswap1, 0x4E); \
        } while (0)
#else
#define BYTESWAP_DECL __m128i byteswap_index;
#define BYTESWAP_PREP do { \
                byteswap_index = _mm_set_epi8( \
                        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
        } while (0)
#define BYTESWAP(x) do { \
                (x) = _mm_shuffle_epi8((x), byteswap_index); \
        } while (0)
#endif

/*
 * Call pclmulqdq. Clang appears to have trouble with the intrinsic, so,
 * for that compiler, we use inline assembly. Inline assembly is
 * potentially a bit slower because the compiler does not understand
 * what the opcode does, and thus cannot optimize instruction
 * scheduling.
 *
 * We use a target of "sse2" only, so that Clang may still handle the
 * '__m128i' type and allocate SSE2 registers.
 */
#if BR_CLANG
BR_TARGET("sse2")
static inline __m128i
pclmulqdq00(__m128i x, __m128i y)
{
        __asm__ ("pclmulqdq $0x00, %1, %0" : "+x" (x) : "x" (y));
        return x;
}
BR_TARGET("sse2")
static inline __m128i
pclmulqdq11(__m128i x, __m128i y)
{
        __asm__ ("pclmulqdq $0x11, %1, %0" : "+x" (x) : "x" (y));
        return x;
}
#else
#define pclmulqdq00(x, y) _mm_clmulepi64_si128(x, y, 0x00)
#define pclmulqdq11(x, y) _mm_clmulepi64_si128(x, y, 0x11)
#endif

/*
 * From a 128-bit value kw, compute kx as the XOR of the two 64-bit
 * halves of kw (into the right half of kx; left half is unspecified).
 */
#define BK(kw, kx) do { \
                kx = _mm_xor_si128(kw, _mm_shuffle_epi32(kw, 0x0E)); \
        } while (0)

/*
 * Combine two 64-bit values (k0:k1) into a 128-bit (kw) value and
 * the XOR of the two values (kx).
 */
#define PBK(k0, k1, kw, kx) do { \
                kw = _mm_unpacklo_epi64(k1, k0); \
                kx = _mm_xor_si128(k0, k1); \
        } while (0)

/*
 * Left-shift by 1 bit a 256-bit value (in four 64-bit words).
 */
#define SL_256(x0, x1, x2, x3) do { \
                x0 = _mm_or_si128( \
                        _mm_slli_epi64(x0, 1), \
                        _mm_srli_epi64(x1, 63)); \
                x1 = _mm_or_si128( \
                        _mm_slli_epi64(x1, 1), \
                        _mm_srli_epi64(x2, 63)); \
                x2 = _mm_or_si128( \
                        _mm_slli_epi64(x2, 1), \
                        _mm_srli_epi64(x3, 63)); \
                x3 = _mm_slli_epi64(x3, 1); \
        } while (0)

/*
 * Perform reduction in GF(2^128). The 256-bit value is in x0..x3;
 * result is written in x0..x1.
 */
#define REDUCE_F128(x0, x1, x2, x3) do { \
                x1 = _mm_xor_si128( \
                        x1, \
                        _mm_xor_si128( \
                                _mm_xor_si128( \
                                        x3, \
                                        _mm_srli_epi64(x3, 1)), \
                                _mm_xor_si128( \
                                        _mm_srli_epi64(x3, 2), \
                                        _mm_srli_epi64(x3, 7)))); \
                x2 = _mm_xor_si128( \
                        _mm_xor_si128( \
                                x2, \
                                _mm_slli_epi64(x3, 63)), \
                        _mm_xor_si128( \
                                _mm_slli_epi64(x3, 62), \
                                _mm_slli_epi64(x3, 57))); \
                x0 = _mm_xor_si128( \
                        x0, \
                        _mm_xor_si128( \
                                _mm_xor_si128( \
                                        x2, \
                                        _mm_srli_epi64(x2, 1)), \
                                _mm_xor_si128( \
                                        _mm_srli_epi64(x2, 2), \
                                        _mm_srli_epi64(x2, 7)))); \
                x1 = _mm_xor_si128( \
                        _mm_xor_si128( \
                                x1, \
                                _mm_slli_epi64(x2, 63)), \
                        _mm_xor_si128( \
                                _mm_slli_epi64(x2, 62), \
                                _mm_slli_epi64(x2, 57))); \
        } while (0)

/*
 * Square value kw into (dw,dx).
 */
#define SQUARE_F128(kw, dw, dx) do { \
                __m128i z0, z1, z2, z3; \
                z1 = pclmulqdq11(kw, kw); \
                z3 = pclmulqdq00(kw, kw); \
                z0 = _mm_shuffle_epi32(z1, 0x0E); \
                z2 = _mm_shuffle_epi32(z3, 0x0E); \
                SL_256(z0, z1, z2, z3); \
                REDUCE_F128(z0, z1, z2, z3); \
                PBK(z0, z1, dw, dx); \
        } while (0)

/* see bearssl_hash.h */
BR_TARGET("ssse3,pclmul")
void
br_ghash_pclmul(void *y, const void *h, const void *data, size_t len)
{
        const unsigned char *buf1, *buf2;
        unsigned char tmp[64];
        size_t num4, num1;
        __m128i yw, h1w, h1x;
        BYTESWAP_DECL

        /*
         * We split data into two chunks. First chunk starts at buf1
         * and contains num4 blocks of 64-byte values. Second chunk
         * starts at buf2 and contains num1 blocks of 16-byte values.
         * We want the first chunk to be as large as possible.
         */
        buf1 = data;
        num4 = len >> 6;
        len &= 63;
        buf2 = buf1 + (num4 << 6);
        num1 = (len + 15) >> 4;
        if ((len & 15) != 0) {
                memcpy(tmp, buf2, len);
                memset(tmp + len, 0, (num1 << 4) - len);
                buf2 = tmp;
        }

        /*
         * Preparatory step for endian conversions.
         */
        BYTESWAP_PREP;

        /*
         * Load y and h.
         */
        yw = _mm_loadu_si128(y);
        h1w = _mm_loadu_si128(h);
        BYTESWAP(yw);
        BYTESWAP(h1w);
        BK(h1w, h1x);

        if (num4 > 0) {
                __m128i h2w, h2x, h3w, h3x, h4w, h4x;
                __m128i t0, t1, t2, t3;

                /*
                 * Compute h2 = h^2.
                 */
                SQUARE_F128(h1w, h2w, h2x);

                /*
                 * Compute h3 = h^3 = h*(h^2).
                 */
                t1 = pclmulqdq11(h1w, h2w);
                t3 = pclmulqdq00(h1w, h2w);
                t2 = _mm_xor_si128(pclmulqdq00(h1x, h2x),
                        _mm_xor_si128(t1, t3));
                t0 = _mm_shuffle_epi32(t1, 0x0E);
                t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
                t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
                SL_256(t0, t1, t2, t3);
                REDUCE_F128(t0, t1, t2, t3);
                PBK(t0, t1, h3w, h3x);

                /*
                 * Compute h4 = h^4 = (h^2)^2.
                 */
                SQUARE_F128(h2w, h4w, h4x);

                while (num4 -- > 0) {
                        __m128i aw0, aw1, aw2, aw3;
                        __m128i ax0, ax1, ax2, ax3;

                        aw0 = _mm_loadu_si128((void *)(buf1 + 0));
                        aw1 = _mm_loadu_si128((void *)(buf1 + 16));
                        aw2 = _mm_loadu_si128((void *)(buf1 + 32));
                        aw3 = _mm_loadu_si128((void *)(buf1 + 48));
                        BYTESWAP(aw0);
                        BYTESWAP(aw1);
                        BYTESWAP(aw2);
                        BYTESWAP(aw3);
                        buf1 += 64;

                        aw0 = _mm_xor_si128(aw0, yw);
                        BK(aw1, ax1);
                        BK(aw2, ax2);
                        BK(aw3, ax3);
                        BK(aw0, ax0);

                        t1 = _mm_xor_si128(
                                _mm_xor_si128(
                                        pclmulqdq11(aw0, h4w),
                                        pclmulqdq11(aw1, h3w)),
                                _mm_xor_si128(
                                        pclmulqdq11(aw2, h2w),
                                        pclmulqdq11(aw3, h1w)));
                        t3 = _mm_xor_si128(
                                _mm_xor_si128(
                                        pclmulqdq00(aw0, h4w),
                                        pclmulqdq00(aw1, h3w)),
                                _mm_xor_si128(
                                        pclmulqdq00(aw2, h2w),
                                        pclmulqdq00(aw3, h1w)));
                        t2 = _mm_xor_si128(
                                _mm_xor_si128(
                                        pclmulqdq00(ax0, h4x),
                                        pclmulqdq00(ax1, h3x)),
                                _mm_xor_si128(
                                        pclmulqdq00(ax2, h2x),
                                        pclmulqdq00(ax3, h1x)));
                        t2 = _mm_xor_si128(t2, _mm_xor_si128(t1, t3));
                        t0 = _mm_shuffle_epi32(t1, 0x0E);
                        t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
                        t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
                        SL_256(t0, t1, t2, t3);
                        REDUCE_F128(t0, t1, t2, t3);
                        yw = _mm_unpacklo_epi64(t1, t0);
                }
        }

        while (num1 -- > 0) {
                __m128i aw, ax;
                __m128i t0, t1, t2, t3;

                aw = _mm_loadu_si128((void *)buf2);
                BYTESWAP(aw);
                buf2 += 16;

                aw = _mm_xor_si128(aw, yw);
                BK(aw, ax);

                t1 = pclmulqdq11(aw, h1w);
                t3 = pclmulqdq00(aw, h1w);
                t2 = pclmulqdq00(ax, h1x);
                t2 = _mm_xor_si128(t2, _mm_xor_si128(t1, t3));
                t0 = _mm_shuffle_epi32(t1, 0x0E);
                t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
                t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
                SL_256(t0, t1, t2, t3);
                REDUCE_F128(t0, t1, t2, t3);
                yw = _mm_unpacklo_epi64(t1, t0);
        }

        BYTESWAP(yw);
        _mm_storeu_si128(y, yw);
}

BR_TARGETS_X86_DOWN

#else

/* see bearssl_hash.h */
br_ghash
br_ghash_pclmul_get(void)
{
        return 0;
}

#endif
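br_ghash_pclmul_get() is the runtime dispatcher for this file: it returns a pointer to br_ghash_pclmul when CPUID reports PCLMULQDQ support, and 0 otherwise. A sketch of how a caller would typically select an implementation (the selection helper is illustrative and not Tor's actual glue code; br_ghash is BearSSL's function-pointer type for these routines):

typedef void (*br_ghash)(void *y, const void *h, const void *data, size_t len);

br_ghash br_ghash_pclmul_get(void);
void br_ghash_ctmul64(void *y, const void *h, const void *data, size_t len);

/* Pick a GHASH implementation once, preferring the pclmulqdq version. */
static br_ghash
choose_ghash(void)
{
        br_ghash gh;

        gh = br_ghash_pclmul_get();
        if (gh == 0) {
                gh = &br_ghash_ctmul64;   /* portable constant-time fallback */
        }
        return gh;
}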