crypto: x86/poly1305 - fix overflow during partial reduction
The x86_64 implementation of Poly1305 produces the wrong result on some inputs because poly1305_4block_avx2() incorrectly assumes that when partially reducing the accumulator, the bits carried from limb 'd4' to limb 'h0' fit in a 32-bit integer. This is true for poly1305-generic which processes only one block at a time. However, it's not true for the AVX2 implementation, which processes 4 blocks at a time and therefore can produce intermediate limbs about 4x larger. Fix it by making the relevant calculations use 64-bit arithmetic rather than 32-bit. Note that most of the carries already used 64-bit arithmetic, but the d4 -> h0 carry was different for some reason. To be safe I also made the same change to the corresponding SSE2 code, though that only operates on 1 or 2 blocks at a time. I don't think it's really needed for poly1305_block_sse2(), but it doesn't hurt because it's already x86_64 code. It *might* be needed for poly1305_2block_sse2(), but overflows aren't easy to reproduce there. This bug was originally detected by my patches that improve testmgr to fuzz algorithms against their generic implementation. But also add a test vector which reproduces it directly (in the AVX2 case). Fixes:b1ccc8f4b6
("crypto: poly1305 - Add a four block AVX2 variant for x86_64") Fixes:c70f4abef0
("crypto: poly1305 - Add a SSE2 SIMD variant for x86_64") Cc: <stable@vger.kernel.org> # v4.3+ Cc: Martin Willi <martin@strongswan.org> Cc: Jason A. Donenfeld <Jason@zx2c4.com> Signed-off-by: Eric Biggers <ebiggers@google.com> Reviewed-by: Martin Willi <martin@strongswan.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
This commit is contained in:
parent
b4e9e931e9
commit
678cce4019
|
@ -323,6 +323,12 @@ ENTRY(poly1305_4block_avx2)
|
|||
vpaddq t2,t1,t1
|
||||
vmovq t1x,d4
|
||||
|
||||
# Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 ->
|
||||
# h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small
|
||||
# amount. Careful: we must not assume the carry bits 'd0 >> 26',
|
||||
# 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit
|
||||
# integers. It's true in a single-block implementation, but not here.
|
||||
|
||||
# d1 += d0 >> 26
|
||||
mov d0,%rax
|
||||
shr $26,%rax
|
||||
|
@ -361,16 +367,16 @@ ENTRY(poly1305_4block_avx2)
|
|||
# h0 += (d4 >> 26) * 5
|
||||
mov d4,%rax
|
||||
shr $26,%rax
|
||||
lea (%eax,%eax,4),%eax
|
||||
add %eax,%ebx
|
||||
lea (%rax,%rax,4),%rax
|
||||
add %rax,%rbx
|
||||
# h4 = d4 & 0x3ffffff
|
||||
mov d4,%rax
|
||||
and $0x3ffffff,%eax
|
||||
mov %eax,h4
|
||||
|
||||
# h1 += h0 >> 26
|
||||
mov %ebx,%eax
|
||||
shr $26,%eax
|
||||
mov %rbx,%rax
|
||||
shr $26,%rax
|
||||
add %eax,h1
|
||||
# h0 = h0 & 0x3ffffff
|
||||
andl $0x3ffffff,%ebx
|
||||
|
|
|
@ -253,16 +253,16 @@ ENTRY(poly1305_block_sse2)
|
|||
# h0 += (d4 >> 26) * 5
|
||||
mov d4,%rax
|
||||
shr $26,%rax
|
||||
lea (%eax,%eax,4),%eax
|
||||
add %eax,%ebx
|
||||
lea (%rax,%rax,4),%rax
|
||||
add %rax,%rbx
|
||||
# h4 = d4 & 0x3ffffff
|
||||
mov d4,%rax
|
||||
and $0x3ffffff,%eax
|
||||
mov %eax,h4
|
||||
|
||||
# h1 += h0 >> 26
|
||||
mov %ebx,%eax
|
||||
shr $26,%eax
|
||||
mov %rbx,%rax
|
||||
shr $26,%rax
|
||||
add %eax,h1
|
||||
# h0 = h0 & 0x3ffffff
|
||||
andl $0x3ffffff,%ebx
|
||||
|
@ -524,6 +524,12 @@ ENTRY(poly1305_2block_sse2)
|
|||
paddq t2,t1
|
||||
movq t1,d4
|
||||
|
||||
# Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 ->
|
||||
# h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small
|
||||
# amount. Careful: we must not assume the carry bits 'd0 >> 26',
|
||||
# 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit
|
||||
# integers. It's true in a single-block implementation, but not here.
|
||||
|
||||
# d1 += d0 >> 26
|
||||
mov d0,%rax
|
||||
shr $26,%rax
|
||||
|
@ -562,16 +568,16 @@ ENTRY(poly1305_2block_sse2)
|
|||
# h0 += (d4 >> 26) * 5
|
||||
mov d4,%rax
|
||||
shr $26,%rax
|
||||
lea (%eax,%eax,4),%eax
|
||||
add %eax,%ebx
|
||||
lea (%rax,%rax,4),%rax
|
||||
add %rax,%rbx
|
||||
# h4 = d4 & 0x3ffffff
|
||||
mov d4,%rax
|
||||
and $0x3ffffff,%eax
|
||||
mov %eax,h4
|
||||
|
||||
# h1 += h0 >> 26
|
||||
mov %ebx,%eax
|
||||
shr $26,%eax
|
||||
mov %rbx,%rax
|
||||
shr $26,%rax
|
||||
add %eax,h1
|
||||
# h0 = h0 & 0x3ffffff
|
||||
andl $0x3ffffff,%ebx
|
||||
|
|
|
@ -5634,7 +5634,49 @@ static const struct hash_testvec poly1305_tv_template[] = {
|
|||
.psize = 80,
|
||||
.digest = "\x13\x00\x00\x00\x00\x00\x00\x00"
|
||||
"\x00\x00\x00\x00\x00\x00\x00\x00",
|
||||
},
|
||||
}, { /* Regression test for overflow in AVX2 implementation */
|
||||
.plaintext = "\xff\xff\xff\xff\xff\xff\xff\xff"
|
||||
"\xff\xff\xff\xff\xff\xff\xff\xff"
|
||||
"\xff\xff\xff\xff\xff\xff\xff\xff"
|
||||
"\xff\xff\xff\xff\xff\xff\xff\xff"
|
||||
"\xff\xff\xff\xff\xff\xff\xff\xff"
|
||||
"\xff\xff\xff\xff\xff\xff\xff\xff"
|
||||
"\xff\xff\xff\xff\xff\xff\xff\xff"
|
||||
"\xff\xff\xff\xff\xff\xff\xff\xff"
|
||||
"\xff\xff\xff\xff\xff\xff\xff\xff"
|
||||
"\xff\xff\xff\xff\xff\xff\xff\xff"
|
||||
"\xff\xff\xff\xff\xff\xff\xff\xff"
|
||||
"\xff\xff\xff\xff\xff\xff\xff\xff"
|
||||
"\xff\xff\xff\xff\xff\xff\xff\xff"
|
||||
"\xff\xff\xff\xff\xff\xff\xff\xff"
|
||||
"\xff\xff\xff\xff\xff\xff\xff\xff"
|
||||
"\xff\xff\xff\xff\xff\xff\xff\xff"
|
||||
"\xff\xff\xff\xff\xff\xff\xff\xff"
|
||||
"\xff\xff\xff\xff\xff\xff\xff\xff"
|
||||
"\xff\xff\xff\xff\xff\xff\xff\xff"
|
||||
"\xff\xff\xff\xff\xff\xff\xff\xff"
|
||||
"\xff\xff\xff\xff\xff\xff\xff\xff"
|
||||
"\xff\xff\xff\xff\xff\xff\xff\xff"
|
||||
"\xff\xff\xff\xff\xff\xff\xff\xff"
|
||||
"\xff\xff\xff\xff\xff\xff\xff\xff"
|
||||
"\xff\xff\xff\xff\xff\xff\xff\xff"
|
||||
"\xff\xff\xff\xff\xff\xff\xff\xff"
|
||||
"\xff\xff\xff\xff\xff\xff\xff\xff"
|
||||
"\xff\xff\xff\xff\xff\xff\xff\xff"
|
||||
"\xff\xff\xff\xff\xff\xff\xff\xff"
|
||||
"\xff\xff\xff\xff\xff\xff\xff\xff"
|
||||
"\xff\xff\xff\xff\xff\xff\xff\xff"
|
||||
"\xff\xff\xff\xff\xff\xff\xff\xff"
|
||||
"\xff\xff\xff\xff\xff\xff\xff\xff"
|
||||
"\xff\xff\xff\xff\xff\xff\xff\xff"
|
||||
"\xff\xff\xff\xff\xff\xff\xff\xff"
|
||||
"\xff\xff\xff\xff\xff\xff\xff\xff"
|
||||
"\xff\xff\xff\xff\xff\xff\xff\xff"
|
||||
"\xff\xff\xff\xff",
|
||||
.psize = 300,
|
||||
.digest = "\xfb\x5e\x96\xd8\x61\xd5\xc7\xc8"
|
||||
"\x78\xe5\x87\xcc\x2d\x5a\x22\xe1",
|
||||
}
|
||||
};
|
||||
|
||||
/* NHPoly1305 test vectors from https://github.com/google/adiantum */
|
||||
|
|
Loading…
Reference in New Issue