diff options
author | Martin Willi <martin@strongswan.org> | 2015-07-16 19:14:07 +0200 |
---|---|---|
committer | Herbert Xu <herbert@gondor.apana.org.au> | 2015-07-17 15:20:28 +0200 |
commit | da35b22df301128ba2f2238befd6d75067e80315 (patch) | |
tree | 488418a78080098444586a312ffbb4002e9b6583 /arch/x86/crypto/poly1305_glue.c | |
parent | crypto: poly1305 - Add a SSE2 SIMD variant for x86_64 (diff) | |
download | linux-da35b22df301128ba2f2238befd6d75067e80315.tar.xz linux-da35b22df301128ba2f2238befd6d75067e80315.zip |
crypto: poly1305 - Add a two block SSE2 variant for x86_64
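Extend the x86_64 SSE2 Poly1305 authenticator with a function that processes
two consecutive Poly1305 blocks in parallel using a derived key r^2. Since
((h + m1) * r + m2) * r = (h + m1) * r^2 + m2 * r, the two multiplications
are independent of each other and can be computed side by side. This kind of
loop unrolling maps more effectively to SSE instructions, further increasing
throughput.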
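For large messages, throughput increases by ~45-65% compared to single-block
SSE2. The first listing below is the single-block SSE2 baseline; the second
listing is with the two-block variant added by this patch: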
testing speed of poly1305 (poly1305-simd)
test 0 ( 96 byte blocks, 16 bytes per update, 6 updates): 3790063 opers/sec, 363846076 bytes/sec
test 1 ( 96 byte blocks, 32 bytes per update, 3 updates): 5913378 opers/sec, 567684355 bytes/sec
test 2 ( 96 byte blocks, 96 bytes per update, 1 updates): 9352574 opers/sec, 897847104 bytes/sec
test 3 ( 288 byte blocks, 16 bytes per update, 18 updates): 1362145 opers/sec, 392297990 bytes/sec
test 4 ( 288 byte blocks, 32 bytes per update, 9 updates): 2007075 opers/sec, 578037628 bytes/sec
test 5 ( 288 byte blocks, 288 bytes per update, 1 updates): 3709811 opers/sec, 1068425798 bytes/sec
test 6 ( 1056 byte blocks, 32 bytes per update, 33 updates): 566272 opers/sec, 597984182 bytes/sec
test 7 ( 1056 byte blocks, 1056 bytes per update, 1 updates): 1111657 opers/sec, 1173910108 bytes/sec
test 8 ( 2080 byte blocks, 32 bytes per update, 65 updates): 288857 opers/sec, 600823808 bytes/sec
test 9 ( 2080 byte blocks, 2080 bytes per update, 1 updates): 590746 opers/sec, 1228751888 bytes/sec
test 10 ( 4128 byte blocks, 4128 bytes per update, 1 updates): 301825 opers/sec, 1245936902 bytes/sec
test 11 ( 8224 byte blocks, 8224 bytes per update, 1 updates): 153075 opers/sec, 1258896201 bytes/sec
testing speed of poly1305 (poly1305-simd)
test 0 ( 96 byte blocks, 16 bytes per update, 6 updates): 3809514 opers/sec, 365713411 bytes/sec
test 1 ( 96 byte blocks, 32 bytes per update, 3 updates): 5973423 opers/sec, 573448627 bytes/sec
test 2 ( 96 byte blocks, 96 bytes per update, 1 updates): 9446779 opers/sec, 906890803 bytes/sec
test 3 ( 288 byte blocks, 16 bytes per update, 18 updates): 1364814 opers/sec, 393066691 bytes/sec
test 4 ( 288 byte blocks, 32 bytes per update, 9 updates): 2045780 opers/sec, 589184697 bytes/sec
test 5 ( 288 byte blocks, 288 bytes per update, 1 updates): 3711946 opers/sec, 1069040592 bytes/sec
test 6 ( 1056 byte blocks, 32 bytes per update, 33 updates): 573686 opers/sec, 605812732 bytes/sec
test 7 ( 1056 byte blocks, 1056 bytes per update, 1 updates): 1647802 opers/sec, 1740079440 bytes/sec
test 8 ( 2080 byte blocks, 32 bytes per update, 65 updates): 292970 opers/sec, 609378224 bytes/sec
test 9 ( 2080 byte blocks, 2080 bytes per update, 1 updates): 943229 opers/sec, 1961916528 bytes/sec
test 10 ( 4128 byte blocks, 4128 bytes per update, 1 updates): 494623 opers/sec, 2041804569 bytes/sec
test 11 ( 8224 byte blocks, 8224 bytes per update, 1 updates): 254045 opers/sec, 2089271014 bytes/sec
Benchmark results from a Core i5-4670T.
Signed-off-by: Martin Willi <martin@strongswan.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Diffstat (limited to 'arch/x86/crypto/poly1305_glue.c')
-rw-r--r-- | arch/x86/crypto/poly1305_glue.c | 54 |
1 files changed, 49 insertions, 5 deletions
```diff
diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c
index 1e59274a0af3..b7c33d0d9ef2 100644
--- a/arch/x86/crypto/poly1305_glue.c
+++ b/arch/x86/crypto/poly1305_glue.c
@@ -18,24 +18,68 @@
 #include <asm/fpu/api.h>
 #include <asm/simd.h>
 
+struct poly1305_simd_desc_ctx {
+	struct poly1305_desc_ctx base;
+	/* derived key u set? */
+	bool uset;
+	/* derived Poly1305 key r^2 */
+	u32 u[5];
+};
+
 asmlinkage void poly1305_block_sse2(u32 *h, const u8 *src,
 				    const u32 *r, unsigned int blocks);
+asmlinkage void poly1305_2block_sse2(u32 *h, const u8 *src, const u32 *r,
+				     unsigned int blocks, const u32 *u);
+
+static int poly1305_simd_init(struct shash_desc *desc)
+{
+	struct poly1305_simd_desc_ctx *sctx = shash_desc_ctx(desc);
+
+	sctx->uset = false;
+
+	return crypto_poly1305_init(desc);
+}
+
+static void poly1305_simd_mult(u32 *a, const u32 *b)
+{
+	u8 m[POLY1305_BLOCK_SIZE];
+
+	memset(m, 0, sizeof(m));
+	/* The poly1305 block function adds a hi-bit to the accumulator which
+	 * we don't need for key multiplication; compensate for it. */
+	a[4] -= 1 << 24;
+	poly1305_block_sse2(a, m, b, 1);
+}
 
 static unsigned int poly1305_simd_blocks(struct poly1305_desc_ctx *dctx,
 					 const u8 *src, unsigned int srclen)
 {
+	struct poly1305_simd_desc_ctx *sctx;
 	unsigned int blocks, datalen;
 
+	BUILD_BUG_ON(offsetof(struct poly1305_simd_desc_ctx, base));
+	sctx = container_of(dctx, struct poly1305_simd_desc_ctx, base);
+
 	if (unlikely(!dctx->sset)) {
 		datalen = crypto_poly1305_setdesckey(dctx, src, srclen);
 		src += srclen - datalen;
 		srclen = datalen;
 	}
 
+	if (likely(srclen >= POLY1305_BLOCK_SIZE * 2)) {
+		if (unlikely(!sctx->uset)) {
+			memcpy(sctx->u, dctx->r, sizeof(sctx->u));
+			poly1305_simd_mult(sctx->u, dctx->r);
+			sctx->uset = true;
+		}
+		blocks = srclen / (POLY1305_BLOCK_SIZE * 2);
+		poly1305_2block_sse2(dctx->h, src, dctx->r, blocks, sctx->u);
+		src += POLY1305_BLOCK_SIZE * 2 * blocks;
+		srclen -= POLY1305_BLOCK_SIZE * 2 * blocks;
+	}
 	if (srclen >= POLY1305_BLOCK_SIZE) {
-		blocks = srclen / POLY1305_BLOCK_SIZE;
-		poly1305_block_sse2(dctx->h, src, dctx->r, blocks);
-		srclen -= POLY1305_BLOCK_SIZE * blocks;
+		poly1305_block_sse2(dctx->h, src, dctx->r, 1);
+		srclen -= POLY1305_BLOCK_SIZE;
 	}
 	return srclen;
 }
@@ -84,11 +128,11 @@ static int poly1305_simd_update(struct shash_desc *desc,
 
 static struct shash_alg alg = {
 	.digestsize	= POLY1305_DIGEST_SIZE,
-	.init		= crypto_poly1305_init,
+	.init		= poly1305_simd_init,
 	.update		= poly1305_simd_update,
 	.final		= crypto_poly1305_final,
 	.setkey		= crypto_poly1305_setkey,
-	.descsize	= sizeof(struct poly1305_desc_ctx),
+	.descsize	= sizeof(struct poly1305_simd_desc_ctx),
 	.base		= {
 		.cra_name		= "poly1305",
 		.cra_driver_name	= "poly1305-simd",
```