From 9fb410a89e8fa92f8ebc7aa95563442a14da21eb Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Fri, 6 Jul 2018 15:13:23 -0700 Subject: nfp: bpf: migrate to advanced reciprocal divide in reciprocal_div.h As we are doing JIT, we would want to use the advanced version of the reciprocal divide (reciprocal_value_adv) to trade performance with host. We could reduce the required ALU instructions from 4 to 2 or 1. Signed-off-by: Jiong Wang Reviewed-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/netronome/nfp/bpf/jit.c | 64 +++++++++++++++++++++++----- 1 file changed, 53 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c index 7c9ee3d85907..1d9e36835404 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c @@ -1497,8 +1497,8 @@ wrp_mul(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, static int wrp_div_imm(struct nfp_prog *nfp_prog, u8 dst, u64 imm) { swreg dst_both = reg_both(dst), dst_a = reg_a(dst), dst_b = reg_a(dst); - struct reciprocal_value rvalue; - swreg tmp_b = imm_b(nfp_prog); + struct reciprocal_value_adv rvalue; + u8 pre_shift, exp; swreg magic; if (imm > U32_MAX) { @@ -1506,16 +1506,58 @@ static int wrp_div_imm(struct nfp_prog *nfp_prog, u8 dst, u64 imm) return 0; } - rvalue = reciprocal_value(imm); + /* NOTE: because we are using "reciprocal_value_adv" which doesn't + * support "divisor > (1u << 31)", we need to JIT separate NFP sequence + * to handle such case which actually equals to the result of unsigned + * comparison "dst >= imm" which could be calculated using the following + * NFP sequence: + * + * alu[--, dst, -, imm] + * immed[imm, 0] + * alu[dst, imm, +carry, 0] + * + */ + if (imm > 1U << 31) { + swreg tmp_b = ur_load_imm_any(nfp_prog, imm, imm_b(nfp_prog)); + + emit_alu(nfp_prog, reg_none(), dst_a, ALU_OP_SUB, tmp_b); + wrp_immed(nfp_prog, imm_a(nfp_prog), 0); + emit_alu(nfp_prog, dst_both, imm_a(nfp_prog), ALU_OP_ADD_C, + reg_imm(0)); + return 0; + } + + rvalue = reciprocal_value_adv(imm, 32); + exp = rvalue.exp; + if (rvalue.is_wide_m && !(imm & 1)) { + pre_shift = fls(imm & -imm) - 1; + rvalue = reciprocal_value_adv(imm >> pre_shift, 32 - pre_shift); + } else { + pre_shift = 0; + } magic = ur_load_imm_any(nfp_prog, rvalue.m, imm_b(nfp_prog)); - wrp_mul_u32(nfp_prog, imm_both(nfp_prog), reg_none(), dst_a, magic, - true); - emit_alu(nfp_prog, dst_both, dst_a, ALU_OP_SUB, tmp_b); - emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE, dst_b, - SHF_SC_R_SHF, rvalue.sh1); - emit_alu(nfp_prog, dst_both, dst_a, ALU_OP_ADD, tmp_b); - emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE, dst_b, - SHF_SC_R_SHF, rvalue.sh2); + if (imm == 1U << exp) { + emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE, dst_b, + SHF_SC_R_SHF, exp); + } else if (rvalue.is_wide_m) { + wrp_mul_u32(nfp_prog, imm_both(nfp_prog), reg_none(), dst_a, + magic, true); + emit_alu(nfp_prog, dst_both, dst_a, ALU_OP_SUB, + imm_b(nfp_prog)); + emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE, dst_b, + SHF_SC_R_SHF, 1); + emit_alu(nfp_prog, dst_both, dst_a, ALU_OP_ADD, + imm_b(nfp_prog)); + emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE, dst_b, + SHF_SC_R_SHF, rvalue.sh - 1); + } else { + if (pre_shift) + emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE, + dst_b, SHF_SC_R_SHF, pre_shift); + wrp_mul_u32(nfp_prog, dst_both, reg_none(), dst_a, magic, true); + emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE, + dst_b, SHF_SC_R_SHF, rvalue.sh); + } return 0; } -- cgit v1.2.3