diff --git a/cranelift/codegen/src/opts/icmp.isle b/cranelift/codegen/src/opts/icmp.isle index a33eb52d0933..4b9494a621e2 100644 --- a/cranelift/codegen/src/opts/icmp.isle +++ b/cranelift/codegen/src/opts/icmp.isle @@ -414,3 +414,39 @@ (iconst_u _ k2))) (if-let false (u64_eq k1 k2)) (ne select_ty inner_cond (iconst_u inner_ty 0))) + +;;;;; Boolean-context simplifications for `ctz` and `clz` ;;;;;;;;;;;;;;;;;;;;;; +;; +;; When a count-trailing/leading-zeros instruction's result is fed into a +;; comparison against zero (the consumer cares whether the count is zero, +;; not its numeric value), rewrite to test the corresponding bit of X +;; directly: +;; +;; ctz(X) == 0 iff LSB of X is set iff (X & 1) != 0 +;; clz(X) == 0 iff MSB of X is set iff X is signed-negative +;; +;; LZCNT/TZCNT/BSF/BSR each cost ~3 cycles on Intel and write a GPR (creating +;; a false dependency); the rewritten forms emit a single-cycle `test` whose +;; result lives only in flags. JIT-less interpreters benefit even more — their +;; bit-counting paths are typically loops. +;; +;; The matching wasm-side fold is in WebAssembly/binaryen#8562 (LSB→ctz under +;; `-Os`). With these mid-end rules in place, that fold becomes cycle-neutral +;; on cranelift JITs even when produced unconditionally. + +;; ctz(X) == 0 iff the LSB of X is 1, i.e. (X & 1) != 0. +(rule (simplify (eq result_ty (ctz x_ty X) (iconst_u _ 0))) + (ne result_ty (band x_ty X (iconst_u x_ty 1)) (iconst_u x_ty 0))) + +;; ctz(X) != 0 iff the LSB of X is 0, i.e. (X & 1) == 0. +(rule (simplify (ne result_ty (ctz x_ty X) (iconst_u _ 0))) + (eq result_ty (band x_ty X (iconst_u x_ty 1)) (iconst_u x_ty 0))) + +;; clz(X) == 0 iff the MSB of X is 1, i.e. X is signed-negative. +;; Lowers to `test X, X; jl` on x86_64 — single-instruction sign-bit test. +(rule (simplify (eq result_ty (clz x_ty X) (iconst_u _ 0))) + (slt result_ty X (iconst_u x_ty 0))) + +;; clz(X) != 0 iff the MSB of X is 0, i.e. X is signed-non-negative.
+(rule (simplify (ne result_ty (clz x_ty X) (iconst_u _ 0))) + (sge result_ty X (iconst_u x_ty 0))) diff --git a/cranelift/filetests/filetests/egraph/cnt-bool-context.clif b/cranelift/filetests/filetests/egraph/cnt-bool-context.clif new file mode 100644 index 000000000000..1e8c53a2ee5b --- /dev/null +++ b/cranelift/filetests/filetests/egraph/cnt-bool-context.clif @@ -0,0 +1,98 @@ +test optimize precise-output +set opt_level=speed +target x86_64 + +;; Boolean-context simplifications for ctz / clz: the result of the +;; bit-counting instruction is only used to test "is it zero?", which +;; reduces to a direct bit test on X. +;; +;; ctz(X) == 0 iff LSB of X set iff (X & 1) != 0 +;; clz(X) == 0 iff MSB of X set iff X is signed-negative + +;; ctz(X) == 0 → (X & 1) != 0 +function %ctz_eq_zero_i32(i32) -> i8 { +block0(v0: i32): + v1 = ctz v0 + v2 = iconst.i32 0 + v3 = icmp eq v1, v2 + return v3 +} + +; function %ctz_eq_zero_i32(i32) -> i8 fast { +; block0(v0: i32): +; v4 = iconst.i32 1 +; v5 = band v0, v4 ; v4 = 1 +; v2 = iconst.i32 0 +; v6 = icmp ne v5, v2 ; v2 = 0 +; return v6 +; } + +;; ctz(X) != 0 → (X & 1) == 0 +function %ctz_ne_zero_i64(i64) -> i8 { +block0(v0: i64): + v1 = ctz v0 + v2 = iconst.i64 0 + v3 = icmp ne v1, v2 + return v3 +} + +; function %ctz_ne_zero_i64(i64) -> i8 fast { +; block0(v0: i64): +; v4 = iconst.i64 1 +; v5 = band v0, v4 ; v4 = 1 +; v2 = iconst.i64 0 +; v6 = icmp eq v5, v2 ; v2 = 0 +; return v6 +; } + +;; clz(X) == 0 → X <signed 0 (i32 case) +function %clz_eq_zero_i32(i32) -> i8 { +block0(v0: i32): + v1 = clz v0 + v2 = iconst.i32 0 + v3 = icmp eq v1, v2 + return v3 +} + +; function %clz_eq_zero_i32(i32) -> i8 fast { +; block0(v0: i32): +; v2 = iconst.i32 0 +; v4 = icmp slt v0, v2 ; v2 = 0 +; return v4 +; } + +;; clz(X) != 0 → X >=signed 0 (i64 case) +function %clz_ne_zero_i64(i64) -> i8 { +block0(v0: i64): + v1 = clz v0 + v2 = iconst.i64 0 + v3 = icmp ne v1, v2 + return v3 +} + +; function %clz_ne_zero_i64(i64) -> i8 fast { +; block0(v0: i64): +; v2 = iconst.i64 0 +; v4 = icmp sge v0, v2 ; v2 = 0
+; return v4 +; } + +;; Negative test: only the comparison-against-zero pattern fires. +;; `ctz(X) == 4` is a numeric-value test on the count, not a boolean, +;; and must be left alone. +function %ctz_eq_nonzero_i32(i32) -> i8 { +block0(v0: i32): + v1 = ctz v0 + v2 = iconst.i32 4 + v3 = icmp eq v1, v2 + return v3 +} + +; function %ctz_eq_nonzero_i32(i32) -> i8 fast { +; block0(v0: i32): +; v1 = ctz v0 +; v2 = iconst.i32 4 +; v3 = icmp eq v1, v2 ; v2 = 4 +; return v3 +; } + diff --git a/tests/disas/ctz-clz-bool-condition.wat b/tests/disas/ctz-clz-bool-condition.wat new file mode 100644 index 000000000000..9e06d200f77f --- /dev/null +++ b/tests/disas/ctz-clz-bool-condition.wat @@ -0,0 +1,280 @@ +;;! target = 'x86_64' +;;! test = 'compile' + +;; End-to-end check that boolean-context comparisons of `ctz`/`clz` against +;; zero collapse to the corresponding bit test (LSB / sign), per the egraph +;; rewrites in `cranelift/codegen/src/opts/icmp.isle`. +;; +;; Layout: `if`/`select`/`eqz` consumers (not all repeated per operator/width) +;; over the explicit `(ctz/clz x) == 0` and `(ctz/clz x) != 0` icmp shapes, plus +;; the wasm-natural `if (ctz/clz x)` form (no icmp interposed) that non-Rust +;; frontends like Motoko's `moc` emit; that form still lowers to `bsf`/`bsr`.
+ +(module + ;; ----- ctz, i32 ------------------------------------------------------- + + (func $if_ctz_eq0_i32 (param i32) (result i32) + (i32.eq (i32.ctz (local.get 0)) (i32.const 0)) + if (result i32) i32.const 100 else i32.const 200 end) + (func $if_ctz_ne0_i32 (param i32) (result i32) + (i32.ne (i32.ctz (local.get 0)) (i32.const 0)) + if (result i32) i32.const 100 else i32.const 200 end) + (func $if_ctz_bare_i32 (param i32) (result i32) + (i32.ctz (local.get 0)) + if (result i32) i32.const 100 else i32.const 200 end) + (func $select_ctz_eq0_i32 (param i32 i32 i32) (result i32) + local.get 1 local.get 2 + (i32.eq (i32.ctz (local.get 0)) (i32.const 0)) + select) + (func $eqz_ctz_eq0_i32 (param i32) (result i32) + (i32.eq (i32.ctz (local.get 0)) (i32.const 0)) + i32.eqz) + + ;; ----- ctz, i64 ------------------------------------------------------- + + (func $if_ctz_eq0_i64 (param i64) (result i32) + (i64.eq (i64.ctz (local.get 0)) (i64.const 0)) + if (result i32) i32.const 100 else i32.const 200 end) + (func $if_ctz_ne0_i64 (param i64) (result i32) + (i64.ne (i64.ctz (local.get 0)) (i64.const 0)) + if (result i32) i32.const 100 else i32.const 200 end) + ;; Wasm-natural shape: `i64.ctz` produces i64, narrowed via `i32.wrap_i64` + ;; before `if`. This is exactly what moc emits for the EOP compactness + ;; discriminator. 
+ (func $if_ctz_bare_i64 (param i64) (result i32) + (i64.ctz (local.get 0)) i32.wrap_i64 + if (result i32) i32.const 100 else i32.const 200 end) + (func $select_ctz_eq0_i64 (param i64 i32 i32) (result i32) + local.get 1 local.get 2 + (i64.eq (i64.ctz (local.get 0)) (i64.const 0)) + select) + + ;; ----- clz, i32 (sign-bit tests) -------------------------------------- + + (func $if_clz_eq0_i32 (param i32) (result i32) + (i32.eq (i32.clz (local.get 0)) (i32.const 0)) + if (result i32) i32.const 100 else i32.const 200 end) + (func $if_clz_ne0_i32 (param i32) (result i32) + (i32.ne (i32.clz (local.get 0)) (i32.const 0)) + if (result i32) i32.const 100 else i32.const 200 end) + (func $if_clz_bare_i32 (param i32) (result i32) + (i32.clz (local.get 0)) + if (result i32) i32.const 100 else i32.const 200 end) + (func $select_clz_eq0_i32 (param i32 i32 i32) (result i32) + local.get 1 local.get 2 + (i32.eq (i32.clz (local.get 0)) (i32.const 0)) + select) + + ;; ----- clz, i64 ------------------------------------------------------- + + (func $if_clz_eq0_i64 (param i64) (result i32) + (i64.eq (i64.clz (local.get 0)) (i64.const 0)) + if (result i32) i32.const 100 else i32.const 200 end) + (func $if_clz_ne0_i64 (param i64) (result i32) + (i64.ne (i64.clz (local.get 0)) (i64.const 0)) + if (result i32) i32.const 100 else i32.const 200 end) + + ;; ----- negative test: numeric comparison must NOT collapse ------------ + ;; `ctz(x) == 4` is an arithmetic test on the count, not a boolean + ;; context, so the egraph should leave it alone. 
+ (func $if_ctz_eq4_i32 (param i32) (result i32) + (i32.eq (i32.ctz (local.get 0)) (i32.const 4)) + if (result i32) i32.const 100 else i32.const 200 end) +) +;; wasm[0]::function[0]::if_ctz_eq0_i32: +;; pushq %rbp +;; movq %rsp, %rbp +;; testl $1, %edx +;; jne 0x1a +;; 10: movl $0xc8, %eax +;; jmp 0x1f +;; 1a: movl $0x64, %eax +;; movq %rbp, %rsp +;; popq %rbp +;; retq +;; +;; wasm[0]::function[1]::if_ctz_ne0_i32: +;; pushq %rbp +;; movq %rsp, %rbp +;; testl $1, %edx +;; je 0x5a +;; 50: movl $0xc8, %eax +;; jmp 0x5f +;; 5a: movl $0x64, %eax +;; movq %rbp, %rsp +;; popq %rbp +;; retq +;; +;; wasm[0]::function[2]::if_ctz_bare_i32: +;; pushq %rbp +;; movq %rsp, %rbp +;; movl $0x20, %esi +;; bsfl %edx, %r9d +;; cmovel %esi, %r9d +;; testl %r9d, %r9d +;; jne 0xa4 +;; 9a: movl $0xc8, %eax +;; jmp 0xa9 +;; a4: movl $0x64, %eax +;; movq %rbp, %rsp +;; popq %rbp +;; retq +;; +;; wasm[0]::function[3]::select_ctz_eq0_i32: +;; pushq %rbp +;; movq %rsp, %rbp +;; testl $1, %edx +;; movq %r8, %rax +;; cmovnel %ecx, %eax +;; movq %rbp, %rsp +;; popq %rbp +;; retq +;; +;; wasm[0]::function[4]::eqz_ctz_eq0_i32: +;; pushq %rbp +;; movq %rsp, %rbp +;; testl $1, %edx +;; sete %sil +;; movzbl %sil, %eax +;; movq %rbp, %rsp +;; popq %rbp +;; retq +;; +;; wasm[0]::function[5]::if_ctz_eq0_i64: +;; pushq %rbp +;; movq %rsp, %rbp +;; testq $1, %rdx +;; jne 0x11b +;; 111: movl $0xc8, %eax +;; jmp 0x120 +;; 11b: movl $0x64, %eax +;; movq %rbp, %rsp +;; popq %rbp +;; retq +;; +;; wasm[0]::function[6]::if_ctz_ne0_i64: +;; pushq %rbp +;; movq %rsp, %rbp +;; testq $1, %rdx +;; je 0x15b +;; 151: movl $0xc8, %eax +;; jmp 0x160 +;; 15b: movl $0x64, %eax +;; movq %rbp, %rsp +;; popq %rbp +;; retq +;; +;; wasm[0]::function[7]::if_ctz_bare_i64: +;; pushq %rbp +;; movq %rsp, %rbp +;; movl $0x40, %esi +;; bsfq %rdx, %r9 +;; cmoveq %rsi, %r9 +;; testl %r9d, %r9d +;; jne 0x1a4 +;; 19a: movl $0xc8, %eax +;; jmp 0x1a9 +;; 1a4: movl $0x64, %eax +;; movq %rbp, %rsp +;; popq %rbp +;; retq +;; +;; 
wasm[0]::function[8]::select_ctz_eq0_i64: +;; pushq %rbp +;; movq %rsp, %rbp +;; testq $1, %rdx +;; movq %r8, %rax +;; cmovnel %ecx, %eax +;; movq %rbp, %rsp +;; popq %rbp +;; retq +;; +;; wasm[0]::function[9]::if_clz_eq0_i32: +;; pushq %rbp +;; movq %rsp, %rbp +;; testl %edx, %edx +;; jl 0x1f6 +;; 1ec: movl $0xc8, %eax +;; jmp 0x1fb +;; 1f6: movl $0x64, %eax +;; movq %rbp, %rsp +;; popq %rbp +;; retq +;; +;; wasm[0]::function[10]::if_clz_ne0_i32: +;; pushq %rbp +;; movq %rsp, %rbp +;; testl %edx, %edx +;; jge 0x216 +;; 20c: movl $0xc8, %eax +;; jmp 0x21b +;; 216: movl $0x64, %eax +;; movq %rbp, %rsp +;; popq %rbp +;; retq +;; +;; wasm[0]::function[11]::if_clz_bare_i32: +;; pushq %rbp +;; movq %rsp, %rbp +;; movq $18446744073709551615, %rsi +;; bsrl %edx, %r9d +;; cmovel %esi, %r9d +;; movl $0x1f, %eax +;; subl %r9d, %eax +;; testl %eax, %eax +;; jne 0x24d +;; 243: movl $0xc8, %eax +;; jmp 0x252 +;; 24d: movl $0x64, %eax +;; movq %rbp, %rsp +;; popq %rbp +;; retq +;; +;; wasm[0]::function[12]::select_clz_eq0_i32: +;; pushq %rbp +;; movq %rsp, %rbp +;; testl %edx, %edx +;; movq %r8, %rax +;; cmovll %ecx, %eax +;; movq %rbp, %rsp +;; popq %rbp +;; retq +;; +;; wasm[0]::function[13]::if_clz_eq0_i64: +;; pushq %rbp +;; movq %rsp, %rbp +;; testq %rdx, %rdx +;; jl 0x297 +;; 28d: movl $0xc8, %eax +;; jmp 0x29c +;; 297: movl $0x64, %eax +;; movq %rbp, %rsp +;; popq %rbp +;; retq +;; +;; wasm[0]::function[14]::if_clz_ne0_i64: +;; pushq %rbp +;; movq %rsp, %rbp +;; testq %rdx, %rdx +;; jge 0x2d7 +;; 2cd: movl $0xc8, %eax +;; jmp 0x2dc +;; 2d7: movl $0x64, %eax +;; movq %rbp, %rsp +;; popq %rbp +;; retq +;; +;; wasm[0]::function[15]::if_ctz_eq4_i32: +;; pushq %rbp +;; movq %rsp, %rbp +;; movl $0x20, %esi +;; bsfl %edx, %r9d +;; cmovel %esi, %r9d +;; cmpl $4, %r9d +;; je 0x325 +;; 31b: movl $0xc8, %eax +;; jmp 0x32a +;; 325: movl $0x64, %eax +;; movq %rbp, %rsp +;; popq %rbp +;; retq