diff --git a/.github/scripts/compare-jmh.py b/.github/scripts/compare-jmh.py new file mode 100644 index 000000000..39c1a5d0f --- /dev/null +++ b/.github/scripts/compare-jmh.py @@ -0,0 +1,419 @@ +#!/usr/bin/env python3 +"""Compare two sets of JMH JSON results and emit a markdown summary. + +Used by `.github/workflows/benchmarks.yml` to diff the latest scheduled +`main` benchmark run against the run that just finished for a PR. + +For each (benchmark, params) pair common to both runs we report two +metrics: + +* `Time` — `primaryMetric.score`. In `SampleTime` mode this is the + mean sampled latency per op; it's our best available proxy for CPU + work since no dedicated CPU profiler is configured in + `BenchmarkRunner`. +* `Alloc/op` — `secondaryMetrics["·gc.alloc.rate.norm"]`, populated by + JMH's `GCProfiler`. This is bytes allocated per benchmark op and is + the standard, low-noise JMH memory metric. + +Both metrics are "lower is better", so a positive delta indicates the +PR is worse than the baseline. A run is considered failed when **any** +benchmark's worst metric delta exceeds `--threshold-pct` in the worse +direction. The script writes a `regressions=...`/`improvements=...` +summary file the workflow uses to set step outputs and decide whether +to fail the job. +""" + +from __future__ import annotations + +import argparse +import glob +import json +import os +import sys +from dataclasses import dataclass +from typing import Any, Callable, Dict, List, Optional, Tuple + +Key = Tuple[str, str] + + +# --------------------------------------------------------------------------- +# Metric model +# --------------------------------------------------------------------------- + +# JMH's `GCProfiler` reports allocation rate normalised per op under this +# secondary metric key (the leading char is U+00B7 MIDDLE DOT, not a regular +# dot — that's JMH's convention for profiler-emitted metrics). +ALLOC_NORM_KEY = "\u00b7gc.alloc.rate.norm" + + +@dataclass(frozen=True) +class Metric: + id: str + label: str + # Pull the `{score, scoreError, scoreUnit}`-shaped dict from a JMH record. + extract: Callable[[Dict[str, Any]], Optional[Dict[str, Any]]] + # True when a higher score is worse (regression). Both of our metrics + # are lower-is-better so this is always True today, but the model + # supports e.g. Throughput mode trivially. 
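+    # For instance, a hypothetical Throughput-mode metric (higher ops/s is +    # better) would simply flip the flag: +    #   Metric(id="thrpt", label="Throughput", extract=_primary, +    #          higher_is_worse=False)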
+ higher_is_worse: bool = True + + +def _primary(record: Dict[str, Any]) -> Optional[Dict[str, Any]]: + pm = record.get("primaryMetric") + return pm if isinstance(pm, dict) else None + + +def _secondary(record: Dict[str, Any], key: str) -> Optional[Dict[str, Any]]: + sm = record.get("secondaryMetrics") or {} + val = sm.get(key) + return val if isinstance(val, dict) else None + + +METRICS: List[Metric] = [ + Metric(id="time", label="Time", extract=_primary), + Metric( + id="alloc", + label="Alloc/op", + extract=lambda r: _secondary(r, ALLOC_NORM_KEY), + ), +] + + +# --------------------------------------------------------------------------- +# Loading & helpers +# --------------------------------------------------------------------------- + + +def load_results(directory: str) -> Dict[Key, Dict[str, Any]]: + by_key: Dict[Key, Dict[str, Any]] = {} + paths = sorted( + glob.glob(os.path.join(directory, "**", "jmh-results-*.json"), recursive=True) + ) + for path in paths: + try: + with open(path, "r", encoding="utf-8") as fh: + data = json.load(fh) + except (OSError, json.JSONDecodeError) as exc: + print(f"warn: could not load {path}: {exc}", file=sys.stderr) + continue + if not isinstance(data, list): + continue + for record in data: + bench = record.get("benchmark") + if not bench: + continue + params = record.get("params") or {} + param_str = ", ".join(f"{k}={params[k]}" for k in sorted(params)) + by_key[(bench, param_str)] = record + return by_key + + +def _float(d: Optional[Dict[str, Any]], key: str) -> Optional[float]: + if not d: + return None + val = d.get(key) + try: + return float(val) if val is not None else None + except (TypeError, ValueError): + return None + + +def short_bench(name: str) -> str: + parts = name.split(".") + return ".".join(parts[-2:]) if len(parts) >= 2 else name + + +def fmt_score(v: Optional[float], err: Optional[float], unit: str) -> str: + if v is None: + return "—" + body = f"{v:.3g} ± {err:.2g}" if err is not None else f"{v:.3g}" + return f"{body} {unit}".rstrip() + + +def fmt_delta(d: Optional[float]) -> str: + if d is None: + return "—" + sign = "+" if d >= 0 else "" + return f"{sign}{d:.2f}%" + + +# --------------------------------------------------------------------------- +# Comparison +# --------------------------------------------------------------------------- + + +@dataclass +class MetricDelta: + metric: Metric + baseline: Optional[float] + current: Optional[float] + baseline_err: Optional[float] + current_err: Optional[float] + unit: str + delta_pct: Optional[float] + + def regression(self, threshold: float) -> bool: + if self.delta_pct is None: + return False + signed = self.delta_pct if self.metric.higher_is_worse else -self.delta_pct + return signed > threshold + + def improvement(self, threshold: float) -> bool: + if self.delta_pct is None: + return False + signed = self.delta_pct if self.metric.higher_is_worse else -self.delta_pct + return signed < -threshold + + def cell(self) -> str: + b = fmt_score(self.baseline, self.baseline_err, self.unit) + c = fmt_score(self.current, self.current_err, self.unit) + return f"{b} → {c} ({fmt_delta(self.delta_pct)})" + + +def metric_delta(metric: Metric, baseline_rec: Dict[str, Any], current_rec: Dict[str, Any]) -> MetricDelta: + b = metric.extract(baseline_rec) + c = metric.extract(current_rec) + bs = _float(b, "score") + cs = _float(c, "score") + be = _float(b, "scoreError") + ce = _float(c, "scoreError") + unit = (c or {}).get("scoreUnit") or (b or {}).get("scoreUnit") or "" + if bs is None or cs is None 
or bs == 0: + delta_pct: Optional[float] = None + else: + delta_pct = (cs - bs) / bs * 100.0 + return MetricDelta( + metric=metric, + baseline=bs, + current=cs, + baseline_err=be, + current_err=ce, + unit=unit, + delta_pct=delta_pct, + ) + + +@dataclass +class Row: + key: Key + deltas: List[MetricDelta] + + def worst_signed_pct(self) -> float: + worst = 0.0 + for d in self.deltas: + if d.delta_pct is None: + continue + signed = d.delta_pct if d.metric.higher_is_worse else -d.delta_pct + if signed > worst: + worst = signed + return worst + + def sort_key(self) -> float: + best = 0.0 + for d in self.deltas: + if d.delta_pct is None: + continue + if abs(d.delta_pct) > best: + best = abs(d.delta_pct) + return best + + def status(self, threshold: float) -> str: + regressed = [d for d in self.deltas if d.regression(threshold)] + improved = [d for d in self.deltas if d.improvement(threshold)] + if regressed: + labels = ", ".join(d.metric.label for d in regressed) + return f"REGRESSION ({labels})" + if improved: + labels = ", ".join(d.metric.label for d in improved) + return f"improvement ({labels})" + return "" + + +def build_rows( + baseline: Dict[Key, Dict[str, Any]], + current: Dict[Key, Dict[str, Any]], +) -> List[Row]: + rows: List[Row] = [] + for key in sorted(set(current) & set(baseline)): + b = baseline[key] + c = current[key] + deltas = [metric_delta(m, b, c) for m in METRICS] + rows.append(Row(key=key, deltas=deltas)) + return rows + + +# --------------------------------------------------------------------------- +# Rendering +# --------------------------------------------------------------------------- + + +def build_markdown( + rows: List[Row], + only_current: List[Key], + only_baseline: List[Key], + current: Dict[Key, Dict[str, Any]], + *, + threshold: float, + repo: str, + server_url: str, + baseline_run_id: str, + current_run_id: str, +) -> Tuple[str, int, int]: + rows = sorted(rows, key=lambda r: r.sort_key(), reverse=True) + regressions = sum(1 for r in rows if any(d.regression(threshold) for d in r.deltas)) + improvements = sum( + 1 for r in rows + if not any(d.regression(threshold) for d in r.deltas) + and any(d.improvement(threshold) for d in r.deltas) + ) + + out: List[str] = [""] + if regressions: + out.append(f"## ❌ JMH benchmark comparison — {regressions} regression(s) over {threshold:g}%") + elif improvements: + out.append(f"## ✅ JMH benchmark comparison — {improvements} improvement(s) over {threshold:g}%") + else: + out.append(f"## JMH benchmark comparison — no changes over {threshold:g}%") + out.append("") + + if repo and baseline_run_id and current_run_id: + base_url = f"{server_url}/{repo}/actions/runs/{baseline_run_id}" + curr_url = f"{server_url}/{repo}/actions/runs/{current_run_id}" + out.append( + f"Baseline: [`main` run #{baseline_run_id}]({base_url}) — " + f"PR: [run #{current_run_id}]({curr_url})" + ) + out.append("") + + out.append( + f"Threshold: **±{threshold:g}%**. " + f"Metrics: **Time** (`primaryMetric.score`, `SampleTime` — proxy for CPU work) and " + f"**Alloc/op** (`{ALLOC_NORM_KEY}`, GC allocations per op — memory pressure). " + "Both are lower-is-better, so a positive Δ% means the PR is worse than baseline." 
+ ) + out.append("") + + if rows: + header = "| Benchmark | Params | " + " | ".join(m.label for m in METRICS) + " | Status |" + sep = "|---|---|" + "|".join(["---"] * len(METRICS)) + "|---|" + out.append(header) + out.append(sep) + for r in rows: + bench, params = r.key + cells = " | ".join(d.cell() for d in r.deltas) + out.append( + f"| `{short_bench(bench)}` | {params or '—'} | {cells} | {r.status(threshold)} |" + ) + out.append("") + else: + out.append("_No benchmarks matched between baseline and PR._") + out.append("") + + if only_current: + out.append("
<details><summary>Benchmarks only in PR run</summary>") + out.append("") + for k in only_current: + bench, params = k + rec = current[k] + time_d = _primary(rec) or {} + alloc_d = _secondary(rec, ALLOC_NORM_KEY) or {} + out.append( + f"- `{short_bench(bench)}` ({params or '—'}): " + f"time={fmt_score(_float(time_d, 'score'), _float(time_d, 'scoreError'), time_d.get('scoreUnit', ''))}, " + f"alloc={fmt_score(_float(alloc_d, 'score'), _float(alloc_d, 'scoreError'), alloc_d.get('scoreUnit', ''))}" + ) + out.append("") + out.append("</details>") + out.append("") + + if only_baseline: + out.append("<details><summary>Benchmarks only in baseline run</summary>") + out.append("") + for k in only_baseline: + bench, params = k + out.append(f"- `{short_bench(bench)}` ({params or '—'})") + out.append("") + out.append("</details>")
") + out.append("") + + return "\n".join(out), regressions, improvements + + +# --------------------------------------------------------------------------- +# Entry point +# --------------------------------------------------------------------------- + + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--baseline", required=True, help="Directory containing baseline JMH JSON files") + parser.add_argument("--current", required=True, help="Directory containing current JMH JSON files") + parser.add_argument("--baseline-run-id", default="", help="Baseline workflow run id (for links)") + parser.add_argument("--current-run-id", default="", help="Current workflow run id (for links)") + parser.add_argument("--repo", default="", help="owner/name for run links") + parser.add_argument("--server-url", default="https://github.com") + parser.add_argument("--output", required=True, help="Output markdown file") + parser.add_argument( + "--threshold-pct", + type=float, + default=10.0, + help="Δ%% beyond which a metric is flagged as a regression / improvement (default: 10)", + ) + parser.add_argument( + "--summary-output", + default="", + help="Optional path to write a key=value summary the workflow can source", + ) + args = parser.parse_args() + + if args.threshold_pct < 0: + print("error: --threshold-pct must be non-negative", file=sys.stderr) + return 2 + + baseline = load_results(args.baseline) + current = load_results(args.current) + + if not current: + print("error: no current JMH result files found", file=sys.stderr) + return 2 + + rows = build_rows(baseline, current) + only_current = sorted(set(current) - set(baseline)) + only_baseline = sorted(set(baseline) - set(current)) + + md, regressions, improvements = build_markdown( + rows, + only_current, + only_baseline, + current, + threshold=args.threshold_pct, + repo=args.repo, + server_url=args.server_url, + baseline_run_id=args.baseline_run_id, + current_run_id=args.current_run_id, + ) + + with open(args.output, "w", encoding="utf-8") as fh: + fh.write(md) + + if args.summary_output: + with open(args.summary_output, "w", encoding="utf-8") as fh: + fh.write(f"regressions={regressions}\n") + fh.write(f"improvements={improvements}\n") + fh.write(f"matched={len(rows)}\n") + fh.write(f"threshold_pct={args.threshold_pct}\n") + + print( + f"wrote {args.output}: {len(rows)} matched, " + f"{regressions} regression(s) > {args.threshold_pct:g}%, " + f"{improvements} improvement(s), " + f"{len(only_current)} only-PR, {len(only_baseline)} only-baseline" + ) + # We always exit 0; the workflow uses the summary file to decide + # whether to fail the job so that the comparison comment is still + # posted on regressions. + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/.github/workflows/benchmarks-pr-comment.yml b/.github/workflows/benchmarks-pr-comment.yml new file mode 100644 index 000000000..fe3a01996 --- /dev/null +++ b/.github/workflows/benchmarks-pr-comment.yml @@ -0,0 +1,41 @@ +name: Benchmarks PR Comment + +# Posts a one-time instruction comment on newly opened PRs so contributors / +# reviewers know how to launch a JMH benchmark run for the PR. The actual +# benchmark workflow lives in `benchmarks.yml` and is triggered by a +# `/benchmark` slash command (collaborators only). 
+ + on: + pull_request_target: + types: [opened] + + permissions: + pull-requests: write + + jobs: + comment: + if: startsWith(github.repository, 'ClickHouse/clickhouse-java') + runs-on: ubuntu-latest + steps: + - name: Post benchmark instructions + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + gh api -X POST \ + "repos/${{ github.repository }}/issues/${{ github.event.pull_request.number }}/comments" \ + -f body="$(cat <<'EOF' + Repository collaborators can run the JMH benchmark suite against this PR by commenting: + + ``` + /benchmark + ``` + + Optional regression threshold override (Δ% on Time or Alloc/op; defaults to 10%): + + ``` + /benchmark threshold=15 + ``` + + Only one benchmark run per PR is active at a time — issuing a new `/benchmark` comment cancels the previous run. After the run finishes a separate comment will be posted comparing it against the latest scheduled run on `main`; the PR check fails if any benchmark regresses by more than the threshold. + EOF + )" diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index 87fc22433..64f513d97 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -9,6 +9,10 @@ on: pr: description: "Pull request#" required: false + threshold: + description: "Regression threshold (Δ% on Time or Alloc/op)" + required: false + default: "10" issue_comment: types: [created] @@ -16,57 +20,60 @@ env: CHC_BRANCH: "main" CH_VERSION: "25.3" JAVA_VERSION: 17 + # Default Δ% above which a regression / improvement is flagged and the + # PR check is failed. Overridable per workflow_dispatch input or per + # `/benchmark threshold=N` comment. + DEFAULT_THRESHOLD_PCT: "10" -# One run per PR (cancel any in-progress run when a newer /benchmark arrives -# for the same PR). Scheduled runs are grouped by SHA so they don't fight -# with PR runs. -concurrency: - group: ${{ github.workflow }}-${{ github.event.issue.number || github.event.inputs.pr || github.sha }} - cancel-in-progress: true +# NOTE: there is intentionally no workflow-level `concurrency:` block. +# `issue_comment` events fire for *every* comment on every PR / issue, +# including those from bots (e.g. sonarqubecloud). A workflow-level +# `cancel-in-progress` group keyed on the PR number would cancel an +# in-flight legitimate `/benchmark` run as soon as any unrelated bot +# commented on the same PR. The per-PR concurrency rule is enforced on +# the `jmh` job instead, so unrelated comment events leave the job +# skipped without claiming the concurrency slot. jobs: - # Gate: only run for `issue_comment` events when the comment is on a PR, - # starts with `/benchmark`, is not from a bot, and the commenter is a - # repo OWNER/MEMBER/COLLABORATOR. For schedule and workflow_dispatch this - # job is skipped and the benchmark job runs unconditionally. - trigger-check: - if: github.event_name == 'issue_comment' - name: "Check /benchmark trigger" - runs-on: ubuntu-latest + jmh: + name: "Minimal JMH Benchmarks" + runs-on: "ubuntu-latest" + timeout-minutes: 30 permissions: + contents: read pull-requests: write issues: write - outputs: - pr_number: ${{ steps.resolve.outputs.pr_number }} - steps: - - name: Validate comment - id: validate - if: | + actions: read + # Single fan-in filter, modelled on `.github/workflows/claude.yml`: + # the job runs for the daily schedule, manual `workflow_dispatch`, + # or a `/benchmark` slash-command from a non-bot repo collaborator + # on a pull request.
Bot comments and chat comments leave the job + # skipped — no failed run, no notification, no concurrency + # collision. + if: | + startsWith(github.repository, 'ClickHouse/') && + ( + github.event_name == 'schedule' || + github.event_name == 'workflow_dispatch' || + ( + github.event_name == 'issue_comment' && github.event.issue.pull_request != null && github.event.sender.type != 'Bot' && + github.event.comment.user.type != 'Bot' && startsWith(github.event.comment.body, '/benchmark') && contains(fromJSON('["OWNER","MEMBER","COLLABORATOR"]'), github.event.comment.author_association) - run: echo "ok=true" >> $GITHUB_OUTPUT - # Note: we deliberately use `startsWith` (not `contains`) so the - # instruction comment posted by the PR-open bot, which mentions - # `/benchmark` mid-sentence, does not re-trigger this workflow. - - - name: Reject unauthorized trigger - if: steps.validate.outputs.ok != 'true' - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - # If it looks like a /benchmark attempt by someone without - # permission, leave a -1 reaction so they get feedback. - if [[ "${{ github.event.issue.pull_request != null }}" == "true" ]] \ - && [[ "${{ startsWith(github.event.comment.body, '/benchmark') }}" == "true" ]]; then - gh api -X POST \ - "repos/${{ github.repository }}/issues/comments/${{ github.event.comment.id }}/reactions" \ - -f content='-1' || true - fi - exit 1 - - - name: Acknowledge trigger + ) + ) + # One running benchmark per PR (and per-SHA for the daily + # schedule). Concurrency lives on this job, not on the workflow, + # so unrelated comment events (which the job-level `if` filters + # out) never claim the slot or cancel an in-flight run. + concurrency: + group: ${{ github.workflow }}-jmh-${{ github.event.issue.number || github.event.inputs.pr || github.sha }} + cancel-in-progress: true + steps: + - name: Acknowledge /benchmark trigger + if: github.event_name == 'issue_comment' env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | @@ -74,28 +81,32 @@ jobs: "repos/${{ github.repository }}/issues/comments/${{ github.event.comment.id }}/reactions" \ -f content='rocket' || true - - name: Resolve PR number - id: resolve - run: echo "pr_number=${{ github.event.issue.number }}" >> $GITHUB_OUTPUT - - jmh: - needs: [trigger-check] - if: | - always() && - startsWith(github.repository, 'ClickHouse/') && - (needs.trigger-check.result == 'success' || needs.trigger-check.result == 'skipped') - name: "Mininal JMH Benchmarks" - runs-on: "ubuntu-latest" - timeout-minutes: 30 - steps: - - name: Resolve PR number + - name: Resolve PR number and threshold id: pr + env: + COMMENT_BODY: ${{ github.event.comment.body }} + DISPATCH_PR: ${{ github.event.inputs.pr }} + DISPATCH_THRESHOLD: ${{ github.event.inputs.threshold }} + DEFAULT_THRESHOLD: ${{ env.DEFAULT_THRESHOLD_PCT }} run: | case "${{ github.event_name }}" in - issue_comment) echo "number=${{ needs.trigger-check.outputs.pr_number }}" >> $GITHUB_OUTPUT ;; - workflow_dispatch) echo "number=${{ github.event.inputs.pr }}" >> $GITHUB_OUTPUT ;; - *) echo "number=" >> $GITHUB_OUTPUT ;; + issue_comment) + # Accept `/benchmark threshold=15` or `/benchmark threshold=7.5`. + T=$(printf '%s' "$COMMENT_BODY" | grep -oE 'threshold=[0-9]+(\.[0-9]+)?' 
| head -1 | cut -d= -f2 || true) + [ -z "$T" ] && T="$DEFAULT_THRESHOLD" + echo "number=${{ github.event.issue.number }}" >> "$GITHUB_OUTPUT" + echo "threshold=$T" >> "$GITHUB_OUTPUT" + ;; + workflow_dispatch) + echo "number=$DISPATCH_PR" >> "$GITHUB_OUTPUT" + echo "threshold=${DISPATCH_THRESHOLD:-$DEFAULT_THRESHOLD}" >> "$GITHUB_OUTPUT" + ;; + *) + echo "number=" >> "$GITHUB_OUTPUT" + echo "threshold=$DEFAULT_THRESHOLD" >> "$GITHUB_OUTPUT" + ;; esac + - name: Post "started" comment if: github.event_name == 'issue_comment' && steps.pr.outputs.number != '' env: @@ -104,33 +115,40 @@ jobs: gh api -X POST \ "repos/${{ github.repository }}/issues/${{ steps.pr.outputs.number }}/comments" \ -f body="JMH benchmark run started: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" || true + - name: Check out Git repository uses: actions/checkout@v4 with: ref: ${{ env.CHC_BRANCH }} + - name: Check out PR if: steps.pr.outputs.number != '' run: | git fetch --no-tags --prune --progress --no-recurse-submodules --depth=1 \ origin pull/${{ steps.pr.outputs.number }}/merge:merged-pr && git checkout merged-pr + - name: Install JDK and Maven uses: actions/setup-java@v4 with: distribution: "temurin" java-version: ${{ env.JAVA_VERSION }} cache: "maven" + - name: Build run: mvn --batch-mode --no-transfer-progress -Dj8 -DskipTests=true clean install + - name: Prepare Dataset run: | cd ./performance && mvn --batch-mode --no-transfer-progress clean compile exec:exec -Dexec.executable=java \ -Dexec.args="-classpath %classpath com.clickhouse.benchmark.data.DataSetGenerator -input sample_dataset.sql -name default -rows 100000" + - name: Run Benchmarks run: | cd ./performance && mvn --batch-mode --no-transfer-progress clean compile exec:exec -Dexec.executable=java -Dexec.args="-classpath %classpath com.clickhouse.benchmark.BenchmarkRunner \ -l 100000,10000 -m 3 -t 15 -b q,i -d file://default.csv" + - name: Upload test results uses: actions/upload-artifact@v4 if: success() @@ -138,3 +156,93 @@ jobs: name: result ${{ github.job }} path: | performance/jmh-results* + + # Compare against the latest scheduled run on `main` and post a + # markdown comment. Only relevant when this run is tied to a PR; + # scheduled / non-PR runs skip these steps. We never fail the + # workflow if comparison fails — it's reporting, not gating. 
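+ # For reference, the compare-summary.env file that the compare step + # surfaces as step outputs is plain key=value, one pair per line; the + # values here are illustrative: regressions=1, improvements=0, + # matched=42, threshold_pct=10.0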
+ - name: Fetch baseline results (latest successful main schedule) + id: baseline + if: steps.pr.outputs.number != '' + continue-on-error: true + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + mkdir -p baseline-results + RUN_ID=$(gh run list \ + --workflow benchmarks.yml \ + --branch main \ + --status success \ + --limit 20 \ + --repo "${{ github.repository }}" \ + --json databaseId,event \ + -q 'map(select(.event=="schedule"))[0].databaseId // empty') + if [ -z "$RUN_ID" ]; then + echo "No scheduled baseline run found on main" + echo "found=false" >> "$GITHUB_OUTPUT" + exit 0 + fi + echo "Baseline run: $RUN_ID" + if gh run download "$RUN_ID" --dir baseline-results --repo "${{ github.repository }}"; then + echo "found=true" >> "$GITHUB_OUTPUT" + echo "run_id=$RUN_ID" >> "$GITHUB_OUTPUT" + else + echo "Failed to download baseline artifacts" + echo "found=false" >> "$GITHUB_OUTPUT" + fi + + - name: Compare benchmark results + id: compare + if: steps.pr.outputs.number != '' && steps.baseline.outputs.found == 'true' + continue-on-error: true + run: | + python3 .github/scripts/compare-jmh.py \ + --baseline baseline-results \ + --current performance \ + --baseline-run-id "${{ steps.baseline.outputs.run_id }}" \ + --current-run-id "${{ github.run_id }}" \ + --repo "${{ github.repository }}" \ + --server-url "${{ github.server_url }}" \ + --threshold-pct "${{ steps.pr.outputs.threshold }}" \ + --output comparison.md \ + --summary-output compare-summary.env + # Surface the script's summary file as step outputs so the + # follow-up "enforce threshold" step can decide whether to + # fail the job — without skipping the comment post. + cat compare-summary.env >> "$GITHUB_OUTPUT" + echo "ok=true" >> "$GITHUB_OUTPUT" + + - name: Post baseline-not-found comment + if: | + steps.pr.outputs.number != '' && + steps.baseline.outputs.found != 'true' + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + gh pr comment "${{ steps.pr.outputs.number }}" \ + --repo "${{ github.repository }}" \ + --body "JMH benchmark comparison skipped: no successful scheduled run on \`main\` was found to use as a baseline." || true + + - name: Post comparison comment + if: steps.compare.outputs.ok == 'true' + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + gh pr comment "${{ steps.pr.outputs.number }}" \ + --repo "${{ github.repository }}" \ + --body-file comparison.md + + # Fail the job — and therefore the PR check — when the comparison + # script flagged at least one regression beyond the threshold. + # This runs *after* the comment has been posted so reviewers still + # see the full table on the PR. + - name: Enforce regression threshold + if: steps.compare.outputs.ok == 'true' + run: | + REGRESSIONS="${{ steps.compare.outputs.regressions }}" + THRESHOLD="${{ steps.pr.outputs.threshold }}" + if [ -n "$REGRESSIONS" ] && [ "$REGRESSIONS" -gt 0 ]; then + echo "::error::$REGRESSIONS benchmark(s) regressed by more than ${THRESHOLD}% vs baseline." + exit 1 + fi + echo "No regressions over ${THRESHOLD}%."
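
For a quick local sanity check of the comparison logic, `compare-jmh.py` can be driven end-to-end with synthetic inputs. Below is a minimal sketch, assuming it is run from the repository root with Python 3; the benchmark name, params, and scores are made-up stand-ins, and only the JSON fields the script actually reads are populated:

```python
#!/usr/bin/env python3
"""Dry-run .github/scripts/compare-jmh.py against synthetic JMH results."""
import json
import os
import subprocess
import tempfile


def record(time_ms: float, alloc_b: float) -> list:
    # Hand-written stand-in for a JMH result file: a list with one record,
    # holding only the fields compare-jmh.py reads. The benchmark name and
    # params are invented for the example.
    return [{
        "benchmark": "com.example.bench.QueryBench.select",
        "params": {"rows": "100000"},
        "primaryMetric": {
            "score": time_ms,
            "scoreError": time_ms * 0.05,
            "scoreUnit": "ms/op",
        },
        # The key really starts with U+00B7 MIDDLE DOT, as JMH's GCProfiler
        # emits it.
        "secondaryMetrics": {
            "\u00b7gc.alloc.rate.norm": {"score": alloc_b, "scoreUnit": "B/op"},
        },
    }]


with tempfile.TemporaryDirectory() as tmp:
    base_dir = os.path.join(tmp, "baseline")
    curr_dir = os.path.join(tmp, "current")
    os.makedirs(base_dir)
    os.makedirs(curr_dir)
    with open(os.path.join(base_dir, "jmh-results-1.json"), "w") as fh:
        json.dump(record(100.0, 4096.0), fh)  # baseline: 100 ms/op
    with open(os.path.join(curr_dir, "jmh-results-1.json"), "w") as fh:
        json.dump(record(115.0, 4096.0), fh)  # +15% time vs. 10% threshold
    subprocess.run(
        ["python3", ".github/scripts/compare-jmh.py",
         "--baseline", base_dir,
         "--current", curr_dir,
         "--threshold-pct", "10",
         "--output", os.path.join(tmp, "comparison.md"),
         "--summary-output", os.path.join(tmp, "summary.env")],
        check=True,
    )
    with open(os.path.join(tmp, "summary.env")) as fh:
        print(fh.read())  # expect regressions=1, improvements=0, matched=1
```

The +15% time delta exceeds the 10% threshold, so `summary.env` should report `regressions=1` and `comparison.md` should open with the ❌ header; the script itself still exits 0, matching the contract the workflow's "Enforce regression threshold" step relies on.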