diff --git a/.github/scripts/compare-jmh.py b/.github/scripts/compare-jmh.py
new file mode 100644
index 000000000..39c1a5d0f
--- /dev/null
+++ b/.github/scripts/compare-jmh.py
@@ -0,0 +1,419 @@
+#!/usr/bin/env python3
+"""Compare two sets of JMH JSON results and emit a markdown summary.
+
+Used by `.github/workflows/benchmarks.yml` to diff the latest scheduled
+`main` benchmark run against the run that just finished for a PR.
+
+For each (benchmark, params) pair common to both runs we report two
+metrics:
+
+* `Time` — `primaryMetric.score`. In `SampleTime` mode this is the
+ mean sampled latency per op; it's our best available proxy for CPU
+ work since no dedicated CPU profiler is configured in
+ `BenchmarkRunner`.
+* `Alloc/op` — `secondaryMetrics["·gc.alloc.rate.norm"]`, populated by
+ JMH's `GCProfiler`. This is bytes allocated per benchmark op and is
+ the standard, low-noise JMH memory metric.
+
+Both metrics are "lower is better", so a positive delta indicates the
+PR is worse than the baseline. The run fails the PR check when **any**
+benchmark regresses on either metric by more than `--threshold-pct`.
+The script writes a `regressions=...`/`improvements=...`
+summary file the workflow uses to set step outputs and decide whether
+to fail the job.
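+
+Illustrative invocation (mirrors how `benchmarks.yml` calls this script; the
+directory and file names below are the ones that workflow uses):
+
+    python3 .github/scripts/compare-jmh.py \
+        --baseline baseline-results \
+        --current performance \
+        --threshold-pct 10 \
+        --output comparison.md \
+        --summary-output compare-summary.env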
+"""
+
+from __future__ import annotations
+
+import argparse
+import glob
+import json
+import os
+import sys
+from dataclasses import dataclass
+from typing import Any, Callable, Dict, List, Optional, Tuple
+
+Key = Tuple[str, str]
+
+
+# ---------------------------------------------------------------------------
+# Metric model
+# ---------------------------------------------------------------------------
+
+# JMH's `GCProfiler` reports allocation rate normalised per op under this
+# secondary metric key (the leading char is U+00B7 MIDDLE DOT, not a regular
+# dot — that's JMH's convention for profiler-emitted metrics).
+ALLOC_NORM_KEY = "\u00b7gc.alloc.rate.norm"
+
+
+@dataclass(frozen=True)
+class Metric:
+ id: str
+ label: str
+ # Pull the `{score, scoreError, scoreUnit}`-shaped dict from a JMH record.
+ extract: Callable[[Dict[str, Any]], Optional[Dict[str, Any]]]
+ # True when a higher score is worse (regression). Both of our metrics
+ # are lower-is-better so this is always True today, but the model
+ # supports e.g. Throughput mode trivially.
+ higher_is_worse: bool = True
+
+
+def _primary(record: Dict[str, Any]) -> Optional[Dict[str, Any]]:
+ pm = record.get("primaryMetric")
+ return pm if isinstance(pm, dict) else None
+
+
+def _secondary(record: Dict[str, Any], key: str) -> Optional[Dict[str, Any]]:
+ sm = record.get("secondaryMetrics") or {}
+ val = sm.get(key)
+ return val if isinstance(val, dict) else None
+
+
+METRICS: List[Metric] = [
+ Metric(id="time", label="Time", extract=_primary),
+ Metric(
+ id="alloc",
+ label="Alloc/op",
+ extract=lambda r: _secondary(r, ALLOC_NORM_KEY),
+ ),
+]
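+
+# For reference, the slice of a JMH JSON record that the extractors above read
+# looks roughly like this (benchmark name and values are illustrative, not from
+# a real run):
+#
+#   {
+#     "benchmark": "com.clickhouse.benchmark.SomeBenchmark.someOp",
+#     "params": {"limit": "100000"},
+#     "primaryMetric": {"score": 1.23, "scoreError": 0.05, "scoreUnit": "ms/op"},
+#     "secondaryMetrics": {"·gc.alloc.rate.norm": {"score": 2048.0, "scoreUnit": "B/op"}}
+#   }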
+
+
+# ---------------------------------------------------------------------------
+# Loading & helpers
+# ---------------------------------------------------------------------------
+
+
+def load_results(directory: str) -> Dict[Key, Dict[str, Any]]:
+ by_key: Dict[Key, Dict[str, Any]] = {}
+ paths = sorted(
+ glob.glob(os.path.join(directory, "**", "jmh-results-*.json"), recursive=True)
+ )
+ for path in paths:
+ try:
+ with open(path, "r", encoding="utf-8") as fh:
+ data = json.load(fh)
+ except (OSError, json.JSONDecodeError) as exc:
+ print(f"warn: could not load {path}: {exc}", file=sys.stderr)
+ continue
+ if not isinstance(data, list):
+ continue
+ for record in data:
+ bench = record.get("benchmark")
+ if not bench:
+ continue
+ params = record.get("params") or {}
+ param_str = ", ".join(f"{k}={params[k]}" for k in sorted(params))
+ by_key[(bench, param_str)] = record
+ return by_key
+
+
+def _float(d: Optional[Dict[str, Any]], key: str) -> Optional[float]:
+ if not d:
+ return None
+ val = d.get(key)
+ try:
+ return float(val) if val is not None else None
+ except (TypeError, ValueError):
+ return None
+
+
+def short_bench(name: str) -> str:
+ parts = name.split(".")
+ return ".".join(parts[-2:]) if len(parts) >= 2 else name
+
+
+def fmt_score(v: Optional[float], err: Optional[float], unit: str) -> str:
+ if v is None:
+ return "—"
+ body = f"{v:.3g} ± {err:.2g}" if err is not None else f"{v:.3g}"
+ return f"{body} {unit}".rstrip()
+
+
+def fmt_delta(d: Optional[float]) -> str:
+ if d is None:
+ return "—"
+ sign = "+" if d >= 0 else ""
+ return f"{sign}{d:.2f}%"
+
+
+# ---------------------------------------------------------------------------
+# Comparison
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class MetricDelta:
+ metric: Metric
+ baseline: Optional[float]
+ current: Optional[float]
+ baseline_err: Optional[float]
+ current_err: Optional[float]
+ unit: str
+ delta_pct: Optional[float]
+
+ def regression(self, threshold: float) -> bool:
+ if self.delta_pct is None:
+ return False
+ signed = self.delta_pct if self.metric.higher_is_worse else -self.delta_pct
+ return signed > threshold
+
+ def improvement(self, threshold: float) -> bool:
+ if self.delta_pct is None:
+ return False
+ signed = self.delta_pct if self.metric.higher_is_worse else -self.delta_pct
+ return signed < -threshold
+
+ def cell(self) -> str:
+ b = fmt_score(self.baseline, self.baseline_err, self.unit)
+ c = fmt_score(self.current, self.current_err, self.unit)
+ return f"{b} → {c} ({fmt_delta(self.delta_pct)})"
+
+
+def metric_delta(metric: Metric, baseline_rec: Dict[str, Any], current_rec: Dict[str, Any]) -> MetricDelta:
+ b = metric.extract(baseline_rec)
+ c = metric.extract(current_rec)
+ bs = _float(b, "score")
+ cs = _float(c, "score")
+ be = _float(b, "scoreError")
+ ce = _float(c, "scoreError")
+ unit = (c or {}).get("scoreUnit") or (b or {}).get("scoreUnit") or ""
+ if bs is None or cs is None or bs == 0:
+ delta_pct: Optional[float] = None
+ else:
+ delta_pct = (cs - bs) / bs * 100.0
+ return MetricDelta(
+ metric=metric,
+ baseline=bs,
+ current=cs,
+ baseline_err=be,
+ current_err=ce,
+ unit=unit,
+ delta_pct=delta_pct,
+ )
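+
+
+# Worked example of the delta and threshold logic above: a Time score moving
+# from 100 ms/op (baseline) to 112 ms/op (PR) gives delta_pct = +12.0; at the
+# default 10% threshold that counts as a regression, while a drop to
+# 88 ms/op (delta_pct = -12.0) would count as an improvement.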
+
+
+@dataclass
+class Row:
+ key: Key
+ deltas: List[MetricDelta]
+
+ def worst_signed_pct(self) -> float:
+ worst = 0.0
+ for d in self.deltas:
+ if d.delta_pct is None:
+ continue
+ signed = d.delta_pct if d.metric.higher_is_worse else -d.delta_pct
+ if signed > worst:
+ worst = signed
+ return worst
+
+ def sort_key(self) -> float:
+ best = 0.0
+ for d in self.deltas:
+ if d.delta_pct is None:
+ continue
+ if abs(d.delta_pct) > best:
+ best = abs(d.delta_pct)
+ return best
+
+ def status(self, threshold: float) -> str:
+ regressed = [d for d in self.deltas if d.regression(threshold)]
+ improved = [d for d in self.deltas if d.improvement(threshold)]
+ if regressed:
+ labels = ", ".join(d.metric.label for d in regressed)
+ return f"REGRESSION ({labels})"
+ if improved:
+ labels = ", ".join(d.metric.label for d in improved)
+ return f"improvement ({labels})"
+ return ""
+
+
+def build_rows(
+ baseline: Dict[Key, Dict[str, Any]],
+ current: Dict[Key, Dict[str, Any]],
+) -> List[Row]:
+ rows: List[Row] = []
+ for key in sorted(set(current) & set(baseline)):
+ b = baseline[key]
+ c = current[key]
+ deltas = [metric_delta(m, b, c) for m in METRICS]
+ rows.append(Row(key=key, deltas=deltas))
+ return rows
+
+
+# ---------------------------------------------------------------------------
+# Rendering
+# ---------------------------------------------------------------------------
+
+
+def build_markdown(
+ rows: List[Row],
+ only_current: List[Key],
+ only_baseline: List[Key],
+ current: Dict[Key, Dict[str, Any]],
+ *,
+ threshold: float,
+ repo: str,
+ server_url: str,
+ baseline_run_id: str,
+ current_run_id: str,
+) -> Tuple[str, int, int]:
+ rows = sorted(rows, key=lambda r: r.sort_key(), reverse=True)
+ regressions = sum(1 for r in rows if any(d.regression(threshold) for d in r.deltas))
+ improvements = sum(
+ 1 for r in rows
+ if not any(d.regression(threshold) for d in r.deltas)
+ and any(d.improvement(threshold) for d in r.deltas)
+ )
+
+ out: List[str] = [""]
+ if regressions:
+ out.append(f"## ❌ JMH benchmark comparison — {regressions} regression(s) over {threshold:g}%")
+ elif improvements:
+ out.append(f"## ✅ JMH benchmark comparison — {improvements} improvement(s) over {threshold:g}%")
+ else:
+ out.append(f"## JMH benchmark comparison — no changes over {threshold:g}%")
+ out.append("")
+
+ if repo and baseline_run_id and current_run_id:
+ base_url = f"{server_url}/{repo}/actions/runs/{baseline_run_id}"
+ curr_url = f"{server_url}/{repo}/actions/runs/{current_run_id}"
+ out.append(
+ f"Baseline: [`main` run #{baseline_run_id}]({base_url}) — "
+ f"PR: [run #{current_run_id}]({curr_url})"
+ )
+ out.append("")
+
+ out.append(
+ f"Threshold: **±{threshold:g}%**. "
+ f"Metrics: **Time** (`primaryMetric.score`, `SampleTime` — proxy for CPU work) and "
+ f"**Alloc/op** (`{ALLOC_NORM_KEY}`, GC allocations per op — memory pressure). "
+ "Both are lower-is-better, so a positive Δ% means the PR is worse than baseline."
+ )
+ out.append("")
+
+ if rows:
+ header = "| Benchmark | Params | " + " | ".join(m.label for m in METRICS) + " | Status |"
+ sep = "|---|---|" + "|".join(["---"] * len(METRICS)) + "|---|"
+ out.append(header)
+ out.append(sep)
+ for r in rows:
+ bench, params = r.key
+ cells = " | ".join(d.cell() for d in r.deltas)
+ out.append(
+ f"| `{short_bench(bench)}` | {params or '—'} | {cells} | {r.status(threshold)} |"
+ )
+ out.append("")
+ else:
+ out.append("_No benchmarks matched between baseline and PR._")
+ out.append("")
+
+ if only_current:
+        out.append("<details><summary>Benchmarks only in PR run</summary>")
+ out.append("")
+ for k in only_current:
+ bench, params = k
+ rec = current[k]
+ time_d = _primary(rec) or {}
+ alloc_d = _secondary(rec, ALLOC_NORM_KEY) or {}
+ out.append(
+ f"- `{short_bench(bench)}` ({params or '—'}): "
+ f"time={fmt_score(_float(time_d, 'score'), _float(time_d, 'scoreError'), time_d.get('scoreUnit', ''))}, "
+ f"alloc={fmt_score(_float(alloc_d, 'score'), _float(alloc_d, 'scoreError'), alloc_d.get('scoreUnit', ''))}"
+ )
+ out.append("")
+        out.append("</details>")
+ out.append("")
+
+ if only_baseline:
+        out.append("<details><summary>Benchmarks only in baseline run</summary>")
+ out.append("")
+ for k in only_baseline:
+ bench, params = k
+ out.append(f"- `{short_bench(bench)}` ({params or '—'})")
+ out.append("")
+        out.append("</details>")
+ out.append("")
+
+ return "\n".join(out), regressions, improvements
+
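+# A rendered table row looks roughly like this (benchmark name, params and
+# scores are illustrative):
+#
+#   | `clients.QueryClient.select` | limit=10000 | 1.2 ms/op → 1.4 ms/op (+16.67%) | 3.5e+03 B/op → 3.6e+03 B/op (+2.86%) | REGRESSION (Time) |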
+
+# ---------------------------------------------------------------------------
+# Entry point
+# ---------------------------------------------------------------------------
+
+
+def main() -> int:
+ parser = argparse.ArgumentParser(description=__doc__)
+ parser.add_argument("--baseline", required=True, help="Directory containing baseline JMH JSON files")
+ parser.add_argument("--current", required=True, help="Directory containing current JMH JSON files")
+ parser.add_argument("--baseline-run-id", default="", help="Baseline workflow run id (for links)")
+ parser.add_argument("--current-run-id", default="", help="Current workflow run id (for links)")
+ parser.add_argument("--repo", default="", help="owner/name for run links")
+ parser.add_argument("--server-url", default="https://github.com")
+ parser.add_argument("--output", required=True, help="Output markdown file")
+ parser.add_argument(
+ "--threshold-pct",
+ type=float,
+ default=10.0,
+ help="Δ%% beyond which a metric is flagged as a regression / improvement (default: 10)",
+ )
+ parser.add_argument(
+ "--summary-output",
+ default="",
+ help="Optional path to write a key=value summary the workflow can source",
+ )
+ args = parser.parse_args()
+
+ if args.threshold_pct < 0:
+ print("error: --threshold-pct must be non-negative", file=sys.stderr)
+ return 2
+
+ baseline = load_results(args.baseline)
+ current = load_results(args.current)
+
+ if not current:
+ print("error: no current JMH result files found", file=sys.stderr)
+ return 2
+
+ rows = build_rows(baseline, current)
+ only_current = sorted(set(current) - set(baseline))
+ only_baseline = sorted(set(baseline) - set(current))
+
+ md, regressions, improvements = build_markdown(
+ rows,
+ only_current,
+ only_baseline,
+ current,
+ threshold=args.threshold_pct,
+ repo=args.repo,
+ server_url=args.server_url,
+ baseline_run_id=args.baseline_run_id,
+ current_run_id=args.current_run_id,
+ )
+
+ with open(args.output, "w", encoding="utf-8") as fh:
+ fh.write(md)
+
+ if args.summary_output:
+ with open(args.summary_output, "w", encoding="utf-8") as fh:
+ fh.write(f"regressions={regressions}\n")
+ fh.write(f"improvements={improvements}\n")
+ fh.write(f"matched={len(rows)}\n")
+ fh.write(f"threshold_pct={args.threshold_pct}\n")
+
+ print(
+ f"wrote {args.output}: {len(rows)} matched, "
+ f"{regressions} regression(s) > {args.threshold_pct:g}%, "
+ f"{improvements} improvement(s), "
+ f"{len(only_current)} only-PR, {len(only_baseline)} only-baseline"
+ )
+ # We always exit 0; the workflow uses the summary file to decide
+ # whether to fail the job so that the comparison comment is still
+ # posted on regressions.
+ return 0
+
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/.github/workflows/benchmarks-pr-comment.yml b/.github/workflows/benchmarks-pr-comment.yml
new file mode 100644
index 000000000..fe3a01996
--- /dev/null
+++ b/.github/workflows/benchmarks-pr-comment.yml
@@ -0,0 +1,41 @@
+name: Benchmarks PR Comment
+
+# Posts a one-time instruction comment on newly opened PRs so contributors /
+# reviewers know how to launch a JMH benchmark run for the PR. The actual
+# benchmark workflow lives in `benchmarks.yml` and is triggered by a
+# `/benchmark` slash command (collaborators only).
+
+on:
+ pull_request_target:
+ types: [opened]
+
+permissions:
+ pull-requests: write
+
+jobs:
+ comment:
+ if: startsWith(github.repository, 'ClickHouse/clickhouse-java')
+ runs-on: ubuntu-latest
+ steps:
+ - name: Post benchmark instructions
+ env:
+ GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          gh api -X POST \
+            "repos/${{ github.repository }}/issues/${{ github.event.pull_request.number }}/comments" \
+            -f body="$(cat <<'EOF'
+          Repository collaborators can run the JMH benchmark suite against this PR by commenting:
+
+          ```
+          /benchmark
+          ```
+
+          Optional regression threshold override (Δ% on Time or Alloc/op; defaults to 10%):
+
+          ```
+          /benchmark threshold=15
+          ```
+
+          Only one benchmark run per PR is active at a time — issuing a new `/benchmark` comment cancels the previous run. After the run finishes, a separate comment is posted comparing it against the latest scheduled run on `main`; the PR check fails if any benchmark regresses by more than the threshold.
+          EOF
+          )"
diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
index 87fc22433..64f513d97 100644
--- a/.github/workflows/benchmarks.yml
+++ b/.github/workflows/benchmarks.yml
@@ -9,6 +9,10 @@ on:
pr:
description: "Pull request#"
required: false
+ threshold:
+ description: "Regression threshold (Δ% on Time or Alloc/op)"
+ required: false
+ default: "10"
issue_comment:
types: [created]
@@ -16,57 +20,60 @@ env:
CHC_BRANCH: "main"
CH_VERSION: "25.3"
JAVA_VERSION: 17
+ # Default Δ% above which a regression / improvement is flagged and the
+ # PR check is failed. Overridable per workflow_dispatch input or per
+ # `/benchmark threshold=N` comment.
+ DEFAULT_THRESHOLD_PCT: "10"
-# One run per PR (cancel any in-progress run when a newer /benchmark arrives
-# for the same PR). Scheduled runs are grouped by SHA so they don't fight
-# with PR runs.
-concurrency:
- group: ${{ github.workflow }}-${{ github.event.issue.number || github.event.inputs.pr || github.sha }}
- cancel-in-progress: true
+# NOTE: there is intentionally no workflow-level `concurrency:` block.
+# `issue_comment` events fire for *every* comment on every PR / issue,
+# including those from bots (e.g. sonarqubecloud). A workflow-level
+# `cancel-in-progress` group keyed on the PR number would cancel an
+# in-flight legitimate `/benchmark` run as soon as any unrelated bot
+# commented on the same PR. The per-PR concurrency rule is enforced on
+# the `jmh` job instead, so unrelated comment events leave the job
+# skipped without claiming the concurrency slot.
jobs:
- # Gate: only run for `issue_comment` events when the comment is on a PR,
- # starts with `/benchmark`, is not from a bot, and the commenter is a
- # repo OWNER/MEMBER/COLLABORATOR. For schedule and workflow_dispatch this
- # job is skipped and the benchmark job runs unconditionally.
- trigger-check:
- if: github.event_name == 'issue_comment'
- name: "Check /benchmark trigger"
- runs-on: ubuntu-latest
+ jmh:
+ name: "Mininal JMH Benchmarks"
+ runs-on: "ubuntu-latest"
+ timeout-minutes: 30
permissions:
+ contents: read
pull-requests: write
issues: write
- outputs:
- pr_number: ${{ steps.resolve.outputs.pr_number }}
- steps:
- - name: Validate comment
- id: validate
- if: |
+ actions: read
+ # Single fan-in filter, modelled on `.github/workflows/claude.yml`:
+ # the job runs for the daily schedule, manual `workflow_dispatch`,
+ # or a `/benchmark` slash-command from a non-bot repo collaborator
+    # on a pull request. Bot comments and ordinary discussion comments
+    # leave the job skipped — no failed run, no notification, no
+    # concurrency collision.
+ if: |
+ startsWith(github.repository, 'ClickHouse/') &&
+ (
+ github.event_name == 'schedule' ||
+ github.event_name == 'workflow_dispatch' ||
+ (
+ github.event_name == 'issue_comment' &&
github.event.issue.pull_request != null &&
github.event.sender.type != 'Bot' &&
+ github.event.comment.user.type != 'Bot' &&
startsWith(github.event.comment.body, '/benchmark') &&
contains(fromJSON('["OWNER","MEMBER","COLLABORATOR"]'), github.event.comment.author_association)
- run: echo "ok=true" >> $GITHUB_OUTPUT
- # Note: we deliberately use `startsWith` (not `contains`) so the
- # instruction comment posted by the PR-open bot, which mentions
- # `/benchmark` mid-sentence, does not re-trigger this workflow.
-
- - name: Reject unauthorized trigger
- if: steps.validate.outputs.ok != 'true'
- env:
- GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- run: |
- # If it looks like a /benchmark attempt by someone without
- # permission, leave a -1 reaction so they get feedback.
- if [[ "${{ github.event.issue.pull_request != null }}" == "true" ]] \
- && [[ "${{ startsWith(github.event.comment.body, '/benchmark') }}" == "true" ]]; then
- gh api -X POST \
- "repos/${{ github.repository }}/issues/comments/${{ github.event.comment.id }}/reactions" \
- -f content='-1' || true
- fi
- exit 1
-
- - name: Acknowledge trigger
+ )
+ )
+ # One running benchmark per PR (and per-SHA for the daily
+ # schedule). Concurrency lives on this job, not on the workflow,
+ # so unrelated comment events (which the job-level `if` filters
+ # out) never claim the slot or cancel an in-flight run.
+ concurrency:
+ group: ${{ github.workflow }}-jmh-${{ github.event.issue.number || github.event.inputs.pr || github.sha }}
+ cancel-in-progress: true
+ steps:
+ - name: Acknowledge /benchmark trigger
+ if: github.event_name == 'issue_comment'
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
@@ -74,28 +81,32 @@ jobs:
"repos/${{ github.repository }}/issues/comments/${{ github.event.comment.id }}/reactions" \
-f content='rocket' || true
- - name: Resolve PR number
- id: resolve
- run: echo "pr_number=${{ github.event.issue.number }}" >> $GITHUB_OUTPUT
-
- jmh:
- needs: [trigger-check]
- if: |
- always() &&
- startsWith(github.repository, 'ClickHouse/') &&
- (needs.trigger-check.result == 'success' || needs.trigger-check.result == 'skipped')
- name: "Mininal JMH Benchmarks"
- runs-on: "ubuntu-latest"
- timeout-minutes: 30
- steps:
- - name: Resolve PR number
+ - name: Resolve PR number and threshold
id: pr
+ env:
+ COMMENT_BODY: ${{ github.event.comment.body }}
+ DISPATCH_PR: ${{ github.event.inputs.pr }}
+ DISPATCH_THRESHOLD: ${{ github.event.inputs.threshold }}
+ DEFAULT_THRESHOLD: ${{ env.DEFAULT_THRESHOLD_PCT }}
run: |
case "${{ github.event_name }}" in
- issue_comment) echo "number=${{ needs.trigger-check.outputs.pr_number }}" >> $GITHUB_OUTPUT ;;
- workflow_dispatch) echo "number=${{ github.event.inputs.pr }}" >> $GITHUB_OUTPUT ;;
- *) echo "number=" >> $GITHUB_OUTPUT ;;
+ issue_comment)
+ # Accept `/benchmark threshold=15` or `/benchmark threshold=7.5`.
+ T=$(printf '%s' "$COMMENT_BODY" | grep -oE 'threshold=[0-9]+(\.[0-9]+)?' | head -1 | cut -d= -f2 || true)
+ [ -z "$T" ] && T="$DEFAULT_THRESHOLD"
+ echo "number=${{ github.event.issue.number }}" >> "$GITHUB_OUTPUT"
+ echo "threshold=$T" >> "$GITHUB_OUTPUT"
+ ;;
+ workflow_dispatch)
+ echo "number=$DISPATCH_PR" >> "$GITHUB_OUTPUT"
+ echo "threshold=${DISPATCH_THRESHOLD:-$DEFAULT_THRESHOLD}" >> "$GITHUB_OUTPUT"
+ ;;
+ *)
+ echo "number=" >> "$GITHUB_OUTPUT"
+ echo "threshold=$DEFAULT_THRESHOLD" >> "$GITHUB_OUTPUT"
+ ;;
esac
+
- name: Post "started" comment
if: github.event_name == 'issue_comment' && steps.pr.outputs.number != ''
env:
@@ -104,33 +115,40 @@ jobs:
gh api -X POST \
"repos/${{ github.repository }}/issues/${{ steps.pr.outputs.number }}/comments" \
-f body="JMH benchmark run started: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" || true
+
- name: Check out Git repository
uses: actions/checkout@v4
with:
ref: ${{ env.CHC_BRANCH }}
+
- name: Check out PR
if: steps.pr.outputs.number != ''
run: |
git fetch --no-tags --prune --progress --no-recurse-submodules --depth=1 \
origin pull/${{ steps.pr.outputs.number }}/merge:merged-pr && git checkout merged-pr
+
- name: Install JDK and Maven
uses: actions/setup-java@v4
with:
distribution: "temurin"
java-version: ${{ env.JAVA_VERSION }}
cache: "maven"
+
- name: Build
run: mvn --batch-mode --no-transfer-progress -Dj8 -DskipTests=true clean install
+
- name: Prepare Dataset
run: |
cd ./performance &&
mvn --batch-mode --no-transfer-progress clean compile exec:exec -Dexec.executable=java \
-Dexec.args="-classpath %classpath com.clickhouse.benchmark.data.DataSetGenerator -input sample_dataset.sql -name default -rows 100000"
+
- name: Run Benchmarks
run: |
cd ./performance &&
mvn --batch-mode --no-transfer-progress clean compile exec:exec -Dexec.executable=java -Dexec.args="-classpath %classpath com.clickhouse.benchmark.BenchmarkRunner \
-l 100000,10000 -m 3 -t 15 -b q,i -d file://default.csv"
+
- name: Upload test results
uses: actions/upload-artifact@v4
if: success()
@@ -138,3 +156,93 @@ jobs:
name: result ${{ github.job }}
path: |
performance/jmh-results*
+
+ # Compare against the latest scheduled run on `main` and post a
+ # markdown comment. Only relevant when this run is tied to a PR;
+  # scheduled / non-PR runs skip these steps. These steps use
+  # continue-on-error, so a broken comparison never fails the workflow by
+  # itself; gating on regressions happens in the final
+  # "Enforce regression threshold" step.
+ - name: Fetch baseline results (latest successful main schedule)
+ id: baseline
+ if: steps.pr.outputs.number != ''
+ continue-on-error: true
+ env:
+ GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ run: |
+ mkdir -p baseline-results
+ RUN_ID=$(gh run list \
+ --workflow benchmarks.yml \
+ --branch main \
+ --status success \
+ --limit 20 \
+ --repo "${{ github.repository }}" \
+ --json databaseId,event \
+ -q 'map(select(.event=="schedule"))[0].databaseId // empty')
+ if [ -z "$RUN_ID" ]; then
+ echo "No scheduled baseline run found on main"
+ echo "found=false" >> "$GITHUB_OUTPUT"
+ exit 0
+ fi
+ echo "Baseline run: $RUN_ID"
+ if gh run download "$RUN_ID" --dir baseline-results --repo "${{ github.repository }}"; then
+ echo "found=true" >> "$GITHUB_OUTPUT"
+ echo "run_id=$RUN_ID" >> "$GITHUB_OUTPUT"
+ else
+ echo "Failed to download baseline artifacts"
+ echo "found=false" >> "$GITHUB_OUTPUT"
+ fi
+
+ - name: Compare benchmark results
+ id: compare
+ if: steps.pr.outputs.number != '' && steps.baseline.outputs.found == 'true'
+ continue-on-error: true
+ run: |
+ python3 .github/scripts/compare-jmh.py \
+ --baseline baseline-results \
+ --current performance \
+ --baseline-run-id "${{ steps.baseline.outputs.run_id }}" \
+ --current-run-id "${{ github.run_id }}" \
+ --repo "${{ github.repository }}" \
+ --server-url "${{ github.server_url }}" \
+ --threshold-pct "${{ steps.pr.outputs.threshold }}" \
+ --output comparison.md \
+ --summary-output compare-summary.env
+ # Surface the script's summary file as step outputs so the
+ # follow-up "enforce threshold" step can decide whether to
+ # fail the job — without skipping the comment post.
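+          # compare-summary.env holds the key=value pairs the script writes
+          # (regressions, improvements, matched, threshold_pct); appending it
+          # to "$GITHUB_OUTPUT" exposes them as steps.compare.outputs.*.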
+ cat compare-summary.env >> "$GITHUB_OUTPUT"
+ echo "ok=true" >> "$GITHUB_OUTPUT"
+
+ - name: Post baseline-not-found comment
+ if: |
+ steps.pr.outputs.number != '' &&
+ steps.baseline.outputs.found != 'true'
+ env:
+ GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ run: |
+ gh pr comment "${{ steps.pr.outputs.number }}" \
+ --repo "${{ github.repository }}" \
+ --body "JMH benchmark comparison skipped: no successful scheduled run on \`main\` was found to use as a baseline." || true
+
+ - name: Post comparison comment
+ if: steps.compare.outputs.ok == 'true'
+ env:
+ GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ run: |
+ gh pr comment "${{ steps.pr.outputs.number }}" \
+ --repo "${{ github.repository }}" \
+ --body-file comparison.md
+
+ # Fail the job — and therefore the PR check — when the comparison
+ # script flagged at least one regression beyond the threshold.
+ # This runs *after* the comment has been posted so reviewers still
+ # see the full table on the PR.
+ - name: Enforce regression threshold
+ if: steps.compare.outputs.ok == 'true'
+ run: |
+ REGRESSIONS="${{ steps.compare.outputs.regressions }}"
+ THRESHOLD="${{ steps.pr.outputs.threshold }}"
+ if [ -n "$REGRESSIONS" ] && [ "$REGRESSIONS" -gt 0 ]; then
+ echo "::error::$REGRESSIONS benchmark(s) regressed by more than ${THRESHOLD}% vs baseline."
+ exit 1
+ fi
+ echo "No regressions over ${THRESHOLD}%."