diff --git a/.github/scripts/compare-jmh.py b/.github/scripts/compare-jmh.py new file mode 100644 index 000000000..39c1a5d0f --- /dev/null +++ b/.github/scripts/compare-jmh.py @@ -0,0 +1,419 @@ +#!/usr/bin/env python3 +"""Compare two sets of JMH JSON results and emit a markdown summary. + +Used by `.github/workflows/benchmarks.yml` to diff the latest scheduled +`main` benchmark run against the run that just finished for a PR. + +For each (benchmark, params) pair common to both runs we report two +metrics: + +* `Time` — `primaryMetric.score`. In `SampleTime` mode this is the + mean sampled latency per op; it's our best available proxy for CPU + work since no dedicated CPU profiler is configured in + `BenchmarkRunner`. +* `Alloc/op` — `secondaryMetrics["·gc.alloc.rate.norm"]`, populated by + JMH's `GCProfiler`. This is bytes allocated per benchmark op and is + the standard, low-noise JMH memory metric. + +Both metrics are "lower is better", so a positive delta indicates the +PR is worse than the baseline. A run is considered failed when **any** +benchmark's worst metric delta exceeds `--threshold-pct` in the worse +direction. The script writes a `regressions=...`/`improvements=...` +summary file the workflow uses to set step outputs and decide whether +to fail the job. +""" + +from __future__ import annotations + +import argparse +import glob +import json +import os +import sys +from dataclasses import dataclass +from typing import Any, Callable, Dict, List, Optional, Tuple + +Key = Tuple[str, str] + + +# --------------------------------------------------------------------------- +# Metric model +# --------------------------------------------------------------------------- + +# JMH's `GCProfiler` reports allocation rate normalised per op under this +# secondary metric key (the leading char is U+00B7 MIDDLE DOT, not a regular +# dot — that's JMH's convention for profiler-emitted metrics). +ALLOC_NORM_KEY = "\u00b7gc.alloc.rate.norm" + + +@dataclass(frozen=True) +class Metric: + id: str + label: str + # Pull the `{score, scoreError, scoreUnit}`-shaped dict from a JMH record. + extract: Callable[[Dict[str, Any]], Optional[Dict[str, Any]]] + # True when a higher score is worse (regression). Both of our metrics + # are lower-is-better so this is always True today, but the model + # supports e.g. Throughput mode trivially. 
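+    # For instance, a hypothetical Throughput-mode metric (higher ops/s is +    # better) would simply flip the flag: +    #   Metric(id="thrpt", label="Throughput", extract=_primary, +    #          higher_is_worse=False)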
+ higher_is_worse: bool = True + + +def _primary(record: Dict[str, Any]) -> Optional[Dict[str, Any]]: + pm = record.get("primaryMetric") + return pm if isinstance(pm, dict) else None + + +def _secondary(record: Dict[str, Any], key: str) -> Optional[Dict[str, Any]]: + sm = record.get("secondaryMetrics") or {} + val = sm.get(key) + return val if isinstance(val, dict) else None + + +METRICS: List[Metric] = [ + Metric(id="time", label="Time", extract=_primary), + Metric( + id="alloc", + label="Alloc/op", + extract=lambda r: _secondary(r, ALLOC_NORM_KEY), + ), +] + + +# --------------------------------------------------------------------------- +# Loading & helpers +# --------------------------------------------------------------------------- + + +def load_results(directory: str) -> Dict[Key, Dict[str, Any]]: + by_key: Dict[Key, Dict[str, Any]] = {} + paths = sorted( + glob.glob(os.path.join(directory, "**", "jmh-results-*.json"), recursive=True) + ) + for path in paths: + try: + with open(path, "r", encoding="utf-8") as fh: + data = json.load(fh) + except (OSError, json.JSONDecodeError) as exc: + print(f"warn: could not load {path}: {exc}", file=sys.stderr) + continue + if not isinstance(data, list): + continue + for record in data: + bench = record.get("benchmark") + if not bench: + continue + params = record.get("params") or {} + param_str = ", ".join(f"{k}={params[k]}" for k in sorted(params)) + by_key[(bench, param_str)] = record + return by_key + + +def _float(d: Optional[Dict[str, Any]], key: str) -> Optional[float]: + if not d: + return None + val = d.get(key) + try: + return float(val) if val is not None else None + except (TypeError, ValueError): + return None + + +def short_bench(name: str) -> str: + parts = name.split(".") + return ".".join(parts[-2:]) if len(parts) >= 2 else name + + +def fmt_score(v: Optional[float], err: Optional[float], unit: str) -> str: + if v is None: + return "—" + body = f"{v:.3g} ± {err:.2g}" if err is not None else f"{v:.3g}" + return f"{body} {unit}".rstrip() + + +def fmt_delta(d: Optional[float]) -> str: + if d is None: + return "—" + sign = "+" if d >= 0 else "" + return f"{sign}{d:.2f}%" + + +# --------------------------------------------------------------------------- +# Comparison +# --------------------------------------------------------------------------- + + +@dataclass +class MetricDelta: + metric: Metric + baseline: Optional[float] + current: Optional[float] + baseline_err: Optional[float] + current_err: Optional[float] + unit: str + delta_pct: Optional[float] + + def regression(self, threshold: float) -> bool: + if self.delta_pct is None: + return False + signed = self.delta_pct if self.metric.higher_is_worse else -self.delta_pct + return signed > threshold + + def improvement(self, threshold: float) -> bool: + if self.delta_pct is None: + return False + signed = self.delta_pct if self.metric.higher_is_worse else -self.delta_pct + return signed < -threshold + + def cell(self) -> str: + b = fmt_score(self.baseline, self.baseline_err, self.unit) + c = fmt_score(self.current, self.current_err, self.unit) + return f"{b} → {c} ({fmt_delta(self.delta_pct)})" + + +def metric_delta(metric: Metric, baseline_rec: Dict[str, Any], current_rec: Dict[str, Any]) -> MetricDelta: + b = metric.extract(baseline_rec) + c = metric.extract(current_rec) + bs = _float(b, "score") + cs = _float(c, "score") + be = _float(b, "scoreError") + ce = _float(c, "scoreError") + unit = (c or {}).get("scoreUnit") or (b or {}).get("scoreUnit") or "" + if bs is None or cs is None 
or bs == 0: + delta_pct: Optional[float] = None + else: + delta_pct = (cs - bs) / bs * 100.0 + return MetricDelta( + metric=metric, + baseline=bs, + current=cs, + baseline_err=be, + current_err=ce, + unit=unit, + delta_pct=delta_pct, + ) + + +@dataclass +class Row: + key: Key + deltas: List[MetricDelta] + + def worst_signed_pct(self) -> float: + worst = 0.0 + for d in self.deltas: + if d.delta_pct is None: + continue + signed = d.delta_pct if d.metric.higher_is_worse else -d.delta_pct + if signed > worst: + worst = signed + return worst + + def sort_key(self) -> float: + best = 0.0 + for d in self.deltas: + if d.delta_pct is None: + continue + if abs(d.delta_pct) > best: + best = abs(d.delta_pct) + return best + + def status(self, threshold: float) -> str: + regressed = [d for d in self.deltas if d.regression(threshold)] + improved = [d for d in self.deltas if d.improvement(threshold)] + if regressed: + labels = ", ".join(d.metric.label for d in regressed) + return f"REGRESSION ({labels})" + if improved: + labels = ", ".join(d.metric.label for d in improved) + return f"improvement ({labels})" + return "" + + +def build_rows( + baseline: Dict[Key, Dict[str, Any]], + current: Dict[Key, Dict[str, Any]], +) -> List[Row]: + rows: List[Row] = [] + for key in sorted(set(current) & set(baseline)): + b = baseline[key] + c = current[key] + deltas = [metric_delta(m, b, c) for m in METRICS] + rows.append(Row(key=key, deltas=deltas)) + return rows + + +# --------------------------------------------------------------------------- +# Rendering +# --------------------------------------------------------------------------- + + +def build_markdown( + rows: List[Row], + only_current: List[Key], + only_baseline: List[Key], + current: Dict[Key, Dict[str, Any]], + *, + threshold: float, + repo: str, + server_url: str, + baseline_run_id: str, + current_run_id: str, +) -> Tuple[str, int, int]: + rows = sorted(rows, key=lambda r: r.sort_key(), reverse=True) + regressions = sum(1 for r in rows if any(d.regression(threshold) for d in r.deltas)) + improvements = sum( + 1 for r in rows + if not any(d.regression(threshold) for d in r.deltas) + and any(d.improvement(threshold) for d in r.deltas) + ) + + out: List[str] = [""] + if regressions: + out.append(f"## ❌ JMH benchmark comparison — {regressions} regression(s) over {threshold:g}%") + elif improvements: + out.append(f"## ✅ JMH benchmark comparison — {improvements} improvement(s) over {threshold:g}%") + else: + out.append(f"## JMH benchmark comparison — no changes over {threshold:g}%") + out.append("") + + if repo and baseline_run_id and current_run_id: + base_url = f"{server_url}/{repo}/actions/runs/{baseline_run_id}" + curr_url = f"{server_url}/{repo}/actions/runs/{current_run_id}" + out.append( + f"Baseline: [`main` run #{baseline_run_id}]({base_url}) — " + f"PR: [run #{current_run_id}]({curr_url})" + ) + out.append("") + + out.append( + f"Threshold: **±{threshold:g}%**. " + f"Metrics: **Time** (`primaryMetric.score`, `SampleTime` — proxy for CPU work) and " + f"**Alloc/op** (`{ALLOC_NORM_KEY}`, GC allocations per op — memory pressure). " + "Both are lower-is-better, so a positive Δ% means the PR is worse than baseline." 
+ ) + out.append("") + + if rows: + header = "| Benchmark | Params | " + " | ".join(m.label for m in METRICS) + " | Status |" + sep = "|---|---|" + "|".join(["---"] * len(METRICS)) + "|---|" + out.append(header) + out.append(sep) + for r in rows: + bench, params = r.key + cells = " | ".join(d.cell() for d in r.deltas) + out.append( + f"| `{short_bench(bench)}` | {params or '—'} | {cells} | {r.status(threshold)} |" + ) + out.append("") + else: + out.append("_No benchmarks matched between baseline and PR._") + out.append("") + + if only_current: + out.append("
<details><summary>Benchmarks only in PR run</summary>") + out.append("") + for k in only_current: + bench, params = k + rec = current[k] + time_d = _primary(rec) or {} + alloc_d = _secondary(rec, ALLOC_NORM_KEY) or {} + out.append( + f"- `{short_bench(bench)}` ({params or '—'}): " + f"time={fmt_score(_float(time_d, 'score'), _float(time_d, 'scoreError'), time_d.get('scoreUnit', ''))}, " + f"alloc={fmt_score(_float(alloc_d, 'score'), _float(alloc_d, 'scoreError'), alloc_d.get('scoreUnit', ''))}" + ) + out.append("") + out.append("</details>") + out.append("") + + if only_baseline: + out.append("<details><summary>Benchmarks only in baseline run</summary>") + out.append("") + for k in only_baseline: + bench, params = k + out.append(f"- `{short_bench(bench)}` ({params or '—'})") + out.append("") + out.append("</details>")
") + out.append("") + + return "\n".join(out), regressions, improvements + + +# --------------------------------------------------------------------------- +# Entry point +# --------------------------------------------------------------------------- + + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--baseline", required=True, help="Directory containing baseline JMH JSON files") + parser.add_argument("--current", required=True, help="Directory containing current JMH JSON files") + parser.add_argument("--baseline-run-id", default="", help="Baseline workflow run id (for links)") + parser.add_argument("--current-run-id", default="", help="Current workflow run id (for links)") + parser.add_argument("--repo", default="", help="owner/name for run links") + parser.add_argument("--server-url", default="https://github.com") + parser.add_argument("--output", required=True, help="Output markdown file") + parser.add_argument( + "--threshold-pct", + type=float, + default=10.0, + help="Δ%% beyond which a metric is flagged as a regression / improvement (default: 10)", + ) + parser.add_argument( + "--summary-output", + default="", + help="Optional path to write a key=value summary the workflow can source", + ) + args = parser.parse_args() + + if args.threshold_pct < 0: + print("error: --threshold-pct must be non-negative", file=sys.stderr) + return 2 + + baseline = load_results(args.baseline) + current = load_results(args.current) + + if not current: + print("error: no current JMH result files found", file=sys.stderr) + return 2 + + rows = build_rows(baseline, current) + only_current = sorted(set(current) - set(baseline)) + only_baseline = sorted(set(baseline) - set(current)) + + md, regressions, improvements = build_markdown( + rows, + only_current, + only_baseline, + current, + threshold=args.threshold_pct, + repo=args.repo, + server_url=args.server_url, + baseline_run_id=args.baseline_run_id, + current_run_id=args.current_run_id, + ) + + with open(args.output, "w", encoding="utf-8") as fh: + fh.write(md) + + if args.summary_output: + with open(args.summary_output, "w", encoding="utf-8") as fh: + fh.write(f"regressions={regressions}\n") + fh.write(f"improvements={improvements}\n") + fh.write(f"matched={len(rows)}\n") + fh.write(f"threshold_pct={args.threshold_pct}\n") + + print( + f"wrote {args.output}: {len(rows)} matched, " + f"{regressions} regression(s) > {args.threshold_pct:g}%, " + f"{improvements} improvement(s), " + f"{len(only_current)} only-PR, {len(only_baseline)} only-baseline" + ) + # We always exit 0; the workflow uses the summary file to decide + # whether to fail the job so that the comparison comment is still + # posted on regressions. + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/.github/workflows/benchmarks-pr-comment.yml b/.github/workflows/benchmarks-pr-comment.yml new file mode 100644 index 000000000..fe3a01996 --- /dev/null +++ b/.github/workflows/benchmarks-pr-comment.yml @@ -0,0 +1,41 @@ +name: Benchmarks PR Comment + +# Posts a one-time instruction comment on newly opened PRs so contributors / +# reviewers know how to launch a JMH benchmark run for the PR. The actual +# benchmark workflow lives in `benchmarks.yml` and is triggered by a +# `/benchmark` slash command (collaborators only). 
+ + on: + pull_request_target: + types: [opened] + + permissions: + pull-requests: write + + jobs: + comment: + if: startsWith(github.repository, 'ClickHouse/clickhouse-java') + runs-on: ubuntu-latest + steps: + - name: Post benchmark instructions + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + gh api -X POST \ + "repos/${{ github.repository }}/issues/${{ github.event.pull_request.number }}/comments" \ + -f body="$(cat <<'EOF' + Repository collaborators can run the JMH benchmark suite against this PR by commenting: + + ``` + /benchmark + ``` + + Optional regression threshold override (Δ% on Time or Alloc/op; defaults to 10%): + + ``` + /benchmark threshold=15 + ``` + + Only one benchmark run per PR is active at a time — issuing a new `/benchmark` comment cancels the previous run. After the run finishes a separate comment will be posted comparing it against the latest scheduled run on `main`; the PR check fails if any benchmark regresses by more than the threshold. + EOF + )" diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index 87fc22433..64f513d97 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -9,6 +9,10 @@ on: pr: description: "Pull request#" required: false + threshold: + description: "Regression threshold (Δ% on Time or Alloc/op)" + required: false + default: "10" issue_comment: types: [created] @@ -16,57 +20,60 @@ env: CHC_BRANCH: "main" CH_VERSION: "25.3" JAVA_VERSION: 17 + # Default Δ% above which a regression / improvement is flagged and the + # PR check is failed. Overridable per workflow_dispatch input or per + # `/benchmark threshold=N` comment. + DEFAULT_THRESHOLD_PCT: "10" -# One run per PR (cancel any in-progress run when a newer /benchmark arrives -# for the same PR). Scheduled runs are grouped by SHA so they don't fight -# with PR runs. -concurrency: - group: ${{ github.workflow }}-${{ github.event.issue.number || github.event.inputs.pr || github.sha }} - cancel-in-progress: true +# NOTE: there is intentionally no workflow-level `concurrency:` block. +# `issue_comment` events fire for *every* comment on every PR / issue, +# including those from bots (e.g. sonarqubecloud). A workflow-level +# `cancel-in-progress` group keyed on the PR number would cancel an +# in-flight legitimate `/benchmark` run as soon as any unrelated bot +# commented on the same PR. The per-PR concurrency rule is enforced on +# the `jmh` job instead, so unrelated comment events leave the job +# skipped without claiming the concurrency slot. jobs: - # Gate: only run for `issue_comment` events when the comment is on a PR, - # starts with `/benchmark`, is not from a bot, and the commenter is a - # repo OWNER/MEMBER/COLLABORATOR. For schedule and workflow_dispatch this - # job is skipped and the benchmark job runs unconditionally. - trigger-check: - if: github.event_name == 'issue_comment' - name: "Check /benchmark trigger" - runs-on: ubuntu-latest + jmh: + name: "Minimal JMH Benchmarks" + runs-on: "ubuntu-latest" + timeout-minutes: 30 permissions: + contents: read pull-requests: write issues: write - outputs: - pr_number: ${{ steps.resolve.outputs.pr_number }} - steps: - - name: Validate comment - id: validate - if: | + actions: read + # Single fan-in filter, modelled on `.github/workflows/claude.yml`: + # the job runs for the daily schedule, manual `workflow_dispatch`, + # or a `/benchmark` slash-command from a non-bot repo collaborator + # on a pull request.
Bot comments and chat comments leave the job + # skipped — no failed run, no notification, no concurrency + # collision. + if: | + startsWith(github.repository, 'ClickHouse/') && + ( + github.event_name == 'schedule' || + github.event_name == 'workflow_dispatch' || + ( + github.event_name == 'issue_comment' && github.event.issue.pull_request != null && github.event.sender.type != 'Bot' && + github.event.comment.user.type != 'Bot' && startsWith(github.event.comment.body, '/benchmark') && contains(fromJSON('["OWNER","MEMBER","COLLABORATOR"]'), github.event.comment.author_association) - run: echo "ok=true" >> $GITHUB_OUTPUT - # Note: we deliberately use `startsWith` (not `contains`) so the - # instruction comment posted by the PR-open bot, which mentions - # `/benchmark` mid-sentence, does not re-trigger this workflow. - - - name: Reject unauthorized trigger - if: steps.validate.outputs.ok != 'true' - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - # If it looks like a /benchmark attempt by someone without - # permission, leave a -1 reaction so they get feedback. - if [[ "${{ github.event.issue.pull_request != null }}" == "true" ]] \ - && [[ "${{ startsWith(github.event.comment.body, '/benchmark') }}" == "true" ]]; then - gh api -X POST \ - "repos/${{ github.repository }}/issues/comments/${{ github.event.comment.id }}/reactions" \ - -f content='-1' || true - fi - exit 1 - - - name: Acknowledge trigger + ) + ) + # One running benchmark per PR (and per-SHA for the daily + # schedule). Concurrency lives on this job, not on the workflow, + # so unrelated comment events (which the job-level `if` filters + # out) never claim the slot or cancel an in-flight run. + concurrency: + group: ${{ github.workflow }}-jmh-${{ github.event.issue.number || github.event.inputs.pr || github.sha }} + cancel-in-progress: true + steps: + - name: Acknowledge /benchmark trigger + if: github.event_name == 'issue_comment' env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | @@ -74,28 +81,32 @@ jobs: "repos/${{ github.repository }}/issues/comments/${{ github.event.comment.id }}/reactions" \ -f content='rocket' || true - - name: Resolve PR number - id: resolve - run: echo "pr_number=${{ github.event.issue.number }}" >> $GITHUB_OUTPUT - - jmh: - needs: [trigger-check] - if: | - always() && - startsWith(github.repository, 'ClickHouse/') && - (needs.trigger-check.result == 'success' || needs.trigger-check.result == 'skipped') - name: "Mininal JMH Benchmarks" - runs-on: "ubuntu-latest" - timeout-minutes: 30 - steps: - - name: Resolve PR number + - name: Resolve PR number and threshold id: pr + env: + COMMENT_BODY: ${{ github.event.comment.body }} + DISPATCH_PR: ${{ github.event.inputs.pr }} + DISPATCH_THRESHOLD: ${{ github.event.inputs.threshold }} + DEFAULT_THRESHOLD: ${{ env.DEFAULT_THRESHOLD_PCT }} run: | case "${{ github.event_name }}" in - issue_comment) echo "number=${{ needs.trigger-check.outputs.pr_number }}" >> $GITHUB_OUTPUT ;; - workflow_dispatch) echo "number=${{ github.event.inputs.pr }}" >> $GITHUB_OUTPUT ;; - *) echo "number=" >> $GITHUB_OUTPUT ;; + issue_comment) + # Accept `/benchmark threshold=15` or `/benchmark threshold=7.5`. + T=$(printf '%s' "$COMMENT_BODY" | grep -oE 'threshold=[0-9]+(\.[0-9]+)?' 
| head -1 | cut -d= -f2 || true) + [ -z "$T" ] && T="$DEFAULT_THRESHOLD" + echo "number=${{ github.event.issue.number }}" >> "$GITHUB_OUTPUT" + echo "threshold=$T" >> "$GITHUB_OUTPUT" + ;; + workflow_dispatch) + echo "number=$DISPATCH_PR" >> "$GITHUB_OUTPUT" + echo "threshold=${DISPATCH_THRESHOLD:-$DEFAULT_THRESHOLD}" >> "$GITHUB_OUTPUT" + ;; + *) + echo "number=" >> "$GITHUB_OUTPUT" + echo "threshold=$DEFAULT_THRESHOLD" >> "$GITHUB_OUTPUT" + ;; esac + - name: Post "started" comment if: github.event_name == 'issue_comment' && steps.pr.outputs.number != '' env: @@ -104,33 +115,40 @@ jobs: gh api -X POST \ "repos/${{ github.repository }}/issues/${{ steps.pr.outputs.number }}/comments" \ -f body="JMH benchmark run started: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" || true + - name: Check out Git repository uses: actions/checkout@v4 with: ref: ${{ env.CHC_BRANCH }} + - name: Check out PR if: steps.pr.outputs.number != '' run: | git fetch --no-tags --prune --progress --no-recurse-submodules --depth=1 \ origin pull/${{ steps.pr.outputs.number }}/merge:merged-pr && git checkout merged-pr + - name: Install JDK and Maven uses: actions/setup-java@v4 with: distribution: "temurin" java-version: ${{ env.JAVA_VERSION }} cache: "maven" + - name: Build run: mvn --batch-mode --no-transfer-progress -Dj8 -DskipTests=true clean install + - name: Prepare Dataset run: | cd ./performance && mvn --batch-mode --no-transfer-progress clean compile exec:exec -Dexec.executable=java \ -Dexec.args="-classpath %classpath com.clickhouse.benchmark.data.DataSetGenerator -input sample_dataset.sql -name default -rows 100000" + - name: Run Benchmarks run: | cd ./performance && mvn --batch-mode --no-transfer-progress clean compile exec:exec -Dexec.executable=java -Dexec.args="-classpath %classpath com.clickhouse.benchmark.BenchmarkRunner \ -l 100000,10000 -m 3 -t 15 -b q,i -d file://default.csv" + - name: Upload test results uses: actions/upload-artifact@v4 if: success() @@ -138,3 +156,93 @@ jobs: name: result ${{ github.job }} path: | performance/jmh-results* + + # Compare against the latest scheduled run on `main` and post a + # markdown comment. Only relevant when this run is tied to a PR; + # scheduled / non-PR runs skip these steps. We never fail the + # workflow if comparison fails — it's reporting, not gating. 
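+ # For reference, the compare-summary.env file that the compare step + # surfaces as step outputs is plain key=value, one pair per line; the + # values here are illustrative: regressions=1, improvements=0, + # matched=42, threshold_pct=10.0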
+ - name: Fetch baseline results (latest successful main schedule) + id: baseline + if: steps.pr.outputs.number != '' + continue-on-error: true + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + mkdir -p baseline-results + RUN_ID=$(gh run list \ + --workflow benchmarks.yml \ + --branch main \ + --status success \ + --limit 20 \ + --repo "${{ github.repository }}" \ + --json databaseId,event \ + -q 'map(select(.event=="schedule"))[0].databaseId // empty') + if [ -z "$RUN_ID" ]; then + echo "No scheduled baseline run found on main" + echo "found=false" >> "$GITHUB_OUTPUT" + exit 0 + fi + echo "Baseline run: $RUN_ID" + if gh run download "$RUN_ID" --dir baseline-results --repo "${{ github.repository }}"; then + echo "found=true" >> "$GITHUB_OUTPUT" + echo "run_id=$RUN_ID" >> "$GITHUB_OUTPUT" + else + echo "Failed to download baseline artifacts" + echo "found=false" >> "$GITHUB_OUTPUT" + fi + + - name: Compare benchmark results + id: compare + if: steps.pr.outputs.number != '' && steps.baseline.outputs.found == 'true' + continue-on-error: true + run: | + python3 .github/scripts/compare-jmh.py \ + --baseline baseline-results \ + --current performance \ + --baseline-run-id "${{ steps.baseline.outputs.run_id }}" \ + --current-run-id "${{ github.run_id }}" \ + --repo "${{ github.repository }}" \ + --server-url "${{ github.server_url }}" \ + --threshold-pct "${{ steps.pr.outputs.threshold }}" \ + --output comparison.md \ + --summary-output compare-summary.env + # Surface the script's summary file as step outputs so the + # follow-up "enforce threshold" step can decide whether to + # fail the job — without skipping the comment post. + cat compare-summary.env >> "$GITHUB_OUTPUT" + echo "ok=true" >> "$GITHUB_OUTPUT" + + - name: Post baseline-not-found comment + if: | + steps.pr.outputs.number != '' && + steps.baseline.outputs.found != 'true' + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + gh pr comment "${{ steps.pr.outputs.number }}" \ + --repo "${{ github.repository }}" \ + --body "JMH benchmark comparison skipped: no successful scheduled run on \`main\` was found to use as a baseline." || true + + - name: Post comparison comment + if: steps.compare.outputs.ok == 'true' + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + gh pr comment "${{ steps.pr.outputs.number }}" \ + --repo "${{ github.repository }}" \ + --body-file comparison.md + + # Fail the job — and therefore the PR check — when the comparison + # script flagged at least one regression beyond the threshold. + # This runs *after* the comment has been posted so reviewers still + # see the full table on the PR. + - name: Enforce regression threshold + if: steps.compare.outputs.ok == 'true' + run: | + REGRESSIONS="${{ steps.compare.outputs.regressions }}" + THRESHOLD="${{ steps.pr.outputs.threshold }}" + if [ -n "$REGRESSIONS" ] && [ "$REGRESSIONS" -gt 0 ]; then + echo "::error::$REGRESSIONS benchmark(s) regressed by more than ${THRESHOLD}% vs baseline." + exit 1 + fi + echo "No regressions over ${THRESHOLD}%."
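
For a quick local sanity check of the comparison logic, `compare-jmh.py` can be driven end-to-end with synthetic inputs. Below is a minimal sketch, assuming it is run from the repository root with Python 3; the benchmark name, params, and scores are made-up stand-ins, and only the JSON fields the script actually reads are populated:

```python
#!/usr/bin/env python3
"""Dry-run .github/scripts/compare-jmh.py against synthetic JMH results."""
import json
import os
import subprocess
import tempfile


def record(time_ms: float, alloc_b: float) -> list:
    # Hand-written stand-in for a JMH result file: a list with one record,
    # holding only the fields compare-jmh.py reads. The benchmark name and
    # params are invented for the example.
    return [{
        "benchmark": "com.example.bench.QueryBench.select",
        "params": {"rows": "100000"},
        "primaryMetric": {
            "score": time_ms,
            "scoreError": time_ms * 0.05,
            "scoreUnit": "ms/op",
        },
        # The key really starts with U+00B7 MIDDLE DOT, as JMH's GCProfiler
        # emits it.
        "secondaryMetrics": {
            "\u00b7gc.alloc.rate.norm": {"score": alloc_b, "scoreUnit": "B/op"},
        },
    }]


with tempfile.TemporaryDirectory() as tmp:
    base_dir = os.path.join(tmp, "baseline")
    curr_dir = os.path.join(tmp, "current")
    os.makedirs(base_dir)
    os.makedirs(curr_dir)
    with open(os.path.join(base_dir, "jmh-results-1.json"), "w") as fh:
        json.dump(record(100.0, 4096.0), fh)  # baseline: 100 ms/op
    with open(os.path.join(curr_dir, "jmh-results-1.json"), "w") as fh:
        json.dump(record(115.0, 4096.0), fh)  # +15% time vs. 10% threshold
    subprocess.run(
        ["python3", ".github/scripts/compare-jmh.py",
         "--baseline", base_dir,
         "--current", curr_dir,
         "--threshold-pct", "10",
         "--output", os.path.join(tmp, "comparison.md"),
         "--summary-output", os.path.join(tmp, "summary.env")],
        check=True,
    )
    with open(os.path.join(tmp, "summary.env")) as fh:
        print(fh.read())  # expect regressions=1, improvements=0, matched=1
```

The +15% time delta exceeds the 10% threshold, so `summary.env` should report `regressions=1` and `comparison.md` should open with the ❌ header; the script itself still exits 0, matching the contract the workflow's "Enforce regression threshold" step relies on.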