Skip to content

Automate corpora testing in CI #23

Automate corpora testing in CI

Automate corpora testing in CI #23

# Workflow header: display name, triggers, and workflow-wide environment.
name: Corpora Test

on:
  # Manual runs from the Actions tab.
  workflow_dispatch:
  # Pull requests, but only when a path listed below is touched.
  pull_request:
    paths:
      - 'pkg/detectors/**'
      - 'pkg/engine/defaults/defaults.go'
      - '.github/workflows/detector-corpora-test.yml'
      - 'scripts/test/detector_corpora_test.sh'
      - 'scripts/test/diff_corpora_results.py'
      - 'scripts/test/detect_changed_detectors.sh'

env:
  # Newline-separated list of corpus objects, one S3 URI per line. Kept as a
  # literal block scalar so further datasets can be appended line by line.
  DATASETS: |
    s3://trufflehog-corpora-datasets/contents.2025-11-04.jsonl.zstd
jobs:
  corpora-test:
    # Run only in the upstream repository and never for pull requests whose
    # head repository is a fork. The AWS credentials step below reads
    # repository secrets.
    if: ${{ github.repository == 'trufflesecurity/trufflehog' && !github.event.pull_request.head.repo.fork }}
    runs-on: ubuntu-latest
    permissions:
      contents: read
      # Required by the find/create-or-update comment steps.
      pull-requests: write
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          # Full history: later steps compute a merge-base against main and
          # check that commit out as a separate worktree.
          fetch-depth: 0

      - name: Install Go
        uses: actions/setup-go@v5
        with:
          go-version: "1.25"

      - name: Install dependencies
        run: sudo apt-get install -y zstd jq

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: us-east-1

      # Publishes the merge-base of origin/main and HEAD as the `sha` output;
      # it is used as the diff base and as the commit for the "main" binary.
      - name: Resolve merge-base
        id: merge_base
        shell: bash
        run: |
          set -o pipefail
          git fetch --no-tags --prune origin main
          MERGE_BASE=$(git merge-base origin/main HEAD)
          echo "Merge base: $MERGE_BASE"
          echo "sha=$MERGE_BASE" >> "$GITHUB_OUTPUT"

      # Determine which detectors changed in this PR. The PR build scopes its
      # scan to the full set; the main build excludes detectors that don't
      # exist there yet (new detectors). If the set is empty, the workflow
      # short-circuits with a skip comment — scoping is the entire point of
      # Phase 2, falling back to scan-all defeats it.
      #
      # NOTE(review): every helper invocation ends in `|| true`, so a helper
      # that fails without printing anything is indistinguishable from
      # "nothing changed" and takes the skip path. Confirm this best-effort
      # behavior is intended.
      - name: Detect changed detectors
        id: detect
        shell: bash
        env:
          BASE_REF: ${{ steps.merge_base.outputs.sha }}
        run: |
          set -o pipefail
          chmod +x scripts/test/detect_changed_detectors.sh
          PR_CSV=$(./scripts/test/detect_changed_detectors.sh --pr-csv || true)
          MAIN_CSV=$(./scripts/test/detect_changed_detectors.sh --main-csv || true)
          NEW_LIST=$(./scripts/test/detect_changed_detectors.sh --new-only || true)
          NEW_CSV=$(echo "$NEW_LIST" | paste -sd, -)
          echo "PR detectors: $PR_CSV"
          echo "Main detectors: $MAIN_CSV"
          echo "New detectors: $NEW_CSV"
          echo "pr_csv=$PR_CSV" >> "$GITHUB_OUTPUT"
          echo "main_csv=$MAIN_CSV" >> "$GITHUB_OUTPUT"
          echo "new_csv=$NEW_CSV" >> "$GITHUB_OUTPUT"
          if [[ -n "$PR_CSV" ]]; then
            echo "any_changed=true" >> "$GITHUB_OUTPUT"
          else
            echo "any_changed=false" >> "$GITHUB_OUTPUT"
          fi

      # Sticky comment: find any prior detector-bench comment on the PR by
      # the marker substring and update it in place. The marker — kept in
      # sync with STICKY_COMMENT_MARKER in scripts/test/diff_corpora_results.py —
      # has to appear in BOTH the skip body and the diff body so the same
      # comment flips between them as iterative pushes change which path
      # fires. Skip body is only posted on pull_request events; workflow_dispatch
      # runs with no changed detectors silently finish without posting.
      - name: Find existing skip comment
        if: steps.detect.outputs.any_changed != 'true' && github.event_name == 'pull_request'
        id: find_skip_comment
        uses: peter-evans/find-comment@v3
        with:
          issue-number: ${{ github.event.pull_request.number }}
          comment-author: 'github-actions[bot]'
          body-includes: '<!-- detector-bench -->'

      - name: Post or update skip comment
        if: steps.detect.outputs.any_changed != 'true' && github.event_name == 'pull_request'
        uses: peter-evans/create-or-update-comment@v4
        with:
          # Empty when no earlier comment was found, in which case a new
          # comment is created on the PR.
          comment-id: ${{ steps.find_skip_comment.outputs.comment-id }}
          issue-number: ${{ github.event.pull_request.number }}
          edit-mode: replace
          body: |
            <!-- detector-bench -->
            ## Corpora Test Results
            No detector source files changed in this PR. Bench skipped.

      # Two independent builds run in parallel:
      #   A) prepare main worktree → build main binary (git I/O then CPU)
      #   B) build PR binary (CPU, no dependencies)
      - name: Build binaries
        if: steps.detect.outputs.any_changed == 'true'
        shell: bash
        env:
          MERGE_BASE: ${{ steps.merge_base.outputs.sha }}
        run: |
          set -o pipefail
          # Chain A: prepare worktree, then build main binary.
          (
            git worktree add /tmp/trufflehog-main-src "$MERGE_BASE"
            cd /tmp/trufflehog-main-src
            CGO_ENABLED=0 go build -o /tmp/trufflehog-main .
          ) &
          PID_MAIN_BUILD=$!
          # Chain B: build PR binary (no dependencies).
          CGO_ENABLED=0 go build -o /tmp/trufflehog-pr . &
          PID_PR_BUILD=$!
          wait $PID_MAIN_BUILD || { echo "Main binary build failed" >&2; exit 1; }
          wait $PID_PR_BUILD || { echo "PR binary build failed" >&2; exit 1; }

      # PR and main scans run in parallel. Each streams the corpus files
      # independently from S3 — no shared state, different output files,
      # different binaries. The main scan is skipped when main_csv is empty
      # (PR adds only new detectors). CORPUS_BYTES_FILE is only written by
      # the PR scan (blast-radius needs one consistent byte count).
      - name: Run corpora tests
        if: steps.detect.outputs.any_changed == 'true'
        shell: bash
        env:
          PR_CSV: ${{ steps.detect.outputs.pr_csv }}
          MAIN_CSV: ${{ steps.detect.outputs.main_csv }}
        run: |
          set -o pipefail
          # Split the newline-separated DATASETS value into an array,
          # dropping blank lines.
          files=()
          while IFS= read -r dataset; do
            [[ -z "$dataset" ]] && continue
            files+=("$dataset")
          done <<< "$DATASETS"
          # PR scan.
          (
            export TRUFFLEHOG_BIN=/tmp/trufflehog-pr
            export OUTPUT_JSONL=/tmp/results-pr.jsonl
            export STDERR_FILE=/tmp/corpora-stderr-pr.txt
            export INCLUDE_DETECTORS="$PR_CSV"
            export CORPUS_BYTES_FILE=/tmp/corpus-bytes.txt
            ./scripts/test/detector_corpora_test.sh "${files[@]}"
          ) &
          PID_PR=$!
          # Main scan (skipped when no detectors overlap with main).
          PID_MAIN=""
          if [[ -n "$MAIN_CSV" ]]; then
            (
              export TRUFFLEHOG_BIN=/tmp/trufflehog-main
              export OUTPUT_JSONL=/tmp/results-main.jsonl
              export STDERR_FILE=/tmp/corpora-stderr-main.txt
              export INCLUDE_DETECTORS="$MAIN_CSV"
              ./scripts/test/detector_corpora_test.sh "${files[@]}"
            ) &
            PID_MAIN=$!
          else
            echo "No overlapping detectors in main; skipping main scan."
            # Empty results file so the diff step still has a "main" input.
            : > /tmp/results-main.jsonl
          fi
          wait $PID_PR || { echo "PR scan failed" >&2; exit 1; }
          # Use an if-block rather than `[[ ... ]] && { ... }` here: this is
          # the last command of the script, and a false `[[ ]]` test would
          # otherwise become the script's exit status and fail the step
          # whenever the main scan was skipped.
          if [[ -n "$PID_MAIN" ]]; then
            wait "$PID_MAIN" || { echo "Main scan failed" >&2; exit 1; }
          fi

      # Renders the markdown report that is posted as the PR comment and
      # echoes it to the job log.
      - name: Diff results
        if: steps.detect.outputs.any_changed == 'true'
        shell: bash
        env:
          CHANGED: ${{ steps.detect.outputs.pr_csv }}
          NEW_DETECTORS: ${{ steps.detect.outputs.new_csv }}
        run: |
          set -o pipefail
          # Fall back to 0 when the PR scan left no (or an empty) byte-count file.
          CORPUS_BYTES=0
          if [[ -s /tmp/corpus-bytes.txt ]]; then
            CORPUS_BYTES=$(cat /tmp/corpus-bytes.txt)
          fi
          python3 scripts/test/diff_corpora_results.py \
            /tmp/results-main.jsonl /tmp/results-pr.jsonl \
            --changed-detectors="$CHANGED" \
            --new-detectors="$NEW_DETECTORS" \
            --corpus-bytes="$CORPUS_BYTES" \
            > /tmp/diff-report.md
          cat /tmp/diff-report.md

      # workflow_dispatch runs don't carry an issue context, so resolve the
      # PR number by branch lookup. pull_request events fall through to the
      # event's issue number. Output feeds the find/update pair below.
      - name: Resolve PR number
        if: steps.detect.outputs.any_changed == 'true'
        id: resolve_pr
        uses: actions/github-script@v7
        with:
          script: |
            let issue_number;
            if (context.eventName === 'workflow_dispatch') {
              const pulls = await github.rest.pulls.list({
                owner: context.repo.owner,
                repo: context.repo.repo,
                head: `${context.repo.owner}:${context.ref.replace('refs/heads/', '')}`,
                state: 'open',
              });
              if (pulls.data.length === 0) {
                core.setFailed(`No open PR found for branch ${context.ref}`);
                return;
              }
              issue_number = pulls.data[0].number;
            } else {
              issue_number = context.issue.number;
            }
            core.setOutput('issue_number', issue_number);

      - name: Find existing diff comment
        if: steps.detect.outputs.any_changed == 'true'
        id: find_diff_comment
        uses: peter-evans/find-comment@v3
        with:
          issue-number: ${{ steps.resolve_pr.outputs.issue_number }}
          comment-author: 'github-actions[bot]'
          body-includes: '<!-- detector-bench -->'

      - name: Post or update diff comment
        if: steps.detect.outputs.any_changed == 'true'
        uses: peter-evans/create-or-update-comment@v4
        with:
          # Empty when no earlier comment was found, in which case a new
          # comment is created on the PR.
          comment-id: ${{ steps.find_diff_comment.outputs.comment-id }}
          issue-number: ${{ steps.resolve_pr.outputs.issue_number }}
          edit-mode: replace
          body-path: /tmp/diff-report.md