Automate corpora testing in CI #17

Workflow file for this run

.github/workflows/detector-corpora-test.yml at bbaa4af

	# Differential corpora benchmark for detectors. Runs on manual dispatch and
	# on pull requests that touch detector sources or the bench tooling itself.
	name: Corpora Test

	on:
	  workflow_dispatch:
	  pull_request:
	    # types: [opened, reopened] TODO: Decide if we should run this on every push
	    paths:
	      - 'pkg/detectors/**'
	      - 'pkg/engine/defaults/defaults.go'
	      - '.github/workflows/detector-corpora-test.yml'
	      - 'scripts/test/detector_corpora_test.sh'
	      - 'scripts/test/diff_corpora_results.py'
	      - 'scripts/test/detect_changed_detectors.sh'
	      - 'scripts/test/build_keyword_corpus.py'
	      - 'hack/extract-keywords/**'

	# One live run per branch. The job rewrites a single sticky PR comment in
	# place, so without this an older run that finishes late would overwrite the
	# report of a newer push. head_ref covers pull_request events and ref_name
	# covers workflow_dispatch, so both triggers for a branch share one group.
	concurrency:
	  group: ${{ github.workflow }}-${{ github.head_ref || github.ref_name }}
	  cancel-in-progress: true

	# DATASETS is a newline-separated list of corpus locations (one per line).
	# The scan steps split it on newlines and ignore blank entries.
	env:
	  DATASETS: |
	    s3://trufflehog-corpora-datasets/contents.2025-11-04.jsonl.zstd

	jobs:
	  corpora-test:
	    runs-on: ubuntu-latest
	    # Upstream repository only, and never for pull requests opened from a
	    # fork: the steps below use repository secrets and a write-scoped token.
	    if: ${{ github.repository == 'trufflesecurity/trufflehog' && !github.event.pull_request.head.repo.fork }}
	    permissions:
	      # Needed to create/update the results comment on the pull request.
	      pull-requests: write
	      contents: read
	    steps:
	- name: Checkout code
	uses: actions/checkout@v4
	with:
	fetch-depth: 0

	- name: Install Go
	uses: actions/setup-go@v5
	with:
	go-version: "1.25"

	- name: Install dependencies
	run: sudo apt-get install -y zstd jq

	- name: Configure AWS credentials
	uses: aws-actions/configure-aws-credentials@v4
	with:
	aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY }}
	aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
	aws-region: us-east-1

	- name: Resolve merge-base
	id: merge_base
	shell: bash
	run: \|
	set -o pipefail
	git fetch --no-tags --prune origin main
	MERGE_BASE=$(git merge-base origin/main HEAD)
	echo "Merge base: $MERGE_BASE"
	echo "sha=$MERGE_BASE" >> "$GITHUB_OUTPUT"

	# Determine which detectors changed in this PR. The PR build scopes its
	# scan to the full set; the main build excludes detectors that don't
	# exist there yet (new detectors). If the set is empty, the workflow
	# short-circuits with a skip comment — scoping is the entire point of
	# Phase 2, falling back to scan-all defeats it.
	- name: Detect changed detectors
	id: detect
	shell: bash
	env:
	BASE_REF: ${{ steps.merge_base.outputs.sha }}
	run: \|
	set -o pipefail
	chmod +x scripts/test/detect_changed_detectors.sh
	PR_CSV=$(./scripts/test/detect_changed_detectors.sh --pr-csv \|\| true)
	MAIN_CSV=$(./scripts/test/detect_changed_detectors.sh --main-csv \|\| true)
	NEW_LIST=$(./scripts/test/detect_changed_detectors.sh --new-only \|\| true)
	NEW_CSV=$(echo "$NEW_LIST" \| paste -sd, -)
	echo "PR detectors: $PR_CSV"
	echo "Main detectors: $MAIN_CSV"
	echo "New detectors: $NEW_CSV"
	echo "pr_csv=$PR_CSV" >> "$GITHUB_OUTPUT"
	echo "main_csv=$MAIN_CSV" >> "$GITHUB_OUTPUT"
	echo "new_csv=$NEW_CSV" >> "$GITHUB_OUTPUT"
	if [[ -n "$PR_CSV" ]]; then
	echo "any_changed=true" >> "$GITHUB_OUTPUT"
	else
	echo "any_changed=false" >> "$GITHUB_OUTPUT"
	fi

	# Sticky comment: find any prior detector-bench comment on the PR by
	# the marker substring and update it in place. The marker — kept in
	# sync with STICKY_COMMENT_MARKER in scripts/test/diff_corpora_results.py —
	# has to appear in BOTH the skip body and the diff body so the same
	# comment flips between them as iterative pushes change which path
	# fires. Skip body is only posted on pull_request events; the original
	# workflow_dispatch early-return is preserved by the event-name guard.
	- name: Find existing skip comment
	if: steps.detect.outputs.any_changed != 'true' && github.event_name == 'pull_request'
	id: find_skip_comment
	uses: peter-evans/find-comment@v3
	with:
	issue-number: ${{ github.event.pull_request.number }}
	comment-author: 'github-actions[bot]'
	body-includes: '<!-- detector-bench -->'

	- name: Post or update skip comment
	if: steps.detect.outputs.any_changed != 'true' && github.event_name == 'pull_request'
	uses: peter-evans/create-or-update-comment@v4
	with:
	comment-id: ${{ steps.find_skip_comment.outputs.comment-id }}
	issue-number: ${{ github.event.pull_request.number }}
	edit-mode: replace
	body: \|
	<!-- detector-bench -->
	## Corpora Test Results

	No detector source files changed in this PR. Bench skipped.

	# Layer 1 keyword corpus — fetch real-world snippets from GitHub Code
	# Search for each changed detector's pre-filter keywords. Output is a
	# zstd-compressed JSONL whose shape matches the S3 corpus, so the
	# corpora script picks it up unchanged via the DATASETS append below.
	# The same corpus file is fed to both PR and main builds; thin-L1
	# detectors and per-detector counts are written to a sidecar JSON the
	# diff step renders.
	- name: Build extract-keywords helper
	if: steps.detect.outputs.any_changed == 'true'
	shell: bash
	run: \|
	set -o pipefail
	CGO_ENABLED=0 go build -o /tmp/extract-keywords ./hack/extract-keywords

	- name: Build keyword corpus (Layer 1)
	if: steps.detect.outputs.any_changed == 'true'
	shell: bash
	env:
	GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
	DETECTORS: ${{ steps.detect.outputs.pr_csv }}
	run: \|
	set -o pipefail
	python3 scripts/test/build_keyword_corpus.py \
	--detectors="$DETECTORS" \
	--extract-keywords-bin=/tmp/extract-keywords \
	--output-corpus=/tmp/keyword-corpus.jsonl.zstd \
	--output-meta=/tmp/keyword-corpus-meta.json \
	--max-results-per-detector="${KEYWORD_CORPUS_CAP:-100}"
	# Append to DATASETS for downstream scan steps. The python script
	# always writes a (possibly empty) corpus, so the path is safe to
	# append unconditionally — empty zstd frames decompress to 0
	# bytes and pass through the existing scan pipeline cleanly.
	echo "DATASETS<<EOF" >> "$GITHUB_ENV"
	echo "$DATASETS" >> "$GITHUB_ENV"
	echo "/tmp/keyword-corpus.jsonl.zstd" >> "$GITHUB_ENV"
	echo "EOF" >> "$GITHUB_ENV"

	- name: Prepare main worktree
	if: steps.detect.outputs.any_changed == 'true'
	shell: bash
	env:
	MERGE_BASE: ${{ steps.merge_base.outputs.sha }}
	run: \|
	set -o pipefail
	git worktree add /tmp/trufflehog-main-src "$MERGE_BASE"

	- name: Build trufflehog (PR HEAD)
	if: steps.detect.outputs.any_changed == 'true'
	shell: bash
	run: \|
	set -o pipefail
	CGO_ENABLED=0 go build -o /tmp/trufflehog-pr .

	- name: Build trufflehog (main merge-base)
	if: steps.detect.outputs.any_changed == 'true'
	shell: bash
	working-directory: /tmp/trufflehog-main-src
	run: \|
	set -o pipefail
	CGO_ENABLED=0 go build -o /tmp/trufflehog-main .

	# The PR scan always runs (any_changed=true means at least one detector
	# is in pr_csv). It also captures the corpus byte total for the diff
	# script's blast-radius column — same content streams to both binaries,
	# so measuring once is enough.
	- name: Run corpora test (PR build)
	if: steps.detect.outputs.any_changed == 'true'
	shell: bash
	env:
	TRUFFLEHOG_BIN: /tmp/trufflehog-pr
	OUTPUT_JSONL: /tmp/results-pr.jsonl
	STDERR_FILE: /tmp/corpora-stderr-pr.txt
	INCLUDE_DETECTORS: ${{ steps.detect.outputs.pr_csv }}
	CORPUS_BYTES_FILE: /tmp/corpus-bytes.txt
	run: \|
	set -o pipefail
	files=()
	while IFS= read -r dataset; do
	[[ -z "$dataset" ]] && continue
	files+=("$dataset")
	done <<< "$DATASETS"
	./scripts/test/detector_corpora_test.sh "${files[@]}"

	# Main scan is skipped when main_csv is empty (PR adds only new
	# detectors — nothing to compare against on main). The diff step is
	# safe with an empty main JSONL: every PR finding is treated as NEW,
	# which is correct semantics for new detectors.
	- name: Run corpora test (main build)
	if: steps.detect.outputs.any_changed == 'true'
	shell: bash
	env:
	TRUFFLEHOG_BIN: /tmp/trufflehog-main
	OUTPUT_JSONL: /tmp/results-main.jsonl
	STDERR_FILE: /tmp/corpora-stderr-main.txt
	INCLUDE_DETECTORS: ${{ steps.detect.outputs.main_csv }}
	run: \|
	set -o pipefail
	if [[ -z "$INCLUDE_DETECTORS" ]]; then
	echo "No overlapping detectors in main; skipping main scan."
	: > "$OUTPUT_JSONL"
	exit 0
	fi
	files=()
	while IFS= read -r dataset; do
	[[ -z "$dataset" ]] && continue
	files+=("$dataset")
	done <<< "$DATASETS"
	./scripts/test/detector_corpora_test.sh "${files[@]}"

	- name: Diff results
	if: steps.detect.outputs.any_changed == 'true'
	shell: bash
	env:
	CHANGED: ${{ steps.detect.outputs.pr_csv }}
	NEW_DETECTORS: ${{ steps.detect.outputs.new_csv }}
	run: \|
	set -o pipefail
	CORPUS_BYTES=0
	if [[ -s /tmp/corpus-bytes.txt ]]; then
	CORPUS_BYTES=$(cat /tmp/corpus-bytes.txt)
	fi
	META_ARG=()
	if [[ -s /tmp/keyword-corpus-meta.json ]]; then
	META_ARG=(--keyword-corpus-meta=/tmp/keyword-corpus-meta.json)
	fi
	python3 scripts/test/diff_corpora_results.py \
	/tmp/results-main.jsonl /tmp/results-pr.jsonl \
	--changed-detectors="$CHANGED" \
	--new-detectors="$NEW_DETECTORS" \
	--corpus-bytes="$CORPUS_BYTES" \
	"${META_ARG[@]}" \
	> /tmp/diff-report.md
	cat /tmp/diff-report.md

	# workflow_dispatch runs don't carry an issue context, so resolve the
	# PR number by branch lookup. pull_request events fall through to the
	# event's issue number. Output feeds the find/update pair below.
	- name: Resolve PR number
	if: steps.detect.outputs.any_changed == 'true'
	id: resolve_pr
	uses: actions/github-script@v7
	with:
	script: \|
	let issue_number;
	if (context.eventName === 'workflow_dispatch') {
	const pulls = await github.rest.pulls.list({
	owner: context.repo.owner,
	repo: context.repo.repo,
	head: `${context.repo.owner}:${context.ref.replace('refs/heads/', '')}`,
	state: 'open',
	});
	if (pulls.data.length === 0) {
	core.setFailed(`No open PR found for branch ${context.ref}`);
	return;
	}
	issue_number = pulls.data[0].number;
	} else {
	issue_number = context.issue.number;
	}
	core.setOutput('issue_number', issue_number);

	- name: Find existing diff comment
	if: steps.detect.outputs.any_changed == 'true'
	id: find_diff_comment
	uses: peter-evans/find-comment@v3
	with:
	issue-number: ${{ steps.resolve_pr.outputs.issue_number }}
	comment-author: 'github-actions[bot]'
	body-includes: '<!-- detector-bench -->'

	- name: Post or update diff comment
	if: steps.detect.outputs.any_changed == 'true'
	uses: peter-evans/create-or-update-comment@v4
	with:
	comment-id: ${{ steps.find_diff_comment.outputs.comment-id }}
	issue-number: ${{ steps.resolve_pr.outputs.issue_number }}
	edit-mode: replace
	body-path: /tmp/diff-report.md

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Automate corpora testing in CI #17

Workflow file

Automate corpora testing in CI #17

Uh oh!

Workflow file for this run