Skip to content

Automate corpora testing in CI #17

Automate corpora testing in CI

Automate corpora testing in CI #17

# Corpora Test workflow.
# Triggers: manual dispatch, and pull requests that modify any of the paths
# listed under `on.pull_request.paths` (detector sources, the default detector
# list, this workflow file, and the corpora-test helper scripts).
name: Corpora Test
on:
  workflow_dispatch:
  pull_request:
    # types: [opened, reopened] TODO: Decide if we should run this on every push
    paths:
      - 'pkg/detectors/**'
      - 'pkg/engine/defaults/defaults.go'
      - '.github/workflows/detector-corpora-test.yml'
      - 'scripts/test/detector_corpora_test.sh'
      - 'scripts/test/diff_corpora_results.py'
      - 'scripts/test/detect_changed_detectors.sh'
      - 'scripts/test/build_keyword_corpus.py'
      - 'hack/extract-keywords/**'
env:
  # Newline-separated list of corpus datasets, one location per line. The
  # scan steps split this value on newlines and skip blank lines; the keyword
  # corpus step appends a local file path to it at runtime.
  DATASETS: |
    s3://trufflehog-corpora-datasets/contents.2025-11-04.jsonl.zstd
jobs:
  # Builds trufflehog at the PR head and at the merge-base with main, scans
  # the corpora with both binaries (scoped to the detectors the PR changed),
  # diffs the findings, and keeps a single sticky PR comment up to date.
  corpora-test:
    # Run only in the upstream repository and never for PRs opened from forks;
    # the steps below depend on repository secrets (AWS keys).
    if: ${{ github.repository == 'trufflesecurity/trufflehog' && !github.event.pull_request.head.repo.fork }}
    runs-on: ubuntu-latest
    permissions:
      contents: read
      pull-requests: write
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          # Full history: the merge-base with main is resolved and later
          # checked out as a separate worktree.
          fetch-depth: 0

      - name: Install Go
        uses: actions/setup-go@v5
        with:
          go-version: "1.25"

      # Refresh the package index before installing; installing against a
      # stale index can fail when the indexed package versions have been
      # removed from the mirrors.
      - name: Install dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y zstd jq

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: us-east-1

      # Publishes the merge-base of origin/main and HEAD as the `sha` output;
      # it is the baseline for change detection and for the "main" build.
      - name: Resolve merge-base
        id: merge_base
        shell: bash
        run: |
          set -o pipefail
          git fetch --no-tags --prune origin main
          MERGE_BASE=$(git merge-base origin/main HEAD)
          echo "Merge base: $MERGE_BASE"
          echo "sha=$MERGE_BASE" >> "$GITHUB_OUTPUT"

      # Determine which detectors changed in this PR. The PR build scopes its
      # scan to the full set; the main build excludes detectors that don't
      # exist there yet (new detectors). If the set is empty, the workflow
      # short-circuits with a skip comment — scoping is the entire point of
      # Phase 2, falling back to scan-all defeats it.
      - name: Detect changed detectors
        id: detect
        shell: bash
        env:
          BASE_REF: ${{ steps.merge_base.outputs.sha }}
        run: |
          set -o pipefail
          chmod +x scripts/test/detect_changed_detectors.sh
          # A non-zero exit from the detection script is tolerated (whatever
          # it printed, possibly nothing, is still used and the step does not
          # fail), but it is surfaced as a warning annotation so a broken
          # script is distinguishable from "no detectors changed".
          warn_detect_failure() {
            echo "::warning::detect_changed_detectors.sh $1 exited non-zero; continuing with its output"
          }
          PR_CSV=$(./scripts/test/detect_changed_detectors.sh --pr-csv) || warn_detect_failure --pr-csv
          MAIN_CSV=$(./scripts/test/detect_changed_detectors.sh --main-csv) || warn_detect_failure --main-csv
          NEW_LIST=$(./scripts/test/detect_changed_detectors.sh --new-only) || warn_detect_failure --new-only
          # --new-only output is joined line-by-line into a comma-separated list.
          NEW_CSV=$(echo "$NEW_LIST" | paste -sd, -)
          echo "PR detectors: $PR_CSV"
          echo "Main detectors: $MAIN_CSV"
          echo "New detectors: $NEW_CSV"
          echo "pr_csv=$PR_CSV" >> "$GITHUB_OUTPUT"
          echo "main_csv=$MAIN_CSV" >> "$GITHUB_OUTPUT"
          echo "new_csv=$NEW_CSV" >> "$GITHUB_OUTPUT"
          # any_changed gates every remaining step of the job.
          if [[ -n "$PR_CSV" ]]; then
            echo "any_changed=true" >> "$GITHUB_OUTPUT"
          else
            echo "any_changed=false" >> "$GITHUB_OUTPUT"
          fi

      # Sticky comment: find any prior detector-bench comment on the PR by
      # the marker substring and update it in place. The marker — kept in
      # sync with STICKY_COMMENT_MARKER in scripts/test/diff_corpora_results.py —
      # has to appear in BOTH the skip body and the diff body so the same
      # comment flips between them as iterative pushes change which path
      # fires. Skip body is only posted on pull_request events; the original
      # workflow_dispatch early-return is preserved by the event-name guard.
      - name: Find existing skip comment
        if: steps.detect.outputs.any_changed != 'true' && github.event_name == 'pull_request'
        id: find_skip_comment
        uses: peter-evans/find-comment@v3
        with:
          issue-number: ${{ github.event.pull_request.number }}
          comment-author: 'github-actions[bot]'
          body-includes: '<!-- detector-bench -->'

      - name: Post or update skip comment
        if: steps.detect.outputs.any_changed != 'true' && github.event_name == 'pull_request'
        uses: peter-evans/create-or-update-comment@v4
        with:
          comment-id: ${{ steps.find_skip_comment.outputs.comment-id }}
          issue-number: ${{ github.event.pull_request.number }}
          edit-mode: replace
          body: |
            <!-- detector-bench -->
            ## Corpora Test Results
            No detector source files changed in this PR. Bench skipped.

      # Layer 1 keyword corpus — fetch real-world snippets from GitHub Code
      # Search for each changed detector's pre-filter keywords. Output is a
      # zstd-compressed JSONL whose shape matches the S3 corpus, so the
      # corpora script picks it up unchanged via the DATASETS append below.
      # The same corpus file is fed to both PR and main builds; thin-L1
      # detectors and per-detector counts are written to a sidecar JSON the
      # diff step renders.
      - name: Build extract-keywords helper
        if: steps.detect.outputs.any_changed == 'true'
        shell: bash
        run: |
          set -o pipefail
          CGO_ENABLED=0 go build -o /tmp/extract-keywords ./hack/extract-keywords

      - name: Build keyword corpus (Layer 1)
        if: steps.detect.outputs.any_changed == 'true'
        shell: bash
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          DETECTORS: ${{ steps.detect.outputs.pr_csv }}
        run: |
          set -o pipefail
          # KEYWORD_CORPUS_CAP is optional; the per-detector cap defaults to 100.
          python3 scripts/test/build_keyword_corpus.py \
            --detectors="$DETECTORS" \
            --extract-keywords-bin=/tmp/extract-keywords \
            --output-corpus=/tmp/keyword-corpus.jsonl.zstd \
            --output-meta=/tmp/keyword-corpus-meta.json \
            --max-results-per-detector="${KEYWORD_CORPUS_CAP:-100}"
          # Append to DATASETS for downstream scan steps. The python script
          # always writes a (possibly empty) corpus, so the path is safe to
          # append unconditionally — empty zstd frames decompress to 0
          # bytes and pass through the existing scan pipeline cleanly.
          echo "DATASETS<<EOF" >> "$GITHUB_ENV"
          echo "$DATASETS" >> "$GITHUB_ENV"
          echo "/tmp/keyword-corpus.jsonl.zstd" >> "$GITHUB_ENV"
          echo "EOF" >> "$GITHUB_ENV"

      # Check the merge-base commit out into its own worktree so the baseline
      # binary can be built without disturbing the PR checkout.
      - name: Prepare main worktree
        if: steps.detect.outputs.any_changed == 'true'
        shell: bash
        env:
          MERGE_BASE: ${{ steps.merge_base.outputs.sha }}
        run: |
          set -o pipefail
          git worktree add /tmp/trufflehog-main-src "$MERGE_BASE"

      - name: Build trufflehog (PR HEAD)
        if: steps.detect.outputs.any_changed == 'true'
        shell: bash
        run: |
          set -o pipefail
          CGO_ENABLED=0 go build -o /tmp/trufflehog-pr .

      - name: Build trufflehog (main merge-base)
        if: steps.detect.outputs.any_changed == 'true'
        shell: bash
        working-directory: /tmp/trufflehog-main-src
        run: |
          set -o pipefail
          CGO_ENABLED=0 go build -o /tmp/trufflehog-main .

      # The PR scan always runs (any_changed=true means at least one detector
      # is in pr_csv). It also captures the corpus byte total for the diff
      # script's blast-radius column — same content streams to both binaries,
      # so measuring once is enough.
      - name: Run corpora test (PR build)
        if: steps.detect.outputs.any_changed == 'true'
        shell: bash
        env:
          TRUFFLEHOG_BIN: /tmp/trufflehog-pr
          OUTPUT_JSONL: /tmp/results-pr.jsonl
          STDERR_FILE: /tmp/corpora-stderr-pr.txt
          INCLUDE_DETECTORS: ${{ steps.detect.outputs.pr_csv }}
          CORPUS_BYTES_FILE: /tmp/corpus-bytes.txt
        run: |
          set -o pipefail
          # Split DATASETS into one argument per non-empty line.
          files=()
          while IFS= read -r dataset; do
            [[ -z "$dataset" ]] && continue
            files+=("$dataset")
          done <<< "$DATASETS"
          ./scripts/test/detector_corpora_test.sh "${files[@]}"

      # Main scan is skipped when main_csv is empty (PR adds only new
      # detectors — nothing to compare against on main). The diff step is
      # safe with an empty main JSONL: every PR finding is treated as NEW,
      # which is correct semantics for new detectors.
      - name: Run corpora test (main build)
        if: steps.detect.outputs.any_changed == 'true'
        shell: bash
        env:
          TRUFFLEHOG_BIN: /tmp/trufflehog-main
          OUTPUT_JSONL: /tmp/results-main.jsonl
          STDERR_FILE: /tmp/corpora-stderr-main.txt
          INCLUDE_DETECTORS: ${{ steps.detect.outputs.main_csv }}
        run: |
          set -o pipefail
          if [[ -z "$INCLUDE_DETECTORS" ]]; then
            echo "No overlapping detectors in main; skipping main scan."
            # Leave an empty results file so the diff step still has an input.
            : > "$OUTPUT_JSONL"
            exit 0
          fi
          # Split DATASETS into one argument per non-empty line.
          files=()
          while IFS= read -r dataset; do
            [[ -z "$dataset" ]] && continue
            files+=("$dataset")
          done <<< "$DATASETS"
          ./scripts/test/detector_corpora_test.sh "${files[@]}"

      # Render the main-vs-PR comparison to /tmp/diff-report.md (also echoed
      # to the job log). Corpus byte count and keyword-corpus metadata are
      # passed only when the corresponding files exist and are non-empty.
      - name: Diff results
        if: steps.detect.outputs.any_changed == 'true'
        shell: bash
        env:
          CHANGED: ${{ steps.detect.outputs.pr_csv }}
          NEW_DETECTORS: ${{ steps.detect.outputs.new_csv }}
        run: |
          set -o pipefail
          CORPUS_BYTES=0
          if [[ -s /tmp/corpus-bytes.txt ]]; then
            CORPUS_BYTES=$(cat /tmp/corpus-bytes.txt)
          fi
          META_ARG=()
          if [[ -s /tmp/keyword-corpus-meta.json ]]; then
            META_ARG=(--keyword-corpus-meta=/tmp/keyword-corpus-meta.json)
          fi
          python3 scripts/test/diff_corpora_results.py \
            /tmp/results-main.jsonl /tmp/results-pr.jsonl \
            --changed-detectors="$CHANGED" \
            --new-detectors="$NEW_DETECTORS" \
            --corpus-bytes="$CORPUS_BYTES" \
            "${META_ARG[@]}" \
            > /tmp/diff-report.md
          cat /tmp/diff-report.md

      # workflow_dispatch runs don't carry an issue context, so resolve the
      # PR number by branch lookup. pull_request events fall through to the
      # event's issue number. Output feeds the find/update pair below.
      - name: Resolve PR number
        if: steps.detect.outputs.any_changed == 'true'
        id: resolve_pr
        uses: actions/github-script@v7
        with:
          script: |
            let issue_number;
            if (context.eventName === 'workflow_dispatch') {
              // Look up the open PR whose head is the dispatched branch.
              const pulls = await github.rest.pulls.list({
                owner: context.repo.owner,
                repo: context.repo.repo,
                head: `${context.repo.owner}:${context.ref.replace('refs/heads/', '')}`,
                state: 'open',
              });
              if (pulls.data.length === 0) {
                core.setFailed(`No open PR found for branch ${context.ref}`);
                return;
              }
              issue_number = pulls.data[0].number;
            } else {
              issue_number = context.issue.number;
            }
            core.setOutput('issue_number', issue_number);

      - name: Find existing diff comment
        if: steps.detect.outputs.any_changed == 'true'
        id: find_diff_comment
        uses: peter-evans/find-comment@v3
        with:
          issue-number: ${{ steps.resolve_pr.outputs.issue_number }}
          comment-author: 'github-actions[bot]'
          body-includes: '<!-- detector-bench -->'

      # An empty comment-id (no prior comment found) creates a new comment;
      # otherwise the existing sticky comment is replaced with the report.
      - name: Post or update diff comment
        if: steps.detect.outputs.any_changed == 'true'
        uses: peter-evans/create-or-update-comment@v4
        with:
          comment-id: ${{ steps.find_diff_comment.outputs.comment-id }}
          issue-number: ${{ steps.resolve_pr.outputs.issue_number }}
          edit-mode: replace
          body-path: /tmp/diff-report.md