Frontier Judge QA Filter

Download Code

# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "data-designer>=0.5.6",
# ]
# ///
"""Long-Document Understanding Frontier Model QA Judge Recipe

Use a frontier VLM as an LLM-as-a-judge to evaluate the quality
of (question, answer) pairs generated by the upstream visual QA recipes. The
judge scores each example across five rubrics:

  1. **Answer Correctness** – factual accuracy against the visible document
  2. **Question Quality** – reasoning depth, ambiguity, specificity
  3. **Visual Grounding** – reliance on visual elements vs. plain text
  4. **Format Compliance** – answer format matches the question type
  5. **Training Signal Strength** – overall value as VLM training data

A weighted composite score (0–1) is computed from the five rubric scores.

Prerequisites:
    - A seed parquet file containing output from an upstream QA recipe
      (e.g. 05-visual-qa-sdg.py, 06-single-page-qa-sdg.py, or
      08-whole-document-qa-sdg.py) with at least:
        * `png_images_base64` – JSON array of base64-encoded PNG(s) of
          document pages.
        * `question_type`  – classification of the question.
        * `question`       – the generated question.
        * `answer`         – the generated answer.
    - Access to a frontier model endpoint that exposes an OpenAI-compatible
      API. Provide the model ID, endpoint URL, and the name of the
      environment variable holding the API key via the CLI flags
      ``--model-id``, ``--endpoint``, and ``--api-key-env``.

Run:
    # Basic usage (judges 5 records by default)
    uv run 09-frontier-judge-sdg.py --seed-path my_qa_output.parquet \
        --model-id <model-id> --endpoint <endpoint-url> --api-key-env <ENV_VAR>

    # Custom record count
    uv run 09-frontier-judge-sdg.py --seed-path my_qa_output.parquet \
        --model-id <model-id> --endpoint <endpoint-url> --api-key-env <ENV_VAR> \
        --num-records 100

    # For help message and available options
    uv run 09-frontier-judge-sdg.py --help
"""

from pathlib import Path

import data_designer.config as dd
from data_designer.interface import DataDesigner, DatasetCreationResults

PROVIDER_NAME = "frontier"

# =============================================================================
# Score weights for the weighted composite
# =============================================================================

FINAL_SCORE_WEIGHTS = {
    "Answer Correctness": 0.35,
    "Training Signal Strength": 0.30,
    "Question Quality": 0.15,
    "Visual Grounding": 0.10,
    "Format Compliance": 0.10,
}
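# The weights sum to 1.0, so the composite stays on the 0-5 rubric scale
# before normalization. Worked example (illustrative scores only): with
# Answer Correctness = 5, Training Signal Strength = 4, Question Quality = 4,
# Visual Grounding = 3, and Format Compliance = 5:
#   raw = 5*0.35 + 4*0.30 + 4*0.15 + 3*0.10 + 5*0.10 = 4.35
#   weighted_quality_score = round(4.35 / 5.0, 2) = 0.87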

# =============================================================================
# Custom column: weighted composite score
# =============================================================================


@dd.custom_column_generator(required_columns=["qa_quality_judge"])
def compute_weighted_score(row: dict) -> dict:
    """Weighted composite score normalized to 0-1."""
    judge = row["qa_quality_judge"]
    raw = sum(float(judge[k]["score"]) * w for k, w in FINAL_SCORE_WEIGHTS.items())
    row["weighted_quality_score"] = round(raw / 5.0, 2)
    return row
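# Assumed shape of the judge output (one entry per rubric, matching the JSON
# skeleton in PROMPT_JUDGE below):
#   row["qa_quality_judge"] == {
#       "Answer Correctness": {"reasoning": "...", "score": "5"},
#       "Question Quality": {"reasoning": "...", "score": "4"},
#       ...
#   }
# Scores may arrive as strings, which the float() call above handles.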


# =============================================================================
# Judge prompt
# =============================================================================

PROMPT_JUDGE = """\
You are an expert evaluator of visual document question-answering (VQA) training data
for the MMLongBench-Doc benchmark.

Your task is to assess the quality of a (question, answer) pair generated from a PDF
document image. The goal is to determine how strong a training signal this example
would provide for improving VLM performance.

You will be given:
- One or more images of document pages (with tables, charts, diagrams, etc.)
- A question type classification
- A question about the document
- An answer to the question

<question-type>
{{ question_type }}
</question-type>

<question>
{{ question }}
</question>

<answer>
{{ answer }}
</answer>

Evaluate the example across the following rubrics. For each rubric, provide a brief
reasoning and a score. Be objective and critical -- do not inflate scores.

{
  "Answer Correctness": {
    "reasoning": "Your brief analysis here",
    "score": "X"
  },
  "Question Quality": {
    "reasoning": "Your brief analysis here",
    "score": "X"
  },
  "Visual Grounding": {
    "reasoning": "Your brief analysis here",
    "score": "X"
  },
  "Format Compliance": {
    "reasoning": "Your brief analysis here",
    "score": "X"
  },
  "Training Signal Strength": {
    "reasoning": "Your brief analysis here",
    "score": "X"
  }
}

Provide your evaluation in the exact JSON format above with ALL 5 rubrics.
Keep your reasoning for each rubric short and to the point.
"""

# =============================================================================
# Score rubric definitions
# =============================================================================

answer_correctness_score = dd.Score(
    name="Answer Correctness",
    description=(
        "Is the answer factually correct given the visible document content? "
        "Verify by examining the image yourself. For calculations, redo the math. "
        "For counts, recount. For lists, check completeness."
    ),
    options={
        "5": "Exactly correct: answer matches the visible content precisely, calculations are accurate, lists are complete",
        "4": "Substantially correct: answer is right with minor imprecision (e.g., rounding differences within +/-5%, equivalent formats like '25%' vs '0.25')",
        "3": "Partially correct: core answer is right but has notable issues (missing list items, slightly off calculation, incomplete but not wrong)",
        "2": "Mostly incorrect: answer has the right idea but wrong values, wrong entity, or significant calculation errors",
        "1": "Incorrect: answer contradicts the visible content, uses wrong data, or is completely off",
        "0": "Not answerable or refused: answer is a refusal, 'Not answerable', or nonsensical when a real answer exists",
    },
)

question_quality_score = dd.Score(
    name="Question Quality",
    description=(
        "Is the question well-formed, unambiguous, and appropriately challenging? "
        "Does it require genuine reasoning (comparison, calculation, counting) rather than trivial lookup? "
        "Is it specific to the visual content and not generic?"
    ),
    options={
        "5": "Excellent: requires clear reasoning (comparison, calculation, or cross-element synthesis), unambiguous, has exactly one correct answer, well-matched to the visual element type",
        "4": "Good: requires some reasoning, mostly unambiguous, well-grounded in the visual content with minor issues",
        "3": "Adequate: reasonable question but either too easy (direct lookup), slightly ambiguous, or not well-matched to the visual element type",
        "2": "Poor: trivial lookup, ambiguous wording, or asks about content not well-suited to the visual element type",
        "1": "Very poor: unanswerable from the image, contains the answer, or is about irrelevant content",
        "0": "Invalid: nonsensical, empty, or completely unrelated to the document",
    },
)

visual_grounding_score = dd.Score(
    name="Visual Grounding",
    description=(
        "Does the question target the actual visual elements (tables, charts, diagrams) in the image? "
        "Does answering require examining the visual structure, not just reading plain text? "
        "Is the question grounded in specific, identifiable elements?"
    ),
    options={
        "5": "Excellent: question directly targets specific visual elements (chart data, table cells, diagram nodes), answering requires visual perception and spatial understanding",
        "4": "Good: question is grounded in visual content with clear references to identifiable elements, requires examining the visual structure",
        "3": "Adequate: question relates to visual content but could partially be answered from text alone, or uses vague references ('the table' without specificity)",
        "2": "Poor: question mostly targets plain text content, minimal visual grounding, could be answered without seeing the visual elements",
        "1": "Very poor: question has no meaningful connection to the visual elements, purely text-based",
        "0": "No grounding: question is about content not present in the image at all",
    },
)

format_compliance_score = dd.Score(
    name="Format Compliance",
    description=(
        "Does the answer match the expected format for its question type? "
        "Check: multiple choice uses 'A. option' format, yes/no is exactly 'Yes'/'No', "
        "percentages include '%', integers are digits only, lists are JSON arrays, "
        "and the answer contains no reasoning traces or meta-commentary."
    ),
    options={
        "5": "Perfect compliance: answer format exactly matches the question type requirements, no extraneous content",
        "4": "Good compliance: correct format with trivial deviations (e.g., extra whitespace, minor punctuation)",
        "3": "Adequate: answer is usable but has format issues (e.g., missing units, prose instead of JSON array, includes 'Based on the image...')",
        "2": "Poor: significant format violations (e.g., includes reasoning steps, wrong answer structure, contains <think> tags)",
        "1": "Very poor: answer format is fundamentally wrong for the question type",
        "0": "No compliance: answer is empty, garbled, or completely ignores format requirements",
    },
)

training_signal_score = dd.Score(
    name="Training Signal Strength",
    description=(
        "Overall, how valuable is this (question, answer) pair as training data for improving "
        "VLM performance on document understanding? Consider: does it exercise visual perception, "
        "require non-trivial reasoning, demand multi-page evidence gathering, and provide a clear learning signal?"
    ),
    options={
        "5": "Excellent: requires combining evidence from multiple pages, exercises visual perception + reasoning, non-trivial, clear correct answer. Would meaningfully improve a VLM on document QA benchmarks",
        "4": "Strong: good training example with cross-page reasoning or strong single-page visual grounding and reasoning, minor issues don't significantly reduce value",
        "3": "Moderate: decent training signal but answerable from a single page, or doesn't fully exercise multi-page or visual understanding",
        "2": "Weak: limited training value -- trivial question, wrong answer, single-page lookup, or doesn't require visual reasoning",
        "1": "Very weak: almost no training value -- incorrect, ambiguous, or completely text-based with no multi-page dependency",
        "0": "No value: harmful to training -- wrong answer presented as correct, nonsensical, or would teach bad patterns",
    },
)
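# Note: each Score name above must match both a rubric key in PROMPT_JUDGE's
# JSON skeleton and a key in FINAL_SCORE_WEIGHTS, since compute_weighted_score
# indexes the judge output by these names.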


# =============================================================================
# Config builder
# =============================================================================


def build_config(
    seed_path: str = "seed.parquet",
    model_alias: str = "frontier-judge-vlm",
    model_id: str = "",
) -> dd.DataDesignerConfigBuilder:
    """Build the Data Designer config for frontier-model QA judging."""
    config_builder = dd.DataDesignerConfigBuilder(
        model_configs=[
            dd.ModelConfig(
                alias=model_alias,
                model=model_id,
                provider=PROVIDER_NAME,
                inference_parameters=dd.ChatCompletionInferenceParams(
                    timeout=300,
                    max_tokens=40000,
                    max_parallel_requests=32,
                ),
            ),
        ]
    )

    config_builder.with_seed_dataset(
        dd.LocalFileSeedSource(path=seed_path),
        sampling_strategy=dd.SamplingStrategy.ORDERED,
    )
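    # ORDERED sampling preserves the seed file's row order, so the first
    # num_records rows are judged and the output stays aligned with the
    # upstream recipe's output.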

    config_builder.add_column(
        dd.LLMJudgeColumnConfig(
            name="qa_quality_judge",
            model_alias=model_alias,
            prompt=PROMPT_JUDGE,
            scores=[
                answer_correctness_score,
                question_quality_score,
                visual_grounding_score,
                format_compliance_score,
                training_signal_score,
            ],
            multi_modal_context=[
                dd.ImageContext(
                    column_name="png_images_base64",
                    data_type=dd.ModalityDataType.BASE64,
                    image_format=dd.ImageFormat.PNG,
                ),
            ],
        )
    )
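    # The ImageContext attaches every base64-encoded PNG page from
    # `png_images_base64` to the judge request, so the VLM grades against the
    # same document images the QA pair was generated from.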

    config_builder.add_column(
        dd.CustomColumnConfig(
            name="weighted_quality_score",
            generator_function=compute_weighted_score,
        )
    )

    return config_builder


# =============================================================================
# Dataset creation
# =============================================================================


def create_dataset(
    config_builder: dd.DataDesignerConfigBuilder,
    num_records: int,
    endpoint: str = "",
    api_key_env: str = "",
    artifact_path: Path | str | None = None,
) -> DatasetCreationResults:
    """Create the judged dataset."""
    model_providers = [
        dd.ModelProvider(
            name=PROVIDER_NAME,
            endpoint=endpoint,
            provider_type="openai",
            api_key=api_key_env,
        ),
    ]
    data_designer = DataDesigner(
        artifact_path=artifact_path,
        model_providers=model_providers,
    )
    data_designer.set_run_config(
        dd.RunConfig(disable_early_shutdown=True, progress_bar=True),
    )
    results = data_designer.create(config_builder, num_records=num_records, dataset_name="frontier_judge")
    return results


# =============================================================================
# CLI entry point
# =============================================================================

if __name__ == "__main__":
    from argparse import ArgumentParser

    parser = ArgumentParser()
    parser.add_argument("--seed-path", type=str, required=True, help="Path to the seed parquet file")
    parser.add_argument("--model-alias", type=str, default="frontier-judge-vlm")
    parser.add_argument("--model-id", type=str, required=True, help="ID of the model to use for judging")
    parser.add_argument("--endpoint", type=str, required=True, help="OpenAI-compatible API endpoint URL")
    parser.add_argument(
        "--api-key-env", type=str, required=True, help="Environment variable name containing the API key"
    )
    parser.add_argument("--num-records", type=int, default=5)
    parser.add_argument("--artifact-path", type=str, default=None)
    args = parser.parse_args()

    config_builder = build_config(
        seed_path=args.seed_path,
        model_alias=args.model_alias,
        model_id=args.model_id,
    )
    results = create_dataset(
        config_builder,
        num_records=args.num_records,
        endpoint=args.endpoint,
        api_key_env=args.api_key_env,
        artifact_path=args.artifact_path,
    )

    print(f"Dataset saved to: {results.artifact_storage.final_dataset_path}")

    results.load_analysis().to_report()