# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "data-designer>=0.5.6",
# ]
# ///
"""Long-Document Understanding Visual Question-Answering Recipe

Generate question-answer pairs grounded in document page images using a
vision-language model (VLM). For each seed record, the pipeline:

  1. Samples a question type (multiple choice, yes/no, free-form, not answerable)
  2. Generates a question conditioned on the page image and its classification
  3. Generates an answer (with chain-of-thought reasoning captured separately)
  4. Evaluates question relevance against the visible content
  5. Evaluates answer correctness against the visible content

Prerequisites:
    - A seed parquet file containing:
        * `png_images_base64`    – JSON array of base64-encoded PNGs (one
          element per page; single-page seeds have a one-element array).
        * `page_classification` – JSON describing the visual element type and
          reasoning complexity score (produced by 04-page-classification-sdg.py)
    - An OpenAI-compatible vLLM deployment of the VLM (default: Qwen/Qwen3-VL-235B-A22B-Thinking-FP8).
      Recommended vLLM launch flags:
        --tensor-parallel-size 4
        --max-model-len 50000
        --gpu-memory-utilization 0.90
        --reasoning-parser deepseek_r1
        --limit-mm-per-prompt '{"video": 0}'
        --trust-remote-code

      Example launch command for 4× H100 GPUs:
        docker run --gpus all \
            -p 8000:8000 \
            vllm/vllm-openai:latest \
            --model Qwen/Qwen3-VL-235B-A22B-Thinking-FP8 \
            --tensor-parallel-size 4 \
            --max-model-len 50000 \
            --gpu-memory-utilization 0.90 \
            --reasoning-parser deepseek_r1 \
            --limit-mm-per-prompt '{"video": 0}' \
            --trust-remote-code

Run:
    # Basic usage (seed-path should point to the output of 04-page-classification-sdg.py)
    uv run 05-visual-qa-sdg.py --vllm-endpoint http://localhost:8000/v1 --seed-path artifacts/page_classification/parquet-files/*.parquet

    # Custom model and record count
    uv run 05-visual-qa-sdg.py --vllm-endpoint http://localhost:8000/v1 --seed-path artifacts/page_classification/parquet-files/*.parquet --num-records 100

    # For help message and available options
    uv run 05-visual-qa-sdg.py --help
"""

from pathlib import Path

import data_designer.config as dd
from data_designer.interface import DataDesigner, DatasetCreationResults

DEFAULT_VLM_MODEL = "Qwen/Qwen3-VL-235B-A22B-Thinking-FP8"
VLLM_PROVIDER_NAME = "vllm"

# =============================================================================
# Image context helper
# =============================================================================

IMAGE_CONTEXT = [
    dd.ImageContext(
        # Expects a single-element JSON array from the per-page seed.
        column_name="png_images_base64",
        data_type=dd.ModalityDataType.BASE64,
        image_format=dd.ImageFormat.PNG,
    )
]
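

# Illustrative helper (not called by the pipeline): a minimal sketch of how a
# single-page seed row matching the columns above could be built. The file name
# and classification field names here are assumptions for illustration; the
# real seed is produced by 04-page-classification-sdg.py.
def _example_seed_row(png_path: str = "page_0.png") -> dict[str, str]:
    import base64
    import json

    png_b64 = base64.b64encode(Path(png_path).read_bytes()).decode("ascii")
    return {
        # JSON array with one base64 PNG per page; single-page seeds hold one element.
        "png_images_base64": json.dumps([png_b64]),
        # Field names mirror those referenced in the prompts below (assumed shape).
        "page_classification": json.dumps(
            {
                "primary_categories": ["TABULAR"],
                "subcategories": ["SIMPLE_TABLE"],
                "reasoning_complexity_score": 4,
            }
        ),
    }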

# =============================================================================
# Prompt templates
# =============================================================================

PROMPT_QUESTION = """\
You are an expert in creating meaningful questions that test comprehension and reasoning about visual document content.

Your task: Create a targeted question of type <question-type> based on the visual element classification and visible content.

<question-type>
{{question_type}}
</question-type>

<page-classification>
{{page_classification}}
</page-classification>

CRITICAL: Focus on the Visual Element

The <page-classification> identifies the PRIMARY visual element type (e.g., TABULAR, QUANTITATIVE, LOGIC_DIAGRAMS) present in the image.

IMPORTANT: When creating your question, focus EXCLUSIVELY on the area of the image that contains the visual element identified in <page-classification>.
- If primary_categories shows TABULAR with subcategory SIMPLE_TABLE, focus your question on the TABLE content specifically
- If primary_categories shows QUANTITATIVE with subcategory BAR_CHART, focus your question on the CHART data specifically
- Ignore any surrounding text, headers, footers, or decorative elements that are not part of the primary visual element
- Your question should be about the DATA/CONTENT within the visual element, not about peripheral information

═══════════════════════════════════════════════════════════════════════════════
IMPORTANT: PREFER SIMPLE REASONING QUESTIONS
═══════════════════════════════════════════════════════════════════════════════

Create questions that require ONE STEP of reasoning or calculation - not just reading a value, but not overly complex either.

PREFERRED question types (aim for these):

1. SIMPLE COMPARISONS (no calculation needed):
   ✓ "Which category has the highest/lowest value?"
   ✓ "Is X greater than Y?"
   ✓ "Which period showed the largest increase?"
   → Requires comparing 2-4 values, no math

2. SIMPLE CALCULATIONS (one operation):
   ✓ "What is the difference between X and Y?" (subtraction)
   ✓ "What is the total of categories A and B?" (addition of 2-3 items)
   ✓ "What percentage of the total does X represent?" (one division)
   → Requires one simple calculation with clearly visible values

3. SIMPLE TRENDS/PATTERNS (observation):
   ✓ "Did revenue increase or decrease from Q1 to Q4?"
   ✓ "Which category grew the most?"
   ✓ "List all items above/below value X"
   → Requires identifying patterns without complex math

4. SIMPLE RATIOS (when values are obvious):
   ✓ "How many times larger is A than B?" (when A=200, B=100 → answer: 2)
   → Only when the ratio is simple (2x, 3x, etc.)

AVOID these question types:
✗ Direct single value lookup: "What was the revenue in Q3?"
✗ Multi-step calculations: "What is the average growth rate across all periods?"
✗ Complex aggregations: "What percentage do the top 5 categories represent combined?"
✗ Statistical measures: "What is the variance?" or "What is the correlation?"
✗ Ambiguous questions: "Which shows the most consistent pattern?"

KEY PRINCIPLE: Questions should require examining 2-3 data points and ONE simple operation (compare, add, subtract, or divide).

STEP 1: Analyze the Visual Element Type

The <page-classification> tells you what TYPE of visual content is present. Use this to create appropriate reasoning-based questions.

PRIMARY CATEGORIES and REASONING Question Strategies:

QUANTITATIVE (Charts/Graphs):
  - Subcategories: BAR_CHART, LINE_GRAPH, SCATTER_PLOT, PIE_CHART, AREA_GRAPH, HISTOGRAM, BOX_PLOT, HEATMAP, BUBBLE_CHART
  - Best question types: numerical (simple calculations), comparisons, multiple choice
  - SIMPLE REASONING Examples:
    ✓ "Which category has the highest value?" (comparison)
    ✓ "What is the difference between the highest and lowest values?" (one subtraction)
    ✓ "Did sales increase or decrease from Q1 to Q4?" (simple trend)
    ✓ "What is the total of the two largest categories?" (simple addition)
    ✓ "Is Category A greater than Category B?" (simple comparison)
    ✓ "How many categories have values above 100?" (counting with condition)
  - AVOID: Direct lookups ("What was Q3 revenue?"), complex calculations ("What's the average growth rate?")

TABULAR (Tables):
  - Subcategories: SIMPLE_TABLE, NESTED_TABLE, PIVOT_TABLE, COMPARISON_TABLE, FINANCIAL_TABLE
  - Best question types: numerical (simple calculations), comparisons, filtered lists
  - SIMPLE REASONING Examples:
    ✓ "Which fund has the highest budget?" (comparison)
    ✓ "What is the total budget of Funds A and B?" (simple addition)
    ✓ "How many funds have a budget over $1000?" (counting with condition)
    ✓ "Is Fund A's budget greater than Fund B's?" (simple comparison)
    ✓ "List all funds with 'Education' as their purpose" (filtering)
    ✓ "What is the difference between the largest and smallest fund?" (one subtraction)
  - AVOID: Direct cell lookups ("What is Fund 01's source?"), complex calculations ("What's the average of all funds meeting multiple conditions?")

LOGIC_DIAGRAMS (Flowcharts/Process):
  - Subcategories: FLOWCHART, DECISION_TREE, PROCESS_MAP, ALGORITHM_DIAGRAM, STATE_DIAGRAM, SEQUENCE_DIAGRAM
  - Best question types: text (simple path tracing), yes/no, list
  - SIMPLE REASONING Examples:
    ✓ "If condition A is true, what is the next step?" (simple path following)
    ✓ "How many decision points are shown in the flowchart?" (counting)
    ✓ "Does the process include step X?" (yes/no)
    ✓ "What happens immediately after step X?" (one-step trace)
    ✓ "List all possible outcomes shown" (enumeration)
    ✓ "Which step comes before the final outcome?" (simple reverse trace)
  - AVOID: Complex path analysis ("What sequence of 5 conditions leads to Z?"), multi-hop reasoning

HIERARCHICAL (Org Charts/Trees):
  - Subcategories: ORG_CHART, MIND_MAP, TREE_STRUCTURE, TAXONOMY, DENDROGRAM
  - Best question types: text (simple relationships), counting, lists
  - SIMPLE REASONING Examples:
    ✓ "How many people directly report to Manager X?" (counting direct connections)
    ✓ "Who is Manager X's immediate supervisor?" (one-level relationship)
    ✓ "Which manager has the most direct reports?" (comparison)
    ✓ "List all people who report directly to the CEO" (enumeration)
    ✓ "How many levels are in the organizational hierarchy?" (counting layers)
    ✓ "Is Person A senior to Person B?" (relationship check)
  - AVOID: Complex multi-level traversal ("How many total reports including indirect?"), percentage calculations

SPATIAL_RELATIONAL (Maps/Diagrams):
  - Subcategories: FLOOR_PLAN, BLUEPRINT, CHOROPLETH_MAP, POINT_MAP, TOPOGRAPHIC_MAP, NETWORK_DIAGRAM
  - Best question types: text (simple spatial), yes/no, counting
  - SIMPLE REASONING Examples:
    ✓ "Which room is adjacent to Room X?" (one-step spatial)
    ✓ "How many rooms are on the first floor?" (counting)
    ✓ "Is Room A directly connected to Room B?" (yes/no spatial)
    ✓ "Which area is the largest?" (comparison)
    ✓ "List all rooms that connect to the hallway" (enumeration)
    ✓ "What is located north of Building X?" (directional)
  - AVOID: Complex path finding ("shortest path through 5 rooms"), density calculations, percentage of area

SCHEMATIC (Technical Diagrams):
  - Subcategories: CIRCUIT_DIAGRAM, MECHANICAL_DIAGRAM, ANATOMICAL_DIAGRAM, WIRING_DIAGRAM, PLUMBING_DIAGRAM
  - Best question types: text (simple connections), counting, lists
  - SIMPLE REASONING Examples:
    ✓ "What component is directly connected to component X?" (one-step connection)
    ✓ "How many components are of type X?" (counting)
    ✓ "Is component A connected to component B?" (yes/no)
    ✓ "List all components connected to the input" (enumeration)
    ✓ "Which component has the most connections?" (comparison)
    ✓ "What is the next component after X in the flow?" (one-step trace)
  - AVOID: Path analysis ("all components in signal path"), failure analysis, impedance calculations

INFOGRAPHIC (Visual Narratives):
  - Subcategories: TIMELINE, STATISTICAL_INFOGRAPHIC, PROCESS_INFOGRAPHIC, COMPARISON_INFOGRAPHIC
  - Best question types: text (simple analysis), comparisons, counting
  - SIMPLE REASONING Examples:
    ✓ "Which year had the most events?" (comparison)
    ✓ "How many events occurred between Year X and Year Y?" (counting)
    ✓ "Which category shows the largest value in the comparison?" (simple comparison)
    ✓ "Did the trend increase or decrease over time?" (direction)
    ✓ "List all events that occurred after Year X" (filtering)
    ✓ "Is Category A greater than Category B?" (simple comparison)
  - AVOID: Complex calculations ("average time intervals"), growth rates, statistical measures

STEP 2: Match Question Type to Content

NUMERICAL question types (int, float, percentage %):
  ✓ Use for: QUANTITATIVE charts, TABULAR data with numbers, INFOGRAPHIC with statistics
  ✓ ALWAYS require calculation, comparison, or aggregation - NEVER direct lookup
  ✗ NEVER use for: LOGIC_DIAGRAMS, HIERARCHICAL, SCHEMATIC (unless they contain numerical labels)

TEXT question types (short answer, list of items, yes/no):
  ✓ Use for: TABULAR (if text content), LOGIC_DIAGRAMS, HIERARCHICAL, SPATIAL_RELATIONAL, SCHEMATIC, INFOGRAPHIC
  ✓ Should require reasoning, filtering, or multi-step analysis

MULTIPLE CHOICE:
  ✓ Good for any category - create options based on calculated or derived values, not direct readings
  ✓ Options should require the user to perform reasoning to eliminate incorrect choices

NOT ANSWERABLE:
  ✓ Create questions relevant to the visual element type but whose answer isn't present
  ✓ Example: For a 2023 revenue table, ask "What percentage increase occurred from 2024 Q1 to Q2?"

STEP 3: Match Complexity to Score

The reasoning_complexity_score (1-10) in <page-classification> indicates the appropriate depth.
IMPORTANT: Keep questions simple and achievable. Most questions should be in the 3-6 range.

- Score 1-3 (Low): Basic comparisons or simple observations
  * Examples: "Which category has the highest value?", "Is A greater than B?"
  * Requires comparing 2-3 values, no calculation needed

- Score 4-6 (Medium): ONE simple calculation or counting with a condition
  * This is the TARGET for most questions - requires one step of reasoning
  * Examples:
    - "What is the total of A and B?" (simple addition)
    - "What is the difference between highest and lowest?" (simple subtraction)
    - "How many items are above 100?" (counting with condition)
  * Questions should require examining 2-4 data points and ONE simple operation

- Score 7-8 (High): Use sparingly - slightly more complex but still straightforward
  * Examples: "What percentage does X represent?" (requires division)
  * Only use when the calculation is still simple and unambiguous

- Score 9-10 (Expert): AVOID - Too complex for reliable VLM answering
  * Do not create questions requiring: multi-step calculations, averages of many items, growth rates, statistical measures
  * These lead to calculation errors and incorrect answers

GENERAL RULE: If you need to do more than ONE calculation step in your head to answer it, the question is too complex.

═══════════════════════════════════════════════════════════════════════════════
CRITICAL: CREATE VERIFIABLE QUESTIONS
═══════════════════════════════════════════════════════════════════════════════

Before finalizing your question, ask yourself:
1. "Can I answer this question clearly by looking at the visual?"
2. "Can I verify if an answer is correct or incorrect?"
3. "Is there a clear, unambiguous correct answer?"

If you cannot easily answer and verify the question yourself, DO NOT use it.

Examples:
✓ GOOD: "Which category has the highest value?"
  → You can look and determine: "Category A = 150, Category B = 120, so Category A is correct"

✓ GOOD: "What is the difference between Product A and Product B?"
  → You can calculate: "Product A = 200, Product B = 150, difference = 50"

✗ BAD: "Which category shows the most consistent growth?"
  → Ambiguous - what does "most consistent" mean? Hard to verify.

✗ BAD: "What is the average of all values shown?"
  → If there are 10+ values, too tedious to verify correctly

✗ BAD: "What percentage of total do the top 5 categories represent?"
  → Requires identifying top 5, summing them, calculating percentage - too many steps to verify reliably

FORBIDDEN - DO NOT CREATE:
✗ Questions answerable by reading a single trivial value (unless complexity score is 1-3)
✗ Character/letter counting
✗ Word counting (unless semantically meaningful)
✗ Font style/size questions
✗ Trivial string manipulation
✗ Color or formatting questions
✗ Generic questions that ignore the visual element type

Question Framing Rules:
1. Create questions SPECIFIC to the visual element type identified in <page-classification>
2. Focus ONLY on the primary visual element (table, chart, diagram, etc.), not surrounding content
3. Do NOT use: "the page", "the image", "the document", "according to"
4. Ask about content directly using action verbs like: "Which", "What is", "How many", "Is"
5. Prefer simple reasoning questions (one comparison or one calculation) over direct lookups
6. Match question difficulty to the reasoning_complexity_score (target: 3-6)
7. CRITICAL: You must be able to answer the question yourself and verify if an answer is correct
8. Ensure questions have clear, unambiguous correct answers
9. Keep questions achievable - avoid ambiguous terms like "most consistent" or "optimal"

═══════════════════════════════════════════════════════════════════════════════
OUTPUT FORMAT INSTRUCTIONS
═══════════════════════════════════════════════════════════════════════════════

Your response MUST contain ONLY the question text.

If the question type is "multiple choices":
- Output the question on the first line
- Output each choice on a separate line, starting with a letter (A., B., C., D.)
- Example:
  Which category has the highest value?
  A. Category A
  B. Category B
  C. Category C
  D. Category D

For all other question types:
- Output ONLY the question text, nothing else
- Example: What is the difference between Product A and Product B?

DO NOT include any explanations, reasoning, or additional text.\
"""


PROMPT_ANSWER = """\
You are an expert at providing accurate, comprehensive answers based on given information.

Your task is to answer the <question> using ONLY the information visible in the image.

<question-type>
{{question_type}}
</question-type>

<page-classification>
{{page_classification}}
</page-classification>

<question>
{{question}}
</question>

Answer Guidelines:
- Base your answer ENTIRELY on the visible content
- Do not make assumptions or add information not present in the visible content
- Use the <page-classification> to understand the content type and provide appropriate answers:
  * For QUANTITATIVE/TABULAR with numbers: Perform calculations accurately using visible data
  * For LOGIC_DIAGRAMS: Describe process steps or decision flows
  * For HIERARCHICAL: Explain relationships or structures
  * For SPATIAL_RELATIONAL: Describe locations or spatial relationships
  * For SCHEMATIC: Explain component connections or technical details
  * For INFOGRAPHIC: Extract key facts or statistics

CRITICAL - For Calculation-Based Questions:
When the question asks you to calculate percentages, ratios, differences, averages, or any derived values:
1. Extract the relevant data points from the visible content
2. Perform the calculation accurately
3. Provide the final answer with appropriate units (%, ratio, currency, etc.)
4. Round percentages to 1-2 decimal places (e.g., "25.5%" or "33.33%")
5. Round decimal numbers to 2-3 significant figures unless the question specifies otherwise

Examples of calculation questions:
- "What percentage of X?" → Calculate: (part/whole) × 100, answer as "XX.X%"
- "What is the ratio of A to B?" → Calculate: A/B, answer as "X:Y" or "X.XX"
- "What is the difference between X and Y?" → Calculate: |X - Y|, answer with units
- "What is the average of X, Y, Z?" → Calculate: (X+Y+Z)/3, answer with units

Special Cases:
- If the question type is "not answerable", respond with "Not answerable"
- For multiple choice questions: Select the correct option based on the visible content (perform calculations if needed)
- For yes/no questions: Respond with "Yes" or "No"
- For list questions: Format your answer as a clear list

Answer Format:
- Provide a direct answer without meta-commentary like "Based on the image..." or "According to the information provided..."
- Answer as if you are directly viewing the content
- Be precise and factual - do not speculate or infer beyond what is explicitly visible
- For numerical answers, include appropriate units and precision\
"""


PROMPT_QUESTION_RELEVANCE = """\
You are an expert at evaluating question quality and relevance.

Your task is to determine if the <question> is relevant to the content visible in the image.

<question-type>
{{question_type}}
</question-type>

<page-classification>
{{page_classification}}
</page-classification>

<question>
{{question}}
</question>

═══════════════════════════════════════════════════════════════════════════════
CRITICAL: VERIFY STEP-BY-STEP BEFORE JUDGING
═══════════════════════════════════════════════════════════════════════════════

Do NOT make a snap judgment. Follow this systematic verification process:

STEP 1: Look at the image and identify what visual content is present
- What type of visual element do you see? (table, chart, diagram, etc.)
- What specific data or information is shown?
- What are the main topics, categories, or entities visible?

STEP 2: Analyze what the question is asking about
- What topic or data does the question reference?
- What type of information would be needed to answer it?
- Does the question align with the visual element type from <page-classification>?

STEP 3: Check if the question relates to visible content
- Are the entities/categories mentioned in the question actually present in the image?
- Is the type of data needed to answer visible in the image?
- Does the question make sense for this type of visual?

STEP 4: Make your decision
- Mark "Relevant" if: The question asks about data/content that IS present in the visible image
- Mark "Relevant" if: The question is "not answerable" type AND is about the right domain/topic but specific data is missing
- Mark "Irrelevant" ONLY if: The question asks about content clearly NOT in the image OR is inappropriate for the visual type

═══════════════════════════════════════════════════════════════════════════════
EVALUATION PHILOSOPHY: FOCUS ON CONTENT ALIGNMENT
═══════════════════════════════════════════════════════════════════════════════

After verifying step-by-step, apply these standards:

MARK as "Relevant" when:
- The question asks about data, entities, or topics that ARE visible in the image
- The question type matches the visual element type (charts for quantitative, tables for tabular, etc.)
- For "not answerable" questions: the domain/topic matches but specific data is missing

MARK as "Irrelevant" when:
- The question asks about entities or data clearly NOT present in the image
- The question type is inappropriate for the visual element (e.g., asking about flowchart steps when showing a bar chart)
- The question topic has no connection to the visible content

KEY PRINCIPLE: Verify that the question's subject matter aligns with what's actually visible in the image.

═══════════════════════════════════════════════════════════════════════════════

Detailed Evaluation Guidelines:

1. RELEVANT questions are those that:
   ✓ Ask about data, entities, or relationships visible in the image
   ✓ Are appropriate for the visual element type (e.g., asking about values in a chart, rows in a table)
   ✓ Can potentially be answered from or reasoned about using the visible content
   ✓ For "not answerable" type: relate to the domain/topic but specific answer is not present

2. IRRELEVANT questions are those that:
   ✗ Ask about entities, data, or topics completely absent from the image
   ✗ Are inappropriate for the visual type (e.g., asking about flowchart steps when image shows a bar chart)
   ✗ Reference information that has nothing to do with the visible content

Examples by Visual Type:

For TABULAR content:
- "What is Fund A's budget?" → Relevant (if Fund A is in the table)
- "Which fund has the highest value?" → Relevant (if funds and values are shown)
- "What is the CEO's salary?" → Irrelevant (if no CEO or salary data visible)

For QUANTITATIVE (Charts):
- "Which category has the highest value?" → Relevant (if categories are shown)
- "What is the total of A and B?" → Relevant (if A and B are in the chart)
- "What was the value in 2025?" → Irrelevant (if only 2020-2023 data shown)

For LOGIC_DIAGRAMS (Flowcharts):
- "What happens after step X?" → Relevant (if step X is in the flowchart)
- "How many decision points are there?" → Relevant (if diagram shows decision points)
- "What is the database schema?" → Irrelevant (if image shows a process flow, not database)

Special Case - "not answerable" questions:
- These should be relevant to the DOMAIN but the specific answer should not be present
- Example: Image shows 2023 revenue table, Question: "What was 2024 Q1 revenue?" → Relevant domain, but answer not present

Your response should be:
- "Relevant" - if the question relates to content visible in the image (DEFAULT choice)
- "Irrelevant" - ONLY if the question is clearly about something not in the image

═══════════════════════════════════════════════════════════════════════════════
OUTPUT FORMAT INSTRUCTIONS
═══════════════════════════════════════════════════════════════════════════════

Your response MUST contain ONLY ONE WORD:
- "Relevant" OR
- "Irrelevant"

DO NOT include any explanations, reasoning, or additional text.
Output ONLY the single word.\
"""


PROMPT_ANSWER_CORRECTNESS = """\
You are an expert at evaluating answer accuracy and correctness.

Your task is to determine if the <answer> reasonably addresses the <question> based on the visible content.

<question-type>
{{question_type}}
</question-type>

<page-classification>
{{page_classification}}
</page-classification>

<question>
{{question}}
</question>

<answer>
{{answer}}
</answer>

═══════════════════════════════════════════════════════════════════════════════
CRITICAL: VERIFY STEP-BY-STEP BEFORE JUDGING
═══════════════════════════════════════════════════════════════════════════════

Do NOT make a snap judgment. Follow this systematic verification process:

STEP 1: Understand what the question is asking
- What type of answer is expected? (comparison, calculation, value, list, yes/no)
- What specific information needs to be extracted or computed?

STEP 2: Look at the visible content and verify the answer yourself
- Identify the relevant data points in the image
- If the question requires calculation, do the calculation yourself
- If the question requires comparison, compare the values yourself
- If the question requires counting or listing, count/list them yourself

STEP 3: Compare YOUR answer to the PROVIDED answer
- Does the provided answer match what you found?
- Is it in the right ballpark? (within ±5% for numbers)
- Is it semantically equivalent even if worded differently?
- Does it make sense given the data?

STEP 4: Make your decision
- Mark "Correct" if: Your answer and provided answer align (exact or close enough)
- Mark "Incorrect" ONLY if: Provided answer is clearly wrong based on your verification

═══════════════════════════════════════════════════════════════════════════════
EVALUATION PHILOSOPHY: FOCUS ON SUBSTANTIVE CORRECTNESS
═══════════════════════════════════════════════════════════════════════════════

After verifying step-by-step, apply these standards:

ACCEPT as "Correct" when:
- The answer is factually accurate based on the visible content
- Numbers are close enough (within ±5% for calculations due to rounding)
- Wording differs but the meaning/value is the same
- Format differs ("25%" vs "0.25" vs "1/4") but represents the same value

MARK as "Incorrect" when:
- The answer contradicts the visible content
- Numbers are significantly wrong (beyond ±5% tolerance)
- The answer uses wrong data from the image
- The answer doesn't address what was asked

KEY PRINCIPLE: Distinguish between minor variations (format, rounding) and actual errors (wrong data, wrong calculation).

═══════════════════════════════════════════════════════════════════════════════
OUTPUT FORMAT INSTRUCTIONS
═══════════════════════════════════════════════════════════════════════════════

Your response MUST contain ONLY ONE WORD:
- "Correct" OR
- "Incorrect"

DO NOT include any explanations, reasoning, or additional text.
Output ONLY the single word.

═══════════════════════════════════════════════════════════════════════════════

Detailed Evaluation Guidelines:

1. FOR "not answerable" QUESTIONS:
   - Mark "Correct" if the answer indicates it's not answerable (e.g., "Not answerable", "Cannot be determined", "Information not provided", etc.)
   - Only mark "Incorrect" if the answer provides a specific answer when it should say "not answerable", OR if it says "not answerable" but the information is clearly present

2. FOR CALCULATION/REASONING QUESTIONS (percentages, ratios, trends, comparisons):

   MARK AS "Correct" IF ANY OF THESE ARE TRUE:
   ✓ The answer is in the right ballpark (within ±5% for calculations)
   ✓ The answer uses a reasonable interpretation of the data
   ✓ The answer shows correct reasoning even if numbers differ slightly
   ✓ The answer addresses the question asked, even if format varies
   ✓ Minor calculation differences due to rounding (e.g., 33% vs 33.33%)
   ✓ Equivalent representations (e.g., "1/4" vs "25%" vs "0.25")
   ✓ Different but reasonable ways to express the same concept

   Examples of answers to mark "Correct":
   - Question: "What percentage does X represent?"
     Answer: "25%" when exact is 24.8% → CORRECT (close enough)
   - Question: "What's the ratio of A to B?"
     Answer: "about 2 to 1" when exact is 1.9:1 → CORRECT (reasonable)
   - Question: "By how much did X increase?"
     Answer: "doubled" when exact increase is 95% → CORRECT (reasonable interpretation)

   ONLY MARK AS "Incorrect" IF:
   ✗ The answer is wildly off (e.g., says 80% when it's actually 20%)
   ✗ The answer clearly misidentifies what data to use
   ✗ The answer provides a specific value when asked for a calculation but didn't calculate
   ✗ The answer is completely unrelated to what was asked

3. FOR EXTRACTION QUESTIONS (specific values, items from lists):

   MARK AS "Correct" IF:
   ✓ The answer matches the visible content
   ✓ Minor wording differences that don't change meaning
   ✓ The answer captures the key information even if not word-for-word

   ONLY MARK AS "Incorrect" IF:
   ✗ The answer states information not present in the visible content
   ✗ The answer contradicts what's visible

4. FOR COMPARISON/ANALYSIS QUESTIONS:

   MARK AS "Correct" IF:
   ✓ The answer shows reasonable analysis of the visible content
   ✓ The conclusion is defensible based on the data
   ✓ The reasoning makes sense even if you might analyze it differently

   ONLY MARK AS "Incorrect" IF:
   ✗ The conclusion clearly contradicts the visible data
   ✗ The reasoning is fundamentally flawed

5. FOR MULTIPLE CHOICE QUESTIONS:

   MARK AS "Correct" IF:
   ✓ The selected option is correct or defensible

   MARK AS "Incorrect" IF:
   ✗ The selected option is clearly wrong

6. FOR YES/NO QUESTIONS:

   MARK AS "Correct" IF:
   ✓ The yes/no answer is reasonable based on visible content

   MARK AS "Incorrect" IF:
   ✗ The yes/no answer clearly contradicts visible content\
"""


# =============================================================================
# Pipeline configuration
# =============================================================================


def build_config(
    seed_path: str = "seed.parquet",
    model_alias: str = "qwen-vl",
    model_id: str = DEFAULT_VLM_MODEL,
) -> dd.DataDesignerConfigBuilder:
    model_configs = [
        dd.ModelConfig(
            alias=model_alias,
            model=model_id,
            provider=VLLM_PROVIDER_NAME,
            inference_parameters=dd.ChatCompletionInferenceParams(
                timeout=1200,
                max_tokens=40000,
                max_parallel_requests=32,
            ),
        ),
    ]

    config_builder = dd.DataDesignerConfigBuilder(model_configs=model_configs)

    config_builder.with_seed_dataset(
        dd.LocalFileSeedSource(path=seed_path),
        sampling_strategy=dd.SamplingStrategy.ORDERED,
    )

    config_builder.add_column(
        dd.SamplerColumnConfig(
            name="question_type",
            sampler_type=dd.SamplerType.CATEGORY,
            params=dd.CategorySamplerParams(
                values=[
                    "multiple choices",
                    "yes or no",
                    "number, word, phrase, short sentence (string), list of items (int, string, float or mixed)",
                    "not answerable",
                ],
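                # Relative weights: skews sampling heavily toward free-form questions (~93%).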
                weights=[0.05, 0.1, 2, 0.01],
            ),
        )
    )

    config_builder.add_column(
        dd.LLMTextColumnConfig(
            name="question",
            model_alias=model_alias,
            prompt=PROMPT_QUESTION,
            multi_modal_context=IMAGE_CONTEXT,
        )
    )

    config_builder.add_column(
        dd.LLMTextColumnConfig(
            name="answer",
            model_alias=model_alias,
            prompt=PROMPT_ANSWER,
            multi_modal_context=IMAGE_CONTEXT,
            extract_reasoning_content=True,
        )
    )

    config_builder.add_column(
        dd.LLMTextColumnConfig(
            name="question_relevance",
            model_alias=model_alias,
            prompt=PROMPT_QUESTION_RELEVANCE,
            multi_modal_context=IMAGE_CONTEXT,
        )
    )

    config_builder.add_column(
        dd.LLMTextColumnConfig(
            name="answer_correctness",
            model_alias=model_alias,
            prompt=PROMPT_ANSWER_CORRECTNESS,
            multi_modal_context=IMAGE_CONTEXT,
        )
    )

    return config_builder


def create_dataset(
    config_builder: dd.DataDesignerConfigBuilder,
    num_records: int,
    vllm_endpoint: str,
    artifact_path: Path | str | None = None,
) -> DatasetCreationResults:
    model_providers = [
        dd.ModelProvider(
            name=VLLM_PROVIDER_NAME,
            endpoint=vllm_endpoint,
        ),
    ]
    data_designer = DataDesigner(
        artifact_path=artifact_path,
        model_providers=model_providers,
    )
    data_designer.set_run_config(dd.RunConfig(progress_bar=True, disable_early_shutdown=True))
    results = data_designer.create(config_builder, num_records=num_records, dataset_name="visual_qa")
    return results
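

# Illustrative post-processing sketch (not invoked here): rows can be filtered
# on the two judge columns once the dataset is written. Assumes pandas is
# available and that the judges emit exactly "Relevant"/"Irrelevant" and
# "Correct"/"Incorrect" as their prompts instruct.
def _keep_validated_rows(parquet_path: str):
    import pandas as pd  # assumed dependency for this sketch

    df = pd.read_parquet(parquet_path)
    relevant = df["question_relevance"].str.strip().str.casefold().eq("relevant")
    correct = df["answer_correctness"].str.strip().str.casefold().eq("correct")
    return df[relevant & correct]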


if __name__ == "__main__":
    from argparse import ArgumentParser

    parser = ArgumentParser()
    parser.add_argument(
        "--vllm-endpoint",
        type=str,
        required=True,
        help="Base URL of the vLLM server hosting the VLM (e.g. http://localhost:8000/v1)",
    )
    parser.add_argument("--seed-path", type=str, required=True, help="Path to the seed parquet file")
    parser.add_argument("--model-alias", type=str, default="qwen-vl")
    parser.add_argument("--model-id", type=str, default=DEFAULT_VLM_MODEL)
    parser.add_argument("--num-records", type=int, default=5)
    parser.add_argument("--artifact-path", type=str, default=None)
    args = parser.parse_args()

    config_builder = build_config(
        seed_path=args.seed_path,
        model_alias=args.model_alias,
        model_id=args.model_id,
    )
    results = create_dataset(
        config_builder,
        num_records=args.num_records,
        vllm_endpoint=args.vllm_endpoint,
        artifact_path=args.artifact_path,
    )

    print(f"Dataset saved to: {results.artifact_storage.final_dataset_path}")

    results.load_analysis().to_report()
