# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "data-designer>=0.5.6",
# ]
# ///
"""Long-Document Understanding Whole-Document QA Recipe

Generate multi-page question-answer pairs that require combining information
from across an entire PDF document. Targets the hardest long-document
understanding skills: counting across pages, list extraction, repeated-layout
aggregation, cross-page computation, and lookup chains. MMLongBench-Doc is
used to track progress toward these capabilities.

For each seed record, the pipeline:

  1. Samples a question type (multiple choice, yes/no, string, layout,
     numerical int/float/percentage, list, not answerable)
  2. Generates a question that requires examining 2+ pages (preferably 4-8)
  3. Generates an answer with exhaustive page-by-page reasoning (captured
     separately via extract_reasoning_content)
  4. Evaluates overall quality including multi-page requirement, answer
     correctness, reasoning thoroughness, and format compliance (0/1/2 score)

Prerequisites:
    - A seed parquet file containing:
        * `png_images_base64` – JSON array of base64-encoded PNGs covering
          all pages of each document (one entry per page); a construction
          sketch is given under "Seed construction:" below.
    - A vLLM-compatible deployment of the VLM
      (default: Qwen/Qwen3-VL-235B-A22B-Thinking-FP8).
      Recommended vLLM launch flags:
        --tensor-parallel-size 4
        --max-model-len 50000
        --gpu-memory-utilization 0.90
        --reasoning-parser deepseek_r1
        --limit-mm-per-prompt '{"video": 0}'
        --trust-remote-code

      Example launch script for 4× H100:
        docker run --gpus all \
            -p 8000:8000 \
            vllm/vllm-openai:latest \
            --model Qwen/Qwen3-VL-235B-A22B-Thinking-FP8 \
            --tensor-parallel-size 4 \
            --max-model-len 50000 \
            --gpu-memory-utilization 0.90 \
            --reasoning-parser deepseek_r1 \
            --limit-mm-per-prompt '{"video": 0}' \
            --trust-remote-code
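
      Quick sanity check once the server is up (vLLM serves the standard
      OpenAI-compatible routes):
        curl http://localhost:8000/v1/models

Seed construction:
    One possible way to build the seed parquet from a folder of PDFs. This is a
    sketch rather than part of the recipe: it assumes PyMuPDF (fitz), pandas, and
    pyarrow are installed; adjust file names, paths, and render DPI to your corpus.

        import base64
        import json

        import fitz  # PyMuPDF
        import pandas as pd

        def pdf_to_record(pdf_path: str) -> dict:
            # Render every page to a base64-encoded PNG (one entry per page).
            doc = fitz.open(pdf_path)
            pages = [
                base64.b64encode(page.get_pixmap(dpi=150).tobytes("png")).decode("ascii")
                for page in doc
            ]
            return {"png_images_base64": json.dumps(pages)}

        records = [pdf_to_record(p) for p in ["report.pdf", "slides.pdf"]]
        pd.DataFrame(records).to_parquet("seed_data/seed_whole_document.parquet")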

Run:
    # Basic usage (generates 5 records by default)
    uv run 08-whole-document-qa-sdg.py --vllm-endpoint http://localhost:8000/v1 --seed-path seed_data/seed_whole_document.parquet

    # Custom model and record count
    uv run 08-whole-document-qa-sdg.py --vllm-endpoint http://localhost:8000/v1 --seed-path seed_data/seed_whole_document.parquet --model-id <served-model-name> --num-records 100

    # For help message and available options
    uv run 08-whole-document-qa-sdg.py --help
"""

from pathlib import Path

import data_designer.config as dd
from data_designer.interface import DataDesigner, DatasetCreationResults

DEFAULT_VLM_MODEL = "Qwen/Qwen3-VL-235B-A22B-Thinking-FP8"
VLLM_PROVIDER_NAME = "vllm"

# =============================================================================
# Image context helper
# =============================================================================

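# Supplied to every LLM column below so each prompt receives the full set of
# page images from the seed column `png_images_base64` as multimodal context.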
IMAGE_CONTEXT = [
    dd.ImageContext(
        column_name="png_images_base64",
        data_type=dd.ModalityDataType.BASE64,
        image_format=dd.ImageFormat.PNG,
    ),
]

# =============================================================================
# Prompt templates
# =============================================================================

PROMPT_QUESTION = """\
<question-type>
{{ question_type }}
</question-type>

You are given ALL pages of a PDF document. Create a question of the given <question-type> that requires examining these pages.

Think step by step:
1. Scan ALL pages. Note every table, chart, figure, diagram, icon, and text section.
2. Identify connections ACROSS pages: a value on one page that relates to a value on another, a term defined on one page used in a chart on another, items to count across multiple pages.
3. Formulate a question that REQUIRES combining information from at least 2 pages (preferably 4-8). Verify: could someone answer this from any single page alone? If yes, revise.

CORE RULES:
- The question must require the ENTIRE document, including later pages. Do not create questions answerable from just the first half.
- Reject any question that would still be answered correctly if the reader stopped after the first cluster of matching pages.
- Use light grounding like the benchmark: "In this report, ...", "According to the slides, ...",
  or "In the Appendix, ...". Do NOT use explicit table/figure numbers (e.g., "In Table 3").
  Put the specificity in the CRITERIA of what to find, not where to find it.
- Prefer questions that use visual elements (charts, figures, icons, diagrams) over plain text.
- Do NOT include the answer or hints in the question. Output ONLY the question text.

MULTI-PAGE QUESTION PATTERNS
The model's accuracy drops with more evidence pages (1pg: 51%, 4pg: 23%, 6+: 15%).
Its dominant failure is systematic undercounting. Generate questions that train exhaustive scanning.

HIGH-VALUE TARGETS (based on failure analysis):
- COUNTING across pages — model undercounts by 2-3x. Use clear, visual, objective criteria.
- LIST EXTRACTION across pages — 81% zero rate on "list all". Items on later pages get missed.
- REPEATED-LAYOUT AGGREGATION — brochure cards, museum entries, cover-page student profiles, FAQ cards, chart panels, etc. Require scanning all matching pages, not just the first.
- CROSS-PAGE COMPUTATION — financial ratios, sums across tables.
- INFOGRAPHIC SPATIAL — binding numbers/labels to correct regions on maps/diagrams.
- LOOKUP CHAINS — find in one table, look up in another.

GROUNDING (match benchmark style — light references, heavy criteria):
The benchmark uses light grounding. Do NOT use explicit table/figure numbers.
Allowed grounding, from lightest to heaviest:
  1. Bare: "How many figures are there in total in the article?"
  2. Document type: "In the slides, how many charts compare ONLY US and Europe?"
  3. Section/part: "How many prompt examples are given in the Appendix?"
  4. Page range (rare): "How many figures are in this paper from Page 3 to Page 10?"
The CRITERIA (what to count/find) must be clear and visual — the LOCATION is left to the model.

QUALIFIER FIDELITY (critical)
- If multiple nearby answers exist, the question MUST include the qualifier that makes the target unique.
- Prefer qualifiers like: strongly / somewhat / overall / net, displayed / shown vs listed / mentioned,
  exact row / column / year / fiscal year / subgroup / legend item.
- The question must not be answerable by selecting a nearby but broader fact.

COUNTING DISCIPLINE: verify your own count by scanning every page. Enumerate, then recount.

{% if "not answerable" in question_type %}
Create a question relevant to the document whose answer is NOT present anywhere in the pages.
Prefer hard, benchmark-like negatives over trivial missing-info questions.
Good negative patterns:
  - wrong year/date not shown anywhere in the document
  - wrong subgroup/series not present anywhere in the document
  - a condition that no entity satisfies anywhere in the document
  - a lookup chain that never resolves anywhere in the document
Templates:
  - "In the Consolidated Balance Sheets, what is [METRIC] for [YEAR not shown]?"
  - "According to the report, which [ENTITY] satisfies [CONDITION not met by any row]?"
  - "In the chart comparing [REAL GROUPS], what is the value for [GROUP NOT IN THE CHART]?"

{% elif "numerical" in question_type %}
Use visible numbers from tables/charts. Require arithmetic or counting across pages.
{% if "int" in question_type %}
Add "Answer with an integer." to the question. Keep counts in range 3-30.
Use clear, VISUAL, objective criteria — each instance must be unambiguously identifiable.
Templates:
  - "How many charts in this report compare the general public with Latinos/Hispanics?
     Count each chart once, even if it has multiple sub-panels. Answer with an integer."
  - "How many pages include at least one photograph of a building exterior?"
  - "Across all tables in this paper, how many use 'F1' as a column header or metric?"
  - "How many charts use 'year' (e.g., 2010, 2015) as the horizontal axis?"
  - "How many distinct country/region names appear in the regulatory compliance sections?"
  - "What is the total [METRIC] across all tables in the document?"
  - "How many rows across all tables in this paper have [COLUMN] above [THRESHOLD]?"
  - "Using the percentage on one page and the sample size on another page, how many [GROUP] does that correspond to? Round to the nearest hundred and answer with an integer."
{% elif "float" in question_type %}
Specify rounding (e.g., "Round to two decimal places.").
Templates:
  - "Using the Consolidated Statements of Income and Consolidated Balance Sheets,
     what is the FY2021 inventory turnover ratio (Cost of Sales / Inventories)?
     Round to two decimal places."
  - "What is the sum of Accrued Liabilities (balance sheet) and Selling Expense
     (income statement) for FY2021? Answer in millions."
  - "What is the ratio of [METRIC A] from the income statement to [METRIC B] from the balance sheet?"
  - "What is the difference between the largest and smallest 'Change' values
     across all subgroup charts in this report?"
{% elif "percentage" in question_type %}
Add "Answer with a % sign."
Templates:
  - "According to the financial statements, what is [METRIC A] as a percentage of [METRIC B]?"
  - "What is the percentage difference between [GROUP A] and [GROUP B] in the chart titled 'X'?"
  - "How much did [METRIC] change between [YEAR A] and [YEAR B] according to the report?"
{% endif %}

{% elif "list" in question_type %}
Answer should be 2-8 short items gathered from DIFFERENT pages.
Add "Return a JSON array of strings, e.g., ["A", "B"]."
The model's worst pattern is "list all" (81% zero rate) — items on later pages get missed.
Specify scope and criteria explicitly.
The question itself must be a natural language sentence — NEVER output a JSON array as the question.
Templates:
  - "List all organisations introduced with at least one paragraph of description.
     Return a JSON array, e.g., ["Org A", "Org B"]."
  - "What are all the examples given for 'what does not make you data-driven'?
     Collect from every page. Return a JSON array."
  - "List every college that uses a Year 3 student on its brochure cover. Return a JSON array."
  - "What are the [FIELD] values for [ENTITY] mentioned across the document?"
  - "What colors represent [CATEGORIES] in charts titled 'X' and 'Y'? Return a JSON array."

{% elif "yes" in question_type %}
Templates:
  - "Is [METRIC] for [ENTITY A] greater than for [ENTITY B] according to the report?"
  - "Is the revenue growth rate in the financial highlights consistent with the detailed
     segment breakdown later in the report? Answer Yes or No."
  - "Does the document contain more than N [ITEMS with clear criteria]?"

{% elif "multiple choice" in question_type %}
Provide exactly 4 options (A-D), plausible and mutually exclusive.
Templates:
  - "Which [ENTITY] has the highest [METRIC] in the report?
     A. ... B. ... C. ... D. ..."
  - "Based on the financial data in this report, which statement is true?
     A. ... B. ... C. ... D. ..."

{% elif "string:" in question_type %}
Answer is a word, phrase, or short sentence requiring multi-page reasoning.
Templates:
  - "In the chart titled 'X', in the 'Change' column, which subgroup shows the largest increase?"
  - "Which method has the highest BLEU score in Table 3? Look up that method's parameter
     count in Table 1. What is it?"
  - "According to the report, which region has the largest number of [ENTITY]?"
  - "Compare the 2015 and 2008 values for 'Foreign born' Latinos in the economic optimism
     chart vs. the personal finance chart. In which is the gap larger?"
  - "In the support contacts table, what phone number is listed for the country mentioned
     most often in the Regulatory Compliance sections?"

{% elif "layout" in question_type %}
Answer requires understanding visual/spatial structure. Answer is a number, word, or phrase.
Templates:
  - "What range does [COLOR] represent in the legend of the chart titled 'X'?"
  - "In the network diagram, which nodes are connected to [LABEL]?"
  - "In the flowchart in Figure N, what step follows [LABEL]?"
  - "What text appears inside the [COLOR/POSITION] box in the slides?"
  - "What are the colors of the icons for [ACTION A] and [ACTION B] in the document?"
{% endif %}

These templates are for inspiration. Create a question specific to the actual visible content.
OUTPUT: Write ONLY the question. No answer, no explanation.\
"""


PROMPT_ANSWER = """\
Your question is: {{ question }}

You are given ALL pages of a PDF document. Answer the question using these pages.

You MUST use this exact output structure:
<think>
[all reasoning here]
</think>
[bare final answer here — no explanation, no labels, no extra text]

In your THINKING (inside <think> tags), follow this protocol.

QUALIFIER LOCK (critical)
Before extracting any answer, copy the restrictive qualifiers from the question and keep them fixed:
- year / date / fiscal year
- subgroup / series / legend item
- exact metric (count vs percentage vs percentage-point difference)
- displayed / shown / visible vs listed / mentioned
- first / second / last / nearest / highest / lowest

Do NOT substitute a nearby year, nearby subgroup, nearby series, nearby row, or nearby fact.
If the question asks for a specific subgroup or metric, read exactly that one and no other.

THINKING PROTOCOL:
1. State what you need to find.
2. Scan ALL pages first to last. For each page, note whether it contains relevant info.
   Do NOT stop at the first match — the same content type often continues on later pages
   (charts across pages 4-14, financial notes on pages 60-80, organisations on pages 10-35).
3. For each relevant page, enumerate findings with explicit numbering:
   "Page 28: Item 14 — Arven Gold..., Item 15 — Bergen Kunsthall..."
   "Target B found on page 59 in Table 5."
   Keep a running tally. Do NOT summarize or estimate.
4. If the answer requires combining values from multiple pages, show the computation:
   "Total Debt = Current portion (799) + Long-term (6,692) = 7,491"
5. After the LAST page, verify:
   - Counting: recount from the top. Compare tallies. Third pass if they differ.
   - Computation: re-read source values and redo the arithmetic.
   - List extraction: scan again for missed items, especially on later pages.
   - Lookup chains: confirm intermediate results match the correct row/entity.
   - Comparison: re-check you read from the correct charts/tables (match by title).

UNIT DISCIPLINE
- Preserve units exactly when present or requested (%, $, million, etc.).
- Financial reports: parentheses = negative; check table header for unit scale (e.g., "In millions");
  "how much higher/more" = positive number; "change" = positive for increase, negative for decrease.

COUNT / PERCENT / DENOMINATOR DISCIPLINE
- If one page provides a percentage and another page provides a sample size, the percentage is NOT the final answer until it is converted using the sample size.
- Distinguish carefully between count, percentage, percentage-point difference, and ratio.
- Only round after the final computation, never before.

THINKING STABILITY (critical)
- Follow the protocol once from top to bottom. Do NOT restart from step 1 after you already found the relevant pages.
- Do at most one scan pass and one verification pass.
- If there are two plausible candidates, compare them once using the question's qualifiers, choose the best-supported one, and continue. Do NOT keep generating new alternatives.
- Do NOT repeat the same scan, recount, or conclusion more than once.
- As soon as the answer is found and verified, stop thinking and produce the final answer.
- Do NOT use filler loops such as repeating a phrase, title, entity name, or page reference many times.
- If you have a complete answer supported by the required pages and qualifiers, commit to it. Do not reopen the search.

EXHAUSTIVE REPEATED-LAYOUT SCAN
If multiple pages in the document share the same layout or template, scan ALL matching pages before concluding.
Do not stop after the first valid hit.
For counts/lists, maintain a running page-by-page tally or item list until the last relevant page in the document.
Bad: "I see items 14-29 on pages 28-31. Count = 29 - 14 + 1 = 16." (stopped early, missed pages 32-34)
Good: "Page 28: items 14-21 (running total: 8). Page 30: items 22-29 (running total: 16). Page 32: items 30-37 (running total: 24). Page 34: items 38-44 (running total: 31). Final count: 31."

PAGE-BREAK CONTINUATION
If a sentence, paragraph, table row, caption, or figure explanation appears to continue onto the next page,
combine the text before deciding the answer is missing.

THINKING TRACE QUALITY (your thinking is used as training data):
- Cite which page and which element (by title/heading) you found evidence on.
- Quote the specific values you read.
- For computation, show the formula with named operands.
- For counting, enumerate every item, not just a total.

FINAL ANSWER: After </think>, output ONLY the bare answer. No reasoning, no step labels,
no "DECOMPOSE:", no "VERIFY:", no explanation. Just the answer value.

{% if "not answerable" in question_type %}
Only say "Not answerable" if the information is genuinely absent from ALL pages — you must have
scanned every single page before concluding this. Do NOT refuse because:
- The exact phrasing doesn't appear (look for equivalent information)
- You need to compute the answer from available data (do the computation)
- A figure or icon is hard to read (give your best reading)
- You only checked the first half of the document (later pages may have the answer)
{% endif %}

- For figures/diagrams: examine visual elements directly (nodes, arrows, colors, spatial groupings), not just captions.
- For list answers: format as ["item1", "item2"]. Include units when applicable.\
"""


PROMPT_QUALITY_SCORE = """\
<question-type>{{ question_type }}</question-type>
<question>{{ question }}</question>
<answer>{{ answer }}</answer>
<answer_reasoning>{{ answer__reasoning_content }}</answer_reasoning>

You are given ALL pages of a PDF document. Evaluate the question-answer pair AND its reasoning.

Filter out bad training data, especially:
- looping or repetitive reasoning traces that keep rescanning the same pages/evidence without converging
- unfinished or truncated reasoning traces that stop mid-thought
- reasoning that stops after the first cluster of results when later pages still matter

CHECKS (any failure => score 0):

1. PAGES READABLE — document pages are clear and not low quality.
2. QUESTION RELEVANT — question is about content in these pages.
{% if "not answerable" in question_type %}
   For "not answerable": question should be relevant but the answer must NOT be in the pages.
{% endif %}
3. ANSWER CORRECT — verify by examining the actual pages yourself. For counts, count the
   items yourself. For chart values, find the specific chart and check. For lists, verify
   each item exists.
{% if "not answerable" in question_type %}
   Correct answer must be exactly "Not answerable".
{% endif %}
4. QUESTION WELL-FORMED — unambiguous, doesn't contain the answer, not trivially easy.
5. MULTI-PAGE REQUIRED — would someone need 2+ pages to answer? Score 0 if answerable
   from a single page.
6. FORMAT + REFUSAL —
   - Lists must be JSON arrays, units included, correct specificity.
   - Score 0 if the answer contains reasoning steps, protocol labels (e.g., "DECOMPOSE:",
     "SCAN:", "VERIFY:"), explanations, or anything beyond the bare result.
   {% if "not answerable" in question_type %}
   - Answer must be exactly "Not answerable".
   - Score 0 if the question is unanswerable only because it asks a trivially absent detail
     (e.g., author's phone number). Must be a near-miss negative where a specific qualifier
     (year, subgroup, row, condition) is absent from the document.
   {% else %}
   - Answer MUST NOT be "Not answerable" or any refusal.
   {% endif %}
7. REASONING QUALITY (critical — reasoning is used as chain-of-thought training data):
   The <answer_reasoning> must demonstrate thorough multi-page scanning. Score 0 if:
   - Reasoning does NOT mention specific pages where evidence was found.
   - Reasoning stops scanning after the first cluster of results (e.g., finds 5 items on
     pages 10-12 and stops, when more exist on pages 15-20).
   - It repeats the same scan, recount, candidate answer, page reference, title, entity name,
     or conclusion without adding new evidence.
   - It restarts the reasoning process after already finding the relevant page(s) or elements.
   - It contains obvious loop markers such as repeated "Wait, let me", "Actually",
     "Let's look again", or "Let's look at" — especially if the same phrase appears
     more than 3 times with no new information between repetitions.
   - It keeps generating new alternatives after already having enough evidence to answer.
   - It ends in an unfinished or truncated way, or appears to stop mid-thought.
   - For repeated-layout or repeated-entry questions, it stops after the first valid hit instead
     of scanning all matching pages.
   - For counting: reasoning does NOT enumerate items explicitly (just states a total
     without listing each instance).
   - For computation: reasoning does NOT show the formula with named values from specific
     pages (e.g., must show "Revenue (44,538 from income statement) / ..." not just "44,538 / ...").
   - For cross-page computations, it does NOT clearly distinguish which page provides the key,
     target value, denominator, or comparison value.
   - It confuses count vs percentage vs percentage-point difference vs ratio, or rounds before
     the final computation rather than after it.
   - Reasoning uses only vague references ("the table", "the chart", "the page") without
     identifying which specific element on which page.

SCORING:
- Score 0: any check fails.
- Score 1: all checks pass.
- Score 2: all pass AND question requires non-trivial cross-page reasoning (computation,
  counting across 3+ pages, or lookup chain), AND reasoning demonstrates exhaustive scanning
  with explicit page-by-page enumeration.
Respond with ONLY: 0, 1, or 2.\
"""


# =============================================================================
# Pipeline configuration
# =============================================================================


def build_config(
    seed_path: str = "seed.parquet",
    model_alias: str = "vl",
    model_id: str = DEFAULT_VLM_MODEL,
) -> dd.DataDesignerConfigBuilder:
    model_configs = [
        dd.ModelConfig(
            alias=model_alias,
            model=model_id,
            provider=VLLM_PROVIDER_NAME,
            inference_parameters=dd.ChatCompletionInferenceParams(
                timeout=120,
                temperature=1.0,
                top_p=0.95,
                max_parallel_requests=32,
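                # Provider-specific sampling knobs; with the vLLM provider these
                # are forwarded to the server in the request body.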
                extra_body={
                    "top_k": 20,
                    "min_p": 0.0,
                    "presence_penalty": 1.5,
                    "repetition_penalty": 1.0,
                },
            ),
        ),
    ]

    config_builder = dd.DataDesignerConfigBuilder(model_configs=model_configs)

    config_builder.with_seed_dataset(
        dd.LocalFileSeedSource(path=seed_path),
        sampling_strategy=dd.SamplingStrategy.ORDERED,
    )

    config_builder.add_column(
        dd.SamplerColumnConfig(
            name="question_type",
            sampler_type=dd.SamplerType.CATEGORY,
            params=dd.CategorySamplerParams(
                values=[
                    "multiple choice",
                    "yes or no",
                    "string: word, phrase or short sentence",
                    "layout",
                    "numerical (int)",
                    "numerical (float)",
                    "numerical (percentage)",
                    "list of items (int, string, float or mixed)",
                    "not answerable",
                ],
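                # Heavily skewed toward numerical, list, layout, and string
                # questions; multiple choice and yes/no are nearly excluded, and
                # roughly 3% of questions are "not answerable".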
                weights=[
                    0.001858736059479554,
                    0.001858736059479554,
                    0.07434944237918216,
                    0.14869888475836432,
                    0.2230483271375465,
                    0.14869888475836432,
                    0.14869888475836432,
                    0.2230483271375465,
                    0.029739776951672865,
                ],
            ),
        )
    )

    config_builder.add_column(
        dd.LLMTextColumnConfig(
            name="question",
            model_alias=model_alias,
            prompt=PROMPT_QUESTION,
            multi_modal_context=IMAGE_CONTEXT,
        )
    )

    config_builder.add_column(
        dd.LLMTextColumnConfig(
            name="answer",
            model_alias=model_alias,
            prompt=PROMPT_ANSWER,
            multi_modal_context=IMAGE_CONTEXT,
            extract_reasoning_content=True,
        )
    )

    config_builder.add_column(
        dd.LLMTextColumnConfig(
            name="quality_score",
            model_alias=model_alias,
            prompt=PROMPT_QUALITY_SCORE,
            multi_modal_context=IMAGE_CONTEXT,
        )
    )

    return config_builder


def create_dataset(
    config_builder: dd.DataDesignerConfigBuilder,
    num_records: int,
    vllm_endpoint: str,
    artifact_path: Path | str | None = None,
) -> DatasetCreationResults:
    model_providers = [
        dd.ModelProvider(
            name=VLLM_PROVIDER_NAME,
            endpoint=vllm_endpoint,
        ),
    ]
    data_designer = DataDesigner(
        artifact_path=artifact_path,
        model_providers=model_providers,
    )
    data_designer.set_run_config(dd.RunConfig(progress_bar=True, disable_early_shutdown=True))
    results = data_designer.create(config_builder, num_records=num_records, dataset_name="whole_document_qa")
    return results


if __name__ == "__main__":
    from argparse import ArgumentParser

    parser = ArgumentParser()
    parser.add_argument(
        "--vllm-endpoint",
        type=str,
        required=True,
        help="Base URL of the vLLM server hosting the VLM (e.g. http://localhost:8000/v1)",
    )
    parser.add_argument("--seed-path", type=str, required=True, help="Path to the seed parquet file")
    parser.add_argument("--model-alias", type=str, default="vl")
    parser.add_argument("--model-id", type=str, default=DEFAULT_VLM_MODEL)
    parser.add_argument("--num-records", type=int, default=5)
    parser.add_argument("--artifact-path", type=str, default=None)
    args = parser.parse_args()

    config_builder = build_config(
        seed_path=args.seed_path,
        model_alias=args.model_alias,
        model_id=args.model_id,
    )
    results = create_dataset(
        config_builder,
        num_records=args.num_records,
        vllm_endpoint=args.vllm_endpoint,
        artifact_path=args.artifact_path,
    )

    print(f"Dataset saved to: {results.artifact_storage.final_dataset_path}")

    results.load_analysis().to_report()
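
    # Optional post-filtering sketch (not part of the recipe; assumes pandas is
    # installed and that the final dataset path is readable by pandas.read_parquet):
    # keep only records the quality judge scored 1 or 2 before using them for training.
    #
    #   import pandas as pd
    #   df = pd.read_parquet(results.artifact_storage.final_dataset_path)
    #   keep = df["quality_score"].astype(str).str.strip().isin({"1", "2"})
    #   df[keep].to_parquet("whole_document_qa_filtered.parquet")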
