# nathggns/edit_artefact_schema_check.py

## edit_artefact_schema_check.py
#!/usr/bin/env python3
"""
Measure how often the edit_artefact prompt produces output that _normalize_diffs can parse.

Runs the prompt N times in parallel with the exact inputs from the #2125 bug report trace,
then checks each output for JSON validity and schema conformance.

Usage:
    uv run python -m evals.scripts.edit_artefact_schema_check [--runs N]
"""

import argparse
import asyncio
import json
import sys

from langchain_core.output_parsers import JsonOutputParser

from orchestra.artefact.diff import apply_content_update
from orchestra.prompt_definitions.tools.tool_edit_artefact.tool_edit_artefact import get_tool_edit_artefact_prompt
from orchestra.utils.env import load_env_files

# DIFF_FORMAT_INSTRUCTIONS is optional: if orchestra.artefact.diff does not
# export it, fall back to None so that main() keeps the "format_instructions"
# value recorded in INPUTS.
try:
    from orchestra.artefact.diff import DIFF_FORMAT_INSTRUCTIONS
except ImportError:
    DIFF_FORMAT_INSTRUCTIONS = None

# Shared JSON output parser; run_prompt_once passes it as the `chain` argument
# to prompt.astream().
_parser = JsonOutputParser()

INPUTS = json.loads(r"""{
    "creation_prompt": "System: # Report Plan Generation Tool\n\n ## Purpose\n\n You are generating a **report plan** - a structural scaffold that outlines what each section of the final report will contain.\n\n This is NOT a report draft. It should read like a researcher's working outline:\n - Bullet points over prose\n - Planning notes over polished writing  \n - \"What we'll cover\" not \"here is our analysis\"\n\n **Write as if you're drafting a contents plan, not writing the contents.**\n\n ---\n\n## Hard Rules\n\n\ud83d\udeab **NEVER include a title** - Do not add a report title or document heading. Start directly with \"### 1. Context & Objectives\"\n\n\ud83d\udeab **NEVER mention visuals** - Do not reference graphs, figures, diagrams, charts, images, or any visual elements.\n\n\ud83d\udeab **NEVER use the word \"placeholder\"**\n\n\ud83d\udeab **NEVER add a heading giving the methodology category**\n\n\ud83d\udeab **NEVER narrate the research process** - Don't write \"we will analyse\", \"findings will emerge\", \"this section will explore\"\n\n\ud83d\udeab **NEVER fill in content** - Don't write actual descriptions, just signal what the section will cover.\n\n\n ---\n\n ## The Core Principle: Consistent \"What Goes Here\" Format\n\n **Every section** should describe what will be covered, not actually cover it.\n\n The ONLY enrichment allowed is:\n 1. **Stimulus/stage/territory names** - to label sections\n 2. **Structural notes** - like \"include A/B comparison\" or \"separate pre-task vs. main discussion\"\n 3. **High-level signals** - like \"overview of 4 video narratives\" not the actual overview\n\n ---\n\n ## Examples of Correct vs. 
Incorrect Format\n\n### Context & Objectives\n\n**\u274c WRONG - filling in content:**\n> - Project focus: Testing the effectiveness of video content in raising awareness of poverty issues.\n> - Stimuli being tested: Four video narratives featuring parents, kids, workers, and families.\n> - Target audience overview: Individuals in South West England aged 40 and above who are aware of the charity.\n\n**\u2705 CORRECT - signalling what goes here:**\n> - Project focus and research questions, with emphasis on video effectiveness for awareness-building\n> - Overview of 4 video narratives being tested (list names)\n> - Target audience definition\n\n ---\n\n ### Method Overview\n\n**\u274c WRONG - filling in content:**\n> - Method type, sample size, duration\n> - Audience definition: ABC1 adults, 25-45, mix of London and Manchester, regular category buyers.\n> - How stimuli were shown: Concepts shown on screen, one at a time, in randomized order with neutral introduction.\n\n**\u2705 CORRECT - signalling what goes here:**\n> - Method type, sample size, duration\n> - Audience definition and recruitment criteria\n> - How stimuli were shown (rotation approach, exposure format)\n\n ---\n\n### Stimulus Sections\n\n**\u274c WRONG - no enrichment at all:**\n> - **Stimulus 1:** Overall response, comprehension, likes/dislikes, improvement ideas.\n> - **Stimulus 2:** Overall response, comprehension, likes/dislikes, improvement ideas.\n\n**\u2705 CORRECT - names added:**\n> - **'Fresh Start' Concept:** Overall response, comprehension, likes/dislikes, improvement ideas.\n> - **'Everyday Hero' Concept:** Overall response, comprehension, likes/dislikes, improvement ideas.\n\n**\u2705 ALSO CORRECT - names added AND framework tailored to brief/discussion guide:**\n> - **'Fresh Start' Concept:** Overall response, emotional resonance, brand fit, clarity of benefit, purchase intent.\n> - **'Everyday Hero' Concept:** Overall response, emotional resonance, brand fit, clarity of benefit, 
purchase intent.\n\nIf the brief or discussion guide specifies particular areas to assess (e.g., \"we need to understand brand fit\", \"key question is purchase intent\"), adapt the framework to reflect those priorities.\n\n**But still DON'T pre-fill findings:**\n> - \u274c **'Fresh Start' Concept:** Strong emotional resonance, good brand fit, clear benefit...\n> - \u2705 **'Fresh Start' Concept:** Emotional resonance, brand fit, benefit clarity...\n\n---\n\n ## How to Enrich the Template\n\nStart with the **template structure** for the methodology category. Then **adapt it** with project-specific elements that shape the report structure.\n\n### \u2705 DO add:\n\n**Stimulus/concept/territory/stage names** to label sections:\n> - **'Morning Ritual' Concept:** Overall response, comprehension...\n> - **Stage 1 - First Awareness:** Behaviours, emotions, touchpoints...\n\n**Tailored analytical frameworks** based on what brief/discussion guide wants to understand:\n> - Template: `Overall response, comprehension, likes/dislikes, improvement ideas.`\n> - Tailored: `Emotional resonance, messenger credibility, call-to-action clarity, likelihood to share.`\n\n**High-level signals** of what's covered (without filling it in):\n> - Project focus and research questions, with emphasis on [topic area]\n> - Overview of [X] concepts being tested ([list names])\n> - Audience definition, with note on [relevant split if known]\n\n**Structural notes** where discussion guide reveals something relevant:\n> - Note: Include comparison of spontaneous vs. 
aided responses\n> - Note: Separate analysis of pre-task reflections\n\n**Brief-driven priorities in Implications & Recommendations:**\nIf the brief specifies what success looks like, what decisions need to be made, or what the output will feed into, tailor this section to reflect those priorities.\n\n> - Template: `What findings suggest for concept or creative refinement.`\n> - Tailored: `What findings suggest for selecting lead video for Q3 campaign.`\n\n> - Template: `Suggested next steps or recommendations for refinement.`\n> - Tailored: `Recommendation on whether to proceed to production or revisit creative approach.`\n\n**Audience-specific considerations** if the brief highlights particular subgroups:\n> - Template: `Differences by audience subgroup (if applicable).`\n> - Tailored: `Differences between donors vs. non-donors in response to messaging.`\n\n---\n\n### Examples of Good Enrichment Across Sections\n\n**Executive Summary - tailored to brief's key questions:**\n> - Template: `3\u20135 key insights or takeaways.`\n> - Tailored: `3\u20135 key insights, with emphasis on emotional impact and purchase drivers.`\n\n**Cross-Stimulus Insights - tailored to brief's comparison needs:**\n> - Template: `Comparison of how different ideas performed.`\n> - Tailored: `Comparison of concepts on brand fit vs. 
distinctiveness.`\n\n**Implications & Recommendations - tailored to brief's success criteria:**\n> - Template: `What findings suggest for concept or creative refinement.`\n> - Tailored: `What findings suggest for lead concept selection ahead of quantitative testing.`\n\n> - Template: `Which ideas show strongest potential and why.`\n> - Tailored: `Which concept(s) should progress to next development phase and why.`\n\n**Gaps, Pain Points & Opportunities - tailored to brief's focus areas:**\n> - Template: `Opportunities to reduce friction, add value or strengthen engagement.`\n> - Tailored: `Opportunities to improve onboarding experience, particularly at account setup stage.`\n\n**Audience-specific considerations:**\n> - Template: `Differences by audience subgroup (if applicable).`\n> - Tailored: `Differences between new customers vs. loyalists in response to messaging.`\n\n---\n\n### \ud83d\udeab Still DON'T add:\n\n**Pre-filled findings or conclusions:**\n> - \u274c `Which concept(s) should progress: The 'Fresh Start' concept showed strongest potential...`\n> - \u2705 `Which concept(s) should progress to next phase and why.`\n\n**Actual descriptions or content:**\n> - \u274c `Project focus: Testing the effectiveness of new packaging concepts for the UK market launch.`\n> - \u2705 `Project focus and research questions, with emphasis on packaging effectiveness for UK launch`\n\n**Specific operational details:**\n> - \u274c `Audience definition: ABC1 women, 30-45, with children, main household shoppers...`\n> - \u2705 `Audience definition and recruitment criteria`\n\n ## Before Outputting - Self-Check\n\n Review your plan and **check for consistency:**\n\n - [ ] Does every bullet point describe \"what goes here\" rather than filling it in?\n - [ ] Are Context & Objectives and Method Overview in the same format as Executive Summary and Implications?\n - [ ] Is the only project-specific content: names, high-level signals, and structural notes?\n - [ ] Could someone read 
any bullet and understand it as \"this is what we'll cover\" not \"this is the content\"?\n\n **If sections 1-2 look different in style from sections 3-6, rewrite them to match.**\n---\n\n## Project Context\n\n### Project Brief:\n**Company/Brand:** Too Faced\n\n**Focus:** Optimizing gondola display and messaging for increased appeal among young consumers\n\n**Business Background**\n\n* Too Faced is a leading cosmetics brand known for its playful, bold, feminine aesthetic and sensorial, high-performing products, retailed exclusively through Sephora.\n* Competition is intensifying\u2014especially among Gen Z and young Millennials\u2014where playful, pink/red-coded aesthetics and fun tone of voice are increasingly common (e.g., Benefit, Huda Beauty).\n* Optimizing gondola communication and visual merchandising is crucial to capturing shopper attention, reinforcing distinctiveness, and driving conversion in-store.\n\n**Growth Task / Strategic Imperative**\n\nDrive increased brand engagement and sales among Gen Z and young Millennials by optimizing gondola presence and messaging at Sephora and similar retail environments.\n\n**Business Objectives**\n\n* Ensure the gondola design and VM effectively engage Gen Z and young Millennials, differentiate Too Faced from key competitors, and clearly communicate the brand\u2019s DNA.\n* Optimize shelf space utilization within 3 subcategory modules (concealer, mascara, blush) by understanding consumer priorities and preferences between before and after visuals, claims, shade range, application instructions, product visuals, model visuals, etc.\n* Select the most compelling supporting claims for each of the three modules to strengthen appeal and clarity at shelf.\n\n**Research Objectives & Key Questions**\n\n* Assess overall gondola appeal and alignment with Too Faced\u2019s brand image among young consumers.\n* Determine which visual merchandising elements consumers prioritize when interacting with the Concealer, Mascara, and Blush 
modules.\n* Identify the most motivating and meaningful supporting claims for each subcategory module.\n\n**Proposed Forward Action**\n\n* Implement the winning supporting claims across gondola modules to strengthen clarity and ehance consumer engagement.\n* Refine subcategory module layouts to align more closely with the visual and informational elements consumers find most helpful.\n* Roll out optimized gondola designs in key markets.\n\n**What Does Success Look Like**\n\n* Actionable guidance on which visuals, claims and VM supporting elements should lead each module to maximize engagement.\n* Improved consumer perception and engagement with the Too Faced gondola in-store.\n* Completion of gondola updates within budget constraints and project timeline.\n\n### Target Audience:\nMarket: Germany\nLanguage: German\n\n**Screening Questions**\n\n1. Which of the following best describes your gender? (single select)\n   * Male (disqualify)\n   * Female (qualify)\n   * Non-binary (disqualify)\n   * Prefer not to say (disqualify)\n2. What is your age? (single select)\n   * Under 18 (disqualify)\n   * 18-25 (qualify)\n   * 26-34 (qualify)\n   * 35-44 (disqualify)\n   * 45-54 (disqualify)\n   * 55-64 (disqualify)\n   * 65 or older (disqualify)\n   * Prefer not to say (disqualify)\n3. When was the last time you purchased any makeup products (e.g., foundation, mascara, lipstick, eyeshadow)? (single select)\n   * Within the last month (qualify)\n   * 1-3 months ago (qualify)\n   * 4-6 months ago (qualify)\n   * 7-12 months ago (qualify)\n   * More than 12 months ago (disqualify)\n   * I never purchase makeup products (disqualify)\n4. In which of the following places have you shopped for makeup and beauty products in a physical store location in the past 12 months? Select all that apply. 
(multi select) [randomised]\n   * Douglas (qualify)\n   * dm-drogerie markt (qualify)\n   * M\u00fcller (qualify)\n   * Rossmann (qualify)\n   * Department stores (e.g., KaDeWe, Galeria Karstadt Kaufhof) (qualify)\n   * Supermarkets/Hypermarkets (e.g., Edeka, Rewe, Kaufland) (qualify)\n   * Brand-specific boutiques (e.g., MAC, Kiko Milano) (qualify)\n   * Online only, I do not shop in physical stores [exclusive] [pinned] (disqualify)\n   * None of the above [exclusive] [pinned] (disqualify)\n\n**Profiling Questions**\n\n1. Which of the following makeup brands do you know? Select all that apply. (multi select) [randomised]\n   * Anastasia Beverly Hills\n   * Armani Beauty / Giorgio Armani\n   * Benefit Cosmetics\n   * Bobbi Brown\n   * Chanel\n   * Charlotte Tilbury\n   * Clinique\n   * Dior\n   * e.l.f. Cosmetics\n   * Fenty Beauty\n   * Givenchy\n   * Guerlain\n   * Huda Beauty\n   * Kiko Milano\n   * L'Or\u00e9al Paris\n   * Lanc\u00f4me\n   * Laura Mercier\n   * MAC Cosmetics\n   * Make Up For Ever\n   * Maybelline\n   * NARS\n   * NYX Professional Makeup\n   * Rare Beauty\n   * Shiseido\n   * Too Faced\n   * Urban Decay\n   * Other brand(s) not listed here [pinned]\n   * None of these [exclusive] [pinned]\n2. Which of the following makeup brands have you purchased for yourself in the past 6 months? Select all that apply. (multi select) [randomised]\n   * Anastasia Beverly Hills\n   * Armani Beauty / Giorgio Armani\n   * Benefit Cosmetics\n   * Bobbi Brown\n   * Chanel\n   * Charlotte Tilbury\n   * Clinique\n   * Dior\n   * e.l.f. 
Cosmetics\n   * Fenty Beauty\n   * Givenchy\n   * Guerlain\n   * Huda Beauty\n   * Kiko Milano\n   * L'Or\u00e9al Paris\n   * Lanc\u00f4me\n   * Laura Mercier\n   * MAC Cosmetics\n   * Make Up For Ever\n   * Maybelline\n   * NARS\n   * NYX Professional Makeup\n   * Rare Beauty\n   * Shiseido\n   * Too Faced\n   * Urban Decay\n   * Other brand(s) not listed here [pinned]\n   * None of these [exclusive] [pinned]\n\n### Discussion Guide:\n#### Too Faced Gondola Display Optimization for Gen Z and Young Millennials\n\n**Research Goals**\n\nDrive increased engagement and sales among Gen Z and young Millennials by optimizing Too Faced\u2019s gondola presence and messaging in Douglas stores and similar retailers in Germany. Understand young consumers\u2019 priorities for visual merchandising elements and product benefit messaging within Concealer, Blush, and Mascara modules, to optimize shelf layouts and claims presentation. Assess the overall appeal and alignment of the gondola design with Too Faced\u2019s youthful brand image to guide rollout.\n\n**Interview Structure**\n\nThis 33-minute interview is structured into seven sections aligned with the research priorities and sampling design:\n\n* Section 1: Warm-up and Shopping Context (4 mins)\n* Section 2: Subcategory Module Deep Dive (Concealer, Blush, OR Mascara, 8 mins)\n* Section 3: Unbranded Gondola Displays Evaluation (5 mins)\n* Section 4: New Gondola Concept Introduction (3 mins)\n* Section 5: Brand Reveal and Brand Association (3 mins)\n* Section 6: Improvements and Final Suggestions (3 mins)\n\n***\n\n#### Interview Guide\n\n***\n\n**Section 1: Warm-up and Shopping Context, 4 minutes**\n\n**Objectives:**\n\n* Build rapport with participant and ease into topic\n* Explore cosmetics shopping habits and attitudes about in-store discovery and shelf engagement\n* Understand what attracts attention in Sephora-type retail environments\n\n\"Thanks for joining today. 
To start, I\u2019d like to talk about how you usually shop for cosmetics and what captures your attention when you\u2019re in stores like Douglas or similar German retailers. Later, I\u2019ll show you some ideas for how a brand might set up their gondola display and messaging to catch your eye and make your purchase experience more pleasant. Just be honest \u2014 there are no right or wrong answers.\"\n\n**Q1: Cosmetics Shopping Habits and Preferences**\n\n* Tell me about how you usually shop for cosmetics. Do you shop in-store, online, or both? What matters most to you when you decide?\n  * PROBE FOR: Nothing. Don\u00b4t ask for reasons.\n\n**Q2: Attention Drivers at Douglas**\n\n* When you are shopping in a Douglas or similar store, what kind of things usually catch your eye on shelves or gondola displays?\n  * PROBE FOR: anything that makes them stop or look closer.\n\n**Q3:** Products you examine while in-store\n\n* When you\u2019re shopping in Douglas or similar stores, which types of Makeup products do you usually **go over to look at more closely or pick up to examine**?\n  * PROBE FOR: Nothing. Don\u00b4t ask for reasons.\n\n**Q4:** Products you examine while in-store\n\n* Have you ever shopped for concealer, blush, or mascara at Douglas or a similar store?\n  * PROBE FOR: Ensure Yes/No answers for the 3 products.\n\n**Section 2: Subcategory Module Deep Dive (8 minutes)**\n\n**Objectives:**\n\n* Explore consumer priorities and expectations for one randomly assigned subcategory module, following the rules specified in the Note to Moderator.\n* Identify which product benefits (supporting claims) and visual merchandising elements resonate most and drive motivation within that module\n\n\\[Note to Moderator: Assign each participant to one subcategory module that they mentioned they shop in Q4 (between concealer, blush, OR mascara) randomly. If they don\u00b4t shop any of these 3 products, ask for any subcategory that they claim to have shopped, based on Q3. 
Focus solely on that subcategory in this section]\n\n**Q5: Meaningful Product Benefits (Claims) Exploration**\n\n* What benefits or performance promises do you look for in a \\[concealer/blush/mascara/other product]?\n  * (Concealer) PROBE FOR: what makes them choose one option over others.\n  * (Mascara) PROBE FOR: what makes them choose one option over others.\n  * (Blush) PROBE FOR: what makes them choose one option over others.\n\n**Q6: Expectations and Priorities in \\[Subcategory]**\n\n* When you shop for \\[concealer/blush/mascara/other product] in a store like Douglas, what on the shelves helps you choose between products?\n  * PROBE FOR: things they see, read, or touch on the shelves that help them decide.\n\n***\n\n**Section 3: Unbranded Gondola Displays Evaluation (5 minutes)**\n\n**Objectives:**\n\n* This section switches back to evaluating an entire Display, including multliple types of products, not just the subcategory evaluated in Section 2. This remains the scope for the rest of the interview.\n* Explore first impressions of category Display \"Anzeige A\".\n* Quickly assess appeal and identify standout features without brand influence\n\n**Q7: First Impression**\n\n\\[**Moderator**: Show the stimulus \"Anzeige A\" . Call it \"Anzeige A\".]\n\n* What is your first impression of this display?\n  * PROBE FOR: overall feel and whether it seems easy or overwhelming.\n\n**Q8: Appeal Scale**\n\n\\[**Moderator**: Show the stimulus \"Anzeige A\" . Call it \"Anzeige A\".]\n\n* On a scale from 1 (not appealing at all) to 10 (very appealing), *how appealing do you find this display? Why ?*\n  * PROBE FOR: reasons for giving that score\n\n\\[**Moderator**: Show the stimulus \"Anzeige A\" . Call it \"Anzeige A\".]\n\n**Q9: Brand attribution**\n\n\\[**Moderator**: Show the stimulus \"Anzeige A\" . Call it \"Anzeige A\".]\n\n* If you had to guess, what brand (or type of brand) do you think this display belongs to? 
Why?\n  * PROBE FOR: reasons for brand attribution (if at least one brand is mentioned)\n\n\\[**Moderator**: Show the stimulus \"Anzeige A\" . Call it \"Anzeige A\".]\n\n**Q10: Stopping power**&#x20;\n\n* How likely would this make you stop and explore the brand\u00b4s products in-store? Why?\n  * PROBE FOR: why or why not.\n\n***\n\n**Section 4: New Gondola Concept (3 minutes)**\n\n**Objectives:**\n\n* Gather initial thoughts on overall appeal and fit with a youthful cosmetics brand\n* Explore first impressions of category Display \"Anzeige B\".\n\n**Q11: Initial Reactions to Gondola Concept**\n\n\\[**Moderator**: Show the stimulus \"Anzeige B\" . Call it \"Anzeige B\".]\n\n* What do you think about this gondola display?\n  * PROBE FOR: overall feel and whether it seems easy or overwhelming.\n\n\\[**Moderator**: Show the stimulus \"Anzeige B\" . Call it \"Anzeige B\".]\n\n**Q12: Appeal Scale**\n\n* On a scale from 1 (not appealing at all) to 10 (very appealing), *how appealing do you find this display? Why ?*\n  * PROBE FOR: reasons for giving that score\n\n**Q13: Stopping power**\n\n\\[**Moderator**: Show the stimulus \"Anzeige B\" . Call it \"Anzeige B\".]\n\n* How likely would this make you stop and explore the brand\u00b4s products in-store? Why?\n  * PROBE FOR: why or why not.\n\n**Q14: Brand recognition**&#x20;\n\n\\[**Moderator**: Show the stimulus \"Anzeige B\" . Call it \"Anzeige B\".]\n\n* What brand or type of brand do you think this belongs to? Why?\n  * PROBE FOR: reasons for attribution (if any brand is mentioned)\n\n***\n\n**Section 5: Brand Reveal and Association (3 minutes)**\n\n**Objectives:**\n\n* Reveal the brand behind the new gondola\n* Understand brand fit once Too Faced is revealed\n\n**Q15: Brand Association and Fit**\n\n\\[**Moderator**: Show the stimulus \"Anzeige B\" . Call it \"Anzeige B\".]\n\n* This display belongs to the brand **Too Faced**. 
Does knowing this affect how you see the display?\n  * PROBE FOR: whether it feels right for Too Faced.\n\n**Q16: Emotional Connection**\n\n\\[**Moderator**: Show the stimulus \"Anzeige B\" . Call it \"Anzeige B\".]\n\n* How does Too Faced\u2019s brand identity come through (or not) in this display?\n  * PROBE FOR: which elements of the display communicate Too Faced, or another type of brand.\n\n***\n\nEnd of Discussion Guide\n\n---\n\n## Template Structure\n\nUse the following template structure for methodology_category = \"TESTING\":\n\n**1. Context & Objectives**\n- Brief recap of project purpose and research questions.\n- Description of stimuli tested (e.g., concepts, ads, messages).\n- Overview of target audience.\n\n**2. Method Overview**\n- Method type, sample size, duration.\n- How stimuli were shown (e.g., spontaneous vs. aided, order of exposure).\n\n**3. Executive Summary**\n- 3\u20135 key insights or takeaways.\n- Overall reactions to stimuli and what they mean for objectives.\n- Which stimuli performed best with respondents (ranking of strongest to weakest).\n- Suggested next steps or recommendations for refinement.\n\n**4. Stimulus Reactions & Themes**\n- **Stimulus 1:** Overall response, comprehension, likes/dislikes, improvement ideas.\n- **Stimulus 2:** Overall response, comprehension, likes/dislikes, improvement ideas.\n- **Stimulus 3:** Overall response, comprehension, likes/dislikes, improvement ideas.\n\n*(Add as many stimulus sections as needed)*\n\n**5. Cross-Stimulus Insights**\n- Comparison of how different ideas performed.\n- Patterns in preferences or key drivers of appeal.\n- Differences by audience subgroup (if applicable).\n\n**6. 
Implications & Recommendations**\n- What findings suggest for concept or creative refinement.\n- Which ideas show strongest potential and why.\n\n\n---\n\n## Stimulus Materials Section (Conditional)\n\n**Stimulus availability**: `has_stimuli = True`\n\n**If `has_stimuli` is True:**\n\nYou MUST add a \"Stimulus Materials\" section immediately after \"Method Overview\" in your report plan.\n\n**Section Numbering:**\n- Insert \"Stimulus Materials\" as section 3 (immediately after \"2. Method Overview\")\n- Renumber all subsequent sections accordingly:\n  - What the template shows as \"3. Executive Summary\" becomes \"4. Executive Summary\"\n  - What the template shows as \"4. [Next Section]\" becomes \"5. [Next Section]\"\n  - Continue renumbering all remaining sections in sequence\n\nAdd this section:\n\n**3. Stimulus Materials**\n- List of image stimulus materials shown to participants.\n- Include the name of each stimulus.\n- Note: Images will be displayed in the final report.\n\n**If `has_stimuli` is False:**\n\nDo not add a Stimulus Materials section. Use the template section numbering as-is.\n\n---\n\n<methodology_category>\nTESTING\n</methodology_category>",
    "format_instructions": "Return a JSON object.",
    "numbered_document": "1: ### 1. Context & Objectives\n2: - Brief recap of project purpose and research questions, focusing on optimizing gondola display and messaging for increased appeal among young consumers.\n3: - Description of stimuli tested, including gondola concepts and messaging elements.\n4: - Overview of target audience, focusing on Gen Z and young Millennials in Germany.\n5: \n6: ### 2. Method Overview\n7: - Method type, sample size, and duration.\n8: - How stimuli were shown (e.g., spontaneous vs. aided, order of exposure).\n9: \n10: ### 3. Stimulus Materials\n11: - List of image stimulus materials shown to participants.\n12: - Include the name of each stimulus.\n13: - Note that images will be displayed in the final report.\n14: \n15: ### 4. Executive Summary\n16: - 3\u20135 key insights or takeaways related to optimizing gondola appeal.\n17: - Overall reactions to stimuli and implications for engagement and differentiation.\n18: - Ranking of which stimuli performed best among respondents.\n19: - Suggested next steps or recommendations for refining gondola elements.\n20: \n21: ### 5. Stimulus Reactions & Themes\n22: - **Concealer Module:** Overall response, comprehension, likes/dislikes, improvement ideas.\n23: - **Mascara Module:** Overall response, comprehension, likes/dislikes, improvement ideas.\n24: - **Blush Module:** Overall response, comprehension, likes/dislikes, improvement ideas.\n25: \n26: ### 6. Cross-Stimulus Insights\n27: - Comparison of how different gondola module ideas performed.\n28: - Patterns in visual and messaging preferences among young consumers.\n29: - Differences by audience subgroup, particularly between Gen Z and young Millennials.\n30: ### 7. 
Concept Evaluation & KPIs\n31: - Comparative analysis of the concepts to determine which performed better.\n32: - Key KPIs such as appeal, relevance, and motivation to support the final recommendation.\n33: - Clear rationale and evidence for why one concept was preferred.\n34: - Illustrate decision drivers and any notable consumer quotes supporting the evaluation.\n35: \n36: ### 8. Implications & Recommendations\n37: - What findings suggest for gondola concept and messaging refinement.\n38: - Recommendations for selecting the most engaging visuals and supporting claims for each module.\n39: - Guidance on layout and communication strategy to maximize in-store engagement.",
    "user_edit": [
        "Add a section titled 'Brand Recognition' after 'Concept Evaluation & KPIs' and before 'Implications & Recommendations'. Content: '- Summarise the key findings related to brand recognition.'\n- 'Include any relevant tables and quotes.' \n- 'End with 3\u20134 practical recommendations.'\n Leave existing sections unmodified. Renumber all section headers."
    ]
}""")

# Plain document text rebuilt from the numbered form: everything up to and
# including the first ": " on each line is dropped. partition() leaves the tail
# empty when the separator is absent, so such lines become empty lines.
ORIGINAL_CONTENT = "\n".join(
    numbered_line.partition(": ")[2]
    for numbered_line in INPUTS["numbered_document"].split("\n")
)


def _failed_run(run_index: int, error: str) -> dict:
    """Build the result record for a run that did not produce a usable edit."""
    return {
        "run": run_index,
        "success": False,
        "n_diffs": 0,
        "error": error,
    }


async def run_prompt_once(prompt, inputs: dict, run_index: int) -> dict:
    """Run the edit prompt once and report whether its output applies as an edit.

    The prompt is streamed through the shared JSON parser and only the last
    streamed chunk is kept. That chunk is handed to ``apply_content_update``
    with ``ORIGINAL_CONTENT`` as both the original and the previous content.

    Args:
        prompt: Prompt object exposing ``astream(params=..., chain=...)``.
        inputs: Parameters forwarded to the prompt.
        run_index: Label for this run, echoed back in the result.

    Returns:
        A dict with the keys ``run``, ``success``, ``n_diffs`` and ``error``.
        ``success`` is True only when the update changed the document;
        ``error`` is None on success and a non-empty description otherwise.
        Exceptions are never propagated; they are reported through ``error``.
    """
    try:
        # Use the same chain path as the real tool: prompt.astream(chain=_parser)
        parsed = None
        async for chunk in prompt.astream(params=inputs, chain=_parser):
            parsed = chunk

        if parsed is None:
            return _failed_run(run_index, "No output from stream")

        # Run the same function stream_artefact uses
        new_content, regions, _ = apply_content_update(
            update=parsed,
            original_content=ORIGINAL_CONTENT,
            previous_content=ORIGINAL_CONTENT,
            incremental_edit=True,
        )
    except Exception as exc:
        # Deliberately broad: a failing run must be recorded as a FAIL instead
        # of aborting the whole batch. The exception type is always included
        # because some exceptions stringify to "", which would otherwise leave
        # the failure without any visible reason in the report.
        message = str(exc)
        type_name = type(exc).__name__
        return _failed_run(run_index, f"{type_name}: {message}" if message else type_name)

    changed = new_content != ORIGINAL_CONTENT
    return {
        "run": run_index,
        "success": changed,
        "n_diffs": len(regions),
        "error": None if changed else "Diffs produced no change",
    }


async def main(n_runs: int = 10) -> int:
    """Run the prompt ``n_runs`` times concurrently and print a pass/fail report.

    Args:
        n_runs: Number of concurrent prompt runs; must be at least 1.

    Returns:
        Exit code for the process: 0 when every run passed, 1 when at least
        one run failed, 2 when ``n_runs`` is smaller than 1.
    """
    if n_runs < 1:
        # Checked before any work is done: zero runs would divide by zero in
        # the pass-rate line, and a negative count would print a meaningless rate.
        print(f"--runs must be at least 1 (got {n_runs})", file=sys.stderr)
        return 2

    load_env_files()

    # Shallow copy so the override below does not modify the module-level INPUTS.
    inputs = dict(INPUTS)
    if DIFF_FORMAT_INSTRUCTIONS is not None:
        # Prefer the instructions exported by the diff module when they exist.
        inputs["format_instructions"] = DIFF_FORMAT_INSTRUCTIONS
    print(f"  format_instructions length: {len(inputs['format_instructions'])} chars")
    print(f"  user_edit: {inputs['user_edit']}")
    print()

    prompt = get_tool_edit_artefact_prompt()

    print(f"Running {n_runs} times in parallel...")
    print()

    # Run labels are 1-based; run_prompt_once reports failures in its result
    # dict rather than raising, so gather() returns one record per run.
    tasks = [run_prompt_once(prompt, inputs, i + 1) for i in range(n_runs)]
    results = await asyncio.gather(*tasks)

    passed = sum(1 for r in results if r["success"])

    print("=" * 60)
    print("RESULTS")
    print("=" * 60)
    print()

    for r in results:
        status = "PASS" if r["success"] else "FAIL"
        error_info = f"  error={r['error']}" if r["error"] else ""
        print(f"  Run {r['run']:3d}: {status}  (diffs={r['n_diffs']}{error_info})")

    print()
    print("-" * 60)
    print(f"  Passed: {passed}/{n_runs} ({passed / n_runs:.0%})")
    print()

    return 0 if passed == n_runs else 1


if __name__ == "__main__":
    # Command-line entry point: read --runs, execute the check, and use the
    # value returned by main() as the process exit status.
    cli = argparse.ArgumentParser(
        description="Check edit_artefact prompt schema conformance",
    )
    cli.add_argument(
        "--runs",
        type=int,
        default=10,
        help="Number of parallel runs (default: 10)",
    )
    requested_runs = cli.parse_args().runs

    exit_code = asyncio.run(main(requested_runs))
    sys.exit(exit_code)
# No results found