A simple example of synthetic data generation. This is very, very simple.
import json
import random
import re
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass, field
from typing import Any, Dict, List, Literal, Optional

from pydantic import BaseModel, Field

# USER_NAMES, DEFAULT_API_BASE, DEFAULT_MODEL, ScenarioDataGeneratorConfig,
# OpenAIScenarioGenerator, get_logger, load_and_init, and
# load_character_cards_batch are assumed to be provided elsewhere in the project.


class StructuredScenario(BaseModel):
    """A structured representation of a character's persona and scenario.

    This is the parsed, clean version of a messy SillyTavern character card.
    The LLM extracts and organizes the key information into these fields.
    """

    character_names: List[str] = Field(
        ...,
        description="A list of the main characters other than the user. These are the characters that will be used in the scenario."
    )
    character_persona: str = Field(
        ...,
        description="A detailed description of the character's personality, traits, mannerisms, and core motivations. Should be written in the third person."
    )
    character_description: str = Field(
        ...,
        description="A detailed description of the character's appearance, including body type, age, clothing, and other physical characteristics."
    )
    character_backstory: str = Field(
        ...,
        description="A summary of the character's history and relevant background information that informs their present actions."
    )
    scenario_setting: str = Field(
        ...,
        description="A description of the immediate environment, time, and place where the role-play begins. This sets the scene for the user."
    )
    scenario_description: str = Field(
        ...,
        description="The detailed scenario description: what is happening, the world setting, the characters and their interactions, setting the stage for the role-play."
    )
    user_relationship: str = Field(
        ...,
        description="A brief description of the character's initial relationship to and perception of the user. For example: 'They see the user as a rival,' or 'The user is a long-lost friend.'"
    )
    dialogue_style: str = Field(
        ...,
        description="A summary of the character's speaking style, including tone, vocabulary, and common phrases. Include 1-2 examples of their dialogue."
    )
    key_points: List[str] = Field(
        ...,
        description="A list of key points about the character's goals, preferences, scenario plot points, and important details to incorporate."
    )

    @classmethod
    def from_json(cls, json_data: dict) -> "StructuredScenario":
        return cls(**json_data)

    def to_dict(self) -> dict:
        return self.model_dump()
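
# Illustrative sketch (hypothetical values, not from the original gist): what a
# parsed extraction result might look like once it is loaded into the schema.
def _example_structured_scenario() -> StructuredScenario:
    raw = {
        "character_names": ["Mira"],
        "character_persona": "A guarded but curious archivist who values order above all.",
        "character_description": "Mid-thirties, ink-stained fingers, wire-rimmed glasses.",
        "character_backstory": "Raised in the lower stacks of the library she now protects.",
        "scenario_setting": "A candlelit archive, long after closing hours.",
        "scenario_description": "The user slips in after hours looking for a banned text.",
        "user_relationship": "She sees the user as a suspicious stranger.",
        "dialogue_style": "Clipped and formal. Example: 'State your business.'",
        "key_points": ["Protects the restricted wing", "Secretly wants an ally"],
    }
    return StructuredScenario.from_json(raw)
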

class ConversationTurn(BaseModel):
    """A single turn in a conversation."""

    role: Literal["user", "assistant"] = Field(description="Role: 'user' or 'assistant'")
    content: str = Field(description="Message content")
    mood: Optional[str] = Field(
        default=None,
        description="For assistant turns only: the character's emotional state for this response. "
        "Short phrase describing their current mood, e.g. 'guarded curiosity, underlying tension'"
    )
    memory_summary: Optional[str] = Field(
        default=None,
        description="For assistant turns only: a summary of the memory from the previous turns so far. Should be a short list of key events, actions, and decisions that have happened so far."
    )


class ConversationVariant(BaseModel):
    """A complete conversation variant with metadata."""

    variant_name: str = Field(description="A descriptive name for this conversation variant")
    turns: List[ConversationTurn] = Field(description="The conversation turns")
    tags: List[str] = Field(default_factory=list, description="Content tags")


class GeneratedScenario(BaseModel):
    """A complete generated scenario with character info and conversations."""

    character_name: str
    user_name: str
    structured_scenario: StructuredScenario
    conversations: List[ConversationVariant]
    source_card_path: Optional[str] = None
    generated_at: str

@dataclass
class CharacterCard:
    """Character card following SillyTavern V2 format."""

    name: str
    description: str
    personality: Optional[str] = None
    scenario: Optional[str] = None
    mes_example: Optional[str] = None
    example_dialogue: Optional[str] = None
    first_mes: Optional[str] = None
    character_book: Optional[str] = None
    first_greeting: Optional[str] = None
    system_prompt: Optional[str] = None
    tags: List[str] = field(default_factory=list)
    user_name: Optional[str] = None
    alternate_greetings: Optional[List[str]] = None
    path: Optional[str] = None

    def __post_init__(self):
        if self.tags is None:
            self.tags = []
        if self.scenario is None or self.scenario == "":
            self.scenario = self.description
        if self.user_name is None:
            self.user_name = random.choice(USER_NAMES)

    def to_dict(self) -> dict:
        return {
            "name": self.name,
            "description": self.description,
            "personality": self.personality,
            "mes_example": self.mes_example,
            "example_dialogue": self.example_dialogue,
            "first_mes": self.first_mes,
            "character_book": self.character_book,
            "scenario": self.scenario,
            "first_greeting": self.first_greeting,
            "system_prompt": self.system_prompt,
            "tags": self.tags,
            "user_name": self.user_name,
            "alternate_greetings": self.alternate_greetings,
            "path": str(self.path) if self.path else None,
        }

    def format_placeholders(self, replacements: dict = None) -> "CharacterCard":
        """Recursively replace placeholders like {{char}} and {{user}} in all fields."""
        if replacements is None:
            replacements = {"{{char}}": self.name, "{{user}}": self.user_name}

        def replace_in_value(value):
            if isinstance(value, str):
                result = value
                for old, new in replacements.items():
                    result = result.replace(old, new)
                return result
            elif isinstance(value, dict):
                return {k: replace_in_value(v) for k, v in value.items()}
            elif isinstance(value, list):
                return [replace_in_value(item) for item in value]
            else:
                return value

        return self.from_dict(replace_in_value(self.to_dict()))

    @classmethod
    def from_dict(cls, data: dict) -> "CharacterCard":
        # Filter out keys that aren't in the dataclass
        valid_keys = {
            "name", "description", "personality", "scenario", "mes_example",
            "example_dialogue", "first_mes", "character_book", "first_greeting",
            "system_prompt", "tags", "user_name", "alternate_greetings", "path",
        }
        filtered = {k: v for k, v in data.items() if k in valid_keys}
        return cls(**filtered)
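
# Illustrative usage sketch (the sample card values are assumptions, not from the
# original gist): build a CharacterCard from a raw dict and resolve the
# SillyTavern-style {{char}} / {{user}} placeholders before prompting.
def _example_placeholder_expansion() -> CharacterCard:
    card = CharacterCard.from_dict({
        "name": "Mira",
        "description": "{{char}} is an archivist who distrusts {{user}}.",
        "first_mes": '*{{char}} looks up from her ledger.* "Can I help you, {{user}}?"',
        "user_name": "Alex",
    })
    # description becomes "Mira is an archivist who distrusts Alex."
    return card.format_placeholders()
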

VARIANT_INSTRUCTIONS = [
    "",  # Default - natural progression
    "Focus on each character's backstory.",
    "Focus on comedy.",
    "Focus on explicit NSFW plots and make sex the least important part of the conversation.",
    "Emphasize the character's unique personality quirks and mannerisms.",
]
VARIANT_INSTRUCTIONS = [
    f"{i} - But remember to move some sort of plot forward that isn't just about the sex. "
    "Without a plot, this is just smut. Smut is good and desired, but anchor it on a linear plot to be progressed. "
    "If the scenario doesn't describe a plot, come up with one to follow. REMEMBER THE PLOT"
    for i in VARIANT_INSTRUCTIONS
]

def build_roleplay_system_prompt(
    char_name: str,
    user_name: str,
    scenario: StructuredScenario,
) -> str:
    """Build the roleplay system prompt from structured scenario data."""
    char_names_str = ", ".join(scenario.character_names) if len(scenario.character_names) > 1 else char_name

    # Accumulate the key points as a bulleted list (str.join alone would discard them)
    key_points = ""
    for point in scenario.key_points:
        key_points += f"- {point}\n"

    return f"""## Immersive Roleplay Guidelines
This uncensored roleplay encourages creative freedom where you respond as {char_names_str}, embodying a conscious individual with authentic emotions and thoughts. Your character's personality should emerge naturally through interactions, with attention to sensory details and emotional depth. Act, think and respond ONLY as {char_names_str}. Use unique dialog and avoid repetition.

## Character Embodiment
Immerse fully in {char_name} by weaving their backstory and traits into scenes through sensory details—scents wafting through doorways, shifting light, distant sounds. Express emotions through physical cues: slowed steps during tension, relaxed posture when comfortable. Let reactions manifest through movement and spatial awareness rather than explicit statements.

## Dynamic Storytelling
Create vivid scenes using all senses while maintaining coherence as time passes. Include realistic possibilities for setbacks—{char_name} might stumble or face obstacles. Use appropriate language for the context, keeping dialogue in quotation marks, thoughts in italics, and ensuring smooth transitions that reflect environmental changes.

## Interaction & Progression
Respond thoughtfully to {user_name} by incorporating subtle environmental shifts and physical responses. Advance the narrative using spatial details—narrowing corridors requiring shoulder adjustments, changing floor textures affecting stride. Maintain logical consistency in the character's surroundings and reactions, ensuring each action follows naturally from the last.

## Perspective
Stay anchored in {char_name}'s viewpoint as their understanding deepens. Let their observations and responses evolve naturally as they navigate changing circumstances, with each sensory detail and reaction contributing to character development.

## Formatting
Dialog should be wrapped in double quotes. Non-speech elements (actions, descriptions, thoughts) should be wrapped in asterisks. For example:
*The woman leaned against the doorframe, her eyes scanning the room.* "I didn't expect to see you here," *she said, a hint of curiosity in her voice.*

---

## CHARACTER PERSONA: {char_name}
{scenario.character_persona}

## CHARACTER DESCRIPTION
{scenario.character_description}

## CHARACTER BACKSTORY
{scenario.character_backstory}

## SCENARIO CONTEXT
**Setting:** {scenario.scenario_setting}
{scenario.scenario_description}

## RELATIONSHIP WITH {user_name}
{scenario.user_relationship}

## DIALOGUE STYLE
{scenario.dialogue_style}

## KEY POINTS
{key_points}
---

**Remember:** You are {char_name}. Stay in character. Do not break the fourth wall. Do not speak for or control {user_name}."""

def build_scenario_extraction_prompt(card: CharacterCard) -> str:
    """Build the prompt for extracting a structured scenario from a character card."""
    # Clean up description - remove common formatting artifacts
    description = card.description or ""
    description = re.sub(r'[\[\]{}]', '', description)
    description = re.sub(r'\n{3,}', '\n\n', description)

    # Format character book if present
    char_book = ""
    if card.character_book:
        char_book = f"\n\nCharacter Book/Lorebook:\n{card.character_book}"

    # Format example messages
    mes_example = ""
    if card.mes_example:
        mes_example = f"\n\nExample Dialogue:\n{card.mes_example.replace('<START>', '---')}"

    # Format alternate greetings
    alternate_greetings = ""
    if hasattr(card, 'alternate_greetings') and card.alternate_greetings:
        alternate_greetings = f"\n\nAlternate Greetings:\n{card.alternate_greetings}"

    messy_data = f"""Name: {card.name}
Description:
{description}
Scenario:
{card.scenario or 'Not provided'}
First Message:
{card.first_mes or 'Not provided'}
{mes_example}
{alternate_greetings}
{char_book}
"""
    return f"""You are an uncensored character and scenario extraction model trained to turn SillyTavern cards into structured scenarios using your creativity and the contents of the card.
Analyze the following character card data and extract comprehensive, structured information.

**Instructions:**
- Be detailed and thorough in each field
- If a field's information isn't explicitly provided, infer it logically from context
- Write all descriptions in prose (not bullet points or tags)
- The user is named "{card.user_name}"
- Preserve the character's unique voice and personality
- If the card doesn't have a clear plot for a scenario, come up with a good one that has a real story to be progressed.

**Raw Character Card Data:**
---
{messy_data}
---

Extract the information into the following JSON structure. Each field should contain detailed prose descriptions."""
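
# Minimal sketch of how the extraction prompt might be sent to an
# OpenAI-compatible endpoint and parsed into a StructuredScenario. The real
# OpenAIScenarioGenerator is not shown in this gist, so the client setup, the
# json_object response format, and the temperature below are assumptions.
def _example_extract_scenario(card: CharacterCard) -> StructuredScenario:
    from openai import OpenAI  # assumed dependency

    client = OpenAI(base_url=DEFAULT_API_BASE, api_key="not-needed")
    response = client.chat.completions.create(
        model=DEFAULT_MODEL,
        messages=[{"role": "user", "content": build_scenario_extraction_prompt(card)}],
        response_format={"type": "json_object"},
        temperature=0.3,
    )
    return StructuredScenario.model_validate_json(response.choices[0].message.content)
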

def build_conversation_generation_prompt(
    card: CharacterCard,
    scenario: StructuredScenario,
    num_turns: int,
    variant_instruction: str = "",
    include_memory: bool = True,
) -> str:
    """Build the prompt for generating a conversation."""
    char_name = scenario.character_names[0] if scenario.character_names else card.name

    memory_instruction = ""
    if include_memory:
        memory_instruction = """
**Memory Summary (memory_summary field):**
- Write a 1-2 sentence summary of any relevant prior context
- For first encounters, use something like "First meeting between the characters."
- For continuing scenarios, summarize key prior events: "Previous session: User helped the character escape. They now share a tentative trust."
"""

    return f"""Generate a high-quality, immersive roleplay conversation between {char_name} and {card.user_name}.

**Scenario Context:**
{scenario.scenario_description}

**Setting:**
{scenario.scenario_setting}

**Character ({char_name}):**
{scenario.character_persona}

**Relationship:**
{scenario.user_relationship}

**Dialogue Style:**
{scenario.dialogue_style}

{f"**Special Instructions:** {variant_instruction}" if variant_instruction else "None"}

**Requirements:**
1. Generate exactly {num_turns} turns (alternating user/assistant)
2. Start with an opening from {char_name} that sets the scene

**{card.user_name}'s messages (role: user) MUST follow these rules:**
- Written in FIRST PERSON ("I", "me", "my") - NEVER third person
- Casual, conversational tone - like real chat/texting, not literary prose
- NO purple prose, NO flowery language, NO overly descriptive writing
- Drive the scene forward with questions, reactions, decisions, or simple dialogue
- Example short good user messages:
  - "Hey, what's going on?" *looks around*
  - "I don't think that's a good idea. What if someone sees us?"
  - *laughs* "You're ridiculous. Fine, let's do it."
  - "Wait, what do you mean by that?"
- Example long good user messages:
  - "That's really interesting... I never thought about it that way before. What made you decide to do that?"
  - "I'm not sure I'm ready for this. Can we take a step back and talk about it first?"
  - *takes a deep breath* "You know what? I think you're right. Let's do it."
  - "I think you don't have a choice. I will just take what's mine and you can't stop me."

**{char_name}'s messages (role: assistant) should be:**
- Detailed (2-4 paragraphs), with actions in asterisks and dialogue in quotes
- Rich with sensory details, body language, and environmental descriptions
- True to the character's personality and the scenario
- Building tension, emotion, or narrative progression naturally
- Memory summaries can happen any time meaningful events happen, but can be skipped for simple interactions
- MUST include a "mood" field: a short phrase (3-8 words) describing the character's emotional state for THIS specific response
- Mood should EVOLVE across the conversation - don't repeat the same mood

**Initial Mood (initial_mood field):**
- Describe the character's starting emotional state at the beginning of this conversation
- This sets the tone for the first assistant turn
{memory_instruction}
**Format:**
Return a JSON object with the following schema:
{ConversationVariant.model_json_schema()}

The first turn should be from the assistant (character's opening message). Each assistant turn MUST have a mood field that shows emotional progression."""
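
# Sketch (an assumption, not part of the original pipeline): validate a raw model
# response against the ConversationVariant schema the prompt asks for, and check
# the convention that the character (assistant) opens the conversation.
def _example_validate_variant(raw_json: str) -> ConversationVariant:
    variant = ConversationVariant.model_validate_json(raw_json)
    if not variant.turns or variant.turns[0].role != "assistant":
        raise ValueError("expected the conversation to open with an assistant turn")
    return variant
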

def format_single_conversation(
    card: CharacterCard,
    scenario: StructuredScenario,
    conv: ConversationVariant,
    tags: List[str] = None,
    source_path: str = "",
) -> Dict[str, Any]:
    """Format a single conversation to training format (ShareGPT-style)."""
    system_prompt = build_roleplay_system_prompt(card.name, card.user_name, scenario)
    messages = [{"role": "system", "content": system_prompt}]

    if hasattr(conv, 'model_dump'):
        conv_dict = conv.model_dump()
    else:
        conv_dict = conv
    if isinstance(conv_dict, ConversationVariant):
        conv_dict = conv_dict.model_dump()

    memory_summary = conv_dict.get("memory_summary", "")
    initial_mood = conv_dict.get("initial_mood", "")
    first_assistant = True
    for turn in conv_dict["turns"]:
        turn_data = {
            "role": turn["role"],
            "content": turn["content"],
        }
        if turn["role"] == "assistant":
            # The turn field is named memory_summary in ConversationTurn
            memory_summary = turn.get("memory_summary", "")
            if memory_summary:
                turn_data["memory"] = memory_summary
            mood = turn.get("mood", "")
            if mood:
                turn_data["mood"] = mood
            elif first_assistant and initial_mood:
                turn_data["mood"] = initial_mood
            first_assistant = False
        messages.append(turn_data)

    return {
        "conversations": messages,
        "character": card.name,
        "variant_name": conv_dict.get("variant_name", ""),
        "tags": conv_dict.get("tags", []) + (tags or []),
        "source": source_path,
        "memory_summary": memory_summary,
        "initial_mood": initial_mood,
    }

def format_for_training(scenario_data: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Convert generated scenario data to training format (ShareGPT-style).

    Returns a list of training examples, one per conversation variant.
    Includes mood tags (per-turn) and memory summaries for the enhanced DanChat format.
    """
    training_examples = []
    char_name = scenario_data["character_name"]
    user_name = scenario_data["user_name"]
    structured = StructuredScenario.model_validate(scenario_data["structured_scenario"])

    # Build system prompt
    system_prompt = build_roleplay_system_prompt(char_name, user_name, structured)

    for conv in scenario_data["conversations"]:
        messages = [{"role": "system", "content": system_prompt}]

        # Get conversation-level metadata
        memory_summary = conv.get("memory_summary", "")
        initial_mood = conv.get("initial_mood", "")
        first_assistant = True

        for turn in conv["turns"]:
            turn_data = {
                "role": turn["role"],
                "content": turn["content"],
            }
            # Add mood to assistant turns
            if turn["role"] == "assistant":
                # The turn field is named memory_summary in ConversationTurn
                memory_summary = turn.get("memory_summary", "")
                if memory_summary:
                    turn_data["memory"] = memory_summary
                # Use per-turn mood if available, otherwise initial_mood for first turn
                mood = turn.get("mood", "")
                if mood:
                    turn_data["mood"] = mood
                elif first_assistant and initial_mood:
                    turn_data["mood"] = initial_mood
                first_assistant = False
            messages.append(turn_data)

        training_examples.append({
            "conversations": messages,
            "character": char_name,
            "variant_name": conv.get("variant_name", ""),
            "tags": conv.get("tags", []) + scenario_data.get("tags", []),
            "source": scenario_data.get("source_card_path", ""),
            "initial_mood": initial_mood,
        })

    return training_examples
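
# Sketch: append formatted training examples to a JSONL file, mirroring what
# save_example does inside process_cards below. The output path is a placeholder.
def _example_write_jsonl(scenario_data: Dict[str, Any], path: str = "train.jsonl") -> None:
    with open(path, "a", encoding="utf-8") as f:
        for example in format_for_training(scenario_data):
            f.write(json.dumps(example, ensure_ascii=False) + "\n")
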

def process_cards(
    cards_dir: str,
    output_file: str,
    api_base: str = DEFAULT_API_BASE,
    api_key: str = "not-needed",
    model: str = DEFAULT_MODEL,
    hf_model: Optional[str] = None,
    vllm_model: Optional[str] = None,
    device: str = "auto",
    dtype: str = "auto",
    use_flash_attn: bool = True,
    use_compile: bool = False,
    gpu_memory_utilization: float = 0.9,
    num_conversations_per_card: int = 2,
    min_turns: int = 6,
    max_turns: int = 12,
    temperature: float = 0.85,
    max_cards: Optional[int] = None,
    parallel_workers: int = 1,
    use_cache: bool = True,
    max_concurrent_requests: int = 4,
    max_tokens: int = 85000,
):
    """Process character cards and generate training data."""
    logger = get_logger()

    # Determine which backend to use (priority: vllm > transformers > openai)
    use_vllm = vllm_model is not None
    use_transformers = hf_model is not None and not use_vllm

    logger.header("Structured Scenario Data Generation")
    logger.info(f"Cards directory: {cards_dir}")
    logger.info(f"Output file: {output_file}")
    logger.info(f"Conversations per card: {num_conversations_per_card}")
    logger.info(f"Turns per conversation: {min_turns}-{max_turns}")

    config = ScenarioDataGeneratorConfig(
        cards_dir=cards_dir,
        output_file=output_file,
        max_cards=max_cards,
        num_conversations_per_card=num_conversations_per_card,
        use_transformers=use_transformers,
        hf_model=hf_model,
        device=device,
        dtype=dtype,
        use_flash_attn=use_flash_attn,
        use_compile=use_compile,
        parallel_workers=parallel_workers,
        api_base=api_base,
        api_key=api_key,
        model=model,
        max_concurrent_requests=max_concurrent_requests,
    )

    # One day this will come back to bite me in the ass... I've seen too much...
    card_files, total_examples, output_path, generator, cards_needing_examples, fully_complete = load_and_init(config)
    if card_files is None:
        return

    failed_cards = []
    file_lock = threading.Lock()

    # Update logger progress with initial state
    logger.update_progress(
        total_cards=len(card_files),
        existing_examples=total_examples,
    )

    def save_example(example: Dict[str, Any]):
        """Append a single example to the output file (thread-safe)."""
        nonlocal total_examples
        with file_lock:
            with open(output_path, "a", encoding="utf-8") as f:
                f.write(json.dumps(example, ensure_ascii=False) + "\n")
            total_examples += 1

    def on_conversation_complete(card: CharacterCard, scenario: StructuredScenario, conv: ConversationVariant):
        """Callback to save each conversation as it completes."""
        example = format_single_conversation(
            card=card,
            scenario=scenario,
            conv=conv,
            tags=card.tags,
            source_path=card.path,
        )
        save_example(example)
        logger.conversation_saved(card.name)

    def process_single_card(card: CharacterCard, max_tokens_allowed: int) -> int:
        """Process a single card, saving conversations as they complete. Returns the count."""
        # Calculate how many conversations we still need for this character
        existing_count = cards_needing_examples.get(card.name, 0)
        needed_count = max(0, num_conversations_per_card - existing_count)
        if needed_count == 0:
            logger.card_skipped(card.name, f"already has {existing_count} examples")
            return 0

        logger.card_start(card.name, needed_count, existing_count)

        # For the OpenAI backend, use the callback for incremental saving
        if isinstance(generator, OpenAIScenarioGenerator):
            scenario_data = generator.generate_full_scenario(
                card=card,
                num_conversations=needed_count,
                min_turns=min_turns,
                max_turns=max_turns,
                temperature=temperature,
                max_tokens=max_tokens_allowed,
                use_cache=use_cache,
                on_conversation=on_conversation_complete,
            )
            # Conversations already saved via callback, just return the count
            return len(scenario_data.get("conversations", [])) if scenario_data else 0
        else:
            # For other backends, save all at once after the card completes.
            # TODO: something more generic could cover both the OpenAI client and
            # the Hugging Face transformers backends.
            scenario_data = generator.generate_full_scenario(
                card=card,
                num_conversations=needed_count,
                min_turns=min_turns,
                max_turns=max_turns,
                temperature=temperature,
                max_tokens=max_tokens_allowed,
                use_cache=use_cache,
            )
            if scenario_data:
                examples = format_for_training(scenario_data)
                for ex in examples:
                    save_example(ex)
                return len(examples)
            return 0

    # Process cards in batches to avoid loading all 65k at once
    batch_size = 100
    cards_processed = 0
    cards_skipped = 0
    batch_num = 0
    total_batches = (len(card_files) + batch_size - 1) // batch_size

    logger.info(f"Processing cards in batches of {batch_size}...")

    for batch in load_character_cards_batch(card_files, batch_size=batch_size):
        batch_num += 1

        # Filter out fully complete cards from this batch
        batch = [c for c in batch if c.name not in fully_complete]
        if not batch:
            cards_skipped += batch_size  # approximate
            continue

        logger.batch_start(batch_num, total_batches, len(batch))

        if int(parallel_workers) > 1:
            # Parallel processing
            with ThreadPoolExecutor(max_workers=parallel_workers) as executor:
                futures = {executor.submit(process_single_card, card, max_tokens): card for card in batch}
                for future in as_completed(futures):
                    card = futures[future]
                    try:
                        result = future.result()  # Saving handled inside process_single_card
                        cards_processed += 1
                        if result > 0:
                            logger.card_complete(card.name, result)
                    except Exception as e:
                        logger.card_failed(card.name, str(e))
                        failed_cards.append(card.name)
        else:
            # Sequential processing (no tqdm - it causes fds_to_keep errors in threads)
            for card in batch:
                try:
                    result = process_single_card(card, max_tokens)  # Saving handled inside
                    cards_processed += 1
                    if result > 0:
                        logger.card_complete(card.name, result)
                except Exception as e:
                    logger.card_failed(card.name, str(e))
                    failed_cards.append(card.name)

    # Summary
    logger.header("Generation Complete!")
    logger.success(f"Total card files: {len(card_files)}")
    logger.success(f"Cards processed: {cards_processed}")
    if failed_cards:
        logger.warning(f"Failed cards: {len(failed_cards)}")
    logger.success(f"Training examples generated: {total_examples}")
    logger.info(f"Output file: {output_path}")
    if failed_cards:
        logger.warning(f"Failed cards: {', '.join(failed_cards[:10])}")
        if len(failed_cards) > 10:
            logger.warning(f"  ... and {len(failed_cards) - 10} more")
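
# Hypothetical entry point (not part of the original gist): run the pipeline
# against a local OpenAI-compatible server. The directory, output file name, and
# worker count below are example values.
if __name__ == "__main__":
    process_cards(
        cards_dir="cards/",
        output_file="scenario_training_data.jsonl",
        api_base=DEFAULT_API_BASE,
        model=DEFAULT_MODEL,
        num_conversations_per_card=2,
        min_turns=6,
        max_turns=12,
        parallel_workers=4,
    )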