Replace LLM-based document selection with random selection, scaling questions by document size.
MIN_DOC_CHARS = 500        # documents shorter than this are ignored entirely
CHARS_PER_QUESTION = 500   # one question per this many characters of content
MAX_QUESTIONS_PER_DOC = 8  # hard cap regardless of document length


def calculate_question_count(content: str) -> int:
    """Scale questions by content length: 1 per 500 chars, max 8.

    Args:
        content: The document text used to size the question set.

    Returns:
        ``len(content) // CHARS_PER_QUESTION`` capped at
        ``MAX_QUESTIONS_PER_DOC``; 0 for content under 500 chars.
    """
    char_count = len(content)
    return min(char_count // CHARS_PER_QUESTION, MAX_QUESTIONS_PER_DOC)
def select_docs_for_topic_random(
    docs: list[dict],
    max_docs: int = MAX_DOCS_PER_TOPIC,
    seed: int | None = None,
) -> list[dict]:
    """Select random documents that meet size threshold.

    Args:
        docs: Candidate documents; each is read via its "content" key.
        max_docs: Upper bound on the number of documents returned.
        seed: Optional seed for reproducible selection; None gives a
            fresh nondeterministic RNG.

    Returns:
        Up to ``max_docs`` randomly sampled documents whose content is at
        least ``MIN_DOC_CHARS`` characters; [] when none qualify.
    """
    import random

    # Filter by size floor
    eligible = [d for d in docs if len(d.get("content", "")) >= MIN_DOC_CHARS]
    if not eligible:
        return []
    # Reproducible random selection
    rng = random.Random(seed)
    return rng.sample(eligible, min(len(eligible), max_docs))


def generate_questions(doc: dict, cli: str, max_retries: int = 2) -> list[dict]:
    """Ask LLM to generate test questions for a document."""
    # Truncate so the prompt stays within a safe context budget.
    content = doc["content"][:4000]
    num_questions = calculate_question_count(content)
    # Scale the question type list based on count
    question_types = [
        "Factual - direct answer found in text",
        "Procedural - how to do something described in the doc",
        "Conceptual - requires understanding the 'why'",
        "Rephrased - uses synonyms/different terminology",
        "Troubleshooting - what if X goes wrong",
        "Edge case - less obvious use case",
        "Factual - another direct fact from text",
        "Procedural - a different how-to from the doc",
    ][:num_questions]
    type_list = "\n".join(f"{i+1}. {t}" for i, t in enumerate(question_types))
    prompt = f"""Generate {num_questions} test questions for evaluating semantic search retrieval quality.
Document (topic: {doc["topic"]}, ID: {doc["ordinal_id"]}):
---
{content}
---
Create {num_questions} diverse questions that this document can answer:
{type_list}"""
    # TypeScript-style schema handed to the LLM to constrain its JSON output.
    schema = f"""interface Question {{
question: string;
difficulty: "easy" | "medium" | "hard";
}}
type Questions = Question[]; // array of {num_questions} questions"""
    # ... rest unchanged (retry logic, ask_llm_for_json call)

Replace the LLM selection block (lines 629-643) with:
elif len(all_docs) > MAX_DOCS_PER_TOPIC:
# Random selection with size filter
seed = hash(topic) # deterministic per-topic
docs = select_docs_for_topic_random(all_docs, MAX_DOCS_PER_TOPIC, seed)
print(f" Random selection ({len(docs)} docs ≥{MIN_DOC_CHARS} chars)")
print(f" → {', '.join(d['ordinal_id'] for d in docs)}")
else:
# Filter small docs even when under max
    docs = [d for d in all_docs if len(d.get("content", "")) >= MIN_DOC_CHARS]

| Component | Before | After |
|---|---|---|
| Document selection | LLM via pairctl | Random with size filter |
| Question count | Fixed at 8 | 1 per 500 chars, max 8 |
| Size threshold | None | 500 char minimum |
| Reproducibility | Non-deterministic | Seeded per-topic |
- No LLM cost for selection phase
- Faster execution (removes one pairctl round-trip per topic)
- Reproducible with deterministic seeds
- Adaptive question count matches document substance
- Representative sampling of actual corpus distribution