@possibilities
Created January 21, 2026 01:59
gen-queries.py: Random Selection + Scaled Questions

Replaces LLM-based document selection with seeded random selection, scaling the number of generated questions with document size.

Constants

MIN_DOC_CHARS = 500
CHARS_PER_QUESTION = 500
MAX_QUESTIONS_PER_DOC = 8

New Functions

def calculate_question_count(content: str) -> int:
    """Scale questions by content length: 1 per 500 chars, max 8."""
    char_count = len(content)
    return min(char_count // CHARS_PER_QUESTION, MAX_QUESTIONS_PER_DOC)
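For instance, with the constants above, 1,200 characters yield 2 questions and the cap kicks in at 4,000 characters (4000 // 500 == 8):

>>> calculate_question_count("x" * 1200)
2
>>> calculate_question_count("x" * 20_000)  # capped at MAX_QUESTIONS_PER_DOC
8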


def select_docs_for_topic_random(
    docs: list[dict], 
    max_docs: int = MAX_DOCS_PER_TOPIC,
    seed: int | None = None,
) -> list[dict]:
    """Select random documents that meet size threshold."""
    import random
    
    # Filter by size floor
    eligible = [d for d in docs if len(d.get("content", "")) >= MIN_DOC_CHARS]
    
    if not eligible:
        return []
    
    # Reproducible random selection
    rng = random.Random(seed)
    return rng.sample(eligible, min(len(eligible), max_docs))
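A minimal usage sketch (the doc dicts and values here are made up for illustration):

sample_docs = [
    {"ordinal_id": "doc-001", "content": "x" * 300},   # under MIN_DOC_CHARS, never selected
    {"ordinal_id": "doc-002", "content": "y" * 1200},
    {"ordinal_id": "doc-003", "content": "z" * 5000},
]
picked = select_docs_for_topic_random(sample_docs, max_docs=2, seed=42)
# Same seed, same corpus -> same selection on every run.

Because rng.sample draws without replacement, no document is selected twice.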

Modified generate_questions()

def generate_questions(doc: dict, cli: str, max_retries: int = 2) -> list[dict]:
    """Ask LLM to generate test questions for a document."""
    content = doc["content"][:4000]  # 4000-char cap -> at most 4000 // CHARS_PER_QUESTION == 8 questions
    num_questions = calculate_question_count(content)
    
    # Scale the question type list based on count
    question_types = [
        "Factual - direct answer found in text",
        "Procedural - how to do something described in the doc",
        "Conceptual - requires understanding the 'why'",
        "Rephrased - uses synonyms/different terminology",
        "Troubleshooting - what if X goes wrong",
        "Edge case - less obvious use case",
        "Factual - another direct fact from text",
        "Procedural - a different how-to from the doc",
    ][:num_questions]
    
    type_list = "\n".join(f"{i+1}. {t}" for i, t in enumerate(question_types))

    prompt = f"""Generate {num_questions} test questions for evaluating semantic search retrieval quality.

Document (topic: {doc["topic"]}, ID: {doc["ordinal_id"]}):
---
{content}
---

Create {num_questions} diverse questions that this document can answer:
{type_list}"""

    schema = f"""interface Question {{
  question: string;
  difficulty: "easy" | "medium" | "hard";
}}
type Questions = Question[];  // array of {num_questions} questions"""

    # ... rest unchanged (retry logic, ask_llm_for_json call)
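As a worked example, a 1,500-character document gives num_questions = 3 (1500 // 500), so type_list keeps only the first three entries:

1. Factual - direct answer found in text
2. Procedural - how to do something described in the doc
3. Conceptual - requires understanding the 'why'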

Usage in main()

Replace the LLM selection block (lines 629-643) with:

elif len(all_docs) > MAX_DOCS_PER_TOPIC:
    # Random selection with size filter. Note: Python's built-in hash()
    # on strings is salted per process (PYTHONHASHSEED), so it is NOT
    # stable across runs; derive the seed from a stable digest instead.
    seed = zlib.crc32(topic.encode("utf-8"))  # requires `import zlib` at the top
    docs = select_docs_for_topic_random(all_docs, MAX_DOCS_PER_TOPIC, seed)
    print(f"  Random selection ({len(docs)} docs ≥{MIN_DOC_CHARS} chars)")
    print(f"  → {', '.join(d['ordinal_id'] for d in docs)}")
else:
    # Filter small docs even when under the max
    docs = [d for d in all_docs if len(d.get("content", "")) >= MIN_DOC_CHARS]

Summary of Changes

Component           Before             After
Document selection  LLM via pairctl    Random with size filter
Question count      Fixed at 8         1 per 500 chars, max 8
Size threshold      None               500-char minimum
Reproducibility     Non-deterministic  Seeded per-topic

Benefits

  • No LLM cost for selection phase
  • Faster execution (removes one pairctl round-trip per topic)
  • Reproducible across runs via stable per-topic seeds (see the smoke test below)
  • Adaptive question count matches document substance
  • Representative sampling of actual corpus distribution
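
A quick smoke test for the reproducibility claim (hypothetical; assumes all_docs and the constants above are in scope):

import zlib

seed = zlib.crc32("networking".encode("utf-8"))  # "networking" is a made-up topic name
run1 = select_docs_for_topic_random(all_docs, MAX_DOCS_PER_TOPIC, seed)
run2 = select_docs_for_topic_random(all_docs, MAX_DOCS_PER_TOPIC, seed)
assert [d["ordinal_id"] for d in run1] == [d["ordinal_id"] for d in run2]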