# darinkishore/investigate.py

## investigate.py
import json
import os
from datetime import datetime
from pathlib import Path

import dspy


# Signature for the investigation task: a dict of files plus a query go in;
# an answer, supporting snippets and referenced file paths come out.
# NOTE(review): the class docstring and the `desc` strings below are presumably
# consumed by dspy as prompt text. Treat wording changes (including the
# "of of" and "understand and thoroughly understand" slips) as behavior
# changes and confirm against the dspy documentation before editing them.
class AnalyzeAndUnderstand(dspy.Signature):
    """
    Given a dictionary of files and a query, understand and thoroughly understand the files in order to answer the query.
    Be proactive and make sure that no information is missing or omitted from the final answer.
    """

    # Input: investigate() passes its `file_tree` argument here; run_question()
    # builds that dict with load_folder(DATA_DIR).
    directory: dict = dspy.InputField(
        desc='A folder on the filesystem. A high quality set of of jujutsu vcs blog posts, official documentation, and books.'
    )
    # Input: the question text to answer.
    query: str = dspy.InputField(desc='The question to answer.')
    # Output: the final answer text.
    answer: str = dspy.OutputField(
        desc='Comprehensive answer to the query based on exploration of the files in the directory'
    )
    # Output: text excerpts supporting the answer.
    snippets: list[str] = dspy.OutputField(desc='Relevant text snippets')
    # Output: paths of the files the answer refers to.
    file_refs: list[str] = dspy.OutputField(desc='Referenced file paths')


# Module-level language-model setup; runs at import time.
# The API key is read with os.environ[...], so importing this module raises
# KeyError when CEREBRAS_API_KEY is not set.
# NOTE(review): `disable_reasoning` and `clear_thinking` are not explained in
# this file; presumably they are provider-specific options forwarded with the
# request. Confirm against the dspy.LM and Cerebras API documentation.
lm = dspy.LM(
    'cerebras/zai-glm-4.7',
    api_key=os.environ['CEREBRAS_API_KEY'],
    api_base='https://api.cerebras.ai/v1',
    temperature=1.0,
    top_p=0.95,
    max_tokens=40000,
    disable_reasoning=False,
    clear_thinking=False,
)
# Install `lm` as the model used by dspy modules created in this process.
dspy.configure(lm=lm)

# Relative paths: all three resolve against the current working directory.
RUNS_DIR = Path('runs')  # run_question() writes one JSON file per run under RUNS_DIR/<qid>/
QUESTIONS_FILE = Path('questions.json')  # read by load_questions()
DATA_DIR = Path('data')  # folder loaded by run_question() via load_folder()


def load_folder(folder_path: Path | str, extensions: list[str] = None) -> dict:
    """Load folder into nested dict. Keys are relative paths, values are file contents."""
    extensions = extensions or ['.md', '.txt', '.py']
    root = Path(folder_path)
    tree = {}
    for path in root.rglob('*'):
        if path.is_file() and path.suffix in extensions:
            rel_path = str(path.relative_to(root))
            tree[rel_path] = path.read_text(errors='ignore')
    return tree


def investigate(file_tree: dict, query: str, max_iterations: int = 30):
    """Answer ``query`` over ``file_tree`` with a dspy.RLM built on AnalyzeAndUnderstand.

    Args:
        file_tree: Dict of files, passed to the signature's ``directory`` input.
        query: The question, passed to the signature's ``query`` input.
        max_iterations: Forwarded to ``dspy.RLM`` as ``max_iterations``.

    Returns:
        A dict with the keys ``answer``, ``snippets``, ``file_refs`` and
        ``trajectory``, each copied from the attribute of the same name on
        the RLM result.
    """
    settings = {
        'max_iterations': max_iterations,
        'max_llm_calls': 60,
        'verbose': True,
    }
    explorer = dspy.RLM(AnalyzeAndUnderstand, **settings)
    prediction = explorer(directory=file_tree, query=query)

    exported = ('answer', 'snippets', 'file_refs', 'trajectory')
    return {field: getattr(prediction, field) for field in exported}


def load_questions() -> dict:
    """Read QUESTIONS_FILE and return the decoded JSON document.

    Returns:
        The parsed JSON content. The other helpers in this module index it
        with ``data['questions']``.

    Raises:
        FileNotFoundError: If QUESTIONS_FILE does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default encoding.
    with open(QUESTIONS_FILE, encoding='utf-8') as f:
        return json.load(f)


def get_question(qid: str) -> dict | None:
    """Find the question whose ``id`` equals ``qid``.

    Returns:
        The first matching question's fields plus a ``category`` key taken
        from its enclosing category, or ``None`` when no question matches.
    """
    categories = load_questions()['questions']
    matches = (
        {'category': group['category'], **entry}
        for group in categories
        for entry in group['questions']
        if entry['id'] == qid
    )
    # The generator is lazy, so scanning stops at the first hit.
    return next(matches, None)


def list_questions() -> list[dict]:
    """Return every question as a flat list, in file order.

    Each item holds the question's own fields plus a ``category`` key taken
    from the category that contains it.
    """
    flattened = []
    for group in load_questions()['questions']:
        for entry in group['questions']:
            flattened.append({'category': group['category'], **entry})
    return flattened


def run_question(qid: str, max_iterations: int = 30) -> dict:
    """Run one question end to end and save the result as JSON.

    Looks the question up by id, loads DATA_DIR, runs ``investigate`` on it,
    and writes the result to ``RUNS_DIR/<qid>/<timestamp>.json``.

    Args:
        qid: Id of the question to run.
        max_iterations: Forwarded to ``investigate`` and recorded in the
            output's ``config``.

    Returns:
        The dict that was written to disk.

    Raises:
        ValueError: If no question with id ``qid`` exists.
    """
    q = get_question(qid)
    if not q:
        raise ValueError(f'Question {qid} not found')

    tree = load_folder(DATA_DIR)
    print(f'Loaded {len(tree)} files')
    print(f'Running: {q["id"]} - {q["question"][:60]}...')

    result = investigate(tree, q['question'], max_iterations=max_iterations)

    # Take the clock once so the stored timestamp and the file name agree.
    finished_at = datetime.now()

    output = {
        'id': q['id'],
        'category': q['category'],
        'query': q['question'],
        'timestamp': finished_at.isoformat(),
        'answer': result['answer'],
        'snippets': result['snippets'],
        'file_refs': result['file_refs'],
        'trajectory': result['trajectory'],
        'config': {'model': 'cerebras/zai-glm-4.7', 'max_iterations': max_iterations},
    }

    out_dir = RUNS_DIR / qid
    out_dir.mkdir(parents=True, exist_ok=True)
    # Colons are avoided in the file name by using '-' between time fields.
    out_file = out_dir / f'{finished_at.strftime("%Y-%m-%dT%H-%M-%S")}.json'
    with open(out_file, 'w') as f:
        json.dump(output, f, indent=2)
    print(f'Saved to {out_file}')

    return output


# Command-line entry point: with no argument, print usage plus the available
# questions and exit with status 0; otherwise run the question whose id is the
# first argument.
if __name__ == '__main__':
    import sys

    cli_args = sys.argv[1:]
    if not cli_args:
        print('Usage: uv run investigate.py <question_id>')
        print('\nQuestions:')
        for item in list_questions():
            print(f'  {item["id"]}: {item["question"][:60]}...')
        sys.exit(0)

    run_question(cli_args[0])
	import json
	import os
	from datetime import datetime
	from pathlib import Path

	import dspy


	class AnalyzeAndUnderstand(dspy.Signature):
	"""
	Given a dictionary of files and a query, understand and thoroughly understand the files in order to answer the query.
	Be proactive and make sure that no information is missing or omitted from the final answer.
	"""

	directory: dict = dspy.InputField(
	desc='A folder on the filesystem. A high quality set of of jujutsu vcs blog posts, official documentation, and books.'
	)
	query: str = dspy.InputField(desc='The question to answer.')
	answer: str = dspy.OutputField(
	desc='Comprehensive answer to the query based on exploration of the files in the directory'
	)
	snippets: list[str] = dspy.OutputField(desc='Relevant text snippets')
	file_refs: list[str] = dspy.OutputField(desc='Referenced file paths')


	lm = dspy.LM(
	'cerebras/zai-glm-4.7',
	api_key=os.environ['CEREBRAS_API_KEY'],
	api_base='https://api.cerebras.ai/v1',
	temperature=1.0,
	top_p=0.95,
	max_tokens=40000,
	disable_reasoning=False,
	clear_thinking=False,
	)
	dspy.configure(lm=lm)

	RUNS_DIR = Path('runs')
	QUESTIONS_FILE = Path('questions.json')
	DATA_DIR = Path('data')


	def load_folder(folder_path: Path \| str, extensions: list[str] = None) -> dict:
	"""Load folder into nested dict. Keys are relative paths, values are file contents."""
	extensions = extensions or ['.md', '.txt', '.py']
	root = Path(folder_path)
	tree = {}
	for path in root.rglob('*'):
	if path.is_file() and path.suffix in extensions:
	rel_path = str(path.relative_to(root))
	tree[rel_path] = path.read_text(errors='ignore')
	return tree


	def investigate(file_tree: dict, query: str, max_iterations: int = 30):
	rlm = dspy.RLM(
	AnalyzeAndUnderstand,
	max_iterations=max_iterations,
	max_llm_calls=60,
	verbose=True,
	)
	result = rlm(directory=file_tree, query=query)
	return {
	'answer': result.answer,
	'snippets': result.snippets,
	'file_refs': result.file_refs,
	'trajectory': result.trajectory,
	}


	def load_questions() -> dict:
	with open(QUESTIONS_FILE) as f:
	return json.load(f)


	def get_question(qid: str) -> dict \| None:
	data = load_questions()
	for cat in data['questions']:
	for q in cat['questions']:
	if q['id'] == qid:
	return {'category': cat['category'], **q}
	return None


	def list_questions() -> list[dict]:
	data = load_questions()
	return [
	{'category': cat['category'], **q}
	for cat in data['questions']
	for q in cat['questions']
	]


	def run_question(qid: str, max_iterations: int = 10) -> dict:
	def run_question(qid: str, max_iterations: int = 30) -> dict:
	q = get_question(qid)
	if not q:
	raise ValueError(f'Question {qid} not found')

	tree = load_folder(DATA_DIR)
	print(f'Loaded {len(tree)} files')
	print(f'Running: {q["id"]} - {q["question"][:60]}...')

	result = investigate(tree, q['question'], max_iterations=max_iterations)

	output = {
	'id': q['id'],
	'category': q['category'],
	'query': q['question'],
	'timestamp': datetime.now().isoformat(),
	'answer': result['answer'],
	'snippets': result['snippets'],
	'file_refs': result['file_refs'],
	'trajectory': result['trajectory'],
	'config': {'model': 'cerebras/zai-glm-4.7', 'max_iterations': max_iterations},
	}

	out_dir = RUNS_DIR / qid
	out_dir.mkdir(parents=True, exist_ok=True)
	out_file = out_dir / f'{datetime.now().strftime("%Y-%m-%dT%H-%M-%S")}.json'
	with open(out_file, 'w') as f:
	json.dump(output, f, indent=2)
	print(f'Saved to {out_file}')

	return output


	if __name__ == '__main__':
	import sys

	if len(sys.argv) < 2:
	print('Usage: uv run investigate.py <question_id>')
	print('\nQuestions:')
	for q in list_questions():
	print(f' {q["id"]}: {q["question"][:60]}...')
	sys.exit(0)

	run_question(sys.argv[1])
No results found