Last active
August 25, 2025 06:40
-
-
Save bhavsarpratik/1c960188bb67c0252925e1b628b0c339 to your computer and use it in GitHub Desktop.
experiment_test.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import json | |
| import time | |
| from typing import Dict, List, Any | |
| from galileo.experiments import run_experiment | |
| from dotenv import load_dotenv | |
| from galileo import galileo_context | |
| load_dotenv("../.env") | |
def weather_conversation_function(input_data: Dict[str, Any]) -> str:
    """
    Process a multi-turn weather conversation based on input data.

    Each turn is logged as one workflow span on the logger obtained from
    ``galileo_context``, with one tool span per entry in the turn's optional
    ``tool_calls``. The collected per-turn results are returned as a JSON
    string.

    Args:
        input_data: Mapping with an optional ``"conversation"`` list of turn
            dicts (each providing ``"turn_id"`` and ``"user_input"``, and
            optionally ``"tool_calls"``) and an optional ``"session_id"``
            (defaults to ``"test-session"``).

    Returns:
        JSON-encoded dict with the keys ``session_id``, ``turns_completed``,
        ``turns_results``, ``success`` and ``conversation_summary``.

    Raises:
        KeyError: If a turn lacks ``"turn_id"`` or ``"user_input"``.
    """
    # Extract conversation data from the input
    conversation = input_data.get("conversation", [])
    session_id = input_data.get("session_id", "test-session")
    logger = galileo_context.get_logger_instance()

    # Initialize results container
    results = {
        "session_id": session_id,
        "turns_completed": 0,
        "turns_results": [],
        "success": False,
    }

    # Process each conversation turn
    for position, turn in enumerate(conversation):
        turn_id = turn["turn_id"]
        user_input = turn["user_input"]
        # History is everything that precedes this turn *by position*.
        # Slicing on the turn_id label would be wrong for zero-based or
        # non-sequential ids and would raise for string ids.
        history = conversation[:position]
        # Monotonic integer clock: immune to wall-clock adjustments and free
        # of float rounding when converting to nanoseconds.
        turn_start_ns = time.perf_counter_ns()

        # Start a workflow span for this turn
        logger.add_workflow_span(
            input=user_input,
            name=f"turn_{turn_id}_workflow",
            metadata={
                "turn_id": str(turn_id),
                "session_id": session_id,
            },
            tags=[f"turn_{turn_id}", "conversation_turn"],
        )

        # Simulate thinking/planning with LLM
        thinking_output = simulate_llm_thinking(user_input, history)

        # Process tool calls if present (a missing or None entry means none)
        tool_results = []
        for i, tool_input in enumerate(turn.get("tool_calls") or ()):
            tool_start_ns = time.perf_counter_ns()
            tool_input_str = json.dumps(tool_input)
            tool_result = simulate_weather_api(tool_input)
            tool_duration_ns = time.perf_counter_ns() - tool_start_ns

            location = tool_input.get("location", "unknown")
            logger.add_tool_span(
                input=tool_input_str,
                output=json.dumps(tool_result),
                name=f"weather_api_{location}",
                duration_ns=tool_duration_ns,
                metadata={
                    "location": location,
                    "units": tool_input.get("units", "celsius"),
                    "turn_id": str(turn_id),
                    "session_id": session_id,
                    "tool_index": str(i),
                    "tool_type": "weather_api",
                },
                tool_call_id=f"{session_id}_turn{turn_id}_tool{i}",
                tags=["weather_api", location],
            )
            print("Tool result added to logger")
            tool_results.append(tool_result)

        # Generate response
        assistant_response = generate_response(user_input, tool_results, history)

        # Conclude the workflow span for this turn with its total duration
        logger.conclude(
            output=assistant_response,
            duration_ns=time.perf_counter_ns() - turn_start_ns,
        )

        # Record turn results
        results["turns_results"].append(
            {
                "turn_id": turn_id,
                "user_input": user_input,
                "thinking": thinking_output,
                "tool_results": tool_results,
                "assistant_response": assistant_response,
                "processing_time_ms": (time.perf_counter_ns() - turn_start_ns)
                // 1_000_000,
            }
        )
        results["turns_completed"] += 1

    # The run counts as successful once at least one turn was processed
    results["success"] = results["turns_completed"] > 0
    results["conversation_summary"] = (
        "Completed multi-turn conversation about weather in different cities"
    )
    return json.dumps(results)
def simulate_llm_thinking(user_input: str, history: List[Dict]) -> str:
    """Return a canned "thought" for the user's query.

    The first known city mentioned in ``user_input`` (checked in the order
    San Francisco, then New York) selects a fixed sentence; any other query
    yields a generic sentence that echoes the input. ``history`` is accepted
    but not consulted.
    """
    canned_thoughts = (
        ("San Francisco", "I need to check the weather in San Francisco."),
        ("New York", "User is asking about weather in New York."),
    )
    for city, thought in canned_thoughts:
        if city in user_input:
            return thought
    return f"Analyzing user query about {user_input}"
def simulate_weather_api(tool_input: Dict) -> Dict:
    """Return mock weather data for the requested location.

    Reads ``tool_input["location"]`` (treated as ``"Unknown"`` when absent)
    and returns a new dict with ``temperature``, ``condition``, ``humidity``
    and ``wind_speed``. San Francisco and New York have dedicated readings;
    every other location gets the same default reading.
    """
    fields = ("temperature", "condition", "humidity", "wind_speed")
    city = tool_input.get("location", "Unknown")

    # Pick the mock reading for the city, in the same order as `fields`
    if city == "San Francisco":
        reading = (18, "Partly Cloudy", 72, 12)
    elif city == "New York":
        reading = (15, "Rainy", 85, 20)
    else:
        reading = (20, "Clear", 60, 10)

    return dict(zip(fields, reading))
def generate_response(
    user_input: str, tool_results: List[Dict], history: List[Dict]
) -> str:
    """Generate assistant response based on weather data.

    Only the first entry of ``tool_results`` is used. Queries mentioning
    San Francisco or New York get a detailed sentence (condition,
    temperature, humidity, wind speed). Any other query gets a short sentence
    naming the location found after the last ``"weather in "`` in the input
    (up to the first ``"?"``), or ``"the requested location"`` when no
    location can be extracted.

    Args:
        user_input: The user's message for the current turn.
        tool_results: Weather dicts for this turn; each needs ``condition``
            and ``temperature``, and the detailed sentences also need
            ``humidity`` and ``wind_speed``.
        history: Earlier turns; accepted but not consulted.

    Returns:
        The assistant's reply, or a fixed apology when ``tool_results`` is
        empty.
    """
    # In a real application, you would use OpenAI here
    if not tool_results:
        return "I don't have current weather information for that location."

    weather = tool_results[0]
    condition = weather["condition"].lower()
    temperature = weather["temperature"]

    if "San Francisco" in user_input:
        return (
            f"The current weather in San Francisco is {condition} "
            f"with a temperature of {temperature}°C. "
            f"The humidity is {weather['humidity']}% "
            f"and wind speed is {weather['wind_speed']} km/h."
        )

    if "New York" in user_input:
        return (
            f"In New York, it's currently {condition} "
            f"with a temperature of {temperature}°C. "
            f"The humidity is quite high at {weather['humidity']}% "
            f"with wind speeds of {weather['wind_speed']} km/h."
        )

    # Take the text after the last "weather in " and before the first "?".
    # Matching on the same marker that is used for splitting avoids echoing
    # the whole input back when "weather in" appears without a following
    # space; an empty result falls back to the generic wording.
    _, marker, tail = user_input.rpartition("weather in ")
    location = tail.partition("?")[0].strip() if marker else ""
    if not location:
        location = "the requested location"

    return (
        f"The weather in {location} is {condition} "
        f"with a temperature of {temperature}°C."
    )
# Example dataset for the experiment. Each session is scripted as an ordered
# list of (user_input, location) pairs; turn ids are assigned 1, 2, 3, ... in
# that order and every turn carries a single celsius tool call for its
# location.
_SESSION_SCRIPTS = (
    (
        "test-session-1",
        (
            ("What's the weather in San Francisco?", "San Francisco"),
            ("How about New York?", "New York"),
        ),
    ),
    (
        "test-session-2",
        (("What's the weather in Paris?", "Paris"),),
    ),
    (
        "test-session-3",
        (
            ("What's the weather in Tokyo?", "Tokyo"),
            ("And what about London?", "London"),
            ("Is it warmer in Dubai?", "Dubai"),
        ),
    ),
)

weather_dataset = [
    {
        "session_id": session_id,
        "conversation": [
            {
                "turn_id": turn_id,
                "user_input": user_input,
                "tool_calls": [{"location": location, "units": "celsius"}],
            }
            for turn_id, (user_input, location) in enumerate(script, start=1)
        ],
    }
    for session_id, script in _SESSION_SCRIPTS
]
# Script entry point: run the weather conversation experiment
if __name__ == "__main__":
    # A microsecond-resolution timestamp suffix keeps the experiment name
    # unique from one run to the next.
    run_stamp = int(time.time() * 1000000)
    experiment_name = f"weather-conversation-experiment-{run_stamp}"

    # No logger is set up here: weather_conversation_function obtains its own
    # logger instance from galileo_context each time it is called.
    experiment_options = {
        "dataset": weather_dataset,
        "function": weather_conversation_function,
        "metrics": ["tool_selection_quality"],
        "project": "test-project",
    }
    results = run_experiment(experiment_name, **experiment_options)

    print(f"Experiment completed with {len(results)} data points")
Author
bhavsarpratik
commented
Aug 25, 2025
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment