3-0-evaluation.ipynb
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "view-in-github", | |
| "colab_type": "text" | |
| }, | |
| "source": [ | |
| "<a href=\"https://colab.research.google.com/gist/caleb-kaiser/2cbcce465ffa5e856af07151c202a51c/3-0-evaluation.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "<img src=\"https://raw.githubusercontent.com/comet-ml/opik/main/apps/opik-documentation/documentation/static/img/opik-logo.svg\" width=\"250\"/>" | |
| ], | |
| "metadata": { | |
| "id": "tO9p7St93Pa0" | |
| } | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "vqfKhJcs92nt" | |
| }, | |
| "source": [ | |
| "# Evaluation with Opik\n", | |
| "\n", | |
| "In this exercise, you'll implement a basic evaluation pipeline with Opik. You can use OpenAI or open source models via LiteLLM" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "# Imports & Configuration" | |
| ], | |
| "metadata": { | |
| "id": "UhlBep2DdIKx" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "! pip install opik openai comet_ml litellm --quiet" | |
| ], | |
| "metadata": { | |
| "id": "mDiX7cUYdLiQ" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "id": "5xcX30we92nx" | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "import opik\n", | |
| "from opik import Opik, track\n", | |
| "from opik.evaluation import evaluate\n", | |
| "from opik.evaluation.metrics import (IsJson)\n", | |
| "from opik.integrations.openai import track_openai\n", | |
| "import openai\n", | |
| "import os\n", | |
| "from datetime import datetime\n", | |
| "from getpass import getpass\n", | |
| "import litellm\n", | |
| "\n", | |
| "# Define project name to enable tracing\n", | |
| "os.environ[\"OPIK_PROJECT_NAME\"] = \"food_chatbot_eval\"" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# Opik configuration\n", | |
| "if \"OPIK_API_KEY\" not in os.environ:\n", | |
| " os.environ[\"OPIK_API_KEY\"] = getpass(\"Enter your Opik API key: \")\n", | |
| "\n", | |
| "opik.configure()" | |
| ], | |
| "metadata": { | |
| "id": "NcJ25mYXdb58" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# OpenAI configuration (ignore if you're using LiteLLM)\n", | |
| "if \"OPENAI_API_KEY\" not in os.environ:\n", | |
| " os.environ[\"OPENAI_API_KEY\"] = getpass(\"Enter your OpenAI API key: \")" | |
| ], | |
| "metadata": { | |
| "id": "b7nLcmo70kgD" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "MODEL = \"gpt-4o-mini\"" | |
| ], | |
| "metadata": { | |
| "id": "Qrp4OjVKdTsI" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "client = opik.Opik()" | |
| ], | |
| "metadata": { | |
| "id": "IT1T_ilrel9J" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "# Dataset" | |
| ], | |
| "metadata": { | |
| "id": "aaUhMjZhlknD" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# Create or get the dataset\n", | |
| "dataset = client.get_or_create_dataset(name=\"foodchatbot_eval\")" | |
| ], | |
| "metadata": { | |
| "id": "NjD0upvAlmRN" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
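| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "Each item in this dataset is a dictionary with `index`, `question`, and `response` keys (the same columns as the CSV used in the optional download section below). The next cell is just a sketch of that schema with a made-up item; it inserts nothing unless you uncomment the last line." | |
| ], | |
| "metadata": {} | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# Illustrative only: the item schema this notebook uses.\n", | |
| "# The optional download section below populates the dataset from a CSV with these same fields.\n", | |
| "example_item = {\n", | |
| "    \"index\": \"0\",\n", | |
| "    \"question\": \"Which desserts are vegan?\",\n", | |
| "    \"response\": \"The Fresh Berry Parfait is vegan.\"\n", | |
| "}\n", | |
| "\n", | |
| "# dataset.insert([example_item])  # uncomment to add your own hand-written items" | |
| ], | |
| "metadata": {}, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |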
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "## Optional: Download Dataset From Comet" | |
| ], | |
| "metadata": { | |
| "id": "V0ymGQlYmzdT" | |
| } | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "If you have not previously created the `foodchatbot_eval` dataset in your Opik workspace, run the following code to download the dataset as a Comet Artifact and populate your Opik dataset.\n", | |
| "\n", | |
| "If you have already created the `foodchatbot_eval` dataset, you can skip to the next section" | |
| ], | |
| "metadata": { | |
| "id": "FskOHALFmdTA" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "import comet_ml" | |
| ], | |
| "metadata": { | |
| "id": "0D9BaSz6lmOu" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "experiment = comet_ml.start(project_name=\"foodchatbot_eval\")\n", | |
| "\n", | |
| "logged_artifact = experiment.get_artifact(artifact_name=\"foodchatbot_eval\",\n", | |
| " workspace=\"examples\")\n", | |
| "local_artifact = logged_artifact.download(\"./\")\n", | |
| "experiment.end()" | |
| ], | |
| "metadata": { | |
| "id": "UMzbSBCPlmIR" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "import csv\n", | |
| "import json\n", | |
| "# Read the CSV file and insert items into the dataset\n", | |
| "with open('./foodchatbot_clean_eval_dataset.csv', newline='') as csvfile:\n", | |
| " reader = csv.reader(csvfile)\n", | |
| " for row in reader:\n", | |
| " index, question, response = row\n", | |
| " item = {\n", | |
| " \"index\": index,\n", | |
| " \"question\": question,\n", | |
| " \"response\": response\n", | |
| " }\n", | |
| "\n", | |
| " dataset.insert([item])" | |
| ], | |
| "metadata": { | |
| "id": "s8IpjR9Bm920" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "# Templates & Prompts" | |
| ], | |
| "metadata": { | |
| "id": "o87OB7Eqe24p" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# menu items\n", | |
| "menu_items = \"\"\"\n", | |
| "Menu: Kids Menu\n", | |
| "Food Item: Mini Cheeseburger\n", | |
| "Price: $6.99\n", | |
| "Vegan: N\n", | |
| "Popularity: 4/5\n", | |
| "Included: Mini beef patty, cheese, lettuce, tomato, and fries.\n", | |
| "\n", | |
| "Menu: Appetizers\n", | |
| "Food Item: Loaded Potato Skins\n", | |
| "Price: $8.99\n", | |
| "Vegan: N\n", | |
| "Popularity: 3/5\n", | |
| "Included: Crispy potato skins filled with cheese, bacon bits, and served with sour cream.\n", | |
| "\n", | |
| "Menu: Appetizers\n", | |
| "Food Item: Bruschetta\n", | |
| "Price: $7.99\n", | |
| "Vegan: Y\n", | |
| "Popularity: 4/5\n", | |
| "Included: Toasted baguette slices topped with fresh tomatoes, basil, garlic, and balsamic glaze.\n", | |
| "\n", | |
| "Menu: Main Menu\n", | |
| "Food Item: Grilled Chicken Caesar Salad\n", | |
| "Price: $12.99\n", | |
| "Vegan: N\n", | |
| "Popularity: 4/5\n", | |
| "Included: Grilled chicken breast, romaine lettuce, Parmesan cheese, croutons, and Caesar dressing.\n", | |
| "\n", | |
| "Menu: Main Menu\n", | |
| "Food Item: Classic Cheese Pizza\n", | |
| "Price: $10.99\n", | |
| "Vegan: N\n", | |
| "Popularity: 5/5\n", | |
| "Included: Thin-crust pizza topped with tomato sauce, mozzarella cheese, and fresh basil.\n", | |
| "\n", | |
| "Menu: Main Menu\n", | |
| "Food Item: Spaghetti Bolognese\n", | |
| "Price: $14.99\n", | |
| "Vegan: N\n", | |
| "Popularity: 4/5\n", | |
| "Included: Pasta tossed in a savory meat sauce made with ground beef, tomatoes, onions, and herbs.\n", | |
| "\n", | |
| "Menu: Vegan Options\n", | |
| "Food Item: Veggie Wrap\n", | |
| "Price: $9.99\n", | |
| "Vegan: Y\n", | |
| "Popularity: 3/5\n", | |
| "Included: Grilled vegetables, hummus, mixed greens, and a wrap served with a side of sweet potato fries.\n", | |
| "\n", | |
| "Menu: Vegan Options\n", | |
| "Food Item: Vegan Beyond Burger\n", | |
| "Price: $11.99\n", | |
| "Vegan: Y\n", | |
| "Popularity: 4/5\n", | |
| "Included: Plant-based patty, vegan cheese, lettuce, tomato, onion, and a choice of regular or sweet potato fries.\n", | |
| "\n", | |
| "Menu: Desserts\n", | |
| "Food Item: Chocolate Lava Cake\n", | |
| "Price: $6.99\n", | |
| "Vegan: N\n", | |
| "Popularity: 5/5\n", | |
| "Included: Warm chocolate cake with a gooey molten center, served with vanilla ice cream.\n", | |
| "\n", | |
| "Menu: Desserts\n", | |
| "Food Item: Fresh Berry Parfait\n", | |
| "Price: $5.99\n", | |
| "Vegan: Y\n", | |
| "Popularity: 4/5\n", | |
| "Included: Layers of mixed berries, granola, and vegan coconut yogurt.\n", | |
| "\"\"\"\n" | |
| ], | |
| "metadata": { | |
| "id": "UddfOiIFe4ja" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "prompt_template = \"\"\"Answer a question about the following menu:\n", | |
| "\n", | |
| "# MENU\n", | |
| "{menu}\n", | |
| "\n", | |
| "# QUESTION\n", | |
| "{question}\n", | |
| "\"\"\"" | |
| ], | |
| "metadata": { | |
| "id": "TP3uUuY9e5jW" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
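| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "Optional sanity check: render the prompt template with the menu and a sample question before wiring it into the application. The question below is just an example; this cell makes no API calls." | |
| ], | |
| "metadata": {} | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# Preview the fully formatted prompt (no LLM call)\n", | |
| "sample_question = \"Which menu items are vegan?\"\n", | |
| "print(prompt_template.format(menu=menu_items, question=sample_question))" | |
| ], | |
| "metadata": {}, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |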
| { | |
| "cell_type": "code", | |
| "source": [], | |
| "metadata": { | |
| "id": "j0viO7Gqe5hJ" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [], | |
| "metadata": { | |
| "id": "qFrieyPRe5e-" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [], | |
| "metadata": { | |
| "id": "MWL8el1ve5c1" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [], | |
| "metadata": { | |
| "id": "xWyIaDwIe5Z4" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "G3-pMk6p92nz" | |
| }, | |
| "source": [ | |
| "# LLM Application\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# Simple little client class for using different LLM APIs (OpenAI or LiteLLM)\n", | |
| "class LLMClient:\n", | |
| " def __init__(self, client_type: str =\"openai\", model: str =\"gpt-4\"):\n", | |
| " self.client_type = client_type\n", | |
| " self.model = model\n", | |
| "\n", | |
| " if self.client_type == \"openai\":\n", | |
| " self.client = track_openai(openai.OpenAI())\n", | |
| "\n", | |
| " else:\n", | |
| " self.client = None\n", | |
| "\n", | |
| " # LiteLLM query function\n", | |
| " def _get_litellm_response(self, query: str, system: str = \"You are a helpful assistant.\"):\n", | |
| " messages = [\n", | |
| " {\"role\": \"system\", \"content\": system },\n", | |
| " { \"role\": \"user\", \"content\": query }\n", | |
| " ]\n", | |
| "\n", | |
| " response = litellm.completion(\n", | |
| " model=self.model,\n", | |
| " messages=messages\n", | |
| " )\n", | |
| "\n", | |
| " return response.choices[0].message.content\n", | |
| "\n", | |
| " # OpenAI query function - use **kwargs to pass arguments like temperature\n", | |
| " def _get_openai_response(self, query: str, system: str = \"You are a helpful assistant.\", **kwargs):\n", | |
| " messages = [\n", | |
| " {\"role\": \"system\", \"content\": system },\n", | |
| " { \"role\": \"user\", \"content\": query }\n", | |
| " ]\n", | |
| "\n", | |
| " response = self.client.chat.completions.create(\n", | |
| " model=self.model,\n", | |
| " messages=messages,\n", | |
| " **kwargs\n", | |
| " )\n", | |
| "\n", | |
| " return response.choices[0].message.content\n", | |
| "\n", | |
| "\n", | |
| " def query(self, query: str, system: str = \"You are a helpful assistant.\", **kwargs):\n", | |
| " if self.client_type == 'openai':\n", | |
| " return self._get_openai_response(query, system, **kwargs)\n", | |
| "\n", | |
| " else:\n", | |
| " return self._get_litellm_response(query, system)\n", | |
| "\n", | |
| "\n", | |
| "\n" | |
| ], | |
| "metadata": { | |
| "id": "3rhh1oX6fUTz" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "llm_client = LLMClient(model=MODEL)" | |
| ], | |
| "metadata": { | |
| "id": "w1e6ceRpfZiJ" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
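| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "If you'd prefer an open-source model, you can construct the client with `client_type=\"litellm\"` and any model string LiteLLM supports. The model name below is only an illustration; substitute one you have access to and set whatever credentials that provider requires." | |
| ], | |
| "metadata": {} | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# Example (commented out): route requests through LiteLLM instead of the OpenAI client.\n", | |
| "# The model string below is illustrative, not a recommendation.\n", | |
| "# llm_client = LLMClient(client_type=\"litellm\", model=\"ollama/llama3\")" | |
| ], | |
| "metadata": {}, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |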
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "id": "m6s3Rk9u92n0" | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "@track\n", | |
| "def chatbot_application(input: str) -> str:\n", | |
| " response = llm_client.query(prompt_template.format(menu=menu_items, question=input))\n", | |
| " return response\n", | |
| "\n" | |
| ] | |
| }, | |
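| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "Before evaluating the whole dataset, it can help to smoke-test the application on a single question. This makes one LLM call, so it assumes your API key is configured; the trace should appear in the `food_chatbot_eval` project." | |
| ], | |
| "metadata": {} | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# Quick smoke test: a single traced call through the chatbot\n", | |
| "print(chatbot_application(\"Which desserts are vegan?\"))" | |
| ], | |
| "metadata": {}, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |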
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "# Evaluation" | |
| ], | |
| "metadata": { | |
| "id": "hkGXuEY33Dc7" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# Define the evaluation task\n", | |
| "def evaluation_task(x):\n", | |
| " return {\n", | |
| " \"input\": x['question'],\n", | |
| " \"output\": chatbot_application(x['question']),\n", | |
| " \"context\": menu_items,\n", | |
| " \"reference\": x['response']\n", | |
| " }\n" | |
| ], | |
| "metadata": { | |
| "id": "jd75tlHbf2wg" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
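| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "Optionally, run the task on one hand-written item first to confirm the output dictionary has the fields you expect (`input`, `output`, `context`, `reference`). The item below is made up and just mirrors the dataset schema; this cell makes one LLM call." | |
| ], | |
| "metadata": {} | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# Sanity check the task output on a single made-up item before running the full evaluation\n", | |
| "sample_item = {\n", | |
| "    \"index\": \"0\",\n", | |
| "    \"question\": \"Which desserts are vegan?\",\n", | |
| "    \"response\": \"The Fresh Berry Parfait is vegan.\"\n", | |
| "}\n", | |
| "\n", | |
| "task_output = evaluation_task(sample_item)\n", | |
| "print(task_output.keys())" | |
| ], | |
| "metadata": {}, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |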
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "#dataset = client.get_or_create_dataset(name=\"foodchatbot_eval\")" | |
| ], | |
| "metadata": { | |
| "id": "5V7_qLTaf4EI" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# Define the metrics\n", | |
| "metrics = [IsJson()]\n", | |
| "\n", | |
| "# experiment_name\n", | |
| "experiment_name = MODEL + \"_\" + dataset.name + \"_\" + datetime.now().strftime(\"%Y-%m-%d_%H-%M-%S\")\n", | |
| "\n", | |
| "# run evaluation\n", | |
| "evaluation = evaluate(\n", | |
| " experiment_name=experiment_name,\n", | |
| " dataset=dataset,\n", | |
| " task=evaluation_task,\n", | |
| " scoring_metrics=metrics,\n", | |
| " experiment_config={\n", | |
| " \"model\": MODEL\n", | |
| " }\n", | |
| ")" | |
| ], | |
| "metadata": { | |
| "id": "Iro4ybLof6Q2" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [], | |
| "metadata": { | |
| "id": "L3osp2A-f2ti" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [], | |
| "metadata": { | |
| "id": "Aw0A5CIGf2sZ" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [], | |
| "metadata": { | |
| "id": "u-W0bkn4f2qL" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [], | |
| "metadata": { | |
| "id": "DSuvdj2vf2ny" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [], | |
| "metadata": { | |
| "id": "BkvQd_QKf2lD" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "comet-eval", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.10.15" | |
| }, | |
| "colab": { | |
| "provenance": [], | |
| "include_colab_link": true | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 0 | |
| } |