alonsosilvaallende/notebook.ipynb

## notebook.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "093b1977-dc46-4508-b109-17ea5315877b",
   "metadata": {},
   "source": [
    "We already have a model's vocabulary from the script you gave me, so we don't need this cell:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "ec12bffc-60dd-48e3-8661-0110d61109c3",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-02-18T15:55:12.360538Z",
     "iopub.status.busy": "2026-02-18T15:55:12.360027Z",
     "iopub.status.idle": "2026-02-18T15:55:12.807528Z",
     "shell.execute_reply": "2026-02-18T15:55:12.806416Z",
     "shell.execute_reply.started": "2026-02-18T15:55:12.360490Z"
    }
   },
   "outputs": [],
   "source": [
    "import outlines_core\n",
    "\n",
    "model_id = \"Qwen/Qwen3-0.6B\"\n",
    "vocabulary = outlines_core.Vocabulary.from_pretrained(model_id)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "22d884e2-c189-4e0b-923d-5b1fce9282c1",
   "metadata": {},
   "source": [
    "This is all we need from `outlines_core`:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "afa6ee68-3f32-49fb-875f-a281a33e074c",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-02-18T15:55:30.593216Z",
     "iopub.status.busy": "2026-02-18T15:55:30.592715Z",
     "iopub.status.idle": "2026-02-18T15:55:30.642589Z",
     "shell.execute_reply": "2026-02-18T15:55:30.641631Z",
     "shell.execute_reply.started": "2026-02-18T15:55:30.593170Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index {\n",
       "    initial_state: 64,\n",
       "    final_states: {\n",
       "        112,\n",
       "    },\n",
       "    transitions: {\n",
       "        112: {\n",
       "            151645: 112,\n",
       "        },\n",
       "        64: {\n",
       "            54: 128,\n",
       "            1639: 80,\n",
       "            10234: 96,\n",
       "        },\n",
       "        80: {\n",
       "            88: 96,\n",
       "        },\n",
       "        128: {\n",
       "            8503: 96,\n",
       "            71: 80,\n",
       "        },\n",
       "        96: {\n",
       "            220: 112,\n",
       "        },\n",
       "    },\n",
       "    eos_token_id: 151645,\n",
       "    vocab_size: 151656,\n",
       "}"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "index = outlines_core.Index(\"Why \", vocabulary)\n",
    "index"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "50ebabed-c20f-4fea-867f-d5cea195b3ad",
   "metadata": {},
   "source": [
    "In reality, we only need the transitions (I can deduce the other info):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "d744f7b5-6e58-434b-bdef-e19668d1d05b",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-02-18T15:56:14.351757Z",
     "iopub.status.busy": "2026-02-18T15:56:14.351001Z",
     "iopub.status.idle": "2026-02-18T15:56:14.360753Z",
     "shell.execute_reply": "2026-02-18T15:56:14.358920Z",
     "shell.execute_reply.started": "2026-02-18T15:56:14.351691Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{112: {151645: 112},\n",
       " 64: {54: 128, 1639: 80, 10234: 96},\n",
       " 80: {88: 96},\n",
       " 128: {8503: 96, 71: 80},\n",
       " 96: {220: 112}}"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "index.get_transitions()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8ade5c02-d253-402f-b508-0388e9808c59",
   "metadata": {},
   "source": [
    "The transitions are a nested dict of the form `{state: {token: next_state}}`."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b5044b68-bd66-4e8e-8571-e15269049d12",
   "metadata": {},
   "source": [
    "## This is just to make the transitions 'prettier'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "fb075839-f549-4fb6-9819-4731428513fc",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-02-18T15:58:13.649553Z",
     "iopub.status.busy": "2026-02-18T15:58:13.649093Z",
     "iopub.status.idle": "2026-02-18T15:58:13.658346Z",
     "shell.execute_reply": "2026-02-18T15:58:13.656443Z",
     "shell.execute_reply.started": "2026-02-18T15:58:13.649512Z"
    }
   },
   "outputs": [],
   "source": [
    "def my_recursive(\n",
    "    state: int,\n",
    "    index: outlines_core.Index,\n",
    "    mapping: dict[int, int],\n",
    "    visited: set[int],\n",
    "    final_states: set[int],\n",
    ") -> None:\n",
    "    \"\"\"Number the non-final states reachable from `state` in depth-first order.\n",
    "\n",
    "    Every non-final state discovered for the first time receives the next\n",
    "    free id, `len(mapping)`, so ids follow the order of discovery. Final\n",
    "    states are neither numbered nor expanded here.\n",
    "\n",
    "    The traversal keeps an explicit stack instead of calling itself, so a\n",
    "    long chain of states cannot raise `RecursionError`, and the transition\n",
    "    table is fetched from `index` once rather than once per visited state.\n",
    "\n",
    "    Args:\n",
    "        state: State to start from; nothing happens if it is a final state.\n",
    "        index: Object whose `get_transitions()` returns\n",
    "            `{state: {token: next_state}}`.\n",
    "        mapping: Original state -> new id, updated in place. The caller is\n",
    "            expected to have numbered `state` already (`get_state_mapping`\n",
    "            maps the initial state to 0 before calling).\n",
    "        visited: States that have already been expanded, updated in place.\n",
    "        final_states: States to skip.\n",
    "    \"\"\"\n",
    "    if state in final_states:\n",
    "        return\n",
    "    transitions = index.get_transitions()\n",
    "    visited.add(state)\n",
    "    # One iterator per state on the current depth-first path; the top of the\n",
    "    # stack is the state whose successors are being scanned.\n",
    "    pending = [iter(transitions.get(state, {}).values())]\n",
    "    while pending:\n",
    "        for new_state in pending[-1]:\n",
    "            if new_state in final_states:\n",
    "                continue  # Skip final states entirely\n",
    "            if new_state not in mapping:\n",
    "                mapping[new_state] = len(mapping)\n",
    "            if new_state not in visited:\n",
    "                visited.add(new_state)\n",
    "                pending.append(iter(transitions.get(new_state, {}).values()))\n",
    "                break  # descend now; the parent's iterator resumes afterwards\n",
    "        else:\n",
    "            pending.pop()  # successors exhausted: backtrack to the parent"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "8dadfe8d-e228-4ebb-b7f6-cbcdd293dc95",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-02-18T15:58:19.521604Z",
     "iopub.status.busy": "2026-02-18T15:58:19.521152Z",
     "iopub.status.idle": "2026-02-18T15:58:19.529955Z",
     "shell.execute_reply": "2026-02-18T15:58:19.528246Z",
     "shell.execute_reply.started": "2026-02-18T15:58:19.521562Z"
    }
   },
   "outputs": [],
   "source": [
    "def get_state_mapping(index: outlines_core.Index) -> dict[int, int]:\n",
    "    \"\"\"Return a renumbering of the states of `index`.\n",
    "\n",
    "    The initial state maps to 0, the non-final states reachable from it are\n",
    "    numbered by `my_recursive`, and the final states take the highest ids,\n",
    "    counting down from `len(transitions) - 1`.\n",
    "\n",
    "    NOTE(review): the ids presumably only stay collision-free when every\n",
    "    state is a key of the transition table and every non-final state can be\n",
    "    reached without passing through a final state - confirm for your patterns.\n",
    "    \"\"\"\n",
    "    start = index.get_initial_state()\n",
    "    accepting = index.get_final_states()\n",
    "    total = len(index.get_transitions())\n",
    "    # The initial state is pinned to 0 before the traversal numbers the rest.\n",
    "    renumbering = {start: 0}\n",
    "    my_recursive(start, index, renumbering, set(), accepting)\n",
    "    # Final states are placed at the end of the id range.\n",
    "    for offset, accepting_state in enumerate(accepting, start=1):\n",
    "        renumbering[accepting_state] = total - offset\n",
    "    return renumbering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "92f4f768-9ab1-42a0-8b6f-19555abf7ded",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-02-18T15:58:25.853038Z",
     "iopub.status.busy": "2026-02-18T15:58:25.852579Z",
     "iopub.status.idle": "2026-02-18T15:58:25.861045Z",
     "shell.execute_reply": "2026-02-18T15:58:25.859315Z",
     "shell.execute_reply.started": "2026-02-18T15:58:25.852996Z"
    }
   },
   "outputs": [],
   "source": [
    "def get_dfa(index: outlines_core.Index) -> dict[int, dict[int, int]]:\n",
    "    \"\"\"Return the transitions of `index` with states renumbered.\n",
    "\n",
    "    States are renamed with `get_state_mapping`; token ids are kept as they\n",
    "    are. Final states do not appear as keys of the result, only as targets.\n",
    "\n",
    "    NOTE(review): the outgoing transitions of final states are dropped. In\n",
    "    the example above that is only the EOS self-loop; confirm this is also\n",
    "    acceptable for patterns whose final states have other transitions.\n",
    "\n",
    "    Returns:\n",
    "        `{new_state: {token: new_next_state}}` for every non-final state.\n",
    "    \"\"\"\n",
    "    mapping = get_state_mapping(index)\n",
    "    # Fetched once: the set of final states does not change while looping.\n",
    "    final_states = index.get_final_states()\n",
    "    dfa = {}\n",
    "    for state, transitions in index.get_transitions().items():\n",
    "        if state in final_states:\n",
    "            continue  # skipped before remapping, so no work is thrown away\n",
    "        dfa[mapping[state]] = {\n",
    "            token: mapping[new_state] for token, new_state in transitions.items()\n",
    "        }\n",
    "    return dfa"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b3afd24b-fd37-43fa-b1cd-0266de321e1d",
   "metadata": {},
   "source": [
    "I consider this:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "3b08fbb1-ca69-4c7f-8870-ae3744355ec3",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-02-18T15:58:28.185285Z",
     "iopub.status.busy": "2026-02-18T15:58:28.184789Z",
     "iopub.status.idle": "2026-02-18T15:58:28.192492Z",
     "shell.execute_reply": "2026-02-18T15:58:28.190267Z",
     "shell.execute_reply.started": "2026-02-18T15:58:28.185242Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{0: {54: 1, 1639: 3, 10234: 2}, 3: {88: 2}, 1: {8503: 2, 71: 3}, 2: {220: 4}}\n"
     ]
    }
   ],
   "source": [
    "print(get_dfa(index))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "568145ca-3197-47da-b8dc-bdcf910808cb",
   "metadata": {},
   "source": [
    "prettier than this:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "1aba4457-29c1-4cb0-935d-283db83e0449",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-02-18T15:58:30.189048Z",
     "iopub.status.busy": "2026-02-18T15:58:30.188546Z",
     "iopub.status.idle": "2026-02-18T15:58:30.197936Z",
     "shell.execute_reply": "2026-02-18T15:58:30.196146Z",
     "shell.execute_reply.started": "2026-02-18T15:58:30.189003Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{112: {151645: 112},\n",
       " 64: {54: 128, 1639: 80, 10234: 96},\n",
       " 80: {88: 96},\n",
       " 128: {8503: 96, 71: 80},\n",
       " 96: {220: 112}}"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "index.get_transitions()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "89816d12-5e25-46da-859c-81f22423ad57",
   "metadata": {},
   "source": [
    "but that's just me."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8803c412-9e9d-4473-9e7f-ab377a3840a4",
   "metadata": {},
   "source": [
    "The mapping from `outlines_core` states to the prettier states is:\n",
    "```\n",
    "64 -> 0\n",
    "128 -> 1\n",
    "96 -> 2\n",
    "80 -> 3\n",
    "112 -> 4\n",
    "```"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"id": "093b1977-dc46-4508-b109-17ea5315877b",
	"metadata": {},
	"source": [
	"We already have a model's vocabulary from the script you gave me, so we don't need this cell:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"id": "ec12bffc-60dd-48e3-8661-0110d61109c3",
	"metadata": {
	"execution": {
	"iopub.execute_input": "2026-02-18T15:55:12.360538Z",
	"iopub.status.busy": "2026-02-18T15:55:12.360027Z",
	"iopub.status.idle": "2026-02-18T15:55:12.807528Z",
	"shell.execute_reply": "2026-02-18T15:55:12.806416Z",
	"shell.execute_reply.started": "2026-02-18T15:55:12.360490Z"
	}
	},
	"outputs": [],
	"source": [
	"import outlines_core\n",
	"\n",
	"model_id = \"Qwen/Qwen3-0.6B\"\n",
	"vocabulary = outlines_core.Vocabulary.from_pretrained(model_id)"
	]
	},
	{
	"cell_type": "markdown",
	"id": "22d884e2-c189-4e0b-923d-5b1fce9282c1",
	"metadata": {},
	"source": [
	"This is all we need from `outlines_core`:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"id": "afa6ee68-3f32-49fb-875f-a281a33e074c",
	"metadata": {
	"execution": {
	"iopub.execute_input": "2026-02-18T15:55:30.593216Z",
	"iopub.status.busy": "2026-02-18T15:55:30.592715Z",
	"iopub.status.idle": "2026-02-18T15:55:30.642589Z",
	"shell.execute_reply": "2026-02-18T15:55:30.641631Z",
	"shell.execute_reply.started": "2026-02-18T15:55:30.593170Z"
	}
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"Index {\n",
	" initial_state: 64,\n",
	" final_states: {\n",
	" 112,\n",
	" },\n",
	" transitions: {\n",
	" 112: {\n",
	" 151645: 112,\n",
	" },\n",
	" 64: {\n",
	" 54: 128,\n",
	" 1639: 80,\n",
	" 10234: 96,\n",
	" },\n",
	" 80: {\n",
	" 88: 96,\n",
	" },\n",
	" 128: {\n",
	" 8503: 96,\n",
	" 71: 80,\n",
	" },\n",
	" 96: {\n",
	" 220: 112,\n",
	" },\n",
	" },\n",
	" eos_token_id: 151645,\n",
	" vocab_size: 151656,\n",
	"}"
	]
	},
	"execution_count": 2,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"index = outlines_core.Index(\"Why \", vocabulary)\n",
	"index"
	]
	},
	{
	"cell_type": "markdown",
	"id": "50ebabed-c20f-4fea-867f-d5cea195b3ad",
	"metadata": {},
	"source": [
	"In reality, we only need the transitions (I can deduce the other info):"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"id": "d744f7b5-6e58-434b-bdef-e19668d1d05b",
	"metadata": {
	"execution": {
	"iopub.execute_input": "2026-02-18T15:56:14.351757Z",
	"iopub.status.busy": "2026-02-18T15:56:14.351001Z",
	"iopub.status.idle": "2026-02-18T15:56:14.360753Z",
	"shell.execute_reply": "2026-02-18T15:56:14.358920Z",
	"shell.execute_reply.started": "2026-02-18T15:56:14.351691Z"
	}
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"{112: {151645: 112},\n",
	" 64: {54: 128, 1639: 80, 10234: 96},\n",
	" 80: {88: 96},\n",
	" 128: {8503: 96, 71: 80},\n",
	" 96: {220: 112}}"
	]
	},
	"execution_count": 3,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"index.get_transitions()"
	]
	},
	{
	"cell_type": "markdown",
	"id": "8ade5c02-d253-402f-b508-0388e9808c59",
	"metadata": {},
	"source": [
	"The transitions are a nested dict of the form `{state: {token: next_state}}`."
	]
	},
	{
	"cell_type": "markdown",
	"id": "b5044b68-bd66-4e8e-8571-e15269049d12",
	"metadata": {},
	"source": [
	"## This is just to make the transitions 'prettier'"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"id": "fb075839-f549-4fb6-9819-4731428513fc",
	"metadata": {
	"execution": {
	"iopub.execute_input": "2026-02-18T15:58:13.649553Z",
	"iopub.status.busy": "2026-02-18T15:58:13.649093Z",
	"iopub.status.idle": "2026-02-18T15:58:13.658346Z",
	"shell.execute_reply": "2026-02-18T15:58:13.656443Z",
	"shell.execute_reply.started": "2026-02-18T15:58:13.649512Z"
	}
	},
	"outputs": [],
	"source": [
	"def my_recursive(\n",
	"    state: int,\n",
	"    index: outlines_core.Index,\n",
	"    mapping: dict[int, int],\n",
	"    visited: set[int],\n",
	"    final_states: set[int],\n",
	") -> None:\n",
	"    \"\"\"Number the non-final states reachable from `state` in depth-first order.\n",
	"\n",
	"    Every non-final state discovered for the first time receives the next\n",
	"    free id, `len(mapping)`, so ids follow the order of discovery. Final\n",
	"    states are neither numbered nor expanded here.\n",
	"\n",
	"    The traversal keeps an explicit stack instead of calling itself, so a\n",
	"    long chain of states cannot raise `RecursionError`, and the transition\n",
	"    table is fetched from `index` once rather than once per visited state.\n",
	"\n",
	"    Args:\n",
	"        state: State to start from; nothing happens if it is a final state.\n",
	"        index: Object whose `get_transitions()` returns\n",
	"            `{state: {token: next_state}}`.\n",
	"        mapping: Original state -> new id, updated in place. The caller is\n",
	"            expected to have numbered `state` already (`get_state_mapping`\n",
	"            maps the initial state to 0 before calling).\n",
	"        visited: States that have already been expanded, updated in place.\n",
	"        final_states: States to skip.\n",
	"    \"\"\"\n",
	"    if state in final_states:\n",
	"        return\n",
	"    transitions = index.get_transitions()\n",
	"    visited.add(state)\n",
	"    # One iterator per state on the current depth-first path; the top of the\n",
	"    # stack is the state whose successors are being scanned.\n",
	"    pending = [iter(transitions.get(state, {}).values())]\n",
	"    while pending:\n",
	"        for new_state in pending[-1]:\n",
	"            if new_state in final_states:\n",
	"                continue  # Skip final states entirely\n",
	"            if new_state not in mapping:\n",
	"                mapping[new_state] = len(mapping)\n",
	"            if new_state not in visited:\n",
	"                visited.add(new_state)\n",
	"                pending.append(iter(transitions.get(new_state, {}).values()))\n",
	"                break  # descend now; the parent's iterator resumes afterwards\n",
	"        else:\n",
	"            pending.pop()  # successors exhausted: backtrack to the parent"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"id": "8dadfe8d-e228-4ebb-b7f6-cbcdd293dc95",
	"metadata": {
	"execution": {
	"iopub.execute_input": "2026-02-18T15:58:19.521604Z",
	"iopub.status.busy": "2026-02-18T15:58:19.521152Z",
	"iopub.status.idle": "2026-02-18T15:58:19.529955Z",
	"shell.execute_reply": "2026-02-18T15:58:19.528246Z",
	"shell.execute_reply.started": "2026-02-18T15:58:19.521562Z"
	}
	},
	"outputs": [],
	"source": [
	"def get_state_mapping(index: outlines_core.Index) -> dict[int, int]:\n",
	"    \"\"\"Return a renumbering of the states of `index`.\n",
	"\n",
	"    The initial state maps to 0, the non-final states reachable from it are\n",
	"    numbered by `my_recursive`, and the final states take the highest ids,\n",
	"    counting down from `len(transitions) - 1`.\n",
	"\n",
	"    NOTE(review): the ids presumably only stay collision-free when every\n",
	"    state is a key of the transition table and every non-final state can be\n",
	"    reached without passing through a final state - confirm for your patterns.\n",
	"    \"\"\"\n",
	"    start = index.get_initial_state()\n",
	"    accepting = index.get_final_states()\n",
	"    total = len(index.get_transitions())\n",
	"    # The initial state is pinned to 0 before the traversal numbers the rest.\n",
	"    renumbering = {start: 0}\n",
	"    my_recursive(start, index, renumbering, set(), accepting)\n",
	"    # Final states are placed at the end of the id range.\n",
	"    for offset, accepting_state in enumerate(accepting, start=1):\n",
	"        renumbering[accepting_state] = total - offset\n",
	"    return renumbering"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"id": "92f4f768-9ab1-42a0-8b6f-19555abf7ded",
	"metadata": {
	"execution": {
	"iopub.execute_input": "2026-02-18T15:58:25.853038Z",
	"iopub.status.busy": "2026-02-18T15:58:25.852579Z",
	"iopub.status.idle": "2026-02-18T15:58:25.861045Z",
	"shell.execute_reply": "2026-02-18T15:58:25.859315Z",
	"shell.execute_reply.started": "2026-02-18T15:58:25.852996Z"
	}
	},
	"outputs": [],
	"source": [
	"def get_dfa(index: outlines_core.Index) -> dict[int, dict[int, int]]:\n",
	"    \"\"\"Return the transitions of `index` with states renumbered.\n",
	"\n",
	"    States are renamed with `get_state_mapping`; token ids are kept as they\n",
	"    are. Final states do not appear as keys of the result, only as targets.\n",
	"\n",
	"    NOTE(review): the outgoing transitions of final states are dropped. In\n",
	"    the example above that is only the EOS self-loop; confirm this is also\n",
	"    acceptable for patterns whose final states have other transitions.\n",
	"\n",
	"    Returns:\n",
	"        `{new_state: {token: new_next_state}}` for every non-final state.\n",
	"    \"\"\"\n",
	"    mapping = get_state_mapping(index)\n",
	"    # Fetched once: the set of final states does not change while looping.\n",
	"    final_states = index.get_final_states()\n",
	"    dfa = {}\n",
	"    for state, transitions in index.get_transitions().items():\n",
	"        if state in final_states:\n",
	"            continue  # skipped before remapping, so no work is thrown away\n",
	"        dfa[mapping[state]] = {\n",
	"            token: mapping[new_state] for token, new_state in transitions.items()\n",
	"        }\n",
	"    return dfa"
	]
	},
	{
	"cell_type": "markdown",
	"id": "b3afd24b-fd37-43fa-b1cd-0266de321e1d",
	"metadata": {},
	"source": [
	"I consider this:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"id": "3b08fbb1-ca69-4c7f-8870-ae3744355ec3",
	"metadata": {
	"execution": {
	"iopub.execute_input": "2026-02-18T15:58:28.185285Z",
	"iopub.status.busy": "2026-02-18T15:58:28.184789Z",
	"iopub.status.idle": "2026-02-18T15:58:28.192492Z",
	"shell.execute_reply": "2026-02-18T15:58:28.190267Z",
	"shell.execute_reply.started": "2026-02-18T15:58:28.185242Z"
	}
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"{0: {54: 1, 1639: 3, 10234: 2}, 3: {88: 2}, 1: {8503: 2, 71: 3}, 2: {220: 4}}\n"
	]
	}
	],
	"source": [
	"print(get_dfa(index))"
	]
	},
	{
	"cell_type": "markdown",
	"id": "568145ca-3197-47da-b8dc-bdcf910808cb",
	"metadata": {},
	"source": [
	"prettier than this:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"id": "1aba4457-29c1-4cb0-935d-283db83e0449",
	"metadata": {
	"execution": {
	"iopub.execute_input": "2026-02-18T15:58:30.189048Z",
	"iopub.status.busy": "2026-02-18T15:58:30.188546Z",
	"iopub.status.idle": "2026-02-18T15:58:30.197936Z",
	"shell.execute_reply": "2026-02-18T15:58:30.196146Z",
	"shell.execute_reply.started": "2026-02-18T15:58:30.189003Z"
	}
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"{112: {151645: 112},\n",
	" 64: {54: 128, 1639: 80, 10234: 96},\n",
	" 80: {88: 96},\n",
	" 128: {8503: 96, 71: 80},\n",
	" 96: {220: 112}}"
	]
	},
	"execution_count": 10,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"index.get_transitions()"
	]
	},
	{
	"cell_type": "markdown",
	"id": "89816d12-5e25-46da-859c-81f22423ad57",
	"metadata": {},
	"source": [
	"but that's just me."
	]
	},
	{
	"cell_type": "markdown",
	"id": "8803c412-9e9d-4473-9e7f-ab377a3840a4",
	"metadata": {},
	"source": [
	"The mapping from `outlines_core` states to the prettier states is:\n",
	"```\n",
	"64 -> 0\n",
	"128 -> 1\n",
	"96 -> 2\n",
	"80 -> 3\n",
	"112 -> 4\n",
	"```"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3 (ipykernel)",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.12.4"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 5
	}
No results found