Last active
July 28, 2025 14:41
-
-
Save kernkraft235/101aa4c03c43c7072b7010b80cd4c64d to your computer and use it in GitHub Desktop.
logit_bias all dashes set to -100, compared to normal response
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| import sys | |
| import asyncio | |
| import openai | |
| import os | |
| import subprocess | |
| import json | |
| import pathlib | |
| import unicodedata as u | |
| try: | |
| import tiktoken | |
| except Exception: | |
| sys.stderr.write("tiktoken is required. Install with: pip install tiktoken\n") | |
| sys.exit(2) | |
USAGE = 'Usage: python3 script.py "<prompt>"'

# The script takes exactly one positional argument: the prompt text.
if len(sys.argv) != 2:
    print(USAGE)
    sys.exit(1)
prompt = sys.argv[1]

api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    print("Error: Please set the OPENAI_API_KEY environment variable.")
    sys.exit(1)

MODEL = os.getenv("OPENAI_MODEL", "chatgpt-4o-latest")

# Bias applied to every dash token. Validate it up front so a typo in the
# environment yields a clear message instead of a traceback here or a
# rejected request later (the API documents the accepted range as -100..100).
try:
    BIAS_VALUE = int(os.getenv("LOGIT_BIAS_VALUE", "-100"))
except ValueError:
    sys.stderr.write("Error: LOGIT_BIAS_VALUE must be an integer.\n")
    sys.exit(1)
if not -100 <= BIAS_VALUE <= 100:
    sys.stderr.write("Error: LOGIT_BIAS_VALUE must be between -100 and 100.\n")
    sys.exit(1)

# Set REBUILD_DASH_CACHE=1 to ignore any cached token list and rescan.
REBUILD_CACHE = os.getenv("REBUILD_DASH_CACHE", "0") == "1"

# Cache location and version.
CACHE_VERSION = 3
# An empty XDG_CACHE_HOME is treated like an unset one; otherwise the cache
# directory would silently become relative to the current directory.
xdg_cache = os.getenv("XDG_CACHE_HOME") or os.path.join(os.path.expanduser("~"), ".cache")
CACHE_DIR = pathlib.Path(xdg_cache) / "openai-logit-bias"
try:
    CACHE_DIR.mkdir(parents=True, exist_ok=True)
except OSError as exc:
    # The cache is only an optimization, so an unusable directory must not
    # abort the run.
    sys.stderr.write(f"Warning: cannot create cache directory {CACHE_DIR}: {exc}\n")

# Model names may contain path separators or other characters that are unsafe
# in a file name; map those to "_" so the cache file always sits directly
# inside CACHE_DIR. The default model name is unchanged by this mapping.
_safe_model = "".join(ch if ch.isalnum() or ch in "-._" else "_" for ch in MODEL)
CACHE_FILE = CACHE_DIR / f"dash_tokens_{_safe_model}.json"

client = openai.OpenAI(api_key=api_key)
| # ---------------- helpers ----------------- | |
def to_clipboard(text: str) -> None:
    """Copy *text* to the system clipboard on a best-effort basis.

    Tries ``pbcopy``, then ``xclip``, then ``clip``, feeding each the UTF-8
    encoded text on stdin. Stops at the first command that exits
    successfully. If none of them works the function returns silently.
    """
    candidates = (
        ["pbcopy"],
        ["xclip", "-selection", "clipboard"],
        ["clip"],
    )
    for argv in candidates:
        try:
            subprocess.run(argv, input=text.encode("utf-8"), check=True)
        except Exception:
            # Tool missing or failed: fall through to the next candidate.
            pass
        else:
            break
def get_encoding(model: str):
    """Return the tiktoken encoding for *model*.

    Falls back to the ``o200k_base`` encoding when tiktoken cannot resolve
    the model name.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except Exception:
        encoding = tiktoken.get_encoding("o200k_base")
    return encoding
# Dash policy used when scanning the vocabulary:
#   * a plain ASCII hyphen-minus '-' is tolerated, so compound words survive;
#   * a token in which '-' sits next to a space character is rejected;
#   * every non-ASCII dash-like character is rejected.
# The sets below list dash-like code points explicitly; the scanner also
# checks the Unicode "Pd" (dash punctuation) category on top of these.
EXTRA_DASHES = {
    "\u2212",  # MINUS SIGN
    "\u2043",  # HYPHEN BULLET
    "\u2E3A",  # TWO-EM DASH
    "\u2E3B",  # THREE-EM DASH
    "\u301C",  # WAVE DASH
    "\u3030",  # WAVY DASH
    "\uFE31",  # PRESENTATION FORM FOR VERTICAL EM DASH
    "\uFE58",  # SMALL EM DASH
    "\uFE63",  # SMALL HYPHEN-MINUS
    "\uFF0D",  # FULLWIDTH HYPHEN-MINUS
}
DASH_CODEPOINTS = EXTRA_DASHES.union(
    {
        "\u2012",  # FIGURE DASH
        "\u2013",  # EN DASH
        "\u2014",  # EM DASH
        "\u2015",  # HORIZONTAL BAR
    }
)
def contains_disallowed_dash(s: str) -> bool:
    """Return True when the token text *s* contains a dash that should be suppressed.

    A token is flagged when it contains an ASCII hyphen-minus directly
    preceded or followed by a space character, or when any of its characters
    other than ASCII '-' is listed in DASH_CODEPOINTS or has the Unicode
    category "Pd". A bare or word-internal ASCII '-' is allowed.
    """
    # Spaced hyphen inside a single token (" -" or "- ").
    if any(pattern in s for pattern in (" -", "- ")):
        return True
    # Any non-ASCII dash-like character; the ASCII hyphen-minus is exempt.
    return any(
        ch != "-" and (ch in DASH_CODEPOINTS or u.category(ch) == "Pd")
        for ch in s
    )
def build_dash_bias_ids(enc) -> list[int]:
    """Scan the whole vocabulary of *enc* and collect ids of dash-bearing tokens.

    Every id in ``range(enc.n_vocab)`` is decoded on its own. Ids that cannot
    be decoded are skipped. The remaining ones are kept when
    contains_disallowed_dash() flags their decoded text. Ids are returned in
    ascending order.
    """

    def _is_flagged(token_id: int) -> bool:
        # Only the decode step is guarded; an id that fails to decode is
        # simply not a candidate.
        try:
            text = enc.decode([token_id])
        except Exception:
            return False
        return contains_disallowed_dash(text)

    return [token_id for token_id in range(enc.n_vocab) if _is_flagged(token_id)]
def load_or_build_dash_bias_ids(enc) -> list[int]:
    """Return the dash-token ids for *enc*, using the on-disk cache when valid.

    The cache file is used only when REBUILD_CACHE is false, the file exists,
    it parses as a JSON object whose "version" equals CACHE_VERSION, and its
    "ids" entry is a list of integers. In every other case the ids are
    rebuilt with build_dash_bias_ids() and written back to CACHE_FILE.

    Caching is best-effort: an unreadable or corrupt cache triggers a
    rebuild, and a failed write only emits a warning on stderr.
    """
    if not REBUILD_CACHE and CACHE_FILE.exists():
        try:
            with open(CACHE_FILE, "r", encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError):
            # Unreadable file or invalid JSON/encoding (JSONDecodeError and
            # UnicodeDecodeError are ValueError subclasses): rebuild below.
            data = None
        if (
            isinstance(data, dict)
            and data.get("version") == CACHE_VERSION
            and isinstance(data.get("ids"), list)
        ):
            ids = data["ids"]
            # bool is a subclass of int, so exclude JSON true/false explicitly.
            if all(isinstance(x, int) and not isinstance(x, bool) for x in ids):
                return ids

    ids = build_dash_bias_ids(enc)
    try:
        with open(CACHE_FILE, "w", encoding="utf-8") as f:
            json.dump({"version": CACHE_VERSION, "ids": ids}, f)
    except OSError as exc:
        # Not fatal, but worth surfacing: without a cache every run rescans
        # the whole vocabulary.
        sys.stderr.write(f"Warning: could not write dash-token cache {CACHE_FILE}: {exc}\n")
    return ids
| # --------------- request ------------------ | |
async def fetch_response(apply_bias: bool = False):
    """Send the prompt to MODEL and return ``(text, logit_bias)``.

    Args:
        apply_bias: When true, every token id returned by
            load_or_build_dash_bias_ids() is mapped to BIAS_VALUE and sent as
            the request's ``logit_bias``.

    Returns:
        A tuple of the stripped reply text of the first choice and the
        ``logit_bias`` mapping that was sent. The mapping is ``None`` when no
        bias was applied or when no dash tokens were found.
    """
    params = {
        "model": MODEL,
        "messages": [{"role": "user", "content": prompt}],
    }
    if apply_bias:
        enc = get_encoding(MODEL)
        token_ids = load_or_build_dash_bias_ids(enc)
        # The API expects token ids as string keys.
        lb = {str(tid): BIAS_VALUE for tid in token_ids}
        if lb:
            params["logit_bias"] = lb
    # The client call is synchronous; running it in a worker thread lets the
    # biased and unbiased requests overlap.
    response = await asyncio.to_thread(client.chat.completions.create, **params)
    # The API documents message.content as nullable; treat a missing body as
    # empty text rather than failing on None.strip().
    content = response.choices[0].message.content or ""
    return content.strip(), params.get("logit_bias")
| # --------------- main --------------------- | |
async def main():
    """Fetch the unbiased and the dash-suppressed reply concurrently and report both.

    The two replies are formatted as Markdown sections, printed to stdout,
    and copied to the clipboard. The logit_bias map that fetch_response()
    also returns is not part of the output, so it is discarded here.
    """
    # gather() schedules both coroutines itself and returns the results in
    # argument order.
    (normal_response, _), (bias_response, _) = await asyncio.gather(
        fetch_response(apply_bias=False),
        fetch_response(apply_bias=True),
    )
    output = (
        f"\n# Normal Response\n\n{normal_response}\n\n"
        f"---\n\n"
        f"# Logit Bias Response\n\n{bias_response}\n\n"
    )
    print(output)
    to_clipboard(output)


if __name__ == "__main__":
    asyncio.run(main())
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "533": -100, | |
| "1127": -100, | |
| "1585": -100, | |
| "2230": -100, | |
| "2322": -100, | |
| "2733": -100, | |
| "2747": -100, | |
| "7474": -100, | |
| "8290": -100, | |
| "9307": -100, | |
| "12583": -100, | |
| "18882": -100, | |
| "20075": -100, | |
| "20962": -100, | |
| "26659": -100, | |
| "26691": -100, | |
| "27127": -100, | |
| "27418": -100, | |
| "29747": -100, | |
| "30885": -100, | |
| "30892": -100, | |
| "31167": -100, | |
| "31536": -100, | |
| "35251": -100, | |
| "36553": -100, | |
| "36690": -100, | |
| "38771": -100, | |
| "39102": -100, | |
| "40758": -100, | |
| "41648": -100, | |
| "43717": -100, | |
| "46849": -100, | |
| "49795": -100, | |
| "50005": -100, | |
| "51692": -100, | |
| "54067": -100, | |
| "62961": -100, | |
| "63267": -100, | |
| "64860": -100, | |
| "65363": -100, | |
| "66319": -100, | |
| "68058": -100, | |
| "70238": -100, | |
| "74605": -100, | |
| "77201": -100, | |
| "77304": -100, | |
| "79593": -100, | |
| "83064": -100, | |
| "83199": -100, | |
| "83978": -100, | |
| "85865": -100, | |
| "86031": -100, | |
| "87643": -100, | |
| "90877": -100, | |
| "91169": -100, | |
| "93615": -100, | |
| "94012": -100, | |
| "94353": -100, | |
| "94374": -100, | |
| "94828": -100, | |
| "96275": -100, | |
| "96754": -100, | |
| "105024": -100, | |
| "108181": -100, | |
| "109774": -100, | |
| "112305": -100, | |
| "114635": -100, | |
| "118256": -100, | |
| "121630": -100, | |
| "121655": -100, | |
| "123101": -100, | |
| "126952": -100, | |
| "127126": -100, | |
| "128468": -100, | |
| "134820": -100, | |
| "137419": -100, | |
| "140135": -100, | |
| "141084": -100, | |
| "141391": -100, | |
| "142462": -100, | |
| "142654": -100, | |
| "144129": -100, | |
| "144787": -100, | |
| "147994": -100, | |
| "151396": -100, | |
| "155217": -100, | |
| "155638": -100, | |
| "157456": -100, | |
| "160984": -100, | |
| "169785": -100, | |
| "170523": -100, | |
| "178328": -100, | |
| "179041": -100, | |
| "180279": -100, | |
| "180500": -100, | |
| "183122": -100, | |
| "183862": -100, | |
| "187349": -100, | |
| "187708": -100, | |
| "188860": -100, | |
| "190702": -100, | |
| "192749": -100, | |
| "194935": -100, | |
| "196615": -100, | |
| "197618": -100, | |
| "199266": -100 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment