logit_bias with all dash-like tokens set to -100, compared to a normal response. The script sends the same prompt twice, concurrently: once unmodified and once with every dash-bearing token in the model's vocabulary biased to -100, then prints both responses side by side.
#!/usr/bin/env python3
import sys
import asyncio
import openai
import os
import subprocess
import json
import pathlib
import unicodedata as u

try:
    import tiktoken
except Exception:
    sys.stderr.write("tiktoken is required. Install with: pip install tiktoken\n")
    sys.exit(2)
USAGE = 'Usage: python3 script.py "<prompt>"'
if len(sys.argv) != 2:
    print(USAGE)
    sys.exit(1)
prompt = sys.argv[1]

api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    print("Error: Please set the OPENAI_API_KEY environment variable.")
    sys.exit(1)

MODEL = os.getenv("OPENAI_MODEL", "chatgpt-4o-latest")
BIAS_VALUE = int(os.getenv("LOGIT_BIAS_VALUE", "-100"))
REBUILD_CACHE = os.getenv("REBUILD_DASH_CACHE", "0") == "1"

# Cache location and version
CACHE_VERSION = 3
xdg_cache = os.getenv("XDG_CACHE_HOME", os.path.join(os.path.expanduser("~"), ".cache"))
CACHE_DIR = pathlib.Path(xdg_cache) / "openai-logit-bias"
CACHE_DIR.mkdir(parents=True, exist_ok=True)
CACHE_FILE = CACHE_DIR / f"dash_tokens_{MODEL}.json"

client = openai.OpenAI(api_key=api_key)
# ---------------- helpers -----------------
def to_clipboard(text: str) -> None:
    for cmd in (["pbcopy"], ["xclip", "-selection", "clipboard"], ["clip"]):
        try:
            subprocess.run(cmd, input=text.encode("utf-8"), check=True)
            return
        except Exception:
            continue

def get_encoding(model: str):
    try:
        return tiktoken.encoding_for_model(model)
    except Exception:
        return tiktoken.get_encoding("o200k_base")
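# Note on the helpers above: the clipboard fallback order is pbcopy (macOS),
# xclip (X11), then clip (Windows); if none is available the copy is silently
# skipped. get_encoding falls back to o200k_base when tiktoken does not
# recognize the model name.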
# Dash-like detection: allow ASCII hyphen-minus '-' in compound words,
# but block any token that contains a hyphen adjacent to whitespace on either
# side, and block all non-ASCII dash-like characters.
EXTRA_DASHES = {
    "\u2212",  # minus sign
    "\u2043",  # hyphen bullet
    "\u2E3A",  # two-em dash
    "\u2E3B",  # three-em dash
    "\u301C",  # wave dash
    "\u3030",  # wavy dash
    "\uFE31",  # presentation form for vertical em dash
    "\uFE58",  # small em dash
    "\uFE63",  # small hyphen-minus
    "\uFF0D",  # fullwidth hyphen-minus
}
DASH_CODEPOINTS = {"\u2012", "\u2013", "\u2014", "\u2015"} | EXTRA_DASHES

def contains_disallowed_dash(s: str) -> bool:
    # Block spaced hyphen patterns within a single token
    if " -" in s or "- " in s:
        return True
    # Block any non-ASCII dash-like characters
    for ch in s:
        if ch == "-":
            continue
        if ch in DASH_CODEPOINTS:
            return True
        if u.category(ch) == "Pd":
            return True
    return False
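# A few concrete cases of the policy above:
#   contains_disallowed_dash("well-known") -> False  (bare ASCII hyphen in a compound word)
#   contains_disallowed_dash(" - ")        -> True   (hyphen flanked by whitespace)
#   contains_disallowed_dash("\u2014")     -> True   (em dash, Unicode category Pd)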
def build_dash_bias_ids(enc) -> list[int]:
    ids = []
    for tid in range(enc.n_vocab):
        try:
            s = enc.decode([tid])
        except Exception:
            continue
        if contains_disallowed_dash(s):
            ids.append(tid)
    return ids

def load_or_build_dash_bias_ids(enc) -> list[int]:
    if not REBUILD_CACHE and CACHE_FILE.exists():
        try:
            with open(CACHE_FILE, "r", encoding="utf-8") as f:
                data = json.load(f)
            if isinstance(data, dict) and data.get("version") == CACHE_VERSION and isinstance(data.get("ids"), list):
                ids = data["ids"]
                if all(isinstance(x, int) for x in ids):
                    return ids
        except Exception:
            pass
    ids = build_dash_bias_ids(enc)
    try:
        with open(CACHE_FILE, "w", encoding="utf-8") as f:
            json.dump({"version": CACHE_VERSION, "ids": ids}, f)
    except Exception:
        pass
    return ids
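# Cache note: the ID list is stored at
# $XDG_CACHE_HOME/openai-logit-bias/dash_tokens_<MODEL>.json (default ~/.cache).
# Set REBUILD_DASH_CACHE=1 to force a rescan; building the list is a full pass
# over all enc.n_vocab token IDs, so only the first run for a model pays that cost.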
# --------------- request ------------------
async def fetch_response(apply_bias: bool = False):
    params = {
        "model": MODEL,
        "messages": [{"role": "user", "content": prompt}],
    }
    if apply_bias:
        enc = get_encoding(MODEL)
        token_ids = load_or_build_dash_bias_ids(enc)
        lb = {str(tid): BIAS_VALUE for tid in token_ids}
        if lb:
            params["logit_bias"] = lb
    response = await asyncio.to_thread(client.chat.completions.create, **params)
    return response.choices[0].message.content.strip(), params.get("logit_bias")
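# For reference, the "logit_bias" parameter built above is sent as a JSON object
# mapping token-ID strings to bias values in [-100, 100]; -100 effectively bans
# a token from being sampled. The full map this script sends appears at the
# bottom of the gist.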
# --------------- main ---------------------
async def main():
    normal_task = asyncio.create_task(fetch_response(apply_bias=False))
    bias_task = asyncio.create_task(fetch_response(apply_bias=True))
    (normal_response, _), (bias_response, bias_map) = await asyncio.gather(normal_task, bias_task)
    bias_summary = json.dumps(bias_map or {}, indent=2, ensure_ascii=False)
    output = (
        f"\n# Normal Response\n\n{normal_response}\n\n"
        f"---\n\n"
        f"# Logit Bias Response\n\n{bias_response}\n\n"
        # f"---\n\n"
        # f"# Active Logit Bias Map (token_id -> bias)\n\n{bias_summary}\n"
    )
    print(output)
    to_clipboard(output)

if __name__ == "__main__":
    asyncio.run(main())
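Example invocation (a sketch; "script.py" follows the USAGE string above, and the key value is a placeholder):

    export OPENAI_API_KEY=...
    python3 script.py "Rewrite this paragraph without using any dashes."

The JSON object below is the logit_bias map the biased request sends, i.e. what the commented-out "Active Logit Bias Map" section of the output template would print: each key is a token ID whose decoded string contains a disallowed dash, and each is biased to -100.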
{
"533": -100,
"1127": -100,
"1585": -100,
"2230": -100,
"2322": -100,
"2733": -100,
"2747": -100,
"7474": -100,
"8290": -100,
"9307": -100,
"12583": -100,
"18882": -100,
"20075": -100,
"20962": -100,
"26659": -100,
"26691": -100,
"27127": -100,
"27418": -100,
"29747": -100,
"30885": -100,
"30892": -100,
"31167": -100,
"31536": -100,
"35251": -100,
"36553": -100,
"36690": -100,
"38771": -100,
"39102": -100,
"40758": -100,
"41648": -100,
"43717": -100,
"46849": -100,
"49795": -100,
"50005": -100,
"51692": -100,
"54067": -100,
"62961": -100,
"63267": -100,
"64860": -100,
"65363": -100,
"66319": -100,
"68058": -100,
"70238": -100,
"74605": -100,
"77201": -100,
"77304": -100,
"79593": -100,
"83064": -100,
"83199": -100,
"83978": -100,
"85865": -100,
"86031": -100,
"87643": -100,
"90877": -100,
"91169": -100,
"93615": -100,
"94012": -100,
"94353": -100,
"94374": -100,
"94828": -100,
"96275": -100,
"96754": -100,
"105024": -100,
"108181": -100,
"109774": -100,
"112305": -100,
"114635": -100,
"118256": -100,
"121630": -100,
"121655": -100,
"123101": -100,
"126952": -100,
"127126": -100,
"128468": -100,
"134820": -100,
"137419": -100,
"140135": -100,
"141084": -100,
"141391": -100,
"142462": -100,
"142654": -100,
"144129": -100,
"144787": -100,
"147994": -100,
"151396": -100,
"155217": -100,
"155638": -100,
"157456": -100,
"160984": -100,
"169785": -100,
"170523": -100,
"178328": -100,
"179041": -100,
"180279": -100,
"180500": -100,
"183122": -100,
"183862": -100,
"187349": -100,
"187708": -100,
"188860": -100,
"190702": -100,
"192749": -100,
"194935": -100,
"196615": -100,
"197618": -100,
"199266": -100
}
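To see which strings these IDs stand for, they can be decoded with the same fallback encoding the script uses (a minimal sketch, assuming o200k_base is the right encoding for this model, as get_encoding assumes):

import tiktoken

enc = tiktoken.get_encoding("o200k_base")
for tid in (533, 1127, 199266):
    # Every ID in the map decodes to a string containing a dash-like
    # character or a spaced hyphen, per contains_disallowed_dash.
    print(tid, repr(enc.decode([tid])))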