Skip to content

Instantly share code, notes, and snippets.

@veritem
Created December 4, 2025 23:16
Show Gist options
  • Select an option

  • Save veritem/6b40ebf73af63045f9837e3061f1f7f2 to your computer and use it in GitHub Desktop.

Select an option

Save veritem/6b40ebf73af63045f9837e3061f1f7f2 to your computer and use it in GitHub Desktop.
problem5-toc
#!/usr/bin/env python3
Grammar = dict[str, list[list[str]]]
def get_start_symbol(grammar: Grammar) -> str:
return next(iter(grammar))
def fresh_symbol(existing: set[str], base: str) -> str:
if base not in existing:
return base
i = 1
while f"{base}_{i}" in existing:
i += 1
return f"{base}_{i}"
def cfg_union(G1: Grammar, G2: Grammar) -> Grammar:
all_nonterminals = set(G1.keys()) | set(G2.keys())
S1 = get_start_symbol(G1)
S2 = get_start_symbol(G2)
S_new = fresh_symbol(all_nonterminals, "union")
result: Grammar = {}
result.update(G1)
result.update(G2)
result[S_new] = [[S1], [S2]]
return result
def cfg_concatenation(G1: Grammar, G2: Grammar) -> Grammar:
all_nonterminals = set(G1.keys()) | set(G2.keys())
S1 = get_start_symbol(G1)
S2 = get_start_symbol(G2)
S_new = fresh_symbol(all_nonterminals, "concat")
result: Grammar = {}
result.update(G1)
result.update(G2)
result[S_new] = [[S1, S2]]
return result
def cfg_kleene_star(G: Grammar) -> Grammar:
all_nonterminals = set(G.keys())
S0 = get_start_symbol(G)
S_star = fresh_symbol(all_nonterminals, "star")
result: Grammar = {}
result.update(G)
result[S_star] = [
[],
[S_star, S0],
]
return result
def regex_to_cfg(alphabet: set[str], regex: str) -> Grammar:
regex_str = "".join(c for c in regex if not c.isspace())
pos = 0
grammar: Grammar = {}
counter = 1
def new_nt() -> str:
nonlocal counter
name = f"S{counter}"
counter += 1
return name
def peek() -> str:
return regex_str[pos] if pos < len(regex_str) else ""
def consume(ch: str) -> None:
nonlocal pos
if peek() != ch:
raise ValueError(f"Expected '{ch}' at position {pos}, got '{peek()}'")
pos += 1
def parse_expr() -> str:
"""
expr -> term ((U or |) term)*
"""
left_nt = parse_term()
while peek() in ("U", "|"):
consume(peek())
right_nt = parse_term()
nt = new_nt()
grammar[nt] = [[left_nt], [right_nt]]
left_nt = nt
return left_nt
def parse_term() -> str:
"""
term -> factor+ (concatenation)
"""
factors = []
while True:
c = peek()
if c == "" or c in ")U|":
break
factors.append(parse_factor())
if not factors:
nt = new_nt()
grammar[nt] = [[]]
return nt
cur = factors[0]
for nxt in factors[1:]:
nt = new_nt()
grammar[nt] = [[cur, nxt]]
cur = nt
return cur
def parse_factor() -> str:
"""
factor -> atom ('*')*
"""
base = parse_atom()
while peek() == '*':
consume('*')
nt = new_nt()
grammar[nt] = [
[], # epsilon
[nt, base],
]
base = nt
return base
def parse_atom() -> str:
"""
atom -> '(' expr ')' | ε | literal-run
where literal-run is a maximal run of alphabet characters,
e.g. "10" or "000", which becomes one nonterminal A -> 1 0 or A -> 0 0 0.
"""
c = peek()
if c == '(':
consume('(')
sym_nt = parse_expr()
if peek() != ')':
raise ValueError(f"Expected ')' at position {pos}")
consume(')')
return sym_nt
elif c == 'ε':
consume('ε')
nt = new_nt()
grammar[nt] = [[]]
return nt
elif c in alphabet:
letters: list[str] = []
while peek() in alphabet:
letters.append(peek())
consume(peek())
nt = new_nt()
grammar[nt] = [letters]
return nt
else:
raise ValueError(f"Unexpected character '{c}' at position {pos}")
start_nt = parse_expr()
if pos != len(regex_str):
raise ValueError(f"Unexpected trailing chars at position {pos}: {regex_str[pos:]}")
ordered: Grammar = {start_nt: grammar[start_nt]}
for nt, prods in grammar.items():
if nt != start_nt:
ordered[nt] = prods
return ordered
def print_grammar(grammar: Grammar) -> None:
start = get_start_symbol(grammar)
print(f"Start symbol: {start}")
print("Productions:")
for nt, productions in grammar.items():
rhs_strings = []
for rhs in productions:
if len(rhs) == 0:
rhs_strings.append("ε")
else:
rhs_strings.append(" ".join(rhs))
print(f" {nt} -> " + " | ".join(rhs_strings))
print()
if __name__ == "__main__":
cfg1 = {
"S": [
["(", "S", ")"],
["S", "+", "S"],
["-", "S"],
["p"],
["q"],
]
}
cfg2 = {
"T": [
["p"],
["q", "T"],
]
}
union = cfg_union(cfg1, cfg2)
concat = cfg_concatenation(cfg1, cfg2)
star = cfg_kleene_star(cfg1)
print("cfg1 ∪ cfg2")
print_grammar(union)
print("cfg2 · cfg2")
print_grammar(concat)
print("cfg1*")
print_grammar(star)
regex_grammar = regex_to_cfg({"0", "1"}, "0 U 10(000)*")
print_grammar(regex_grammar)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment