Created
December 4, 2025 23:16
-
-
Save veritem/6b40ebf73af63045f9837e3061f1f7f2 to your computer and use it in GitHub Desktop.
problem5-toc
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| Grammar = dict[str, list[list[str]]] | |
| def get_start_symbol(grammar: Grammar) -> str: | |
| return next(iter(grammar)) | |
| def fresh_symbol(existing: set[str], base: str) -> str: | |
| if base not in existing: | |
| return base | |
| i = 1 | |
| while f"{base}_{i}" in existing: | |
| i += 1 | |
| return f"{base}_{i}" | |
| def cfg_union(G1: Grammar, G2: Grammar) -> Grammar: | |
| all_nonterminals = set(G1.keys()) | set(G2.keys()) | |
| S1 = get_start_symbol(G1) | |
| S2 = get_start_symbol(G2) | |
| S_new = fresh_symbol(all_nonterminals, "union") | |
| result: Grammar = {} | |
| result.update(G1) | |
| result.update(G2) | |
| result[S_new] = [[S1], [S2]] | |
| return result | |
| def cfg_concatenation(G1: Grammar, G2: Grammar) -> Grammar: | |
| all_nonterminals = set(G1.keys()) | set(G2.keys()) | |
| S1 = get_start_symbol(G1) | |
| S2 = get_start_symbol(G2) | |
| S_new = fresh_symbol(all_nonterminals, "concat") | |
| result: Grammar = {} | |
| result.update(G1) | |
| result.update(G2) | |
| result[S_new] = [[S1, S2]] | |
| return result | |
| def cfg_kleene_star(G: Grammar) -> Grammar: | |
| all_nonterminals = set(G.keys()) | |
| S0 = get_start_symbol(G) | |
| S_star = fresh_symbol(all_nonterminals, "star") | |
| result: Grammar = {} | |
| result.update(G) | |
| result[S_star] = [ | |
| [], | |
| [S_star, S0], | |
| ] | |
| return result | |
| def regex_to_cfg(alphabet: set[str], regex: str) -> Grammar: | |
| regex_str = "".join(c for c in regex if not c.isspace()) | |
| pos = 0 | |
| grammar: Grammar = {} | |
| counter = 1 | |
| def new_nt() -> str: | |
| nonlocal counter | |
| name = f"S{counter}" | |
| counter += 1 | |
| return name | |
| def peek() -> str: | |
| return regex_str[pos] if pos < len(regex_str) else "" | |
| def consume(ch: str) -> None: | |
| nonlocal pos | |
| if peek() != ch: | |
| raise ValueError(f"Expected '{ch}' at position {pos}, got '{peek()}'") | |
| pos += 1 | |
| def parse_expr() -> str: | |
| """ | |
| expr -> term ((U or |) term)* | |
| """ | |
| left_nt = parse_term() | |
| while peek() in ("U", "|"): | |
| consume(peek()) | |
| right_nt = parse_term() | |
| nt = new_nt() | |
| grammar[nt] = [[left_nt], [right_nt]] | |
| left_nt = nt | |
| return left_nt | |
| def parse_term() -> str: | |
| """ | |
| term -> factor+ (concatenation) | |
| """ | |
| factors = [] | |
| while True: | |
| c = peek() | |
| if c == "" or c in ")U|": | |
| break | |
| factors.append(parse_factor()) | |
| if not factors: | |
| nt = new_nt() | |
| grammar[nt] = [[]] | |
| return nt | |
| cur = factors[0] | |
| for nxt in factors[1:]: | |
| nt = new_nt() | |
| grammar[nt] = [[cur, nxt]] | |
| cur = nt | |
| return cur | |
| def parse_factor() -> str: | |
| """ | |
| factor -> atom ('*')* | |
| """ | |
| base = parse_atom() | |
| while peek() == '*': | |
| consume('*') | |
| nt = new_nt() | |
| grammar[nt] = [ | |
| [], # epsilon | |
| [nt, base], | |
| ] | |
| base = nt | |
| return base | |
| def parse_atom() -> str: | |
| """ | |
| atom -> '(' expr ')' | ε | literal-run | |
| where literal-run is a maximal run of alphabet characters, | |
| e.g. "10" or "000", which becomes one nonterminal A -> 1 0 or A -> 0 0 0. | |
| """ | |
| c = peek() | |
| if c == '(': | |
| consume('(') | |
| sym_nt = parse_expr() | |
| if peek() != ')': | |
| raise ValueError(f"Expected ')' at position {pos}") | |
| consume(')') | |
| return sym_nt | |
| elif c == 'ε': | |
| consume('ε') | |
| nt = new_nt() | |
| grammar[nt] = [[]] | |
| return nt | |
| elif c in alphabet: | |
| letters: list[str] = [] | |
| while peek() in alphabet: | |
| letters.append(peek()) | |
| consume(peek()) | |
| nt = new_nt() | |
| grammar[nt] = [letters] | |
| return nt | |
| else: | |
| raise ValueError(f"Unexpected character '{c}' at position {pos}") | |
| start_nt = parse_expr() | |
| if pos != len(regex_str): | |
| raise ValueError(f"Unexpected trailing chars at position {pos}: {regex_str[pos:]}") | |
| ordered: Grammar = {start_nt: grammar[start_nt]} | |
| for nt, prods in grammar.items(): | |
| if nt != start_nt: | |
| ordered[nt] = prods | |
| return ordered | |
| def print_grammar(grammar: Grammar) -> None: | |
| start = get_start_symbol(grammar) | |
| print(f"Start symbol: {start}") | |
| print("Productions:") | |
| for nt, productions in grammar.items(): | |
| rhs_strings = [] | |
| for rhs in productions: | |
| if len(rhs) == 0: | |
| rhs_strings.append("ε") | |
| else: | |
| rhs_strings.append(" ".join(rhs)) | |
| print(f" {nt} -> " + " | ".join(rhs_strings)) | |
| print() | |
| if __name__ == "__main__": | |
| cfg1 = { | |
| "S": [ | |
| ["(", "S", ")"], | |
| ["S", "+", "S"], | |
| ["-", "S"], | |
| ["p"], | |
| ["q"], | |
| ] | |
| } | |
| cfg2 = { | |
| "T": [ | |
| ["p"], | |
| ["q", "T"], | |
| ] | |
| } | |
| union = cfg_union(cfg1, cfg2) | |
| concat = cfg_concatenation(cfg1, cfg2) | |
| star = cfg_kleene_star(cfg1) | |
| print("cfg1 ∪ cfg2") | |
| print_grammar(union) | |
| print("cfg2 · cfg2") | |
| print_grammar(concat) | |
| print("cfg1*") | |
| print_grammar(star) | |
| regex_grammar = regex_to_cfg({"0", "1"}, "0 U 10(000)*") | |
| print_grammar(regex_grammar) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment