Skip to content

Instantly share code, notes, and snippets.

@ptmcg
Last active December 30, 2025 17:42
Show Gist options
  • Select an option

  • Save ptmcg/39fca2fdb18386a1d7f90a0fc36890d9 to your computer and use it in GitHub Desktop.

Select an option

Save ptmcg/39fca2fdb18386a1d7f90a0fc36890d9 to your computer and use it in GitHub Desktop.
Pyparsing regex parser to detect partial matches
#
# pyparsing script to convert a regular expression into a pyparsing parser, and detect partial matches
# Supports:
# - *, +, {n} and {m,n} repetition
# - ? optional elements
# - [] character ranges
# - () grouping
# - | alternation
#
import pyparsing as pp
# Turn on pyparsing's packrat memoization for all parsers built below.
pp.ParserElement.enable_packrat()

# Partial-match detectors.
# Succeeds only at the end of the input and adds a "partial" tag to the
# parse results.
partial_match_end = (pp.StringEnd() + pp.Tag("partial")).set_name("<PARTIAL>")

# Either the end-of-input marker above, or the empty string as a fallback.
maybe_partial_match = (partial_match_end | "").set_name("<PARTIAL?>")
def handle_range(toks):
    """Convert a bracketed character class into a one-character matcher.

    Parse action for ``[...]`` terms of the regex grammar.

    Args:
        toks: parse results whose first token is the complete bracket
            expression as a string, e.g. ``"[a-z0-9]"`` or ``"[^@.]"``.

    Returns:
        A pyparsing element matching exactly one character: a ``pp.Char``
        over the listed characters, or a ``pp.CharsNotIn`` (``exact=1``)
        when the class is negated with a leading ``^``.  The element is
        named after the original bracket expression.
    """
    spec = toks[0]
    if spec.startswith("[^"):
        # srange() only expands the listed characters and does not apply the
        # negation itself, so strip the caret before expanding and invert the
        # set here.
        excluded = pp.srange("[" + spec[2:])
        return pp.CharsNotIn(excluded, exact=1).set_name(spec)
    return pp.Char(pp.srange(spec)).set_name(spec)
def handle_repetition(toks):
    """Build the matcher for a quantified term.

    Parse action for the repetition level of the regex grammar; handles
    ``*``, ``+``, ``?``, ``{n}`` and ``{m,n}``.

    Args:
        toks: parse results whose first item is the group
            ``[operand, quantifier, ...]``.  ``{n}`` is reported through the
            ``count`` results name, ``{m,n}`` through ``minCount`` and
            ``maxCount``.

    Returns:
        A pyparsing expression for the repeated operand.  For counted
        repetition, every mandatory copy after the first may instead be
        satisfied by the end of the input, which tags the result as
        ``partial``.  Returns ``None`` (tokens left unchanged) if no known
        quantifier is present.

    Raises:
        pp.ParseFatalException: if ``{m,n}`` is given with ``n < m``.
    """
    group = toks[0]
    # Only the operand and the last quantifier token are looked at.
    expr, *_, rep = group

    if rep == "?":
        return expr | ""

    if rep in ("+", "*"):
        maybe_partial_expr = (maybe_partial_match + expr).set_name(str(expr))
        one_or_more = expr + pp.ZeroOrMore(maybe_partial_expr)
        return one_or_more if rep == "+" else one_or_more | ""

    if "count" in group:
        mincount = maxcount = int(group["count"])
    elif "minCount" in group:
        mincount = int(group["minCount"])
        maxcount = int(group["maxCount"])
    else:
        return None

    if maxcount < mincount:
        raise pp.ParseFatalException(
            "", 0, f"invalid repetition range {{{mincount},{maxcount}}}"
        )

    # A mandatory copy after the first one is either the operand itself or
    # the end of the input (a partial match).  This must be an alternation:
    # in a sequence "end marker, then operand" the operand could never match
    # once the input is exhausted.
    partial_or_expr = (partial_match_end | expr).set_name(str(expr))

    parts = []
    if mincount > 0:
        parts.append(expr)
        parts.extend([partial_or_expr] * (mincount - 1))
    if maxcount > mincount:
        # The remaining copies are optional, so running out of input there
        # is already a complete match.
        parts.append(expr * (0, maxcount - mincount))

    if not parts:
        # {0} and {0,0}: match nothing, without multiplying by a negative
        # count.
        return pp.Empty()
    return pp.And(parts)
def handle_literal(toks):
    """Convert literal regex characters into a ``pp.Literal``.

    Parse action for literal terms.  Each token is either a plain character
    or a two-character escape beginning with a backslash.

    Args:
        toks: parse results containing the literal tokens.

    Returns:
        ``pp.Literal`` for the concatenated, unescaped text.  The control
        escapes ``\\t``, ``\\n``, ``\\r``, ``\\f`` and ``\\v`` are translated
        to the characters they denote; any other escaped character stands
        for itself (e.g. ``\\.`` is a literal dot).
    """
    control_escapes = {"t": "\t", "n": "\n", "r": "\r", "f": "\f", "v": "\v"}

    pieces = []
    for tok in toks:
        if tok[0] == "\\":
            escaped = tok[1]
            pieces.append(control_escapes.get(escaped, escaped))
        else:
            pieces.append(tok)
    return pp.Literal("".join(pieces))
def handle_macro(toks):
    """Convert a character-class macro (``\\d``, ``\\w`` or ``\\s``) into a matcher.

    Args:
        toks: parse results whose first token is the two-character macro
            text; its second character selects the class.

    Returns:
        A pyparsing element matching exactly one character of the class:
        a digit for ``d``, a letter/digit/underscore for ``w``, or an ASCII
        whitespace character for ``s``.

    Raises:
        pp.ParseFatalException: if the macro character is not ``d``, ``w``
            or ``s``.
    """
    macro_char = toks[0][1]
    if macro_char == "d":
        return pp.Char("0123456789").set_name("[0-9]")
    if macro_char == "w":
        word_class = "[A-Za-z0-9_]"
        return pp.Char(pp.srange(word_class)).set_name(word_class)
    if macro_char == "s":
        # One whitespace character per \s, so that \s\s or \s{2} behave like
        # a regex; leave_whitespace() keeps pyparsing from skipping over the
        # very characters this element is supposed to match.
        return pp.Char(" \t\n\r\f\v").leave_whitespace().set_name(r"\s")
    raise pp.ParseFatalException(
        "", 0, "unsupported macro character (" + macro_char + ")"
    )
def handle_sequence(toks):
    """Chain adjacent sub-expressions into one sequence matcher.

    Parse action for implicit concatenation.  The first element is required
    as-is; each following element is wrapped so that the end of the input is
    accepted in its place (tagging the result as partial).

    Args:
        toks: parse results whose first item is the group of already-built
            sub-expressions, in order.

    Returns:
        The streamlined sequence expression.
    """
    head, *tail = toks[0]
    followers = [
        (partial_match_end | item).set_name(str(item)) for item in tail
    ]
    sequence = head + pp.And(followers)
    return sequence.streamline()
def handle_dot():
    """Build the matcher for the regex ``.`` wildcard.

    Returns:
        A ``pp.Char`` accepting one printable character or a space, named
        with the collapsed range notation of that character set.
    """
    wildcard_chars = pp.printables + " "
    display_name = pp.util._collapse_string_to_ranges(wildcard_chars)
    wildcard = pp.Char(wildcard_chars)
    wildcard.set_name(display_name)
    return wildcard
def handle_alternative(toks):
    """Combine the branches of a ``|`` alternation into a single ``pp.Or``.

    Args:
        toks: parse results whose first item is the group of already-built
            branch expressions.

    Returns:
        A ``pp.Or`` over all branches.
    """
    branches = toks[0]
    return pp.Or(branches)
class PartialMatcher:
    """Convert a regular expression into a pyparsing matcher.

    The resulting matcher is built by the module's ``handle_*`` parse
    actions, which insert end-of-input markers so that an input that stops
    early is reported with a ``partial`` tag.

    Attributes set by ``__init__``:
        regex: the regular expression text passed in.
        parser: the pyparsing grammar used to read ``regex``.
        matcher: the matcher built from ``regex``, or ``None`` if ``regex``
            could not be parsed.
    """

    def __init__(self, regex: str):
        """Build the matcher for *regex*.

        A regex the grammar cannot parse completely is reported by printing
        the parse error explanation; ``self.matcher`` is then ``None``.

        Args:
            regex: regular expression to convert.
        """
        self.regex = regex
        self.parser = self._make_parser()
        # Always define the attribute so callers can detect a failed
        # conversion instead of hitting a missing attribute later.
        self.matcher = None
        try:
            # parse_all=True rejects a regex of which only a prefix is
            # understood, rather than silently building a matcher for just
            # that prefix.
            self.matcher = self.parser.parse_string(
                self.regex, parse_all=True
            )[0].streamline()
            # Concatenate the matched pieces into a single string token.
            self.matcher.add_parse_action("".join)
        except pp.ParseException as pe:
            print(pe.explain())

    def _make_parser(self) -> pp.ParserElement:
        """Return the pyparsing grammar that reads a regular expression.

        The grammar's parse actions are the module-level ``handle_*``
        functions, so parsing a regex with it yields a pyparsing expression
        rather than plain tokens.

        Note: this changes pyparsing's default whitespace characters on the
        ``ParserElement`` class to the empty string.
        """
        # Set on the ParserElement class itself, before any element of the
        # grammar is created.
        pp.ParserElement.set_default_whitespace_chars("")
        lbrack, rbrack, lbrace, rbrace, lparen, rparen, colon, qmark, comma = pp.Literal.using_each(
            "[]{}():?,"
        )
        re_macro = pp.Combine("\\" + pp.one_of("d w s")).set_name(r"\[dws]")
        printable_chars = pp.one_of(list(pp.printables)).set_name(f"W:{pp.util._collapse_string_to_ranges(pp.printables)}")
        # A backslash escape that is not one of the \d \w \s macros.
        escaped_char = ~re_macro + pp.Combine("\\" + printable_chars)
        # Printable characters without the listed regex metacharacters, plus
        # space and tab.
        literal_chars = (
            [*(c for c in pp.printables if c not in r"\[]{}().*?+|"), " ", "\t"]
        )
        re_literal_char = pp.one_of(literal_chars).set_name(f"W:{pp.util._collapse_string_to_ranges(literal_chars)}")
        re_range = pp.Combine(lbrack + pp.SkipTo(rbrack, ignore=escaped_char) + rbrack)  # type: ignore
        re_literal = escaped_char | re_literal_char
        re_non_capture_group = pp.Suppress("?:")
        re_dot = pp.Literal(".")
        re_int = pp.Word(pp.nums)
        # {n}, {m,n}, or one of * + ?
        repetition = (
            (lbrace.suppress() + re_int("count") + rbrace.suppress())
            | (lbrace.suppress() + re_int("minCount") + comma.suppress() + re_int("maxCount") + rbrace.suppress())
            | pp.one_of(list("*+?"))
        )

        re_range.add_parse_action(handle_range)
        re_literal.add_parse_action(handle_literal)
        re_macro.add_parse_action(handle_macro)
        re_dot.add_parse_action(handle_dot)

        re_term = re_literal | re_range | re_macro | re_dot | re_non_capture_group

        # Names the still-unnamed elements after the local variables that
        # hold them, so the variable names above are significant.
        pp.autoname_elements()

        # Operator levels as passed to infix_notation: postfix repetition,
        # implicit concatenation (empty operator), then "|" alternation.
        re_expr = pp.infix_notation(
            re_term,
            [
                (repetition, 1, pp.OpAssoc.LEFT, handle_repetition),
                (pp.Empty(), 2, pp.OpAssoc.LEFT, handle_sequence),
                (pp.Suppress("|"), 2, pp.OpAssoc.LEFT, handle_alternative),
            ],
        )

        _parser = re_expr
        return _parser.set_name("_parser")
def main():
    """Demo: classify sample inputs against an e-mail-like regex.

    Each non-blank sample is printed followed by ``PARTIAL``, ``COMPLETE``
    (both with the parsed token list) or ``INVALID`` (with the parse error
    explanation), then a blank line.
    """
    email_matcher = PartialMatcher(r"\w+(\.\w+)*@\w+(\.\w+)+")

    # Debugging aids, disabled by default:
    # print(email_matcher.matcher)
    # email_matcher.parser.create_diagram("partial_regex_match_diagram.html", show_results_names=True)

    samples = (r"""
john
john.doe@
john.doe@blah
john.doe@blah.com
john.doe@mail.blah.com
john@mail.blah.com
john..doe@mail.blah.com
""".splitlines())

    stripped_samples = (line.strip() for line in samples)
    for sample in filter(None, stripped_samples):
        print(sample, end=" ")
        try:
            result = email_matcher.matcher.parse_string(sample, parse_all=True)
        except pp.ParseException as pe:
            print("-> INVALID")
            print(pe.explain())
        else:
            if result.partial:
                print(f"-> PARTIAL ({result.as_list()})")
            else:
                print(f"-> COMPLETE ({result.as_list()})")
        print()


# Run the demo only when this file is executed as a script.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment