Skip to content

Instantly share code, notes, and snippets.

@rosstex
rosstex / soup_xpath_gen.py
Last active June 16, 2023 18:59
Deterministic XPath generation of BeautifulSoup elements for web crawling
import html
# Quote and square-bracket characters, held in a set for constant-time
# membership tests. Written as a set literal so no intermediate list is built.
# NOTE(review): presumably these are characters that cannot be embedded safely
# in a generated XPath expression -- the usage is outside this view; confirm.
BAD_CHARS = {'"', "'", "[", "]"}
# Generates an XPath string for a given BeautifulSoup element.
def soup_xpath_gen(element):
xpath = ""
while element.name != 'document':
if element.name == 'html':
xpath = "/html/" + xpath