lucaslouca/strip_html.py

## strip_html.py
import os
import sys
import re


# "<body>" may carry attributes (e.g. <body class="...">) and any letter case.
_BODY_OPEN = re.compile(r'<body(?:\s[^>]*)?>', re.IGNORECASE)
_BODY_CLOSE = re.compile(r'</body\s*>', re.IGNORECASE)
_ANCHOR_LINK = re.compile('<a class="anchor-link" .*?>.*?</a>')


def _extract_body(content: str) -> str:
    """Return the markup between the opening and closing body tags.

    When the opening tag is absent the extraction starts at the beginning of
    *content*; when the closing tag is absent it runs to the end. This avoids
    slicing with the -1 that ``str.find`` yields for a missing tag.
    """
    opening = _BODY_OPEN.search(content)
    start = opening.end() if opening else 0
    # Only look for the closing tag after the opening one.
    closing = _BODY_CLOSE.search(content, start)
    end = closing.start() if closing else len(content)
    return content[start:end]


def strip(path: str) -> None:
    """Extract the body of an HTML file and save it next to the original.

    The text between ``<body ...>`` and ``</body>`` is taken, every
    ``<a class="anchor-link" ...>...</a>`` element is removed from it, and
    the result is written to a file named after *path* with its extension
    replaced by ``_body.txt`` (``page.html`` -> ``page_body.txt``).

    Args:
        path: Path of the HTML file to read.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
        UnicodeDecodeError: If the input is not valid UTF-8.
    """
    # Read and write as UTF-8 so the result does not depend on the locale.
    with open(path, "r", encoding="utf-8") as html_file:
        content = html_file.read()

    # Remove anchor links as well
    body = _ANCHOR_LINK.sub('', _extract_body(content))

    # Write body to new file
    new_file_path, _ = os.path.splitext(path)
    new_file_path += "_body.txt"
    with open(new_file_path, "w", encoding="utf-8") as out:
        out.write(body)


def main():
    """Command-line entry point: strip the HTML file named by the first argument.

    Prints a usage line and does nothing else when no file argument is given.
    """
    argv = sys.argv
    if len(argv) > 1:
        strip(argv[1])
        return

    # No file argument supplied: tell the user how to call the script.
    print("Usage: %s file" % argv[0])


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()
	import os
	import sys
	import re


	def strip(path: str):
	    """Extract the body of an HTML file and save it next to the original.

	    The text between ``<body ...>`` and ``</body>`` is taken, every
	    ``<a class="anchor-link" ...>...</a>`` element is removed from it, and
	    the result is written to a file named after *path* with its extension
	    replaced by ``_body.txt``.

	    Args:
	        path: Path of the HTML file to read.
	    """
	    # Read and write as UTF-8 so the result does not depend on the locale.
	    with open(path, "r", encoding="utf-8") as html_file:
	        content = html_file.read()

	    # Accept "<body>" with or without attributes; when a tag is absent fall
	    # back to the document edge instead of slicing from find()'s -1.
	    opening = re.search(r'<body(?:\s[^>]*)?>', content, re.IGNORECASE)
	    start = opening.end() if opening else 0
	    closing = re.compile(r'</body\s*>', re.IGNORECASE).search(content, start)
	    end = closing.start() if closing else len(content)
	    body = content[start:end]

	    # Remove anchor links as well (non-greedy ".*?" so each link is
	    # matched on its own rather than at most one character).
	    anchor = re.compile('<a class="anchor-link" .*?>.*?</a>')
	    body = re.sub(anchor, '', body)

	    # Write body to new file
	    new_file_path, _ = os.path.splitext(path)
	    new_file_path += "_body.txt"
	    with open(new_file_path, "w", encoding="utf-8") as out:
	        out.write(body)


	def main():
	    """Command-line entry point: strip the HTML file named by the first argument.

	    Prints a usage line and returns when no file argument is given.
	    """
	    if len(sys.argv) <= 1:
	        print("Usage: %s file" % sys.argv[0])
	        return

	    strip(sys.argv[1])


	# Run the command-line entry point only when executed as a script.
	if __name__ == '__main__':
	    main()
No results found