Last active
March 7, 2021 00:07
-
-
Save mvanorder/f503ab2e801bf285adbe77bb99203e19 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 168, | |
| "id": "worthy-screw", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "from pprint import pprint\n", | |
| "import xml.etree.ElementTree as ET\n", | |
| "\n", | |
| "# Load in html file\n", | |
| "# tree = ET.parse(\"C:/Users/Malcolm's laptop/Downloads/message_1.html\")\n", | |
| "tree = ET.parse(\"C:/Users/Malcolm's laptop/Downloads/message_2.html\")\n", | |
| "root = tree.getroot()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 169, | |
| "id": "silver-daniel", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "3962 items in _4t5n\n", | |
| "\n", | |
| "first 5 elements:\n", | |
| "[{'class': '_7s7q'},\n", | |
| " {'class': '_7s7q'},\n", | |
| " {'class': 'pam _3-95 _2pi0 _2lej uiBoxWhite noborder'},\n", | |
| " {'class': 'pam _3-95 _2pi0 _2lej uiBoxWhite noborder'},\n", | |
| " {'class': 'pam _3-95 _2pi0 _2lej uiBoxWhite noborder'}]\n", | |
| "\n", | |
| "last 5 elements:\n", | |
| "[{'class': 'pam _3-95 _2pi0 _2lej uiBoxWhite noborder'},\n", | |
| " {'class': 'pam _3-95 _2pi0 _2lej uiBoxWhite noborder'},\n", | |
| " {'class': 'pam _3-95 _2pi0 _2lej uiBoxWhite noborder'},\n", | |
| " {'class': '_7s7q'},\n", | |
| " {'class': '_7s7q'}]\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# Get the <div class=\"_4t5n\"> element\n", | |
| "_4t5n = [item for item in root.iter() if item.get('class') == '_4t5n'][0]\n", | |
| "\n", | |
| "print(f'{len(_4t5n)} items in _4t5n')\n", | |
| "print(\"\\nfirst 5 elements:\")\n", | |
| "pprint([child.attrib for child in _4t5n[:5]])\n", | |
| "print(\"\\nlast 5 elements:\")\n", | |
| "pprint([child.attrib for child in _4t5n[-5:]])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 170, | |
| "id": "scheduled-boutique", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "\n", | |
| "first 5 elements:\n", | |
| "[{'class': 'pam _3-95 _2pi0 _2lej uiBoxWhite noborder'},\n", | |
| " {'class': 'pam _3-95 _2pi0 _2lej uiBoxWhite noborder'},\n", | |
| " {'class': 'pam _3-95 _2pi0 _2lej uiBoxWhite noborder'},\n", | |
| " {'class': 'pam _3-95 _2pi0 _2lej uiBoxWhite noborder'},\n", | |
| " {'class': 'pam _3-95 _2pi0 _2lej uiBoxWhite noborder'}]\n", | |
| "\n", | |
| "last 5 elements:\n", | |
| "[{'class': 'pam _3-95 _2pi0 _2lej uiBoxWhite noborder'},\n", | |
| " {'class': 'pam _3-95 _2pi0 _2lej uiBoxWhite noborder'},\n", | |
| " {'class': 'pam _3-95 _2pi0 _2lej uiBoxWhite noborder'},\n", | |
| " {'class': 'pam _3-95 _2pi0 _2lej uiBoxWhite noborder'},\n", | |
| " {'class': 'pam _3-95 _2pi0 _2lej uiBoxWhite noborder'}]\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# Get the messages and reverse them\n", | |
| "messages = list(filter(lambda child: child.get('class') != '_7s7q', _4t5n))[::-1]\n", | |
| "\n", | |
| "print(\"\\nfirst 5 elements:\")\n", | |
| "pprint([child.attrib for child in messages[:5]])\n", | |
| "print(\"\\nlast 5 elements:\")\n", | |
| "pprint([child.attrib for child in messages[-5:]])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 171, | |
| "id": "nutritional-richmond", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Add the bottom page number links to the bottom of the messages\n", | |
| "messages.append(_4t5n[-1])\n", | |
| "messages.append(_4t5n[-1])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 172, | |
| "id": "white-globe", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "2 items in _4t5n\n", | |
| "\n", | |
| "All elements in _4t5n:\n", | |
| "[{'class': 'pam _3-95 _2pi0 _2lej uiBoxWhite noborder'},\n", | |
| " {'class': 'pam _3-95 _2pi0 _2lej uiBoxWhite noborder'},\n", | |
| " {'class': 'pam _3-95 _2pi0 _2lej uiBoxWhite noborder'},\n", | |
| " {'class': '_7s7q'},\n", | |
| " {'class': '_7s7q'}]\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# Remove all messages (<div class=\"_7s7q\"> elements) from _4t5n\n", | |
| "for child in list(filter(lambda child: child.get('class') != '_7s7q', _4t5n)):\n", | |
| " _4t5n.remove(child)\n", | |
| " \n", | |
| "# Remove the bottom page number links from _4t5n\n", | |
| "_4t5n.remove(_4t5n[-1])\n", | |
| "_4t5n.remove(_4t5n[-1])\n", | |
| "print(f'{len(_4t5n)} items in _4t5n')\n", | |
| "print(\"\\nAll elements in _4t5n:\")\n", | |
| "pprint([child.attrib for child in messages[-5:]])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 173, | |
| "id": "minute-hanging", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "ename": "SyntaxError", | |
| "evalue": "invalid syntax (<ipython-input-173-cf1e32d18c25>, line 9)", | |
| "output_type": "error", | |
| "traceback": [ | |
| "\u001b[1;36m File \u001b[1;32m\"<ipython-input-173-cf1e32d18c25>\"\u001b[1;36m, line \u001b[1;32m9\u001b[0m\n\u001b[1;33m pprint([child.attrib for child in _4t5n[-5:]])1\u001b[0m\n\u001b[1;37m ^\u001b[0m\n\u001b[1;31mSyntaxError\u001b[0m\u001b[1;31m:\u001b[0m invalid syntax\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# Add reversed messages and bottom page links back to _4t5n\n", | |
| "for child in messages:\n", | |
| " _4t5n.append(child)\n", | |
| "\n", | |
| "print(f'{len(_4t5n)} items in _4t5n')\n", | |
| "print(\"\\nfirst 5 elements:\")\n", | |
| "pprint([child.attrib for child in _4t5n[:5]])\n", | |
| "print(\"\\nlast 5 elements:\")\n", | |
| "pprint([child.attrib for child in _4t5n[-5:]])1" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 174, | |
| "id": "considerable-surprise", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "messages/inbox/saadhassan_12hym6rvtg/message_1_old_at_top.html\n", | |
| "messages/inbox/saadhassan_12hym6rvtg/message_2_old_at_top.html\n", | |
| "messages/inbox/saadhassan_12hym6rvtg/message_1_old_at_top.html\n", | |
| "messages/inbox/saadhassan_12hym6rvtg/message_2_old_at_top.html\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# Update links to point to the new filename\n", | |
| "import re\n", | |
| "for item in _4t5n.iter():\n", | |
| " if '_42ft' in item.attrib.get('class', []):\n", | |
| " item.attrib['href'] = re.sub(r'message_(\\d+).html', r'message_\\1_old_at_top.html', item.attrib['href'])\n", | |
| " print(item.attrib['href'])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 175, | |
| "id": "pleasant-holder", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Output to file\n", | |
| "# tree.write(\"C:/Users/Malcolm's laptop/Downloads/message_1_old_at_top.html\", method=\"html\")\n", | |
| "tree.write(\"C:/Users/Malcolm's laptop/Downloads/message_2_old_at_top.html\", method=\"html\")" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.9.2" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 5 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment