Skip to content

Instantly share code, notes, and snippets.

@mvanorder
Last active March 7, 2021 00:07
Show Gist options
  • Select an option

  • Save mvanorder/f503ab2e801bf285adbe77bb99203e19 to your computer and use it in GitHub Desktop.

Select an option

Save mvanorder/f503ab2e801bf285adbe77bb99203e19 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 168,
"id": "worthy-screw",
"metadata": {},
"outputs": [],
"source": [
"from pprint import pprint\n",
"import xml.etree.ElementTree as ET\n",
"\n",
"# Parse the exported page with ElementTree, keeping both the tree and its root element.\n",
"# To process message_1.html instead, change the file name in source_path.\n",
"source_path = \"C:/Users/Malcolm's laptop/Downloads/message_2.html\"\n",
"tree = ET.parse(source_path)\n",
"root = tree.getroot()"
]
},
{
"cell_type": "code",
"execution_count": 169,
"id": "silver-daniel",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"3962 items in _4t5n\n",
"\n",
"first 5 elements:\n",
"[{'class': '_7s7q'},\n",
" {'class': '_7s7q'},\n",
" {'class': 'pam _3-95 _2pi0 _2lej uiBoxWhite noborder'},\n",
" {'class': 'pam _3-95 _2pi0 _2lej uiBoxWhite noborder'},\n",
" {'class': 'pam _3-95 _2pi0 _2lej uiBoxWhite noborder'}]\n",
"\n",
"last 5 elements:\n",
"[{'class': 'pam _3-95 _2pi0 _2lej uiBoxWhite noborder'},\n",
" {'class': 'pam _3-95 _2pi0 _2lej uiBoxWhite noborder'},\n",
" {'class': 'pam _3-95 _2pi0 _2lej uiBoxWhite noborder'},\n",
" {'class': '_7s7q'},\n",
" {'class': '_7s7q'}]\n"
]
}
],
"source": [
"# Get the first element (in document order) whose class attribute is exactly '_4t5n'.\n",
"# next() stops at the first hit instead of building a list of every match, and the\n",
"# explicit check replaces the opaque IndexError raised when nothing matches.\n",
"_4t5n = next((item for item in root.iter() if item.get('class') == '_4t5n'), None)\n",
"if _4t5n is None:\n",
"    raise ValueError(\"no element with class '_4t5n' found in the document\")\n",
"\n",
"# len() and slicing on an Element operate on its direct children.\n",
"print(f'{len(_4t5n)} items in _4t5n')\n",
"print(\"\\nfirst 5 elements:\")\n",
"pprint([child.attrib for child in _4t5n[:5]])\n",
"print(\"\\nlast 5 elements:\")\n",
"pprint([child.attrib for child in _4t5n[-5:]])"
]
},
{
"cell_type": "code",
"execution_count": 170,
"id": "scheduled-boutique",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"first 5 elements:\n",
"[{'class': 'pam _3-95 _2pi0 _2lej uiBoxWhite noborder'},\n",
" {'class': 'pam _3-95 _2pi0 _2lej uiBoxWhite noborder'},\n",
" {'class': 'pam _3-95 _2pi0 _2lej uiBoxWhite noborder'},\n",
" {'class': 'pam _3-95 _2pi0 _2lej uiBoxWhite noborder'},\n",
" {'class': 'pam _3-95 _2pi0 _2lej uiBoxWhite noborder'}]\n",
"\n",
"last 5 elements:\n",
"[{'class': 'pam _3-95 _2pi0 _2lej uiBoxWhite noborder'},\n",
" {'class': 'pam _3-95 _2pi0 _2lej uiBoxWhite noborder'},\n",
" {'class': 'pam _3-95 _2pi0 _2lej uiBoxWhite noborder'},\n",
" {'class': 'pam _3-95 _2pi0 _2lej uiBoxWhite noborder'},\n",
" {'class': 'pam _3-95 _2pi0 _2lej uiBoxWhite noborder'}]\n"
]
}
],
"source": [
"# Keep every direct child of _4t5n whose class is not '_7s7q' (the messages),\n",
"# then flip the list so the original order is reversed.\n",
"messages = [elem for elem in _4t5n if elem.get('class') != '_7s7q']\n",
"messages.reverse()\n",
"\n",
"print(\"\\nfirst 5 elements:\")\n",
"pprint([elem.attrib for elem in messages[:5]])\n",
"print(\"\\nlast 5 elements:\")\n",
"pprint([elem.attrib for elem in messages[-5:]])"
]
},
{
"cell_type": "code",
"execution_count": 171,
"id": "nutritional-richmond",
"metadata": {},
"outputs": [],
"source": [
"# Add the bottom page number links (the last two children of _4t5n) to the end of the messages.\n",
"# The slice picks up both distinct elements in their original order; appending _4t5n[-1]\n",
"# twice would add the very same element two times and leave out the second-to-last one.\n",
"messages.extend(_4t5n[-2:])"
]
},
{
"cell_type": "code",
"execution_count": 172,
"id": "white-globe",
"metadata": {},
"outputs": [],
"source": [
"# Remove all messages (every child that is NOT a <div class=\"_7s7q\"> element) from _4t5n.\n",
"# Work from a snapshot list so the removals do not disturb the iteration.\n",
"for child in [node for node in _4t5n if node.get('class') != '_7s7q']:\n",
"    _4t5n.remove(child)\n",
"\n",
"# Remove the bottom page number links (the last two remaining children) from _4t5n\n",
"del _4t5n[-2:]\n",
"\n",
"# Show what is actually left in _4t5n (not the tail of `messages`).\n",
"print(f'{len(_4t5n)} items in _4t5n')\n",
"print(\"\\nAll elements in _4t5n:\")\n",
"pprint([child.attrib for child in _4t5n])"
]
},
{
"cell_type": "code",
"execution_count": 173,
"id": "minute-hanging",
"metadata": {},
"outputs": [],
"source": [
"# Add reversed messages and bottom page links back to _4t5n.\n",
"# NOTE: running this cell more than once appends the same elements again.\n",
"_4t5n.extend(messages)\n",
"\n",
"print(f'{len(_4t5n)} items in _4t5n')\n",
"print(\"\\nfirst 5 elements:\")\n",
"pprint([child.attrib for child in _4t5n[:5]])\n",
"print(\"\\nlast 5 elements:\")\n",
"pprint([child.attrib for child in _4t5n[-5:]])"
]
},
{
"cell_type": "code",
"execution_count": 174,
"id": "considerable-surprise",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"messages/inbox/saadhassan_12hym6rvtg/message_1_old_at_top.html\n",
"messages/inbox/saadhassan_12hym6rvtg/message_2_old_at_top.html\n",
"messages/inbox/saadhassan_12hym6rvtg/message_1_old_at_top.html\n",
"messages/inbox/saadhassan_12hym6rvtg/message_2_old_at_top.html\n"
]
}
],
"source": [
"# Update links to point to the new filename\n",
"import re\n",
"\n",
"# The dot is escaped so that only a literal '.html' suffix is rewritten.\n",
"page_link = re.compile(r'message_(\\d+)\\.html')\n",
"\n",
"for item in _4t5n.iter():\n",
"    href = item.get('href')\n",
"    # Only touch elements that carry the '_42ft' class token and actually have an href;\n",
"    # elements without a class or without an href are skipped instead of raising KeyError.\n",
"    if href is None or '_42ft' not in item.get('class', '').split():\n",
"        continue\n",
"    item.set('href', page_link.sub(r'message_\\1_old_at_top.html', href))\n",
"    print(item.get('href'))"
]
},
{
"cell_type": "code",
"execution_count": 175,
"id": "pleasant-holder",
"metadata": {},
"outputs": [],
"source": [
"# Serialise the modified tree to disk using the \"html\" output method.\n",
"# When converting message_1.html, use message_1_old_at_top.html as the file name instead.\n",
"output_path = \"C:/Users/Malcolm's laptop/Downloads/message_2_old_at_top.html\"\n",
"tree.write(output_path, method=\"html\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment