Skip to content

Instantly share code, notes, and snippets.

@jleedev
Created May 18, 2025 21:25
Show Gist options
  • Select an option

  • Save jleedev/e8289cade94aaa20a2cb95651c47e4a1 to your computer and use it in GitHub Desktop.

Select an option

Save jleedev/e8289cade94aaa20a2cb95651c47e4a1 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "af9565ed-5cab-4189-a225-0c61463ca9f9",
"metadata": {},
"outputs": [],
"source": [
"from pyarrow import fs, orc\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "c4bed3aa-b2f7-4a90-a8a7-e6a1d5c44399",
"metadata": {},
"outputs": [],
"source": [
"s3 = fs.S3FileSystem(region=\"us-east-1\", anonymous=True)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "98d58837-05c8-490b-817e-317833c401e0",
"metadata": {},
"outputs": [],
"source": [
"orc_file=orc.ORCFile(s3.open_input_file('osm-pds/changesets/changesets-latest.orc'))"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "d44315d7-0d46-4aa5-b3f9-358c422bd024",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(id: int64\n",
" tags: map<string, string>\n",
" child 0, entries: struct<key: string not null, value: string> not null\n",
" child 0, key: string not null\n",
" child 1, value: string\n",
" created_at: timestamp[ns]\n",
" open: bool\n",
" closed_at: timestamp[ns]\n",
" comments_count: int64\n",
" min_lat: decimal128(9, 7)\n",
" max_lat: decimal128(9, 7)\n",
" min_lon: decimal128(10, 7)\n",
" max_lon: decimal128(10, 7)\n",
" num_changes: int64\n",
" uid: int64\n",
" user: string,\n",
" 244)"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"orc_file.schema, orc_file.nstripes"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "7c787d67-0584-480c-8471-de36644b908a",
"metadata": {},
"outputs": [],
"source": [
"stripe = orc_file.read_stripe(orc_file.nstripes - 1, columns=[\"id\",\"user\",\"tags\",\"created_at\"])\n",
"dt = stripe.to_pandas()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "537242c8-0252-4e81-bd3c-f7a2e78d9cef",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>tags</th>\n",
" <th>created_at</th>\n",
" <th>user</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>165871645</td>\n",
" <td>[(host, https://www.openstreetmap.org/edit), (...</td>\n",
" <td>2025-05-06 03:37:52</td>\n",
" <td>Sangmu</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>165871646</td>\n",
" <td>[(comment, tennis-&gt;pickleball), (source, Custo...</td>\n",
" <td>2025-05-06 03:37:55</td>\n",
" <td>Mundilfari</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>165871647</td>\n",
" <td>[(comment, Created a fast_food), (created_by, ...</td>\n",
" <td>2025-05-06 03:37:57</td>\n",
" <td>Frederik</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>165871648</td>\n",
" <td>[(host, https://www.openstreetmap.org/edit), (...</td>\n",
" <td>2025-05-06 03:38:02</td>\n",
" <td>ChristianB77</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>165871649</td>\n",
" <td>[(hashtags, #OzonGeo), (review_requested, yes)...</td>\n",
" <td>2025-05-06 03:38:02</td>\n",
" <td>Ozon OSM7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>257760</th>\n",
" <td>166129406</td>\n",
" <td>[(hashtags, #MapPyOSM;#CIDI), (review_requeste...</td>\n",
" <td>2025-05-11 23:59:26</td>\n",
" <td>jessigaleano</td>\n",
" </tr>\n",
" <tr>\n",
" <th>257761</th>\n",
" <td>166129407</td>\n",
" <td>[(comment, Campinápolis rural), (source, Bing)...</td>\n",
" <td>2025-05-11 23:59:31</td>\n",
" <td>EduRBS</td>\n",
" </tr>\n",
" <tr>\n",
" <th>257762</th>\n",
" <td>166129408</td>\n",
" <td>[(host, https://www.openstreetmap.org/edit), (...</td>\n",
" <td>2025-05-11 23:59:31</td>\n",
" <td>open-street-guy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>257763</th>\n",
" <td>166129409</td>\n",
" <td>[(host, https://www.openstreetmap.org/edit), (...</td>\n",
" <td>2025-05-11 23:59:48</td>\n",
" <td>erickerr</td>\n",
" </tr>\n",
" <tr>\n",
" <th>257764</th>\n",
" <td>166129410</td>\n",
" <td>[(host, https://www.openstreetmap.org/edit), (...</td>\n",
" <td>2025-05-11 23:59:59</td>\n",
" <td>Rawmance</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>257765 rows × 4 columns</p>\n",
"</div>"
],
"text/plain": [
" id tags \\\n",
"0 165871645 [(host, https://www.openstreetmap.org/edit), (... \n",
"1 165871646 [(comment, tennis->pickleball), (source, Custo... \n",
"2 165871647 [(comment, Created a fast_food), (created_by, ... \n",
"3 165871648 [(host, https://www.openstreetmap.org/edit), (... \n",
"4 165871649 [(hashtags, #OzonGeo), (review_requested, yes)... \n",
"... ... ... \n",
"257760 166129406 [(hashtags, #MapPyOSM;#CIDI), (review_requeste... \n",
"257761 166129407 [(comment, Campinápolis rural), (source, Bing)... \n",
"257762 166129408 [(host, https://www.openstreetmap.org/edit), (... \n",
"257763 166129409 [(host, https://www.openstreetmap.org/edit), (... \n",
"257764 166129410 [(host, https://www.openstreetmap.org/edit), (... \n",
"\n",
" created_at user \n",
"0 2025-05-06 03:37:52 Sangmu \n",
"1 2025-05-06 03:37:55 Mundilfari \n",
"2 2025-05-06 03:37:57 Frederik \n",
"3 2025-05-06 03:38:02 ChristianB77 \n",
"4 2025-05-06 03:38:02 Ozon OSM7 \n",
"... ... ... \n",
"257760 2025-05-11 23:59:26 jessigaleano \n",
"257761 2025-05-11 23:59:31 EduRBS \n",
"257762 2025-05-11 23:59:31 open-street-guy \n",
"257763 2025-05-11 23:59:48 erickerr \n",
"257764 2025-05-11 23:59:59 Rawmance \n",
"\n",
"[257765 rows x 4 columns]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dt"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "5746fee1-7228-4a01-9069-5734928807ee",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>created_at</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>2.577650e+05</td>\n",
" <td>257765</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>1.660005e+08</td>\n",
" <td>2025-05-08 23:21:27.037134592</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>1.658716e+08</td>\n",
" <td>2025-05-06 03:37:52</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>1.659361e+08</td>\n",
" <td>2025-05-07 12:16:45</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>1.660005e+08</td>\n",
" <td>2025-05-08 19:39:29</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>1.660650e+08</td>\n",
" <td>2025-05-10 11:26:11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>1.661294e+08</td>\n",
" <td>2025-05-11 23:59:59</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>7.441090e+04</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id created_at\n",
"count 2.577650e+05 257765\n",
"mean 1.660005e+08 2025-05-08 23:21:27.037134592\n",
"min 1.658716e+08 2025-05-06 03:37:52\n",
"25% 1.659361e+08 2025-05-07 12:16:45\n",
"50% 1.660005e+08 2025-05-08 19:39:29\n",
"75% 1.660650e+08 2025-05-10 11:26:11\n",
"max 1.661294e+08 2025-05-11 23:59:59\n",
"std 7.441090e+04 NaN"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dt.describe()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "7af1f3be-a2bf-48c6-a372-095ce4b6b694",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 257765 entries, 0 to 257764\n",
"Data columns (total 4 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 id 257765 non-null int64 \n",
" 1 tags 257765 non-null object \n",
" 2 created_at 257765 non-null datetime64[ns]\n",
" 3 user 257765 non-null object \n",
"dtypes: datetime64[ns](1), int64(1), object(2)\n",
"memory usage: 7.9+ MB\n"
]
}
],
"source": [
"dt.info();"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "40ff9b67-a0e4-4809-9613-ceb03a5ab969",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment