Created
May 18, 2025 21:25
-
-
Save jleedev/e8289cade94aaa20a2cb95651c47e4a1 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "id": "af9565ed-5cab-4189-a225-0c61463ca9f9", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "from pyarrow import fs, orc\n", | |
| "import pandas as pd" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "id": "c4bed3aa-b2f7-4a90-a8a7-e6a1d5c44399", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "s3 = fs.S3FileSystem(region=\"us-east-1\", anonymous=True)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "id": "98d58837-05c8-490b-817e-317833c401e0", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "orc_file=orc.ORCFile(s3.open_input_file('osm-pds/changesets/changesets-latest.orc'))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "id": "d44315d7-0d46-4aa5-b3f9-358c422bd024", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "(id: int64\n", | |
| " tags: map<string, string>\n", | |
| " child 0, entries: struct<key: string not null, value: string> not null\n", | |
| " child 0, key: string not null\n", | |
| " child 1, value: string\n", | |
| " created_at: timestamp[ns]\n", | |
| " open: bool\n", | |
| " closed_at: timestamp[ns]\n", | |
| " comments_count: int64\n", | |
| " min_lat: decimal128(9, 7)\n", | |
| " max_lat: decimal128(9, 7)\n", | |
| " min_lon: decimal128(10, 7)\n", | |
| " max_lon: decimal128(10, 7)\n", | |
| " num_changes: int64\n", | |
| " uid: int64\n", | |
| " user: string,\n", | |
| " 244)" | |
| ] | |
| }, | |
| "execution_count": 6, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "orc_file.schema, orc_file.nstripes" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "id": "7c787d67-0584-480c-8471-de36644b908a", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "stripe = orc_file.read_stripe(orc_file.nstripes - 1, columns=[\"id\",\"user\",\"tags\",\"created_at\"])\n", | |
| "dt = stripe.to_pandas()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "id": "537242c8-0252-4e81-bd3c-f7a2e78d9cef", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>id</th>\n", | |
| " <th>tags</th>\n", | |
| " <th>created_at</th>\n", | |
| " <th>user</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>165871645</td>\n", | |
| " <td>[(host, https://www.openstreetmap.org/edit), (...</td>\n", | |
| " <td>2025-05-06 03:37:52</td>\n", | |
| " <td>Sangmu</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>165871646</td>\n", | |
| " <td>[(comment, tennis->pickleball), (source, Custo...</td>\n", | |
| " <td>2025-05-06 03:37:55</td>\n", | |
| " <td>Mundilfari</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>165871647</td>\n", | |
| " <td>[(comment, Created a fast_food), (created_by, ...</td>\n", | |
| " <td>2025-05-06 03:37:57</td>\n", | |
| " <td>Frederik</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>165871648</td>\n", | |
| " <td>[(host, https://www.openstreetmap.org/edit), (...</td>\n", | |
| " <td>2025-05-06 03:38:02</td>\n", | |
| " <td>ChristianB77</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>165871649</td>\n", | |
| " <td>[(hashtags, #OzonGeo), (review_requested, yes)...</td>\n", | |
| " <td>2025-05-06 03:38:02</td>\n", | |
| " <td>Ozon OSM7</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>...</th>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>257760</th>\n", | |
| " <td>166129406</td>\n", | |
| " <td>[(hashtags, #MapPyOSM;#CIDI), (review_requeste...</td>\n", | |
| " <td>2025-05-11 23:59:26</td>\n", | |
| " <td>jessigaleano</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>257761</th>\n", | |
| " <td>166129407</td>\n", | |
| " <td>[(comment, Campinápolis rural), (source, Bing)...</td>\n", | |
| " <td>2025-05-11 23:59:31</td>\n", | |
| " <td>EduRBS</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>257762</th>\n", | |
| " <td>166129408</td>\n", | |
| " <td>[(host, https://www.openstreetmap.org/edit), (...</td>\n", | |
| " <td>2025-05-11 23:59:31</td>\n", | |
| " <td>open-street-guy</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>257763</th>\n", | |
| " <td>166129409</td>\n", | |
| " <td>[(host, https://www.openstreetmap.org/edit), (...</td>\n", | |
| " <td>2025-05-11 23:59:48</td>\n", | |
| " <td>erickerr</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>257764</th>\n", | |
| " <td>166129410</td>\n", | |
| " <td>[(host, https://www.openstreetmap.org/edit), (...</td>\n", | |
| " <td>2025-05-11 23:59:59</td>\n", | |
| " <td>Rawmance</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "<p>257765 rows × 4 columns</p>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " id tags \\\n", | |
| "0 165871645 [(host, https://www.openstreetmap.org/edit), (... \n", | |
| "1 165871646 [(comment, tennis->pickleball), (source, Custo... \n", | |
| "2 165871647 [(comment, Created a fast_food), (created_by, ... \n", | |
| "3 165871648 [(host, https://www.openstreetmap.org/edit), (... \n", | |
| "4 165871649 [(hashtags, #OzonGeo), (review_requested, yes)... \n", | |
| "... ... ... \n", | |
| "257760 166129406 [(hashtags, #MapPyOSM;#CIDI), (review_requeste... \n", | |
| "257761 166129407 [(comment, Campinápolis rural), (source, Bing)... \n", | |
| "257762 166129408 [(host, https://www.openstreetmap.org/edit), (... \n", | |
| "257763 166129409 [(host, https://www.openstreetmap.org/edit), (... \n", | |
| "257764 166129410 [(host, https://www.openstreetmap.org/edit), (... \n", | |
| "\n", | |
| " created_at user \n", | |
| "0 2025-05-06 03:37:52 Sangmu \n", | |
| "1 2025-05-06 03:37:55 Mundilfari \n", | |
| "2 2025-05-06 03:37:57 Frederik \n", | |
| "3 2025-05-06 03:38:02 ChristianB77 \n", | |
| "4 2025-05-06 03:38:02 Ozon OSM7 \n", | |
| "... ... ... \n", | |
| "257760 2025-05-11 23:59:26 jessigaleano \n", | |
| "257761 2025-05-11 23:59:31 EduRBS \n", | |
| "257762 2025-05-11 23:59:31 open-street-guy \n", | |
| "257763 2025-05-11 23:59:48 erickerr \n", | |
| "257764 2025-05-11 23:59:59 Rawmance \n", | |
| "\n", | |
| "[257765 rows x 4 columns]" | |
| ] | |
| }, | |
| "execution_count": 8, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "dt" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 9, | |
| "id": "5746fee1-7228-4a01-9069-5734928807ee", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>id</th>\n", | |
| " <th>created_at</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>count</th>\n", | |
| " <td>2.577650e+05</td>\n", | |
| " <td>257765</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>mean</th>\n", | |
| " <td>1.660005e+08</td>\n", | |
| " <td>2025-05-08 23:21:27.037134592</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>min</th>\n", | |
| " <td>1.658716e+08</td>\n", | |
| " <td>2025-05-06 03:37:52</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>25%</th>\n", | |
| " <td>1.659361e+08</td>\n", | |
| " <td>2025-05-07 12:16:45</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>50%</th>\n", | |
| " <td>1.660005e+08</td>\n", | |
| " <td>2025-05-08 19:39:29</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>75%</th>\n", | |
| " <td>1.660650e+08</td>\n", | |
| " <td>2025-05-10 11:26:11</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>max</th>\n", | |
| " <td>1.661294e+08</td>\n", | |
| " <td>2025-05-11 23:59:59</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>std</th>\n", | |
| " <td>7.441090e+04</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " id created_at\n", | |
| "count 2.577650e+05 257765\n", | |
| "mean 1.660005e+08 2025-05-08 23:21:27.037134592\n", | |
| "min 1.658716e+08 2025-05-06 03:37:52\n", | |
| "25% 1.659361e+08 2025-05-07 12:16:45\n", | |
| "50% 1.660005e+08 2025-05-08 19:39:29\n", | |
| "75% 1.660650e+08 2025-05-10 11:26:11\n", | |
| "max 1.661294e+08 2025-05-11 23:59:59\n", | |
| "std 7.441090e+04 NaN" | |
| ] | |
| }, | |
| "execution_count": 9, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "dt.describe()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 10, | |
| "id": "7af1f3be-a2bf-48c6-a372-095ce4b6b694", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "<class 'pandas.core.frame.DataFrame'>\n", | |
| "RangeIndex: 257765 entries, 0 to 257764\n", | |
| "Data columns (total 4 columns):\n", | |
| " # Column Non-Null Count Dtype \n", | |
| "--- ------ -------------- ----- \n", | |
| " 0 id 257765 non-null int64 \n", | |
| " 1 tags 257765 non-null object \n", | |
| " 2 created_at 257765 non-null datetime64[ns]\n", | |
| " 3 user 257765 non-null object \n", | |
| "dtypes: datetime64[ns](1), int64(1), object(2)\n", | |
| "memory usage: 7.9+ MB\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "dt.info();" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "40ff9b67-a0e4-4809-9613-ceb03a5ab969", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3 (ipykernel)", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.12.10" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 5 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment