Created November 28, 2023 22:12
-
-
Save emcake/acc1aa233339a5b3534e2f54702dd46e to your computer and use it in GitHub Desktop.
Merging DeltaTables
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import deltalake as dl\n", | |
| "import tempfile\n", | |
| "import pyarrow as pa\n", | |
| "from typing import List, Dict" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "\n", | |
| "def get_some_data(keys: List[int], partitions:List[int], value: int) -> List[Dict[str, int]]:\n", | |
| " data: List[Dict[str, int]] = []\n", | |
| "\n", | |
| " for p in partitions:\n", | |
| " for k in keys:\n", | |
| " data.append({'partition' : p, 'key' : k, 'value': value})\n", | |
| "\n", | |
| " return data\n", | |
| "\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "initial_keys = [1,2,3,5,6,7,9,10]\n", | |
| "\n", | |
| "initial_partitions = [1,2,3,4]\n", | |
| "\n", | |
| "data = get_some_data(initial_keys, initial_partitions, 1)\n", | |
| "\n", | |
| "table_location = tempfile.mkdtemp()\n", | |
| "\n", | |
| "dl.write_deltalake(table_location, pa.Table.from_pylist(data), partition_by=['partition'])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "tbl = dl.DeltaTable(table_location)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "['partition=3/0-ca806a47-b2b9-43f7-ae31-dc40d42a1bdf-0.parquet',\n", | |
| " 'partition=4/0-ca806a47-b2b9-43f7-ae31-dc40d42a1bdf-0.parquet',\n", | |
| " 'partition=2/0-ca806a47-b2b9-43f7-ae31-dc40d42a1bdf-0.parquet',\n", | |
| " 'partition=1/0-ca806a47-b2b9-43f7-ae31-dc40d42a1bdf-0.parquet']" | |
| ] | |
| }, | |
| "execution_count": 6, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "files_before = tbl.files()\n", | |
| "files_before" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>partition</th>\n", | |
| " <th>key</th>\n", | |
| " <th>value</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>24</th>\n", | |
| " <td>1</td>\n", | |
| " <td>1</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>25</th>\n", | |
| " <td>1</td>\n", | |
| " <td>2</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>26</th>\n", | |
| " <td>1</td>\n", | |
| " <td>3</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>27</th>\n", | |
| " <td>1</td>\n", | |
| " <td>5</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>28</th>\n", | |
| " <td>1</td>\n", | |
| " <td>6</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>29</th>\n", | |
| " <td>1</td>\n", | |
| " <td>7</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>30</th>\n", | |
| " <td>1</td>\n", | |
| " <td>9</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>31</th>\n", | |
| " <td>1</td>\n", | |
| " <td>10</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>16</th>\n", | |
| " <td>2</td>\n", | |
| " <td>1</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>17</th>\n", | |
| " <td>2</td>\n", | |
| " <td>2</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>18</th>\n", | |
| " <td>2</td>\n", | |
| " <td>3</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>19</th>\n", | |
| " <td>2</td>\n", | |
| " <td>5</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>20</th>\n", | |
| " <td>2</td>\n", | |
| " <td>6</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>21</th>\n", | |
| " <td>2</td>\n", | |
| " <td>7</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>22</th>\n", | |
| " <td>2</td>\n", | |
| " <td>9</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>23</th>\n", | |
| " <td>2</td>\n", | |
| " <td>10</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>3</td>\n", | |
| " <td>1</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>3</td>\n", | |
| " <td>2</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>3</td>\n", | |
| " <td>3</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>3</td>\n", | |
| " <td>5</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>3</td>\n", | |
| " <td>6</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>5</th>\n", | |
| " <td>3</td>\n", | |
| " <td>7</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>6</th>\n", | |
| " <td>3</td>\n", | |
| " <td>9</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>7</th>\n", | |
| " <td>3</td>\n", | |
| " <td>10</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>8</th>\n", | |
| " <td>4</td>\n", | |
| " <td>1</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>9</th>\n", | |
| " <td>4</td>\n", | |
| " <td>2</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>10</th>\n", | |
| " <td>4</td>\n", | |
| " <td>3</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>11</th>\n", | |
| " <td>4</td>\n", | |
| " <td>5</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>12</th>\n", | |
| " <td>4</td>\n", | |
| " <td>6</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>13</th>\n", | |
| " <td>4</td>\n", | |
| " <td>7</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>14</th>\n", | |
| " <td>4</td>\n", | |
| " <td>9</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>15</th>\n", | |
| " <td>4</td>\n", | |
| " <td>10</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " partition key value\n", | |
| "24 1 1 1\n", | |
| "25 1 2 1\n", | |
| "26 1 3 1\n", | |
| "27 1 5 1\n", | |
| "28 1 6 1\n", | |
| "29 1 7 1\n", | |
| "30 1 9 1\n", | |
| "31 1 10 1\n", | |
| "16 2 1 1\n", | |
| "17 2 2 1\n", | |
| "18 2 3 1\n", | |
| "19 2 5 1\n", | |
| "20 2 6 1\n", | |
| "21 2 7 1\n", | |
| "22 2 9 1\n", | |
| "23 2 10 1\n", | |
| "0 3 1 1\n", | |
| "1 3 2 1\n", | |
| "2 3 3 1\n", | |
| "3 3 5 1\n", | |
| "4 3 6 1\n", | |
| "5 3 7 1\n", | |
| "6 3 9 1\n", | |
| "7 3 10 1\n", | |
| "8 4 1 1\n", | |
| "9 4 2 1\n", | |
| "10 4 3 1\n", | |
| "11 4 5 1\n", | |
| "12 4 6 1\n", | |
| "13 4 7 1\n", | |
| "14 4 9 1\n", | |
| "15 4 10 1" | |
| ] | |
| }, | |
| "execution_count": 7, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "tbl.to_pandas().sort_values(['partition', 'key'])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "{'num_source_rows': 30,\n", | |
| " 'num_target_rows_inserted': 14,\n", | |
| " 'num_target_rows_updated': 16,\n", | |
| " 'num_target_rows_deleted': 0,\n", | |
| " 'num_target_rows_copied': 16,\n", | |
| " 'num_output_rows': 46,\n", | |
| " 'num_target_files_added': 5,\n", | |
| " 'num_target_files_removed': 4,\n", | |
| " 'execution_time_ms': 19,\n", | |
| " 'scan_time_ms': 0,\n", | |
| " 'rewrite_time_ms': 17}" | |
| ] | |
| }, | |
| "execution_count": 8, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "more_data = get_some_data(list(range(1,11)), list(range(3,6)), 2)\n", | |
| "\n", | |
| "tbl.merge(pa.Table.from_pylist(more_data), \"source.key = target.key and source.partition = target.partition\", source_alias=\"source\", target_alias=\"target\").when_matched_update_all().when_not_matched_insert_all().execute()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 9, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "tbl.update_incremental()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 10, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>partition</th>\n", | |
| " <th>key</th>\n", | |
| " <th>value</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>8</th>\n", | |
| " <td>1</td>\n", | |
| " <td>1</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>9</th>\n", | |
| " <td>1</td>\n", | |
| " <td>2</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>10</th>\n", | |
| " <td>1</td>\n", | |
| " <td>3</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>11</th>\n", | |
| " <td>1</td>\n", | |
| " <td>5</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>12</th>\n", | |
| " <td>1</td>\n", | |
| " <td>6</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>13</th>\n", | |
| " <td>1</td>\n", | |
| " <td>7</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>14</th>\n", | |
| " <td>1</td>\n", | |
| " <td>9</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>15</th>\n", | |
| " <td>1</td>\n", | |
| " <td>10</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>2</td>\n", | |
| " <td>1</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>2</td>\n", | |
| " <td>2</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>2</td>\n", | |
| " <td>3</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>2</td>\n", | |
| " <td>5</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>2</td>\n", | |
| " <td>6</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>5</th>\n", | |
| " <td>2</td>\n", | |
| " <td>7</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>6</th>\n", | |
| " <td>2</td>\n", | |
| " <td>9</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>7</th>\n", | |
| " <td>2</td>\n", | |
| " <td>10</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>36</th>\n", | |
| " <td>3</td>\n", | |
| " <td>1</td>\n", | |
| " <td>2</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>37</th>\n", | |
| " <td>3</td>\n", | |
| " <td>2</td>\n", | |
| " <td>2</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>38</th>\n", | |
| " <td>3</td>\n", | |
| " <td>3</td>\n", | |
| " <td>2</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>44</th>\n", | |
| " <td>3</td>\n", | |
| " <td>4</td>\n", | |
| " <td>2</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>39</th>\n", | |
| " <td>3</td>\n", | |
| " <td>5</td>\n", | |
| " <td>2</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>40</th>\n", | |
| " <td>3</td>\n", | |
| " <td>6</td>\n", | |
| " <td>2</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>41</th>\n", | |
| " <td>3</td>\n", | |
| " <td>7</td>\n", | |
| " <td>2</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>45</th>\n", | |
| " <td>3</td>\n", | |
| " <td>8</td>\n", | |
| " <td>2</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>42</th>\n", | |
| " <td>3</td>\n", | |
| " <td>9</td>\n", | |
| " <td>2</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>43</th>\n", | |
| " <td>3</td>\n", | |
| " <td>10</td>\n", | |
| " <td>2</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>26</th>\n", | |
| " <td>4</td>\n", | |
| " <td>1</td>\n", | |
| " <td>2</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>27</th>\n", | |
| " <td>4</td>\n", | |
| " <td>2</td>\n", | |
| " <td>2</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>28</th>\n", | |
| " <td>4</td>\n", | |
| " <td>3</td>\n", | |
| " <td>2</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>34</th>\n", | |
| " <td>4</td>\n", | |
| " <td>4</td>\n", | |
| " <td>2</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>29</th>\n", | |
| " <td>4</td>\n", | |
| " <td>5</td>\n", | |
| " <td>2</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>30</th>\n", | |
| " <td>4</td>\n", | |
| " <td>6</td>\n", | |
| " <td>2</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>31</th>\n", | |
| " <td>4</td>\n", | |
| " <td>7</td>\n", | |
| " <td>2</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>35</th>\n", | |
| " <td>4</td>\n", | |
| " <td>8</td>\n", | |
| " <td>2</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>32</th>\n", | |
| " <td>4</td>\n", | |
| " <td>9</td>\n", | |
| " <td>2</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>33</th>\n", | |
| " <td>4</td>\n", | |
| " <td>10</td>\n", | |
| " <td>2</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>16</th>\n", | |
| " <td>5</td>\n", | |
| " <td>1</td>\n", | |
| " <td>2</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>17</th>\n", | |
| " <td>5</td>\n", | |
| " <td>2</td>\n", | |
| " <td>2</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>18</th>\n", | |
| " <td>5</td>\n", | |
| " <td>3</td>\n", | |
| " <td>2</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>19</th>\n", | |
| " <td>5</td>\n", | |
| " <td>4</td>\n", | |
| " <td>2</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>20</th>\n", | |
| " <td>5</td>\n", | |
| " <td>5</td>\n", | |
| " <td>2</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>21</th>\n", | |
| " <td>5</td>\n", | |
| " <td>6</td>\n", | |
| " <td>2</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>22</th>\n", | |
| " <td>5</td>\n", | |
| " <td>7</td>\n", | |
| " <td>2</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>23</th>\n", | |
| " <td>5</td>\n", | |
| " <td>8</td>\n", | |
| " <td>2</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>24</th>\n", | |
| " <td>5</td>\n", | |
| " <td>9</td>\n", | |
| " <td>2</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>25</th>\n", | |
| " <td>5</td>\n", | |
| " <td>10</td>\n", | |
| " <td>2</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " partition key value\n", | |
| "8 1 1 1\n", | |
| "9 1 2 1\n", | |
| "10 1 3 1\n", | |
| "11 1 5 1\n", | |
| "12 1 6 1\n", | |
| "13 1 7 1\n", | |
| "14 1 9 1\n", | |
| "15 1 10 1\n", | |
| "0 2 1 1\n", | |
| "1 2 2 1\n", | |
| "2 2 3 1\n", | |
| "3 2 5 1\n", | |
| "4 2 6 1\n", | |
| "5 2 7 1\n", | |
| "6 2 9 1\n", | |
| "7 2 10 1\n", | |
| "36 3 1 2\n", | |
| "37 3 2 2\n", | |
| "38 3 3 2\n", | |
| "44 3 4 2\n", | |
| "39 3 5 2\n", | |
| "40 3 6 2\n", | |
| "41 3 7 2\n", | |
| "45 3 8 2\n", | |
| "42 3 9 2\n", | |
| "43 3 10 2\n", | |
| "26 4 1 2\n", | |
| "27 4 2 2\n", | |
| "28 4 3 2\n", | |
| "34 4 4 2\n", | |
| "29 4 5 2\n", | |
| "30 4 6 2\n", | |
| "31 4 7 2\n", | |
| "35 4 8 2\n", | |
| "32 4 9 2\n", | |
| "33 4 10 2\n", | |
| "16 5 1 2\n", | |
| "17 5 2 2\n", | |
| "18 5 3 2\n", | |
| "19 5 4 2\n", | |
| "20 5 5 2\n", | |
| "21 5 6 2\n", | |
| "22 5 7 2\n", | |
| "23 5 8 2\n", | |
| "24 5 9 2\n", | |
| "25 5 10 2" | |
| ] | |
| }, | |
| "execution_count": 10, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "tbl.to_pandas().sort_values(['partition', 'key'])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 11, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>path</th>\n", | |
| " <th>size_bytes</th>\n", | |
| " <th>modification_time</th>\n", | |
| " <th>data_change</th>\n", | |
| " <th>partition_values</th>\n", | |
| " <th>num_records</th>\n", | |
| " <th>null_count</th>\n", | |
| " <th>min</th>\n", | |
| " <th>max</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>partition=2/part-00001-45a744d1-5f10-4fcd-a91b...</td>\n", | |
| " <td>945</td>\n", | |
| " <td>2023-11-28 22:08:15.911</td>\n", | |
| " <td>True</td>\n", | |
| " <td>{'partition': 2}</td>\n", | |
| " <td>8</td>\n", | |
| " <td>{'partition': None, 'key': 0, 'value': 0}</td>\n", | |
| " <td>{'partition': None, 'key': 1, 'value': 1}</td>\n", | |
| " <td>{'partition': None, 'key': 10, 'value': 1}</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>partition=1/part-00001-bcbc0521-6330-4c0d-a6ba...</td>\n", | |
| " <td>945</td>\n", | |
| " <td>2023-11-28 22:08:15.911</td>\n", | |
| " <td>True</td>\n", | |
| " <td>{'partition': 1}</td>\n", | |
| " <td>8</td>\n", | |
| " <td>{'partition': None, 'key': 0, 'value': 0}</td>\n", | |
| " <td>{'partition': None, 'key': 1, 'value': 1}</td>\n", | |
| " <td>{'partition': None, 'key': 10, 'value': 1}</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>partition=5/part-00001-540383e6-716b-44ca-b7e1...</td>\n", | |
| " <td>961</td>\n", | |
| " <td>2023-11-28 22:08:15.912</td>\n", | |
| " <td>True</td>\n", | |
| " <td>{'partition': 5}</td>\n", | |
| " <td>10</td>\n", | |
| " <td>{'partition': None, 'key': 0, 'value': 0}</td>\n", | |
| " <td>{'partition': None, 'key': 1, 'value': 2}</td>\n", | |
| " <td>{'partition': None, 'key': 10, 'value': 2}</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>partition=4/part-00001-0e554628-ebf8-4fea-9563...</td>\n", | |
| " <td>961</td>\n", | |
| " <td>2023-11-28 22:08:15.912</td>\n", | |
| " <td>True</td>\n", | |
| " <td>{'partition': 4}</td>\n", | |
| " <td>10</td>\n", | |
| " <td>{'partition': None, 'key': 0, 'value': 0}</td>\n", | |
| " <td>{'partition': None, 'key': 1, 'value': 2}</td>\n", | |
| " <td>{'partition': None, 'key': 10, 'value': 2}</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>partition=3/part-00001-dcc36881-4a3f-46f7-8cee...</td>\n", | |
| " <td>961</td>\n", | |
| " <td>2023-11-28 22:08:15.912</td>\n", | |
| " <td>True</td>\n", | |
| " <td>{'partition': 3}</td>\n", | |
| " <td>10</td>\n", | |
| " <td>{'partition': None, 'key': 0, 'value': 0}</td>\n", | |
| " <td>{'partition': None, 'key': 1, 'value': 2}</td>\n", | |
| " <td>{'partition': None, 'key': 10, 'value': 2}</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " path size_bytes \\\n", | |
| "0 partition=2/part-00001-45a744d1-5f10-4fcd-a91b... 945 \n", | |
| "1 partition=1/part-00001-bcbc0521-6330-4c0d-a6ba... 945 \n", | |
| "2 partition=5/part-00001-540383e6-716b-44ca-b7e1... 961 \n", | |
| "3 partition=4/part-00001-0e554628-ebf8-4fea-9563... 961 \n", | |
| "4 partition=3/part-00001-dcc36881-4a3f-46f7-8cee... 961 \n", | |
| "\n", | |
| " modification_time data_change partition_values num_records \\\n", | |
| "0 2023-11-28 22:08:15.911 True {'partition': 2} 8 \n", | |
| "1 2023-11-28 22:08:15.911 True {'partition': 1} 8 \n", | |
| "2 2023-11-28 22:08:15.912 True {'partition': 5} 10 \n", | |
| "3 2023-11-28 22:08:15.912 True {'partition': 4} 10 \n", | |
| "4 2023-11-28 22:08:15.912 True {'partition': 3} 10 \n", | |
| "\n", | |
| " null_count \\\n", | |
| "0 {'partition': None, 'key': 0, 'value': 0} \n", | |
| "1 {'partition': None, 'key': 0, 'value': 0} \n", | |
| "2 {'partition': None, 'key': 0, 'value': 0} \n", | |
| "3 {'partition': None, 'key': 0, 'value': 0} \n", | |
| "4 {'partition': None, 'key': 0, 'value': 0} \n", | |
| "\n", | |
| " min \\\n", | |
| "0 {'partition': None, 'key': 1, 'value': 1} \n", | |
| "1 {'partition': None, 'key': 1, 'value': 1} \n", | |
| "2 {'partition': None, 'key': 1, 'value': 2} \n", | |
| "3 {'partition': None, 'key': 1, 'value': 2} \n", | |
| "4 {'partition': None, 'key': 1, 'value': 2} \n", | |
| "\n", | |
| " max \n", | |
| "0 {'partition': None, 'key': 10, 'value': 1} \n", | |
| "1 {'partition': None, 'key': 10, 'value': 1} \n", | |
| "2 {'partition': None, 'key': 10, 'value': 2} \n", | |
| "3 {'partition': None, 'key': 10, 'value': 2} \n", | |
| "4 {'partition': None, 'key': 10, 'value': 2} " | |
| ] | |
| }, | |
| "execution_count": 11, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "pa.Table.from_batches([tbl.get_add_actions()]).to_pandas()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 12, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "['partition=2/part-00001-45a744d1-5f10-4fcd-a91b-3e60a2f1ddec-c000.snappy.parquet',\n", | |
| " 'partition=1/part-00001-bcbc0521-6330-4c0d-a6ba-93492ffc99d6-c000.snappy.parquet',\n", | |
| " 'partition=5/part-00001-540383e6-716b-44ca-b7e1-2d0cd0f1cb5b-c000.snappy.parquet',\n", | |
| " 'partition=4/part-00001-0e554628-ebf8-4fea-9563-78f67b189302-c000.snappy.parquet',\n", | |
| " 'partition=3/part-00001-dcc36881-4a3f-46f7-8cee-324ab6b00eda-c000.snappy.parquet']" | |
| ] | |
| }, | |
| "execution_count": 12, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "files_after = tbl.files()\n", | |
| "files_after" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 13, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "False" | |
| ] | |
| }, | |
| "execution_count": 13, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "any([f in files_after for f in files_before])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "systools39", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.9.16" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 2 | |
| } |
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.