Created
October 4, 2025 20:49
-
-
Save calilisantos/30d35f58b7a417dffae2789f79ed2a4c to your computer and use it in GitHub Desktop.
Reduce in Pyspark (?!!)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "0f207f96-f616-437a-95ab-f6f814615db0", | |
| "showTitle": false, | |
| "tableResultSettingsMap": {}, | |
| "title": "" | |
| } | |
| }, | |
| "source": [ | |
| "## **Dependencies:**" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 0, | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "5d183aef-7b6c-45cc-96d9-5095b1a9f3f2", | |
| "showTitle": false, | |
| "tableResultSettingsMap": {}, | |
| "title": "" | |
| } | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "from itertools import product\n", | |
| "from pyspark.sql import functions as F\n", | |
| "import string" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "780094a8-feb1-4667-9df5-50071283658e", | |
| "showTitle": false, | |
| "tableResultSettingsMap": {}, | |
| "title": "" | |
| } | |
| }, | |
| "source": [ | |
| "## **Sample data:**" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 0, | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "09350909-c729-4bf9-8a6d-25e0a376206a", | |
| "showTitle": false, | |
| "tableResultSettingsMap": {}, | |
| "title": "" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "['table_a', 'table_b', 'table_c', 'table_d', 'table_e', 'table_f', 'table_g', 'table_h', 'table_i', 'table_j']\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# Lowercase alphabet\n", | |
| "tables = [f\"table_{letter}\" for letter in string.ascii_lowercase[:10]]\n", | |
| "\n", | |
| "print(tables)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 0, | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "0472112d-3ef3-4c8b-9194-67eebb4a9a70", | |
| "showTitle": false, | |
| "tableResultSettingsMap": {}, | |
| "title": "" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "['table_a_v1', 'table_a_temp', 'table_a_backup', 'table_b_v1', 'table_b_temp', 'table_b_backup', 'table_c_v1', 'table_c_temp', 'table_c_backup']\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "temp_suffixes = [\"_v1\", \"_temp\", \"_backup\"]\n", | |
| "temp_tables = [f\"{table}{suffix}\" for table, suffix in product(tables[:3], temp_suffixes)]\n", | |
| "print(temp_tables)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 0, | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "bb111873-0643-40aa-b7fe-10168e5f1b06", | |
| "showTitle": false, | |
| "tableResultSettingsMap": {}, | |
| "title": "" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "{'table_a',\n", | |
| " 'table_a_backup',\n", | |
| " 'table_a_temp',\n", | |
| " 'table_a_v1',\n", | |
| " 'table_b',\n", | |
| " 'table_b_backup',\n", | |
| " 'table_b_temp',\n", | |
| " 'table_b_v1',\n", | |
| " 'table_c',\n", | |
| " 'table_c_backup',\n", | |
| " 'table_c_temp',\n", | |
| " 'table_c_v1',\n", | |
| " 'table_d',\n", | |
| " 'table_e',\n", | |
| " 'table_f',\n", | |
| " 'table_g',\n", | |
| " 'table_h',\n", | |
| " 'table_i',\n", | |
| " 'table_j'}" | |
| ] | |
| }, | |
| "execution_count": 98, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "tables.extend(temp_tables)\n", | |
| "sample_tables = set(sorted(tables))\n", | |
| "sample_tables" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 0, | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "2a19aff0-f57c-4f80-8dd5-72b49ffbe795", | |
| "showTitle": false, | |
| "tableResultSettingsMap": { | |
| "0": { | |
| "dataGridStateBlob": "{\"version\":1,\"tableState\":{\"columnPinning\":{\"left\":[\"#row_number#\"],\"right\":[]},\"columnSizing\":{},\"columnVisibility\":{}},\"settings\":{\"columns\":{}},\"syncTimestamp\":1759596037628}", | |
| "filterBlob": null, | |
| "queryPlanFiltersBlob": null, | |
| "tableResultIndex": 0 | |
| } | |
| }, | |
| "title": "" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "output_type": "display_data", | |
| "data": { | |
| "text/html": [ | |
| "<style scoped>\n", | |
| " .table-result-container {\n", | |
| " max-height: 300px;\n", | |
| " overflow: auto;\n", | |
| " }\n", | |
| " table, th, td {\n", | |
| " border: 1px solid black;\n", | |
| " border-collapse: collapse;\n", | |
| " }\n", | |
| " th, td {\n", | |
| " padding: 5px;\n", | |
| " }\n", | |
| " th {\n", | |
| " text-align: left;\n", | |
| " }\n", | |
| "</style><div class='table-result-container'><table class='table-result'><thead style='background-color: white'><tr><th>table_name</th></tr></thead><tbody><tr><td>table_a</td></tr><tr><td>table_b</td></tr><tr><td>table_c</td></tr><tr><td>table_d</td></tr><tr><td>table_e</td></tr><tr><td>table_f</td></tr><tr><td>table_g</td></tr><tr><td>table_h</td></tr><tr><td>table_i</td></tr><tr><td>table_j</td></tr><tr><td>table_a_v1</td></tr><tr><td>table_a_temp</td></tr><tr><td>table_a_backup</td></tr><tr><td>table_b_v1</td></tr><tr><td>table_b_temp</td></tr><tr><td>table_b_backup</td></tr><tr><td>table_c_v1</td></tr><tr><td>table_c_temp</td></tr><tr><td>table_c_backup</td></tr></tbody></table></div>" | |
| ] | |
| }, | |
| "metadata": { | |
| "application/vnd.databricks.v1+output": { | |
| "addedWidgets": {}, | |
| "aggData": [], | |
| "aggError": "", | |
| "aggOverflow": false, | |
| "aggSchema": [], | |
| "aggSeriesLimitReached": false, | |
| "aggType": "", | |
| "arguments": {}, | |
| "columnCustomDisplayInfos": {}, | |
| "data": [ | |
| [ | |
| "table_a" | |
| ], | |
| [ | |
| "table_b" | |
| ], | |
| [ | |
| "table_c" | |
| ], | |
| [ | |
| "table_d" | |
| ], | |
| [ | |
| "table_e" | |
| ], | |
| [ | |
| "table_f" | |
| ], | |
| [ | |
| "table_g" | |
| ], | |
| [ | |
| "table_h" | |
| ], | |
| [ | |
| "table_i" | |
| ], | |
| [ | |
| "table_j" | |
| ], | |
| [ | |
| "table_a_v1" | |
| ], | |
| [ | |
| "table_a_temp" | |
| ], | |
| [ | |
| "table_a_backup" | |
| ], | |
| [ | |
| "table_b_v1" | |
| ], | |
| [ | |
| "table_b_temp" | |
| ], | |
| [ | |
| "table_b_backup" | |
| ], | |
| [ | |
| "table_c_v1" | |
| ], | |
| [ | |
| "table_c_temp" | |
| ], | |
| [ | |
| "table_c_backup" | |
| ] | |
| ], | |
| "datasetInfos": [], | |
| "dbfsResultPath": null, | |
| "isJsonSchema": true, | |
| "metadata": {}, | |
| "overflow": false, | |
| "plotOptions": { | |
| "customPlotOptions": {}, | |
| "displayType": "table", | |
| "pivotAggregation": null, | |
| "pivotColumns": null, | |
| "xColumns": null, | |
| "yColumns": null | |
| }, | |
| "removedWidgets": [], | |
| "schema": [ | |
| { | |
| "metadata": "{}", | |
| "name": "table_name", | |
| "type": "\"string\"" | |
| } | |
| ], | |
| "type": "table" | |
| } | |
| }, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "tables_df = spark.createDataFrame(\n", | |
| " data=[(table,) for table in tables], \n", | |
| " schema=\"table_name STRING\"\n", | |
| ")\n", | |
| "display(tables_df)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "3053d967-8013-4288-82dc-4ea0a5f35bca", | |
| "showTitle": false, | |
| "tableResultSettingsMap": {}, | |
| "title": "" | |
| } | |
| }, | |
| "source": [ | |
| "## **Check suffix with contains:**" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 0, | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "d872ecc6-35d4-4725-8989-19064f7fb5a0", | |
| "showTitle": false, | |
| "tableResultSettingsMap": { | |
| "0": { | |
| "dataGridStateBlob": "{\"version\":1,\"tableState\":{\"columnPinning\":{\"left\":[\"#row_number#\"],\"right\":[]},\"columnSizing\":{},\"columnVisibility\":{}},\"settings\":{\"columns\":{}},\"syncTimestamp\":1759610524390}", | |
| "filterBlob": null, | |
| "queryPlanFiltersBlob": null, | |
| "tableResultIndex": 0 | |
| } | |
| }, | |
| "title": "" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "output_type": "display_data", | |
| "data": { | |
| "text/html": [ | |
| "<style scoped>\n", | |
| " .table-result-container {\n", | |
| " max-height: 300px;\n", | |
| " overflow: auto;\n", | |
| " }\n", | |
| " table, th, td {\n", | |
| " border: 1px solid black;\n", | |
| " border-collapse: collapse;\n", | |
| " }\n", | |
| " th, td {\n", | |
| " padding: 5px;\n", | |
| " }\n", | |
| " th {\n", | |
| " text-align: left;\n", | |
| " }\n", | |
| "</style><div class='table-result-container'><table class='table-result'><thead style='background-color: white'><tr><th>table_name</th><th>is_temp_table</th></tr></thead><tbody><tr><td>table_a</td><td>false</td></tr><tr><td>table_b</td><td>false</td></tr><tr><td>table_c</td><td>false</td></tr><tr><td>table_d</td><td>false</td></tr><tr><td>table_e</td><td>false</td></tr><tr><td>table_f</td><td>false</td></tr><tr><td>table_g</td><td>false</td></tr><tr><td>table_h</td><td>false</td></tr><tr><td>table_i</td><td>false</td></tr><tr><td>table_j</td><td>false</td></tr><tr><td>table_a_v1</td><td>true</td></tr><tr><td>table_a_temp</td><td>true</td></tr><tr><td>table_a_backup</td><td>true</td></tr><tr><td>table_b_v1</td><td>true</td></tr><tr><td>table_b_temp</td><td>true</td></tr><tr><td>table_b_backup</td><td>true</td></tr><tr><td>table_c_v1</td><td>true</td></tr><tr><td>table_c_temp</td><td>true</td></tr><tr><td>table_c_backup</td><td>true</td></tr></tbody></table></div>" | |
| ] | |
| }, | |
| "metadata": { | |
| "application/vnd.databricks.v1+output": { | |
| "addedWidgets": {}, | |
| "aggData": [], | |
| "aggError": "", | |
| "aggOverflow": false, | |
| "aggSchema": [], | |
| "aggSeriesLimitReached": false, | |
| "aggType": "", | |
| "arguments": {}, | |
| "columnCustomDisplayInfos": {}, | |
| "data": [ | |
| [ | |
| "table_a", | |
| false | |
| ], | |
| [ | |
| "table_b", | |
| false | |
| ], | |
| [ | |
| "table_c", | |
| false | |
| ], | |
| [ | |
| "table_d", | |
| false | |
| ], | |
| [ | |
| "table_e", | |
| false | |
| ], | |
| [ | |
| "table_f", | |
| false | |
| ], | |
| [ | |
| "table_g", | |
| false | |
| ], | |
| [ | |
| "table_h", | |
| false | |
| ], | |
| [ | |
| "table_i", | |
| false | |
| ], | |
| [ | |
| "table_j", | |
| false | |
| ], | |
| [ | |
| "table_a_v1", | |
| true | |
| ], | |
| [ | |
| "table_a_temp", | |
| true | |
| ], | |
| [ | |
| "table_a_backup", | |
| true | |
| ], | |
| [ | |
| "table_b_v1", | |
| true | |
| ], | |
| [ | |
| "table_b_temp", | |
| true | |
| ], | |
| [ | |
| "table_b_backup", | |
| true | |
| ], | |
| [ | |
| "table_c_v1", | |
| true | |
| ], | |
| [ | |
| "table_c_temp", | |
| true | |
| ], | |
| [ | |
| "table_c_backup", | |
| true | |
| ] | |
| ], | |
| "datasetInfos": [], | |
| "dbfsResultPath": null, | |
| "isJsonSchema": true, | |
| "metadata": {}, | |
| "overflow": false, | |
| "plotOptions": { | |
| "customPlotOptions": {}, | |
| "displayType": "table", | |
| "pivotAggregation": null, | |
| "pivotColumns": null, | |
| "xColumns": null, | |
| "yColumns": null | |
| }, | |
| "removedWidgets": [], | |
| "schema": [ | |
| { | |
| "metadata": "{}", | |
| "name": "table_name", | |
| "type": "\"string\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "is_temp_table", | |
| "type": "\"boolean\"" | |
| } | |
| ], | |
| "type": "table" | |
| } | |
| }, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "tables_df.withColumn(\n", | |
| " \"is_temp_table\",\n", | |
| " F.when(F.col(\"table_name\").contains(\"_temp\"), True)\n", | |
| " .when(F.col(\"table_name\").contains(\"_v1\"), True)\n", | |
| " .when(F.col(\"table_name\").contains(\"_backup\"), True)\n", | |
| " .otherwise(False)\n", | |
| ").display()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "153d5d50-8af3-4741-885f-bcca440d8012", | |
| "showTitle": false, | |
| "tableResultSettingsMap": {}, | |
| "title": "" | |
| } | |
| }, | |
| "source": [ | |
| "## **Check suffix with endswith:**" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 0, | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "9a614143-8e51-4f3e-ba7d-25a2f3889fad", | |
| "showTitle": false, | |
| "tableResultSettingsMap": {}, | |
| "title": "" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "output_type": "display_data", | |
| "data": { | |
| "text/html": [ | |
| "<style scoped>\n", | |
| " .table-result-container {\n", | |
| " max-height: 300px;\n", | |
| " overflow: auto;\n", | |
| " }\n", | |
| " table, th, td {\n", | |
| " border: 1px solid black;\n", | |
| " border-collapse: collapse;\n", | |
| " }\n", | |
| " th, td {\n", | |
| " padding: 5px;\n", | |
| " }\n", | |
| " th {\n", | |
| " text-align: left;\n", | |
| " }\n", | |
| "</style><div class='table-result-container'><table class='table-result'><thead style='background-color: white'><tr><th>table_name</th><th>is_temp_table</th></tr></thead><tbody><tr><td>table_a</td><td>false</td></tr><tr><td>table_b</td><td>false</td></tr><tr><td>table_c</td><td>false</td></tr><tr><td>table_d</td><td>false</td></tr><tr><td>table_e</td><td>false</td></tr><tr><td>table_f</td><td>false</td></tr><tr><td>table_g</td><td>false</td></tr><tr><td>table_h</td><td>false</td></tr><tr><td>table_i</td><td>false</td></tr><tr><td>table_j</td><td>false</td></tr><tr><td>table_a_v1</td><td>true</td></tr><tr><td>table_a_temp</td><td>true</td></tr><tr><td>table_a_backup</td><td>true</td></tr><tr><td>table_b_v1</td><td>true</td></tr><tr><td>table_b_temp</td><td>true</td></tr><tr><td>table_b_backup</td><td>true</td></tr><tr><td>table_c_v1</td><td>true</td></tr><tr><td>table_c_temp</td><td>true</td></tr><tr><td>table_c_backup</td><td>true</td></tr></tbody></table></div>" | |
| ] | |
| }, | |
| "metadata": { | |
| "application/vnd.databricks.v1+output": { | |
| "addedWidgets": {}, | |
| "aggData": [], | |
| "aggError": "", | |
| "aggOverflow": false, | |
| "aggSchema": [], | |
| "aggSeriesLimitReached": false, | |
| "aggType": "", | |
| "arguments": {}, | |
| "columnCustomDisplayInfos": {}, | |
| "data": [ | |
| [ | |
| "table_a", | |
| false | |
| ], | |
| [ | |
| "table_b", | |
| false | |
| ], | |
| [ | |
| "table_c", | |
| false | |
| ], | |
| [ | |
| "table_d", | |
| false | |
| ], | |
| [ | |
| "table_e", | |
| false | |
| ], | |
| [ | |
| "table_f", | |
| false | |
| ], | |
| [ | |
| "table_g", | |
| false | |
| ], | |
| [ | |
| "table_h", | |
| false | |
| ], | |
| [ | |
| "table_i", | |
| false | |
| ], | |
| [ | |
| "table_j", | |
| false | |
| ], | |
| [ | |
| "table_a_v1", | |
| true | |
| ], | |
| [ | |
| "table_a_temp", | |
| true | |
| ], | |
| [ | |
| "table_a_backup", | |
| true | |
| ], | |
| [ | |
| "table_b_v1", | |
| true | |
| ], | |
| [ | |
| "table_b_temp", | |
| true | |
| ], | |
| [ | |
| "table_b_backup", | |
| true | |
| ], | |
| [ | |
| "table_c_v1", | |
| true | |
| ], | |
| [ | |
| "table_c_temp", | |
| true | |
| ], | |
| [ | |
| "table_c_backup", | |
| true | |
| ] | |
| ], | |
| "datasetInfos": [], | |
| "dbfsResultPath": null, | |
| "isJsonSchema": true, | |
| "metadata": {}, | |
| "overflow": false, | |
| "plotOptions": { | |
| "customPlotOptions": {}, | |
| "displayType": "table", | |
| "pivotAggregation": null, | |
| "pivotColumns": null, | |
| "xColumns": null, | |
| "yColumns": null | |
| }, | |
| "removedWidgets": [], | |
| "schema": [ | |
| { | |
| "metadata": "{}", | |
| "name": "table_name", | |
| "type": "\"string\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "is_temp_table", | |
| "type": "\"boolean\"" | |
| } | |
| ], | |
| "type": "table" | |
| } | |
| }, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "tables_df.withColumn(\n", | |
| " \"is_temp_table\",\n", | |
| " F.when(F.col(\"table_name\").endswith(\"_temp\"), True)\n", | |
| " .when(F.col(\"table_name\").endswith(\"_v1\"), True)\n", | |
| " .when(F.col(\"table_name\").endswith(\"_backup\"), True)\n", | |
| " .otherwise(False)\n", | |
| ").display()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "d475df5d-18fd-4f34-95db-4ce2f9a22aa6", | |
| "showTitle": false, | |
| "tableResultSettingsMap": {}, | |
| "title": "" | |
| } | |
| }, | |
| "source": [ | |
| "## **The reduce:**" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 0, | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "2956f179-3f3d-482c-b701-9363a5e0fbed", | |
| "showTitle": false, | |
| "tableResultSettingsMap": { | |
| "0": { | |
| "dataGridStateBlob": "{\"version\":1,\"tableState\":{\"columnPinning\":{\"left\":[\"#row_number#\"],\"right\":[]},\"columnSizing\":{},\"columnVisibility\":{}},\"settings\":{\"columns\":{}},\"syncTimestamp\":1759610626771}", | |
| "filterBlob": null, | |
| "queryPlanFiltersBlob": null, | |
| "tableResultIndex": 0 | |
| } | |
| }, | |
| "title": "" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "output_type": "display_data", | |
| "data": { | |
| "text/html": [ | |
| "<style scoped>\n", | |
| " .table-result-container {\n", | |
| " max-height: 300px;\n", | |
| " overflow: auto;\n", | |
| " }\n", | |
| " table, th, td {\n", | |
| " border: 1px solid black;\n", | |
| " border-collapse: collapse;\n", | |
| " }\n", | |
| " th, td {\n", | |
| " padding: 5px;\n", | |
| " }\n", | |
| " th {\n", | |
| " text-align: left;\n", | |
| " }\n", | |
| "</style><div class='table-result-container'><table class='table-result'><thead style='background-color: white'><tr><th>collect_list(id)</th></tr></thead><tbody><tr><td>List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9)</td></tr></tbody></table></div>" | |
| ] | |
| }, | |
| "metadata": { | |
| "application/vnd.databricks.v1+output": { | |
| "addedWidgets": {}, | |
| "aggData": [], | |
| "aggError": "", | |
| "aggOverflow": false, | |
| "aggSchema": [], | |
| "aggSeriesLimitReached": false, | |
| "aggType": "", | |
| "arguments": {}, | |
| "columnCustomDisplayInfos": {}, | |
| "data": [ | |
| [ | |
| [ | |
| 0, | |
| 1, | |
| 2, | |
| 3, | |
| 4, | |
| 5, | |
| 6, | |
| 7, | |
| 8, | |
| 9 | |
| ] | |
| ] | |
| ], | |
| "datasetInfos": [], | |
| "dbfsResultPath": null, | |
| "isJsonSchema": true, | |
| "metadata": {}, | |
| "overflow": false, | |
| "plotOptions": { | |
| "customPlotOptions": {}, | |
| "displayType": "table", | |
| "pivotAggregation": null, | |
| "pivotColumns": null, | |
| "xColumns": null, | |
| "yColumns": null | |
| }, | |
| "removedWidgets": [], | |
| "schema": [ | |
| { | |
| "metadata": "{\"__autoGeneratedAlias\": \"true\"}", | |
| "name": "collect_list(id)", | |
| "type": "{\"containsNull\":false,\"elementType\":\"long\",\"type\":\"array\"}" | |
| } | |
| ], | |
| "type": "table" | |
| } | |
| }, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "dummy_df = spark.range(10)\n", | |
| "dummy_df.select(\n", | |
| " F.collect_list(\"id\")\n", | |
| ").display()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 0, | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "fc87ba89-fbe5-406a-9b98-34b45375e976", | |
| "showTitle": false, | |
| "tableResultSettingsMap": { | |
| "0": { | |
| "dataGridStateBlob": "{\"version\":1,\"tableState\":{\"columnPinning\":{\"left\":[\"#row_number#\"],\"right\":[]},\"columnSizing\":{},\"columnVisibility\":{}},\"settings\":{\"columns\":{}},\"syncTimestamp\":1759609942733}", | |
| "filterBlob": null, | |
| "queryPlanFiltersBlob": null, | |
| "tableResultIndex": 0 | |
| } | |
| }, | |
| "title": "" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "output_type": "display_data", | |
| "data": { | |
| "text/html": [ | |
| "<style scoped>\n", | |
| " .table-result-container {\n", | |
| " max-height: 300px;\n", | |
| " overflow: auto;\n", | |
| " }\n", | |
| " table, th, td {\n", | |
| " border: 1px solid black;\n", | |
| " border-collapse: collapse;\n", | |
| " }\n", | |
| " th, td {\n", | |
| " padding: 5px;\n", | |
| " }\n", | |
| " th {\n", | |
| " text-align: left;\n", | |
| " }\n", | |
| "</style><div class='table-result-container'><table class='table-result'><thead style='background-color: white'><tr><th>sum_id</th></tr></thead><tbody><tr><td>45</td></tr></tbody></table></div>" | |
| ] | |
| }, | |
| "metadata": { | |
| "application/vnd.databricks.v1+output": { | |
| "addedWidgets": {}, | |
| "aggData": [], | |
| "aggError": "", | |
| "aggOverflow": false, | |
| "aggSchema": [], | |
| "aggSeriesLimitReached": false, | |
| "aggType": "", | |
| "arguments": {}, | |
| "columnCustomDisplayInfos": {}, | |
| "data": [ | |
| [ | |
| 45 | |
| ] | |
| ], | |
| "datasetInfos": [], | |
| "dbfsResultPath": null, | |
| "isJsonSchema": true, | |
| "metadata": {}, | |
| "overflow": false, | |
| "plotOptions": { | |
| "customPlotOptions": {}, | |
| "displayType": "table", | |
| "pivotAggregation": null, | |
| "pivotColumns": null, | |
| "xColumns": null, | |
| "yColumns": null | |
| }, | |
| "removedWidgets": [], | |
| "schema": [ | |
| { | |
| "metadata": "{}", | |
| "name": "sum_id", | |
| "type": "\"long\"" | |
| } | |
| ], | |
| "type": "table" | |
| } | |
| }, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "def simple_sum(acc, x):\n", | |
| " return acc + x\n", | |
| "\n", | |
| "dummy_df.select(\n", | |
| " F.reduce(\n", | |
| " col=F.collect_list(\"id\"), # array to be reduced\n", | |
| " initialValue=F.lit(0).cast(\"bigint\"), # initial value (accumulator)\n", | |
| " # lambda acc, x: acc + x\n", | |
| " merge=simple_sum # function to merge the accumulator with the next element\n", | |
| " ).alias(\"sum_id\")\n", | |
| ").display()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "7d8248ae-1949-4c12-9310-1d95c01ce186", | |
| "showTitle": false, | |
| "tableResultSettingsMap": {}, | |
| "title": "" | |
| } | |
| }, | |
| "source": [ | |
| "## **Check suffix with reduce:**" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 0, | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "d079f1b3-1206-4e2f-98c0-cc9726406ca8", | |
| "showTitle": false, | |
| "tableResultSettingsMap": { | |
| "0": { | |
| "dataGridStateBlob": "{\"version\":1,\"tableState\":{\"columnPinning\":{\"left\":[\"#row_number#\"],\"right\":[]},\"columnSizing\":{},\"columnVisibility\":{}},\"settings\":{\"columns\":{}},\"syncTimestamp\":1759609223978}", | |
| "filterBlob": null, | |
| "queryPlanFiltersBlob": null, | |
| "tableResultIndex": 0 | |
| } | |
| }, | |
| "title": "" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "output_type": "display_data", | |
| "data": { | |
| "text/html": [ | |
| "<style scoped>\n", | |
| " .table-result-container {\n", | |
| " max-height: 300px;\n", | |
| " overflow: auto;\n", | |
| " }\n", | |
| " table, th, td {\n", | |
| " border: 1px solid black;\n", | |
| " border-collapse: collapse;\n", | |
| " }\n", | |
| " th, td {\n", | |
| " padding: 5px;\n", | |
| " }\n", | |
| " th {\n", | |
| " text-align: left;\n", | |
| " }\n", | |
| "</style><div class='table-result-container'><table class='table-result'><thead style='background-color: white'><tr><th>table_name</th><th>is_temp_table</th></tr></thead><tbody><tr><td>table_a</td><td>false</td></tr><tr><td>table_b</td><td>false</td></tr><tr><td>table_c</td><td>false</td></tr><tr><td>table_d</td><td>false</td></tr><tr><td>table_e</td><td>false</td></tr><tr><td>table_f</td><td>false</td></tr><tr><td>table_g</td><td>false</td></tr><tr><td>table_h</td><td>false</td></tr><tr><td>table_i</td><td>false</td></tr><tr><td>table_j</td><td>false</td></tr><tr><td>table_a_v1</td><td>true</td></tr><tr><td>table_a_temp</td><td>true</td></tr><tr><td>table_a_backup</td><td>true</td></tr><tr><td>table_b_v1</td><td>true</td></tr><tr><td>table_b_temp</td><td>true</td></tr><tr><td>table_b_backup</td><td>true</td></tr><tr><td>table_c_v1</td><td>true</td></tr><tr><td>table_c_temp</td><td>true</td></tr><tr><td>table_c_backup</td><td>true</td></tr></tbody></table></div>" | |
| ] | |
| }, | |
| "metadata": { | |
| "application/vnd.databricks.v1+output": { | |
| "addedWidgets": {}, | |
| "aggData": [], | |
| "aggError": "", | |
| "aggOverflow": false, | |
| "aggSchema": [], | |
| "aggSeriesLimitReached": false, | |
| "aggType": "", | |
| "arguments": {}, | |
| "columnCustomDisplayInfos": {}, | |
| "data": [ | |
| [ | |
| "table_a", | |
| false | |
| ], | |
| [ | |
| "table_b", | |
| false | |
| ], | |
| [ | |
| "table_c", | |
| false | |
| ], | |
| [ | |
| "table_d", | |
| false | |
| ], | |
| [ | |
| "table_e", | |
| false | |
| ], | |
| [ | |
| "table_f", | |
| false | |
| ], | |
| [ | |
| "table_g", | |
| false | |
| ], | |
| [ | |
| "table_h", | |
| false | |
| ], | |
| [ | |
| "table_i", | |
| false | |
| ], | |
| [ | |
| "table_j", | |
| false | |
| ], | |
| [ | |
| "table_a_v1", | |
| true | |
| ], | |
| [ | |
| "table_a_temp", | |
| true | |
| ], | |
| [ | |
| "table_a_backup", | |
| true | |
| ], | |
| [ | |
| "table_b_v1", | |
| true | |
| ], | |
| [ | |
| "table_b_temp", | |
| true | |
| ], | |
| [ | |
| "table_b_backup", | |
| true | |
| ], | |
| [ | |
| "table_c_v1", | |
| true | |
| ], | |
| [ | |
| "table_c_temp", | |
| true | |
| ], | |
| [ | |
| "table_c_backup", | |
| true | |
| ] | |
| ], | |
| "datasetInfos": [], | |
| "dbfsResultPath": null, | |
| "isJsonSchema": true, | |
| "metadata": {}, | |
| "overflow": false, | |
| "plotOptions": { | |
| "customPlotOptions": {}, | |
| "displayType": "table", | |
| "pivotAggregation": null, | |
| "pivotColumns": null, | |
| "xColumns": null, | |
| "yColumns": null | |
| }, | |
| "removedWidgets": [], | |
| "schema": [ | |
| { | |
| "metadata": "{}", | |
| "name": "table_name", | |
| "type": "\"string\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "is_temp_table", | |
| "type": "\"boolean\"" | |
| } | |
| ], | |
| "type": "table" | |
| } | |
| }, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "suffix_array = F.array([F.lit(s) for s in temp_suffixes]) # sintetic array to check temp suffixes\n", | |
| "\n", | |
| "def check_suffix(acc, x):\n", | |
| " return acc | F.col(\"table_name\").endswith(x)\n", | |
| "\n", | |
| "tables_df.withColumn(\n", | |
| " \"is_temp_table\",\n", | |
| " F.reduce(\n", | |
| " col=suffix_array,\n", | |
| " initialValue=F.lit(False),\n", | |
| " merge=check_suffix\n", | |
| " )\n", | |
| ").display()\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "9301dc0b-45a8-45e8-b7c4-f2d4fa988ff7", | |
| "showTitle": false, | |
| "tableResultSettingsMap": {}, | |
| "title": "" | |
| } | |
| }, | |
| "source": [ | |
| "## (Easiest) **Check suffix with regex:**" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 0, | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "c5059eb1-1b8f-406e-8743-064479ba8570", | |
| "showTitle": false, | |
| "tableResultSettingsMap": { | |
| "0": { | |
| "dataGridStateBlob": "{\"version\":1,\"tableState\":{\"columnPinning\":{\"left\":[\"#row_number#\"],\"right\":[]},\"columnSizing\":{},\"columnVisibility\":{}},\"settings\":{\"columns\":{}},\"syncTimestamp\":1759610296168}", | |
| "filterBlob": null, | |
| "queryPlanFiltersBlob": null, | |
| "tableResultIndex": 0 | |
| } | |
| }, | |
| "title": "" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "regex_pattern_temp_suffixes (_v1|_temp|_backup)\n" | |
| ] | |
| }, | |
| { | |
| "output_type": "display_data", | |
| "data": { | |
| "text/html": [ | |
| "<style scoped>\n", | |
| " .table-result-container {\n", | |
| " max-height: 300px;\n", | |
| " overflow: auto;\n", | |
| " }\n", | |
| " table, th, td {\n", | |
| " border: 1px solid black;\n", | |
| " border-collapse: collapse;\n", | |
| " }\n", | |
| " th, td {\n", | |
| " padding: 5px;\n", | |
| " }\n", | |
| " th {\n", | |
| " text-align: left;\n", | |
| " }\n", | |
| "</style><div class='table-result-container'><table class='table-result'><thead style='background-color: white'><tr><th>table_name</th><th>is_temp_table</th></tr></thead><tbody><tr><td>table_a</td><td>false</td></tr><tr><td>table_b</td><td>false</td></tr><tr><td>table_c</td><td>false</td></tr><tr><td>table_d</td><td>false</td></tr><tr><td>table_e</td><td>false</td></tr><tr><td>table_f</td><td>false</td></tr><tr><td>table_g</td><td>false</td></tr><tr><td>table_h</td><td>false</td></tr><tr><td>table_i</td><td>false</td></tr><tr><td>table_j</td><td>false</td></tr><tr><td>table_a_v1</td><td>true</td></tr><tr><td>table_a_temp</td><td>true</td></tr><tr><td>table_a_backup</td><td>true</td></tr><tr><td>table_b_v1</td><td>true</td></tr><tr><td>table_b_temp</td><td>true</td></tr><tr><td>table_b_backup</td><td>true</td></tr><tr><td>table_c_v1</td><td>true</td></tr><tr><td>table_c_temp</td><td>true</td></tr><tr><td>table_c_backup</td><td>true</td></tr></tbody></table></div>" | |
| ] | |
| }, | |
| "metadata": { | |
| "application/vnd.databricks.v1+output": { | |
| "addedWidgets": {}, | |
| "aggData": [], | |
| "aggError": "", | |
| "aggOverflow": false, | |
| "aggSchema": [], | |
| "aggSeriesLimitReached": false, | |
| "aggType": "", | |
| "arguments": {}, | |
| "columnCustomDisplayInfos": {}, | |
| "data": [ | |
| [ | |
| "table_a", | |
| false | |
| ], | |
| [ | |
| "table_b", | |
| false | |
| ], | |
| [ | |
| "table_c", | |
| false | |
| ], | |
| [ | |
| "table_d", | |
| false | |
| ], | |
| [ | |
| "table_e", | |
| false | |
| ], | |
| [ | |
| "table_f", | |
| false | |
| ], | |
| [ | |
| "table_g", | |
| false | |
| ], | |
| [ | |
| "table_h", | |
| false | |
| ], | |
| [ | |
| "table_i", | |
| false | |
| ], | |
| [ | |
| "table_j", | |
| false | |
| ], | |
| [ | |
| "table_a_v1", | |
| true | |
| ], | |
| [ | |
| "table_a_temp", | |
| true | |
| ], | |
| [ | |
| "table_a_backup", | |
| true | |
| ], | |
| [ | |
| "table_b_v1", | |
| true | |
| ], | |
| [ | |
| "table_b_temp", | |
| true | |
| ], | |
| [ | |
| "table_b_backup", | |
| true | |
| ], | |
| [ | |
| "table_c_v1", | |
| true | |
| ], | |
| [ | |
| "table_c_temp", | |
| true | |
| ], | |
| [ | |
| "table_c_backup", | |
| true | |
| ] | |
| ], | |
| "datasetInfos": [], | |
| "dbfsResultPath": null, | |
| "isJsonSchema": true, | |
| "metadata": {}, | |
| "overflow": false, | |
| "plotOptions": { | |
| "customPlotOptions": {}, | |
| "displayType": "table", | |
| "pivotAggregation": null, | |
| "pivotColumns": null, | |
| "xColumns": null, | |
| "yColumns": null | |
| }, | |
| "removedWidgets": [], | |
| "schema": [ | |
| { | |
| "metadata": "{}", | |
| "name": "table_name", | |
| "type": "\"string\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "is_temp_table", | |
| "type": "\"boolean\"" | |
| } | |
| ], | |
| "type": "table" | |
| } | |
| }, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "regex_pattern_temp_suffixes = \"(\" + \"|\".join(map(re.escape, temp_suffixes)) + \")\"\n", | |
| "print(\"regex_pattern_temp_suffixes\", regex_pattern_temp_suffixes)\n", | |
| "tables_df.withColumn(\n", | |
| " \"is_temp_table\",\n", | |
| " F.col(\"table_name\").rlike(regex_pattern_temp_suffixes)\n", | |
| ").display()" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "application/vnd.databricks.v1+notebook": { | |
| "computePreferences": null, | |
| "dashboards": [], | |
| "environmentMetadata": { | |
| "base_environment": "", | |
| "environment_version": "3" | |
| }, | |
| "inputWidgetPreferences": null, | |
| "language": "python", | |
| "notebookMetadata": { | |
| "pythonIndentUnit": 4 | |
| }, | |
| "notebookName": "reduce_in_pyspark", | |
| "widgets": {} | |
| }, | |
| "language_info": { | |
| "name": "python" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 0 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment