Skip to content

Instantly share code, notes, and snippets.

@calilisantos
Created October 4, 2025 20:49
Show Gist options
  • Select an option

  • Save calilisantos/30d35f58b7a417dffae2789f79ed2a4c to your computer and use it in GitHub Desktop.

Select an option

Save calilisantos/30d35f58b7a417dffae2789f79ed2a4c to your computer and use it in GitHub Desktop.
Reduce in Pyspark (?!!)
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "0f207f96-f616-437a-95ab-f6f814615db0",
"showTitle": false,
"tableResultSettingsMap": {},
"title": ""
}
},
"source": [
"## **Dependencies:**"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "5d183aef-7b6c-45cc-96d9-5095b1a9f3f2",
"showTitle": false,
"tableResultSettingsMap": {},
"title": ""
}
},
"outputs": [],
"source": [
"from itertools import product\n",
"from pyspark.sql import functions as F\n",
"import string"
]
},
{
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "780094a8-feb1-4667-9df5-50071283658e",
"showTitle": false,
"tableResultSettingsMap": {},
"title": ""
}
},
"source": [
"## **Sample data:**"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "09350909-c729-4bf9-8a6d-25e0a376206a",
"showTitle": false,
"tableResultSettingsMap": {},
"title": ""
}
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"output_type": "stream",
"text": [
"['table_a', 'table_b', 'table_c', 'table_d', 'table_e', 'table_f', 'table_g', 'table_h', 'table_i', 'table_j']\n"
]
}
],
"source": [
"# Lowercase alphabet\n",
"tables = [f\"table_{letter}\" for letter in string.ascii_lowercase[:10]]\n",
"\n",
"print(tables)"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "0472112d-3ef3-4c8b-9194-67eebb4a9a70",
"showTitle": false,
"tableResultSettingsMap": {},
"title": ""
}
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"output_type": "stream",
"text": [
"['table_a_v1', 'table_a_temp', 'table_a_backup', 'table_b_v1', 'table_b_temp', 'table_b_backup', 'table_c_v1', 'table_c_temp', 'table_c_backup']\n"
]
}
],
"source": [
"temp_suffixes = [\"_v1\", \"_temp\", \"_backup\"]\n",
"temp_tables = [f\"{table}{suffix}\" for table, suffix in product(tables[:3], temp_suffixes)]\n",
"print(temp_tables)"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "bb111873-0643-40aa-b7fe-10168e5f1b06",
"showTitle": false,
"tableResultSettingsMap": {},
"title": ""
}
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'table_a',\n",
" 'table_a_backup',\n",
" 'table_a_temp',\n",
" 'table_a_v1',\n",
" 'table_b',\n",
" 'table_b_backup',\n",
" 'table_b_temp',\n",
" 'table_b_v1',\n",
" 'table_c',\n",
" 'table_c_backup',\n",
" 'table_c_temp',\n",
" 'table_c_v1',\n",
" 'table_d',\n",
" 'table_e',\n",
" 'table_f',\n",
" 'table_g',\n",
" 'table_h',\n",
" 'table_i',\n",
" 'table_j'}"
]
},
"execution_count": 98,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tables.extend(temp_tables)\n",
"sample_tables = set(sorted(tables))\n",
"sample_tables"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "2a19aff0-f57c-4f80-8dd5-72b49ffbe795",
"showTitle": false,
"tableResultSettingsMap": {
"0": {
"dataGridStateBlob": "{\"version\":1,\"tableState\":{\"columnPinning\":{\"left\":[\"#row_number#\"],\"right\":[]},\"columnSizing\":{},\"columnVisibility\":{}},\"settings\":{\"columns\":{}},\"syncTimestamp\":1759596037628}",
"filterBlob": null,
"queryPlanFiltersBlob": null,
"tableResultIndex": 0
}
},
"title": ""
}
},
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"<style scoped>\n",
" .table-result-container {\n",
" max-height: 300px;\n",
" overflow: auto;\n",
" }\n",
" table, th, td {\n",
" border: 1px solid black;\n",
" border-collapse: collapse;\n",
" }\n",
" th, td {\n",
" padding: 5px;\n",
" }\n",
" th {\n",
" text-align: left;\n",
" }\n",
"</style><div class='table-result-container'><table class='table-result'><thead style='background-color: white'><tr><th>table_name</th></tr></thead><tbody><tr><td>table_a</td></tr><tr><td>table_b</td></tr><tr><td>table_c</td></tr><tr><td>table_d</td></tr><tr><td>table_e</td></tr><tr><td>table_f</td></tr><tr><td>table_g</td></tr><tr><td>table_h</td></tr><tr><td>table_i</td></tr><tr><td>table_j</td></tr><tr><td>table_a_v1</td></tr><tr><td>table_a_temp</td></tr><tr><td>table_a_backup</td></tr><tr><td>table_b_v1</td></tr><tr><td>table_b_temp</td></tr><tr><td>table_b_backup</td></tr><tr><td>table_c_v1</td></tr><tr><td>table_c_temp</td></tr><tr><td>table_c_backup</td></tr></tbody></table></div>"
]
},
"metadata": {
"application/vnd.databricks.v1+output": {
"addedWidgets": {},
"aggData": [],
"aggError": "",
"aggOverflow": false,
"aggSchema": [],
"aggSeriesLimitReached": false,
"aggType": "",
"arguments": {},
"columnCustomDisplayInfos": {},
"data": [
[
"table_a"
],
[
"table_b"
],
[
"table_c"
],
[
"table_d"
],
[
"table_e"
],
[
"table_f"
],
[
"table_g"
],
[
"table_h"
],
[
"table_i"
],
[
"table_j"
],
[
"table_a_v1"
],
[
"table_a_temp"
],
[
"table_a_backup"
],
[
"table_b_v1"
],
[
"table_b_temp"
],
[
"table_b_backup"
],
[
"table_c_v1"
],
[
"table_c_temp"
],
[
"table_c_backup"
]
],
"datasetInfos": [],
"dbfsResultPath": null,
"isJsonSchema": true,
"metadata": {},
"overflow": false,
"plotOptions": {
"customPlotOptions": {},
"displayType": "table",
"pivotAggregation": null,
"pivotColumns": null,
"xColumns": null,
"yColumns": null
},
"removedWidgets": [],
"schema": [
{
"metadata": "{}",
"name": "table_name",
"type": "\"string\""
}
],
"type": "table"
}
},
"output_type": "display_data"
}
],
"source": [
"tables_df = spark.createDataFrame(\n",
" data=[(table,) for table in tables], \n",
" schema=\"table_name STRING\"\n",
")\n",
"display(tables_df)"
]
},
{
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "3053d967-8013-4288-82dc-4ea0a5f35bca",
"showTitle": false,
"tableResultSettingsMap": {},
"title": ""
}
},
"source": [
"## **Check suffix with contains:**"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "d872ecc6-35d4-4725-8989-19064f7fb5a0",
"showTitle": false,
"tableResultSettingsMap": {
"0": {
"dataGridStateBlob": "{\"version\":1,\"tableState\":{\"columnPinning\":{\"left\":[\"#row_number#\"],\"right\":[]},\"columnSizing\":{},\"columnVisibility\":{}},\"settings\":{\"columns\":{}},\"syncTimestamp\":1759610524390}",
"filterBlob": null,
"queryPlanFiltersBlob": null,
"tableResultIndex": 0
}
},
"title": ""
}
},
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"<style scoped>\n",
" .table-result-container {\n",
" max-height: 300px;\n",
" overflow: auto;\n",
" }\n",
" table, th, td {\n",
" border: 1px solid black;\n",
" border-collapse: collapse;\n",
" }\n",
" th, td {\n",
" padding: 5px;\n",
" }\n",
" th {\n",
" text-align: left;\n",
" }\n",
"</style><div class='table-result-container'><table class='table-result'><thead style='background-color: white'><tr><th>table_name</th><th>is_temp_table</th></tr></thead><tbody><tr><td>table_a</td><td>false</td></tr><tr><td>table_b</td><td>false</td></tr><tr><td>table_c</td><td>false</td></tr><tr><td>table_d</td><td>false</td></tr><tr><td>table_e</td><td>false</td></tr><tr><td>table_f</td><td>false</td></tr><tr><td>table_g</td><td>false</td></tr><tr><td>table_h</td><td>false</td></tr><tr><td>table_i</td><td>false</td></tr><tr><td>table_j</td><td>false</td></tr><tr><td>table_a_v1</td><td>true</td></tr><tr><td>table_a_temp</td><td>true</td></tr><tr><td>table_a_backup</td><td>true</td></tr><tr><td>table_b_v1</td><td>true</td></tr><tr><td>table_b_temp</td><td>true</td></tr><tr><td>table_b_backup</td><td>true</td></tr><tr><td>table_c_v1</td><td>true</td></tr><tr><td>table_c_temp</td><td>true</td></tr><tr><td>table_c_backup</td><td>true</td></tr></tbody></table></div>"
]
},
"metadata": {
"application/vnd.databricks.v1+output": {
"addedWidgets": {},
"aggData": [],
"aggError": "",
"aggOverflow": false,
"aggSchema": [],
"aggSeriesLimitReached": false,
"aggType": "",
"arguments": {},
"columnCustomDisplayInfos": {},
"data": [
[
"table_a",
false
],
[
"table_b",
false
],
[
"table_c",
false
],
[
"table_d",
false
],
[
"table_e",
false
],
[
"table_f",
false
],
[
"table_g",
false
],
[
"table_h",
false
],
[
"table_i",
false
],
[
"table_j",
false
],
[
"table_a_v1",
true
],
[
"table_a_temp",
true
],
[
"table_a_backup",
true
],
[
"table_b_v1",
true
],
[
"table_b_temp",
true
],
[
"table_b_backup",
true
],
[
"table_c_v1",
true
],
[
"table_c_temp",
true
],
[
"table_c_backup",
true
]
],
"datasetInfos": [],
"dbfsResultPath": null,
"isJsonSchema": true,
"metadata": {},
"overflow": false,
"plotOptions": {
"customPlotOptions": {},
"displayType": "table",
"pivotAggregation": null,
"pivotColumns": null,
"xColumns": null,
"yColumns": null
},
"removedWidgets": [],
"schema": [
{
"metadata": "{}",
"name": "table_name",
"type": "\"string\""
},
{
"metadata": "{}",
"name": "is_temp_table",
"type": "\"boolean\""
}
],
"type": "table"
}
},
"output_type": "display_data"
}
],
"source": [
"tables_df.withColumn(\n",
" \"is_temp_table\",\n",
" F.when(F.col(\"table_name\").contains(\"_temp\"), True)\n",
" .when(F.col(\"table_name\").contains(\"_v1\"), True)\n",
" .when(F.col(\"table_name\").contains(\"_backup\"), True)\n",
" .otherwise(False)\n",
").display()"
]
},
{
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "153d5d50-8af3-4741-885f-bcca440d8012",
"showTitle": false,
"tableResultSettingsMap": {},
"title": ""
}
},
"source": [
"## **Check suffix with endswith:**"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "9a614143-8e51-4f3e-ba7d-25a2f3889fad",
"showTitle": false,
"tableResultSettingsMap": {},
"title": ""
}
},
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"<style scoped>\n",
" .table-result-container {\n",
" max-height: 300px;\n",
" overflow: auto;\n",
" }\n",
" table, th, td {\n",
" border: 1px solid black;\n",
" border-collapse: collapse;\n",
" }\n",
" th, td {\n",
" padding: 5px;\n",
" }\n",
" th {\n",
" text-align: left;\n",
" }\n",
"</style><div class='table-result-container'><table class='table-result'><thead style='background-color: white'><tr><th>table_name</th><th>is_temp_table</th></tr></thead><tbody><tr><td>table_a</td><td>false</td></tr><tr><td>table_b</td><td>false</td></tr><tr><td>table_c</td><td>false</td></tr><tr><td>table_d</td><td>false</td></tr><tr><td>table_e</td><td>false</td></tr><tr><td>table_f</td><td>false</td></tr><tr><td>table_g</td><td>false</td></tr><tr><td>table_h</td><td>false</td></tr><tr><td>table_i</td><td>false</td></tr><tr><td>table_j</td><td>false</td></tr><tr><td>table_a_v1</td><td>true</td></tr><tr><td>table_a_temp</td><td>true</td></tr><tr><td>table_a_backup</td><td>true</td></tr><tr><td>table_b_v1</td><td>true</td></tr><tr><td>table_b_temp</td><td>true</td></tr><tr><td>table_b_backup</td><td>true</td></tr><tr><td>table_c_v1</td><td>true</td></tr><tr><td>table_c_temp</td><td>true</td></tr><tr><td>table_c_backup</td><td>true</td></tr></tbody></table></div>"
]
},
"metadata": {
"application/vnd.databricks.v1+output": {
"addedWidgets": {},
"aggData": [],
"aggError": "",
"aggOverflow": false,
"aggSchema": [],
"aggSeriesLimitReached": false,
"aggType": "",
"arguments": {},
"columnCustomDisplayInfos": {},
"data": [
[
"table_a",
false
],
[
"table_b",
false
],
[
"table_c",
false
],
[
"table_d",
false
],
[
"table_e",
false
],
[
"table_f",
false
],
[
"table_g",
false
],
[
"table_h",
false
],
[
"table_i",
false
],
[
"table_j",
false
],
[
"table_a_v1",
true
],
[
"table_a_temp",
true
],
[
"table_a_backup",
true
],
[
"table_b_v1",
true
],
[
"table_b_temp",
true
],
[
"table_b_backup",
true
],
[
"table_c_v1",
true
],
[
"table_c_temp",
true
],
[
"table_c_backup",
true
]
],
"datasetInfos": [],
"dbfsResultPath": null,
"isJsonSchema": true,
"metadata": {},
"overflow": false,
"plotOptions": {
"customPlotOptions": {},
"displayType": "table",
"pivotAggregation": null,
"pivotColumns": null,
"xColumns": null,
"yColumns": null
},
"removedWidgets": [],
"schema": [
{
"metadata": "{}",
"name": "table_name",
"type": "\"string\""
},
{
"metadata": "{}",
"name": "is_temp_table",
"type": "\"boolean\""
}
],
"type": "table"
}
},
"output_type": "display_data"
}
],
"source": [
"tables_df.withColumn(\n",
" \"is_temp_table\",\n",
" F.when(F.col(\"table_name\").endswith(\"_temp\"), True)\n",
" .when(F.col(\"table_name\").endswith(\"_v1\"), True)\n",
" .when(F.col(\"table_name\").endswith(\"_backup\"), True)\n",
" .otherwise(False)\n",
").display()"
]
},
{
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "d475df5d-18fd-4f34-95db-4ce2f9a22aa6",
"showTitle": false,
"tableResultSettingsMap": {},
"title": ""
}
},
"source": [
"## **The reduce:**"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "2956f179-3f3d-482c-b701-9363a5e0fbed",
"showTitle": false,
"tableResultSettingsMap": {
"0": {
"dataGridStateBlob": "{\"version\":1,\"tableState\":{\"columnPinning\":{\"left\":[\"#row_number#\"],\"right\":[]},\"columnSizing\":{},\"columnVisibility\":{}},\"settings\":{\"columns\":{}},\"syncTimestamp\":1759610626771}",
"filterBlob": null,
"queryPlanFiltersBlob": null,
"tableResultIndex": 0
}
},
"title": ""
}
},
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"<style scoped>\n",
" .table-result-container {\n",
" max-height: 300px;\n",
" overflow: auto;\n",
" }\n",
" table, th, td {\n",
" border: 1px solid black;\n",
" border-collapse: collapse;\n",
" }\n",
" th, td {\n",
" padding: 5px;\n",
" }\n",
" th {\n",
" text-align: left;\n",
" }\n",
"</style><div class='table-result-container'><table class='table-result'><thead style='background-color: white'><tr><th>collect_list(id)</th></tr></thead><tbody><tr><td>List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9)</td></tr></tbody></table></div>"
]
},
"metadata": {
"application/vnd.databricks.v1+output": {
"addedWidgets": {},
"aggData": [],
"aggError": "",
"aggOverflow": false,
"aggSchema": [],
"aggSeriesLimitReached": false,
"aggType": "",
"arguments": {},
"columnCustomDisplayInfos": {},
"data": [
[
[
0,
1,
2,
3,
4,
5,
6,
7,
8,
9
]
]
],
"datasetInfos": [],
"dbfsResultPath": null,
"isJsonSchema": true,
"metadata": {},
"overflow": false,
"plotOptions": {
"customPlotOptions": {},
"displayType": "table",
"pivotAggregation": null,
"pivotColumns": null,
"xColumns": null,
"yColumns": null
},
"removedWidgets": [],
"schema": [
{
"metadata": "{\"__autoGeneratedAlias\": \"true\"}",
"name": "collect_list(id)",
"type": "{\"containsNull\":false,\"elementType\":\"long\",\"type\":\"array\"}"
}
],
"type": "table"
}
},
"output_type": "display_data"
}
],
"source": [
"dummy_df = spark.range(10)\n",
"dummy_df.select(\n",
" F.collect_list(\"id\")\n",
").display()"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "fc87ba89-fbe5-406a-9b98-34b45375e976",
"showTitle": false,
"tableResultSettingsMap": {
"0": {
"dataGridStateBlob": "{\"version\":1,\"tableState\":{\"columnPinning\":{\"left\":[\"#row_number#\"],\"right\":[]},\"columnSizing\":{},\"columnVisibility\":{}},\"settings\":{\"columns\":{}},\"syncTimestamp\":1759609942733}",
"filterBlob": null,
"queryPlanFiltersBlob": null,
"tableResultIndex": 0
}
},
"title": ""
}
},
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"<style scoped>\n",
" .table-result-container {\n",
" max-height: 300px;\n",
" overflow: auto;\n",
" }\n",
" table, th, td {\n",
" border: 1px solid black;\n",
" border-collapse: collapse;\n",
" }\n",
" th, td {\n",
" padding: 5px;\n",
" }\n",
" th {\n",
" text-align: left;\n",
" }\n",
"</style><div class='table-result-container'><table class='table-result'><thead style='background-color: white'><tr><th>sum_id</th></tr></thead><tbody><tr><td>45</td></tr></tbody></table></div>"
]
},
"metadata": {
"application/vnd.databricks.v1+output": {
"addedWidgets": {},
"aggData": [],
"aggError": "",
"aggOverflow": false,
"aggSchema": [],
"aggSeriesLimitReached": false,
"aggType": "",
"arguments": {},
"columnCustomDisplayInfos": {},
"data": [
[
45
]
],
"datasetInfos": [],
"dbfsResultPath": null,
"isJsonSchema": true,
"metadata": {},
"overflow": false,
"plotOptions": {
"customPlotOptions": {},
"displayType": "table",
"pivotAggregation": null,
"pivotColumns": null,
"xColumns": null,
"yColumns": null
},
"removedWidgets": [],
"schema": [
{
"metadata": "{}",
"name": "sum_id",
"type": "\"long\""
}
],
"type": "table"
}
},
"output_type": "display_data"
}
],
"source": [
"def simple_sum(acc, x):\n",
" return acc + x\n",
"\n",
"dummy_df.select(\n",
" F.reduce(\n",
" col=F.collect_list(\"id\"), # array to be reduced\n",
" initialValue=F.lit(0).cast(\"bigint\"), # initial value (accumulator)\n",
" # lambda acc, x: acc + x\n",
" merge=simple_sum # function to merge the accumulator with the next element\n",
" ).alias(\"sum_id\")\n",
").display()"
]
},
{
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "7d8248ae-1949-4c12-9310-1d95c01ce186",
"showTitle": false,
"tableResultSettingsMap": {},
"title": ""
}
},
"source": [
"## **Check suffix with reduce:**"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "d079f1b3-1206-4e2f-98c0-cc9726406ca8",
"showTitle": false,
"tableResultSettingsMap": {
"0": {
"dataGridStateBlob": "{\"version\":1,\"tableState\":{\"columnPinning\":{\"left\":[\"#row_number#\"],\"right\":[]},\"columnSizing\":{},\"columnVisibility\":{}},\"settings\":{\"columns\":{}},\"syncTimestamp\":1759609223978}",
"filterBlob": null,
"queryPlanFiltersBlob": null,
"tableResultIndex": 0
}
},
"title": ""
}
},
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"<style scoped>\n",
" .table-result-container {\n",
" max-height: 300px;\n",
" overflow: auto;\n",
" }\n",
" table, th, td {\n",
" border: 1px solid black;\n",
" border-collapse: collapse;\n",
" }\n",
" th, td {\n",
" padding: 5px;\n",
" }\n",
" th {\n",
" text-align: left;\n",
" }\n",
"</style><div class='table-result-container'><table class='table-result'><thead style='background-color: white'><tr><th>table_name</th><th>is_temp_table</th></tr></thead><tbody><tr><td>table_a</td><td>false</td></tr><tr><td>table_b</td><td>false</td></tr><tr><td>table_c</td><td>false</td></tr><tr><td>table_d</td><td>false</td></tr><tr><td>table_e</td><td>false</td></tr><tr><td>table_f</td><td>false</td></tr><tr><td>table_g</td><td>false</td></tr><tr><td>table_h</td><td>false</td></tr><tr><td>table_i</td><td>false</td></tr><tr><td>table_j</td><td>false</td></tr><tr><td>table_a_v1</td><td>true</td></tr><tr><td>table_a_temp</td><td>true</td></tr><tr><td>table_a_backup</td><td>true</td></tr><tr><td>table_b_v1</td><td>true</td></tr><tr><td>table_b_temp</td><td>true</td></tr><tr><td>table_b_backup</td><td>true</td></tr><tr><td>table_c_v1</td><td>true</td></tr><tr><td>table_c_temp</td><td>true</td></tr><tr><td>table_c_backup</td><td>true</td></tr></tbody></table></div>"
]
},
"metadata": {
"application/vnd.databricks.v1+output": {
"addedWidgets": {},
"aggData": [],
"aggError": "",
"aggOverflow": false,
"aggSchema": [],
"aggSeriesLimitReached": false,
"aggType": "",
"arguments": {},
"columnCustomDisplayInfos": {},
"data": [
[
"table_a",
false
],
[
"table_b",
false
],
[
"table_c",
false
],
[
"table_d",
false
],
[
"table_e",
false
],
[
"table_f",
false
],
[
"table_g",
false
],
[
"table_h",
false
],
[
"table_i",
false
],
[
"table_j",
false
],
[
"table_a_v1",
true
],
[
"table_a_temp",
true
],
[
"table_a_backup",
true
],
[
"table_b_v1",
true
],
[
"table_b_temp",
true
],
[
"table_b_backup",
true
],
[
"table_c_v1",
true
],
[
"table_c_temp",
true
],
[
"table_c_backup",
true
]
],
"datasetInfos": [],
"dbfsResultPath": null,
"isJsonSchema": true,
"metadata": {},
"overflow": false,
"plotOptions": {
"customPlotOptions": {},
"displayType": "table",
"pivotAggregation": null,
"pivotColumns": null,
"xColumns": null,
"yColumns": null
},
"removedWidgets": [],
"schema": [
{
"metadata": "{}",
"name": "table_name",
"type": "\"string\""
},
{
"metadata": "{}",
"name": "is_temp_table",
"type": "\"boolean\""
}
],
"type": "table"
}
},
"output_type": "display_data"
}
],
"source": [
"suffix_array = F.array([F.lit(s) for s in temp_suffixes]) # sintetic array to check temp suffixes\n",
"\n",
"def check_suffix(acc, x):\n",
" return acc | F.col(\"table_name\").endswith(x)\n",
"\n",
"tables_df.withColumn(\n",
" \"is_temp_table\",\n",
" F.reduce(\n",
" col=suffix_array,\n",
" initialValue=F.lit(False),\n",
" merge=check_suffix\n",
" )\n",
").display()\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "9301dc0b-45a8-45e8-b7c4-f2d4fa988ff7",
"showTitle": false,
"tableResultSettingsMap": {},
"title": ""
}
},
"source": [
"## (Easiest) **Check suffix with regex:**"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "c5059eb1-1b8f-406e-8743-064479ba8570",
"showTitle": false,
"tableResultSettingsMap": {
"0": {
"dataGridStateBlob": "{\"version\":1,\"tableState\":{\"columnPinning\":{\"left\":[\"#row_number#\"],\"right\":[]},\"columnSizing\":{},\"columnVisibility\":{}},\"settings\":{\"columns\":{}},\"syncTimestamp\":1759610296168}",
"filterBlob": null,
"queryPlanFiltersBlob": null,
"tableResultIndex": 0
}
},
"title": ""
}
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"output_type": "stream",
"text": [
"regex_pattern_temp_suffixes (_v1|_temp|_backup)\n"
]
},
{
"output_type": "display_data",
"data": {
"text/html": [
"<style scoped>\n",
" .table-result-container {\n",
" max-height: 300px;\n",
" overflow: auto;\n",
" }\n",
" table, th, td {\n",
" border: 1px solid black;\n",
" border-collapse: collapse;\n",
" }\n",
" th, td {\n",
" padding: 5px;\n",
" }\n",
" th {\n",
" text-align: left;\n",
" }\n",
"</style><div class='table-result-container'><table class='table-result'><thead style='background-color: white'><tr><th>table_name</th><th>is_temp_table</th></tr></thead><tbody><tr><td>table_a</td><td>false</td></tr><tr><td>table_b</td><td>false</td></tr><tr><td>table_c</td><td>false</td></tr><tr><td>table_d</td><td>false</td></tr><tr><td>table_e</td><td>false</td></tr><tr><td>table_f</td><td>false</td></tr><tr><td>table_g</td><td>false</td></tr><tr><td>table_h</td><td>false</td></tr><tr><td>table_i</td><td>false</td></tr><tr><td>table_j</td><td>false</td></tr><tr><td>table_a_v1</td><td>true</td></tr><tr><td>table_a_temp</td><td>true</td></tr><tr><td>table_a_backup</td><td>true</td></tr><tr><td>table_b_v1</td><td>true</td></tr><tr><td>table_b_temp</td><td>true</td></tr><tr><td>table_b_backup</td><td>true</td></tr><tr><td>table_c_v1</td><td>true</td></tr><tr><td>table_c_temp</td><td>true</td></tr><tr><td>table_c_backup</td><td>true</td></tr></tbody></table></div>"
]
},
"metadata": {
"application/vnd.databricks.v1+output": {
"addedWidgets": {},
"aggData": [],
"aggError": "",
"aggOverflow": false,
"aggSchema": [],
"aggSeriesLimitReached": false,
"aggType": "",
"arguments": {},
"columnCustomDisplayInfos": {},
"data": [
[
"table_a",
false
],
[
"table_b",
false
],
[
"table_c",
false
],
[
"table_d",
false
],
[
"table_e",
false
],
[
"table_f",
false
],
[
"table_g",
false
],
[
"table_h",
false
],
[
"table_i",
false
],
[
"table_j",
false
],
[
"table_a_v1",
true
],
[
"table_a_temp",
true
],
[
"table_a_backup",
true
],
[
"table_b_v1",
true
],
[
"table_b_temp",
true
],
[
"table_b_backup",
true
],
[
"table_c_v1",
true
],
[
"table_c_temp",
true
],
[
"table_c_backup",
true
]
],
"datasetInfos": [],
"dbfsResultPath": null,
"isJsonSchema": true,
"metadata": {},
"overflow": false,
"plotOptions": {
"customPlotOptions": {},
"displayType": "table",
"pivotAggregation": null,
"pivotColumns": null,
"xColumns": null,
"yColumns": null
},
"removedWidgets": [],
"schema": [
{
"metadata": "{}",
"name": "table_name",
"type": "\"string\""
},
{
"metadata": "{}",
"name": "is_temp_table",
"type": "\"boolean\""
}
],
"type": "table"
}
},
"output_type": "display_data"
}
],
"source": [
"regex_pattern_temp_suffixes = \"(\" + \"|\".join(map(re.escape, temp_suffixes)) + \")\"\n",
"print(\"regex_pattern_temp_suffixes\", regex_pattern_temp_suffixes)\n",
"tables_df.withColumn(\n",
" \"is_temp_table\",\n",
" F.col(\"table_name\").rlike(regex_pattern_temp_suffixes)\n",
").display()"
]
}
],
"metadata": {
"application/vnd.databricks.v1+notebook": {
"computePreferences": null,
"dashboards": [],
"environmentMetadata": {
"base_environment": "",
"environment_version": "3"
},
"inputWidgetPreferences": null,
"language": "python",
"notebookMetadata": {
"pythonIndentUnit": 4
},
"notebookName": "reduce_in_pyspark",
"widgets": {}
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment