Last active
January 19, 2025 15:39
-
-
Save calilisantos/21f7fbac5f71b1c0c946697da4c82271 to your computer and use it in GitHub Desktop.
Masking with pyspark.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "23b5cf4d-a666-4a18-ae50-61fbdd578186", | |
| "showTitle": false, | |
| "tableResultSettingsMap": {}, | |
| "title": "" | |
| } | |
| }, | |
| "source": [ | |
| "# **Dependencies import**" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 0, | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "e43a2f57-3625-4f9b-b26f-c8cf41b1f623", | |
| "showTitle": false, | |
| "tableResultSettingsMap": {}, | |
| "title": "" | |
| } | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "from functools import reduce\n", | |
| "from pyspark.sql import functions as F, SparkSession" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "f259058e-bc5a-47c1-967a-2771324bea2d", | |
| "showTitle": false, | |
| "tableResultSettingsMap": {}, | |
| "title": "" | |
| } | |
| }, | |
| "source": [ | |
| "# **Spark Session Create**" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 0, | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "c6aab036-fd32-4f6e-88f1-8abcb7d4a4e5", | |
| "showTitle": false, | |
| "tableResultSettingsMap": {}, | |
| "title": "" | |
| } | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "spark = (\n", | |
| " SparkSession.builder\n", | |
| " .appName('masking_dataframe')\n", | |
| " .master('local[*]')\n", | |
| " .getOrCreate()\n", | |
| ")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "cc5da744-89ef-484b-a955-779da0fe2f13", | |
| "showTitle": false, | |
| "tableResultSettingsMap": {}, | |
| "title": "" | |
| } | |
| }, | |
| "source": [ | |
| "# **Dataframe create**" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 0, | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "c8c594cf-4645-4546-9a66-5d60f4994a9f", | |
| "showTitle": false, | |
| "tableResultSettingsMap": {}, | |
| "title": "" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "output_type": "display_data", | |
| "data": { | |
| "text/html": [ | |
| "<style scoped>\n", | |
| " .table-result-container {\n", | |
| " max-height: 300px;\n", | |
| " overflow: auto;\n", | |
| " }\n", | |
| " table, th, td {\n", | |
| " border: 1px solid black;\n", | |
| " border-collapse: collapse;\n", | |
| " }\n", | |
| " th, td {\n", | |
| " padding: 5px;\n", | |
| " }\n", | |
| " th {\n", | |
| " text-align: left;\n", | |
| " }\n", | |
| "</style><div class='table-result-container'><table class='table-result'><thead style='background-color: white'><tr><th>id</th><th>name</th><th>email</th><th>expenses</th><th>cpf</th></tr></thead><tbody><tr><td>1</td><td>John Smith</td><td>john-smith@bol.com</td><td>2000</td><td>123.456.789-00</td></tr><tr><td>2</td><td>Jane Smith</td><td>jane-smith@bol.com</td><td>5000</td><td>123.456.789-01</td></tr><tr><td>3</td><td>Ane Doe</td><td>ane-doe@bol.com</td><td>4000</td><td>123.456.789-02</td></tr><tr><td>4</td><td>Charlie Bronson</td><td>charlie-bronson@bol.com</td><td>3000</td><td>123.456.789-03</td></tr><tr><td>5</td><td>Daniel Jones</td><td>daniel-jones@bol.com</td><td>6000</td><td>123.456.789-04</td></tr></tbody></table></div>" | |
| ] | |
| }, | |
| "metadata": { | |
| "application/vnd.databricks.v1+output": { | |
| "addedWidgets": {}, | |
| "aggData": [], | |
| "aggError": "", | |
| "aggOverflow": false, | |
| "aggSchema": [], | |
| "aggSeriesLimitReached": false, | |
| "aggType": "", | |
| "arguments": {}, | |
| "columnCustomDisplayInfos": {}, | |
| "data": [ | |
| [ | |
| 1, | |
| "John Smith", | |
| "john-smith@bol.com", | |
| 2000, | |
| "123.456.789-00" | |
| ], | |
| [ | |
| 2, | |
| "Jane Smith", | |
| "jane-smith@bol.com", | |
| 5000, | |
| "123.456.789-01" | |
| ], | |
| [ | |
| 3, | |
| "Ane Doe", | |
| "ane-doe@bol.com", | |
| 4000, | |
| "123.456.789-02" | |
| ], | |
| [ | |
| 4, | |
| "Charlie Bronson", | |
| "charlie-bronson@bol.com", | |
| 3000, | |
| "123.456.789-03" | |
| ], | |
| [ | |
| 5, | |
| "Daniel Jones", | |
| "daniel-jones@bol.com", | |
| 6000, | |
| "123.456.789-04" | |
| ] | |
| ], | |
| "datasetInfos": [], | |
| "dbfsResultPath": null, | |
| "isJsonSchema": true, | |
| "metadata": {}, | |
| "overflow": false, | |
| "plotOptions": { | |
| "customPlotOptions": {}, | |
| "displayType": "table", | |
| "pivotAggregation": null, | |
| "pivotColumns": null, | |
| "xColumns": null, | |
| "yColumns": null | |
| }, | |
| "removedWidgets": [], | |
| "schema": [ | |
| { | |
| "metadata": "{}", | |
| "name": "id", | |
| "type": "\"long\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "name", | |
| "type": "\"string\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "email", | |
| "type": "\"string\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "expenses", | |
| "type": "\"long\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "cpf", | |
| "type": "\"string\"" | |
| } | |
| ], | |
| "type": "table" | |
| } | |
| }, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "data:list = [\n", | |
| " (1, 'John Smith', 'john-smith@bol.com', 2000, '123.456.789-00'),\n", | |
| " (2, 'Jane Smith', 'jane-smith@bol.com', 5000, '123.456.789-01'),\n", | |
| " (3, 'Ane Doe', 'ane-doe@bol.com', 4000, '123.456.789-02'),\n", | |
| " (4, 'Charlie Bronson', 'charlie-bronson@bol.com', 3000, '123.456.789-03'),\n", | |
| " (5, 'Daniel Jones', 'daniel-jones@bol.com', 6000, '123.456.789-04')\n", | |
| "]\n", | |
| "\n", | |
| "columns:list = ['id', 'name', 'email', 'expenses', 'cpf']\n", | |
| "\n", | |
| "df = spark.createDataFrame(data, columns)\n", | |
| "\n", | |
| "df.show()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "f014d10b-4705-4146-8c7c-d33324987023", | |
| "showTitle": false, | |
| "tableResultSettingsMap": {}, | |
| "title": "" | |
| } | |
| }, | |
| "source": [ | |
| "# **Set LGPD columns**" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 0, | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "5158b869-602b-46aa-a902-6376d8ae3102", | |
| "showTitle": false, | |
| "tableResultSettingsMap": {}, | |
| "title": "" | |
| } | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "lgpd_columns:list = ['name', 'email', 'cpf']" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "ec9e4547-2908-4ff2-8f64-f379e23df0df", | |
| "showTitle": false, | |
| "tableResultSettingsMap": {}, | |
| "title": "" | |
| } | |
| }, | |
| "source": [ | |
| "# **Masking data**\n", | |
| "### Params:\n", | |
| "* **Required:**\n", | |
| " * **col:** The column to be masked\n", | |
| "* **Optional:**\n", | |
| " * **IMPORTANT:** To retain the original characters of a given type, set the respective param to null\n", | |
| " * Ex: ```F.lit(None)```\n", | |
| " * Described below" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 0, | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "8f6cd5b6-05e1-4534-b16b-5bb63490bb5e", | |
| "showTitle": false, | |
| "tableResultSettingsMap": {}, | |
| "title": "" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "output_type": "display_data", | |
| "data": { | |
| "text/html": [ | |
| "<style scoped>\n", | |
| " .table-result-container {\n", | |
| " max-height: 300px;\n", | |
| " overflow: auto;\n", | |
| " }\n", | |
| " table, th, td {\n", | |
| " border: 1px solid black;\n", | |
| " border-collapse: collapse;\n", | |
| " }\n", | |
| " th, td {\n", | |
| " padding: 5px;\n", | |
| " }\n", | |
| " th {\n", | |
| " text-align: left;\n", | |
| " }\n", | |
| "</style><div class='table-result-container'><table class='table-result'><thead style='background-color: white'><tr><th>id</th><th>name</th><th>email</th><th>expenses</th><th>cpf</th></tr></thead><tbody><tr><td>1</td><td>Jlll@Sllll</td><td>llll@lllll@lll@lll</td><td>2000</td><td>NNN@NNN@NNN@NN</td></tr><tr><td>2</td><td>Jlll@Sllll</td><td>llll@lllll@lll@lll</td><td>5000</td><td>NNN@NNN@NNN@NN</td></tr><tr><td>3</td><td>All@Dll</td><td>lll@lll@lll@lll</td><td>4000</td><td>NNN@NNN@NNN@NN</td></tr><tr><td>4</td><td>Cllllll@Bllllll</td><td>lllllll@lllllll@lll@lll</td><td>3000</td><td>NNN@NNN@NNN@NN</td></tr><tr><td>5</td><td>Dlllll@Jllll</td><td>llllll@lllll@lll@lll</td><td>6000</td><td>NNN@NNN@NNN@NN</td></tr></tbody></table></div>" | |
| ] | |
| }, | |
| "metadata": { | |
| "application/vnd.databricks.v1+output": { | |
| "addedWidgets": {}, | |
| "aggData": [], | |
| "aggError": "", | |
| "aggOverflow": false, | |
| "aggSchema": [], | |
| "aggSeriesLimitReached": false, | |
| "aggType": "", | |
| "arguments": {}, | |
| "columnCustomDisplayInfos": {}, | |
| "data": [ | |
| [ | |
| 1, | |
| "Jlll@Sllll", | |
| "llll@lllll@lll@lll", | |
| 2000, | |
| "NNN@NNN@NNN@NN" | |
| ], | |
| [ | |
| 2, | |
| "Jlll@Sllll", | |
| "llll@lllll@lll@lll", | |
| 5000, | |
| "NNN@NNN@NNN@NN" | |
| ], | |
| [ | |
| 3, | |
| "All@Dll", | |
| "lll@lll@lll@lll", | |
| 4000, | |
| "NNN@NNN@NNN@NN" | |
| ], | |
| [ | |
| 4, | |
| "Cllllll@Bllllll", | |
| "lllllll@lllllll@lll@lll", | |
| 3000, | |
| "NNN@NNN@NNN@NN" | |
| ], | |
| [ | |
| 5, | |
| "Dlllll@Jllll", | |
| "llllll@lllll@lll@lll", | |
| 6000, | |
| "NNN@NNN@NNN@NN" | |
| ] | |
| ], | |
| "datasetInfos": [], | |
| "dbfsResultPath": null, | |
| "isJsonSchema": true, | |
| "metadata": {}, | |
| "overflow": false, | |
| "plotOptions": { | |
| "customPlotOptions": {}, | |
| "displayType": "table", | |
| "pivotAggregation": null, | |
| "pivotColumns": null, | |
| "xColumns": null, | |
| "yColumns": null | |
| }, | |
| "removedWidgets": [], | |
| "schema": [ | |
| { | |
| "metadata": "{}", | |
| "name": "id", | |
| "type": "\"long\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "name", | |
| "type": "\"string\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "email", | |
| "type": "\"string\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "expenses", | |
| "type": "\"long\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "cpf", | |
| "type": "\"string\"" | |
| } | |
| ], | |
| "type": "table" | |
| } | |
| }, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "masked_df = df\n", | |
| "\n", | |
| "for column in lgpd_columns:\n", | |
| " temp_df = (\n", | |
| " masked_df\n", | |
| " .withColumn(\n", | |
| " column,\n", | |
| " F.mask(\n", | |
| " col=df[column], # column to mask\n", | |
| " upperChar=F.lit(None), # Optional: character to mask for upper case\n", | |
| " lowerChar=F.lit('l'), # Optional: character to mask for lower case\n", | |
| " digitChar=F.lit('N'), # Optional: character to mask for digits\n", | |
| " otherChar=F.lit('@') # Optional: character to mask other character types\n", | |
| " )\n", | |
| " )\n", | |
| " )\n", | |
| "\n", | |
| " masked_df = temp_df\n", | |
| "\n", | |
| "\n", | |
| "masked_df.show()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "689a2464-283e-443d-9e91-b243d191a5b8", | |
| "showTitle": false, | |
| "tableResultSettingsMap": {}, | |
| "title": "" | |
| } | |
| }, | |
| "source": [ | |
| "# **Masking for pyspark < 3.5.0**" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 0, | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "5eb4ca99-2844-41c3-ba6b-9813c41296b1", | |
| "showTitle": false, | |
| "tableResultSettingsMap": {}, | |
| "title": "" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "output_type": "display_data", | |
| "data": { | |
| "text/html": [ | |
| "<style scoped>\n", | |
| " .table-result-container {\n", | |
| " max-height: 300px;\n", | |
| " overflow: auto;\n", | |
| " }\n", | |
| " table, th, td {\n", | |
| " border: 1px solid black;\n", | |
| " border-collapse: collapse;\n", | |
| " }\n", | |
| " th, td {\n", | |
| " padding: 5px;\n", | |
| " }\n", | |
| " th {\n", | |
| " text-align: left;\n", | |
| " }\n", | |
| "</style><div class='table-result-container'><table class='table-result'><thead style='background-color: white'><tr><th>id</th><th>name</th><th>email</th><th>expenses</th><th>cpf</th></tr></thead><tbody><tr><td>1</td><td>ef61a579c907bbed674c0dbcbcf7f7af8f851538eef7b8e58c5bee0b8cfdac4a</td><td>ff8b0c94f59b0e4c2ab667bef8e0e5144302a0b898cf97175cb51ba4170f5b6e</td><td>2000</td><td>f788964b0ba012d64ae6d4c20c698d86821b8fbe725e1748e0d9644f3ce9a1b7</td></tr><tr><td>2</td><td>a2dd3acadb1c9dcd956216993056a7f50a9db6e3a16c60b35482139b5349c288</td><td>2e6012fda3adc0026efc585367f1ef73e5d4dc6127fc3838a8b2a9609fb81c23</td><td>5000</td><td>58ded4688b789ec2003713822ebd45057fff2f4ddbdc9171c816d47f9279f002</td></tr><tr><td>3</td><td>6738614c92ad0d8d77716f60a5ac81b223a5601c7aac080ba236eb7cb764eccf</td><td>b4a7ceaf09a6ce9d74629f154721bf8990288d6c1a4469ca1d783ac2ef5f251f</td><td>4000</td><td>a53a328f4e60f72e1cb3f9efa5aa27685769462fb4b3a4ec10ff3a6e7739cb49</td></tr><tr><td>4</td><td>fd843a6df3d1303931033eb8966cec1d83e563909fcb73bbf2aaf52590cbe74c</td><td>29c5f9d773df250c8bb4a932f9e35f52c5679528251339aed6181dcb30d4ca85</td><td>3000</td><td>43550d992a240ca591ffeb0b2d0b30ee4478bbc1eb87208b1fb21985cfdfc52e</td></tr><tr><td>5</td><td>453c2eb103afbe37b7f031ffb826c0fed4fabb1e97421f2478ab2623eaf2caa8</td><td>836482b97153e7525e2c3fc0bc77def36e8c9f187d16e83d381a8936b06d0d1b</td><td>6000</td><td>feb9aa3a5b4d7f9ce8aec441ff441171f360ceab296e3cbd08d12255242b516b</td></tr></tbody></table></div>" | |
| ] | |
| }, | |
| "metadata": { | |
| "application/vnd.databricks.v1+output": { | |
| "addedWidgets": {}, | |
| "aggData": [], | |
| "aggError": "", | |
| "aggOverflow": false, | |
| "aggSchema": [], | |
| "aggSeriesLimitReached": false, | |
| "aggType": "", | |
| "arguments": {}, | |
| "columnCustomDisplayInfos": {}, | |
| "data": [ | |
| [ | |
| 1, | |
| "ef61a579c907bbed674c0dbcbcf7f7af8f851538eef7b8e58c5bee0b8cfdac4a", | |
| "ff8b0c94f59b0e4c2ab667bef8e0e5144302a0b898cf97175cb51ba4170f5b6e", | |
| 2000, | |
| "f788964b0ba012d64ae6d4c20c698d86821b8fbe725e1748e0d9644f3ce9a1b7" | |
| ], | |
| [ | |
| 2, | |
| "a2dd3acadb1c9dcd956216993056a7f50a9db6e3a16c60b35482139b5349c288", | |
| "2e6012fda3adc0026efc585367f1ef73e5d4dc6127fc3838a8b2a9609fb81c23", | |
| 5000, | |
| "58ded4688b789ec2003713822ebd45057fff2f4ddbdc9171c816d47f9279f002" | |
| ], | |
| [ | |
| 3, | |
| "6738614c92ad0d8d77716f60a5ac81b223a5601c7aac080ba236eb7cb764eccf", | |
| "b4a7ceaf09a6ce9d74629f154721bf8990288d6c1a4469ca1d783ac2ef5f251f", | |
| 4000, | |
| "a53a328f4e60f72e1cb3f9efa5aa27685769462fb4b3a4ec10ff3a6e7739cb49" | |
| ], | |
| [ | |
| 4, | |
| "fd843a6df3d1303931033eb8966cec1d83e563909fcb73bbf2aaf52590cbe74c", | |
| "29c5f9d773df250c8bb4a932f9e35f52c5679528251339aed6181dcb30d4ca85", | |
| 3000, | |
| "43550d992a240ca591ffeb0b2d0b30ee4478bbc1eb87208b1fb21985cfdfc52e" | |
| ], | |
| [ | |
| 5, | |
| "453c2eb103afbe37b7f031ffb826c0fed4fabb1e97421f2478ab2623eaf2caa8", | |
| "836482b97153e7525e2c3fc0bc77def36e8c9f187d16e83d381a8936b06d0d1b", | |
| 6000, | |
| "feb9aa3a5b4d7f9ce8aec441ff441171f360ceab296e3cbd08d12255242b516b" | |
| ] | |
| ], | |
| "datasetInfos": [], | |
| "dbfsResultPath": null, | |
| "isJsonSchema": true, | |
| "metadata": {}, | |
| "overflow": false, | |
| "plotOptions": { | |
| "customPlotOptions": {}, | |
| "displayType": "table", | |
| "pivotAggregation": null, | |
| "pivotColumns": null, | |
| "xColumns": null, | |
| "yColumns": null | |
| }, | |
| "removedWidgets": [], | |
| "schema": [ | |
| { | |
| "metadata": "{}", | |
| "name": "id", | |
| "type": "\"long\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "name", | |
| "type": "\"string\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "email", | |
| "type": "\"string\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "expenses", | |
| "type": "\"long\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "cpf", | |
| "type": "\"string\"" | |
| } | |
| ], | |
| "type": "table" | |
| } | |
| }, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "bits_number:int = 256\n", | |
| "hashed_df = df\n", | |
| "\n", | |
| "for column in lgpd_columns:\n", | |
| " temp_df = (\n", | |
| " hashed_df\n", | |
| " .withColumn(\n", | |
| " column,\n", | |
| " F.sha2(\n", | |
| " df[column], \n", | |
| " bits_number\n", | |
| " )\n", | |
| " )\n", | |
| " )\n", | |
| "\n", | |
| " hashed_df = temp_df\n", | |
| "\n", | |
| "hashed_df.show()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "ec1a07b3-6beb-4e9e-864f-815ea8d0c9ea", | |
| "showTitle": false, | |
| "tableResultSettingsMap": {}, | |
| "title": "" | |
| } | |
| }, | |
| "source": [ | |
| "# **Tuning Masking with reduce**" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 0, | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "14ac2aab-ac31-407f-b19c-4f9b821b1568", | |
| "showTitle": false, | |
| "tableResultSettingsMap": {}, | |
| "title": "" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "output_type": "display_data", | |
| "data": { | |
| "text/html": [ | |
| "<style scoped>\n", | |
| " .table-result-container {\n", | |
| " max-height: 300px;\n", | |
| " overflow: auto;\n", | |
| " }\n", | |
| " table, th, td {\n", | |
| " border: 1px solid black;\n", | |
| " border-collapse: collapse;\n", | |
| " }\n", | |
| " th, td {\n", | |
| " padding: 5px;\n", | |
| " }\n", | |
| " th {\n", | |
| " text-align: left;\n", | |
| " }\n", | |
| "</style><div class='table-result-container'><table class='table-result'><thead style='background-color: white'><tr><th>id</th><th>name</th><th>email</th><th>expenses</th><th>cpf</th></tr></thead><tbody><tr><td>1</td><td>ef61a579c907bbed674c0dbcbcf7f7af8f851538eef7b8e58c5bee0b8cfdac4a</td><td>ff8b0c94f59b0e4c2ab667bef8e0e5144302a0b898cf97175cb51ba4170f5b6e</td><td>2000</td><td>f788964b0ba012d64ae6d4c20c698d86821b8fbe725e1748e0d9644f3ce9a1b7</td></tr><tr><td>2</td><td>a2dd3acadb1c9dcd956216993056a7f50a9db6e3a16c60b35482139b5349c288</td><td>2e6012fda3adc0026efc585367f1ef73e5d4dc6127fc3838a8b2a9609fb81c23</td><td>5000</td><td>58ded4688b789ec2003713822ebd45057fff2f4ddbdc9171c816d47f9279f002</td></tr><tr><td>3</td><td>6738614c92ad0d8d77716f60a5ac81b223a5601c7aac080ba236eb7cb764eccf</td><td>b4a7ceaf09a6ce9d74629f154721bf8990288d6c1a4469ca1d783ac2ef5f251f</td><td>4000</td><td>a53a328f4e60f72e1cb3f9efa5aa27685769462fb4b3a4ec10ff3a6e7739cb49</td></tr><tr><td>4</td><td>fd843a6df3d1303931033eb8966cec1d83e563909fcb73bbf2aaf52590cbe74c</td><td>29c5f9d773df250c8bb4a932f9e35f52c5679528251339aed6181dcb30d4ca85</td><td>3000</td><td>43550d992a240ca591ffeb0b2d0b30ee4478bbc1eb87208b1fb21985cfdfc52e</td></tr><tr><td>5</td><td>453c2eb103afbe37b7f031ffb826c0fed4fabb1e97421f2478ab2623eaf2caa8</td><td>836482b97153e7525e2c3fc0bc77def36e8c9f187d16e83d381a8936b06d0d1b</td><td>6000</td><td>feb9aa3a5b4d7f9ce8aec441ff441171f360ceab296e3cbd08d12255242b516b</td></tr></tbody></table></div>" | |
| ] | |
| }, | |
| "metadata": { | |
| "application/vnd.databricks.v1+output": { | |
| "addedWidgets": {}, | |
| "aggData": [], | |
| "aggError": "", | |
| "aggOverflow": false, | |
| "aggSchema": [], | |
| "aggSeriesLimitReached": false, | |
| "aggType": "", | |
| "arguments": {}, | |
| "columnCustomDisplayInfos": {}, | |
| "data": [ | |
| [ | |
| 1, | |
| "ef61a579c907bbed674c0dbcbcf7f7af8f851538eef7b8e58c5bee0b8cfdac4a", | |
| "ff8b0c94f59b0e4c2ab667bef8e0e5144302a0b898cf97175cb51ba4170f5b6e", | |
| 2000, | |
| "f788964b0ba012d64ae6d4c20c698d86821b8fbe725e1748e0d9644f3ce9a1b7" | |
| ], | |
| [ | |
| 2, | |
| "a2dd3acadb1c9dcd956216993056a7f50a9db6e3a16c60b35482139b5349c288", | |
| "2e6012fda3adc0026efc585367f1ef73e5d4dc6127fc3838a8b2a9609fb81c23", | |
| 5000, | |
| "58ded4688b789ec2003713822ebd45057fff2f4ddbdc9171c816d47f9279f002" | |
| ], | |
| [ | |
| 3, | |
| "6738614c92ad0d8d77716f60a5ac81b223a5601c7aac080ba236eb7cb764eccf", | |
| "b4a7ceaf09a6ce9d74629f154721bf8990288d6c1a4469ca1d783ac2ef5f251f", | |
| 4000, | |
| "a53a328f4e60f72e1cb3f9efa5aa27685769462fb4b3a4ec10ff3a6e7739cb49" | |
| ], | |
| [ | |
| 4, | |
| "fd843a6df3d1303931033eb8966cec1d83e563909fcb73bbf2aaf52590cbe74c", | |
| "29c5f9d773df250c8bb4a932f9e35f52c5679528251339aed6181dcb30d4ca85", | |
| 3000, | |
| "43550d992a240ca591ffeb0b2d0b30ee4478bbc1eb87208b1fb21985cfdfc52e" | |
| ], | |
| [ | |
| 5, | |
| "453c2eb103afbe37b7f031ffb826c0fed4fabb1e97421f2478ab2623eaf2caa8", | |
| "836482b97153e7525e2c3fc0bc77def36e8c9f187d16e83d381a8936b06d0d1b", | |
| 6000, | |
| "feb9aa3a5b4d7f9ce8aec441ff441171f360ceab296e3cbd08d12255242b516b" | |
| ] | |
| ], | |
| "datasetInfos": [], | |
| "dbfsResultPath": null, | |
| "isJsonSchema": true, | |
| "metadata": {}, | |
| "overflow": false, | |
| "plotOptions": { | |
| "customPlotOptions": {}, | |
| "displayType": "table", | |
| "pivotAggregation": null, | |
| "pivotColumns": null, | |
| "xColumns": null, | |
| "yColumns": null | |
| }, | |
| "removedWidgets": [], | |
| "schema": [ | |
| { | |
| "metadata": "{}", | |
| "name": "id", | |
| "type": "\"long\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "name", | |
| "type": "\"string\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "email", | |
| "type": "\"string\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "expenses", | |
| "type": "\"long\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "cpf", | |
| "type": "\"string\"" | |
| } | |
| ], | |
| "type": "table" | |
| } | |
| }, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "tuned_hashed_df = reduce(\n", | |
| " lambda df, col: df.withColumn(col, F.sha2(df[col], bits_number)),\n", | |
| " lgpd_columns,\n", | |
| " df\n", | |
| ")\n", | |
| "\n", | |
| "tuned_hashed_df.show()" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "application/vnd.databricks.v1+notebook": { | |
| "dashboards": [], | |
| "environmentMetadata": null, | |
| "language": "python", | |
| "notebookMetadata": { | |
| "pythonIndentUnit": 2 | |
| }, | |
| "notebookName": "masking_in_pyspark", | |
| "widgets": {} | |
| }, | |
| "kernelspec": { | |
| "display_name": "Python 3.10.12 ('test-in-pyspark': venv)", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.10.12" | |
| }, | |
| "orig_nbformat": 4, | |
| "vscode": { | |
| "interpreter": { | |
| "hash": "1e1ba4f9e318103b43fbdacf54c3a9a049c33c4c6eeff3e0fb7c252c27a47729" | |
| } | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 0 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment