Skip to content

Instantly share code, notes, and snippets.

@calilisantos
Last active January 19, 2025 15:39
Show Gist options
  • Select an option

  • Save calilisantos/21f7fbac5f71b1c0c946697da4c82271 to your computer and use it in GitHub Desktop.

Select an option

Save calilisantos/21f7fbac5f71b1c0c946697da4c82271 to your computer and use it in GitHub Desktop.
Masking with pyspark.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "23b5cf4d-a666-4a18-ae50-61fbdd578186",
"showTitle": false,
"tableResultSettingsMap": {},
"title": ""
}
},
"source": [
"# **Dependencies import**"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "e43a2f57-3625-4f9b-b26f-c8cf41b1f623",
"showTitle": false,
"tableResultSettingsMap": {},
"title": ""
}
},
"outputs": [],
"source": [
"from functools import reduce\n",
"from pyspark.sql import functions as F, SparkSession"
]
},
{
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "f259058e-bc5a-47c1-967a-2771324bea2d",
"showTitle": false,
"tableResultSettingsMap": {},
"title": ""
}
},
"source": [
"# **Create Spark session**"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "c6aab036-fd32-4f6e-88f1-8abcb7d4a4e5",
"showTitle": false,
"tableResultSettingsMap": {},
"title": ""
}
},
"outputs": [],
"source": [
"# Get (or reuse) a Spark session named 'masking_dataframe' with master 'local[*]'.\n",
"spark = SparkSession.builder.appName('masking_dataframe').master('local[*]').getOrCreate()"
]
},
{
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "cc5da744-89ef-484b-a955-779da0fe2f13",
"showTitle": false,
"tableResultSettingsMap": {},
"title": ""
}
},
"source": [
"# **Create DataFrame**"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "c8c594cf-4645-4546-9a66-5d60f4994a9f",
"showTitle": false,
"tableResultSettingsMap": {},
"title": ""
}
},
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"<style scoped>\n",
" .table-result-container {\n",
" max-height: 300px;\n",
" overflow: auto;\n",
" }\n",
" table, th, td {\n",
" border: 1px solid black;\n",
" border-collapse: collapse;\n",
" }\n",
" th, td {\n",
" padding: 5px;\n",
" }\n",
" th {\n",
" text-align: left;\n",
" }\n",
"</style><div class='table-result-container'><table class='table-result'><thead style='background-color: white'><tr><th>id</th><th>name</th><th>email</th><th>expenses</th><th>cpf</th></tr></thead><tbody><tr><td>1</td><td>John Smith</td><td>john-smith@bol.com</td><td>2000</td><td>123.456.789-00</td></tr><tr><td>2</td><td>Jane Smith</td><td>jane-smith@bol.com</td><td>5000</td><td>123.456.789-01</td></tr><tr><td>3</td><td>Ane Doe</td><td>ane-doe@bol.com</td><td>4000</td><td>123.456.789-02</td></tr><tr><td>4</td><td>Charlie Bronson</td><td>charlie-bronson@bol.com</td><td>3000</td><td>123.456.789-03</td></tr><tr><td>5</td><td>Daniel Jones</td><td>daniel-jones@bol.com</td><td>6000</td><td>123.456.789-04</td></tr></tbody></table></div>"
]
},
"metadata": {
"application/vnd.databricks.v1+output": {
"addedWidgets": {},
"aggData": [],
"aggError": "",
"aggOverflow": false,
"aggSchema": [],
"aggSeriesLimitReached": false,
"aggType": "",
"arguments": {},
"columnCustomDisplayInfos": {},
"data": [
[
1,
"John Smith",
"john-smith@bol.com",
2000,
"123.456.789-00"
],
[
2,
"Jane Smith",
"jane-smith@bol.com",
5000,
"123.456.789-01"
],
[
3,
"Ane Doe",
"ane-doe@bol.com",
4000,
"123.456.789-02"
],
[
4,
"Charlie Bronson",
"charlie-bronson@bol.com",
3000,
"123.456.789-03"
],
[
5,
"Daniel Jones",
"daniel-jones@bol.com",
6000,
"123.456.789-04"
]
],
"datasetInfos": [],
"dbfsResultPath": null,
"isJsonSchema": true,
"metadata": {},
"overflow": false,
"plotOptions": {
"customPlotOptions": {},
"displayType": "table",
"pivotAggregation": null,
"pivotColumns": null,
"xColumns": null,
"yColumns": null
},
"removedWidgets": [],
"schema": [
{
"metadata": "{}",
"name": "id",
"type": "\"long\""
},
{
"metadata": "{}",
"name": "name",
"type": "\"string\""
},
{
"metadata": "{}",
"name": "email",
"type": "\"string\""
},
{
"metadata": "{}",
"name": "expenses",
"type": "\"long\""
},
{
"metadata": "{}",
"name": "cpf",
"type": "\"string\""
}
],
"type": "table"
}
},
"output_type": "display_data"
}
],
"source": [
"# Sample rows, one tuple per person: (id, name, email, expenses, cpf).\n",
"data: list = [\n",
"  (1, 'John Smith', 'john-smith@bol.com', 2000, '123.456.789-00'),\n",
"  (2, 'Jane Smith', 'jane-smith@bol.com', 5000, '123.456.789-01'),\n",
"  (3, 'Ane Doe', 'ane-doe@bol.com', 4000, '123.456.789-02'),\n",
"  (4, 'Charlie Bronson', 'charlie-bronson@bol.com', 3000, '123.456.789-03'),\n",
"  (5, 'Daniel Jones', 'daniel-jones@bol.com', 6000, '123.456.789-04'),\n",
"]\n",
"\n",
"# Column names, in the same order as the tuple fields above.\n",
"columns: list = ['id', 'name', 'email', 'expenses', 'cpf']\n",
"\n",
"df = spark.createDataFrame(data=data, schema=columns)\n",
"\n",
"df.show()"
]
},
{
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "f014d10b-4705-4146-8c7c-d33324987023",
"showTitle": false,
"tableResultSettingsMap": {},
"title": ""
}
},
"source": [
"# **Set LGPD columns**"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "5158b869-602b-46aa-a902-6376d8ae3102",
"showTitle": false,
"tableResultSettingsMap": {},
"title": ""
}
},
"outputs": [],
"source": [
"# Columns that the masking and hashing cells below overwrite.\n",
"lgpd_columns: list = ['name', 'email', 'cpf']"
]
},
{
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "ec9e4547-2908-4ff2-8f64-f379e23df0df",
"showTitle": false,
"tableResultSettingsMap": {},
"title": ""
}
},
"source": [
"# **Masking data**\n",
"### Params:\n",
"* **Required:**\n",
" * **col:** The column to be masked\n",
"* **Optional:**\n",
" * **IMPORTANT:** To retain the original characters of a given type, pass null for the respective param\n",
" * Ex: ```F.lit(None)```\n",
" * Described below"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "8f6cd5b6-05e1-4534-b16b-5bb63490bb5e",
"showTitle": false,
"tableResultSettingsMap": {},
"title": ""
}
},
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"<style scoped>\n",
" .table-result-container {\n",
" max-height: 300px;\n",
" overflow: auto;\n",
" }\n",
" table, th, td {\n",
" border: 1px solid black;\n",
" border-collapse: collapse;\n",
" }\n",
" th, td {\n",
" padding: 5px;\n",
" }\n",
" th {\n",
" text-align: left;\n",
" }\n",
"</style><div class='table-result-container'><table class='table-result'><thead style='background-color: white'><tr><th>id</th><th>name</th><th>email</th><th>expenses</th><th>cpf</th></tr></thead><tbody><tr><td>1</td><td>Jlll@Sllll</td><td>llll@lllll@lll@lll</td><td>2000</td><td>NNN@NNN@NNN@NN</td></tr><tr><td>2</td><td>Jlll@Sllll</td><td>llll@lllll@lll@lll</td><td>5000</td><td>NNN@NNN@NNN@NN</td></tr><tr><td>3</td><td>All@Dll</td><td>lll@lll@lll@lll</td><td>4000</td><td>NNN@NNN@NNN@NN</td></tr><tr><td>4</td><td>Cllllll@Bllllll</td><td>lllllll@lllllll@lll@lll</td><td>3000</td><td>NNN@NNN@NNN@NN</td></tr><tr><td>5</td><td>Dlllll@Jllll</td><td>llllll@lllll@lll@lll</td><td>6000</td><td>NNN@NNN@NNN@NN</td></tr></tbody></table></div>"
]
},
"metadata": {
"application/vnd.databricks.v1+output": {
"addedWidgets": {},
"aggData": [],
"aggError": "",
"aggOverflow": false,
"aggSchema": [],
"aggSeriesLimitReached": false,
"aggType": "",
"arguments": {},
"columnCustomDisplayInfos": {},
"data": [
[
1,
"Jlll@Sllll",
"llll@lllll@lll@lll",
2000,
"NNN@NNN@NNN@NN"
],
[
2,
"Jlll@Sllll",
"llll@lllll@lll@lll",
5000,
"NNN@NNN@NNN@NN"
],
[
3,
"All@Dll",
"lll@lll@lll@lll",
4000,
"NNN@NNN@NNN@NN"
],
[
4,
"Cllllll@Bllllll",
"lllllll@lllllll@lll@lll",
3000,
"NNN@NNN@NNN@NN"
],
[
5,
"Dlllll@Jllll",
"llllll@lllll@lll@lll",
6000,
"NNN@NNN@NNN@NN"
]
],
"datasetInfos": [],
"dbfsResultPath": null,
"isJsonSchema": true,
"metadata": {},
"overflow": false,
"plotOptions": {
"customPlotOptions": {},
"displayType": "table",
"pivotAggregation": null,
"pivotColumns": null,
"xColumns": null,
"yColumns": null
},
"removedWidgets": [],
"schema": [
{
"metadata": "{}",
"name": "id",
"type": "\"long\""
},
{
"metadata": "{}",
"name": "name",
"type": "\"string\""
},
{
"metadata": "{}",
"name": "email",
"type": "\"string\""
},
{
"metadata": "{}",
"name": "expenses",
"type": "\"long\""
},
{
"metadata": "{}",
"name": "cpf",
"type": "\"string\""
}
],
"type": "table"
}
},
"output_type": "display_data"
}
],
"source": [
"# Replacement characters handed to F.mask; a null literal leaves that\n",
"# character type unchanged.\n",
"mask_options = {\n",
"  'upperChar': F.lit(None),  # upper-case letters: keep as-is\n",
"  'lowerChar': F.lit('l'),   # lower-case letters -> 'l'\n",
"  'digitChar': F.lit('N'),   # digits -> 'N'\n",
"  'otherChar': F.lit('@'),   # every other character -> '@'\n",
"}\n",
"\n",
"masked_df = df\n",
"\n",
"# Overwrite each listed column, one at a time, with its masked version.\n",
"for column in lgpd_columns:\n",
"  masked_df = masked_df.withColumn(column, F.mask(col=df[column], **mask_options))\n",
"\n",
"masked_df.show()"
]
},
{
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "689a2464-283e-443d-9e91-b243d191a5b8",
"showTitle": false,
"tableResultSettingsMap": {},
"title": ""
}
},
"source": [
"# **Masking for pyspark < 3.5.0**"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "5eb4ca99-2844-41c3-ba6b-9813c41296b1",
"showTitle": false,
"tableResultSettingsMap": {},
"title": ""
}
},
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"<style scoped>\n",
" .table-result-container {\n",
" max-height: 300px;\n",
" overflow: auto;\n",
" }\n",
" table, th, td {\n",
" border: 1px solid black;\n",
" border-collapse: collapse;\n",
" }\n",
" th, td {\n",
" padding: 5px;\n",
" }\n",
" th {\n",
" text-align: left;\n",
" }\n",
"</style><div class='table-result-container'><table class='table-result'><thead style='background-color: white'><tr><th>id</th><th>name</th><th>email</th><th>expenses</th><th>cpf</th></tr></thead><tbody><tr><td>1</td><td>ef61a579c907bbed674c0dbcbcf7f7af8f851538eef7b8e58c5bee0b8cfdac4a</td><td>ff8b0c94f59b0e4c2ab667bef8e0e5144302a0b898cf97175cb51ba4170f5b6e</td><td>2000</td><td>f788964b0ba012d64ae6d4c20c698d86821b8fbe725e1748e0d9644f3ce9a1b7</td></tr><tr><td>2</td><td>a2dd3acadb1c9dcd956216993056a7f50a9db6e3a16c60b35482139b5349c288</td><td>2e6012fda3adc0026efc585367f1ef73e5d4dc6127fc3838a8b2a9609fb81c23</td><td>5000</td><td>58ded4688b789ec2003713822ebd45057fff2f4ddbdc9171c816d47f9279f002</td></tr><tr><td>3</td><td>6738614c92ad0d8d77716f60a5ac81b223a5601c7aac080ba236eb7cb764eccf</td><td>b4a7ceaf09a6ce9d74629f154721bf8990288d6c1a4469ca1d783ac2ef5f251f</td><td>4000</td><td>a53a328f4e60f72e1cb3f9efa5aa27685769462fb4b3a4ec10ff3a6e7739cb49</td></tr><tr><td>4</td><td>fd843a6df3d1303931033eb8966cec1d83e563909fcb73bbf2aaf52590cbe74c</td><td>29c5f9d773df250c8bb4a932f9e35f52c5679528251339aed6181dcb30d4ca85</td><td>3000</td><td>43550d992a240ca591ffeb0b2d0b30ee4478bbc1eb87208b1fb21985cfdfc52e</td></tr><tr><td>5</td><td>453c2eb103afbe37b7f031ffb826c0fed4fabb1e97421f2478ab2623eaf2caa8</td><td>836482b97153e7525e2c3fc0bc77def36e8c9f187d16e83d381a8936b06d0d1b</td><td>6000</td><td>feb9aa3a5b4d7f9ce8aec441ff441171f360ceab296e3cbd08d12255242b516b</td></tr></tbody></table></div>"
]
},
"metadata": {
"application/vnd.databricks.v1+output": {
"addedWidgets": {},
"aggData": [],
"aggError": "",
"aggOverflow": false,
"aggSchema": [],
"aggSeriesLimitReached": false,
"aggType": "",
"arguments": {},
"columnCustomDisplayInfos": {},
"data": [
[
1,
"ef61a579c907bbed674c0dbcbcf7f7af8f851538eef7b8e58c5bee0b8cfdac4a",
"ff8b0c94f59b0e4c2ab667bef8e0e5144302a0b898cf97175cb51ba4170f5b6e",
2000,
"f788964b0ba012d64ae6d4c20c698d86821b8fbe725e1748e0d9644f3ce9a1b7"
],
[
2,
"a2dd3acadb1c9dcd956216993056a7f50a9db6e3a16c60b35482139b5349c288",
"2e6012fda3adc0026efc585367f1ef73e5d4dc6127fc3838a8b2a9609fb81c23",
5000,
"58ded4688b789ec2003713822ebd45057fff2f4ddbdc9171c816d47f9279f002"
],
[
3,
"6738614c92ad0d8d77716f60a5ac81b223a5601c7aac080ba236eb7cb764eccf",
"b4a7ceaf09a6ce9d74629f154721bf8990288d6c1a4469ca1d783ac2ef5f251f",
4000,
"a53a328f4e60f72e1cb3f9efa5aa27685769462fb4b3a4ec10ff3a6e7739cb49"
],
[
4,
"fd843a6df3d1303931033eb8966cec1d83e563909fcb73bbf2aaf52590cbe74c",
"29c5f9d773df250c8bb4a932f9e35f52c5679528251339aed6181dcb30d4ca85",
3000,
"43550d992a240ca591ffeb0b2d0b30ee4478bbc1eb87208b1fb21985cfdfc52e"
],
[
5,
"453c2eb103afbe37b7f031ffb826c0fed4fabb1e97421f2478ab2623eaf2caa8",
"836482b97153e7525e2c3fc0bc77def36e8c9f187d16e83d381a8936b06d0d1b",
6000,
"feb9aa3a5b4d7f9ce8aec441ff441171f360ceab296e3cbd08d12255242b516b"
]
],
"datasetInfos": [],
"dbfsResultPath": null,
"isJsonSchema": true,
"metadata": {},
"overflow": false,
"plotOptions": {
"customPlotOptions": {},
"displayType": "table",
"pivotAggregation": null,
"pivotColumns": null,
"xColumns": null,
"yColumns": null
},
"removedWidgets": [],
"schema": [
{
"metadata": "{}",
"name": "id",
"type": "\"long\""
},
{
"metadata": "{}",
"name": "name",
"type": "\"string\""
},
{
"metadata": "{}",
"name": "email",
"type": "\"string\""
},
{
"metadata": "{}",
"name": "expenses",
"type": "\"long\""
},
{
"metadata": "{}",
"name": "cpf",
"type": "\"string\""
}
],
"type": "table"
}
},
"output_type": "display_data"
}
],
"source": [
"# Digest size, in bits, passed to F.sha2 (256 -> SHA-256).\n",
"bits_number: int = 256\n",
"\n",
"# Alternative for pyspark < 3.5.0: overwrite every listed column with the\n",
"# SHA-2 hex digest of its value. Each pass rebinds hashed_df, so the final\n",
"# frame has all listed columns hashed while df itself is left untouched.\n",
"# NOTE(review): unsalted hashes of low-entropy values (e.g. CPF numbers) can\n",
"# be brute-forced; confirm this is acceptable for the intended use.\n",
"hashed_df = df\n",
"\n",
"for column in lgpd_columns:\n",
"  hashed_df = hashed_df.withColumn(column, F.sha2(df[column], bits_number))\n",
"\n",
"# Fixed: the original called the non-existent DataFrame.showshow().\n",
"hashed_df.show()"
]
},
{
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "ec1a07b3-6beb-4e9e-864f-815ea8d0c9ea",
"showTitle": false,
"tableResultSettingsMap": {},
"title": ""
}
},
"source": [
"# **Tuning Masking with reduce**"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "14ac2aab-ac31-407f-b19c-4f9b821b1568",
"showTitle": false,
"tableResultSettingsMap": {},
"title": ""
}
},
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"<style scoped>\n",
" .table-result-container {\n",
" max-height: 300px;\n",
" overflow: auto;\n",
" }\n",
" table, th, td {\n",
" border: 1px solid black;\n",
" border-collapse: collapse;\n",
" }\n",
" th, td {\n",
" padding: 5px;\n",
" }\n",
" th {\n",
" text-align: left;\n",
" }\n",
"</style><div class='table-result-container'><table class='table-result'><thead style='background-color: white'><tr><th>id</th><th>name</th><th>email</th><th>expenses</th><th>cpf</th></tr></thead><tbody><tr><td>1</td><td>ef61a579c907bbed674c0dbcbcf7f7af8f851538eef7b8e58c5bee0b8cfdac4a</td><td>ff8b0c94f59b0e4c2ab667bef8e0e5144302a0b898cf97175cb51ba4170f5b6e</td><td>2000</td><td>f788964b0ba012d64ae6d4c20c698d86821b8fbe725e1748e0d9644f3ce9a1b7</td></tr><tr><td>2</td><td>a2dd3acadb1c9dcd956216993056a7f50a9db6e3a16c60b35482139b5349c288</td><td>2e6012fda3adc0026efc585367f1ef73e5d4dc6127fc3838a8b2a9609fb81c23</td><td>5000</td><td>58ded4688b789ec2003713822ebd45057fff2f4ddbdc9171c816d47f9279f002</td></tr><tr><td>3</td><td>6738614c92ad0d8d77716f60a5ac81b223a5601c7aac080ba236eb7cb764eccf</td><td>b4a7ceaf09a6ce9d74629f154721bf8990288d6c1a4469ca1d783ac2ef5f251f</td><td>4000</td><td>a53a328f4e60f72e1cb3f9efa5aa27685769462fb4b3a4ec10ff3a6e7739cb49</td></tr><tr><td>4</td><td>fd843a6df3d1303931033eb8966cec1d83e563909fcb73bbf2aaf52590cbe74c</td><td>29c5f9d773df250c8bb4a932f9e35f52c5679528251339aed6181dcb30d4ca85</td><td>3000</td><td>43550d992a240ca591ffeb0b2d0b30ee4478bbc1eb87208b1fb21985cfdfc52e</td></tr><tr><td>5</td><td>453c2eb103afbe37b7f031ffb826c0fed4fabb1e97421f2478ab2623eaf2caa8</td><td>836482b97153e7525e2c3fc0bc77def36e8c9f187d16e83d381a8936b06d0d1b</td><td>6000</td><td>feb9aa3a5b4d7f9ce8aec441ff441171f360ceab296e3cbd08d12255242b516b</td></tr></tbody></table></div>"
]
},
"metadata": {
"application/vnd.databricks.v1+output": {
"addedWidgets": {},
"aggData": [],
"aggError": "",
"aggOverflow": false,
"aggSchema": [],
"aggSeriesLimitReached": false,
"aggType": "",
"arguments": {},
"columnCustomDisplayInfos": {},
"data": [
[
1,
"ef61a579c907bbed674c0dbcbcf7f7af8f851538eef7b8e58c5bee0b8cfdac4a",
"ff8b0c94f59b0e4c2ab667bef8e0e5144302a0b898cf97175cb51ba4170f5b6e",
2000,
"f788964b0ba012d64ae6d4c20c698d86821b8fbe725e1748e0d9644f3ce9a1b7"
],
[
2,
"a2dd3acadb1c9dcd956216993056a7f50a9db6e3a16c60b35482139b5349c288",
"2e6012fda3adc0026efc585367f1ef73e5d4dc6127fc3838a8b2a9609fb81c23",
5000,
"58ded4688b789ec2003713822ebd45057fff2f4ddbdc9171c816d47f9279f002"
],
[
3,
"6738614c92ad0d8d77716f60a5ac81b223a5601c7aac080ba236eb7cb764eccf",
"b4a7ceaf09a6ce9d74629f154721bf8990288d6c1a4469ca1d783ac2ef5f251f",
4000,
"a53a328f4e60f72e1cb3f9efa5aa27685769462fb4b3a4ec10ff3a6e7739cb49"
],
[
4,
"fd843a6df3d1303931033eb8966cec1d83e563909fcb73bbf2aaf52590cbe74c",
"29c5f9d773df250c8bb4a932f9e35f52c5679528251339aed6181dcb30d4ca85",
3000,
"43550d992a240ca591ffeb0b2d0b30ee4478bbc1eb87208b1fb21985cfdfc52e"
],
[
5,
"453c2eb103afbe37b7f031ffb826c0fed4fabb1e97421f2478ab2623eaf2caa8",
"836482b97153e7525e2c3fc0bc77def36e8c9f187d16e83d381a8936b06d0d1b",
6000,
"feb9aa3a5b4d7f9ce8aec441ff441171f360ceab296e3cbd08d12255242b516b"
]
],
"datasetInfos": [],
"dbfsResultPath": null,
"isJsonSchema": true,
"metadata": {},
"overflow": false,
"plotOptions": {
"customPlotOptions": {},
"displayType": "table",
"pivotAggregation": null,
"pivotColumns": null,
"xColumns": null,
"yColumns": null
},
"removedWidgets": [],
"schema": [
{
"metadata": "{}",
"name": "id",
"type": "\"long\""
},
{
"metadata": "{}",
"name": "name",
"type": "\"string\""
},
{
"metadata": "{}",
"name": "email",
"type": "\"string\""
},
{
"metadata": "{}",
"name": "expenses",
"type": "\"long\""
},
{
"metadata": "{}",
"name": "cpf",
"type": "\"string\""
}
],
"type": "table"
}
},
"output_type": "display_data"
}
],
"source": [
"def _hash_column(frame, name):\n",
"  '''Return frame with column `name` replaced by the SHA-2 digest of its values.'''\n",
"  return frame.withColumn(name, F.sha2(frame[name], bits_number))\n",
"\n",
"\n",
"# Fold the helper over every listed column, starting from the original df.\n",
"tuned_hashed_df = reduce(_hash_column, lgpd_columns, df)\n",
"\n",
"tuned_hashed_df.show()"
]
}
],
"metadata": {
"application/vnd.databricks.v1+notebook": {
"dashboards": [],
"environmentMetadata": null,
"language": "python",
"notebookMetadata": {
"pythonIndentUnit": 2
},
"notebookName": "masking_in_pyspark",
"widgets": {}
},
"kernelspec": {
"display_name": "Python 3.10.12 ('test-in-pyspark': venv)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "1e1ba4f9e318103b43fbdacf54c3a9a049c33c4c6eeff3e0fb7c252c27a47729"
}
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment