Skip to content

Instantly share code, notes, and snippets.

@calilisantos
Created October 13, 2024 16:40
Show Gist options
  • Select an option

  • Save calilisantos/e273e72d1077ee37bdcf81b9945bb599 to your computer and use it in GitHub Desktop.

Select an option

Save calilisantos/e273e72d1077ee37bdcf81b9945bb599 to your computer and use it in GitHub Desktop.
UDF pyspark Reload
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "e0362298-ac64-41f4-a5c5-11a90ba9dd25",
"showTitle": false,
"title": ""
}
},
"source": [
"# **Import dependencies**"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "633c9ba1-b4ae-4647-a891-73cf3a352bee",
"showTitle": false,
"title": ""
}
},
"outputs": [],
"source": [
"import json\n",
"from pyspark.sql import functions as F, SparkSession, types as T"
]
},
{
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "8d88687d-32c8-4e3d-8587-16bb7bd35579",
"showTitle": false,
"title": ""
}
},
"source": [
"# **Create dataframe**"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "52bcfc33-ed74-42b5-97ab-ee36a3085ff8",
"showTitle": false,
"title": ""
}
},
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"<style scoped>\n",
" .table-result-container {\n",
" max-height: 300px;\n",
" overflow: auto;\n",
" }\n",
" table, th, td {\n",
" border: 1px solid black;\n",
" border-collapse: collapse;\n",
" }\n",
" th, td {\n",
" padding: 5px;\n",
" }\n",
" th {\n",
" text-align: left;\n",
" }\n",
"</style><div class='table-result-container'><table class='table-result'><thead style='background-color: white'><tr><th>client</th><th>client_data</th><th>client_data_malformed</th><th>agency</th><th>budget</th></tr></thead><tbody><tr><td>Jhon</td><td>{\"type\": \"person\", \"fields\": [{\"product\": \"car\", \"value\": 1000}, {\"product\": \"house\", \"value\": 20000}]}</td><td>[{car, 1000}, {house, 20000}]</td><td>Dakota - Squad</td><td>60000</td></tr><tr><td>Mary</td><td>{\"type\": \"person\", \"fields\": [{\"product\": \"car\", \"value\": 2000}, {\"product\": \"house\", \"value\": 30000}]}</td><td>[{car, 2000}, {house, 30000}]</td><td>New York - Squad</td><td>80000</td></tr><tr><td>Apple</td><td>{\"type\": \"company\", \"fields\": [{\"product\": \"car\", \"value\": 3000}, {\"product\": \"house\", \"value\": 50000}, {\"product\": \"phone\", \"value\": 3500}]}</td><td>[{car, 3000}, {house, 50000}, {phone, 3500}]</td><td>California - Squad</td><td>200000</td></tr><tr><td>Google</td><td>{\"type\": \"company\", \"fields\": [{\"product\": \"car\", \"value\": 4000}, {\"product\": \"house\", \"value\": 60000}, {\"product\": \"phone\", \"value\": 3000}]}</td><td>[{car, 4000}, {house, 60000}, {phone, 3000}]</td><td>California - Squad</td><td>180000</td></tr></tbody></table></div>"
]
},
"metadata": {
"application/vnd.databricks.v1+output": {
"addedWidgets": {},
"aggData": [],
"aggError": "",
"aggOverflow": false,
"aggSchema": [],
"aggSeriesLimitReached": false,
"aggType": "",
"arguments": {},
"columnCustomDisplayInfos": {},
"data": [
[
"Jhon",
"{\"type\": \"person\", \"fields\": [{\"product\": \"car\", \"value\": 1000}, {\"product\": \"house\", \"value\": 20000}]}",
"[{car, 1000}, {house, 20000}]",
"Dakota - Squad",
60000
],
[
"Mary",
"{\"type\": \"person\", \"fields\": [{\"product\": \"car\", \"value\": 2000}, {\"product\": \"house\", \"value\": 30000}]}",
"[{car, 2000}, {house, 30000}]",
"New York - Squad",
80000
],
[
"Apple",
"{\"type\": \"company\", \"fields\": [{\"product\": \"car\", \"value\": 3000}, {\"product\": \"house\", \"value\": 50000}, {\"product\": \"phone\", \"value\": 3500}]}",
"[{car, 3000}, {house, 50000}, {phone, 3500}]",
"California - Squad",
200000
],
[
"Google",
"{\"type\": \"company\", \"fields\": [{\"product\": \"car\", \"value\": 4000}, {\"product\": \"house\", \"value\": 60000}, {\"product\": \"phone\", \"value\": 3000}]}",
"[{car, 4000}, {house, 60000}, {phone, 3000}]",
"California - Squad",
180000
]
],
"datasetInfos": [],
"dbfsResultPath": null,
"isJsonSchema": true,
"metadata": {},
"overflow": false,
"plotOptions": {
"customPlotOptions": {},
"displayType": "table",
"pivotAggregation": null,
"pivotColumns": null,
"xColumns": null,
"yColumns": null
},
"removedWidgets": [],
"schema": [
{
"metadata": "{}",
"name": "client",
"type": "\"string\""
},
{
"metadata": "{}",
"name": "client_data",
"type": "\"string\""
},
{
"metadata": "{}",
"name": "client_data_malformed",
"type": "\"string\""
},
{
"metadata": "{}",
"name": "agency",
"type": "\"string\""
},
{
"metadata": "{}",
"name": "budget",
"type": "\"long\""
}
],
"type": "table"
}
},
"output_type": "display_data"
}
],
"source": [
"columns = [\"client\", \"client_data\", \"client_data_malformed\",\"agency\", \"budget\"]\n",
"\n",
"values = [\n",
" [\n",
" \"Jhon\", \n",
" '{\"type\": \"person\", \"fields\": [{\"product\": \"car\", \"value\": 1000}, {\"product\": \"house\", \"value\": 20000}]}',\n",
" '[{car, 1000}, {house, 20000}]',\n",
" \"Dakota - Squad\",\n",
" 60000\n",
" ],\n",
" [\n",
" \"Mary\",\n",
" '{\"type\": \"person\", \"fields\": [{\"product\": \"car\", \"value\": 2000}, {\"product\": \"house\", \"value\": 30000}]}',\n",
" '[{car, 2000}, {house, 30000}]',\n",
" \"New York - Squad\",\n",
" 80000\n",
" ],\n",
" [\n",
" \"Apple\",\n",
" '{\"type\": \"company\", \"fields\": [{\"product\": \"car\", \"value\": 3000}, {\"product\": \"house\", \"value\": 50000}, {\"product\": \"phone\", \"value\": 3500}]}',\n",
" '[{car, 3000}, {house, 50000}, {phone, 3500}]',\n",
" \"California - Squad\",\n",
" 200000\n",
" ],\n",
" [\n",
" \"Google\",\n",
" '{\"type\": \"company\", \"fields\": [{\"product\": \"car\", \"value\": 4000}, {\"product\": \"house\", \"value\": 60000}, {\"product\": \"phone\", \"value\": 3000}]}',\n",
" '[{car, 4000}, {house, 60000}, {phone, 3000}]',\n",
" \"California - Squad\",\n",
" 180000\n",
" ],\n",
"]\n",
"\n",
"dataframe = spark.createDataFrame(data=values, schema=columns)\n",
"\n",
"dataframe.display()"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "0e64116b-739e-44b0-8e7c-508ed94d03fa",
"showTitle": false,
"title": ""
}
},
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"<style scoped>\n",
" .table-result-container {\n",
" max-height: 300px;\n",
" overflow: auto;\n",
" }\n",
" table, th, td {\n",
" border: 1px solid black;\n",
" border-collapse: collapse;\n",
" }\n",
" th, td {\n",
" padding: 5px;\n",
" }\n",
" th {\n",
" text-align: left;\n",
" }\n",
"</style><div class='table-result-container'><table class='table-result'><thead style='background-color: white'><tr><th>client</th><th>client_data</th><th>client_data_malformed</th><th>agency</th><th>budget</th><th>budget_plus_thousand</th></tr></thead><tbody><tr><td>Jhon</td><td>{\"type\": \"person\", \"fields\": [{\"product\": \"car\", \"value\": 1000}, {\"product\": \"house\", \"value\": 20000}]}</td><td>[{car, 1000}, {house, 20000}]</td><td>Dakota - Squad</td><td>60000</td><td>61000</td></tr><tr><td>Mary</td><td>{\"type\": \"person\", \"fields\": [{\"product\": \"car\", \"value\": 2000}, {\"product\": \"house\", \"value\": 30000}]}</td><td>[{car, 2000}, {house, 30000}]</td><td>New York - Squad</td><td>80000</td><td>81000</td></tr><tr><td>Apple</td><td>{\"type\": \"company\", \"fields\": [{\"product\": \"car\", \"value\": 3000}, {\"product\": \"house\", \"value\": 50000}, {\"product\": \"phone\", \"value\": 3500}]}</td><td>[{car, 3000}, {house, 50000}, {phone, 3500}]</td><td>California - Squad</td><td>200000</td><td>201000</td></tr><tr><td>Google</td><td>{\"type\": \"company\", \"fields\": [{\"product\": \"car\", \"value\": 4000}, {\"product\": \"house\", \"value\": 60000}, {\"product\": \"phone\", \"value\": 3000}]}</td><td>[{car, 4000}, {house, 60000}, {phone, 3000}]</td><td>California - Squad</td><td>180000</td><td>181000</td></tr></tbody></table></div>"
]
},
"metadata": {
"application/vnd.databricks.v1+output": {
"addedWidgets": {},
"aggData": [],
"aggError": "",
"aggOverflow": false,
"aggSchema": [],
"aggSeriesLimitReached": false,
"aggType": "",
"arguments": {},
"columnCustomDisplayInfos": {},
"data": [
[
"Jhon",
"{\"type\": \"person\", \"fields\": [{\"product\": \"car\", \"value\": 1000}, {\"product\": \"house\", \"value\": 20000}]}",
"[{car, 1000}, {house, 20000}]",
"Dakota - Squad",
60000,
61000
],
[
"Mary",
"{\"type\": \"person\", \"fields\": [{\"product\": \"car\", \"value\": 2000}, {\"product\": \"house\", \"value\": 30000}]}",
"[{car, 2000}, {house, 30000}]",
"New York - Squad",
80000,
81000
],
[
"Apple",
"{\"type\": \"company\", \"fields\": [{\"product\": \"car\", \"value\": 3000}, {\"product\": \"house\", \"value\": 50000}, {\"product\": \"phone\", \"value\": 3500}]}",
"[{car, 3000}, {house, 50000}, {phone, 3500}]",
"California - Squad",
200000,
201000
],
[
"Google",
"{\"type\": \"company\", \"fields\": [{\"product\": \"car\", \"value\": 4000}, {\"product\": \"house\", \"value\": 60000}, {\"product\": \"phone\", \"value\": 3000}]}",
"[{car, 4000}, {house, 60000}, {phone, 3000}]",
"California - Squad",
180000,
181000
]
],
"datasetInfos": [],
"dbfsResultPath": null,
"isJsonSchema": true,
"metadata": {},
"overflow": false,
"plotOptions": {
"customPlotOptions": {},
"displayType": "table",
"pivotAggregation": null,
"pivotColumns": null,
"xColumns": null,
"yColumns": null
},
"removedWidgets": [],
"schema": [
{
"metadata": "{}",
"name": "client",
"type": "\"string\""
},
{
"metadata": "{}",
"name": "client_data",
"type": "\"string\""
},
{
"metadata": "{}",
"name": "client_data_malformed",
"type": "\"string\""
},
{
"metadata": "{}",
"name": "agency",
"type": "\"string\""
},
{
"metadata": "{}",
"name": "budget",
"type": "\"long\""
},
{
"metadata": "{}",
"name": "budget_plus_thousand",
"type": "\"long\""
}
],
"type": "table"
}
},
"output_type": "display_data"
}
],
"source": [
"budget_plus_thousand = F.udf(\n",
" lambda x: x + 1000,\n",
" T.LongType()\n",
")\n",
"dataframe.withColumn(\"budget_plus_thousand\", budget_plus_thousand(F.col(\"budget\"))).display()"
]
},
{
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "66115c89-9833-4466-b287-6016b44409ae",
"showTitle": false,
"title": ""
}
},
"source": [
"# **Create UDF to deal with client_data column**\n",
"\n",
"* ## Make column client_products with every product and their value in the format: \n",
" ### _[ {product_a: product_a_value}, {product_b: product_b_value}]_"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "61aadaf0-01be-4643-be58-4fff66a7aec3",
"showTitle": false,
"title": ""
}
},
"outputs": [],
"source": [
"def get_client_products(client_data):\n",
" client_data = json.loads(client_data)\n",
" return [{product[\"product\"]: product[\"value\"]} for product in client_data[\"fields\"]]\n",
"\n",
"get_client_products_udf = F.udf(get_client_products, T.ArrayType(T.MapType(T.StringType(), T.IntegerType())))"
]
},
{
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "79ece468-5307-478d-9953-9595410c1fdf",
"showTitle": false,
"title": ""
}
},
"source": [
"# **Create UDF to deal with agency column**\n",
"\n",
"* ## Make column agency_code, with this pattern:\n",
" ### To agency: City - Squad, the code is: AG-City-ZONE"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "317d7fad-49fb-4b5b-8b8a-e543b3d99c9d",
"showTitle": false,
"title": ""
}
},
"outputs": [],
"source": [
"def get_agency_code(agency):\n",
" if ' - Squad' in agency:\n",
" return f\"AG-{agency.split(' - Squad')[0].upper()}-ZONE\"\n",
"\n",
"get_agency_code_udf = F.udf(get_agency_code, T.StringType())"
]
},
{
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "0520d575-c03a-405e-b47a-7fb2cdb47b8d",
"showTitle": false,
"title": ""
}
},
"source": [
"# **Create UDF to deal with client_data_malformed column**\n",
"\n",
"* ## Put respective keys in the column values"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "66ea3918-989a-4b3f-bfd4-a9615d857e99",
"showTitle": false,
"title": ""
}
},
"outputs": [],
"source": [
"@udf\n",
"def deal_client_data_malformed(client_data_malformed):\n",
" # Remove os colchetes externos\n",
" client_data_malformed = client_data_malformed.strip(\"[]\")\n",
" \n",
" # # Divide os conjuntos\n",
" client_data_malformed_sets = client_data_malformed.split(\"}, {\")\n",
"\n",
" # # Formata cada conjunto\n",
" client_data_set = []\n",
" for client in client_data_malformed_sets:\n",
" # Remove chaves e formata\n",
" client = client.replace(\"{\", \"\").replace(\"}\", \"\")\n",
" parts = client.split(\", \")\n",
" if len(parts) == 2:\n",
" # client_data_set.append(f'{{\"product\": \"{parts[0]}\", \"value\": \"{parts[1]}\"}}')\n",
" client_data_set.append({\"product\": parts[0], \"value\": parts[1]})\n",
" \n",
" # # Junta os conjuntos formatados em um array JSON\n",
" # return f\"[{', '.join(client_data_set)}]\"\n",
" return client_data_set\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "ef8c0860-dce7-4a59-b71d-f945d608e880",
"showTitle": false,
"title": ""
}
},
"source": [
"# **Treat client_data, agency and client_data_malformed columns**"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "ed29a30b-8b29-49ca-a038-586cc2cdd510",
"showTitle": false,
"title": ""
}
},
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"<style scoped>\n",
" .table-result-container {\n",
" max-height: 300px;\n",
" overflow: auto;\n",
" }\n",
" table, th, td {\n",
" border: 1px solid black;\n",
" border-collapse: collapse;\n",
" }\n",
" th, td {\n",
" padding: 5px;\n",
" }\n",
" th {\n",
" text-align: left;\n",
" }\n",
"</style><div class='table-result-container'><table class='table-result'><thead style='background-color: white'><tr><th>client</th><th>client_data</th><th>client_data_malformed</th><th>agency</th><th>budget</th><th>client_products</th><th>agency_code</th><th>client_data_malformed_fixed</th></tr></thead><tbody><tr><td>Jhon</td><td>{\"type\": \"person\", \"fields\": [{\"product\": \"car\", \"value\": 1000}, {\"product\": \"house\", \"value\": 20000}]}</td><td>[{car, 1000}, {house, 20000}]</td><td>Dakota - Squad</td><td>60000</td><td>List(Map(car -> 1000), Map(house -> 20000))</td><td>AG-DAKOTA-ZONE</td><td>[{value=1000, product=car}, {value=20000, product=house}]</td></tr><tr><td>Mary</td><td>{\"type\": \"person\", \"fields\": [{\"product\": \"car\", \"value\": 2000}, {\"product\": \"house\", \"value\": 30000}]}</td><td>[{car, 2000}, {house, 30000}]</td><td>New York - Squad</td><td>80000</td><td>List(Map(car -> 2000), Map(house -> 30000))</td><td>AG-NEW YORK-ZONE</td><td>[{value=2000, product=car}, {value=30000, product=house}]</td></tr><tr><td>Apple</td><td>{\"type\": \"company\", \"fields\": [{\"product\": \"car\", \"value\": 3000}, {\"product\": \"house\", \"value\": 50000}, {\"product\": \"phone\", \"value\": 3500}]}</td><td>[{car, 3000}, {house, 50000}, {phone, 3500}]</td><td>California - Squad</td><td>200000</td><td>List(Map(car -> 3000), Map(house -> 50000), Map(phone -> 3500))</td><td>AG-CALIFORNIA-ZONE</td><td>[{value=3000, product=car}, {value=50000, product=house}, {value=3500, product=phone}]</td></tr><tr><td>Google</td><td>{\"type\": \"company\", \"fields\": [{\"product\": \"car\", \"value\": 4000}, {\"product\": \"house\", \"value\": 60000}, {\"product\": \"phone\", \"value\": 3000}]}</td><td>[{car, 4000}, {house, 60000}, {phone, 3000}]</td><td>California - Squad</td><td>180000</td><td>List(Map(car -> 4000), Map(house -> 60000), Map(phone -> 3000))</td><td>AG-CALIFORNIA-ZONE</td><td>[{value=4000, product=car}, {value=60000, product=house}, {value=3000, product=phone}]</td></tr></tbody></table></div>"
]
},
"metadata": {
"application/vnd.databricks.v1+output": {
"addedWidgets": {},
"aggData": [],
"aggError": "",
"aggOverflow": false,
"aggSchema": [],
"aggSeriesLimitReached": false,
"aggType": "",
"arguments": {},
"columnCustomDisplayInfos": {},
"data": [
[
"Jhon",
"{\"type\": \"person\", \"fields\": [{\"product\": \"car\", \"value\": 1000}, {\"product\": \"house\", \"value\": 20000}]}",
"[{car, 1000}, {house, 20000}]",
"Dakota - Squad",
60000,
[
{
"car": 1000
},
{
"house": 20000
}
],
"AG-DAKOTA-ZONE",
"[{value=1000, product=car}, {value=20000, product=house}]"
],
[
"Mary",
"{\"type\": \"person\", \"fields\": [{\"product\": \"car\", \"value\": 2000}, {\"product\": \"house\", \"value\": 30000}]}",
"[{car, 2000}, {house, 30000}]",
"New York - Squad",
80000,
[
{
"car": 2000
},
{
"house": 30000
}
],
"AG-NEW YORK-ZONE",
"[{value=2000, product=car}, {value=30000, product=house}]"
],
[
"Apple",
"{\"type\": \"company\", \"fields\": [{\"product\": \"car\", \"value\": 3000}, {\"product\": \"house\", \"value\": 50000}, {\"product\": \"phone\", \"value\": 3500}]}",
"[{car, 3000}, {house, 50000}, {phone, 3500}]",
"California - Squad",
200000,
[
{
"car": 3000
},
{
"house": 50000
},
{
"phone": 3500
}
],
"AG-CALIFORNIA-ZONE",
"[{value=3000, product=car}, {value=50000, product=house}, {value=3500, product=phone}]"
],
[
"Google",
"{\"type\": \"company\", \"fields\": [{\"product\": \"car\", \"value\": 4000}, {\"product\": \"house\", \"value\": 60000}, {\"product\": \"phone\", \"value\": 3000}]}",
"[{car, 4000}, {house, 60000}, {phone, 3000}]",
"California - Squad",
180000,
[
{
"car": 4000
},
{
"house": 60000
},
{
"phone": 3000
}
],
"AG-CALIFORNIA-ZONE",
"[{value=4000, product=car}, {value=60000, product=house}, {value=3000, product=phone}]"
]
],
"datasetInfos": [],
"dbfsResultPath": null,
"isJsonSchema": true,
"metadata": {},
"overflow": false,
"plotOptions": {
"customPlotOptions": {},
"displayType": "table",
"pivotAggregation": null,
"pivotColumns": null,
"xColumns": null,
"yColumns": null
},
"removedWidgets": [],
"schema": [
{
"metadata": "{}",
"name": "client",
"type": "\"string\""
},
{
"metadata": "{}",
"name": "client_data",
"type": "\"string\""
},
{
"metadata": "{}",
"name": "client_data_malformed",
"type": "\"string\""
},
{
"metadata": "{}",
"name": "agency",
"type": "\"string\""
},
{
"metadata": "{}",
"name": "budget",
"type": "\"long\""
},
{
"metadata": "{}",
"name": "client_products",
"type": "{\"type\":\"array\",\"elementType\":{\"type\":\"map\",\"keyType\":\"string\",\"valueType\":\"integer\",\"valueContainsNull\":true},\"containsNull\":true}"
},
{
"metadata": "{}",
"name": "agency_code",
"type": "\"string\""
},
{
"metadata": "{}",
"name": "client_data_malformed_fixed",
"type": "\"string\""
}
],
"type": "table"
}
},
"output_type": "display_data"
}
],
"source": [
"treated_data = (\n",
" dataframe.withColumns({\n",
" \"client_products\": get_client_products_udf(F.col(\"client_data\")),\n",
" \"agency_code\": get_agency_code_udf(F.col(\"agency\")),\n",
" \"client_data_malformed_fixed\": deal_client_data_malformed(F.col(\"client_data_malformed\"))\n",
" })\n",
")\n",
"treated_data.display()\n"
]
}
],
"metadata": {
"application/vnd.databricks.v1+notebook": {
"dashboards": [],
"environmentMetadata": null,
"language": "python",
"notebookMetadata": {
"pythonIndentUnit": 4
},
"notebookName": "udf_pyspark",
"widgets": {}
},
"kernelspec": {
"display_name": "Python 3.8.10 64-bit",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1"
}
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment