Created
October 13, 2024 16:40
-
-
Save calilisantos/e273e72d1077ee37bdcf81b9945bb599 to your computer and use it in GitHub Desktop.
UDF pyspark Reload
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "e0362298-ac64-41f4-a5c5-11a90ba9dd25", | |
| "showTitle": false, | |
| "title": "" | |
| } | |
| }, | |
| "source": [ | |
| "# **Import dependencies**" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 0, | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "633c9ba1-b4ae-4647-a891-73cf3a352bee", | |
| "showTitle": false, | |
| "title": "" | |
| } | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "import json\n", | |
| "from pyspark.sql import functions as F, SparkSession, types as T" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "8d88687d-32c8-4e3d-8587-16bb7bd35579", | |
| "showTitle": false, | |
| "title": "" | |
| } | |
| }, | |
| "source": [ | |
| "# **Create dataframe**" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 0, | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "52bcfc33-ed74-42b5-97ab-ee36a3085ff8", | |
| "showTitle": false, | |
| "title": "" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "output_type": "display_data", | |
| "data": { | |
| "text/html": [ | |
| "<style scoped>\n", | |
| " .table-result-container {\n", | |
| " max-height: 300px;\n", | |
| " overflow: auto;\n", | |
| " }\n", | |
| " table, th, td {\n", | |
| " border: 1px solid black;\n", | |
| " border-collapse: collapse;\n", | |
| " }\n", | |
| " th, td {\n", | |
| " padding: 5px;\n", | |
| " }\n", | |
| " th {\n", | |
| " text-align: left;\n", | |
| " }\n", | |
| "</style><div class='table-result-container'><table class='table-result'><thead style='background-color: white'><tr><th>client</th><th>client_data</th><th>client_data_malformed</th><th>agency</th><th>budget</th></tr></thead><tbody><tr><td>Jhon</td><td>{\"type\": \"person\", \"fields\": [{\"product\": \"car\", \"value\": 1000}, {\"product\": \"house\", \"value\": 20000}]}</td><td>[{car, 1000}, {house, 20000}]</td><td>Dakota - Squad</td><td>60000</td></tr><tr><td>Mary</td><td>{\"type\": \"person\", \"fields\": [{\"product\": \"car\", \"value\": 2000}, {\"product\": \"house\", \"value\": 30000}]}</td><td>[{car, 2000}, {house, 30000}]</td><td>New York - Squad</td><td>80000</td></tr><tr><td>Apple</td><td>{\"type\": \"company\", \"fields\": [{\"product\": \"car\", \"value\": 3000}, {\"product\": \"house\", \"value\": 50000}, {\"product\": \"phone\", \"value\": 3500}]}</td><td>[{car, 3000}, {house, 50000}, {phone, 3500}]</td><td>California - Squad</td><td>200000</td></tr><tr><td>Google</td><td>{\"type\": \"company\", \"fields\": [{\"product\": \"car\", \"value\": 4000}, {\"product\": \"house\", \"value\": 60000}, {\"product\": \"phone\", \"value\": 3000}]}</td><td>[{car, 4000}, {house, 60000}, {phone, 3000}]</td><td>California - Squad</td><td>180000</td></tr></tbody></table></div>" | |
| ] | |
| }, | |
| "metadata": { | |
| "application/vnd.databricks.v1+output": { | |
| "addedWidgets": {}, | |
| "aggData": [], | |
| "aggError": "", | |
| "aggOverflow": false, | |
| "aggSchema": [], | |
| "aggSeriesLimitReached": false, | |
| "aggType": "", | |
| "arguments": {}, | |
| "columnCustomDisplayInfos": {}, | |
| "data": [ | |
| [ | |
| "Jhon", | |
| "{\"type\": \"person\", \"fields\": [{\"product\": \"car\", \"value\": 1000}, {\"product\": \"house\", \"value\": 20000}]}", | |
| "[{car, 1000}, {house, 20000}]", | |
| "Dakota - Squad", | |
| 60000 | |
| ], | |
| [ | |
| "Mary", | |
| "{\"type\": \"person\", \"fields\": [{\"product\": \"car\", \"value\": 2000}, {\"product\": \"house\", \"value\": 30000}]}", | |
| "[{car, 2000}, {house, 30000}]", | |
| "New York - Squad", | |
| 80000 | |
| ], | |
| [ | |
| "Apple", | |
| "{\"type\": \"company\", \"fields\": [{\"product\": \"car\", \"value\": 3000}, {\"product\": \"house\", \"value\": 50000}, {\"product\": \"phone\", \"value\": 3500}]}", | |
| "[{car, 3000}, {house, 50000}, {phone, 3500}]", | |
| "California - Squad", | |
| 200000 | |
| ], | |
| [ | |
| "Google", | |
| "{\"type\": \"company\", \"fields\": [{\"product\": \"car\", \"value\": 4000}, {\"product\": \"house\", \"value\": 60000}, {\"product\": \"phone\", \"value\": 3000}]}", | |
| "[{car, 4000}, {house, 60000}, {phone, 3000}]", | |
| "California - Squad", | |
| 180000 | |
| ] | |
| ], | |
| "datasetInfos": [], | |
| "dbfsResultPath": null, | |
| "isJsonSchema": true, | |
| "metadata": {}, | |
| "overflow": false, | |
| "plotOptions": { | |
| "customPlotOptions": {}, | |
| "displayType": "table", | |
| "pivotAggregation": null, | |
| "pivotColumns": null, | |
| "xColumns": null, | |
| "yColumns": null | |
| }, | |
| "removedWidgets": [], | |
| "schema": [ | |
| { | |
| "metadata": "{}", | |
| "name": "client", | |
| "type": "\"string\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "client_data", | |
| "type": "\"string\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "client_data_malformed", | |
| "type": "\"string\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "agency", | |
| "type": "\"string\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "budget", | |
| "type": "\"long\"" | |
| } | |
| ], | |
| "type": "table" | |
| } | |
| }, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "columns = [\"client\", \"client_data\", \"client_data_malformed\",\"agency\", \"budget\"]\n", | |
| "\n", | |
| "values = [\n", | |
| " [\n", | |
| " \"Jhon\", \n", | |
| " '{\"type\": \"person\", \"fields\": [{\"product\": \"car\", \"value\": 1000}, {\"product\": \"house\", \"value\": 20000}]}',\n", | |
| " '[{car, 1000}, {house, 20000}]',\n", | |
| " \"Dakota - Squad\",\n", | |
| " 60000\n", | |
| " ],\n", | |
| " [\n", | |
| " \"Mary\",\n", | |
| " '{\"type\": \"person\", \"fields\": [{\"product\": \"car\", \"value\": 2000}, {\"product\": \"house\", \"value\": 30000}]}',\n", | |
| " '[{car, 2000}, {house, 30000}]',\n", | |
| " \"New York - Squad\",\n", | |
| " 80000\n", | |
| " ],\n", | |
| " [\n", | |
| " \"Apple\",\n", | |
| " '{\"type\": \"company\", \"fields\": [{\"product\": \"car\", \"value\": 3000}, {\"product\": \"house\", \"value\": 50000}, {\"product\": \"phone\", \"value\": 3500}]}',\n", | |
| " '[{car, 3000}, {house, 50000}, {phone, 3500}]',\n", | |
| " \"California - Squad\",\n", | |
| " 200000\n", | |
| " ],\n", | |
| " [\n", | |
| " \"Google\",\n", | |
| " '{\"type\": \"company\", \"fields\": [{\"product\": \"car\", \"value\": 4000}, {\"product\": \"house\", \"value\": 60000}, {\"product\": \"phone\", \"value\": 3000}]}',\n", | |
| " '[{car, 4000}, {house, 60000}, {phone, 3000}]',\n", | |
| " \"California - Squad\",\n", | |
| " 180000\n", | |
| " ],\n", | |
| "]\n", | |
| "\n", | |
| "dataframe = spark.createDataFrame(data=values, schema=columns)\n", | |
| "\n", | |
| "dataframe.display()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 0, | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "0e64116b-739e-44b0-8e7c-508ed94d03fa", | |
| "showTitle": false, | |
| "title": "" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "output_type": "display_data", | |
| "data": { | |
| "text/html": [ | |
| "<style scoped>\n", | |
| " .table-result-container {\n", | |
| " max-height: 300px;\n", | |
| " overflow: auto;\n", | |
| " }\n", | |
| " table, th, td {\n", | |
| " border: 1px solid black;\n", | |
| " border-collapse: collapse;\n", | |
| " }\n", | |
| " th, td {\n", | |
| " padding: 5px;\n", | |
| " }\n", | |
| " th {\n", | |
| " text-align: left;\n", | |
| " }\n", | |
| "</style><div class='table-result-container'><table class='table-result'><thead style='background-color: white'><tr><th>client</th><th>client_data</th><th>client_data_malformed</th><th>agency</th><th>budget</th><th>budget_plus_thousand</th></tr></thead><tbody><tr><td>Jhon</td><td>{\"type\": \"person\", \"fields\": [{\"product\": \"car\", \"value\": 1000}, {\"product\": \"house\", \"value\": 20000}]}</td><td>[{car, 1000}, {house, 20000}]</td><td>Dakota - Squad</td><td>60000</td><td>61000</td></tr><tr><td>Mary</td><td>{\"type\": \"person\", \"fields\": [{\"product\": \"car\", \"value\": 2000}, {\"product\": \"house\", \"value\": 30000}]}</td><td>[{car, 2000}, {house, 30000}]</td><td>New York - Squad</td><td>80000</td><td>81000</td></tr><tr><td>Apple</td><td>{\"type\": \"company\", \"fields\": [{\"product\": \"car\", \"value\": 3000}, {\"product\": \"house\", \"value\": 50000}, {\"product\": \"phone\", \"value\": 3500}]}</td><td>[{car, 3000}, {house, 50000}, {phone, 3500}]</td><td>California - Squad</td><td>200000</td><td>201000</td></tr><tr><td>Google</td><td>{\"type\": \"company\", \"fields\": [{\"product\": \"car\", \"value\": 4000}, {\"product\": \"house\", \"value\": 60000}, {\"product\": \"phone\", \"value\": 3000}]}</td><td>[{car, 4000}, {house, 60000}, {phone, 3000}]</td><td>California - Squad</td><td>180000</td><td>181000</td></tr></tbody></table></div>" | |
| ] | |
| }, | |
| "metadata": { | |
| "application/vnd.databricks.v1+output": { | |
| "addedWidgets": {}, | |
| "aggData": [], | |
| "aggError": "", | |
| "aggOverflow": false, | |
| "aggSchema": [], | |
| "aggSeriesLimitReached": false, | |
| "aggType": "", | |
| "arguments": {}, | |
| "columnCustomDisplayInfos": {}, | |
| "data": [ | |
| [ | |
| "Jhon", | |
| "{\"type\": \"person\", \"fields\": [{\"product\": \"car\", \"value\": 1000}, {\"product\": \"house\", \"value\": 20000}]}", | |
| "[{car, 1000}, {house, 20000}]", | |
| "Dakota - Squad", | |
| 60000, | |
| 61000 | |
| ], | |
| [ | |
| "Mary", | |
| "{\"type\": \"person\", \"fields\": [{\"product\": \"car\", \"value\": 2000}, {\"product\": \"house\", \"value\": 30000}]}", | |
| "[{car, 2000}, {house, 30000}]", | |
| "New York - Squad", | |
| 80000, | |
| 81000 | |
| ], | |
| [ | |
| "Apple", | |
| "{\"type\": \"company\", \"fields\": [{\"product\": \"car\", \"value\": 3000}, {\"product\": \"house\", \"value\": 50000}, {\"product\": \"phone\", \"value\": 3500}]}", | |
| "[{car, 3000}, {house, 50000}, {phone, 3500}]", | |
| "California - Squad", | |
| 200000, | |
| 201000 | |
| ], | |
| [ | |
| "Google", | |
| "{\"type\": \"company\", \"fields\": [{\"product\": \"car\", \"value\": 4000}, {\"product\": \"house\", \"value\": 60000}, {\"product\": \"phone\", \"value\": 3000}]}", | |
| "[{car, 4000}, {house, 60000}, {phone, 3000}]", | |
| "California - Squad", | |
| 180000, | |
| 181000 | |
| ] | |
| ], | |
| "datasetInfos": [], | |
| "dbfsResultPath": null, | |
| "isJsonSchema": true, | |
| "metadata": {}, | |
| "overflow": false, | |
| "plotOptions": { | |
| "customPlotOptions": {}, | |
| "displayType": "table", | |
| "pivotAggregation": null, | |
| "pivotColumns": null, | |
| "xColumns": null, | |
| "yColumns": null | |
| }, | |
| "removedWidgets": [], | |
| "schema": [ | |
| { | |
| "metadata": "{}", | |
| "name": "client", | |
| "type": "\"string\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "client_data", | |
| "type": "\"string\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "client_data_malformed", | |
| "type": "\"string\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "agency", | |
| "type": "\"string\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "budget", | |
| "type": "\"long\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "budget_plus_thousand", | |
| "type": "\"long\"" | |
| } | |
| ], | |
| "type": "table" | |
| } | |
| }, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "budget_plus_thousand = F.udf(\n", | |
| " lambda x: x + 1000,\n", | |
| " T.LongType()\n", | |
| ")\n", | |
| "dataframe.withColumn(\"budget_plus_thousand\", budget_plus_thousand(F.col(\"budget\"))).display()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "66115c89-9833-4466-b287-6016b44409ae", | |
| "showTitle": false, | |
| "title": "" | |
| } | |
| }, | |
| "source": [ | |
| "# **Create UDF to deal with client_data column**\n", | |
| "\n", | |
| "* ## Make column client_products with every product and their value in the format: \n", | |
| " ### _[ {product_a: product_a_value}, {product_b: product_b_value}]_" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 0, | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "61aadaf0-01be-4643-be58-4fff66a7aec3", | |
| "showTitle": false, | |
| "title": "" | |
| } | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def get_client_products(client_data):\n", | |
| " client_data = json.loads(client_data)\n", | |
| " return [{product[\"product\"]: product[\"value\"]} for product in client_data[\"fields\"]]\n", | |
| "\n", | |
| "get_client_products_udf = F.udf(get_client_products, T.ArrayType(T.MapType(T.StringType(), T.IntegerType())))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "79ece468-5307-478d-9953-9595410c1fdf", | |
| "showTitle": false, | |
| "title": "" | |
| } | |
| }, | |
| "source": [ | |
| "# **Create UDF to deal with agency column**\n", | |
| "\n", | |
| "* ## Make column agency_code, with this pattern:\n", | |
| " ### To agency: City - Squad, the code is: AG-City-ZONE" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 0, | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "317d7fad-49fb-4b5b-8b8a-e543b3d99c9d", | |
| "showTitle": false, | |
| "title": "" | |
| } | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def get_agency_code(agency):\n", | |
| " if ' - Squad' in agency:\n", | |
| " return f\"AG-{agency.split(' - Squad')[0].upper()}-ZONE\"\n", | |
| "\n", | |
| "get_agency_code_udf = F.udf(get_agency_code, T.StringType())" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "0520d575-c03a-405e-b47a-7fb2cdb47b8d", | |
| "showTitle": false, | |
| "title": "" | |
| } | |
| }, | |
| "source": [ | |
| "# **Create UDF to deal with client_data_malformed column**\n", | |
| "\n", | |
| "* ## Put respective keys in the column values" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 0, | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "66ea3918-989a-4b3f-bfd4-a9615d857e99", | |
| "showTitle": false, | |
| "title": "" | |
| } | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "@udf\n", | |
| "def deal_client_data_malformed(client_data_malformed):\n", | |
| " # Remove os colchetes externos\n", | |
| " client_data_malformed = client_data_malformed.strip(\"[]\")\n", | |
| " \n", | |
| " # # Divide os conjuntos\n", | |
| " client_data_malformed_sets = client_data_malformed.split(\"}, {\")\n", | |
| "\n", | |
| " # # Formata cada conjunto\n", | |
| " client_data_set = []\n", | |
| " for client in client_data_malformed_sets:\n", | |
| " # Remove chaves e formata\n", | |
| " client = client.replace(\"{\", \"\").replace(\"}\", \"\")\n", | |
| " parts = client.split(\", \")\n", | |
| " if len(parts) == 2:\n", | |
| " # client_data_set.append(f'{{\"product\": \"{parts[0]}\", \"value\": \"{parts[1]}\"}}')\n", | |
| " client_data_set.append({\"product\": parts[0], \"value\": parts[1]})\n", | |
| " \n", | |
| " # # Junta os conjuntos formatados em um array JSON\n", | |
| " # return f\"[{', '.join(client_data_set)}]\"\n", | |
| " return client_data_set\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "ef8c0860-dce7-4a59-b71d-f945d608e880", | |
| "showTitle": false, | |
| "title": "" | |
| } | |
| }, | |
| "source": [ | |
| "# **Treat client_data, agency and client_data_malformed columns**" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 0, | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "ed29a30b-8b29-49ca-a038-586cc2cdd510", | |
| "showTitle": false, | |
| "title": "" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "output_type": "display_data", | |
| "data": { | |
| "text/html": [ | |
| "<style scoped>\n", | |
| " .table-result-container {\n", | |
| " max-height: 300px;\n", | |
| " overflow: auto;\n", | |
| " }\n", | |
| " table, th, td {\n", | |
| " border: 1px solid black;\n", | |
| " border-collapse: collapse;\n", | |
| " }\n", | |
| " th, td {\n", | |
| " padding: 5px;\n", | |
| " }\n", | |
| " th {\n", | |
| " text-align: left;\n", | |
| " }\n", | |
| "</style><div class='table-result-container'><table class='table-result'><thead style='background-color: white'><tr><th>client</th><th>client_data</th><th>client_data_malformed</th><th>agency</th><th>budget</th><th>client_products</th><th>agency_code</th><th>client_data_malformed_fixed</th></tr></thead><tbody><tr><td>Jhon</td><td>{\"type\": \"person\", \"fields\": [{\"product\": \"car\", \"value\": 1000}, {\"product\": \"house\", \"value\": 20000}]}</td><td>[{car, 1000}, {house, 20000}]</td><td>Dakota - Squad</td><td>60000</td><td>List(Map(car -> 1000), Map(house -> 20000))</td><td>AG-DAKOTA-ZONE</td><td>[{value=1000, product=car}, {value=20000, product=house}]</td></tr><tr><td>Mary</td><td>{\"type\": \"person\", \"fields\": [{\"product\": \"car\", \"value\": 2000}, {\"product\": \"house\", \"value\": 30000}]}</td><td>[{car, 2000}, {house, 30000}]</td><td>New York - Squad</td><td>80000</td><td>List(Map(car -> 2000), Map(house -> 30000))</td><td>AG-NEW YORK-ZONE</td><td>[{value=2000, product=car}, {value=30000, product=house}]</td></tr><tr><td>Apple</td><td>{\"type\": \"company\", \"fields\": [{\"product\": \"car\", \"value\": 3000}, {\"product\": \"house\", \"value\": 50000}, {\"product\": \"phone\", \"value\": 3500}]}</td><td>[{car, 3000}, {house, 50000}, {phone, 3500}]</td><td>California - Squad</td><td>200000</td><td>List(Map(car -> 3000), Map(house -> 50000), Map(phone -> 3500))</td><td>AG-CALIFORNIA-ZONE</td><td>[{value=3000, product=car}, {value=50000, product=house}, {value=3500, product=phone}]</td></tr><tr><td>Google</td><td>{\"type\": \"company\", \"fields\": [{\"product\": \"car\", \"value\": 4000}, {\"product\": \"house\", \"value\": 60000}, {\"product\": \"phone\", \"value\": 3000}]}</td><td>[{car, 4000}, {house, 60000}, {phone, 3000}]</td><td>California - Squad</td><td>180000</td><td>List(Map(car -> 4000), Map(house -> 60000), Map(phone -> 3000))</td><td>AG-CALIFORNIA-ZONE</td><td>[{value=4000, product=car}, {value=60000, product=house}, {value=3000, product=phone}]</td></tr></tbody></table></div>" | |
| ] | |
| }, | |
| "metadata": { | |
| "application/vnd.databricks.v1+output": { | |
| "addedWidgets": {}, | |
| "aggData": [], | |
| "aggError": "", | |
| "aggOverflow": false, | |
| "aggSchema": [], | |
| "aggSeriesLimitReached": false, | |
| "aggType": "", | |
| "arguments": {}, | |
| "columnCustomDisplayInfos": {}, | |
| "data": [ | |
| [ | |
| "Jhon", | |
| "{\"type\": \"person\", \"fields\": [{\"product\": \"car\", \"value\": 1000}, {\"product\": \"house\", \"value\": 20000}]}", | |
| "[{car, 1000}, {house, 20000}]", | |
| "Dakota - Squad", | |
| 60000, | |
| [ | |
| { | |
| "car": 1000 | |
| }, | |
| { | |
| "house": 20000 | |
| } | |
| ], | |
| "AG-DAKOTA-ZONE", | |
| "[{value=1000, product=car}, {value=20000, product=house}]" | |
| ], | |
| [ | |
| "Mary", | |
| "{\"type\": \"person\", \"fields\": [{\"product\": \"car\", \"value\": 2000}, {\"product\": \"house\", \"value\": 30000}]}", | |
| "[{car, 2000}, {house, 30000}]", | |
| "New York - Squad", | |
| 80000, | |
| [ | |
| { | |
| "car": 2000 | |
| }, | |
| { | |
| "house": 30000 | |
| } | |
| ], | |
| "AG-NEW YORK-ZONE", | |
| "[{value=2000, product=car}, {value=30000, product=house}]" | |
| ], | |
| [ | |
| "Apple", | |
| "{\"type\": \"company\", \"fields\": [{\"product\": \"car\", \"value\": 3000}, {\"product\": \"house\", \"value\": 50000}, {\"product\": \"phone\", \"value\": 3500}]}", | |
| "[{car, 3000}, {house, 50000}, {phone, 3500}]", | |
| "California - Squad", | |
| 200000, | |
| [ | |
| { | |
| "car": 3000 | |
| }, | |
| { | |
| "house": 50000 | |
| }, | |
| { | |
| "phone": 3500 | |
| } | |
| ], | |
| "AG-CALIFORNIA-ZONE", | |
| "[{value=3000, product=car}, {value=50000, product=house}, {value=3500, product=phone}]" | |
| ], | |
| [ | |
| "Google", | |
| "{\"type\": \"company\", \"fields\": [{\"product\": \"car\", \"value\": 4000}, {\"product\": \"house\", \"value\": 60000}, {\"product\": \"phone\", \"value\": 3000}]}", | |
| "[{car, 4000}, {house, 60000}, {phone, 3000}]", | |
| "California - Squad", | |
| 180000, | |
| [ | |
| { | |
| "car": 4000 | |
| }, | |
| { | |
| "house": 60000 | |
| }, | |
| { | |
| "phone": 3000 | |
| } | |
| ], | |
| "AG-CALIFORNIA-ZONE", | |
| "[{value=4000, product=car}, {value=60000, product=house}, {value=3000, product=phone}]" | |
| ] | |
| ], | |
| "datasetInfos": [], | |
| "dbfsResultPath": null, | |
| "isJsonSchema": true, | |
| "metadata": {}, | |
| "overflow": false, | |
| "plotOptions": { | |
| "customPlotOptions": {}, | |
| "displayType": "table", | |
| "pivotAggregation": null, | |
| "pivotColumns": null, | |
| "xColumns": null, | |
| "yColumns": null | |
| }, | |
| "removedWidgets": [], | |
| "schema": [ | |
| { | |
| "metadata": "{}", | |
| "name": "client", | |
| "type": "\"string\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "client_data", | |
| "type": "\"string\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "client_data_malformed", | |
| "type": "\"string\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "agency", | |
| "type": "\"string\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "budget", | |
| "type": "\"long\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "client_products", | |
| "type": "{\"type\":\"array\",\"elementType\":{\"type\":\"map\",\"keyType\":\"string\",\"valueType\":\"integer\",\"valueContainsNull\":true},\"containsNull\":true}" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "agency_code", | |
| "type": "\"string\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "client_data_malformed_fixed", | |
| "type": "\"string\"" | |
| } | |
| ], | |
| "type": "table" | |
| } | |
| }, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "treated_data = (\n", | |
| " dataframe.withColumns({\n", | |
| " \"client_products\": get_client_products_udf(F.col(\"client_data\")),\n", | |
| " \"agency_code\": get_agency_code_udf(F.col(\"agency\")),\n", | |
| " \"client_data_malformed_fixed\": deal_client_data_malformed(F.col(\"client_data_malformed\"))\n", | |
| " })\n", | |
| ")\n", | |
| "treated_data.display()\n" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "application/vnd.databricks.v1+notebook": { | |
| "dashboards": [], | |
| "environmentMetadata": null, | |
| "language": "python", | |
| "notebookMetadata": { | |
| "pythonIndentUnit": 4 | |
| }, | |
| "notebookName": "udf_pyspark", | |
| "widgets": {} | |
| }, | |
| "kernelspec": { | |
| "display_name": "Python 3.8.10 64-bit", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.8.10" | |
| }, | |
| "orig_nbformat": 4, | |
| "vscode": { | |
| "interpreter": { | |
| "hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1" | |
| } | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 0 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment