Created
March 31, 2023 21:21
-
-
Save walkerh/aa24b05085e69d63f0fc5ffafd50a60c to your computer and use it in GitHub Desktop.
Attempts to fix missing data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "id": "17062f26", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import numpy as np\n", | |
| "import pandas as pd" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "9def47f6", | |
| "metadata": {}, | |
| "source": [ | |
| "# A simple set" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "id": "d1fd9c97", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "df1 = pd.DataFrame([[1, 2], [3, 4], [5, 6]], columns=['A', 'B'])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "id": "4a7e7c0d", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>A</th>\n", | |
| " <th>B</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>1</td>\n", | |
| " <td>2</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>3</td>\n", | |
| " <td>4</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>5</td>\n", | |
| " <td>6</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " A B\n", | |
| "0 1 2\n", | |
| "1 3 4\n", | |
| "2 5 6" | |
| ] | |
| }, | |
| "execution_count": 3, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "df1" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "cf31a951", | |
| "metadata": {}, | |
| "source": [ | |
| "## Introducing `DataFrame.apply`\n", | |
| "\n", | |
| "https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.apply.html#pandas.DataFrame.apply" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "id": "ab3fb7df", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "A 9\n", | |
| "B 12\n", | |
| "dtype: int64" | |
| ] | |
| }, | |
| "execution_count": 4, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "df1.apply(np.sum, axis=0)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "id": "1873d275", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "result = df1.apply(np.sum, axis=1)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "id": "dbd88fd8", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "pandas.core.series.Series" | |
| ] | |
| }, | |
| "execution_count": 6, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "type(result)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "id": "6d016f95", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "0 3\n", | |
| "1 7\n", | |
| "2 11\n", | |
| "dtype: int64" | |
| ] | |
| }, | |
| "execution_count": 7, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "result" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "id": "4b5a4dd7", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "0 (Series, A 1\\nB 2\\nName: 0, dtype: int64)\n", | |
| "1 (Series, A 3\\nB 4\\nName: 1, dtype: int64)\n", | |
| "2 (Series, A 5\\nB 6\\nName: 2, dtype: int64)\n", | |
| "dtype: object" | |
| ] | |
| }, | |
| "execution_count": 8, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "def identify(input_series):\n", | |
| " return type(input_series).__name__, str(input_series)\n", | |
| "\n", | |
| "df1.apply(identify, axis=1)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 9, | |
| "id": "c679aaee", | |
| "metadata": { | |
| "scrolled": true | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "0 4\n", | |
| "1 16\n", | |
| "2 36\n", | |
| "dtype: int64" | |
| ] | |
| }, | |
| "execution_count": 9, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "df1.apply(lambda s: (s[\"A\"] + 1) * s[\"B\"], axis=1)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 10, | |
| "id": "d05a759b", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "result = df1.apply(lambda s: pd.Series({\"C\": s.A*2, \"D\": s.B+1}), axis=1)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 11, | |
| "id": "91e11df0", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "pandas.core.frame.DataFrame" | |
| ] | |
| }, | |
| "execution_count": 11, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "type(result)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 12, | |
| "id": "27324fd3", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>C</th>\n", | |
| " <th>D</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>2</td>\n", | |
| " <td>3</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>6</td>\n", | |
| " <td>5</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>10</td>\n", | |
| " <td>7</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " C D\n", | |
| "0 2 3\n", | |
| "1 6 5\n", | |
| "2 10 7" | |
| ] | |
| }, | |
| "execution_count": 12, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "result" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "7a069ca7", | |
| "metadata": {}, | |
| "source": [ | |
| "If the function passed to `apply` returns a scalar, you get a `Series`. If it returns a `Series`, you get a `DataFrame`." | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "2620785c", | |
| "metadata": {}, | |
| "source": [ | |
| "# A more complex set\n", | |
| "\n", | |
| "This is a modified version of this StackOverflow answer: https://stackoverflow.com/a/62159000/642372" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 13, | |
| "id": "e2c83114", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "header = [\"year\", \"name\", \"miles\"]\n", | |
| "data = [\n", | |
| " ('2010', 'Paul', 6.0),\n", | |
| " ('2010', 'Paul', 4.0),\n", | |
| " ('2010', 'Paul', np.nan),\n", | |
| " ('2011', 'Paul', 7.0),\n", | |
| " ('2011', 'Paul', 8.0),\n", | |
| " ('2011', 'Paul', np.nan),\n", | |
| " ('2012', 'Paul', 9.0),\n", | |
| " ('2012', 'Paul', 10.9),\n", | |
| " ('2012', 'Paul', 12.0),\n", | |
| "]\n", | |
| "df2 = pd.DataFrame(data, columns=header)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 14, | |
| "id": "e39b85fa", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>year</th>\n", | |
| " <th>name</th>\n", | |
| " <th>miles</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>2010</td>\n", | |
| " <td>Paul</td>\n", | |
| " <td>6.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>2010</td>\n", | |
| " <td>Paul</td>\n", | |
| " <td>4.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>2010</td>\n", | |
| " <td>Paul</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>2011</td>\n", | |
| " <td>Paul</td>\n", | |
| " <td>7.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>2011</td>\n", | |
| " <td>Paul</td>\n", | |
| " <td>8.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>5</th>\n", | |
| " <td>2011</td>\n", | |
| " <td>Paul</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>6</th>\n", | |
| " <td>2012</td>\n", | |
| " <td>Paul</td>\n", | |
| " <td>9.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>7</th>\n", | |
| " <td>2012</td>\n", | |
| " <td>Paul</td>\n", | |
| " <td>10.9</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>8</th>\n", | |
| " <td>2012</td>\n", | |
| " <td>Paul</td>\n", | |
| " <td>12.0</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " year name miles\n", | |
| "0 2010 Paul 6.0\n", | |
| "1 2010 Paul 4.0\n", | |
| "2 2010 Paul NaN\n", | |
| "3 2011 Paul 7.0\n", | |
| "4 2011 Paul 8.0\n", | |
| "5 2011 Paul NaN\n", | |
| "6 2012 Paul 9.0\n", | |
| "7 2012 Paul 10.9\n", | |
| "8 2012 Paul 12.0" | |
| ] | |
| }, | |
| "execution_count": 14, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "df2" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 15, | |
| "id": "9a63e832", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>miles</th>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>year</th>\n", | |
| " <th></th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>2010</th>\n", | |
| " <td>5.000000</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2011</th>\n", | |
| " <td>7.500000</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2012</th>\n", | |
| " <td>10.633333</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " miles\n", | |
| "year \n", | |
| "2010 5.000000\n", | |
| "2011 7.500000\n", | |
| "2012 10.633333" | |
| ] | |
| }, | |
| "execution_count": 15, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "df2.groupby(['year']).mean(numeric_only=True)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 16, | |
| "id": "82730846", | |
| "metadata": { | |
| "scrolled": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "df2['miles_new'] = df2['miles'].fillna(\n", | |
| " df2.groupby(['year'])['miles'].transform('mean')\n", | |
| ")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 17, | |
| "id": "0e10c31d", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>year</th>\n", | |
| " <th>name</th>\n", | |
| " <th>miles</th>\n", | |
| " <th>miles_new</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>2010</td>\n", | |
| " <td>Paul</td>\n", | |
| " <td>6.0</td>\n", | |
| " <td>6.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>2010</td>\n", | |
| " <td>Paul</td>\n", | |
| " <td>4.0</td>\n", | |
| " <td>4.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>2010</td>\n", | |
| " <td>Paul</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>5.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>2011</td>\n", | |
| " <td>Paul</td>\n", | |
| " <td>7.0</td>\n", | |
| " <td>7.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>2011</td>\n", | |
| " <td>Paul</td>\n", | |
| " <td>8.0</td>\n", | |
| " <td>8.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>5</th>\n", | |
| " <td>2011</td>\n", | |
| " <td>Paul</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>7.5</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>6</th>\n", | |
| " <td>2012</td>\n", | |
| " <td>Paul</td>\n", | |
| " <td>9.0</td>\n", | |
| " <td>9.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>7</th>\n", | |
| " <td>2012</td>\n", | |
| " <td>Paul</td>\n", | |
| " <td>10.9</td>\n", | |
| " <td>10.9</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>8</th>\n", | |
| " <td>2012</td>\n", | |
| " <td>Paul</td>\n", | |
| " <td>12.0</td>\n", | |
| " <td>12.0</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " year name miles miles_new\n", | |
| "0 2010 Paul 6.0 6.0\n", | |
| "1 2010 Paul 4.0 4.0\n", | |
| "2 2010 Paul NaN 5.0\n", | |
| "3 2011 Paul 7.0 7.0\n", | |
| "4 2011 Paul 8.0 8.0\n", | |
| "5 2011 Paul NaN 7.5\n", | |
| "6 2012 Paul 9.0 9.0\n", | |
| "7 2012 Paul 10.9 10.9\n", | |
| "8 2012 Paul 12.0 12.0" | |
| ] | |
| }, | |
| "execution_count": 17, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "df2" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 18, | |
| "id": "c4574c57", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "defaults = {\"2010\": 5.1, \"2011\": 8.1, \"2012\": 10.6}" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 19, | |
| "id": "2717675a", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "0 (Series, year 2010\\nname Paul\\...\n", | |
| "1 (Series, year 2010\\nname Paul\\...\n", | |
| "2 (Series, year 2010\\nname Paul\\...\n", | |
| "3 (Series, year 2011\\nname Paul\\...\n", | |
| "4 (Series, year 2011\\nname Paul\\...\n", | |
| "5 (Series, year 2011\\nname Paul\\...\n", | |
| "6 (Series, year 2012\\nname Paul\\...\n", | |
| "7 (Series, year 2012\\nname Paul\\...\n", | |
| "8 (Series, year 2012\\nname Paul\\...\n", | |
| "dtype: object" | |
| ] | |
| }, | |
| "execution_count": 19, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "df2.apply(identify, axis=1)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 20, | |
| "id": "5e2160a9", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>year</th>\n", | |
| " <th>name</th>\n", | |
| " <th>miles</th>\n", | |
| " <th>miles_new</th>\n", | |
| " <th>A</th>\n", | |
| " <th>B</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>2010</td>\n", | |
| " <td>Paul</td>\n", | |
| " <td>6.0</td>\n", | |
| " <td>6.0</td>\n", | |
| " <td>2010</td>\n", | |
| " <td>6.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>2010</td>\n", | |
| " <td>Paul</td>\n", | |
| " <td>4.0</td>\n", | |
| " <td>4.0</td>\n", | |
| " <td>2010</td>\n", | |
| " <td>4.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>2010</td>\n", | |
| " <td>Paul</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>5.0</td>\n", | |
| " <td>2010</td>\n", | |
| " <td>5.1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>2011</td>\n", | |
| " <td>Paul</td>\n", | |
| " <td>7.0</td>\n", | |
| " <td>7.0</td>\n", | |
| " <td>2011</td>\n", | |
| " <td>7.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>2011</td>\n", | |
| " <td>Paul</td>\n", | |
| " <td>8.0</td>\n", | |
| " <td>8.0</td>\n", | |
| " <td>2011</td>\n", | |
| " <td>8.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>5</th>\n", | |
| " <td>2011</td>\n", | |
| " <td>Paul</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>7.5</td>\n", | |
| " <td>2011</td>\n", | |
| " <td>8.1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>6</th>\n", | |
| " <td>2012</td>\n", | |
| " <td>Paul</td>\n", | |
| " <td>9.0</td>\n", | |
| " <td>9.0</td>\n", | |
| " <td>2012</td>\n", | |
| " <td>9.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>7</th>\n", | |
| " <td>2012</td>\n", | |
| " <td>Paul</td>\n", | |
| " <td>10.9</td>\n", | |
| " <td>10.9</td>\n", | |
| " <td>2012</td>\n", | |
| " <td>10.9</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>8</th>\n", | |
| " <td>2012</td>\n", | |
| " <td>Paul</td>\n", | |
| " <td>12.0</td>\n", | |
| " <td>12.0</td>\n", | |
| " <td>2012</td>\n", | |
| " <td>12.0</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " year name miles miles_new A B\n", | |
| "0 2010 Paul 6.0 6.0 2010 6.0\n", | |
| "1 2010 Paul 4.0 4.0 2010 4.0\n", | |
| "2 2010 Paul NaN 5.0 2010 5.1\n", | |
| "3 2011 Paul 7.0 7.0 2011 7.0\n", | |
| "4 2011 Paul 8.0 8.0 2011 8.0\n", | |
| "5 2011 Paul NaN 7.5 2011 8.1\n", | |
| "6 2012 Paul 9.0 9.0 2012 9.0\n", | |
| "7 2012 Paul 10.9 10.9 2012 10.9\n", | |
| "8 2012 Paul 12.0 12.0 2012 12.0" | |
| ] | |
| }, | |
| "execution_count": 20, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "pd.concat(\n", | |
| " [\n", | |
| " df2, \n", | |
| " df2.apply(\n", | |
| " lambda s: pd.Series(\n", | |
| " {\n", | |
| " \"A\": s[\"year\"], \n", | |
| " \"B\": defaults[s[\"year\"]] if np.isnan(s[\"miles\"]) else s[\"miles\"]}\n", | |
| " ), \n", | |
| " axis=1\n", | |
| " )\n", | |
| " ], \n", | |
| " axis=1\n", | |
| ")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "5cc74700", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3 (ipykernel)", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.10.8" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 5 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment