Last active
March 30, 2022 16:22
-
-
Save walkerh/a697f0d530a8836c2daa8d28809c7892 to your computer and use it in GitHub Desktop.
Combining Aggregation With Column Update
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "id": "357ab29a", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import pandas as pd" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "id": "49cc1fbf", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>A</th>\n", | |
| " <th>B</th>\n", | |
| " <th>C</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>m</th>\n", | |
| " <td>a</td>\n", | |
| " <td>1</td>\n", | |
| " <td>a</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>n</th>\n", | |
| " <td>a</td>\n", | |
| " <td>2</td>\n", | |
| " <td>b</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>o</th>\n", | |
| " <td>a</td>\n", | |
| " <td>3</td>\n", | |
| " <td>c</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>p</th>\n", | |
| " <td>a</td>\n", | |
| " <td>4</td>\n", | |
| " <td>d</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>q</th>\n", | |
| " <td>b</td>\n", | |
| " <td>1</td>\n", | |
| " <td>e</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>r</th>\n", | |
| " <td>b</td>\n", | |
| " <td>2</td>\n", | |
| " <td>f</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>s</th>\n", | |
| " <td>b</td>\n", | |
| " <td>3</td>\n", | |
| " <td>g</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " A B C\n", | |
| "m a 1 a\n", | |
| "n a 2 b\n", | |
| "o a 3 c\n", | |
| "p a 4 d\n", | |
| "q b 1 e\n", | |
| "r b 2 f\n", | |
| "s b 3 g" | |
| ] | |
| }, | |
| "execution_count": 2, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "df0 = pd.DataFrame(\n", | |
| " dict(\n", | |
| " A=iter(\"aaaabbb\"),\n", | |
| " B=[1, 2, 3, 4, 1, 2, 3],\n", | |
| " C=iter(\"abcdefg\"),\n", | |
| " ),\n", | |
| " index=iter(\"mnopqrs\"),\n", | |
| ")\n", | |
| "# Note:\n", | |
| "# - the custom index\n", | |
| "# - C has a unique value for every row\n", | |
| "df0" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "id": "b1212a70", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "g = df0.groupby(\"A\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "id": "49a8f037", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "A\n", | |
| "a 4\n", | |
| "b 3\n", | |
| "Name: B, dtype: int64" | |
| ] | |
| }, | |
| "execution_count": 4, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "m = g[\"B\"].max()\n", | |
| "m" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "id": "471f717c", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "pandas.core.series.Series" | |
| ] | |
| }, | |
| "execution_count": 5, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "type(m)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "id": "fd879d6a", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>A</th>\n", | |
| " <th>B</th>\n", | |
| " <th>C</th>\n", | |
| " <th>B_max</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>m</th>\n", | |
| " <td>a</td>\n", | |
| " <td>1</td>\n", | |
| " <td>a</td>\n", | |
| " <td>4</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>n</th>\n", | |
| " <td>a</td>\n", | |
| " <td>2</td>\n", | |
| " <td>b</td>\n", | |
| " <td>4</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>o</th>\n", | |
| " <td>a</td>\n", | |
| " <td>3</td>\n", | |
| " <td>c</td>\n", | |
| " <td>4</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>p</th>\n", | |
| " <td>a</td>\n", | |
| " <td>4</td>\n", | |
| " <td>d</td>\n", | |
| " <td>4</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>q</th>\n", | |
| " <td>b</td>\n", | |
| " <td>1</td>\n", | |
| " <td>e</td>\n", | |
| " <td>3</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>r</th>\n", | |
| " <td>b</td>\n", | |
| " <td>2</td>\n", | |
| " <td>f</td>\n", | |
| " <td>3</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>s</th>\n", | |
| " <td>b</td>\n", | |
| " <td>3</td>\n", | |
| " <td>g</td>\n", | |
| " <td>3</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " A B C B_max\n", | |
| "m a 1 a 4\n", | |
| "n a 2 b 4\n", | |
| "o a 3 c 4\n", | |
| "p a 4 d 4\n", | |
| "q b 1 e 3\n", | |
| "r b 2 f 3\n", | |
| "s b 3 g 3" | |
| ] | |
| }, | |
| "execution_count": 6, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "df1 = df0.join(m, on=\"A\", rsuffix=\"_max\")\n", | |
| "df1" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "id": "eb00c973", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>A</th>\n", | |
| " <th>B</th>\n", | |
| " <th>C</th>\n", | |
| " <th>B_max</th>\n", | |
| " <th>must_change</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>m</th>\n", | |
| " <td>a</td>\n", | |
| " <td>1</td>\n", | |
| " <td>a</td>\n", | |
| " <td>4</td>\n", | |
| " <td>False</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>n</th>\n", | |
| " <td>a</td>\n", | |
| " <td>2</td>\n", | |
| " <td>b</td>\n", | |
| " <td>4</td>\n", | |
| " <td>False</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>o</th>\n", | |
| " <td>a</td>\n", | |
| " <td>3</td>\n", | |
| " <td>c</td>\n", | |
| " <td>4</td>\n", | |
| " <td>False</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>p</th>\n", | |
| " <td>a</td>\n", | |
| " <td>4</td>\n", | |
| " <td>d</td>\n", | |
| " <td>4</td>\n", | |
| " <td>True</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>q</th>\n", | |
| " <td>b</td>\n", | |
| " <td>1</td>\n", | |
| " <td>e</td>\n", | |
| " <td>3</td>\n", | |
| " <td>False</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>r</th>\n", | |
| " <td>b</td>\n", | |
| " <td>2</td>\n", | |
| " <td>f</td>\n", | |
| " <td>3</td>\n", | |
| " <td>False</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>s</th>\n", | |
| " <td>b</td>\n", | |
| " <td>3</td>\n", | |
| " <td>g</td>\n", | |
| " <td>3</td>\n", | |
| " <td>True</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " A B C B_max must_change\n", | |
| "m a 1 a 4 False\n", | |
| "n a 2 b 4 False\n", | |
| "o a 3 c 4 False\n", | |
| "p a 4 d 4 True\n", | |
| "q b 1 e 3 False\n", | |
| "r b 2 f 3 False\n", | |
| "s b 3 g 3 True" | |
| ] | |
| }, | |
| "execution_count": 7, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "df1[\"must_change\"] = df1.B == df1.B_max\n", | |
| "df1" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "id": "e6c78a64", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>C</th>\n", | |
| " <th>must_change</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>m</th>\n", | |
| " <td>a</td>\n", | |
| " <td>False</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>n</th>\n", | |
| " <td>b</td>\n", | |
| " <td>False</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>o</th>\n", | |
| " <td>c</td>\n", | |
| " <td>False</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>p</th>\n", | |
| " <td>d</td>\n", | |
| " <td>True</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>q</th>\n", | |
| " <td>e</td>\n", | |
| " <td>False</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>r</th>\n", | |
| " <td>f</td>\n", | |
| " <td>False</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>s</th>\n", | |
| " <td>g</td>\n", | |
| " <td>True</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " C must_change\n", | |
| "m a False\n", | |
| "n b False\n", | |
| "o c False\n", | |
| "p d True\n", | |
| "q e False\n", | |
| "r f False\n", | |
| "s g True" | |
| ] | |
| }, | |
| "execution_count": 8, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "sub = df1[[\"C\", \"must_change\"]]\n", | |
| "sub" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 9, | |
| "id": "45659bd6", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[('m', 'a', False),\n", | |
| " ('n', 'b', False),\n", | |
| " ('o', 'c', False),\n", | |
| " ('p', 'd', True),\n", | |
| " ('q', 'e', False),\n", | |
| " ('r', 'f', False),\n", | |
| " ('s', 'g', True)]" | |
| ] | |
| }, | |
| "execution_count": 9, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "[(i, value, must_change) for i, value, must_change in sub.to_records()]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 10, | |
| "id": "3b462905", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "['a', 'b', 'c', 'y', 'e', 'f', 'y']" | |
| ] | |
| }, | |
| "execution_count": 10, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "[\"y\" if must_change else value for i, value, must_change in sub.to_records()]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 11, | |
| "id": "dc40cf69", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>A</th>\n", | |
| " <th>B</th>\n", | |
| " <th>C</th>\n", | |
| " <th>B_max</th>\n", | |
| " <th>must_change</th>\n", | |
| " <th>C_new</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>m</th>\n", | |
| " <td>a</td>\n", | |
| " <td>1</td>\n", | |
| " <td>a</td>\n", | |
| " <td>4</td>\n", | |
| " <td>False</td>\n", | |
| " <td>a</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>n</th>\n", | |
| " <td>a</td>\n", | |
| " <td>2</td>\n", | |
| " <td>b</td>\n", | |
| " <td>4</td>\n", | |
| " <td>False</td>\n", | |
| " <td>b</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>o</th>\n", | |
| " <td>a</td>\n", | |
| " <td>3</td>\n", | |
| " <td>c</td>\n", | |
| " <td>4</td>\n", | |
| " <td>False</td>\n", | |
| " <td>c</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>p</th>\n", | |
| " <td>a</td>\n", | |
| " <td>4</td>\n", | |
| " <td>d</td>\n", | |
| " <td>4</td>\n", | |
| " <td>True</td>\n", | |
| " <td>y</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>q</th>\n", | |
| " <td>b</td>\n", | |
| " <td>1</td>\n", | |
| " <td>e</td>\n", | |
| " <td>3</td>\n", | |
| " <td>False</td>\n", | |
| " <td>e</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>r</th>\n", | |
| " <td>b</td>\n", | |
| " <td>2</td>\n", | |
| " <td>f</td>\n", | |
| " <td>3</td>\n", | |
| " <td>False</td>\n", | |
| " <td>f</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>s</th>\n", | |
| " <td>b</td>\n", | |
| " <td>3</td>\n", | |
| " <td>g</td>\n", | |
| " <td>3</td>\n", | |
| " <td>True</td>\n", | |
| " <td>y</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " A B C B_max must_change C_new\n", | |
| "m a 1 a 4 False a\n", | |
| "n a 2 b 4 False b\n", | |
| "o a 3 c 4 False c\n", | |
| "p a 4 d 4 True y\n", | |
| "q b 1 e 3 False e\n", | |
| "r b 2 f 3 False f\n", | |
| "s b 3 g 3 True y" | |
| ] | |
| }, | |
| "execution_count": 11, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# Use a Series to collect the results of a generator expression.\n", | |
| "# That way we don't waste memory constructing a list object.\n", | |
| "# Using a Series requires us to handle the possibility of a non-trivial index in the original DataFrame.\n", | |
| "df1[\"C_new\"] = pd.Series(\n", | |
| " (\"y\" if must_change else value for _, value, must_change in sub.to_records()),\n", | |
| " index=sub.index,\n", | |
| ")\n", | |
| "df1" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3 (ipykernel)", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.10.2" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 5 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment