Created
September 15, 2016 21:10
-
-
Save tomokishii/f051de44ecf8844fb74cacb36226a65f to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# Pandas grouping operation\n", | |
| "(ref.) http://pandas.pydata.org/pandas-docs/stable/cookbook.html#grouping" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# packages\n", | |
| "import numpy as np\n", | |
| "import pandas as pd" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "df = pd.DataFrame({'animal': 'cat dog cat fish dog cat cat'.split(),\n", | |
| " 'size': list('SSMMMLL'),\n", | |
| " 'weight': [8, 10, 11, 1, 20, 12, 12],\n", | |
| " 'adult' : [False] * 5 + [True] * 2})" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>adult</th>\n", | |
| " <th>animal</th>\n", | |
| " <th>size</th>\n", | |
| " <th>weight</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>False</td>\n", | |
| " <td>cat</td>\n", | |
| " <td>S</td>\n", | |
| " <td>8</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>False</td>\n", | |
| " <td>dog</td>\n", | |
| " <td>S</td>\n", | |
| " <td>10</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>False</td>\n", | |
| " <td>cat</td>\n", | |
| " <td>M</td>\n", | |
| " <td>11</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>False</td>\n", | |
| " <td>fish</td>\n", | |
| " <td>M</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>False</td>\n", | |
| " <td>dog</td>\n", | |
| " <td>M</td>\n", | |
| " <td>20</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>5</th>\n", | |
| " <td>True</td>\n", | |
| " <td>cat</td>\n", | |
| " <td>L</td>\n", | |
| " <td>12</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>6</th>\n", | |
| " <td>True</td>\n", | |
| " <td>cat</td>\n", | |
| " <td>L</td>\n", | |
| " <td>12</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " adult animal size weight\n", | |
| "0 False cat S 8\n", | |
| "1 False dog S 10\n", | |
| "2 False cat M 11\n", | |
| "3 False fish M 1\n", | |
| "4 False dog M 20\n", | |
| "5 True cat L 12\n", | |
| "6 True cat L 12" | |
| ] | |
| }, | |
| "execution_count": 3, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "df" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "animal\n", | |
| "cat L\n", | |
| "dog M\n", | |
| "fish M\n", | |
| "dtype: object" | |
| ] | |
| }, | |
| "execution_count": 4, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# List the size of the animals with the highest weight.\n", | |
| "df.groupby('animal').apply(lambda subf: subf['size'][subf['weight'].idxmax()])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>adult</th>\n", | |
| " <th>animal</th>\n", | |
| " <th>size</th>\n", | |
| " <th>weight</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>False</td>\n", | |
| " <td>cat</td>\n", | |
| " <td>S</td>\n", | |
| " <td>8</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>False</td>\n", | |
| " <td>cat</td>\n", | |
| " <td>M</td>\n", | |
| " <td>11</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>5</th>\n", | |
| " <td>True</td>\n", | |
| " <td>cat</td>\n", | |
| " <td>L</td>\n", | |
| " <td>12</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>6</th>\n", | |
| " <td>True</td>\n", | |
| " <td>cat</td>\n", | |
| " <td>L</td>\n", | |
| " <td>12</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " adult animal size weight\n", | |
| "0 False cat S 8\n", | |
| "2 False cat M 11\n", | |
| "5 True cat L 12\n", | |
| "6 True cat L 12" | |
| ] | |
| }, | |
| "execution_count": 5, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# get_group example\n", | |
| "gb = df.groupby(['animal'])\n", | |
| "gb.get_group('cat')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 10, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "df = pd.DataFrame({'A' : [1, 1, 2, 2], 'B' : [1, -1, 1, 2]})\n", | |
| "gb = df.groupby('A')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 11, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>B</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>1.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>1.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>1.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>2.0</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " B\n", | |
| "0 1.0\n", | |
| "1 1.0\n", | |
| "2 1.0\n", | |
| "3 2.0" | |
| ] | |
| }, | |
| "execution_count": 11, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# replace a cell value\n", | |
| "def replace(g):\n", | |
| " mask = g < 0\n", | |
| " g.loc[mask] = g[~mask].mean()\n", | |
| " \n", | |
| " return g\n", | |
| "\n", | |
| "gb.transform(replace)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 12, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>code</th>\n", | |
| " <th>data</th>\n", | |
| " <th>flag</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>foo</td>\n", | |
| " <td>0.16</td>\n", | |
| " <td>False</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>bar</td>\n", | |
| " <td>-0.21</td>\n", | |
| " <td>True</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>baz</td>\n", | |
| " <td>0.33</td>\n", | |
| " <td>False</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>foo</td>\n", | |
| " <td>0.45</td>\n", | |
| " <td>True</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>bar</td>\n", | |
| " <td>-0.59</td>\n", | |
| " <td>False</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>5</th>\n", | |
| " <td>baz</td>\n", | |
| " <td>0.62</td>\n", | |
| " <td>True</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " code data flag\n", | |
| "0 foo 0.16 False\n", | |
| "1 bar -0.21 True\n", | |
| "2 baz 0.33 False\n", | |
| "3 foo 0.45 True\n", | |
| "4 bar -0.59 False\n", | |
| "5 baz 0.62 True" | |
| ] | |
| }, | |
| "execution_count": 12, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# Sort groups by aggregated data\n", | |
| "df = pd.DataFrame({'code': ['foo', 'bar', 'baz'] * 2,\n", | |
| " 'data': [0.16, -0.21, 0.33, 0.45, -0.59, 0.62],\n", | |
| " 'flag': [False, True] * 3})\n", | |
| "df" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 13, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>code</th>\n", | |
| " <th>data</th>\n", | |
| " <th>flag</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>bar</td>\n", | |
| " <td>-0.21</td>\n", | |
| " <td>True</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>bar</td>\n", | |
| " <td>-0.59</td>\n", | |
| " <td>False</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>foo</td>\n", | |
| " <td>0.16</td>\n", | |
| " <td>False</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>foo</td>\n", | |
| " <td>0.45</td>\n", | |
| " <td>True</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>baz</td>\n", | |
| " <td>0.33</td>\n", | |
| " <td>False</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>5</th>\n", | |
| " <td>baz</td>\n", | |
| " <td>0.62</td>\n", | |
| " <td>True</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " code data flag\n", | |
| "1 bar -0.21 True\n", | |
| "4 bar -0.59 False\n", | |
| "0 foo 0.16 False\n", | |
| "3 foo 0.45 True\n", | |
| "2 baz 0.33 False\n", | |
| "5 baz 0.62 True" | |
| ] | |
| }, | |
| "execution_count": 13, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "code_groups = df.groupby('code')\n", | |
| "agg_n_sort_order = code_groups[['data']].transform(sum).sort_values(by='data')\n", | |
| "sorted_df = df.ix[agg_n_sort_order.index]\n", | |
| "sorted_df" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 14, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>Color</th>\n", | |
| " <th>Value</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>Red</td>\n", | |
| " <td>100</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>Red</td>\n", | |
| " <td>150</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>Red</td>\n", | |
| " <td>50</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>Blue</td>\n", | |
| " <td>50</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " Color Value\n", | |
| "0 Red 100\n", | |
| "1 Red 150\n", | |
| "2 Red 50\n", | |
| "3 Blue 50" | |
| ] | |
| }, | |
| "execution_count": 14, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "df = pd.DataFrame({'Color': 'Red Red Red Blue'.split(),\n", | |
| " 'Value': [100, 150, 50, 50]})\n", | |
| "df" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 15, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>Color</th>\n", | |
| " <th>Value</th>\n", | |
| " <th>Counts</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>Red</td>\n", | |
| " <td>100</td>\n", | |
| " <td>3</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>Red</td>\n", | |
| " <td>150</td>\n", | |
| " <td>3</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>Red</td>\n", | |
| " <td>50</td>\n", | |
| " <td>3</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>Blue</td>\n", | |
| " <td>50</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " Color Value Counts\n", | |
| "0 Red 100 3\n", | |
| "1 Red 150 3\n", | |
| "2 Red 50 3\n", | |
| "3 Blue 50 1" | |
| ] | |
| }, | |
| "execution_count": 15, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "df['Counts'] = df.groupby(['Color']).transform(len)\n", | |
| "df" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 18, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "Color\n", | |
| "Blue 1\n", | |
| "Red 3\n", | |
| "Name: Counts, dtype: int64" | |
| ] | |
| }, | |
| "execution_count": 18, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "df.groupby('Color')['Counts'].mean()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 22, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>Color</th>\n", | |
| " <th>Value</th>\n", | |
| " <th>Counts</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>Red</td>\n", | |
| " <td>100</td>\n", | |
| " <td>3</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>Red</td>\n", | |
| " <td>150</td>\n", | |
| " <td>3</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>Red</td>\n", | |
| " <td>50</td>\n", | |
| " <td>3</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " Color Value Counts\n", | |
| "0 Red 100 3\n", | |
| "1 Red 150 3\n", | |
| "2 Red 50 3" | |
| ] | |
| }, | |
| "execution_count": 22, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "df[df['Color'] == 'Red']" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 23, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "3" | |
| ] | |
| }, | |
| "execution_count": 23, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "df_red = df[df['Color']=='Red']\n", | |
| "len(df_red)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 2", | |
| "language": "python", | |
| "name": "python2" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 2 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython2", | |
| "version": "2.7.11" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 0 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment