-
-
Save olivx/64d657ff67ecfcc38917f5f34cc1495b to your computer and use it in GitHub Desktop.
Annotations of Pandas DataFrame
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import pandas as pd\n", | |
| "import timeit\n", | |
| "import xlrd\n", | |
| "from django.contrib.auth.models import User" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "tic = timeit.default_timer()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "filename = '/tmp/example.xlsx'" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "df = pd.read_excel(filename)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 20, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>NAME</th>\n", | |
| " <th>EMAIL</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>2011</th>\n", | |
| " <td>Angela Brown</td>\n", | |
| " <td>angela-brown@email.com</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2824</th>\n", | |
| " <td>Angela Brown</td>\n", | |
| " <td>angela-brown@email.com</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3599</th>\n", | |
| " <td>Angela Jones</td>\n", | |
| " <td>angela-jones@email.com</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1227</th>\n", | |
| " <td>Angela Jones</td>\n", | |
| " <td>angela-jones@email.com</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2605</th>\n", | |
| " <td>Anthony Evans</td>\n", | |
| " <td>anthony-evans@email.com</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " NAME EMAIL\n", | |
| "2011 Angela Brown angela-brown@email.com\n", | |
| "2824 Angela Brown angela-brown@email.com\n", | |
| "3599 Angela Jones angela-jones@email.com\n", | |
| "1227 Angela Jones angela-jones@email.com\n", | |
| "2605 Anthony Evans anthony-evans@email.com" | |
| ] | |
| }, | |
| "execution_count": 20, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "email = df['EMAIL']\n", | |
| "# Retornando os itens duplicados\n", | |
| "df[email.isin(email[email.duplicated()])].sort_values(by=['EMAIL']).head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>index</th>\n", | |
| " <th>NAME</th>\n", | |
| " <th>EMAIL</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>4957</th>\n", | |
| " <td>4995</td>\n", | |
| " <td>Helen Mcallister</td>\n", | |
| " <td>helen-mcallister@email.com</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4958</th>\n", | |
| " <td>4996</td>\n", | |
| " <td>Scott Hall</td>\n", | |
| " <td>scott-hall@email.com</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4959</th>\n", | |
| " <td>4997</td>\n", | |
| " <td>Dawn Dowling</td>\n", | |
| " <td>dawn-dowling@email.com</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4960</th>\n", | |
| " <td>4998</td>\n", | |
| " <td>John Campbell</td>\n", | |
| " <td>john-campbell@email.com</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4961</th>\n", | |
| " <td>4999</td>\n", | |
| " <td>Barbara Alldredge</td>\n", | |
| " <td>barbara-alldredge@email.com</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " index NAME EMAIL\n", | |
| "4957 4995 Helen Mcallister helen-mcallister@email.com\n", | |
| "4958 4996 Scott Hall scott-hall@email.com\n", | |
| "4959 4997 Dawn Dowling dawn-dowling@email.com\n", | |
| "4960 4998 John Campbell john-campbell@email.com\n", | |
| "4961 4999 Barbara Alldredge barbara-alldredge@email.com" | |
| ] | |
| }, | |
| "execution_count": 6, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# Removendo os itens duplicados\n", | |
| "dffinal = df.drop_duplicates('EMAIL').reset_index()\n", | |
| "dffinal.tail()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "def create_user(new_users):\n", | |
| " # Separa full_name em first_name e last_name\n", | |
| " users = []\n", | |
| " for new_user in new_users:\n", | |
| " full_name = new_user['full_name']\n", | |
| " email = new_user['email']\n", | |
| " first_name = full_name.split()[0]\n", | |
| " last_name = full_name.split()[1:]\n", | |
| " last_name = ' '.join(last_name)\n", | |
| " username = email\n", | |
| " user = User(\n", | |
| " first_name=first_name,\n", | |
| " last_name=last_name,\n", | |
| " email=email,\n", | |
| " username=username\n", | |
| " )\n", | |
| " users.append(user)\n", | |
| " return users" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "def import_users(new_users):\n", | |
| " User.objects.bulk_create(new_users)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 9, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "new_users = []\n", | |
| "emails = []" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 23, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "emails = dffinal['EMAIL'].values.tolist()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 29, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "exist_users = User.objects.filter(email__in=emails).values_list('email', flat=True)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 13, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Manuel Marshall manuel-marshall@email.com\n", | |
| "Joy Flores joy-flores@email.com\n", | |
| "Nicholas Waits nicholas-waits@email.com\n", | |
| "Laura Cross laura-cross@email.com\n", | |
| "Felipe Garland felipe-garland@email.com\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "for row in dffinal.head().itertuples():\n", | |
| " print(row.NAME, row.EMAIL)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 14, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "for row in dffinal.itertuples():\n", | |
| " full_name = row.NAME\n", | |
| " email = row.EMAIL\n", | |
| " if email not in exist_users:\n", | |
| " data = dict(full_name=full_name, email=email)\n", | |
| " new_users.append(data)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 15, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "new_users_ = create_user(new_users)\n", | |
| "import_users(new_users_)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 16, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "17 existentes\n", | |
| "4945 novos\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "print(len(exist_users), 'existentes')\n", | |
| "print(len(new_users), 'novos')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 17, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "toc = timeit.default_timer()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 18, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "2.1476561470044544" | |
| ] | |
| }, | |
| "execution_count": 18, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "toc - tic" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Django Shell-Plus", | |
| "language": "python", | |
| "name": "django_extensions" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.5.2" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 2 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment