Skip to content

Instantly share code, notes, and snippets.

@thorwhalen
Created September 9, 2022 12:38
Show Gist options
  • Select an option

  • Save thorwhalen/590530188c4870566b5ef90fbc911838 to your computer and use it in GitHub Desktop.

Select an option

Save thorwhalen/590530188c4870566b5ef90fbc911838 to your computer and use it in GitHub Desktop.
Scrape Aix club addresses
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"id": "2b5a9c67",
"metadata": {
"toc": true
},
"source": [
"<h1>Table of Contents<span class=\"tocSkip\"></span></h1>\n",
"<div class=\"toc\"><ul class=\"toc-item\"><li><span><a href=\"#Get-the-data\" data-toc-modified-id=\"Get-the-data-1\"><span class=\"toc-item-num\">1&nbsp;&nbsp;</span>Get the data</a></span></li><li><span><a href=\"#Parse-out-base-info\" data-toc-modified-id=\"Parse-out-base-info-2\"><span class=\"toc-item-num\">2&nbsp;&nbsp;</span>Parse out base info</a></span></li><li><span><a href=\"#Extract-info\" data-toc-modified-id=\"Extract-info-3\"><span class=\"toc-item-num\">3&nbsp;&nbsp;</span>Extract info</a></span></li><li><span><a href=\"#Make-a-table\" data-toc-modified-id=\"Make-a-table-4\"><span class=\"toc-item-num\">4&nbsp;&nbsp;</span>Make a table</a></span></li></ul></div>"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "da120aac",
"metadata": {
"ExecuteTime": {
"end_time": "2022-09-09T11:30:08.428299Z",
"start_time": "2022-09-09T11:30:08.358621Z"
}
},
"outputs": [],
"source": [
"\n",
"# frequently used builtins\n",
"import os\n",
"from collections import Counter, defaultdict\n",
"\n",
"# frequently used data pkgs\n",
"import numpy as np\n",
"import pandas as pd\n",
"\t\t\t\n",
"# careful import of matplotlib\n",
"try:\n",
"\timport matplotlib.pyplot as plt\n",
"except ModuleNotFoundError as e: # to catch ModuleNotFoundError: No module named '_tkinter'\n",
"\timport matplotlib\n",
"\tmatplotlib.use('agg')\n",
"\timport matplotlib.pyplot as plt\n",
"\n",
"# some magic stuff\n",
"%reload_ext autoreload\n",
"%autoreload 2\n",
"\n",
"%matplotlib inline\n",
"%config InlineBackend.figure_format='retina'\n",
"\n",
"\n"
]
},
{
"cell_type": "markdown",
"id": "6eac654d",
"metadata": {},
"source": [
"## Get the data"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "ebb9b252",
"metadata": {
"ExecuteTime": {
"end_time": "2022-09-09T11:30:52.289730Z",
"start_time": "2022-09-09T11:30:52.219819Z"
}
},
"outputs": [],
"source": [
"from graze import graze, url_to_filepath"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "e511ccb6",
"metadata": {
"ExecuteTime": {
"end_time": "2022-09-09T11:30:52.516645Z",
"start_time": "2022-09-09T11:30:52.392783Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"230440"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# download the directory page and decode the returned bytes into text\n",
"url = 'https://www.aixenprovence.fr/Annuaire-des-clubs-et-associations-sportives'\n",
"page_bytes = graze(url)\n",
"html = page_bytes.decode()\n",
"len(html)"
]
},
{
"cell_type": "markdown",
"id": "48bfe823",
"metadata": {},
"source": [
"## Parse out base info"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "4ed77ff9",
"metadata": {
"ExecuteTime": {
"end_time": "2022-09-09T11:31:09.708409Z",
"start_time": "2022-09-09T11:31:09.477763Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"len(sections)=61\n"
]
},
{
"data": {
"text/plain": [
"284"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import bs4\n",
"from itertools import groupby, chain\n",
"from operator import itemgetter\n",
"\n",
"\n",
"def is_an_item(x):\n",
"    \"\"\"Tell whether ``x`` is a tag that holds one club entry.\n",
"\n",
"    An entry is recognised by containing a ``br`` tag of class ``manualbr``.\n",
"    Anything that is not a ``bs4`` tag (e.g. a bare string) is rejected.\n",
"    \"\"\"\n",
"    if not isinstance(x, bs4.element.Tag):\n",
"        return False\n",
"    manual_break = x.find('br', {'class': \"manualbr\"})\n",
"    return bool(manual_break)\n",
"\n",
"def is_a_section_header(x):\n",
"    \"\"\"Tell whether ``x`` is an ``h4`` tag whose class list is exactly ``['spip']``.\"\"\"\n",
"    if not isinstance(x, bs4.element.Tag):\n",
"        return False\n",
"    if x.name != 'h4':\n",
"        return False\n",
"    return x.get('class', None) == ['spip']\n",
" \n",
" \n",
"def extract_sections(node):\n",
"    \"\"\"Yield ``(section_title, item)`` pairs from the direct children of ``node``.\n",
"\n",
"    The section title is the text of the most recent section header seen before\n",
"    the item; it is ``None`` for items that appear before any header.\n",
"    \"\"\"\n",
"    latest_header = None\n",
"    for child in node.contents:\n",
"        if is_a_section_header(child):\n",
"            latest_header = child.text\n",
"            continue\n",
"        if is_an_item(child):\n",
"            yield latest_header, child\n",
"\n",
" \n",
"b = bs4.BeautifulSoup(html)\n",
"# keep only the first matching div; the sections are extracted from its children\n",
"t, *_ = b.find_all('div', {'class': 'article-texte-719'})\n",
"\n",
"section_item_pairs = list(extract_sections(t))\n",
"\n",
"# group the items by section title (groupby merges consecutive pairs sharing a title)\n",
"sections = {}\n",
"for section_title, pairs in groupby(section_item_pairs, key=itemgetter(0)):\n",
"    sections[section_title] = [item for _, item in pairs]\n",
"print(f\"{len(sections)=}\")\n",
"\n",
"clubs = list(chain.from_iterable(sections.values()))\n",
"len(clubs)"
]
},
{
"cell_type": "markdown",
"id": "cc29a13f",
"metadata": {},
"source": [
"## Extract info"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "104fcca3",
"metadata": {
"ExecuteTime": {
"end_time": "2022-09-09T12:18:52.387413Z",
"start_time": "2022-09-09T12:18:52.305155Z"
}
},
"outputs": [],
"source": [
"import re\n",
"from operator import methodcaller, attrgetter, is_, itemgetter\n",
"from functools import partial\n",
"from i2 import Pipe, FuncFanout\n",
"\n",
"\n",
"# Note: Reusable\n",
"\n",
"def return_none(x):\n",
"    \"\"\"Ignore ``x`` and return ``None``.\"\"\"\n",
"    return None\n",
"\n",
"def identity(obj):\n",
"    \"\"\"Return ``obj`` itself, untouched.\"\"\"\n",
"    return obj\n",
"\n",
"\n",
"def _interruptible_pipe(\n",
"    obj, funcs, stop_condition=partial(is_, None), stop_callback=return_none\n",
"):\n",
"    \"\"\"Feed ``obj`` through ``funcs`` in order, with the option to exit early.\n",
"\n",
"    After each function call, ``stop_condition`` is evaluated on the value just\n",
"    produced. If it is truthy, the remaining functions are skipped and\n",
"    ``stop_callback(value)`` is returned instead. With the defaults, the pipeline\n",
"    stops as soon as a function returns ``None`` and then returns ``None``.\n",
"\n",
"    Meant to be partialized on ``funcs`` to obtain a one-argument pipeline.\n",
"    \"\"\"\n",
"    value = obj\n",
"    for step in funcs:\n",
"        value = step(value)\n",
"        if stop_condition(value):\n",
"            return stop_callback(value)\n",
"    return value\n",
"\n",
"\n",
"def interruptible_pipe(\n",
"    *funcs, stop_condition=partial(is_, None), stop_callback=return_none\n",
"):\n",
"    \"\"\"Make a one-argument pipeline out of ``funcs`` that can stop mid way.\n",
"\n",
"    The returned callable is ``_interruptible_pipe`` with ``funcs``,\n",
"    ``stop_condition`` and ``stop_callback`` already bound.\n",
"    \"\"\"\n",
"    pipeline_settings = dict(\n",
"        funcs=funcs, stop_condition=stop_condition, stop_callback=stop_callback\n",
"    )\n",
"    return partial(_interruptible_pipe, **pipeline_settings)\n",
"\n",
"def filtered_map(iterable, map_func=identity, filt_func=None):\n",
"    \"\"\"Lazily apply ``map_func`` to ``iterable``, keeping results accepted by ``filt_func``.\n",
"\n",
"    With ``filt_func=None`` (the default), only truthy results are kept.\n",
"    \"\"\"\n",
"    mapped = map(map_func, iterable)\n",
"    return filter(filt_func, mapped)\n",
"\n",
"\n",
"# Note: Concrete/Specific\n",
"\n",
"# Each extractor below takes the tag of one club entry.\n",
"# The interruptible ones return None as soon as a step yields None (e.g. no matching tag).\n",
"extract_title = interruptible_pipe(methodcaller('find', 'strong'), attrgetter('text'))\n",
"extract_description = interruptible_pipe(methodcaller('find', 'i'), attrgetter('text'))\n",
"extract_site = interruptible_pipe(methodcaller('find', 'a'), methodcaller('get', 'href'))\n",
"\n",
"# Addresses: iterate over the children of the entry and keep the matched line of those\n",
"# whose markup contains '13' followed by three digits and a space (presumably a local\n",
"# postal code -- TODO confirm). An entry may yield several addresses, or none.\n",
"extract_addresses = Pipe(\n",
"    partial(\n",
"        filtered_map,\n",
"        map_func=interruptible_pipe(\n",
"            str,\n",
"            re.compile(r'.*13\\d\\d\\d .*').search,\n",
"            methodcaller('group', 0),\n",
"        )\n",
"    ),\n",
"    list\n",
")\n",
"\n",
"# Phone numbers are written as five space-separated digit pairs (0X XX XX XX XX).\n",
"# The pattern must cover all five pairs: with only four, the last two digits were\n",
"# dropped ('04 42 64 20 80' came out as '04 42 64 20').\n",
"# The group is non-capturing so that findall returns the whole match.\n",
"extract_phone_numbers = Pipe(\n",
"    str,\n",
"    re.compile(r'0\\d(?: \\d\\d){4}').findall,\n",
")\n",
"\n",
"# Mail: whatever follows 'Mail', one arbitrary character and ': ', up to the next tag.\n",
"extract_mail = interruptible_pipe(\n",
"    str,\n",
"    re.compile(r'(?<=Mail.: )[^<]+').search,\n",
"    methodcaller('group', 0),\n",
")\n",
"\n",
"\n",
"# Gather every field of a club entry into one dict.\n",
"extract_info = Pipe(\n",
"    FuncFanout(\n",
"        title=extract_title,\n",
"        description=extract_description,\n",
"        address=extract_addresses,\n",
"        phone_numbers=extract_phone_numbers,\n",
"        mail=extract_mail,\n",
"        site=extract_site,\n",
"    ),\n",
"    dict\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "b5e2c9e8",
"metadata": {
"ExecuteTime": {
"end_time": "2022-09-09T12:18:53.005662Z",
"start_time": "2022-09-09T12:18:52.873223Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"273"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def club_id(node):\n",
"    \"\"\"Build a string identifying a club entry: title + description + phone numbers + site.\n",
"\n",
"    Parts that could not be extracted contribute an empty string.\n",
"    \"\"\"\n",
"    parts = [\n",
"        extract_title(node) or '',\n",
"        extract_description(node) or '',\n",
"        ''.join(extract_phone_numbers(node)) or '',\n",
"        extract_site(node) or '',\n",
"    ]\n",
"    return ''.join(parts)\n",
"\n",
"# map each club id to its category; when several entries share an id, the last one wins\n",
"category_of_id = {\n",
"    club_id(item): category for category, item in section_item_pairs\n",
"}\n",
"len(category_of_id)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "48a83548",
"metadata": {
"ExecuteTime": {
"end_time": "2022-09-09T12:18:53.526989Z",
"start_time": "2022-09-09T12:18:53.456192Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"<p><img alt=\"-\" class=\"puce\" height=\"11\" src=\"local/cache-vignettes/L8xH11/puce-32883.gif?1558534207\" width=\"8\"/> <strong>Provence Aïkido Club Aixois - PACA</strong><br class=\"manualbr\"/>Domaine de Bérage<br class=\"manualbr\"/>1415, Chemin Albert Guigou 13290 AIX-LES MILLES<br class=\"manualbr\"/>Tél. : 04 42 64 20 80 / 06 23 55 51 99<br class=\"manualbr\"/>Mail : cambefort.jeanpierre@gmail.com<br class=\"manualbr\"/>Site : <a class=\"spip_url spip_out auto\" href=\"http://www.aikidoaixpaca.blogspot.fr\" rel=\"nofollow external\">www.aikidoaixpaca.blogspot.fr</a></p>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from IPython.display import HTML\n",
"\n",
"HTML(str(clubs[9]))"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "d60edc6f",
"metadata": {
"ExecuteTime": {
"end_time": "2022-09-09T12:18:54.222850Z",
"start_time": "2022-09-09T12:18:54.149557Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"{'title': 'Provence Aïkido Club Aixois - PACA',\n",
" 'description': None,\n",
" 'address': ['1415, Chemin Albert Guigou 13290 AIX-LES MILLES'],\n",
" 'phone_numbers': ['04 42 64 20', '06 23 55 51'],\n",
" 'mail': 'cambefort.jeanpierre@gmail.com',\n",
" 'site': 'http://www.aikidoaixpaca.blogspot.fr'}"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"extract_info(clubs[9])"
]
},
{
"cell_type": "markdown",
"id": "62ded12a",
"metadata": {},
"source": [
"## Make a table"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "5d3b7990",
"metadata": {
"ExecuteTime": {
"end_time": "2022-09-09T12:18:55.556133Z",
"start_time": "2022-09-09T12:18:55.335538Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"_df.shape=(284, 7)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>category</th>\n",
" <th>title</th>\n",
" <th>description</th>\n",
" <th>address</th>\n",
" <th>phone_numbers</th>\n",
" <th>mail</th>\n",
" <th>site</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>ACCROBRANCHE</td>\n",
" <td>Parcours Aventure Indian Forest Aix-en-Provence</td>\n",
" <td>Parcours Acrobatique, Saut en Quick Jump...</td>\n",
" <td>[Chemin du Viaduc 13100 AIX EN PROVENCE]</td>\n",
" <td>06 29 28 23</td>\n",
" <td>indianforestaix@gmail.com</td>\n",
" <td>http://www.indianforest-aix.fr</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>AÉROMODÉLISME</td>\n",
" <td>Model Air Club d’Aix-en-Provence (M.A.C.A.P.)</td>\n",
" <td>Construire des modèles réduits d’avions et ens...</td>\n",
" <td>[]</td>\n",
" <td>06 33 60 39</td>\n",
" <td>infos@macap.fr - sergecataldo@gmail.com</td>\n",
" <td>http://www.macap.fr</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>AÉRONAUTIQUE</td>\n",
" <td>Aéro-club Aix Marseille (A.C.A.M.)</td>\n",
" <td>Pratique de l’aviation légère, sportive et de ...</td>\n",
" <td>[815, Chemin de la Badesse 13290 LES MILLES]</td>\n",
" <td>04 42 24 21</td>\n",
" <td>acam@aeroclub-acam.org</td>\n",
" <td>http://www.aeroclub-acam.org</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>AÏKIDO AIKIBUDO ET AFFINITAIRES</td>\n",
" <td>Aikibudo - Self Défense du Pays d’Aix</td>\n",
" <td>Aikibudo, Self défense, Boxing, Cross-Training...</td>\n",
" <td>[1630, Route de Mimet 13109 SIMIANE-COLLONGUE]</td>\n",
" <td>06 78 24 23</td>\n",
" <td>akdb.sud@gmail.com</td>\n",
" <td>http://www.akbd-sud.fr</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>AÏKIDO AIKIBUDO ET AFFINITAIRES</td>\n",
" <td>Aïkido Sainte-Victoire</td>\n",
" <td>Aïkido, Aïkiken, Aïki Jo</td>\n",
" <td>[Loubassane, 13100 AIX EN PROVENCE]</td>\n",
" <td>06 63 56 33</td>\n",
" <td>luc.bouchareu@wanadoo.fr</td>\n",
" <td>http://www.aikido-sainte-victoire.fr</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" category \\\n",
"0 ACCROBRANCHE \n",
"1 AÉROMODÉLISME \n",
"2 AÉRONAUTIQUE \n",
"3 AÏKIDO AIKIBUDO ET AFFINITAIRES \n",
"4 AÏKIDO AIKIBUDO ET AFFINITAIRES \n",
"\n",
" title \\\n",
"0 Parcours Aventure Indian Forest Aix-en-Provence \n",
"1 Model Air Club d’Aix-en-Provence (M.A.C.A.P.) \n",
"2 Aéro-club Aix Marseille (A.C.A.M.) \n",
"3 Aikibudo - Self Défense du Pays d’Aix \n",
"4 Aïkido Sainte-Victoire \n",
"\n",
" description \\\n",
"0 Parcours Acrobatique, Saut en Quick Jump... \n",
"1 Construire des modèles réduits d’avions et ens... \n",
"2 Pratique de l’aviation légère, sportive et de ... \n",
"3 Aikibudo, Self défense, Boxing, Cross-Training... \n",
"4 Aïkido, Aïkiken, Aïki Jo \n",
"\n",
" address phone_numbers \\\n",
"0 [Chemin du Viaduc 13100 AIX EN PROVENCE] 06 29 28 23 \n",
"1 [] 06 33 60 39 \n",
"2 [815, Chemin de la Badesse 13290 LES MILLES] 04 42 24 21 \n",
"3 [1630, Route de Mimet 13109 SIMIANE-COLLONGUE] 06 78 24 23 \n",
"4 [Loubassane, 13100 AIX EN PROVENCE] 06 63 56 33 \n",
"\n",
" mail \\\n",
"0 indianforestaix@gmail.com \n",
"1 infos@macap.fr - sergecataldo@gmail.com \n",
"2 acam@aeroclub-acam.org \n",
"3 akdb.sud@gmail.com \n",
"4 luc.bouchareu@wanadoo.fr \n",
"\n",
" site \n",
"0 http://www.indianforest-aix.fr \n",
"1 http://www.macap.fr \n",
"2 http://www.aeroclub-acam.org \n",
"3 http://www.akbd-sud.fr \n",
"4 http://www.aikido-sainte-victoire.fr "
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def move_column(df, col, position=0):\n",
"    \"\"\"Move column ``col`` of ``df`` to index ``position``.\n",
"\n",
"    ``df`` is modified in place; nothing is returned.\n",
"    \"\"\"\n",
"    moved = df.pop(col)\n",
"    df.insert(position, moved.name, moved)\n",
" \n",
"# Take the category straight from the (category, club) pairs: `category_of_title`\n",
"# is not defined anywhere in this notebook (NameError on a fresh kernel), and a\n",
"# positional assignment keeps the right category for clubs listed under several sections.\n",
"_df = pd.DataFrame([extract_info(club) for _, club in section_item_pairs])\n",
"_df['phone_numbers'] = _df['phone_numbers'].apply(', '.join)\n",
"_df['category'] = [category for category, _ in section_item_pairs]\n",
"move_column(_df, 'category', 0)\n",
"_df = _df.fillna('')\n",
"print(f\"{_df.shape=}\")\n",
"_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "9d37d1ff",
"metadata": {
"ExecuteTime": {
"end_time": "2022-09-09T12:18:56.151785Z",
"start_time": "2022-09-09T12:18:56.047570Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"df.shape=(288, 7)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>category</th>\n",
" <th>title</th>\n",
" <th>description</th>\n",
" <th>address</th>\n",
" <th>phone_numbers</th>\n",
" <th>mail</th>\n",
" <th>site</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>ACCROBRANCHE</td>\n",
" <td>Parcours Aventure Indian Forest Aix-en-Provence</td>\n",
" <td>Parcours Acrobatique, Saut en Quick Jump...</td>\n",
" <td>Chemin du Viaduc 13100 AIX EN PROVENCE</td>\n",
" <td>06 29 28 23</td>\n",
" <td>indianforestaix@gmail.com</td>\n",
" <td>http://www.indianforest-aix.fr</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>AÉRONAUTIQUE</td>\n",
" <td>Aéro-club Aix Marseille (A.C.A.M.)</td>\n",
" <td>Pratique de l’aviation légère, sportive et de ...</td>\n",
" <td>815, Chemin de la Badesse 13290 LES MILLES</td>\n",
" <td>04 42 24 21</td>\n",
" <td>acam@aeroclub-acam.org</td>\n",
" <td>http://www.aeroclub-acam.org</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>AÏKIDO AIKIBUDO ET AFFINITAIRES</td>\n",
" <td>Aikibudo - Self Défense du Pays d’Aix</td>\n",
" <td>Aikibudo, Self défense, Boxing, Cross-Training...</td>\n",
" <td>1630, Route de Mimet 13109 SIMIANE-COLLONGUE</td>\n",
" <td>06 78 24 23</td>\n",
" <td>akdb.sud@gmail.com</td>\n",
" <td>http://www.akbd-sud.fr</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>AÏKIDO AIKIBUDO ET AFFINITAIRES</td>\n",
" <td>Aïkido Sainte-Victoire</td>\n",
" <td>Aïkido, Aïkiken, Aïki Jo</td>\n",
" <td>Loubassane, 13100 AIX EN PROVENCE</td>\n",
" <td>06 63 56 33</td>\n",
" <td>luc.bouchareu@wanadoo.fr</td>\n",
" <td>http://www.aikido-sainte-victoire.fr</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>AÏKIDO AIKIBUDO ET AFFINITAIRES</td>\n",
" <td>Aix Université Club Aïkido</td>\n",
" <td></td>\n",
" <td>Chemin des Infirmeries 13100 AIX EN PROVENCE</td>\n",
" <td>06 18 07 72</td>\n",
" <td>gilpatrick2000@yahoo.fr</td>\n",
" <td>http://www.buikukan-riondet.fr</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>283</th>\n",
" <td>YOGA</td>\n",
" <td>Centre de Yoga « Abhi-Yoga / Indianités »</td>\n",
" <td></td>\n",
" <td>12, Rue Jean Daret 13090 AIX EN PROVENCE</td>\n",
" <td>04 42 64 38</td>\n",
" <td>centreabhiyoga@gmail.com</td>\n",
" <td>http://www.yoga-aix.com</td>\n",
" </tr>\n",
" <tr>\n",
" <th>284</th>\n",
" <td>YOGA</td>\n",
" <td>Luynes Yoga</td>\n",
" <td>Hatha Yoga, Prana Yoga ...</td>\n",
" <td>3115, Avenue Fortune Férrini 13080 LUYNES</td>\n",
" <td>07 69 99 14</td>\n",
" <td>luynesyoga13@gmail.com</td>\n",
" <td>http://luynesyoga13.fr/</td>\n",
" </tr>\n",
" <tr>\n",
" <th>285</th>\n",
" <td>YOGA</td>\n",
" <td>Madhya Yoga</td>\n",
" <td></td>\n",
" <td>15, Rue Venel 13100 AIX EN PROVENCE</td>\n",
" <td>04 42 23 42</td>\n",
" <td>yogaducentre@wanadoo.fr</td>\n",
" <td>http://www.yoga-aix-centre.fr</td>\n",
" </tr>\n",
" <tr>\n",
" <th>286</th>\n",
" <td>YOGA</td>\n",
" <td>YAMA - Yoga Ashtanga Marseille Aix</td>\n",
" <td>Yoga dynamique, Stages découverte et perfectio...</td>\n",
" <td>4, Rue Pierre de Courbetin 13100 AIX EN PROVENCE</td>\n",
" <td>06 40 29 96</td>\n",
" <td>contact@yoga-aix-marseille.fr</td>\n",
" <td>http://www.yoga-aix-marseille.fr</td>\n",
" </tr>\n",
" <tr>\n",
" <th>287</th>\n",
" <td>YOGA</td>\n",
" <td>YTI Yoga et Tradition Indienne</td>\n",
" <td>Yoga, Relaxation, Concentration...</td>\n",
" <td>9, Hameau de Malouesse 495, Chemin des Frères ...</td>\n",
" <td>06 87 24 71</td>\n",
" <td>yti13@free.fr</td>\n",
" <td></td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>288 rows × 7 columns</p>\n",
"</div>"
],
"text/plain": [
" category \\\n",
"0 ACCROBRANCHE \n",
"1 AÉRONAUTIQUE \n",
"2 AÏKIDO AIKIBUDO ET AFFINITAIRES \n",
"3 AÏKIDO AIKIBUDO ET AFFINITAIRES \n",
"4 AÏKIDO AIKIBUDO ET AFFINITAIRES \n",
".. ... \n",
"283 YOGA \n",
"284 YOGA \n",
"285 YOGA \n",
"286 YOGA \n",
"287 YOGA \n",
"\n",
" title \\\n",
"0 Parcours Aventure Indian Forest Aix-en-Provence \n",
"1 Aéro-club Aix Marseille (A.C.A.M.) \n",
"2 Aikibudo - Self Défense du Pays d’Aix \n",
"3 Aïkido Sainte-Victoire \n",
"4 Aix Université Club Aïkido \n",
".. ... \n",
"283 Centre de Yoga « Abhi-Yoga / Indianités » \n",
"284 Luynes Yoga \n",
"285 Madhya Yoga \n",
"286 YAMA - Yoga Ashtanga Marseille Aix \n",
"287 YTI Yoga et Tradition Indienne \n",
"\n",
" description \\\n",
"0 Parcours Acrobatique, Saut en Quick Jump... \n",
"1 Pratique de l’aviation légère, sportive et de ... \n",
"2 Aikibudo, Self défense, Boxing, Cross-Training... \n",
"3 Aïkido, Aïkiken, Aïki Jo \n",
"4 \n",
".. ... \n",
"283 \n",
"284 Hatha Yoga, Prana Yoga ... \n",
"285 \n",
"286 Yoga dynamique, Stages découverte et perfectio... \n",
"287 Yoga, Relaxation, Concentration... \n",
"\n",
" address phone_numbers \\\n",
"0 Chemin du Viaduc 13100 AIX EN PROVENCE 06 29 28 23 \n",
"1 815, Chemin de la Badesse 13290 LES MILLES 04 42 24 21 \n",
"2 1630, Route de Mimet 13109 SIMIANE-COLLONGUE 06 78 24 23 \n",
"3 Loubassane, 13100 AIX EN PROVENCE 06 63 56 33 \n",
"4 Chemin des Infirmeries 13100 AIX EN PROVENCE 06 18 07 72 \n",
".. ... ... \n",
"283 12, Rue Jean Daret 13090 AIX EN PROVENCE 04 42 64 38 \n",
"284 3115, Avenue Fortune Férrini 13080 LUYNES 07 69 99 14 \n",
"285 15, Rue Venel 13100 AIX EN PROVENCE 04 42 23 42 \n",
"286 4, Rue Pierre de Courbetin 13100 AIX EN PROVENCE 06 40 29 96 \n",
"287 9, Hameau de Malouesse 495, Chemin des Frères ... 06 87 24 71 \n",
"\n",
" mail site \n",
"0 indianforestaix@gmail.com http://www.indianforest-aix.fr \n",
"1 acam@aeroclub-acam.org http://www.aeroclub-acam.org \n",
"2 akdb.sud@gmail.com http://www.akbd-sud.fr \n",
"3 luc.bouchareu@wanadoo.fr http://www.aikido-sainte-victoire.fr \n",
"4 gilpatrick2000@yahoo.fr http://www.buikukan-riondet.fr \n",
".. ... ... \n",
"283 centreabhiyoga@gmail.com http://www.yoga-aix.com \n",
"284 luynesyoga13@gmail.com http://luynesyoga13.fr/ \n",
"285 yogaducentre@wanadoo.fr http://www.yoga-aix-centre.fr \n",
"286 contact@yoga-aix-marseille.fr http://www.yoga-aix-marseille.fr \n",
"287 yti13@free.fr \n",
"\n",
"[288 rows x 7 columns]"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# expand df so that every address has its own row\n",
"from itertools import chain\n",
"\n",
"def df_to_dicts(df):\n",
"    \"\"\"Iterate over the rows of ``df``, each given as a ``{column: value}`` dict.\"\"\"\n",
"    return (dict(row) for _, row in df.iterrows())\n",
"\n",
"\n",
"def _replicate(key, d):\n",
"    \"\"\"Yield one copy of ``d`` per element of ``d[key]``, with ``key`` set to that element.\n",
"\n",
"    Nothing is yielded when ``d[key]`` is empty; ``d`` itself is not modified.\n",
"    \"\"\"\n",
"    yield from (dict(d, **{key: element}) for element in d[key])\n",
"\n",
"# replicate(key) gives a function that expands one row dict into one dict per element of d[key]\n",
"replicate = partial(partial, _replicate)\n",
"\n",
"# rows as dicts, then one dict per address, flattened back into a dataframe\n",
"# NOTE(review): rows whose address list is empty produce no output row -- confirm this is wanted\n",
"expand_addresses = Pipe(\n",
"    df_to_dicts,\n",
"    partial(map, replicate('address')),\n",
"    chain.from_iterable,\n",
"    pd.DataFrame,\n",
")\n",
"\n",
"df = expand_addresses(_df)\n",
"print(f\"{df.shape=}\")\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "cad3ad3c",
"metadata": {
"ExecuteTime": {
"end_time": "2022-09-09T12:18:56.994677Z",
"start_time": "2022-09-09T12:18:56.921274Z"
}
},
"outputs": [],
"source": [
"# address correction\n",
"# Some addresses weren't found on google maps, so I'm correcting those here.\n",
"\n",
"\n",
"def find_replace(d, key, find_replace_dict):\n",
"    \"\"\"Return a copy of ``d`` where ``d[key]`` is swapped for its entry in ``find_replace_dict``.\n",
"\n",
"    The value is kept as is when it has no entry in ``find_replace_dict``.\n",
"    \"\"\"\n",
"    current = d[key]\n",
"    replacement = find_replace_dict.get(current, current)\n",
"    return dict(d, **{key: replacement})\n",
"\n",
"\n",
"# address as scraped -> replacement to use for the google maps lookup\n",
"address_find_replace = {\n",
"    'La Mareschale 27 Avenue Tübingen 13090 AIX EN PROVENCE (Lundi)':\n",
"        'La Mareschale 27 Avenue Tübingen 13090 AIX EN PROVENCE',\n",
"    'Espace des Floralies 3 Rue du Docteur Cartotto 13090 AIX EN PROVENCE (Jeudi)':\n",
"        'Espace des Floralies 3 Rue du Docteur Cartotto 13090 AIX EN PROVENCE',\n",
"    'Gymnase collège Sophie Germain Rue Pierre Bartoletti 13290 AIX-EN-PROVENCE (mardi)':\n",
"        'Gymnase collège Sophie Germain Rue Pierre Bartoletti 13290 AIX-EN-PROVENCE',\n",
"    'Place des Combattants 13540 PUYRICARD':\n",
"        'Centre Socio Culturel Marie Louise Davin',\n",
"    '2, Rue Joseph Diouloufet, Rigel 1 13090 AIX EN PROVENCE':\n",
"        '2, Rue Joseph Diouloufet 13090 AIX EN PROVENCE',\n",
"    '46, Le Petit Lac 13480 CABRIS':\n",
"        'Body Arts, Cabriès',\n",
"}\n",
"\n",
"\n",
"# addresses without a correction are copied over unchanged\n",
"df['google_address'] = df['address'].apply(\n",
"    lambda address: address_find_replace.get(address, address)\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ddf3d808",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 30,
"id": "e049fbf8",
"metadata": {
"ExecuteTime": {
"end_time": "2022-09-09T12:19:00.975591Z",
"start_time": "2022-09-09T12:19:00.826411Z"
}
},
"outputs": [],
"source": [
"df.to_excel('aix_clubs_et_associations.xlsx', index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4069963e",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "c3d249a1",
"metadata": {},
"source": [
"Parsed out from:\thttps://www.aixenprovence.fr/Annuaire-des-clubs-et-associations-sportives\n",
"\n",
"PDF format:\thttps://www.aixenprovence.fr/IMG/pdf/annuaire_assos_sportives.pdf\n",
"\n",
"Google map:\thttps://www.google.com/maps/d/u/0/edit?mid=1MPNMLhF_0A9gTTI6I1sUQD83PEphaNU&usp=sharing\n",
"\n",
"How to upload a table to Google Maps:\thttps://support.google.com/mymaps/answer/3024836?hl=en&co=GENIE.Platform%3DDesktop#zippy=%2Cstep-import-info-into-the-map\n",
"\t\n",
"Link to the prepared data in Google Sheets: https://docs.google.com/spreadsheets/d/1EoTnAVHotyG5NonMnNDtflCtR7hhhQDGWnR_PEb4erM/edit?usp=sharing\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5520d70e",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "2c0ba094",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "af771d8f",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "5d95f011",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "f1037e49",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "5fd418bf",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "bffdea94",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "02944222",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "de645d69",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "178426b4",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "6c8b5664",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "8f42c01e",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "2618bd21",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "e3b4b86a",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "1327cae5",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.6"
},
"latex_envs": {
"LaTeX_envs_menu_present": true,
"autoclose": false,
"autocomplete": true,
"bibliofile": "biblio.bib",
"cite_by": "apalike",
"current_citInitial": 1,
"eqLabelWithNumbers": true,
"eqNumInitial": 1,
"hotkeys": {
"equation": "Ctrl-E",
"itemize": "Ctrl-I"
},
"labels_anchors": false,
"latex_user_defs": false,
"report_style_numbering": false,
"user_envs_cfg": false
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": true,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": true
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment