Created
November 28, 2013 14:56
-
-
Save mediaczar/7693175 to your computer and use it in GitHub Desktop.
Using the Daily Mail's sitemap.xml as the basis to query Facebook's `link_stat` FQL table
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| # -*- coding: utf-8 -*- | |
| ################### | |
| import scraperwiki | |
| import requests | |
| import xmltodict | |
| import json | |
| import time | |
# Graph API endpoint that the FQL query string is appended to.
graph_query_root = "https://graph.facebook.com/fql"

# Columns selected from the `link_stat` table; the same names are used as
# keys in the rows that get saved.
graph_attr = [
    'share_count',
    'like_count',
    'comment_count',
]

# Host whose /sitemap.xml supplies the list of pages to look up.
sitename = 'dailymail.co.uk'
| ################### | |
def query_graph_api(url):
    """Query the Graph API for one page's `link_stat` counters.

    Builds an FQL ``SELECT`` over the columns listed in ``graph_attr``,
    sends it to ``graph_query_root`` and, after reading the answer, sleeps
    for two seconds before returning.

    Args:
        url: Address of the page whose counters are requested.

    Returns:
        dict mapping every name in ``graph_attr`` to the value found in the
        first row of the response's ``data`` list.

    Raises:
        KeyError / IndexError: if the decoded response has no ``data`` key,
            an empty ``data`` list, or a row missing a requested column.
    """
    # NOTE(review): `url` is spliced into the FQL text unescaped, so a single
    # quote inside it will still break the query -- confirm FQL's quoting
    # rules before adding escaping here.
    fql = ("SELECT " + ",".join(graph_attr)
           + " FROM link_stat WHERE url = '" + url + "'")
    graph_query = graph_query_root + "?q=" + fql
    print(graph_query)  # debug console

    # Hand the query to requests as a parameter so it is percent-encoded.
    # Concatenating it into the URL let '&', '#' or '+' inside the page
    # address cut short or alter the `q` value that was actually sent.
    query_data = requests.get(graph_query_root, params={'q': fql})
    query_json = json.loads(query_data.text)

    first_row = query_json['data'][0]
    result = {}
    for item in graph_attr:
        result[item] = first_row[item]

    time.sleep(2)  # pause between successive Graph API calls
    return result
################### CREATE DICTIONARY FROM SITEMAP
# Download the site's sitemap.xml and parse it into nested dicts.
sitemapURL = 'http://%s/sitemap.xml' % sitename
sitemap_raw = requests.get(sitemapURL)
sitemap_dict = xmltodict.parse(sitemap_raw.text)

################### COLLECT DATA FROM SITEMAP
sitemap_entries = sitemap_dict['urlset']['url']
# xmltodict gives a single mapping instead of a list when <urlset> holds
# exactly one <url>; wrap it so the loop below always sees whole entries
# rather than iterating over that mapping's keys.
if not isinstance(sitemap_entries, list):
    sitemap_entries = [sitemap_entries]

for page in sitemap_entries:
    # One row per sitemap entry: location plus its news metadata.
    pages = {}
    pages['url'] = page['loc']
    pages['title'] = page['n:news']['n:title']
    pages['pubdate'] = page['n:news']['n:publication_date']
    pages['keywords'] = page['n:news']['n:keywords']

    # Get OpenGraph data
    graph_data = query_graph_api(pages['url'])
    for item in graph_attr:
        pages[item] = graph_data[item]

    # Commit data (rows are keyed on 'url', so a repeated URL reuses its key)
    scraperwiki.sqlite.save(unique_keys=['url'], data=pages)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment