Last active
July 28, 2020 13:36
-
-
Save markkernke/52e2beea36abf74459f083aaa84a43ac to your computer and use it in GitHub Desktop.
A script for scraping the Tableau Knowledgebase for new and updated articles.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Dependancies: | |
| # - requests | |
| # - BeautifulSoup | |
| import requests, re | |
| from multiprocessing import Pool | |
| #from multiprocessing.dummy import Pool | |
| from datetime import datetime, timedelta | |
| from bs4 import BeautifulSoup | |
| from urllib.parse import unquote | |
# Sitemap page that lists a link to every KB article.
kb_sitemap_url = "https://kb.tableau.com/KbSiteMapHtmlLinks"

# Midnight (local, naive datetime) seven days ago; articles published or
# modified on/after this instant are considered "recent".
recency_limit = datetime.today().replace(hour=0, minute=0, second=0, microsecond=0) - timedelta(days=7)

# Raw strings keep the \d escapes literal for the regex engine (the original
# used doubled backslashes in plain strings). re.M lets ^ anchor at each
# embedded line of the newline-joined article-info text; dates look like
# "28 JUL 2020".
re_published_at = re.compile(r'^Published:\n(\d{2} [A-Z]{3} \d{4})', re.I | re.M)
re_modified_at = re.compile(r'^Last Modified Date:\n(\d{2} [A-Z]{3} \d{4}).*', re.I | re.M)
def get_kb_articles(kb_sitemap_url):
    """Fetch the KB sitemap page and return the text of every <a> link on it."""
    page_html = requests.get(kb_sitemap_url).text
    anchors = BeautifulSoup(page_html, 'html.parser').find_all("a")
    links = []
    for anchor in anchors:
        links.append(anchor.text)
    return links
def get_kb_article_details(kb_article_url):
    """Scrape one KB article page and return its metadata.

    Returns a dict with keys:
      published_at, modified_at -- datetime parsed from the article-info box
      issue                     -- str, the issue/question body text
      environments              -- list[str] of environment entries
      url                       -- the input URL

    Raises AttributeError if the expected page structure is absent
    (missing article-info section or unmatched date lines).
    """
    print(" > Starting: %s" % kb_article_url)
    article = {}
    page_html = requests.get(kb_article_url).text
    soup = BeautifulSoup(page_html, 'html.parser')

    # The info box renders one value per string; join with newlines so the
    # ^-anchored (re.M) date patterns can match line starts.
    article_info_text = "\n".join(soup.find(id="article-info").stripped_strings)
    article['published_at'] = datetime.strptime(re_published_at.match(article_info_text).group(1), "%d %b %Y")
    article['modified_at'] = datetime.strptime(re_modified_at.search(article_info_text).group(1), "%d %b %Y")

    article_fields = soup.find(id="article-fields")
    # Some articles label this section "issue", others "question"; look it up
    # once instead of calling find() twice for the same id (also fixes the
    # misspelled local name).
    article_issue = article_fields.find(id="issue") or article_fields.find(id="question")
    # The first stripped string is the section heading; keep only the body.
    article['issue'] = "\n".join(list(article_issue.stripped_strings)[1:])
    # NOTE(review): `.article` descends into a child <article> tag of the
    # environment section — presumably where the values live; confirm against
    # the live page markup.
    article['environments'] = list(article_fields.find(id="environment").article.stripped_strings)[1:]
    article['url'] = kb_article_url
    return article
if __name__ == '__main__':
    # Drop localized duplicates (?lang=) and how-to articles from the sitemap.
    article_urls = [
        url for url in get_kb_articles(kb_sitemap_url)
        if '?lang=' not in url and '/articles/howto/' not in url
    ]
    print("Total Articles: %d" % len(article_urls))

    # Scrape in parallel; each worker spends most of its time waiting on the
    # network.
    with Pool(50) as pool:
        articles = pool.map(get_kb_article_details, article_urls)

    recently_published_articles = [
        a for a in articles if a['published_at'] >= recency_limit
    ]
    recently_modified_articles = [
        a for a in articles
        if a['published_at'] < recency_limit and a['modified_at'] >= recency_limit
    ]
    print(len(recently_published_articles))
    print(len(recently_modified_articles))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment