@markkernke
Last active July 28, 2020 13:36
A script for scraping the Tableau Knowledgebase for new and updated articles.
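In short: the script downloads the KB sitemap page, filters out translated and how-to links, fetches each remaining article page in a pool of 50 worker processes, pulls the published and modified dates plus the issue text and environments out of each page, and finally splits the results into articles published within the last seven days versus older articles modified within the last seven days.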
# Dependencies (install with: pip install requests beautifulsoup4):
# - requests
# - beautifulsoup4
import re
from datetime import datetime, timedelta
from multiprocessing import Pool
# from multiprocessing.dummy import Pool  # drop-in thread-based alternative to the process pool

import requests
from bs4 import BeautifulSoup

# Index page listing every Knowledgebase article.
kb_sitemap_url = "https://kb.tableau.com/KbSiteMapHtmlLinks"

# Start of the day seven days ago; anything published/modified on or after this counts as recent.
recency_limit = datetime.today().replace(hour=0, minute=0, second=0, microsecond=0) - timedelta(days=7)

re_published_at = re.compile(r'^Published:\n(\d{2} [A-Z]{3} \d{4})', re.I | re.M)
re_modified_at = re.compile(r'^Last Modified Date:\n(\d{2} [A-Z]{3} \d{4}).*', re.I | re.M)
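# Based on the regexes above, the article-info block is expected to flatten to
# lines like the following (illustrative, not copied from a live page):
#   Published:
#   01 Jan 2020
#   Last Modified Date:
#   15 Jul 2020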
def get_kb_articles(kb_sitemap_url):
    kb_sitemap_text = requests.get(kb_sitemap_url).text
    kb_sitemap_links = BeautifulSoup(kb_sitemap_text, 'html.parser').find_all("a")
    return [x.text for x in kb_sitemap_links]
def get_kb_article_details(kb_article_url):
    print(" > Starting: %s" % kb_article_url)
    article = {}
    kb_article_text = requests.get(kb_article_url).text
    soup = BeautifulSoup(kb_article_text, 'html.parser')
    article_info_text = "\n".join(soup.find(id="article-info").stripped_strings)
    article['published_at'] = datetime.strptime(re_published_at.match(article_info_text).group(1), "%d %b %Y")
    article['modified_at'] = datetime.strptime(re_modified_at.search(article_info_text).group(1), "%d %b %Y")
    article_fields = soup.find(id="article-fields")
    # Some articles use an "Issue" section, others a "Question" section.
    article_issue = article_fields.find(id="issue") or article_fields.find(id="question")
    article['issue'] = "\n".join(list(article_issue.stripped_strings)[1:])  # drop the section heading
    # Navigate into the nested <article> tag of the environment section; skip its heading.
    article['environments'] = list(article_fields.find(id="environment").article.stripped_strings)[1:]
    article['url'] = kb_article_url
    return article
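# Shape of the dict get_kb_article_details() returns (field names from the code
# above; values are illustrative):
#   {'published_at': datetime(2020, 1, 1), 'modified_at': datetime(2020, 7, 15),
#    'issue': '...', 'environments': ['...'], 'url': '...'}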
if __name__ == '__main__':
    # Skip translated copies and how-to articles; keep everything else.
    kb_articles = [x for x in get_kb_articles(kb_sitemap_url) if '?lang=' not in x and '/articles/howto/' not in x]
    print("Total Articles: %d" % len(kb_articles))
    # Fetch the article pages in parallel with 50 worker processes.
    with Pool(50) as p:
        articles = p.map(get_kb_article_details, kb_articles)
    recently_published_articles = [x for x in articles if x['published_at'] >= recency_limit]
    recently_modified_articles = [x for x in articles if x['published_at'] < recency_limit and x['modified_at'] >= recency_limit]
    print(len(recently_published_articles))
    print(len(recently_modified_articles))
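The two bare counts at the end are easy to extend. The sketch below is not part of the original gist; it assumes it is appended inside the if __name__ == '__main__': block after the final print calls, and simply lists the matching articles instead of counting them.

    # Sketch only: print the matching articles rather than bare counts.
    for label, group in (("Published in the last 7 days", recently_published_articles),
                         ("Modified in the last 7 days", recently_modified_articles)):
        print("\n%s (%d):" % (label, len(group)))
        for a in sorted(group, key=lambda x: x['modified_at'], reverse=True):
            print("  %s (modified %s)" % (a['url'], a['modified_at'].strftime("%d %b %Y")))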