Created
March 14, 2021 15:27
-
-
Save newpoow/8438bf52414befeb70f56e16181e81ef to your computer and use it in GitHub Desktop.
Simple script to get posts from WordPress blogs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from bs4 import BeautifulSoup | |
| import http.client | |
| import re | |
| import io | |
| import getopt | |
| import sys | |
# Accumulated post titles across all fetched pages. Module-level on purpose:
# main() appends year separators and rewrites the output file incrementally.
posts = []

def getContent(soup):
    """Append the title of every <article> on the page to the global `posts`.

    The title is taken as the text of the first <a> tag inside each
    <article>. Articles without an anchor are skipped (the previous
    version raised AttributeError on them).
    """
    for article in soup.find_all('article'):
        link = article.find('a')
        if link is not None:
            posts.append(link.get_text())
def main(url, years=None, filename="posts.txt"):
    """Collect post titles from a WordPress blog's year archives.

    Args:
        url: host name of the blog without scheme, e.g. "blog.example.com".
        years: iterable of year strings; each is fetched as "/<year>/".
            Defaults to an empty list (the old mutable-default was replaced).
        filename: output text file; one post title per line, with a
            "========== <year> ==========" separator before each year.
    """
    if years is None:
        years = []
    # Connect to the site over plain HTTP on port 80.
    connection = http.client.HTTPConnection(url, 80, timeout=30)
    try:
        for year in years:
            print("Coletando posts de {}...".format(year))
            posts.append("========== {} ==========".format(year))
            connection.request("GET", "/{}/".format(year))
            response = connection.getresponse()
            # Parse the first page of the year archive.
            soup = BeautifulSoup(response.read(), 'html.parser')
            # Collect the posts found on this page.
            getContent(soup)
            # Check whether the archive is paginated.
            pages = soup.select('a[class=page-numbers]')
            if len(pages) > 0:
                # The last pagination anchor is either the total page count
                # or a non-numeric "next" link.
                page = pages[-1]
                index = 1
                if page.get_text().isnumeric():
                    total = int(page.get_text())
                else:
                    # Unknown total: rely on the HTTP status to stop
                    # (see the loop below) and take the first anchor's href.
                    total = 0
                    page = pages[0]
                # Extract the path portion of the pagination link.
                href = page['href']
                path = href[href.find(url) + len(url):]
                # When the total is known the anchor points at the LAST page;
                # rewrite it to page 1 so the loop can increment from there.
                if page.get_text().isnumeric():
                    path = path.replace("/{}/".format(total), "/{}/".format(index))
                # Walk the remaining pages.
                while total == 0 or total > index:
                    index += 1
                    path = path.replace("/{}/".format(index - 1), "/{}/".format(index))
                    connection.request("GET", path)
                    response = connection.getresponse()
                    content = response.read()
                    # With an unknown total, a non-200 (e.g. 404) marks the end.
                    if total == 0 and response.status != 200:
                        break
                    print("... página {} ...".format(index))
                    soup = BeautifulSoup(content, 'html.parser')
                    getContent(soup)
            # Rewrite the output file after each year so partial progress
            # survives a crash mid-run.
            with io.open(filename, 'w+', encoding="utf-8") as f:
                for post in posts:
                    f.write("{}\n".format(post))
    finally:
        # Always release the connection, even if a request or parse fails
        # (the previous version leaked it on any exception).
        connection.close()
| if __name__ == "__main__": | |
| # obtem os argumentos da linha de comando, exceto o nome do arquivo | |
| argv = sys.argv[1:] | |
| opts, args = getopt.getopt(argv, 'u:y:o:', []) | |
| if len(opts) == 0 or len(opts) < 2: | |
| print('Use: getpost.py -u <url> -y <year,year> -o <output>') | |
| sys.exit(2) | |
| url = 'www.retroavengers.com.br' | |
| years = [] | |
| output = "posts.txt" | |
| for opt, arg in opts: | |
| if opt == "-u": | |
| url = arg | |
| elif opt == "-y": | |
| if arg.find("-") > -1: | |
| begin, end = arg.split("-", 2) | |
| years = range(int(begin), int(end) + 1) | |
| years = ["{}".format(year) for year in years] | |
| else: | |
| years = arg.split(",") | |
| elif opt == "-o": | |
| output = arg | |
| main(url, years, output) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment