Skip to content

Instantly share code, notes, and snippets.

@newpoow
Created March 14, 2021 15:27
Show Gist options
  • Select an option

  • Save newpoow/8438bf52414befeb70f56e16181e81ef to your computer and use it in GitHub Desktop.

Select an option

Save newpoow/8438bf52414befeb70f56e16181e81ef to your computer and use it in GitHub Desktop.
Simple script to get posts from WordPress blogs
from bs4 import BeautifulSoup
import http.client
import re
import io
import getopt
import sys
# Post titles collected so far, in discovery order (section headers included).
posts = []

def getContent(soup):
    """Append the title (first anchor's text) of every <article> in *soup* to posts.

    Articles that contain no anchor are skipped; the original code raised
    AttributeError on them because find() returns None.
    """
    for article in soup.find_all('article'):
        link = article.find('a')
        if link is not None:
            posts.append(link.get_text())
def main(url, years=None, filename="posts.txt"):
    """Scrape post titles from a WordPress blog's year archives into *filename*.

    url: blog host name without scheme, e.g. "www.example.com".
    years: iterable of year strings; each year's archive page "/<year>/" is
        fetched and paginated through. Defaults to no years.
    filename: output text file; one post title per line, with a
        "========== <year> ==========" header before each year's posts.
    """
    # Avoid the mutable-default-argument pitfall; the original used years=[].
    years = [] if years is None else years
    # Connect to the site over plain HTTP.
    connection = http.client.HTTPConnection(url, 80, timeout=30)
    try:
        for year in years:
            print("Coletando posts de {}...".format(year))
            posts.append("========== {} ==========".format(year))
            connection.request("GET", "/{}/".format(year))
            response = connection.getresponse()
            # Parse the archive page body.
            soup = BeautifulSoup(response.read(), 'html.parser')
            # Collect the posts on the first page.
            getContent(soup)
            # Check whether the archive has further pages.
            pages = soup.select('a[class=page-numbers]')
            if len(pages) > 0:
                # Last pagination anchor: normally the highest page number,
                # but it may be a "next" link with non-numeric text.
                page = pages[-1]
                index = 1
                if page.get_text().isnumeric():
                    total = int(page.get_text())
                else:
                    # Unknown total: fall back to the first anchor and probe
                    # successive pages until a non-200 response.
                    total = 0
                    page = pages[0]
                # Keep only the path portion of the pagination link.
                href = page['href']
                path = href[href.find(url) + len(url):]
                # When the total is known, rewrite the link to point at page 1
                # so the loop below can step it forward.
                if page.get_text().isnumeric():
                    path = path.replace("/{}/".format(total), "/{}/".format(index))
                # Walk the remaining pages, collecting posts from each.
                while total == 0 or total > index:
                    index += 1
                    path = path.replace("/{}/".format(index - 1), "/{}/".format(index))
                    connection.request("GET", path)
                    response = connection.getresponse()
                    content = response.read()
                    if total == 0 and response.status != 200:
                        # Probed past the last page: stop.
                        break
                    print("... página {} ...".format(index))
                    soup = BeautifulSoup(content, 'html.parser')
                    getContent(soup)
        # Write every collected title to the output file.
        with io.open(filename, 'w+', encoding="utf-8") as f:
            for post in posts:
                f.write("{}\n".format(post))
    finally:
        # Close the connection even if a request fails mid-way
        # (the original only closed it on the success path).
        connection.close()
if __name__ == "__main__":
    # Command-line arguments, without the script name itself.
    argv = sys.argv[1:]
    try:
        opts, args = getopt.getopt(argv, 'u:y:o:', [])
    except getopt.GetoptError:
        # Unknown option or missing option value: show usage instead of a traceback.
        print('Use: getpost.py -u <url> -y <year,year> -o <output>')
        sys.exit(2)
    # Require at least two options (the original checked len==0 or len<2,
    # where the first test is redundant).
    if len(opts) < 2:
        print('Use: getpost.py -u <url> -y <year,year> -o <output>')
        sys.exit(2)
    url = 'www.retroavengers.com.br'
    years = []
    output = "posts.txt"
    for opt, arg in opts:
        if opt == "-u":
            url = arg
        elif opt == "-y":
            if arg.find("-") > -1:
                # "2010-2015" -> every year in the inclusive range.
                # maxsplit=1 (the original passed 2) so exactly two parts unpack.
                begin, end = arg.split("-", 1)
                years = ["{}".format(year) for year in range(int(begin), int(end) + 1)]
            else:
                # "2010,2012" -> explicit comma-separated list of years.
                years = arg.split(",")
        elif opt == "-o":
            output = arg
    main(url, years, output)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment