Extracts dividend (dividend yield) information for Fundos Imobiliários (Brazilian REITs) from the fiis.com.br website. The list of FIIs is obtained through integration with a Google Spreadsheets sheet.
"""
For execution it needs the installation of scrapy, gspread, google-cloud-storage and google-oauth and the creation of the
environment variable GOOGLE_APPLICATION_CREDENTIALS with the path to the service account file
"""
import json
import re

import gspread
import scrapy
from google.cloud import storage
from google.oauth2.service_account import Credentials
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

SPREADSHEET_ID = '<google-spreadsheets-id>'
SPREADSHEET_TICKETS_TAB = 'Carteira'
SPREADSHEET_DY_TAB = 'Dividendos'
SHEETS_API_SCOPE = ['https://spreadsheets.google.com/feeds',
                    'https://www.googleapis.com/auth/drive']
HEADER_ROW = 2
TICKERS_COLUMN = 1
FII_CODE_COLUMN = 'Ticker'
DY_DATE_COLUMN = 'Data'
DY_VALUE_COLUMN = 'Dividendo/Cota'

storage_client = storage.Client()
# Accumulates the dividend data collected by the spider, keyed by FII ticker.
fiis = {}

def add_fii_dy_data(fii_code, dy_value, dy_base_date):
    fiis[fii_code] = {
        'value': dy_value,
        'date': dy_base_date
    }
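
# Illustrative shape of the accumulated data (ticker and values are made-up examples):
#   fiis == {'hglg11': {'value': '1,10', 'date': '31/12/2021'}}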

def _get_auth_key():
    sa_name = '<service-account-name>.json'
    sa_bucket = '<storage-bucket-which-contains-creds>'
    bucket = storage_client.get_bucket(sa_bucket)
    # Scan the bucket objects until the service account file is found, then load it as JSON.
    for blob in bucket.list_blobs():
        if blob.name == sa_name:
            creds = blob.download_as_string()
            return json.loads(creds)
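
# A local-file alternative sketch, assuming the key sits on disk rather than in GCS:
#   credentials = Credentials.from_service_account_file(
#       '/path/to/service-account.json', scopes=SHEETS_API_SCOPE)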

class SpreadsheetIntegration:
    def __init__(self):
        creds_key_json = _get_auth_key()
        credentials = Credentials.from_service_account_info(
            creds_key_json,
            scopes=SHEETS_API_SCOPE
        )
        self.set_credentials(credentials)

    def set_credentials(self, credentials):
        self.gc = gspread.authorize(credentials)

    def update_spreadsheet(self, worksheet_name=SPREADSHEET_DY_TAB):
        print('update spreadsheet')
        worksheet = self.get_worksheet(worksheet_name)
        # _get_available_spots is referenced here but not defined in this gist.
        first_row, last_row = _get_available_spots(worksheet)

    def get_worksheet(self, worksheet_name):
        return self.gc.open_by_key(SPREADSHEET_ID).worksheet(worksheet_name)

def _get_spreadsheet_columns_to_update(main_worksheet, current_month):
    # Scan the first header rows for the column whose header matches the current month.
    for header_cell in main_worksheet.range(1, 1, 3, main_worksheet.col_count):
        if header_cell.value.lower() == current_month:
            update_column = header_cell.col
            print('update column found: ' + str(update_column))
            return update_column


def _get_spreadsheet_row_to_update(category_column, detected_category, worksheet):
    # Scan the given column for the row whose value matches the detected category.
    for category_cell in worksheet.range(1, category_column, worksheet.row_count, category_column):
        if category_cell.value == detected_category:
            update_row = category_cell.row
            print('update row found: ' + str(update_row))
            return update_row
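
# A hedged sketch of how these helpers could serve the pending DY update (see the TODO
# in check_dividend_yield). update_cell(row, col, value) is a standard gspread call;
# current_month and the variable names here are assumptions:
#   import datetime
#   current_month = datetime.date.today().strftime('%B').lower()  # locale-dependent name
#   col = _get_spreadsheet_columns_to_update(worksheet, current_month)
#   row = _get_spreadsheet_row_to_update(TICKERS_COLUMN, fii_code, worksheet)
#   worksheet.update_cell(row, col, dy_value)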

class FiisSpider(scrapy.Spider):
    name = 'fiis'
    allowed_domains = ['fiis.com.br']

    def __init__(self, fii='fii_code'):
        super(FiisSpider, self).__init__()
        # Trailing slash keeps the ticker as the second-to-last URL segment in parse().
        self.start_urls = ['https://fiis.com.br/%s/' % fii]

    def parse(self, response):
        print('processing: ' + response.url)
        dy_registry = 'Rendimento no valor de'
        dy_registry_date = 'Data base:'
        fii_code = response.url.split('/')[-2]
        extracted_value = ''
        extracted_ref_date = ''
        div_id = 'news--wrapper'
        fii_updates = response.xpath('//*[@id="%s"]//ul//li//a//*[@class="title"]/text()' % div_id).extract()
        for element in fii_updates:
            if element.startswith(dy_registry):
                # Captures the amount from e.g. 'Rendimento no valor de R$ 1,10 ...'
                extracted_value = re.search(dy_registry + r'\sR\$\s(\d?,?\d+|)\s', element).group(1)
                print(extracted_value)
            elif element.startswith(dy_registry_date):
                # Captures the dd/mm/yyyy date from e.g. 'Data base: 31/12/2021 ...'
                extracted_ref_date = re.search(dy_registry_date + r'\s(\d{2}\/\d{2}\/\d{4})\s', element).group(1)
                print(extracted_ref_date)
            elif extracted_ref_date and extracted_value:
                break
        add_fii_dy_data(fii_code, extracted_value, extracted_ref_date)

    @staticmethod
    def start():
        # logging.getLogger('scrapy').propagate = False
        settings = get_project_settings()
        process = CrawlerProcess(settings)
        process.crawl(FiisSpider)
        process.start()
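
# Single-ticker usage sketch ('hglg11' is a made-up example argument):
#   process = CrawlerProcess(get_project_settings())
#   process.crawl(FiisSpider, fii='hglg11')
#   process.start()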

spreadsheets = SpreadsheetIntegration()


def check_dividend_yield(fiis_list=None):
    if not fiis_list:
        tickets_worksheet = spreadsheets.get_worksheet(SPREADSHEET_TICKETS_TAB)
        # FII tickers end in '11' (e.g. HGLG11), which filters out other assets in the tab.
        fiis_list = [ticker for ticker in tickets_worksheet.col_values(TICKERS_COLUMN) if re.search(r'\w+11', ticker)]
        print(fiis_list)
    # check month
    settings = get_project_settings()
    process = CrawlerProcess(settings)
    for fii in fiis_list:
        process.crawl(FiisSpider, fii=fii)
    process.start()
    for k, v in fiis.items():
        print('FII: {} => R$ {} em {}'.format(k, v['value'], v['date']))
    # TODO: integrate with google spreadsheets to update DY
    # spreadsheets = SpreadsheetIntegration()
    # spreadsheets.update_spreadsheet()


if __name__ == "__main__":
    check_dividend_yield()
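
# Usage sketch: run the script directly, or pass an explicit list (example tickers assumed):
#   check_dividend_yield(['hglg11', 'knri11'])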