Last active
March 30, 2024 23:35
-
-
Save Jaakkonen/cadc8ff66be362d0731846df5db610a4 to your computer and use it in GitHub Desktop.
Nordea Finnish machine format (html) bank statement parser - Nordea konekielisen tiliotteen (HTML) lukija Python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
| Parser for Finnish Nordea html format bank statements. | |
| File names are commonly in the name format: | |
| Konekielinen-tiliote(YY-MM-DD).html | |
| Yields categorized rows with deposited/withdrawn sums, reference numbers, messages, dates, etc. extracted with regexes. | |
| Access the rows with the rows dictionary using the RowType enum as the key, and get the properties of the row with the groupdict() method of the match object. | |
| Example: | |
| >>> rows[RowType.Deposit][0].match.groupdict() | |
| {'arkistotunnus': 'GSIPC000000000000', | |
| 'maksup': '01.02.', | |
| 'arvop': '01.02.', | |
| 'counterparty': 'MEIKÄLÄINEN MATTI', | |
| 'maksutyyppi': '710 Pano', | |
| 'viesti': 'Maksun viesti\nNOTPROVIDED\nOPF00000000.0', | |
| 'viite': None, | |
| 'viitetarkenne': None, | |
| 'tapahtumanro': '4', | |
| 'summa': '69,42+'} | |
| """ | |
| from typing import Any, Generator, NamedTuple | |
| from bs4 import BeautifulSoup, ResultSet | |
| from enum import Enum | |
| from collections import defaultdict | |
| import re | |
| from decimal import Decimal | |
| from datetime import datetime | |
| with open("Konekielinen-tiliote(24-03-12).html") as f: | |
| soup = BeautifulSoup(f, "html.parser") | |
| def to_decimal(s: str) -> Decimal: | |
| # If there's a - or + sign at the end, move it to the front | |
| if s[-1] in "+-": | |
| s = s[-1] + s[:-1] | |
| # Remove thousands separator and replace comma with dot | |
| s = s.replace(".", "").replace(",", ".") | |
| return Decimal(s) | |
| # Cells are separated by 📁 emoji | |
| CELL_SEPARATOR = BOX = "📁" | |
| # Text between cells is separated by ↩ emoji | |
| TAG_SEPARATOR = CARD_INDEX = "↩" | |
| class RowType(Enum): | |
| NordeaHeader = r"^↩Nordea↩\n↩1544 NBC FI Customers📁TILIOTE↩Päivämäärä↩(?P<raporttipvm>[\d\.]{10})📁Sivu\s+(?P<page>\d+)↩Kausi↩(?P<rangestart>[\d\.]{10}) -\n\s+(?P<rangeend>[\d\.]{10})↩(?P<moyear>\d{3} / \d{4})$" | |
| AccountHeader1 = r"^↩(?P<accountname>[A-Z\s-]+)📁(?P<accounttype>\w+)↩(?P<accountnumber>[\d-]+)📁Valuutta↩EUR$" | |
| AccountHeader2 = r"^📁IBAN↩(?P<iban>FI[\d ]{20})📁SWIFT/BIC↩NDEAFIHH↩$" | |
| TableHeader = r"^Arkistointitunnus↩Saajan tilinumero📁Maksup↩Arvop📁Saaja / Maksaja↩Viesti📁Tap.↩nro📁Määrä$" | |
| BalanceRow = r"^📁📁(?P<pvm>[\d\.]{10}) Saldo📁📁(?P<saldo>[\d\.,\+-]+)$" | |
| RegisteredDate = r"^Kirjauspäivä (?P<pvm>[\d\.]{6})📁📁📁📁$" | |
| # These are breakdown rows for Nordea banking fees. The main row has the archiving number and the total sum. | |
| # To avoid double counting, these should not be included in the final sums. | |
| # Note that the only difference is that the "arvop" group is missing. Additionally the value has a class in the <td> tag in the HTML but it's not visible in the text. | |
| NordeaPaymentBreakdown = r"J ↩(?P<arkistotunnus>[A-Z\d]+)↩📁(?P<maksup>[\d\.]+)📁/J↩(?P<counterparty>[^↩]+)↩(?P<viesti>[^↩]+?)\s*↩(?P<aika>[^↩]+)↩📁(?P<tapahtumanro>\d+)📁(?P<summa>[\d\.]+,\d{2}(-|\+))" | |
| Withdrawal = r"(A |JE|J )?↩(?P<arkistotunnus>[A-Z\d]+)↩(\s(?P<viite>[\d-]+))?📁(?P<maksup>[\d\.]+)↩(?P<arvop>[\d\.]+)📁/(J|A)↩(?P<counterparty>[^↩]+)↩(?P<maksutyyppi>\d+ [\w-]+)\s+[^↩]+↩(?P<viesti>[^↩]+?)\s*↩📁(?P<tapahtumanro>\d+)📁(?P<summa>[\d\.]+,\d{2}-)" | |
| Deposit = r"(A )?↩(?P<arkistotunnus>[A-Z\d]+)↩📁(?P<maksup>[\d\.]+)↩(?P<arvop>[\d\.]+)📁/J↩(?P<counterparty>[^↩]+)↩(?P<maksutyyppi>\d+ [\w-]+)\s*↩((?P<viesti>[^↩]+?)\s*|Viite (?P<viite>[\d ]+?)\s+↩(?P<viitetarkenne>\w+\n\w+)\s+)↩📁(?P<tapahtumanro>\d+)📁(?P<summa>[\d\.]+,\d{2}\+)" | |
| SaldoRow = r"📁📁(?P<pvm>[\d\.]{10}) Saldo↩Käyttövara📁📁(?P<saldo>[\d\.,\+]+)↩(?P<kayttovara>[\d\.,\+]+)" | |
| PeriodWithdrawalDeposit = r"^📁📁(?P<panot>\d+)↩(?P<otot>\d+)↩Panot(?P<period_deposit>(/kausi|/kk|/vv| yhteensä))↩Otot(?P<period_withdrawal>(/kausi|/kk|/vv| yhteensä))📁📁(?P<panot_summa>[\d\.,\+]+)↩(?P<otot_summa>([\d\.,-]+|0,00\+))$" | |
| NewLine = r"^↩\xa0↩📁📁📁📁$" | |
| regexes = {rowtype: re.compile(rowtype.value) for rowtype in RowType} | |
| class Row(NamedTuple): | |
| rowtype: RowType | |
| text: str | |
| cells: ResultSet | |
| match: re.Match[str] | |
| def extractrows(soup) -> Generator[Row, Any, None]: | |
| """ | |
| Yields rows from the table containing payment details as a list of cells. | |
| """ | |
| t = soup.findAll("table") | |
| for table in t: | |
| rows = table.findAll("tr") | |
| for row in rows: | |
| cells = row.findAll(["td", "th"]) | |
| if cells: | |
| astext = "📁".join(cell.get_text("↩").strip() for cell in cells) | |
| for rowtype in RowType: | |
| if match := re.match(regexes[rowtype], astext): | |
| yield Row(rowtype, astext, cells, match) | |
| break | |
| else: | |
| raise ValueError(f"Unknown row: {astext}") | |
| rows: dict[RowType, list[Row]] = defaultdict(list) | |
| for row in extractrows(soup): | |
| rows[row.rowtype].append(row) | |
| for rowtype, rowdata in rows.items(): | |
| print("#################################") | |
| print(rowtype) | |
| print("#################################") | |
| for row in rowdata: | |
| print(row.match.groupdict()) | |
| print() | |
| # def sanity_check(): | |
| # """ | |
| # Validates the data in the rows | |
| # """ | |
| # # Validate the PeriodWithdrawalDeposit rows | |
| # for rowtype, text, cells, match in rows[RowType.PeriodWithdrawalDeposit]: | |
| # data = match.groupdict() | |
| # assert ( | |
| # data["period_deposit"] == data["period_withdrawal"] | |
| # ), "Period deposit and withdrawal should match" | |
| # if data["period_deposit"] in {"/kk", "/vv"}: | |
| # # The withdrawal and deposit counts are not counted for monthly and yearly periods | |
| # # (Those are only counted for each "period" ) | |
| # assert data["panot"] == data["otot"] == "0" | |
| # vv_total = next( | |
| # ( | |
| # match.groupdict() | |
| # for rowtype, text, cells, match in rows[RowType.PeriodWithdrawalDeposit] | |
| # if match.groupdict()["period_deposit"] == "/vv" | |
| # ), | |
| # None, | |
| # ) | |
| # assert vv_total is not None, "No yearly total found" | |
| # saldo_dates = { | |
| # datetime.strptime(match.groupdict()["pvm"], "%d.%m.%Y"): to_decimal( | |
| # match.groupdict()["saldo"] | |
| # ) | |
| # for rowtype, text, cells, match in [ | |
| # *rows[RowType.BalanceRow], | |
| # *rows[RowType.SaldoRow], | |
| # ] | |
| # } | |
| # START_SALDO_DATE = min(saldo_dates.keys()) | |
| # START_SALDO = saldo_dates[START_SALDO_DATE] | |
| # END_SALDO_DATE = max(saldo_dates.keys()) | |
| # END_SALDO = saldo_dates[END_SALDO_DATE] | |
| # # Sum up the total deposits and withdrawals | |
| # total_deposits = sum( | |
| # to_decimal(match.groupdict()["summa"]) | |
| # for rowtype, text, cells, match in rows[RowType.Deposit] | |
| # ) | |
| # total_withdrawals = sum( | |
| # to_decimal(match.groupdict()["summa"]) | |
| # for rowtype, text, cells, match in rows[RowType.Withdrawal] | |
| # ) | |
| # # Sum up "panot yhteensä" and "otot yhteensä" for the yearly period | |
| # sum_panot = sum( | |
| # to_decimal(match.groupdict()["panot_summa"]) | |
| # for rowtype, text, cells, match in rows[RowType.PeriodWithdrawalDeposit] | |
| # if match.groupdict()["period_deposit"] == " yhteensä" | |
| # ) | |
| # sum_otot = sum( | |
| # to_decimal(match.groupdict()["otot_summa"]) | |
| # for rowtype, text, cells, match in rows[RowType.PeriodWithdrawalDeposit] | |
| # if match.groupdict()["period_withdrawal"] == " yhteensä" | |
| # ) | |
| # sum_panot_krt = sum( | |
| # to_decimal(match.groupdict()["panot"]) | |
| # for rowtype, text, cells, match in rows[RowType.PeriodWithdrawalDeposit] | |
| # if match.groupdict()["period_deposit"] == " yhteensä" | |
| # ) | |
| # sum_otot_krt = sum( | |
| # to_decimal(match.groupdict()["otot"]) | |
| # for rowtype, text, cells, match in rows[RowType.PeriodWithdrawalDeposit] | |
| # if match.groupdict()["period_withdrawal"] == " yhteensä" | |
| # ) | |
| # withdrawals_by_month = defaultdict(list) | |
| # for row in rows[RowType.Withdrawal]: | |
| # data = row.match.groupdict() | |
| # withdrawals_by_month[datetime.strptime(data["maksup"], "%d.%m.").month].append( | |
| # row | |
| # ) | |
| # assert ( | |
| # sum(len(v) for v in withdrawals_by_month.values()) | |
| # == len(rows[RowType.Withdrawal]) | |
| # == sum_otot_krt | |
| # ) | |
| # assert len(rows[RowType.Deposit]) == sum_panot_krt | |
| # assert sum_panot == to_decimal(vv_total["panot_summa"]) == total_deposits | |
| # assert sum_otot == to_decimal(vv_total["otot_summa"]) == total_withdrawals | |
| # assert START_SALDO + total_deposits + total_withdrawals == END_SALDO | |
| # sanity_check() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment