Skip to content

Instantly share code, notes, and snippets.

@Sarverott
Created March 10, 2026 00:04
Show Gist options
  • Select an option

  • Save Sarverott/0316cd20dc99e2cac3989b72a78fe43e to your computer and use it in GitHub Desktop.

Select an option

Save Sarverott/0316cd20dc99e2cac3989b72a78fe43e to your computer and use it in GitHub Desktop.
Sharing SJP as a Hugging Face dataset || upowszechnianie SJP przez dataset na Hugging Face - https://sjp.pl/sl/odmiany/ ; https://huggingface.co/datasets/Apokryf/SJP
# Publish the sjp.pl download page and its zip archives as a Hugging Face dataset.
#
# Working in the current directory, the script:
#   1. fetches the page at ``sjpUrl`` and stores it converted to Markdown,
#   2. stores every hyperlink found on that page in ``links.toml``,
#   3. downloads and unpacks each hyperlink whose target contains ``.zip``,
#   4. logs in and uploads the whole directory to the ``Apokryf/SJP`` dataset.
#
# Any failed HTTP request raises ``requests.HTTPError`` and stops the script
# before anything is uploaded.
import urllib.parse
import zipfile  # the archive code below uses ``zipfile``; ``zipinfo`` was never referenced

import bs4
import markdownify
import requests
import toml
from huggingface_hub import login, upload_folder

sjpUrl = "https://sjp.pl/sl/odmiany/"
# Seconds to wait on the server; without a timeout a stalled connection hangs forever.
REQUEST_TIMEOUT = 60

sourceserver = requests.get(sjpUrl, timeout=REQUEST_TIMEOUT)
# Fail loudly rather than archive an HTTP error page as if it were the data.
sourceserver.raise_for_status()

# Explicit UTF-8: the platform default encoding may not cover every character.
with open("./sjp.pl_sl_odmiany.md", "w", encoding="utf-8") as markdown_file:
    markdown_file.write(markdownify.markdownify(sourceserver.text))

# Naming the parser keeps the result independent of which optional parsers
# happen to be installed (and silences bs4's "no parser specified" warning).
webpage = bs4.BeautifulSoup(sourceserver.text, "html.parser")

# Maps link label -> href exactly as written in the page (possibly relative).
links = {}
for anchor in webpage.find_all("a"):
    href = anchor.get("href")
    if href is None:
        # An anchor without a target has nothing to record or download.
        continue
    # ``.string`` is None when the anchor wraps nested tags; fall back to the
    # visible text and finally to the href so such anchors do not all collapse
    # onto a single None key.
    label = anchor.string or anchor.get_text(strip=True) or href
    links[str(label)] = href

with open("./links.toml", "w", encoding="utf-8") as links_file:
    links_file.write(toml.dumps(links))

# Iterate the targets (values), not the labels (keys): only the href is a URL.
for href in links.values():
    if ".zip" not in href:
        continue

    # hrefs may be relative to the page, so resolve them before requesting.
    archive_url = urllib.parse.urljoin(sjpUrl, href)

    # stream=True lets iter_content() write the body piecewise instead of
    # buffering the whole archive in memory first.
    with requests.get(
        archive_url, stream=True, timeout=REQUEST_TIMEOUT
    ) as sjpDownload:
        sjpDownload.raise_for_status()
        # NOTE: every archive is saved under the same name, so with several
        # .zip links only the last download remains as ./sjp.zip (the
        # extracted members of all of them are kept).
        with open("./sjp.zip", "wb") as fd:
            for chunk in sjpDownload.iter_content(chunk_size=64 * 1024):
                fd.write(chunk)

    with zipfile.ZipFile("./sjp.zip") as archive:
        # Report the real archive members; constructing ZipInfo("./sjp.zip")
        # would only create an empty placeholder entry.
        for member in archive.infolist():
            print(member)
        archive.extractall()

login()
upload_folder(
    folder_path=".",
    repo_id="Apokryf/SJP",
    repo_type="dataset",
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment