Last active
January 1, 2026 15:49
-
-
Save micro-tiger/cf67e7458730f03f881040e39d3ba1d3 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import os | |
| import re | |
| import random | |
| import string | |
| import urllib.request | |
| import time | |
| import logging | |
class FolderCreator:
    """Create a folder that stores the pictures and the modified files.

    The folder is created inside ``location``; the default ``"."`` means
    the current working directory.
    """

    def __init__(self, location="."):
        self.location = location

    def create_folder(self, name):
        """Create the folder ``name`` under ``self.location`` if it is missing.

        The resulting path is kept in ``self.folder``.  Missing parent
        folders are created as well, and an already existing folder is
        left untouched.

        Args:
            name: Folder name (or relative sub-path) to create.

        Raises:
            FileExistsError: If the path exists but is not a directory.
            OSError: If the folder cannot be created.
        """
        self.name = name
        self.folder = self.location + "/" + self.name
        # exist_ok=True replaces the separate "exists" check, so there is no
        # window in which another process can create the folder in between.
        os.makedirs(self.folder, exist_ok=True)
class FileWritter:
    """Write the content of each modified file into a target folder."""

    def write_file(self, folder_path, filename, filedata):
        """Write ``filedata`` to the file ``filename`` inside ``folder_path``.

        The file is written as UTF-8 text and overwritten if it exists.

        Args:
            folder_path: Existing folder that receives the file.
            filename: Name of the file to create.
            filedata: Text content to write.

        Raises:
            OSError: If the file cannot be opened or written.
        """
        self.folder_path = folder_path
        self.filename = filename
        self.filedata = filedata
        # os.path.join picks the right separator for the platform; a literal
        # backslash would become part of the file name outside Windows.
        target = os.path.join(self.folder_path, self.filename)
        with open(target, "w", encoding="utf-8") as file:
            file.write(self.filedata)
class ImgDownloader:
    """Download the images found in the markdown files to a destination folder.

    A user-agent can be specified in order to circumvent simple connection
    blocks applied by some image sources.
    """

    def download_images(self, url_dict, folder_path, user_agent, max_delay=2):
        """Download every URL in ``url_dict`` into ``folder_path``.

        A failed download is logged and does not stop the remaining ones.

        Args:
            url_dict: Mapping of image URL -> local file name.
            folder_path: Existing folder that receives the images.
            user_agent: Value sent as the ``User-agent`` request header.
            max_delay: Upper bound, in whole seconds, of the random pause
                made after each download attempt (``0`` disables the pause).
        """
        self.url_dict = url_dict
        self.folder_path = folder_path
        self.user_agent = user_agent
        if not self.url_dict:
            return

        # The opener does not depend on the URL, so it is installed once
        # instead of once per image.
        opener = urllib.request.build_opener()
        opener.addheaders = [('User-agent', self.user_agent)]
        urllib.request.install_opener(opener)

        for url, name in self.url_dict.items():
            # os.path.join instead of a literal backslash, which would end up
            # inside the file name on non-Windows systems.
            save_name = os.path.join(self.folder_path, name)
            try:
                urllib.request.urlretrieve(url, save_name)
            except Exception:
                # Deliberately broad: one bad link must not abort the batch.
                logging.exception(f"Error when downloading {url}")
            if max_delay > 0:
                # Random pause between requests to go easy on the servers.
                time.sleep(random.randint(0, max_delay))
class FileOpener:
    """Open a file and return its content."""

    def open_and_read(self, filename):
        """Read ``filename`` as UTF-8 text and return its content.

        A relative ``filename`` is resolved against the current working
        directory.

        Args:
            filename: Name or path of the file to read.

        Returns:
            The file content as a string, or ``None`` when the file cannot
            be opened or decoded (the error is logged).
        """
        self.url_dict = {}
        self.filename = filename
        path = os.path.join(os.getcwd(), filename)
        try:
            # A local handle is used so no closed file object stays attached
            # to the instance after the read.
            with open(path, "r", encoding="utf-8") as opened_file:
                print(f"\nOpened file: {self.filename}")
                logging.info(f"Opened file: {self.filename}\n")
                return opened_file.read()
        except (OSError, UnicodeError):
            logging.exception(f"Error when opening file {self.filename}")
            return None
class UrlDictCreator:
    """Collect the image URLs of a markdown file and name their local copies.

    Every distinct URL matched by the regex becomes a key of the returned
    dictionary.  Its value is the local file name
    ``<markdown file name without extension>_<index>.<image extension>``,
    where spaces in the markdown name are replaced with underscores and the
    index counts the distinct URLs found so far.
    """

    def create(self, regex, file_data, file_name):
        """Build the URL -> local file name dictionary for one file.

        Args:
            regex: Pattern matching a whole image URL; its group 1 must
                capture the file extension without the dot.
            file_data: Text of the markdown file, or ``None`` when the file
                could not be read.
            file_name: Name of the markdown file the text comes from.

        Returns:
            Dictionary mapping each distinct URL to its local file name.
            It is empty when there is nothing to scan, and holds only the
            entries collected so far when the search fails (the error is
            logged).
        """
        self.file_name = file_name
        self.url_dict = {}
        self.regex = regex
        self.file_data = file_data
        # Nothing to scan; skipping the search avoids logging a traceback
        # for a file that simply has no content.
        if self.file_data is not None:
            try:
                # splitext drops the real extension, whatever its length,
                # instead of blindly cutting the last three characters.
                stem = os.path.splitext(file_name)[0].replace(" ", "_")
                for match in re.finditer(self.regex, self.file_data):
                    url = match.group(0)  # Full URL
                    file_extension = match.group(1)  # File extension
                    # File name taken from the URL (up to the last dot);
                    # only used by the debug output below.
                    file_name_without_ext = url.split('/')[-1].rsplit('.', 1)[0]
                    print(f"URL: {url}, Имя файла: {file_name_without_ext}, Расширение: .{file_extension}")  # Debug output
                    # Store each URL once, so repeated links share one file.
                    if url not in self.url_dict:
                        self.url_dict[url] = f"{stem}_{len(self.url_dict)}.{file_extension}"
            except (re.error, IndexError, TypeError) as e:
                logging.exception("Ошибка при попытке найти URL и добавить их в словарь: %s", e)
        print(self.url_dict)
        return self.url_dict
class FileDataEditor:
    """Point the image links of a markdown text to the local image files."""

    def edit(self, file_data, url_dict, file_name):
        """Replace every URL of ``url_dict`` found in ``file_data``.

        Args:
            file_data: Text of the markdown file.
            url_dict: Mapping of image URL -> local file name.
            file_name: Name of the markdown file (used in the messages only).

        Returns:
            The text with each URL replaced by its local file name.
            ``file_data`` is returned unchanged when ``url_dict`` is empty.
        """
        self.file_name = file_name
        self.url_dict = url_dict
        self.file_data = file_data
        # Longest URLs first: when one URL is a prefix of another, replacing
        # the short one first would rewrite part of the long one and leave a
        # broken link behind.
        ordered = sorted(url_dict.items(), key=lambda item: len(item[0]), reverse=True)
        for key, value in ordered:
            self.file_data = self.file_data.replace(key, value)
            print(f"\nreplaced: {key}\nwith {value}\n on file {self.file_name}\n")
            logging.info(f"replaced: {key}\nwith: {value}\non file: {self.file_name}\n")
        return self.file_data
# Program start:
print("\nStarting..")

# Create new log file (overwritten on every run)
log_file_name = 'Img_To_Local_Python.log'
logging.basicConfig(filename=log_file_name, encoding='utf-8', filemode="w", level=logging.DEBUG)

# Defines the folder to write the new markdown files and the downloaded images.
# os.path.join is used because the former "\/" separators put a literal
# backslash into the path, which only Windows normalises away.
folder_name = "md_images"
folder_path = os.path.abspath(os.path.join(os.getcwd(), folder_name))

# Create new folder to receive the downloaded imgs and edited MD files
folder_creator = FolderCreator()
folder_creator.create_folder(folder_name)
logging.info(f"New folder created: {folder_path}\n")
print(f"New folder created: {folder_path}")
logging.info("to receive the imgs and edited markdown files\n")
print("to receive the imgs and edited markdown files\n")

# Regex that will be used to look for url's of images.
# Group 1 captures the image extension. It matches links such as:
# ![](https://pic1.png "abc")
# ![](https://pic1.jpeg)
# ![](https://pic1.jpeg "abc")
regex = r'(?<=\]\()https?:\/\/[^\s"()]+\.(png|jpg|jpeg|gif|bmp|svg)(?=\s*["\)]|$)'

# User-agent sent with every image request
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.3"

# The helpers do not depend on the file being processed, so one instance of
# each is shared by the whole loop.
file_opener = FileOpener()
url_dict_creator = UrlDictCreator()
file_data_editor = FileDataEditor()
images_downloader = ImgDownloader()
file_name_writter = FileWritter()

# Loop through every markdown file in the current working directory
for filename in os.listdir(os.getcwd()):
    print("\n")
    if not filename.endswith(".md"):
        logging.info(f"Skipped file: {filename}\n")
        print(f"Skipped file: {filename}")
        continue

    # Open and read each file
    file_data = file_opener.open_and_read(filename)
    if file_data is None:
        # The file could not be read; the opener has already logged why.
        continue

    # Create a dictionary of images URLs for each file
    url_dict = url_dict_creator.create(regex, file_data, filename)

    # Edit the read content of each file, replacing the found imgs urls with local file names instead
    edited_file_data = file_data_editor.edit(file_data, url_dict, filename)

    # Download the images listed on the dictionary of found urls for each file
    images_downloader.download_images(url_dict, folder_path, user_agent)

    # Write the modified markdown file (only when it contained image links)
    if url_dict:
        file_name_writter.write_file(folder_path, filename, edited_file_data)

    print(f"Closed file: {filename}")
    logging.info(f"Closed file: {filename}\n")

print("\n\n\nIf everything went OK, you can check your modified markdown")
print("files and the downloaded images on the folder:")
print(f"{folder_path}")
# Point at the log file that logging.basicConfig actually writes.
print(f"\nFor more info check the log file on \n{os.path.join(os.getcwd(), log_file_name)}")
print("\nPress enter to close")
input()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment