Last active
March 26, 2023 17:06
-
-
Save vpnry/b433182b84711ae9c33ed0256429723e to your computer and use it in GitHub Desktop.
Node.js script that wraps the words in an HTML file with a specified tag, and a Python script that splits an HTML file into smaller parts
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /** | |
| * NodeJS script that wraps all words in tags. | |
| * npm install cheerio | |
| * Generated with the help of ChatGPT :) | |
| **/ | |
| const fs = require("fs"); | |
| const path = require("path"); | |
| const cheerio = require("cheerio"); | |
// Directory that wrapWord() scans for the .html files to process.
const inputDirectoryPath = "_1_split_";
// Directory that receives the processed copies (same file names as the input).
const outputDirectoryPath = "_2_done_wrap_";
// Element name placed around every word, e.g. <w>word</w>.
const myTagName = "w";
// Number of files written so far; incremented by wrapWord() after each file.
let count = 0;

// wrapWord is a function declaration, so it is hoisted and callable here.
wrapWord();
/**
 * Wraps every word of every `.html` file found in `inputDirectoryPath` in a
 * `<myTagName>` element and writes the result, under the same file name, to
 * `outputDirectoryPath` (created when missing).
 *
 * Text inside <title>, <script>, <style> and <textarea> is left untouched
 * because markup inserted there would corrupt the title, the code, the CSS or
 * the form text. Whitespace between words is preserved exactly as it was.
 *
 * Increments the module-level `count` once per written file. Any failure is
 * logged and reported through a non-zero process exit code.
 */
function wrapWord() {
  // Parents whose text content must never receive wrapper elements.
  const skippedParents = "title, script, style, textarea";

  try {
    const files = fs.readdirSync(inputDirectoryPath);

    // Create output directory if it doesn't exist
    fs.mkdirSync(outputDirectoryPath, { recursive: true });

    for (const file of files) {
      if (path.extname(file) !== ".html") {
        continue;
      }

      const inputFilePath = path.join(inputDirectoryPath, file);
      const outputFilePath = path.join(outputDirectoryPath, file);
      const data = fs.readFileSync(inputFilePath, "utf-8");

      // You may want to enable this
      // data = replaceSourceHTML(data);  (requires `let data` above)

      const $ = cheerio.load(data);

      // cheerio binds `this` to the current node, hence function expressions.
      $("*")
        .contents()
        .filter(function () {
          // nodeType 3 === text node; ignore whitespace-only text.
          return this.nodeType === 3 && /\S/.test(this.nodeValue);
        })
        .each(function () {
          const textNode = $(this);
          if (textNode.parent().is(skippedParents)) {
            return; // continue with the next text node
          }
          textNode.replaceWith(wrapTextWords(textNode.text(), myTagName));
        });

      // Add script before </body>
      // $("body").append('<script src="data/done_script.js"></script>');

      fs.writeFileSync(outputFilePath, $.html(), "utf-8");
      count++;
      console.log(`File ${count}. ${outputFilePath} updated successfully.`);
    }

    console.log(
      `Processed ${count} file(s) in directory ${inputDirectoryPath}.`
    );
  } catch (err) {
    // The failure may come from reading, parsing or writing, not only from
    // listing the directory, so the message stays generic.
    console.error(`Error while wrapping words: ${err}`);
    // Let callers (e.g. a parent process) detect that the run failed.
    process.exitCode = 1;
  }
}

/**
 * Escapes the characters that an HTML parser would treat as markup.
 *
 * @param {string} text - Decoded text content.
 * @returns {string} Text that is safe to embed between tags.
 */
function escapeHtmlText(text) {
  return text
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;");
}

/**
 * Wraps each whitespace-delimited word of `text` in `<tagName>…</tagName>`.
 *
 * The whitespace runs are kept verbatim (the capturing group makes `split`
 * return them), so leading/trailing blanks and newlines survive and no empty
 * wrapper elements are produced.
 *
 * @param {string} text - Decoded content of a text node.
 * @param {string} tagName - Element name used for the wrapper.
 * @returns {string} HTML fragment with every word wrapped and escaped.
 */
function wrapTextWords(text, tagName) {
  return text
    .split(/(\s+)/)
    .map((part) =>
      part === "" || /^\s+$/.test(part)
        ? part
        : `<${tagName}>${escapeHtmlText(part)}</${tagName}>`
    )
    .join("");
}
/**
 * Injects the dictionary assets into a raw HTML document:
 *  - a stylesheet link right before the first closing head tag,
 *  - the hidden `#pnrydict` container right after the first opening body tag,
 *  - the dictionary script right before the first closing body tag.
 *
 * Tags are matched case-insensitively and the opening body tag may carry
 * attributes; the matched tag text itself is kept unchanged. A tag that is
 * not present is simply skipped.
 *
 * @param {string} data - HTML source text.
 * @returns {string} The HTML with the three snippets inserted.
 */
function replaceSourceHTML(data) {
  const stylesheetLink =
    '<link rel="stylesheet" type="text/css" href="web/style_unicode.css">';
  const dictContainer = '<div id="pnrydict" style="display:none;"></div>';
  const dictScript = '<script src="dictionary/dict_script.js"></script>';

  // Replacer functions are used so the matched tag is re-emitted verbatim and
  // no `$` sequence in the replacement can be interpreted specially.
  return data
    .replace(/<\/head\s*>/i, (tag) => `${stylesheetLink}\n${tag}`)
    .replace(/<body(?:\s[^>]*)?>/i, (tag) => `${tag}\n${dictContainer}`)
    .replace(/<\/body\s*>/i, (tag) => `${dictScript}\n${tag}\n`);
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import os | |
| import re | |
# Inline <script> block that split_html_file() inserts into the <head> of each
# generated page. In the browser it reads "tptkMmLastDarkDayMode" from
# localStorage (falling back to "day") and, when the value is "dark", points
# the element with id "pageStyle" at web/style_night_unicode.css.
script = '''
<script>
(function applyDarkMode() {
// Retrieve the user's preference from localStorage, if available
const tptkMmLastDarkDayMode =
localStorage.getItem("tptkMmLastDarkDayMode") || "day";
// If the user's preference is set to "dark", apply the dark mode CSS file
if (tptkMmLastDarkDayMode === "dark") {
document
.getElementById("pageStyle")
.setAttribute("href", "web/style_night_unicode.css");
}
})();
</script>
'''
def get_title(head):
    """Build the page heading markup from the ``<title>`` element in ``head``.

    The title element is matched case-insensitively, may carry attributes and
    may span several lines; runs of whitespace in the title text are collapsed
    to single spaces.

    Args:
        head: HTML text expected to contain a ``<title>...</title>`` element.

    Returns:
        An ``<h2>`` heading followed by ``<hr>`` and a newline, or an empty
        string (after printing a notice) when no title element is found.
    """
    match = re.search(
        r'<title\b[^>]*>(.*?)</title\s*>', head, re.IGNORECASE | re.DOTALL)
    if match is None:
        print("Title not found in HTML")
        return ''
    # Collapse newlines/indentation of multi-line titles into single spaces.
    title = ' '.join(match.group(1).split())
    return f'<h2 style="color: brown;">{title}</h2><hr>\n'
def split_html_file(input_file_path, output_dir_path):
    """Split one HTML file into numbered part files inside ``output_dir_path``.

    The document is cut at its opening body tag. Everything before it becomes
    the shared head of every part (with the ``pageStyle`` stylesheet link and
    the module-level ``script`` snippet inserted before ``</head>``). The body
    is divided at ``'</p>\\n'`` boundaries into chunks of
    ``len(paragraphs) // 20 + 1`` paragraphs each. Every part is written as
    ``<basename>_NN.html`` and starts with the title heading, a home link and
    links to all parts.

    Args:
        input_file_path: Path of the HTML file to split (read as UTF-8).
        output_dir_path: Existing directory that receives the part files.

    Raises:
        ValueError: If the file contains no opening ``<body>`` tag.
    """
    with open(input_file_path, 'r', encoding='utf-8') as input_file:
        html = input_file.read()

    # Accept <body> with attributes or in upper case instead of requiring the
    # exact literal '<body>'.
    body_open = re.search(r'<body\b[^>]*>', html, re.IGNORECASE)
    if body_open is None:
        raise ValueError(f'No <body> tag found in {input_file_path}')
    body_tag = body_open.group(0)

    head = html[:body_open.start()]
    title = get_title(head)
    head = head.replace(
        "</head>", f'<link id="pageStyle" rel="stylesheet" type="text/css" href="web/style_unicode.css">\n{script}</head>')

    body = re.split(r'</body\s*>', html[body_open.end():],
                    maxsplit=1, flags=re.IGNORECASE)[0]

    # Every piece except the last was terminated by '</p>\n', so only those get
    # the delimiter back. The last piece is whatever follows the final
    # paragraph; it is kept as-is and dropped when it is only whitespace.
    pieces = body.split('</p>\n')
    paragraphs = [piece + '</p>\n' for piece in pieces[:-1]]
    if pieces[-1].strip():
        paragraphs.append(pieces[-1])

    chunk_size = len(paragraphs) // 20 + 1
    # An empty body still yields one (empty) part so the file has an output.
    chunks = [paragraphs[i:i + chunk_size]
              for i in range(0, len(paragraphs), chunk_size)] or [[]]
    total_chunks = len(chunks)

    base_name = os.path.basename(input_file_path)
    for part_number, chunk in enumerate(chunks, start=1):
        output_file_name = os.path.join(
            output_dir_path, base_name) + f'_{part_number:02d}.html'
        with open(output_file_name, 'w', encoding='utf-8') as output_file:
            output_file.write(
                f'{head}{body_tag}\n<div id="pnrydict" style="display:none;"></div>\n{title}<h3><a href="index.html">\u2638 Home</a> Part {part_number:02d} of {total_chunks:02d} [{base_name}]</h3>\n<h3>Links {generate_html_links(base_name, part_number, total_chunks)}</h3>\n\n')
            output_file.writelines(chunk)
            output_file.write(
                '<script src="dictionary/dict_script.js"></script>\n</body>\n</html>')
def generate_html_links(input_basename, n, total_chunks):
    """Return space-separated links to all parts of a split document.

    Args:
        input_basename: File name the part files are derived from; part ``i``
            is linked as ``<input_basename>_<i, two digits>.html``.
        n: Number of the current part; its link is shown bold, brown and in
            square brackets.
        total_chunks: Total number of parts (links are made for 1..total).

    Returns:
        The anchors joined by single spaces ('' when ``total_chunks`` < 1).
    """
    def anchor(part):
        label = f'{part:02d}'
        href = f'{input_basename}_{label}.html'
        if part == n:
            return f'<a href="{href}"><b style="color:brown;">[{label}]</b></a>'
        return f'<a href="{href}"> {label}</a>'

    return ' '.join(anchor(part) for part in range(1, total_chunks + 1))
if __name__ == '__main__':
    # Local imports: only this entry point needs them.
    import subprocess
    import sys

    input_dir_path = 'Books'
    output_dir_path = '_1_split_'
    os.makedirs(output_dir_path, exist_ok=True)

    # Step 1: split every .html file in input_dir_path (sorted by name).
    n = 0
    for file_name in sorted(os.listdir(input_dir_path)):
        if file_name.endswith('.html'):
            n += 1
            print(f'{n} === Processing: {file_name}')
            input_file_path = os.path.join(input_dir_path, file_name)
            split_html_file(input_file_path, output_dir_path)

    # Step 2: run the Node.js word-wrapping script on the split files and
    # surface a failure instead of silently ignoring its exit status.
    wrap_command = ['node', 'nodejs_wrap_word_with_tag.js']
    try:
        wrap_status = subprocess.run(wrap_command, check=False).returncode
    except OSError as err:
        # Raised e.g. when the node executable cannot be started.
        print(f'Could not run {" ".join(wrap_command)}: {err}', file=sys.stderr)
        sys.exit(1)
    if wrap_status != 0:
        print(f'{" ".join(wrap_command)} exited with status {wrap_status}',
              file=sys.stderr)
        sys.exit(wrap_status)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment