Skip to content

Instantly share code, notes, and snippets.

@vpnry
Last active March 26, 2023 17:06
Show Gist options
  • Select an option

  • Save vpnry/b433182b84711ae9c33ed0256429723e to your computer and use it in GitHub Desktop.

Select an option

Save vpnry/b433182b84711ae9c33ed0256429723e to your computer and use it in GitHub Desktop.
Node.js script that wraps words in an HTML file with a specified tag, and a Python script that splits an HTML file into smaller parts
/**
* Node.js script that wraps every word in a tag.
* npm install cheerio
* Generated with the help of ChatGPT :)
**/
const fs = require("fs");
const path = require("path");
const cheerio = require("cheerio");
// Directory that wrapWord() scans for ".html" files to process.
const inputDirectoryPath = "_1_split_";
// Directory the processed files are written to (created by wrapWord() if missing).
const outputDirectoryPath = "_2_done_wrap_";
// Name of the element each word is wrapped in, i.e. <w>word</w>.
const myTagName = "w";
// Number of files written so far; incremented by wrapWord() after each write.
let count = 0;
// Entry point. wrapWord is a function declaration, so it is hoisted and can
// be called here before its definition further down the file.
wrapWord();
/**
 * Escape the characters that are significant inside HTML text content.
 *
 * @param {string} text - Plain (already entity-decoded) text.
 * @returns {string} Text that is safe to insert as HTML markup.
 */
function escapeHtmlText(text) {
  return text
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;");
}

/**
 * Wrap every run of non-whitespace characters in `<tagName>…</tagName>`.
 *
 * Whitespace between words is kept exactly as it was, so no empty tags are
 * produced for leading/trailing whitespace, and each word is HTML-escaped so
 * that text such as "<b>" stays text when the result is parsed as markup.
 *
 * @param {string} text - Plain text of a single text node.
 * @param {string} tagName - Name of the wrapping element.
 * @returns {string} HTML markup with each word wrapped.
 */
function wrapTextInTag(text, tagName) {
  return text.replace(
    /\S+/g,
    (word) => `<${tagName}>${escapeHtmlText(word)}</${tagName}>`
  );
}

/**
 * Wrap every word of every ".html" file in `inputDirectoryPath` in a
 * `<myTagName>` element and write the result, under the same file name, to
 * `outputDirectoryPath`.
 *
 * Text inside <title>, <script> and <style> is left untouched. Progress is
 * logged per file and `count` is incremented for each file written. Any
 * error (listing, reading, parsing or writing) aborts the run and is logged.
 */
function wrapWord() {
  try {
    const files = fs.readdirSync(inputDirectoryPath);
    // Create output directory if it doesn't exist
    fs.mkdirSync(outputDirectoryPath, { recursive: true });

    for (const file of files) {
      if (path.extname(file) !== ".html") {
        continue;
      }
      const inputFilePath = path.join(inputDirectoryPath, file);
      const outputFilePath = path.join(outputDirectoryPath, file);
      const data = fs.readFileSync(inputFilePath, "utf-8");
      // You may want to comment this
      // data = replaceSourceHTML(data);
      const $ = cheerio.load(data);

      $("*")
        .contents()
        .filter(function () {
          // Keep only text nodes that contain at least one visible character.
          return this.nodeType === 3 && /\S/.test(this.nodeValue);
        })
        .each(function () {
          const textNode = $(this);
          // Wrapping would corrupt the page title, JavaScript source or CSS.
          if (textNode.parent().is("title, script, style")) {
            return;
          }
          // .text() yields decoded text, so the helper re-escapes each word
          // before the string is parsed back into markup by replaceWith().
          textNode.replaceWith(wrapTextInTag(textNode.text(), myTagName));
        });

      // Add script before </body>
      // $("body").append('<script src="data/done_script.js"></script>');
      fs.writeFileSync(outputFilePath, $.html(), "utf-8");
      count++;
      console.log(`File ${count}. ${outputFilePath} updated successfully.`);
    }

    console.log(
      `Processed ${count} file(s) in directory ${inputDirectoryPath}.`
    );
  } catch (err) {
    console.error(`Error reading directory: ${err}`);
  }
}
function replaceSourceHTML(data) {
// Some replacements
data = data.replace(
"</head>",
'<link rel="stylesheet" type="text/css" href="web/style_unicode.css">\n</head>'
);
data = data.replace(
"<body>",
'<body>\n<div id="pnrydict" style="display:none;"></div>'
);
data = data.replace(
"</body>",
'<script src="dictionary/dict_script.js"></script>\n</body>\n'
);
return data;
}
import os
import re
# Inline <script> snippet that split_html_file() inserts just before </head>,
# right after the <link id="pageStyle"> element it adds. In the browser it
# reads the "tptkMmLastDarkDayMode" key from localStorage (defaulting to
# "day") and, when the value is "dark", points the "pageStyle" link at the
# night stylesheet.
script = '''
<script>
(function applyDarkMode() {
// Retrieve the user's preference from localStorage, if available
const tptkMmLastDarkDayMode =
localStorage.getItem("tptkMmLastDarkDayMode") || "day";
// If the user's preference is set to "dark", apply the dark mode CSS file
if (tptkMmLastDarkDayMode === "dark") {
document
.getElementById("pageStyle")
.setAttribute("href", "web/style_night_unicode.css");
}
})();
</script>
'''
def get_title(head):
    """Build the heading markup for a page from its <title> element.

    The <title> tag is matched case-insensitively, may carry attributes, and
    its content may span several lines.

    Args:
        head: The part of the HTML document that precedes the body.

    Returns:
        An ``<h2>`` heading followed by ``<hr>`` and a newline containing the
        title text, or an empty string (after printing a notice) when no
        <title> element is found.
    """
    match = re.search(
        r'<title(?:\s[^>]*)?>(.*?)</title\s*>',
        head,
        re.IGNORECASE | re.DOTALL,  # DOTALL lets the title span lines
    )
    if match is None:
        print("Title not found in HTML")
        return ''
    title = match.group(1)
    return f'<h2 style="color: brown;">{title}</h2><hr>\n'
def split_html_file(input_file_path, output_dir_path):
    """Split one HTML file into numbered part files.

    The body is cut after each ``</p>`` that is followed by a newline and the
    paragraphs are grouped into chunks of ``len(paragraphs) // 20 + 1``. Each
    chunk is written to ``<output_dir_path>/<basename>_NN.html`` together
    with the original head (extended with the ``pageStyle`` stylesheet link
    and the module-level ``script`` snippet), the heading from
    ``get_title``, the navigation from ``generate_html_links`` and the
    dictionary script tag.

    Text after the last ``</p>`` is written as-is instead of getting an
    extra ``</p>`` appended, and is dropped when it is only whitespace. A
    file with an empty body still produces one part.

    Args:
        input_file_path: Path of the HTML file to split.
        output_dir_path: Existing directory that receives the part files.

    Returns:
        None. A file without a literal ``<body>`` tag is reported and skipped.
    """
    with open(input_file_path, 'r', encoding='utf-8') as input_file:
        html = input_file.read()

    parts = html.split('<body>')
    if len(parts) < 2:
        # Previously this surfaced as an IndexError that aborted the whole run.
        print(f'Skipping {input_file_path}: no <body> tag found')
        return

    head = parts[0]
    title = get_title(head)
    head = head.replace(
        "</head>", f'<link id="pageStyle" rel="stylesheet" type="text/css" href="web/style_unicode.css">\n{script}</head>')
    body = parts[1].split('</body>')[0]

    # Every piece except the last was terminated by '</p>\n'; the last one is
    # whatever follows the final paragraph and must not be given a '</p>'.
    pieces = body.split('</p>\n')
    trailing = pieces.pop()
    paragraphs = [f'{piece}</p>\n' for piece in pieces]
    if trailing.strip():
        paragraphs.append(trailing if trailing.endswith('\n') else trailing + '\n')

    chunk_size = len(paragraphs) // 20 + 1
    # Fall back to a single empty chunk so that part 01 always exists.
    chunks = [paragraphs[i:i + chunk_size]
              for i in range(0, len(paragraphs), chunk_size)] or [[]]
    total_chunks = len(chunks)
    base_name = os.path.basename(input_file_path)

    for part_number, chunk in enumerate(chunks, start=1):
        output_file_name = os.path.join(
            output_dir_path, base_name) + f'_{part_number:02d}.html'
        with open(output_file_name, 'w', encoding='utf-8') as output_file:
            output_file.write(
                f'{head}<body>\n<div id="pnrydict" style="display:none;"></div>\n{title}<h3><a href="index.html">\u2638 Home</a> Part {part_number:02d} of {total_chunks:02d} [{base_name}]</h3>\n<h3>Links {generate_html_links(base_name, part_number, total_chunks)}</h3>\n\n')
            output_file.writelines(chunk)
            output_file.write(
                '<script src="dictionary/dict_script.js"></script>\n</body>\n</html>')
def generate_html_links(input_basename, n, total_chunks):
    """Return the navigation links for all parts of one split file.

    Args:
        input_basename: Base file name the part files are derived from.
        n: 1-based number of the current part; its link is shown bold,
            brown and in square brackets.
        total_chunks: Total number of parts.

    Returns:
        The anchors for parts 1..total_chunks joined by single spaces (an
        empty string when ``total_chunks`` is 0).
    """
    def render(part):
        # Highlight the part the reader is currently on.
        if part == n:
            return f'<a href="{input_basename}_{part:02d}.html"><b style="color:brown;">[{part:02d}]</b></a>'
        return f'<a href="{input_basename}_{part:02d}.html"> {part:02d}</a>'

    return ' '.join(render(part) for part in range(1, total_chunks + 1))
# Script entry point: split every ".html" file found in 'Books' into parts
# under '_1_split_', then hand over to the Node.js word-wrapping script.
# NOTE(review): indentation was lost in this copy of the file; the nesting
# described in the comments below is the presumed one - confirm.
if __name__ == '__main__':
input_dir_path = 'Books'
output_dir_path = '_1_split_'
# Make sure the output directory exists before any part file is written.
os.makedirs(output_dir_path, exist_ok=True)
# Running count of processed files, used only for the progress message.
n = 0
# Sorted so files are processed in a stable, alphabetical order.
for file_name in sorted(os.listdir(input_dir_path)):
if file_name.endswith('.html'):
n += 1
print(f'{n} === Processing: {file_name}')
input_file_path = os.path.join(input_dir_path, file_name)
split_html_file(input_file_path, output_dir_path)
# Presumably runs once after the loop (TODO confirm); the command's exit
# status is not checked.
os.system('node nodejs_wrap_word_with_tag.js')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment