vpnry/nodejs_wrap_word_with_tag.js

## nodejs_wrap_word_with_tag.js
/**
 * Node.js script that wraps all words in tags.
 * npm install cheerio
 * Generated with the help of ChatGPT :)
 **/

const fs = require("fs");
const path = require("path");
const cheerio = require("cheerio");

// Directory that is scanned for the .html files to process.
const inputDirectoryPath = "_1_split_";
// Directory the processed copies are written to (created by wrapWord if missing).
const outputDirectoryPath = "_2_done_wrap_";
// Name of the element that is wrapped around every word.
const myTagName = "w";
// Running total of files written; incremented by wrapWord.
let count = 0;

// Run the conversion as soon as the script is loaded.
wrapWord();

/**
 * Wraps every word of every `.html` file found in `inputDirectoryPath` in a
 * `<myTagName>` element and writes the result, under the same file name, to
 * `outputDirectoryPath` (created if it does not exist yet).
 *
 * Text that sits directly inside <title>, <script> or <style> is left
 * untouched, because wrapping it would corrupt the page title, the JavaScript
 * or the CSS. `count` is incremented once per file written.
 *
 * Any error aborts the run, is reported on stderr and makes the process exit
 * with a non-zero status so that callers can notice the failure.
 */
function wrapWord() {
  try {
    const files = fs.readdirSync(inputDirectoryPath);

    // `recursive` turns this into a no-op when the directory already exists.
    fs.mkdirSync(outputDirectoryPath, { recursive: true });

    for (const file of files) {
      if (path.extname(file) !== ".html") {
        continue;
      }

      const inputFilePath = path.join(inputDirectoryPath, file);
      const outputFilePath = path.join(outputDirectoryPath, file);

      let data = fs.readFileSync(inputFilePath, "utf-8");

      // Optional pre-processing hook; enable it to inject the extra tags first.
      // data = replaceSourceHTML(data);

      const $ = cheerio.load(data);

      $("*")
        .contents()
        .filter(function () {
          // Keep only text nodes that contain at least one visible character.
          return this.nodeType === 3 && /\S/.test(this.nodeValue);
        })
        .each(function () {
          if ($(this).parent().is("title, script, style")) {
            return;
          }
          $(this).replaceWith(wrapTextInTags($(this).text(), myTagName));
        });

      // Add script before </body>
      // $("body").append('<script src="data/done_script.js"></script>');
      fs.writeFileSync(outputFilePath, $.html(), "utf-8");
      count++;
      console.log(`File ${count}. ${outputFilePath} updated successfully.`);
    }

    console.log(
      `Processed ${count} file(s) in directory ${inputDirectoryPath}.`
    );
  } catch (err) {
    // The failure may come from reading, parsing or writing, not only from
    // listing the directory, so the message stays generic.
    console.error(`Error while wrapping words: ${err}`);
    process.exitCode = 1;
  }
}

/**
 * Escapes the characters that would be parsed as markup when decoded text is
 * inserted back into the document as an HTML string.
 *
 * @param {string} text - Decoded text taken from a text node.
 * @returns {string} The text with `&`, `<` and `>` replaced by entities.
 */
function escapeHtmlText(text) {
  return text
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;");
}

/**
 * Wraps every run of non-whitespace characters in `<tagName>…</tagName>`.
 *
 * Whitespace before, between and after the words is kept exactly as it was,
 * so leading or trailing whitespace never produces an empty wrapper.
 *
 * @param {string} text - Decoded text of one text node.
 * @param {string} tagName - Name of the wrapping element.
 * @returns {string} HTML string with each word escaped and wrapped.
 */
function wrapTextInTags(text, tagName) {
  return text.replace(
    /\S+/g,
    (word) => `<${tagName}>${escapeHtmlText(word)}</${tagName}>`
  );
}

/**
 * Inserts a stylesheet link, a hidden container and a script tag into an HTML
 * document.
 *
 * Only the first occurrence of each marker is rewritten, in this order:
 *   1. `</head>`  - the stylesheet link is placed in front of it,
 *   2. `<body>`   - the hidden `pnrydict` container is placed after it,
 *   3. `</body>`  - the dictionary script is placed in front of it.
 *
 * @param {string} data - HTML source text.
 * @returns {string} The HTML with the three snippets inserted.
 */
function replaceSourceHTML(data) {
  const substitutions = [
    [
      "</head>",
      '<link rel="stylesheet" type="text/css" href="web/style_unicode.css">\n</head>',
    ],
    ["<body>", '<body>\n<div id="pnrydict" style="display:none;"></div>'],
    [
      "</body>",
      '<script src="dictionary/dict_script.js"></script>\n</body>\n',
    ],
  ];

  return substitutions.reduce(
    (html, [marker, replacement]) => html.replace(marker, replacement),
    data
  );
}

## split_html_smaller_files.py
import os
import re

# Inline <script> that split_html_file() inserts in front of </head> of every
# generated page. It reads "tptkMmLastDarkDayMode" from localStorage (falling
# back to "day") and, when the value is "dark", points the element with
# id "pageStyle" at the night stylesheet.
script = '''
<script>
(function applyDarkMode() {
    // Retrieve the user's preference from localStorage, if available
    const tptkMmLastDarkDayMode =
    localStorage.getItem("tptkMmLastDarkDayMode") || "day";
    // If the user's preference is set to "dark", apply the dark mode CSS file
    if (tptkMmLastDarkDayMode === "dark") {
    document
        .getElementById("pageStyle")
        .setAttribute("href", "web/style_night_unicode.css");
    }
})();
</script>
'''


def get_title(head):
    """Build the page heading from the <title> element found in ``head``.

    The tag is matched case-insensitively, may carry attributes and may span
    several lines; runs of whitespace inside the title are collapsed to single
    spaces so a wrapped title still renders as one heading line.

    Args:
        head: HTML text expected to contain the document's <title> element.

    Returns:
        The title wrapped in a brown <h2> followed by <hr> and a newline, or an
        empty string (after printing a notice) when no title element is found.
    """
    match = re.search(r'<title\b[^>]*>(.*?)</title>', head,
                      re.IGNORECASE | re.DOTALL)
    if match is None:
        print("Title not found in HTML")
        return ''

    title = ' '.join(match.group(1).split())
    return f'<h2 style="color: brown;">{title}</h2><hr>\n'


def split_html_file(input_file_path, output_dir_path):
    """Split one HTML book into at most 20 linked part files.

    The document is cut at its <body> tag. The head part receives the
    "pageStyle" stylesheet link and the dark-mode ``script`` in front of
    </head>; the body is cut after every '</p>' + newline and the resulting
    paragraphs are distributed over the parts. Each part is written to
    ``output_dir_path`` as '<input file name>_NN.html' and consists of the
    head, a heading with the title, navigation links to all parts, the
    paragraphs of the part and the dictionary script.

    Args:
        input_file_path: Path of the UTF-8 encoded HTML file to split.
        output_dir_path: Existing directory that receives the part files.

    Returns:
        None. A file without a <body> tag is reported and skipped.
    """
    with open(input_file_path, 'r', encoding='utf-8') as input_file:
        html = input_file.read()

    # Tolerate attributes on <body>; the generated pages always use a bare <body>.
    parts = re.split(r'<body\b[^>]*>', html, maxsplit=1)
    if len(parts) < 2:
        print(f'No <body> tag found in {input_file_path}; file skipped')
        return

    head, rest = parts
    title = get_title(head)
    head = head.replace(
        "</head>", f'<link id="pageStyle" rel="stylesheet" type="text/css" href="web/style_unicode.css">\n{script}</head>')

    body = rest.split('</body>')[0]

    # Every piece except the last was terminated by '</p>\n' in the source, so
    # only those get the terminator back. The last piece is whatever followed
    # the final paragraph: it must not receive a stray '</p>', and when it is
    # only whitespace it must not become a paragraph (or a part) of its own.
    *closed, tail = body.split('</p>\n')
    paragraphs = [piece + '</p>\n' for piece in closed]
    if tail.strip():
        paragraphs.append(tail if tail.endswith('\n') else tail + '\n')

    chunk_size = len(paragraphs) // 20 + 1
    # An empty body still yields one (empty) part so the book keeps a page.
    chunks = [paragraphs[i:i + chunk_size]
              for i in range(0, len(paragraphs), chunk_size)] or [[]]
    total_chunks = len(chunks)
    base_name = os.path.basename(input_file_path)

    for part_number, chunk in enumerate(chunks, start=1):
        output_file_name = os.path.join(
            output_dir_path, base_name) + f'_{part_number:02d}.html'
        links = generate_html_links(base_name, part_number, total_chunks)
        with open(output_file_name, 'w', encoding='utf-8') as output_file:
            output_file.write(
                f'{head}<body>\n<div id="pnrydict" style="display:none;"></div>\n'
                f'{title}<h3><a href="index.html">\u2638 Home</a> '
                f'Part {part_number:02d} of {total_chunks:02d} [{base_name}]</h3>\n'
                f'<h3>Links {links}</h3>\n\n')
            output_file.writelines(chunk)
            output_file.write(
                '<script src="dictionary/dict_script.js"></script>\n</body>\n</html>')


def generate_html_links(input_basename, n, total_chunks):
    """Return the space-separated navigation links to all parts of a book.

    Args:
        input_basename: File name the part files are derived from; part ``i``
            is linked as '<input_basename>_<i as two digits>.html'.
        n: Number of the current part; its link label is shown bold, brown and
            in square brackets.
        total_chunks: Number of parts; links are produced for 1..total_chunks.

    Returns:
        The <a> elements joined by single spaces ('' when there are no parts).
    """
    def render(part):
        number = f'{part:02d}'
        if part == n:
            label = f'<b style="color:brown;">[{number}]</b>'
        else:
            label = f' {number}'
        return f'<a href="{input_basename}_{number}.html">{label}</a>'

    return ' '.join(render(part) for part in range(1, total_chunks + 1))


if __name__ == '__main__':
    # Pipeline: split every .html book in Books/ into part files under
    # _1_split_/, then let the Node.js script wrap the words of those parts.
    input_dir_path = 'Books'
    output_dir_path = '_1_split_'
    os.makedirs(output_dir_path, exist_ok=True)
    n = 0
    for file_name in sorted(os.listdir(input_dir_path)):
        if file_name.endswith('.html'):
            n += 1
            print(f'{n} === Processing: {file_name}')
            input_file_path = os.path.join(input_dir_path, file_name)
            split_html_file(input_file_path, output_dir_path)

    # os.system() reports a failed command only through its return value, so
    # check it instead of finishing as if the wrapping step had succeeded.
    exit_status = os.system('node nodejs_wrap_word_with_tag.js')
    if exit_status != 0:
        print(f'node nodejs_wrap_word_with_tag.js failed (status {exit_status})')
        raise SystemExit(1)
	/**
	* Node.js script that wraps all words in tags.
	* npm install cheerio
	* Generated with the help of ChatGPT :)
	**/

	const fs = require("fs");
	const path = require("path");
	const cheerio = require("cheerio");

	// Directory that is scanned for the .html files to process.
	const inputDirectoryPath = "_1_split_";
	// Directory the processed copies are written to (created by wrapWord if missing).
	const outputDirectoryPath = "_2_done_wrap_";
	// Name of the element that is wrapped around every word.
	const myTagName = "w";
	// Running total of files written; incremented by wrapWord.
	let count = 0;

	// Run the conversion as soon as the script is loaded.
	wrapWord();

	/**
	 * Wraps every word of every `.html` file found in `inputDirectoryPath` in a
	 * `<myTagName>` element and writes the result, under the same file name, to
	 * `outputDirectoryPath` (created if it does not exist yet).
	 *
	 * Text that sits directly inside <title>, <script> or <style> is left
	 * untouched, because wrapping it would corrupt the page title, the JavaScript
	 * or the CSS. `count` is incremented once per file written.
	 *
	 * Any error aborts the run, is reported on stderr and makes the process exit
	 * with a non-zero status so that callers can notice the failure.
	 */
	function wrapWord() {
	  try {
	    const files = fs.readdirSync(inputDirectoryPath);

	    // `recursive` turns this into a no-op when the directory already exists.
	    fs.mkdirSync(outputDirectoryPath, { recursive: true });

	    for (const file of files) {
	      if (path.extname(file) !== ".html") {
	        continue;
	      }

	      const inputFilePath = path.join(inputDirectoryPath, file);
	      const outputFilePath = path.join(outputDirectoryPath, file);

	      let data = fs.readFileSync(inputFilePath, "utf-8");

	      // Optional pre-processing hook; enable it to inject the extra tags first.
	      // data = replaceSourceHTML(data);

	      const $ = cheerio.load(data);

	      $("*")
	        .contents()
	        .filter(function () {
	          // Keep only text nodes that contain at least one visible character.
	          return this.nodeType === 3 && /\S/.test(this.nodeValue);
	        })
	        .each(function () {
	          if ($(this).parent().is("title, script, style")) {
	            return;
	          }
	          $(this).replaceWith(wrapTextInTags($(this).text(), myTagName));
	        });

	      // Add script before </body>
	      // $("body").append('<script src="data/done_script.js"></script>');
	      fs.writeFileSync(outputFilePath, $.html(), "utf-8");
	      count++;
	      console.log(`File ${count}. ${outputFilePath} updated successfully.`);
	    }

	    console.log(
	      `Processed ${count} file(s) in directory ${inputDirectoryPath}.`
	    );
	  } catch (err) {
	    // The failure may come from reading, parsing or writing, not only from
	    // listing the directory, so the message stays generic.
	    console.error(`Error while wrapping words: ${err}`);
	    process.exitCode = 1;
	  }
	}

	/**
	 * Escapes the characters that would be parsed as markup when decoded text is
	 * inserted back into the document as an HTML string.
	 *
	 * @param {string} text - Decoded text taken from a text node.
	 * @returns {string} The text with `&`, `<` and `>` replaced by entities.
	 */
	function escapeHtmlText(text) {
	  return text
	    .replace(/&/g, "&amp;")
	    .replace(/</g, "&lt;")
	    .replace(/>/g, "&gt;");
	}

	/**
	 * Wraps every run of non-whitespace characters in `<tagName>…</tagName>`.
	 *
	 * Whitespace before, between and after the words is kept exactly as it was,
	 * so leading or trailing whitespace never produces an empty wrapper.
	 *
	 * @param {string} text - Decoded text of one text node.
	 * @param {string} tagName - Name of the wrapping element.
	 * @returns {string} HTML string with each word escaped and wrapped.
	 */
	function wrapTextInTags(text, tagName) {
	  return text.replace(
	    /\S+/g,
	    (word) => `<${tagName}>${escapeHtmlText(word)}</${tagName}>`
	  );
	}

	/**
	 * Inserts a stylesheet link, a hidden container and a script tag into an HTML
	 * document.
	 *
	 * Only the first occurrence of each marker is rewritten, in this order:
	 *   1. `</head>`  - the stylesheet link is placed in front of it,
	 *   2. `<body>`   - the hidden `pnrydict` container is placed after it,
	 *   3. `</body>`  - the dictionary script is placed in front of it.
	 *
	 * @param {string} data - HTML source text.
	 * @returns {string} The HTML with the three snippets inserted.
	 */
	function replaceSourceHTML(data) {
	  const substitutions = [
	    [
	      "</head>",
	      '<link rel="stylesheet" type="text/css" href="web/style_unicode.css">\n</head>',
	    ],
	    ["<body>", '<body>\n<div id="pnrydict" style="display:none;"></div>'],
	    [
	      "</body>",
	      '<script src="dictionary/dict_script.js"></script>\n</body>\n',
	    ],
	  ];

	  return substitutions.reduce(
	    (html, [marker, replacement]) => html.replace(marker, replacement),
	    data
	  );
	}
	import os
	import re

	# Inline <script> that split_html_file() inserts in front of </head> of every
	# generated page. It reads "tptkMmLastDarkDayMode" from localStorage (falling
	# back to "day") and, when the value is "dark", points the element with
	# id "pageStyle" at the night stylesheet.
	# The fallback operator must be a plain "||": a backslash-escaped form is
	# both an invalid Python escape sequence and a syntax error in the browser.
	script = '''
	<script>
	(function applyDarkMode() {
	// Retrieve the user's preference from localStorage, if available
	const tptkMmLastDarkDayMode =
	localStorage.getItem("tptkMmLastDarkDayMode") || "day";
	// If the user's preference is set to "dark", apply the dark mode CSS file
	if (tptkMmLastDarkDayMode === "dark") {
	document
	.getElementById("pageStyle")
	.setAttribute("href", "web/style_night_unicode.css");
	}
	})();
	</script>
	'''


	def get_title(head):
	    """Build the page heading from the <title> element found in ``head``.

	    The tag is matched case-insensitively, may carry attributes and may span
	    several lines; runs of whitespace inside the title are collapsed to single
	    spaces so a wrapped title still renders as one heading line.

	    Args:
	        head: HTML text expected to contain the document's <title> element.

	    Returns:
	        The title wrapped in a brown <h2> followed by <hr> and a newline, or an
	        empty string (after printing a notice) when no title element is found.
	    """
	    match = re.search(r'<title\b[^>]*>(.*?)</title>', head,
	                      re.IGNORECASE | re.DOTALL)
	    if match is None:
	        print("Title not found in HTML")
	        return ''

	    title = ' '.join(match.group(1).split())
	    return f'<h2 style="color: brown;">{title}</h2><hr>\n'


	def split_html_file(input_file_path, output_dir_path):
	    """Split one HTML book into at most 20 linked part files.

	    The document is cut at its <body> tag. The head part receives the
	    "pageStyle" stylesheet link and the dark-mode ``script`` in front of
	    </head>; the body is cut after every '</p>' + newline and the resulting
	    paragraphs are distributed over the parts. Each part is written to
	    ``output_dir_path`` as '<input file name>_NN.html' and consists of the
	    head, a heading with the title, navigation links to all parts, the
	    paragraphs of the part and the dictionary script.

	    Args:
	        input_file_path: Path of the UTF-8 encoded HTML file to split.
	        output_dir_path: Existing directory that receives the part files.

	    Returns:
	        None. A file without a <body> tag is reported and skipped.
	    """
	    with open(input_file_path, 'r', encoding='utf-8') as input_file:
	        html = input_file.read()

	    # Tolerate attributes on <body>; the generated pages always use a bare <body>.
	    parts = re.split(r'<body\b[^>]*>', html, maxsplit=1)
	    if len(parts) < 2:
	        print(f'No <body> tag found in {input_file_path}; file skipped')
	        return

	    head, rest = parts
	    title = get_title(head)
	    head = head.replace(
	        "</head>", f'<link id="pageStyle" rel="stylesheet" type="text/css" href="web/style_unicode.css">\n{script}</head>')

	    body = rest.split('</body>')[0]

	    # Every piece except the last was terminated by '</p>\n' in the source, so
	    # only those get the terminator back. The last piece is whatever followed
	    # the final paragraph: it must not receive a stray '</p>', and when it is
	    # only whitespace it must not become a paragraph (or a part) of its own.
	    *closed, tail = body.split('</p>\n')
	    paragraphs = [piece + '</p>\n' for piece in closed]
	    if tail.strip():
	        paragraphs.append(tail if tail.endswith('\n') else tail + '\n')

	    chunk_size = len(paragraphs) // 20 + 1
	    # An empty body still yields one (empty) part so the book keeps a page.
	    chunks = [paragraphs[i:i + chunk_size]
	              for i in range(0, len(paragraphs), chunk_size)] or [[]]
	    total_chunks = len(chunks)
	    base_name = os.path.basename(input_file_path)

	    for part_number, chunk in enumerate(chunks, start=1):
	        output_file_name = os.path.join(
	            output_dir_path, base_name) + f'_{part_number:02d}.html'
	        links = generate_html_links(base_name, part_number, total_chunks)
	        with open(output_file_name, 'w', encoding='utf-8') as output_file:
	            output_file.write(
	                f'{head}<body>\n<div id="pnrydict" style="display:none;"></div>\n'
	                f'{title}<h3><a href="index.html">\u2638 Home</a> '
	                f'Part {part_number:02d} of {total_chunks:02d} [{base_name}]</h3>\n'
	                f'<h3>Links {links}</h3>\n\n')
	            output_file.writelines(chunk)
	            output_file.write(
	                '<script src="dictionary/dict_script.js"></script>\n</body>\n</html>')


	def generate_html_links(input_basename, n, total_chunks):
	    """Return the space-separated navigation links to all parts of a book.

	    Args:
	        input_basename: File name the part files are derived from; part ``i``
	            is linked as '<input_basename>_<i as two digits>.html'.
	        n: Number of the current part; its link label is shown bold, brown and
	            in square brackets.
	        total_chunks: Number of parts; links are produced for 1..total_chunks.

	    Returns:
	        The <a> elements joined by single spaces ('' when there are no parts).
	    """
	    def render(part):
	        number = f'{part:02d}'
	        if part == n:
	            label = f'<b style="color:brown;">[{number}]</b>'
	        else:
	            label = f' {number}'
	        return f'<a href="{input_basename}_{number}.html">{label}</a>'

	    return ' '.join(render(part) for part in range(1, total_chunks + 1))


	if __name__ == '__main__':
	    # Pipeline: split every .html book in Books/ into part files under
	    # _1_split_/, then let the Node.js script wrap the words of those parts.
	    input_dir_path = 'Books'
	    output_dir_path = '_1_split_'
	    os.makedirs(output_dir_path, exist_ok=True)
	    n = 0
	    for file_name in sorted(os.listdir(input_dir_path)):
	        if file_name.endswith('.html'):
	            n += 1
	            print(f'{n} === Processing: {file_name}')
	            input_file_path = os.path.join(input_dir_path, file_name)
	            split_html_file(input_file_path, output_dir_path)

	    # os.system() reports a failed command only through its return value, so
	    # check it instead of finishing as if the wrapping step had succeeded.
	    exit_status = os.system('node nodejs_wrap_word_with_tag.js')
	    if exit_status != 0:
	        print(f'node nodejs_wrap_word_with_tag.js failed (status {exit_status})')
	        raise SystemExit(1)