Skip to content

Instantly share code, notes, and snippets.

@thekcsam
Created May 9, 2025 04:55
Show Gist options
  • Select an option

  • Save thekcsam/1439511df8ee99e94851523d094c8b20 to your computer and use it in GitHub Desktop.

Select an option

Save thekcsam/1439511df8ee99e94851523d094c8b20 to your computer and use it in GitHub Desktop.
# encoding: UTF-8
#
require 'caracal'
require 'find'
require 'json'
require 'pathname'
# PITAKA FOLDER NAMES
# Stored in NFC form; SUTTAPITAKA is compared against the NFC-normalised
# top-level folder name when the folder tree is parsed.
VINAYAPITAKA = 'Vinayapiṭaka'.unicode_normalize(:nfc).freeze
SUTTAPITAKA = '2.Suttapiṭaka'.unicode_normalize(:nfc).freeze
ABHIDHAMMAPIṬAKA = 'Abhidhammapiṭaka'.unicode_normalize(:nfc).freeze
ANNA_GANTHA = 'Añña Gantha'.unicode_normalize(:nfc).freeze

# LANGUAGE
# Labels used as keys of the per-folder language hash and as the bold
# headings inside the hyperlink cell.
CHINESE = 'Chinese Translation'.freeze
ENGLISH = 'English Translation'.freeze
PALI = 'Pali'.freeze
SINHALA_PALI = 'Sinhala Pali'.freeze

# Order in which the languages are listed inside a hyperlink cell.
# Frozen so the shared constant cannot be mutated by accident.
LANGUAGE_KEYS = [
  PALI,
  SINHALA_PALI,
  ENGLISH,
  CHINESE
].freeze

# Matches a (down-cased) name that starts with one of the language prefixes.
# The same pattern also recognises the language labels above once down-cased.
LANGUAGE_FOLDER_REGEXP = Regexp.new("^(chi|eng|pali|#{"pāli".unicode_normalize(:nfc)}|sinhala)").freeze

# REFERENCES
ATTHAKATHA_IN_PALI = 'aṭṭhakathā'.unicode_normalize(:nfc).freeze
ATTHAKATHA = "atthakatha".freeze
TIKA_IN_PALI = 'Tika'.unicode_normalize(:nfc).freeze
TIKA = 'tika'.freeze

# Any directory whose path matches this pattern is pruned from the scan.
IGNORED_FOLDER_NAME_REGEXP = /Complete[\s]?\(All\)/

# Folder that is scanned, and the document that is generated inside it.
ROOT_DIRECTORY = '../tipitaka-latest'
OUTPUT_FILE = '../tipitaka-latest/index.docx'
# Writes OUTPUT_FILE: for every pitaka a centred heading followed by one table
# and a page break.
#
# pitaka_structure - Hash of pitaka name => nested folder tree, as returned by
#                    parse_pitaka_structure.
def build_table(pitaka_structure)
  Caracal::Document.save(OUTPUT_FILE) do |docx|
    pitaka_structure.each do |pitaka_name, folder_tree|
      docx.h1 pitaka_name, align: :center, size: 50, bold: true

      # Sort on the first two columns so equal section names end up adjacent.
      sorted_rows = generate_rows(folder_tree).sort_by { |row| row.values_at(0, 1) }
      merge_spans = find_repeating_cell(sorted_rows)

      # Swap the plain section name for a styled cell; other columns pass through.
      rendered_rows = sorted_rows.map do |section, *remaining|
        [generate_section_cell(section), *remaining]
      end

      unless rendered_rows.empty?
        docx.table rendered_rows, border_size: 1 do
          # Merge the first-column cells of each run of identical section names.
          merge_spans.each do |first_row, span|
            cell_style rows[first_row][0], rowspan: span
          end
        end
      end

      docx.page
    end
  end
end
# Locates runs of consecutive rows that share the same first element, so the
# caller can merge those cells.
#
# rows - Array of rows; only `row.first` is inspected.
#
# Returns an Array of [start_index, run_length] pairs, one for every run that
# is longer than a single row.
def find_repeating_cell(rows)
  offset = 0
  rows.chunk_while { |upper, lower| upper.first == lower.first }
      .each_with_object([]) do |run, spans|
    spans << [offset, run.size] if run.size > 1
    offset += run.size
  end
end
# Flattens a nested folder tree into table rows.
#
# folder_tree - nested Hash; a node recognised by language_folder? is a leaf.
#               nil yields no rows.
# keys        - folder names on the path from the top of the tree to
#               folder_tree (used by the recursion).
#
# Returns an Array of rows [section, sub_path, hyperlink_cell], where section
# is the first key and sub_path is the remaining keys joined with '/'
# ('' when there are none).
def generate_rows(folder_tree, keys = [])
  return [] if folder_tree.nil?

  unless language_folder?(folder_tree)
    return folder_tree.flat_map { |key, value| generate_rows(value, keys + [key]) }
  end

  # One join covers every depth; an empty key path yields [nil, ''] instead of
  # raising on `[nil] + nil`.
  section, *sub_path = keys
  [[section, sub_path.join('/'), generate_hyperlink_cells(folder_tree)]]
end
# True when folder_tree is a Hash and at least one of its keys, down-cased,
# matches LANGUAGE_FOLDER_REGEXP.
def language_folder?(folder_tree)
  return false unless folder_tree.is_a?(Hash)

  folder_tree.each_key.any? { |name| LANGUAGE_FOLDER_REGEXP.match?(name.downcase) }
end
# Scans root_dir and builds the nested structure used to render the index.
#
# Expected layout:
#   Pitaka (e.g. Suttapitaka) > Nikaya (e.g. Dighanikaya) > Subcategory >
#   Sub-subcategory (e.g. Atthakatha/Writing) > Language (e.g. chi) > files
#
# Rules:
#   i)   The pitaka (first path component) becomes the top-level key.
#   ii)  Only paths whose second component (the nikaya) starts with a digit
#        are considered.
#   iii) A directory is recorded when it directly contains a language folder;
#        its value is the Hash returned by process_language_folders.
#   iv)  Inside the Sutta pitaka, everything below the nikaya is collapsed
#        into a single key joined with '/'.
#   v)   Directories matching IGNORED_FOLDER_NAME_REGEXP are pruned together
#        with everything below them.
#
# root_dir - directory to scan (with or without a trailing slash).
#
# Returns a Hash: pitaka name => nested Hash of folder names whose leaves are
# language Hashes.
def parse_pitaka_structure(root_dir)
  results = {}
  # root_dir followed by exactly one '/', whether or not the caller added one.
  root_prefix = File.join(root_dir, '')

  Find.find(root_dir) do |path|
    next unless File.directory?(path)
    # Skips the root entry itself when it was given without a trailing slash.
    next unless path.start_with?(root_prefix)

    # NOTE: don't normalize the path so the hyperlink works.
    # Strip only the leading root; a later path segment that happens to end
    # like the root must stay intact.
    parts = path.delete_prefix(root_prefix).split('/')
    next if parts.empty?

    Find.prune if path.match?(IGNORED_FOLDER_NAME_REGEXP)

    pitaka_name = parts.shift
    nikaya = parts.first
    next unless nikaya&.match?(/^\d+/)
    next unless contains_language_folder?(path)

    language_files = process_language_folders(path)

    # Literal comparison: SUTTAPITAKA contains a '.', which would match any
    # character if it were interpolated into a regexp.
    if parts.size > 1 &&
       parts[-2] != pitaka_name &&
       pitaka_name.unicode_normalize(:nfc).include?(SUTTAPITAKA)
      parts = [nikaya, parts.drop(1).join('/')]
    end

    # Walk (creating as needed) the intermediate hashes, then store the leaf.
    *ancestors, leaf = parts
    branch = ancestors.reduce(results[pitaka_name] ||= {}) do |node, folder|
      node[folder] ||= {}
    end
    branch[leaf] = language_files
  end

  results
end
# True when `path` has at least one direct child that is a directory and whose
# down-cased name matches LANGUAGE_FOLDER_REGEXP.
def contains_language_folder?(path)
  Dir.children(path).any? do |entry|
    LANGUAGE_FOLDER_REGEXP.match?(entry.downcase) && File.directory?(File.join(path, entry))
  end
end
# Builds the table cell that lists, per language, links to that language's
# files.
#
# languages - Hash of language label => Array of paths (files, or a folder).
#
# Languages are emitted in LANGUAGE_KEYS order; languages with no entries are
# skipped. Link targets are relative to the directory of OUTPUT_FILE.
def generate_hyperlink_cells(languages)
  docx_dir = Pathname.new(File.dirname(OUTPUT_FILE))

  Caracal::Core::Models::TableCellModel.new(margins: cell_margins) do
    LANGUAGE_KEYS.each do |language|
      entries = languages[language]
      next if entries.nil? || entries.empty?

      p.text language, bold: true
      ul do
        entries.each do |entry|
          # A directory gets a generic label, a file is labelled with its name.
          label = File.directory?(entry) ? 'Link to folder' : File.basename(entry)
          # NOTE: forward slashes are left as they are; Windows understands them.
          target = Pathname.new(entry).relative_path_from(docx_dir).to_s
          li { link label, target }
        end
      end
    end
  end
end
# Builds the first-column cell: `text` as a single bold, size-30 run inside a
# cell that uses the shared cell margins.
def generate_section_cell(text)
  Caracal::Core::Models::TableCellModel.new(margins: cell_margins) do
    p.text(text, bold: true, size: 30)
  end
end
# Collects, per language, what should be linked for the directory `path`.
#
# Only direct child directories whose down-cased name matches
# LANGUAGE_FOLDER_REGEXP are considered. When the basename of `path` marks a
# "writing" (著作) folder, each language links to its folder as a whole because
# such folders usually hold a large number of files. Otherwise every regular
# file below the language folder is listed (nested folders included), except
# hidden files and .html files.
#
# Children are visited in sorted order, and several folders that map to the
# same language key are merged instead of overwriting one another.
#
# Returns a Hash of language label => Array of paths. Languages without any
# entry are omitted.
def process_language_folders(path)
  # Decided once per call: it depends on `path` only, not on the child.
  link_folder_only = File.basename(path).match?(/(?<![a-zA-Z])(著作)/)

  Dir.children(path).sort.each_with_object({}) do |child, languages|
    folder = File.join(path, child)
    # Skip if it is not a directory
    next unless File.directory?(folder)
    # Skip if it is not a language folder
    next unless child.downcase.match?(LANGUAGE_FOLDER_REGEXP)

    entries =
      if link_folder_only
        [folder]
      else
        Find.find(folder)
            .select do |file|
              File.file?(file) &&
                !File.basename(file).start_with?('.') && # hidden file
                File.extname(file) != '.html'            # there are a lot of them
            end
            # The full path breaks ties so the order is deterministic.
            .sort_by { |file| [File.basename(file), file] }
      end
    next if entries.empty?

    (languages[language_key(child.downcase)] ||= []).concat(entries)
  end
end
# Maps a (down-cased) language folder name to its display label.
#
# The patterns are tried in order and the first match wins; a name that
# matches none of them yields 'Other Translation'.
def language_key(child_name)
  label_by_pattern = [
    [/chi/, CHINESE],
    [/eng/, ENGLISH],
    [Regexp.new("^(pali|#{"pāli".unicode_normalize(:nfc)})"), PALI],
    [/^sinhala/, SINHALA_PALI]
  ]

  # `===` keeps the matching semantics of a `case`/`when` on regexps.
  _pattern, label = label_by_pattern.find { |pattern, _| pattern === child_name }
  label || 'Other Translation'
end
# Margin options shared by every generated table cell: 50 on each side.
# A fresh Hash is returned on every call.
def cell_margins
  %i[top bottom left right].to_h { |side| [side, 50] }
end
# Main flow: scan ROOT_DIRECTORY, dump the parsed structure to output.json in
# the current working directory, then render the .docx index.
pitaka_structure = parse_pitaka_structure(ROOT_DIRECTORY)
structure_json = JSON.pretty_generate(pitaka_structure)
File.write('output.json', structure_json)
build_table(pitaka_structure)
require 'find'
require 'fileutils'
# Renames every file and folder below ROOT whose name is not in Unicode NFC
# form to its NFC-normalised name, then prints a summary.
#
# Folder to check and rename — change this to your actual path
ROOT = "../tipitaka-latest" # <-- Replace with your target directory
renamed_items = []

# True when something (including a dangling symlink) exists at the given path.
entry_present = ->(candidate) { File.exist?(candidate) || File.symlink?(candidate) }

# Snapshot the tree first and walk it deepest-first. Renaming a directory while
# Find is still traversing makes Find skip that directory's contents, because
# it lists the directory under the old path after the rename has happened.
# Handling children before their parent also keeps every collected path valid.
Find.find(ROOT).to_a.reverse_each do |path|
  name = File.basename(path)
  parent = File.dirname(path)
  temp_path = nil

  begin
    # Normalize the filename to NFC form (raises on invalid byte sequences,
    # which is reported below instead of aborting the whole run).
    normalized_name = name.unicode_normalize(:nfc)
    # Skip renaming if the filename is already in NFC form
    next if name == normalized_name

    new_path = File.join(parent, normalized_name)
    puts "Original: #{path}"
    puts "Normalized: #{new_path}"

    # Go through a temporary name first to avoid conflicts
    temp_path = File.join(parent, "#{name}_TEMP_RENAME_#{rand(1000..9999)}")
    FileUtils.mv(path, temp_path)
    puts "Renamed to temporary name: #{temp_path}"

    # Only meaningful once the entry has been moved out of the way. Without
    # this check, moving onto an existing directory would nest the entry
    # inside it, and moving onto an existing file would overwrite it.
    raise "target already exists: #{new_path}" if entry_present.call(new_path)

    # Now rename to the normalized (proper) name
    FileUtils.mv(temp_path, new_path)
    puts "Renamed to final name: #{new_path}"
    renamed_items << { from: path, to: new_path }
  rescue => e
    # If any error occurs during renaming, print the error and skip this entry
    puts "⚠️ Error renaming #{path}: #{e.message}"

    # Do not leave the entry stranded under its temporary name.
    if temp_path && entry_present.call(temp_path)
      begin
        FileUtils.mv(temp_path, path)
      rescue => rollback_error
        puts "⚠️ Could not restore #{temp_path}: #{rollback_error.message}"
      end
    end
  end
end

# Output renamed items
puts "\n✅ Renamed items (#{renamed_items.size}):"
renamed_items.each do |item|
  puts " #{item[:from]} → #{item[:to]}"
end
puts "\n📊 Total files and folders renamed: #{renamed_items.size}"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment