require 'zip'
require 'nokogiri'
require 'cgi'

def fix_hyperlinks(docx_path, output_path)
  Zip::File.open(docx_path) do |zipfile|
    # Read relationship file
    rels_entry = zipfile.find_entry('word/_rels/document.xml.rels')
    rels_content = zipfile.read(rels_entry)
    rels_xml = Nokogiri::XML(rels_content)

    # Register the namespace
    rels_xml.root.add_namespace('r', 'http://schemas.openxmlformats.org/package/2006/relationships')

    # Find all hyperlink relationships
    rels_xml.xpath('//r:Relationship[@Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink"]',
                   'r' => 'http://schemas.openxmlformats.org/package/2006/relationships').each do |rel|
      old_target = rel.attribute('Target').value

      # Transform the target path
      new_target = transform_target(old_target)

      # Update if changed
      rel.attribute('Target').value = new_target if new_target != old_target
    end

    # Create new DOCX with modified relationships
    create_modified_docx(zipfile, rels_xml, output_path)
  end
end

def transform_target(full_path)
  # Remove the file:///K:\Tipitaka\ prefix
  cleaned = full_path.gsub(/^file:\/\/\/[A-Za-z]:\\Tipitaka\\/i, '') # <- Replace this with a proper regexp; inspect the value using the list_hyperlink script
  cleaned.gsub(/\\/, '/')
end
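# Illustrative only: given the K:\Tipitaka prefix mentioned above, the intended
# rewrite looks like this (the sample path is made up; confirm the real targets
# with the list_hyperlink script before adjusting the regexp):
#
#   transform_target('file:///K:\\Tipitaka\\2.Suttapiṭaka\\some-file.docx')
#   # => "2.Suttapiṭaka/some-file.docx"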
def create_modified_docx(zipfile, rels_xml, output_path)
  Zip::File.open(output_path, Zip::File::CREATE) do |new_zip|
    # Copy all files except the relationships file
    zipfile.each do |entry|
      next if entry.name == 'word/_rels/document.xml.rels'
      new_zip.get_output_stream(entry.name) { |f| f.write zipfile.read(entry.name) }
    end

    # Add modified relationships file
    new_zip.get_output_stream('word/_rels/document.xml.rels') do |f|
      f.write rels_xml.to_xml(indent: 2)
    end
  end
end

# Usage
fix_hyperlinks('input/after-formatting.docx', '../tipitaka/index.docx')
# encoding: UTF-8
#
require 'caracal'
require 'find'
require 'json'
require 'pathname' # Pathname is used below when building relative hyperlink paths

# SUTTA
VINAYAPITAKA = 'Vinayapiṭaka'.unicode_normalize(:nfc).freeze
SUTTAPITAKA = '2.Suttapiṭaka'.unicode_normalize(:nfc).freeze
ABHIDHAMMAPIṬAKA = 'Abhidhammapiṭaka'.unicode_normalize(:nfc).freeze
ANNA_GANTHA = 'Añña Gantha'.unicode_normalize(:nfc).freeze

# LANGUAGE
CHINESE = 'Chinese Translation'.freeze
ENGLISH = 'English Translation'.freeze
PALI = 'Pali'.freeze
SINHALA_PALI = 'Sinhala Pali'.freeze

LANGUAGE_KEYS = [
  PALI,
  SINHALA_PALI,
  ENGLISH,
  CHINESE,
]

LANGUAGE_FOLDER_REGEXP = Regexp.new("^(chi|eng|pali|#{"pāli".unicode_normalize(:nfc)}|sinhala)")

# REFERENCES
ATTHAKATHA_IN_PALI = 'aṭṭhakathā'.unicode_normalize(:nfc).freeze
ATTHAKATHA = "atthakatha".freeze
TIKA_IN_PALI = 'Tika'.unicode_normalize(:nfc).freeze
TIKA = 'tika'.freeze

IGNORED_FOLDER_NAME_REGEXP = /Complete[\s]?\(All\)/

ROOT_DIRECTORY = '../tipitaka-latest'
OUTPUT_FILE = '../tipitaka-latest/index.docx'
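# Quick sanity check of the language-folder matching (the folder names below are
# made-up examples; the match is done on the downcased folder name):
#
#   'English'.downcase.match?(LANGUAGE_FOLDER_REGEXP)    # => true  (mapped to ENGLISH)
#   'chinese'.downcase.match?(LANGUAGE_FOLDER_REGEXP)    # => true  (mapped to CHINESE)
#   'Pāli'.downcase.match?(LANGUAGE_FOLDER_REGEXP)       # => true  (mapped to PALI)
#   'Aṭṭhakathā'.downcase.match?(LANGUAGE_FOLDER_REGEXP) # => false (not a language folder)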
def build_table(pitaka_structure)
  Caracal::Document.save(OUTPUT_FILE) do |docx|
    pitaka_structure.each do |pitaka_name, folder_tree|
      # Type: Arial
      docx.h1 pitaka_name, align: :center, size: 50, bold: true

      # ===== Generate Table ============
      table_rows = generate_rows(folder_tree).sort_by { |row| [row[0], row[1]] }
      repeat_cells = find_repeating_cell(table_rows)
      table_rows = table_rows.map do |row|
        # optionally use repeat_cells here for formatting (e.g. merge cells, bold, etc.)
        [generate_section_cell(row[0])] + row[1..]
      end

      docx.table table_rows, border_size: 1 do
        repeat_cells.each do |(idx, span)|
          # NOTE: merge cells that share the same section name
          cell_style table_rows[idx][0], rowspan: span
        end
      end unless table_rows.empty?

      docx.page
      # ===== End Table ============
    end
  end
end
# This function finds the positions of repeating section names.
# The result is used to merge cells.
def find_repeating_cell(rows)
  result = []
  start_idx = 0

  rows.each_cons(2).with_index do |(a, b), i|
    if a.first != b.first
      length = i + 1 - start_idx
      result << [start_idx, length] if length > 1
      start_idx = i + 1
    end
  end

  length = rows.size - start_idx
  result << [start_idx, length] if length > 1

  result
end
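# Example with made-up section names ('DN' repeated three times, then 'MN'):
#
#   rows = [['DN', 'a'], ['DN', 'b'], ['DN', 'c'], ['MN', 'd']]
#   find_repeating_cell(rows) # => [[0, 3]] (rows 0..2 share 'DN', so a rowspan of 3)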
def generate_rows(folder_tree, keys = [])
  return [] if folder_tree.nil?

  if language_folder?(folder_tree)
    cells = if keys.size == 1
              [keys[0]] + ['']
            elsif keys.size > 2
              [keys[0]] + [keys[1..-1].join('/')]
            else # when keys.size == 2
              [keys[0]] + keys[1..-1]
            end
    [cells + [generate_hyperlink_cells(folder_tree)]] # <= 1 row
  else
    folder_tree.flat_map do |key, value|
      generate_rows(value, keys + [key])
    end
  end
end

def language_folder?(folder_tree)
  folder_tree.is_a?(Hash) && folder_tree.keys.any? { |name| name.downcase.match?(LANGUAGE_FOLDER_REGEXP) }
end
def parse_pitaka_structure(root_dir)
  results = {}

  Find.find(root_dir) do |path|
    next if path == '.'
    next unless File.directory?(path)

    # NOTE: don't normalize it so the hyperlink works
    # path = path.unicode_normalize(:nfc)
    rel_path = path.gsub("#{root_dir}/", '')
    parts = rel_path.split('/')
    next if parts.empty? || parts.first == '..'

    if path.match?(IGNORED_FOLDER_NAME_REGEXP)
      Find.prune
      next
    end

    pitaka_name = parts.shift
    nikaya = parts.first
    next unless nikaya&.match?(/^\d+/)

    if contains_language_folder?(path)
      language_files = process_language_folders(path)
      results[pitaka_name] ||= {}
      current = results[pitaka_name]

      # The folder structure looks like this:
      #   Pitaka (e.g. Suttapitaka) > Nikaya (e.g. Dighanikaya) > Subcategory (e.g. 根本五十篇, "Root Fifty") >
      #   Sub-subcategory (e.g. Atthakata/Writing) > Language (e.g. Chi) > filename (e.g. Jataka)
      # RULES:
      #   i)   Pitaka will be the heading of the table
      #   ii)  Nikaya will be in the first column
      #   iii) Second column will consist of the combined Subcategory/Sub-subcategory string joined by '/'
      #   iv)  Last column will consist of a list of files categorized by language
      if pitaka_name.unicode_normalize(:nfc).match?(/#{SUTTAPITAKA}/) &&
         parts[-2] != pitaka_name &&
         parts.size > 1
        # last_match = Regexp.last_match.to_s.capitalize
        parts = [nikaya, parts[1..-1].join('/')]
      end

      parts.each_with_index do |part, folder_depth|
        is_last = folder_depth == parts.size - 1
        if is_last
          current[part] = language_files
        else
          current[part] ||= {}
          current = current[part]
        end
      end
    end
  end

  results
end
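# For orientation, parse_pitaka_structure returns a nested hash shaped roughly like
# the one below (folder and file names are made-up placeholders; the real keys come
# from the directory names on disk). Each language-level hash becomes one table row
# in generate_rows.
#
#   {
#     "2.Suttapiṭaka" => {
#       "1.Dīghanikāya" => {
#         "Aṭṭhakathā" => {
#           "Pali"                => ["../tipitaka-latest/.../a.docx"],
#           "English Translation" => ["../tipitaka-latest/.../a-en.docx"]
#         }
#       }
#     }
#   }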
def contains_language_folder?(path)
  Dir.children(path).any? do |child|
    child_dir = File.join(path, child)
    File.directory?(child_dir) && child.downcase.match?(LANGUAGE_FOLDER_REGEXP)
  end
end

def generate_hyperlink_cells(languages)
  output_docx_dir = File.dirname(OUTPUT_FILE)

  Caracal::Core::Models::TableCellModel.new margins: cell_margins do
    LANGUAGE_KEYS.each do |lang|
      files = languages[lang]
      next if files.nil? || files.empty?

      p.text lang, bold: true
      ul do
        files.each do |file|
          # NOTE: no need to escape or replace the forward slash; Windows understands it
          relative_path = Pathname.new(file).relative_path_from(Pathname.new(output_docx_dir))
          file_uri = relative_path.to_s # or URI.escape if needed
          li { link(File.directory?(file) ? 'Link to folder' : File.basename(file), file_uri) }
        end
      end
    end
  end
end

def generate_section_cell(text)
  Caracal::Core::Models::TableCellModel.new margins: cell_margins do
    p.text text, bold: true, size: 30
  end
end

def process_language_folders(path)
  writing_regexp = Regexp.new("(?<![a-zA-Z])(著作)") # 著作 = "writings"

  Dir.each_child(path).each_with_object({}) do |child, languages|
    # Skip if it is not a directory
    next unless File.directory?(File.join(path, child))
    # Skip if it is not a language folder
    next unless child.downcase.match?(LANGUAGE_FOLDER_REGEXP)

    lang_key = language_key(child.downcase)

    # The path here must be a directory path.
    # If it is a 'writing' folder, we don't list each file individually because the
    # number of files is usually large; we link to the folder directly instead.
    if File.basename(path) =~ writing_regexp
      languages[lang_key] = [
        File.join(path, child)
      ]
      next
    end

    # NOTE: return all files, including those under nested folders
    files = Find.find(File.join(path, child))
                .select do |f|
                  # NOTE: skip directories
                  File.file?(f) &&
                    # NOTE: skip hidden files
                    !f.split('/')[-1].start_with?('.') &&
                    # NOTE: skip HTML files because there are a lot of them
                    !(File.extname(f) == '.html')
                end
                .sort_by { |f| File.basename(f) }

    languages[lang_key] = files unless files.empty?
  end
end

def language_key(child_name)
  case child_name
  when /chi/ then CHINESE
  when /eng/ then ENGLISH
  when Regexp.new("^(pali|#{"pāli".unicode_normalize(:nfc)})") then PALI
  when /^sinhala/ then SINHALA_PALI
  else 'Other Translation'
  end
end

def cell_margins
  { top: 50, bottom: 50, left: 50, right: 50 }
end

# Main Flow
pitaka_structure = parse_pitaka_structure(ROOT_DIRECTORY)
File.write("output.json", JSON.pretty_generate(pitaka_structure))
build_table(pitaka_structure)
require 'zip'
require 'nokogiri'
require 'fileutils'

def inspect_hyperlinks_with_debug(docx_path, output_dir = 'debug_wrong_output')
  # Create output directory
  FileUtils.mkdir_p(output_dir)

  begin
    hyperlinks = []

    Zip::File.open(docx_path) do |zipfile|
      # Extract and save key files for inspection
      # save_xml_file(zipfile, 'word/document.xml', output_dir)
      save_xml_file(zipfile, 'word/_rels/document.xml.rels', output_dir)

      # Load main document XML
      doc_xml = get_xml(zipfile, 'word/document.xml')
      rels_xml = get_xml(zipfile, 'word/_rels/document.xml.rels')

      # Process all tables
      doc_xml.xpath('//w:tbl').each_with_index do |table, table_idx|
        table.xpath('.//w:tr').each_with_index do |row, row_idx|
          cells = row.xpath('.//w:tc')
          next unless cells.size >= 3

          # The index builder puts the file links in the third column
          third_cell = cells[2]
          process_hyperlinks(third_cell, rels_xml, hyperlinks, table_idx + 1, row_idx + 1)
        end
      end
    end

    # Output results
    hyperlinks.each do |h|
      puts "[table #{h[:table]}, row #{h[:row]}] #{h[:type]} #{h[:text].inspect} -> #{h[:target]}"
    end
    puts "\nDebug files saved to #{output_dir} directory"
  rescue => e
    puts "Error processing document: #{e.message}"
    puts "Backtrace: #{e.backtrace.first(5).join("\n")}"
  end
end
def save_xml_file(zipfile, entry_path, output_dir)
  return unless zipfile.find_entry(entry_path)

  content = zipfile.read(entry_path)
  filename = entry_path.gsub('/', '_')
  File.write("#{output_dir}/#{filename}", content)
end

def get_xml(zipfile, entry_path)
  return nil unless zipfile.find_entry(entry_path)

  Nokogiri::XML(zipfile.read(entry_path))
end

def process_hyperlinks(node, rels_xml, hyperlinks, table_num, row_num)
  ns = {
    'w' => 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
    'r' => 'http://schemas.openxmlformats.org/officeDocument/2006/relationships'
  }
  # The .rels file uses the package-level relationships namespace, which differs
  # from the r: namespace used inside document.xml
  rels_ns = { 'rel' => 'http://schemas.openxmlformats.org/package/2006/relationships' }

  node.xpath('.//w:hyperlink', ns).each do |hyperlink|
    # Get relationship ID or anchor
    rel_id = hyperlink.attribute('id')&.value
    anchor = hyperlink.attribute('anchor')&.value

    # Get display text
    text = hyperlink.xpath('.//w:t', ns).map(&:text).join

    # Determine target
    target, type = if rel_id && rels_xml
      # External hyperlink via relationship
      target = rels_xml.xpath("//rel:Relationship[@Id='#{rel_id}']/@Target", rels_ns).first&.value
      [target, target&.start_with?('http') ? 'EXTERNAL' : 'INTERNAL']
    elsif anchor
      # Internal bookmark link
      [anchor, 'BOOKMARK']
    else
      ['MALFORMED', 'UNKNOWN']
    end

    hyperlinks << {
      table: table_num,
      row: row_num,
      text: text,
      target: target || 'MISSING',
      type: type
    }
  end
end

# Usage
inspect_hyperlinks_with_debug('../tipitaka/working-solution.docx')
require 'find'
require 'fileutils'

# Folder to check and rename — change this to your actual path
ROOT = "../tipitaka-latest" # <-- Replace with your target directory

renamed_items = []

# NOTE: if a directory itself gets renamed, Find may not descend into its contents
# under the new name in the same pass, so re-run the script until nothing is renamed.
Find.find(ROOT) do |path|
  name = File.basename(path)
  parent = File.dirname(path)

  # Normalize the filename to NFC form
  normalized_name = name.unicode_normalize(:nfc)

  # Skip renaming if the filename is already in NFC form
  if name != normalized_name
    puts "Original:   #{path}"
    puts "Normalized: #{File.join(parent, normalized_name)}"

    # Temporary name to avoid conflicts (e.g. on filesystems that treat the
    # NFC and NFD spellings as the same entry)
    temp_name = "#{name}_TEMP_RENAME_#{rand(1000..9999)}" # Use a unique temp name
    temp_path = File.join(parent, temp_name)

    begin
      # First, rename to the temporary name
      FileUtils.mv(path, temp_path)
      puts "Renamed to temporary name: #{temp_path}"

      # Now rename to the normalized (proper) name
      new_path = File.join(parent, normalized_name)
      FileUtils.mv(temp_path, new_path)
      puts "Renamed to final name: #{new_path}"

      renamed_items << { from: path, to: new_path }
    rescue => e
      # If any error occurs during renaming, print the error and skip this file
      puts "⚠️ Error renaming #{path}: #{e.message}"
    end
  end
end

# Output renamed items
puts "\n✅ Renamed items (#{renamed_items.size}):"
renamed_items.each do |item|
  puts "  #{item[:from]} → #{item[:to]}"
end

puts "\n📊 Total files and folders renamed: #{renamed_items.size}"