require 'zip'
require 'nokogiri'
require 'cgi'

def fix_hyperlinks(docx_path, output_path)
  Zip::File.open(docx_path) do |zipfile|
    # Read relationship file
    rels_entry = zipfile.find_entry('word/_rels/document.xml.rels')
    rels_content = zipfile.read(rels_entry)
    rels_xml = Nokogiri::XML(rels_content)

    # Register the namespace
    rels_xml.root.add_namespace('r', 'http://schemas.openxmlformats.org/package/2006/relationships')

    # Find all hyperlink relationships
    rels_xml.xpath('//r:Relationship[@Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink"]',
                   'r' => 'http://schemas.openxmlformats.org/package/2006/relationships').each do |rel|
      old_target = rel.attribute('Target').value

      # Transform the target path
      new_target = transform_target(old_target)

      # Update if changed
      rel.attribute('Target').value = new_target if new_target != old_target
    end

    # Create new DOCX with modified relationships
    create_modified_docx(zipfile, rels_xml, output_path)
  end
end

def transform_target(full_path)
  # Remove the file:///K:\Tipitaka\ prefix
  cleaned = full_path.gsub(/^file:\/\/\/[A-Za-z]:\\Tipitaka\\/i, '') # <- Replace this with a proper regexp; inspect the value using the list_hyperlink script
  cleaned.gsub(/\\/, '/')
end
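# Illustrative only: given the K:\Tipitaka prefix mentioned above, the intended
# rewrite looks like this (the sample path is made up; confirm the real targets
# with the list_hyperlink script before adjusting the regexp):
#
#   transform_target('file:///K:\\Tipitaka\\2.Suttapiṭaka\\some-file.docx')
#   # => "2.Suttapiṭaka/some-file.docx"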
def create_modified_docx(zipfile, rels_xml, output_path)
  Zip::File.open(output_path, Zip::File::CREATE) do |new_zip|
    # Copy all files except the relationships file
    zipfile.each do |entry|
      next if entry.name == 'word/_rels/document.xml.rels'
      new_zip.get_output_stream(entry.name) { |f| f.write zipfile.read(entry.name) }
    end

    # Add modified relationships file
    new_zip.get_output_stream('word/_rels/document.xml.rels') do |f|
      f.write rels_xml.to_xml(indent: 2)
    end
  end
end

# Usage
fix_hyperlinks('input/after-formatting.docx', '../tipitaka/index.docx')
# encoding: UTF-8
#
require 'caracal'
require 'find'
require 'json'
require 'pathname' # Pathname is used below when building relative hyperlink paths

# SUTTA
VINAYAPITAKA = 'Vinayapiṭaka'.unicode_normalize(:nfc).freeze
SUTTAPITAKA = '2.Suttapiṭaka'.unicode_normalize(:nfc).freeze
ABHIDHAMMAPIṬAKA = 'Abhidhammapiṭaka'.unicode_normalize(:nfc).freeze
ANNA_GANTHA = 'Añña Gantha'.unicode_normalize(:nfc).freeze

# LANGUAGE
CHINESE = 'Chinese Translation'.freeze
ENGLISH = 'English Translation'.freeze
PALI = 'Pali'.freeze
SINHALA_PALI = 'Sinhala Pali'.freeze

LANGUAGE_KEYS = [
  PALI,
  SINHALA_PALI,
  ENGLISH,
  CHINESE,
]

LANGUAGE_FOLDER_REGEXP = Regexp.new("^(chi|eng|pali|#{"pāli".unicode_normalize(:nfc)}|sinhala)")

# REFERENCES
ATTHAKATHA_IN_PALI = 'aṭṭhakathā'.unicode_normalize(:nfc).freeze
ATTHAKATHA = "atthakatha".freeze
TIKA_IN_PALI = 'Tika'.unicode_normalize(:nfc).freeze
TIKA = 'tika'.freeze

IGNORED_FOLDER_NAME_REGEXP = /Complete[\s]?\(All\)/

ROOT_DIRECTORY = '../tipitaka-latest'
OUTPUT_FILE = '../tipitaka-latest/index.docx'
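# Quick sanity check of the language-folder matching (the folder names below are
# made-up examples; the match is done on the downcased folder name):
#
#   'English'.downcase.match?(LANGUAGE_FOLDER_REGEXP)    # => true  (mapped to ENGLISH)
#   'chinese'.downcase.match?(LANGUAGE_FOLDER_REGEXP)    # => true  (mapped to CHINESE)
#   'Pāli'.downcase.match?(LANGUAGE_FOLDER_REGEXP)       # => true  (mapped to PALI)
#   'Aṭṭhakathā'.downcase.match?(LANGUAGE_FOLDER_REGEXP) # => false (not a language folder)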
def build_table(pitaka_structure)
  Caracal::Document.save(OUTPUT_FILE) do |docx|
    pitaka_structure.each do |pitaka_name, folder_tree|
      # Type: Arial
      docx.h1 pitaka_name, align: :center, size: 50, bold: true

      # ===== Generate Table ============
      table_rows = generate_rows(folder_tree).sort_by { |row| [row[0], row[1]] }
      repeat_cells = find_repeating_cell(table_rows)
      table_rows = table_rows.map do |row|
        # optionally use repeat_cells here for formatting (e.g. merge cells, bold, etc.)
        [generate_section_cell(row[0])] + row[1..]
      end

      docx.table table_rows, border_size: 1 do
        repeat_cells.each do |(idx, span)|
          # NOTE: merge cells that share the same section name
          cell_style table_rows[idx][0], rowspan: span
        end
      end unless table_rows.empty?

      docx.page
      # ===== End Table ============
    end
  end
end
# This function finds the positions of repeating section names.
# The result is used to merge cells.
def find_repeating_cell(rows)
  result = []
  start_idx = 0

  rows.each_cons(2).with_index do |(a, b), i|
    if a.first != b.first
      length = i + 1 - start_idx
      result << [start_idx, length] if length > 1
      start_idx = i + 1
    end
  end

  length = rows.size - start_idx
  result << [start_idx, length] if length > 1

  result
end
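# Example with made-up section names ('DN' repeated three times, then 'MN'):
#
#   rows = [['DN', 'a'], ['DN', 'b'], ['DN', 'c'], ['MN', 'd']]
#   find_repeating_cell(rows) # => [[0, 3]] (rows 0..2 share 'DN', so a rowspan of 3)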
def generate_rows(folder_tree, keys = [])
  return [] if folder_tree.nil?

  if language_folder?(folder_tree)
    cells = if keys.size == 1
              [keys[0]] + ['']
            elsif keys.size > 2
              [keys[0]] + [keys[1..-1].join('/')]
            else # when keys.size == 2
              [keys[0]] + keys[1..-1]
            end
    [cells + [generate_hyperlink_cells(folder_tree)]] # <= 1 row
  else
    folder_tree.flat_map do |key, value|
      generate_rows(value, keys + [key])
    end
  end
end

def language_folder?(folder_tree)
  folder_tree.is_a?(Hash) && folder_tree.keys.any? { |name| name.downcase.match?(LANGUAGE_FOLDER_REGEXP) }
end
def parse_pitaka_structure(root_dir)
  results = {}

  Find.find(root_dir) do |path|
    next if path == '.'
    next unless File.directory?(path)

    # NOTE: don't normalize it so the hyperlink works
    # path = path.unicode_normalize(:nfc)
    rel_path = path.gsub("#{root_dir}/", '')
    parts = rel_path.split('/')
    next if parts.empty? || parts.first == '..'

    if path.match?(IGNORED_FOLDER_NAME_REGEXP)
      Find.prune
      next
    end

    pitaka_name = parts.shift
    nikaya = parts.first
    next unless nikaya&.match?(/^\d+/)

    if contains_language_folder?(path)
      language_files = process_language_folders(path)
      results[pitaka_name] ||= {}
      current = results[pitaka_name]

      # The folder structure looks like this:
      #   Pitaka (e.g. Suttapitaka) > Nikaya (e.g. Dighanikaya) > Subcategory (e.g. 根本五十篇, "Root Fifty") >
      #   Sub-subcategory (e.g. Atthakata/Writing) > Language (e.g. Chi) > filename (e.g. Jataka)
      # RULES:
      #   i)   Pitaka will be the heading of the table
      #   ii)  Nikaya will be in the first column
      #   iii) Second column will consist of the combined Subcategory/Sub-subcategory string joined by '/'
      #   iv)  Last column will consist of a list of files categorized by language
      if pitaka_name.unicode_normalize(:nfc).match?(/#{SUTTAPITAKA}/) &&
         parts[-2] != pitaka_name &&
         parts.size > 1
        # last_match = Regexp.last_match.to_s.capitalize
        parts = [nikaya, parts[1..-1].join('/')]
      end

      parts.each_with_index do |part, folder_depth|
        is_last = folder_depth == parts.size - 1
        if is_last
          current[part] = language_files
        else
          current[part] ||= {}
          current = current[part]
        end
      end
    end
  end

  results
end
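# For orientation, parse_pitaka_structure returns a nested hash shaped roughly like
# the one below (folder and file names are made-up placeholders; the real keys come
# from the directory names on disk). Each language-level hash becomes one table row
# in generate_rows.
#
#   {
#     "2.Suttapiṭaka" => {
#       "1.Dīghanikāya" => {
#         "Aṭṭhakathā" => {
#           "Pali"                => ["../tipitaka-latest/.../a.docx"],
#           "English Translation" => ["../tipitaka-latest/.../a-en.docx"]
#         }
#       }
#     }
#   }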
def contains_language_folder?(path)
  Dir.children(path).any? do |child|
    child_dir = File.join(path, child)
    File.directory?(child_dir) && child.downcase.match?(LANGUAGE_FOLDER_REGEXP)
  end
end

def generate_hyperlink_cells(languages)
  output_docx_dir = File.dirname(OUTPUT_FILE)

  Caracal::Core::Models::TableCellModel.new margins: cell_margins do
    LANGUAGE_KEYS.each do |lang|
      files = languages[lang]
      next if files.nil? || files.empty?

      p.text lang, bold: true
      ul do
        files.each do |file|
          # NOTE: no need to escape or replace the forward slash; Windows understands it
          relative_path = Pathname.new(file).relative_path_from(Pathname.new(output_docx_dir))
          file_uri = relative_path.to_s # or URI.escape if needed
          li { link(File.directory?(file) ? 'Link to folder' : File.basename(file), file_uri) }
        end
      end
    end
  end
end

def generate_section_cell(text)
  Caracal::Core::Models::TableCellModel.new margins: cell_margins do
    p.text text, bold: true, size: 30
  end
end

def process_language_folders(path)
  writing_regexp = Regexp.new("(?<![a-zA-Z])(著作)") # 著作 = "writings"

  Dir.each_child(path).each_with_object({}) do |child, languages|
    # Skip if it is not a directory
    next unless File.directory?(File.join(path, child))
    # Skip if it is not a language folder
    next unless child.downcase.match?(LANGUAGE_FOLDER_REGEXP)

    lang_key = language_key(child.downcase)

    # The path here must be a directory path.
    # If it is a 'writing' folder, we don't list each file individually because the
    # number of files is usually large; we link to the folder directly instead.
    if File.basename(path) =~ writing_regexp
      languages[lang_key] = [
        File.join(path, child)
      ]
      next
    end

    # NOTE: return all files, including those under nested folders
    files = Find.find(File.join(path, child))
                .select do |f|
                  # NOTE: skip directories
                  File.file?(f) &&
                    # NOTE: skip hidden files
                    !f.split('/')[-1].start_with?('.') &&
                    # NOTE: skip HTML files because there are a lot of them
                    !(File.extname(f) == '.html')
                end
                .sort_by { |f| File.basename(f) }

    languages[lang_key] = files unless files.empty?
  end
end

def language_key(child_name)
  case child_name
  when /chi/ then CHINESE
  when /eng/ then ENGLISH
  when Regexp.new("^(pali|#{"pāli".unicode_normalize(:nfc)})") then PALI
  when /^sinhala/ then SINHALA_PALI
  else 'Other Translation'
  end
end

def cell_margins
  { top: 50, bottom: 50, left: 50, right: 50 }
end

# Main Flow
pitaka_structure = parse_pitaka_structure(ROOT_DIRECTORY)
File.write("output.json", JSON.pretty_generate(pitaka_structure))
build_table(pitaka_structure)
require 'zip'
require 'nokogiri'
require 'fileutils'

def inspect_hyperlinks_with_debug(docx_path, output_dir = 'debug_wrong_output')
  # Create output directory
  FileUtils.mkdir_p(output_dir)

  begin
    hyperlinks = []

    Zip::File.open(docx_path) do |zipfile|
      # Extract and save key files for inspection
      # save_xml_file(zipfile, 'word/document.xml', output_dir)
      save_xml_file(zipfile, 'word/_rels/document.xml.rels', output_dir)

      # Load main document XML
      doc_xml = get_xml(zipfile, 'word/document.xml')
      rels_xml = get_xml(zipfile, 'word/_rels/document.xml.rels')

      # Process all tables
      doc_xml.xpath('//w:tbl').each_with_index do |table, table_idx|
        table.xpath('.//w:tr').each_with_index do |row, row_idx|
          cells = row.xpath('.//w:tc')
          next unless cells.size >= 3

          # The index builder puts the file links in the third column
          third_cell = cells[2]
          process_hyperlinks(third_cell, rels_xml, hyperlinks, table_idx + 1, row_idx + 1)
        end
      end
    end

    # Output results
    hyperlinks.each do |h|
      puts "[table #{h[:table]}, row #{h[:row]}] #{h[:type]} #{h[:text].inspect} -> #{h[:target]}"
    end
    puts "\nDebug files saved to #{output_dir} directory"
  rescue => e
    puts "Error processing document: #{e.message}"
    puts "Backtrace: #{e.backtrace.first(5).join("\n")}"
  end
end
def save_xml_file(zipfile, entry_path, output_dir)
  return unless zipfile.find_entry(entry_path)

  content = zipfile.read(entry_path)
  filename = entry_path.gsub('/', '_')
  File.write("#{output_dir}/#{filename}", content)
end

def get_xml(zipfile, entry_path)
  return nil unless zipfile.find_entry(entry_path)

  Nokogiri::XML(zipfile.read(entry_path))
end

def process_hyperlinks(node, rels_xml, hyperlinks, table_num, row_num)
  ns = {
    'w' => 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
    'r' => 'http://schemas.openxmlformats.org/officeDocument/2006/relationships'
  }
  # The .rels file uses the package-level relationships namespace, which differs
  # from the r: namespace used inside document.xml
  rels_ns = { 'rel' => 'http://schemas.openxmlformats.org/package/2006/relationships' }

  node.xpath('.//w:hyperlink', ns).each do |hyperlink|
    # Get relationship ID or anchor
    rel_id = hyperlink.attribute('id')&.value
    anchor = hyperlink.attribute('anchor')&.value

    # Get display text
    text = hyperlink.xpath('.//w:t', ns).map(&:text).join

    # Determine target
    target, type = if rel_id && rels_xml
      # External hyperlink via relationship
      target = rels_xml.xpath("//rel:Relationship[@Id='#{rel_id}']/@Target", rels_ns).first&.value
      [target, target&.start_with?('http') ? 'EXTERNAL' : 'INTERNAL']
    elsif anchor
      # Internal bookmark link
      [anchor, 'BOOKMARK']
    else
      ['MALFORMED', 'UNKNOWN']
    end

    hyperlinks << {
      table: table_num,
      row: row_num,
      text: text,
      target: target || 'MISSING',
      type: type
    }
  end
end

# Usage
inspect_hyperlinks_with_debug('../tipitaka/working-solution.docx')
require 'find'
require 'fileutils'

# Folder to check and rename — change this to your actual path
ROOT = "../tipitaka-latest" # <-- Replace with your target directory

renamed_items = []

# NOTE: if a directory itself gets renamed, Find may not descend into its contents
# under the new name in the same pass, so re-run the script until nothing is renamed.
Find.find(ROOT) do |path|
  name = File.basename(path)
  parent = File.dirname(path)

  # Normalize the filename to NFC form
  normalized_name = name.unicode_normalize(:nfc)

  # Skip renaming if the filename is already in NFC form
  if name != normalized_name
    puts "Original:   #{path}"
    puts "Normalized: #{File.join(parent, normalized_name)}"

    # Temporary name to avoid conflicts (e.g. on filesystems that treat the
    # NFC and NFD spellings as the same entry)
    temp_name = "#{name}_TEMP_RENAME_#{rand(1000..9999)}" # Use a unique temp name
    temp_path = File.join(parent, temp_name)

    begin
      # First, rename to the temporary name
      FileUtils.mv(path, temp_path)
      puts "Renamed to temporary name: #{temp_path}"

      # Now rename to the normalized (proper) name
      new_path = File.join(parent, normalized_name)
      FileUtils.mv(temp_path, new_path)
      puts "Renamed to final name: #{new_path}"

      renamed_items << { from: path, to: new_path }
    rescue => e
      # If any error occurs during renaming, print the error and skip this file
      puts "⚠️ Error renaming #{path}: #{e.message}"
    end
  end
end

# Output renamed items
puts "\n✅ Renamed items (#{renamed_items.size}):"
renamed_items.each do |item|
  puts "  #{item[:from]} → #{item[:to]}"
end

puts "\n📊 Total files and folders renamed: #{renamed_items.size}"