Last active
January 21, 2026 18:10
-
-
Save misterburton/e836997d1b5335982500e0520db21c24 to your computer and use it in GitHub Desktop.
AI-Powered Localization - Translation Generation Script (Claude Opus 4.5)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * pre-translate.js
 *
 * Extracts translatable content from HTML pages and sends it to a translation API.
 * Stores results in Vercel KV for fast edge-cached retrieval.
 *
 * REQUIREMENTS:
 * - Node.js 18+
 * - npm packages: fs-extra, jsdom, @vercel/kv, dotenv
 * - Vercel project with KV database configured
 * - Translation API endpoint (see /api/translate.js)
 * - Local dev server running (vercel dev) on port 3000
 *
 * SETUP:
 * 1. npm install fs-extra jsdom @vercel/kv dotenv
 * 2. Run `vercel env pull .env.local` to get KV credentials
 * 3. Create sitemap.xml listing all pages to translate
 * 4. Run `vercel dev` in a separate terminal
 * 5. Run `node pre-translate.js`
 *
 * HOW IT WORKS:
 * - Reads sitemap.xml to find all pages
 * - Extracts content from elements with data-l10n-id attributes
 * - Computes a hash of the content to detect changes
 * - Sends content to translation API for each target language
 * - Stores translation hash in content-hashes.json to skip unchanged pages
 *
 * NOTE ON content-hashes.json:
 * This file is created automatically on first run. It stores SHA-256 hashes
 * of your source content, allowing the script to skip unchanged pages on
 * subsequent runs. If the file doesn't exist, the script creates it. If you
 * need to force re-translation of a page, delete its "_translationHash" entry.
 */
| const fs = require('fs-extra'); | |
| const path = require('path'); | |
| const { JSDOM } = require('jsdom'); | |
| const crypto = require('crypto'); | |
| const { createClient } = require('@vercel/kv'); | |
| require('dotenv').config({ path: '.env.local' }); | |
// Vercel KV client, configured from the KV_REST_API_* environment variables
// (dotenv has already loaded .env.local by this point).
// NOTE(review): `kv` is not referenced anywhere else in the visible script —
// confirm whether it is still needed before removing it.
const kvOptions = {
  url: process.env.KV_REST_API_URL,
  token: process.env.KV_REST_API_TOKEN,
};
const kv = createClient(kvOptions);
// Supported languages - the world's most spoken languages.
// All 6 languages have both translation and narration support.
// `code` is used for log output and for excluding the source language ('en');
// `name` is what gets sent to the translation API as the target language.
// Frozen so no part of the script can mutate the shared list by accident.
const LANGUAGES = Object.freeze(
  [
    { code: 'en', name: 'English' },
    { code: 'es', name: 'Spanish' },
    { code: 'zh', name: 'Chinese' },
    { code: 'hi', name: 'Hindi' },
    { code: 'ar', name: 'Arabic' },
    { code: 'fr', name: 'French' },
  ].map((language) => Object.freeze(language)),
);
// Configuration (edit the values here; the object is frozen at runtime so the
// script itself cannot change it midway through a run).
const CONFIG = Object.freeze({
  // Your site's base URL (used to select and strip sitemap URLs)
  siteUrl: 'https://yoursite.com/',

  // Local dev server URL for translation API
  apiUrl: 'http://localhost:3000/api/translate',

  // How many languages to translate concurrently
  concurrencyLimit: 10,

  // Timeout for each translation request (ms)
  requestTimeout: 120000,
});
/**
 * Pull the page URLs that belong to this site out of sitemap XML.
 *
 * @param {string} sitemapXml - Raw contents of sitemap.xml.
 * @param {string} siteUrl - Only URLs starting with this prefix are kept.
 * @returns {string[]} Matching URLs in sitemap order; empty when there are none.
 */
function extractSitemapUrls(sitemapXml, siteUrl) {
  // matchAll yields an empty iterator when nothing matches, whereas
  // String.prototype.match returns null and would crash a chained .map().
  return Array.from(sitemapXml.matchAll(/<loc>\s*(.*?)\s*<\/loc>/g), (match) => match[1])
    .filter((url) => url.startsWith(siteUrl));
}

/**
 * Collect the translatable content of a parsed page.
 *
 * @param {Document} document - DOM document to scan for [data-l10n-id] elements.
 * @returns {{ originalContent: Object<string, string>, normalizedForHash: Object<string, string> }}
 *   `originalContent` maps each l10n id to the content sent for translation;
 *   `normalizedForHash` maps the same ids to whitespace-normalized text used
 *   only for change detection. Both are keyed in sorted id order.
 */
function collectTranslatableContent(document) {
  // Index elements by id directly instead of re-querying with an interpolated
  // selector (which throws on ids containing quotes). The first element in
  // document order wins for a duplicated id, as document.querySelector would.
  const elementsById = new Map();
  for (const el of Array.from(document.querySelectorAll('[data-l10n-id]'))) {
    const id = el.dataset.l10nId;
    if (id && !elementsById.has(id)) elementsById.set(id, el);
  }

  const originalContent = {};
  const normalizedForHash = {};

  // Sorted ids keep the key order, and therefore the hash, stable across runs.
  for (const id of [...elementsById.keys()].sort()) {
    const el = elementsById.get(id);
    let content = '';
    let textForHash = '';

    if (el.tagName === 'INPUT' || el.tagName === 'TEXTAREA') {
      content = el.placeholder || '';
      textForHash = content;
    } else if (el.tagName === 'META') {
      content = el.getAttribute('content') || '';
      textForHash = content;
    } else {
      content = el.innerHTML.trim();
      // For hash comparison, use text content only (strip dynamic elements)
      const clone = el.cloneNode(true);
      clone.querySelectorAll('.footer-year, .copy-button, .line-number').forEach((node) => node.remove());
      textForHash = clone.textContent;
    }

    if (!content) continue;

    originalContent[id] = content;
    normalizedForHash[id] = textForHash
      .replace(/[\u200B-\u200D\uFEFF]/g, '') // Remove zero-width chars
      .replace(/\s+/g, ' ') // Collapse whitespace
      .trim();
  }

  return { originalContent, normalizedForHash };
}

/**
 * Ask the translation API to translate one page into one language.
 * Never rejects: every failure is logged and reported in the result object.
 *
 * @param {{ code: string, name: string }} lang - Target language.
 * @param {{ pageId: string, content: Object<string, string>, contentHash: string }} page
 *   Page payload forwarded to the API.
 * @returns {Promise<{ success: boolean, lang: Object, error?: string }>}
 */
async function requestTranslation(lang, { pageId, content, contentHash }) {
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), CONFIG.requestTimeout);

  try {
    const response = await fetch(CONFIG.apiUrl, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({
        pageId,
        content,
        targetLanguage: lang.name,
        contentHash,
        bypassCache: true, // Force fresh translation
      }),
      signal: controller.signal,
    });

    if (!response.ok) {
      const err = await response.text();
      console.error(` [${lang.code}] Failed: ${err}`);
      return { success: false, lang, error: err };
    }

    const data = await response.json();
    console.log(` [${lang.code}] ${data.cached ? 'Cached' : 'Translated'}`);
    return { success: true, lang };
  } catch (error) {
    const msg = error.name === 'AbortError' ? 'Timed out' : error.message;
    console.error(` [${lang.code}] Error: ${msg}`);
    return { success: false, lang, error: msg };
  } finally {
    // Cleared only here so the timeout also covers reading the response body.
    clearTimeout(timeoutId);
  }
}

/**
 * Translate every page listed in sitemap.xml whose source content changed.
 *
 * For each sitemap URL under CONFIG.siteUrl this maps the URL to a local HTML
 * file, extracts its [data-l10n-id] content, hashes it, and — unless the hash
 * matches the one stored in content-hashes.json — posts the content to the
 * translation API once per non-English language, retrying failures. The hash
 * is written back after every attempted page.
 *
 * Uses the global `fetch` and `AbortController`.
 *
 * @returns {Promise<void>} Resolves when all pages were processed. Sets
 *   `process.exitCode = 1` and returns early when sitemap.xml is missing.
 */
async function preTranslate() {
  const MAX_RETRIES = 2;
  const RETRY_DELAY = 2000; // 2 seconds between retries

  const sitemapPath = path.resolve('sitemap.xml');
  if (!fs.existsSync(sitemapPath)) {
    console.error('Error: sitemap.xml not found');
    console.error('Create a sitemap.xml file listing all pages to translate.');
    process.exitCode = 1; // let shells and CI see that nothing was translated
    return;
  }

  const sitemapContent = await fs.readFile(sitemapPath, 'utf8');
  const urls = extractSitemapUrls(sitemapContent, CONFIG.siteUrl);
  if (urls.length === 0) {
    console.warn(`No <loc> entries starting with ${CONFIG.siteUrl} found in sitemap.xml`);
  }

  // Load existing content hashes to detect changes
  const hashFile = path.resolve('content-hashes.json');
  const hashes = fs.existsSync(hashFile) ? await fs.readJson(hashFile) : {};

  // Translate to all languages except English
  const languagesToTranslate = LANGUAGES.filter((l) => l.code !== 'en');
  const batchCount = Math.ceil(languagesToTranslate.length / CONFIG.concurrencyLimit);

  for (const url of urls) {
    // Drop the site prefix. Leading slashes are stripped so that a siteUrl
    // configured without a trailing slash does not yield an absolute path
    // that path.resolve would anchor at the filesystem root.
    let relativePath = url.slice(CONFIG.siteUrl.length).replace(/^\/+/, '');
    if (!relativePath || relativePath.endsWith('/')) relativePath += 'index.html';

    const filePath = path.resolve(relativePath);
    if (!fs.existsSync(filePath)) {
      console.warn(`File not found for URL ${url}: ${filePath}`);
      continue;
    }

    const pageName = getPageName(relativePath);
    console.log(`\nProcessing page: ${pageName} (${relativePath})`);

    // Parse HTML and extract translatable content
    const html = await fs.readFile(filePath, 'utf8');
    const { document } = new JSDOM(html).window;
    const { originalContent, normalizedForHash } = collectTranslatableContent(document);

    // Nothing to translate: do not spend one API call per language on an empty payload.
    if (Object.keys(originalContent).length === 0) {
      console.log(`Skipping ${pageName}: No translatable content found.`);
      continue;
    }

    // Compute hash of all content to detect changes
    const contentHash = crypto.createHash('sha256')
      .update(JSON.stringify(normalizedForHash))
      .digest('hex');

    if (!hashes[pageName]) hashes[pageName] = {};

    // Skip if content hasn't changed since last translation
    if (hashes[pageName]._translationHash === contentHash) {
      console.log(`Skipping ${pageName}: Content unchanged.`);
      continue;
    }

    console.log(`Changes detected in ${pageName}. Translating to ${languagesToTranslate.length} languages...`);

    const page = { pageId: pageName, content: originalContent, contentHash };
    let failedLanguages = [];

    // Initial translation pass, CONFIG.concurrencyLimit languages at a time
    for (let i = 0; i < languagesToTranslate.length; i += CONFIG.concurrencyLimit) {
      const batch = languagesToTranslate.slice(i, i + CONFIG.concurrencyLimit);
      console.log(` Batch ${Math.floor(i / CONFIG.concurrencyLimit) + 1}/${batchCount}...`);

      const results = await Promise.all(batch.map((lang) => requestTranslation(lang, page)));
      for (const result of results) {
        if (!result.success) failedLanguages.push(result.lang);
      }
    }

    // Retry failed languages
    for (let attempt = 1; attempt <= MAX_RETRIES && failedLanguages.length > 0; attempt++) {
      console.log(`\n Retrying ${failedLanguages.length} failed language(s) (attempt ${attempt}/${MAX_RETRIES})...`);
      await new Promise((resolve) => setTimeout(resolve, RETRY_DELAY));

      const retryResults = await Promise.all(failedLanguages.map((lang) => requestTranslation(lang, page)));

      // Keep only the languages that failed again
      failedLanguages = retryResults
        .filter((result) => !result.success)
        .map((result) => result.lang);

      if (failedLanguages.length === 0) {
        console.log(` ✓ All retries successful!`);
      }
    }

    // Always save the hash after attempting translation.
    // The hash tracks SOURCE CONTENT state, not API success.
    // This prevents re-translating the same content due to intermittent API failures.
    // Failed languages can be identified by checking KV cache directly.
    hashes[pageName]._translationHash = contentHash;
    await fs.writeJson(hashFile, hashes, { spaces: 2 });

    if (failedLanguages.length === 0) {
      console.log(`✓ Successfully translated ${pageName} to all languages.`);
    } else {
      console.log(`⚠ Translated ${pageName} with ${failedLanguages.length} failure(s): ${failedLanguages.map((l) => l.code).join(', ')}`);
      console.log(` To retry: delete _translationHash for this page in content-hashes.json`);
    }
  }

  console.log('\nPre-translation complete.');
}
/**
 * Derive a page name from an HTML file path.
 *
 * An `index.html` takes the name of its containing directory, or 'home' when
 * it sits at the top level (relative or root-anchored). Any other file is
 * named after itself without the `.html` extension.
 *
 * Examples:
 *   index.html        -> 'home'
 *   /index.html       -> 'home'
 *   about/index.html  -> 'about'
 *   blog/post.html    -> 'post'
 *
 * NOTE(review): only the last path segment is used, so files such as
 * blog/post.html and news/post.html map to the same name.
 *
 * @param {string} filePath - Path to the page's HTML file.
 * @returns {string} The page name.
 */
function getPageName(filePath) {
  const fileName = path.basename(filePath, '.html');
  if (fileName !== 'index') return fileName;

  const parentDir = path.dirname(filePath);
  const parentName = path.basename(parentDir);

  // path.basename('/') is '', so a root-anchored index would otherwise get an
  // empty page name; treat it like the top-level index.
  if (parentDir === '.' || parentDir === '' || parentName === '') return 'home';
  return parentName;
}
// Run the script. An unexpected failure is logged and reported through a
// non-zero exit code so shells and CI do not mistake a crashed run for success.
preTranslate().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment