Skip to content

Instantly share code, notes, and snippets.

@misterburton
Last active January 21, 2026 18:10
Show Gist options
  • Select an option

  • Save misterburton/e836997d1b5335982500e0520db21c24 to your computer and use it in GitHub Desktop.

Select an option

Save misterburton/e836997d1b5335982500e0520db21c24 to your computer and use it in GitHub Desktop.
AI-Powered Localization - Translation Generation Script (Claude Opus 4.5)
/**
* pre-translate.js
*
* Extracts translatable content from HTML pages and sends it to a translation API.
* This script never writes to Vercel KV itself; the API endpoint is expected to
* store the results there for fast edge-cached retrieval.
*
* REQUIREMENTS:
* - Node.js 18+
* - npm packages: fs-extra, jsdom, @vercel/kv, dotenv
* - Vercel project with KV database configured
* - Translation API endpoint (see /api/translate.js)
* - Local dev server running (vercel dev) on port 3000
*
* SETUP:
* 1. npm install fs-extra jsdom @vercel/kv dotenv
* 2. Run `vercel env pull .env.local` to get KV credentials
* 3. Create sitemap.xml listing all pages to translate
* 4. Run `vercel dev` in a separate terminal
* 5. Run `node pre-translate.js`
*
* HOW IT WORKS:
* - Reads sitemap.xml to find all pages
* - Extracts content from elements with data-l10n-id attributes
* - Computes a hash of the content to detect changes
* - Sends content to translation API for each target language
* - Stores translation hash in content-hashes.json to skip unchanged pages
*
* NOTE ON content-hashes.json:
* This file is created automatically on first run. It stores SHA-256 hashes
* of your source content, allowing the script to skip unchanged pages on
* subsequent runs. If the file doesn't exist, the script creates it. The hash is
* saved even when some languages fail, so to retry failed languages or to force
* re-translation of a page, delete its "_translationHash" entry.
*/
const fs = require('fs-extra');
const path = require('path');
const { JSDOM } = require('jsdom');
const crypto = require('crypto');
const { createClient } = require('@vercel/kv');
require('dotenv').config({ path: '.env.local' });
// Initialize Vercel KV client
const kv = createClient({
url: process.env.KV_REST_API_URL,
token: process.env.KV_REST_API_TOKEN,
});
// Supported languages - the world's most spoken languages
// All 6 languages have both translation and narration support
const LANGUAGES = [
{ code: 'en', name: 'English' },
{ code: 'es', name: 'Spanish' },
{ code: 'zh', name: 'Chinese' },
{ code: 'hi', name: 'Hindi' },
{ code: 'ar', name: 'Arabic' },
{ code: 'fr', name: 'French' }
];
// Configuration
const CONFIG = {
// Your site's base URL (used to parse sitemap)
siteUrl: 'https://yoursite.com/',
// Local dev server URL for translation API
apiUrl: 'http://localhost:3000/api/translate',
// How many languages to translate concurrently
concurrencyLimit: 10,
// Timeout for each translation request (ms)
requestTimeout: 120000
};
async function preTranslate() {
const sitemapPath = path.resolve('sitemap.xml');
if (!fs.existsSync(sitemapPath)) {
console.error('Error: sitemap.xml not found');
console.error('Create a sitemap.xml file listing all pages to translate.');
return;
}
const sitemapContent = await fs.readFile(sitemapPath, 'utf8');
const urls = sitemapContent.match(/<loc>(.*?)<\/loc>/g)
.map(loc => loc.replace(/<\/?loc>/g, ''))
.filter(url => url.startsWith(CONFIG.siteUrl));
// Load existing content hashes to detect changes
const hashFile = path.resolve('content-hashes.json');
let hashes = fs.existsSync(hashFile) ? await fs.readJson(hashFile) : {};
for (const url of urls) {
let relativePath = url.replace(CONFIG.siteUrl, '');
if (!relativePath || relativePath.endsWith('/')) relativePath += 'index.html';
const filePath = path.resolve(relativePath);
if (!fs.existsSync(filePath)) {
console.warn(`File not found for URL ${url}: ${filePath}`);
continue;
}
const pageName = getPageName(relativePath);
console.log(`\nProcessing page: ${pageName} (${relativePath})`);
// Parse HTML and extract translatable content
const html = await fs.readFile(filePath, 'utf8');
const dom = new JSDOM(html);
const { document } = dom.window;
// Find all elements with data-l10n-id attribute
const translatableElements = Array.from(document.querySelectorAll('[data-l10n-id]'));
const originalContent = {};
const normalizedForHash = {};
const allIds = translatableElements.map(el => el.dataset.l10nId).filter(Boolean).sort();
// Extract content from each translatable element
allIds.forEach(id => {
const el = document.querySelector(`[data-l10n-id="${id}"]`);
if (!el) return;
let content = '';
let textForHash = '';
// Handle different element types
if (el.tagName === 'INPUT' || el.tagName === 'TEXTAREA') {
content = el.placeholder || '';
textForHash = content;
} else if (el.tagName === 'META') {
content = el.getAttribute('content') || '';
textForHash = content;
} else {
content = el.innerHTML.trim();
// For hash comparison, use text content only (strip dynamic elements)
const clone = el.cloneNode(true);
clone.querySelectorAll('.footer-year, .copy-button, .line-number').forEach(e => e.remove());
textForHash = clone.textContent;
}
if (content) {
originalContent[id] = content;
// Normalize text for consistent hashing
normalizedForHash[id] = textForHash
.replace(/[\u200B-\u200D\uFEFF]/g, '') // Remove zero-width chars
.replace(/\s+/g, ' ') // Collapse whitespace
.trim();
}
});
// Compute hash of all content to detect changes
const contentHash = crypto.createHash('sha256')
.update(JSON.stringify(normalizedForHash))
.digest('hex');
if (!hashes[pageName]) hashes[pageName] = {};
// Skip if content hasn't changed since last translation
if (hashes[pageName]._translationHash === contentHash) {
console.log(`Skipping ${pageName}: Content unchanged.`);
continue;
}
console.log(`Changes detected in ${pageName}. Translating to ${LANGUAGES.length - 1} languages...`);
// Translate to all languages except English
const languagesToTranslate = LANGUAGES.filter(l => l.code !== 'en');
const MAX_RETRIES = 2;
const RETRY_DELAY = 2000; // 2 seconds between retries
let failedLanguages = [];
// Helper function to translate a single language
async function translateLanguage(lang) {
const controller = new AbortController();
const timeoutId = setTimeout(() => controller.abort(), CONFIG.requestTimeout);
try {
const response = await fetch(CONFIG.apiUrl, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
pageId: pageName,
content: originalContent,
targetLanguage: lang.name,
contentHash: contentHash,
bypassCache: true // Force fresh translation
}),
signal: controller.signal
});
clearTimeout(timeoutId);
if (!response.ok) {
const err = await response.text();
console.error(` [${lang.code}] Failed: ${err}`);
return { success: false, lang, error: err };
}
const data = await response.json();
console.log(` [${lang.code}] ${data.cached ? 'Cached' : 'Translated'}`);
return { success: true, lang };
} catch (error) {
clearTimeout(timeoutId);
const msg = error.name === 'AbortError' ? 'Timed out' : error.message;
console.error(` [${lang.code}] Error: ${msg}`);
return { success: false, lang, error: msg };
}
}
// Initial translation pass
for (let i = 0; i < languagesToTranslate.length; i += CONFIG.concurrencyLimit) {
const batch = languagesToTranslate.slice(i, i + CONFIG.concurrencyLimit);
console.log(` Batch ${Math.floor(i / CONFIG.concurrencyLimit) + 1}/${Math.ceil(languagesToTranslate.length / CONFIG.concurrencyLimit)}...`);
const results = await Promise.all(batch.map(lang => translateLanguage(lang)));
// Collect failed languages
results.forEach(result => {
if (!result.success) {
failedLanguages.push(result.lang);
}
});
}
// Retry failed languages
if (failedLanguages.length > 0) {
for (let attempt = 1; attempt <= MAX_RETRIES; attempt++) {
console.log(`\n Retrying ${failedLanguages.length} failed language(s) (attempt ${attempt}/${MAX_RETRIES})...`);
await new Promise(resolve => setTimeout(resolve, RETRY_DELAY));
const retryResults = await Promise.all(failedLanguages.map(lang => translateLanguage(lang)));
// Remove successful retries from failedLanguages
failedLanguages = retryResults
.filter(result => !result.success)
.map(result => result.lang);
if (failedLanguages.length === 0) {
console.log(` ✓ All retries successful!`);
break;
}
}
}
// Always save the hash after attempting translation.
// The hash tracks SOURCE CONTENT state, not API success.
// This prevents re-translating the same content due to intermittent API failures.
// Failed languages can be identified by checking KV cache directly.
hashes[pageName]._translationHash = contentHash;
await fs.writeJson(hashFile, hashes, { spaces: 2 });
if (failedLanguages.length === 0) {
console.log(`✓ Successfully translated ${pageName} to all languages.`);
} else {
console.log(`⚠ Translated ${pageName} with ${failedLanguages.length} failure(s): ${failedLanguages.map(l => l.code).join(', ')}`);
console.log(` To retry: delete _translationHash for this page in content-hashes.json`);
}
}
console.log('\nPre-translation complete.');
}
/**
* Extract page name from file path
* Examples:
* index.html -> 'home'
* about/index.html -> 'about'
* blog/post.html -> 'post'
*/
function getPageName(filePath) {
const fileName = path.basename(filePath, '.html');
const parentDir = path.dirname(filePath);
if (fileName === 'index') {
if (parentDir === '.' || parentDir === '') return 'home';
return path.basename(parentDir);
}
return fileName;
}
// Run the script
preTranslate().catch(console.error);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment