Skip to content

Instantly share code, notes, and snippets.

@mbutler
Created January 20, 2026 18:59
Show Gist options
  • Select an option

  • Save mbutler/d951d54f3c5cf20d3a64020ffcb34eba to your computer and use it in GitHub Desktop.

Select an option

Save mbutler/d951d54f3c5cf20d3a64020ffcb34eba to your computer and use it in GitHub Desktop.
export CSVs from Omeka collections
<?php
/**
* export_vanallen.php — streams CSV for Collection items/files/transcripts.
* See query params at bottom of this header.
*
* Params:
* - collection_id (int, default 22)
* - only_with_transcript (0|1, default 0)
* - debug (0|1, default 0)
* - title_id, source_id, trans_id (ints) to override element ids
*/
$dbHost = 'localhost';
$dbName = 'omekaprod';
$dbUser = 'mtbutler';
$dbPass = '';
$collectionId = isset($_GET['collection_id']) ? (int)$_GET['collection_id'] : 22;
$onlyWithTranscript = isset($_GET['only_with_transcript']) ? (int)$_GET['only_with_transcript'] : 0;
$debug = isset($_GET['debug']) ? (int)$_GET['debug'] : 0;
$overrideTitleId = isset($_GET['title_id']) ? (int)$_GET['title_id'] : null;
$overrideSourceId = isset($_GET['source_id']) ? (int)$_GET['source_id'] : null;
$overrideTransId = isset($_GET['trans_id']) ? (int)$_GET['trans_id'] : null;
ini_set('memory_limit', '1024M');
set_time_limit(0);
while (ob_get_level() > 0) ob_end_clean();
$dsn = "mysql:host={$dbHost};dbname={$dbName};charset=utf8mb4";
$options = [
PDO::ATTR_ERRMODE => PDO::ERRMODE_EXCEPTION,
PDO::ATTR_DEFAULT_FETCH_MODE => PDO::FETCH_ASSOC,
PDO::MYSQL_ATTR_USE_BUFFERED_QUERY => false,
PDO::ATTR_EMULATE_PREPARES => false
];
try {
$pdo = new PDO($dsn, $dbUser, $dbPass, $options);
$pdo->exec("SET NAMES utf8mb4");
$pdo->exec("SET SESSION sql_big_selects = 1");
} catch (Throwable $e) {
header('Content-Type: text/plain; charset=utf-8', true, 500);
echo "Database connection failed: " . $e->getMessage();
exit;
}
function getElementId(PDO $pdo, string $setName, string $elementName): ?int {
$st = $pdo->prepare("
SELECT e.id
FROM elements e
JOIN element_sets s ON s.id = e.element_set_id
WHERE s.name = :setName AND e.name = :elementName
LIMIT 1
");
$st->execute([':setName' => $setName, ':elementName' => $elementName]);
$row = $st->fetch();
return $row ? (int)$row['id'] : null;
}
$dcTitleId = $overrideTitleId ?: getElementId($pdo, 'Dublin Core', 'Title');
$dcSourceId = $overrideSourceId ?: getElementId($pdo, 'Dublin Core', 'Source');
$scriptusTransId = $overrideTransId ?: getElementId($pdo, 'Scriptus', 'Transcription');
if ($debug) {
header('Content-Type: text/plain; charset=utf-8');
$items = (int)$pdo->query("SELECT COUNT(*) c FROM items WHERE collection_id = {$collectionId}")->fetch()['c'];
$st = $pdo->prepare("
SELECT COUNT(*) c
FROM items i JOIN files f ON f.item_id = i.id
WHERE i.collection_id = :cid
");
$st->execute([':cid' => $collectionId]);
$files = (int)$st->fetch()['c'];
$transcripts = 'n/a';
if ($scriptusTransId) {
$st = $pdo->prepare("
SELECT COUNT(*) c
FROM items i
JOIN files f ON f.item_id = i.id
JOIN element_texts et
ON et.record_type = 'File' AND et.record_id = f.id AND et.element_id = :eid
WHERE i.collection_id = :cid
");
$st->execute([':eid' => $scriptusTransId, ':cid' => $collectionId]);
$transcripts = (int)$st->fetch()['c'];
}
echo "DEBUG\n";
echo "collection_id: {$collectionId}\n";
echo "dcTitleId: " . var_export($dcTitleId, true) . "\n";
echo "dcSourceId: " . var_export($dcSourceId, true) . "\n";
echo "scriptusTransId: " . var_export($scriptusTransId, true) . "\n";
echo "items in collection: {$items}\n";
echo "files for those items: {$files}\n";
echo "files with transcription: {$transcripts}\n";
$sample = $pdo->prepare("
SELECT i.id AS item_id, f.id AS file_id, f.filename
FROM items i JOIN files f ON f.item_id = i.id
WHERE i.collection_id = :cid
ORDER BY i.id, f.id LIMIT 5
");
$sample->execute([':cid' => $collectionId]);
echo "\nSAMPLE item+file rows:\n";
foreach ($sample as $r) echo json_encode($r, JSON_UNESCAPED_SLASHES|JSON_UNESCAPED_UNICODE) . "\n";
exit;
}
$dcTitleIdSafe = $dcTitleId ?: 0;
$dcSourceIdSafe = $dcSourceId ?: 0;
$scriptusTransIdSafe = $scriptusTransId ?: 0;
$sql = "
SELECT
i.id AS item_id,
it.item_title AS main_object_title,
f.id AS file_id,
f.filename AS file_name,
ft.file_title AS part_title,
isrc.source_value AS digital_collection_link,
tr.transcription AS transcript
FROM items i
JOIN files f ON f.item_id = i.id
LEFT JOIN (
SELECT record_id AS item_id, MAX(text) AS item_title
FROM element_texts
WHERE record_type = 'Item' AND element_id = :dc_title_item
GROUP BY record_id
) it ON it.item_id = i.id
LEFT JOIN (
SELECT record_id AS item_id, MAX(text) AS source_value
FROM element_texts
WHERE record_type = 'Item' AND element_id = :dc_source
GROUP BY record_id
) isrc ON isrc.item_id = i.id
LEFT JOIN (
SELECT record_id AS file_id, MAX(text) AS file_title
FROM element_texts
WHERE record_type = 'File' AND element_id = :dc_title_file
GROUP BY record_id
) ft ON ft.file_id = f.id
LEFT JOIN (
SELECT et1.record_id AS file_id, et1.text AS transcription
FROM element_texts et1
JOIN (
SELECT record_id, MAX(id) AS max_id
FROM element_texts
WHERE record_type = 'File' AND element_id = :scriptus_trans
GROUP BY record_id
) latest ON latest.max_id = et1.id
) tr ON tr.file_id = f.id
WHERE i.collection_id = :collection_id
" . ($onlyWithTranscript ? " AND tr.transcription IS NOT NULL\n" : "") . "
ORDER BY i.id, f.id
";
$filenameWarn = (!$dcTitleId || !$dcSourceId || !$scriptusTransId) ? '_WARN_ELEM_IDS' : '';
$filename = "vanallen_collection{$collectionId}{$filenameWarn}_" . date('Ymd_His') . ".csv";
header('Content-Type: text/csv; charset=utf-8');
header('Content-Disposition: attachment; filename="'.$filename.'"');
header('X-Accel-Buffering: no');
$out = fopen('php://output', 'w');
fputcsv($out, ['item_id','main_object_title','file_id','file_name','part_title','digital_collection_link','transcript']);
try {
$stmt = $pdo->prepare($sql);
$stmt->execute([
':dc_title_item' => $dcTitleIdSafe,
':dc_source' => $dcSourceIdSafe,
':dc_title_file' => $dcTitleIdSafe,
':scriptus_trans' => $scriptusTransIdSafe,
':collection_id' => $collectionId
]);
$rows = 0;
while ($row = $stmt->fetch()) {
if (isset($row['transcript']) && $row['transcript'] !== null) {
$row['transcript'] = str_replace(["\r\n", "\r"], "\n", $row['transcript']);
}
fputcsv($out, [
$row['item_id'],
$row['main_object_title'],
$row['file_id'],
$row['file_name'],
$row['part_title'],
$row['digital_collection_link'],
$row['transcript']
]);
if ((++$rows % 500) === 0) fflush($out);
}
} catch (Throwable $e) {
fwrite($out, "\n# ERROR: " . $e->getMessage() . "\n");
}
fclose($out);
exit;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment