Created
January 20, 2026 18:59
-
-
Save mbutler/d951d54f3c5cf20d3a64020ffcb34eba to your computer and use it in GitHub Desktop.
export CSVs from Omeka collections
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| <?php | |
| /** | |
| * export_vanallen.php — streams CSV for Collection items/files/transcripts. | |
| * See query params at bottom of this header. | |
| * | |
| * Params: | |
| * - collection_id (int, default 22) | |
| * - only_with_transcript (0|1, default 0) | |
| * - debug (0|1, default 0) | |
| * - title_id, source_id, trans_id (ints) to override element ids | |
| */ | |
| $dbHost = 'localhost'; | |
| $dbName = 'omekaprod'; | |
| $dbUser = 'mtbutler'; | |
| $dbPass = ''; | |
| $collectionId = isset($_GET['collection_id']) ? (int)$_GET['collection_id'] : 22; | |
| $onlyWithTranscript = isset($_GET['only_with_transcript']) ? (int)$_GET['only_with_transcript'] : 0; | |
| $debug = isset($_GET['debug']) ? (int)$_GET['debug'] : 0; | |
| $overrideTitleId = isset($_GET['title_id']) ? (int)$_GET['title_id'] : null; | |
| $overrideSourceId = isset($_GET['source_id']) ? (int)$_GET['source_id'] : null; | |
| $overrideTransId = isset($_GET['trans_id']) ? (int)$_GET['trans_id'] : null; | |
| ini_set('memory_limit', '1024M'); | |
| set_time_limit(0); | |
| while (ob_get_level() > 0) ob_end_clean(); | |
| $dsn = "mysql:host={$dbHost};dbname={$dbName};charset=utf8mb4"; | |
| $options = [ | |
| PDO::ATTR_ERRMODE => PDO::ERRMODE_EXCEPTION, | |
| PDO::ATTR_DEFAULT_FETCH_MODE => PDO::FETCH_ASSOC, | |
| PDO::MYSQL_ATTR_USE_BUFFERED_QUERY => false, | |
| PDO::ATTR_EMULATE_PREPARES => false | |
| ]; | |
| try { | |
| $pdo = new PDO($dsn, $dbUser, $dbPass, $options); | |
| $pdo->exec("SET NAMES utf8mb4"); | |
| $pdo->exec("SET SESSION sql_big_selects = 1"); | |
| } catch (Throwable $e) { | |
| header('Content-Type: text/plain; charset=utf-8', true, 500); | |
| echo "Database connection failed: " . $e->getMessage(); | |
| exit; | |
| } | |
| function getElementId(PDO $pdo, string $setName, string $elementName): ?int { | |
| $st = $pdo->prepare(" | |
| SELECT e.id | |
| FROM elements e | |
| JOIN element_sets s ON s.id = e.element_set_id | |
| WHERE s.name = :setName AND e.name = :elementName | |
| LIMIT 1 | |
| "); | |
| $st->execute([':setName' => $setName, ':elementName' => $elementName]); | |
| $row = $st->fetch(); | |
| return $row ? (int)$row['id'] : null; | |
| } | |
| $dcTitleId = $overrideTitleId ?: getElementId($pdo, 'Dublin Core', 'Title'); | |
| $dcSourceId = $overrideSourceId ?: getElementId($pdo, 'Dublin Core', 'Source'); | |
| $scriptusTransId = $overrideTransId ?: getElementId($pdo, 'Scriptus', 'Transcription'); | |
| if ($debug) { | |
| header('Content-Type: text/plain; charset=utf-8'); | |
| $items = (int)$pdo->query("SELECT COUNT(*) c FROM items WHERE collection_id = {$collectionId}")->fetch()['c']; | |
| $st = $pdo->prepare(" | |
| SELECT COUNT(*) c | |
| FROM items i JOIN files f ON f.item_id = i.id | |
| WHERE i.collection_id = :cid | |
| "); | |
| $st->execute([':cid' => $collectionId]); | |
| $files = (int)$st->fetch()['c']; | |
| $transcripts = 'n/a'; | |
| if ($scriptusTransId) { | |
| $st = $pdo->prepare(" | |
| SELECT COUNT(*) c | |
| FROM items i | |
| JOIN files f ON f.item_id = i.id | |
| JOIN element_texts et | |
| ON et.record_type = 'File' AND et.record_id = f.id AND et.element_id = :eid | |
| WHERE i.collection_id = :cid | |
| "); | |
| $st->execute([':eid' => $scriptusTransId, ':cid' => $collectionId]); | |
| $transcripts = (int)$st->fetch()['c']; | |
| } | |
| echo "DEBUG\n"; | |
| echo "collection_id: {$collectionId}\n"; | |
| echo "dcTitleId: " . var_export($dcTitleId, true) . "\n"; | |
| echo "dcSourceId: " . var_export($dcSourceId, true) . "\n"; | |
| echo "scriptusTransId: " . var_export($scriptusTransId, true) . "\n"; | |
| echo "items in collection: {$items}\n"; | |
| echo "files for those items: {$files}\n"; | |
| echo "files with transcription: {$transcripts}\n"; | |
| $sample = $pdo->prepare(" | |
| SELECT i.id AS item_id, f.id AS file_id, f.filename | |
| FROM items i JOIN files f ON f.item_id = i.id | |
| WHERE i.collection_id = :cid | |
| ORDER BY i.id, f.id LIMIT 5 | |
| "); | |
| $sample->execute([':cid' => $collectionId]); | |
| echo "\nSAMPLE item+file rows:\n"; | |
| foreach ($sample as $r) echo json_encode($r, JSON_UNESCAPED_SLASHES|JSON_UNESCAPED_UNICODE) . "\n"; | |
| exit; | |
| } | |
| $dcTitleIdSafe = $dcTitleId ?: 0; | |
| $dcSourceIdSafe = $dcSourceId ?: 0; | |
| $scriptusTransIdSafe = $scriptusTransId ?: 0; | |
| $sql = " | |
| SELECT | |
| i.id AS item_id, | |
| it.item_title AS main_object_title, | |
| f.id AS file_id, | |
| f.filename AS file_name, | |
| ft.file_title AS part_title, | |
| isrc.source_value AS digital_collection_link, | |
| tr.transcription AS transcript | |
| FROM items i | |
| JOIN files f ON f.item_id = i.id | |
| LEFT JOIN ( | |
| SELECT record_id AS item_id, MAX(text) AS item_title | |
| FROM element_texts | |
| WHERE record_type = 'Item' AND element_id = :dc_title_item | |
| GROUP BY record_id | |
| ) it ON it.item_id = i.id | |
| LEFT JOIN ( | |
| SELECT record_id AS item_id, MAX(text) AS source_value | |
| FROM element_texts | |
| WHERE record_type = 'Item' AND element_id = :dc_source | |
| GROUP BY record_id | |
| ) isrc ON isrc.item_id = i.id | |
| LEFT JOIN ( | |
| SELECT record_id AS file_id, MAX(text) AS file_title | |
| FROM element_texts | |
| WHERE record_type = 'File' AND element_id = :dc_title_file | |
| GROUP BY record_id | |
| ) ft ON ft.file_id = f.id | |
| LEFT JOIN ( | |
| SELECT et1.record_id AS file_id, et1.text AS transcription | |
| FROM element_texts et1 | |
| JOIN ( | |
| SELECT record_id, MAX(id) AS max_id | |
| FROM element_texts | |
| WHERE record_type = 'File' AND element_id = :scriptus_trans | |
| GROUP BY record_id | |
| ) latest ON latest.max_id = et1.id | |
| ) tr ON tr.file_id = f.id | |
| WHERE i.collection_id = :collection_id | |
| " . ($onlyWithTranscript ? " AND tr.transcription IS NOT NULL\n" : "") . " | |
| ORDER BY i.id, f.id | |
| "; | |
| $filenameWarn = (!$dcTitleId || !$dcSourceId || !$scriptusTransId) ? '_WARN_ELEM_IDS' : ''; | |
| $filename = "vanallen_collection{$collectionId}{$filenameWarn}_" . date('Ymd_His') . ".csv"; | |
| header('Content-Type: text/csv; charset=utf-8'); | |
| header('Content-Disposition: attachment; filename="'.$filename.'"'); | |
| header('X-Accel-Buffering: no'); | |
| $out = fopen('php://output', 'w'); | |
| fputcsv($out, ['item_id','main_object_title','file_id','file_name','part_title','digital_collection_link','transcript']); | |
| try { | |
| $stmt = $pdo->prepare($sql); | |
| $stmt->execute([ | |
| ':dc_title_item' => $dcTitleIdSafe, | |
| ':dc_source' => $dcSourceIdSafe, | |
| ':dc_title_file' => $dcTitleIdSafe, | |
| ':scriptus_trans' => $scriptusTransIdSafe, | |
| ':collection_id' => $collectionId | |
| ]); | |
| $rows = 0; | |
| while ($row = $stmt->fetch()) { | |
| if (isset($row['transcript']) && $row['transcript'] !== null) { | |
| $row['transcript'] = str_replace(["\r\n", "\r"], "\n", $row['transcript']); | |
| } | |
| fputcsv($out, [ | |
| $row['item_id'], | |
| $row['main_object_title'], | |
| $row['file_id'], | |
| $row['file_name'], | |
| $row['part_title'], | |
| $row['digital_collection_link'], | |
| $row['transcript'] | |
| ]); | |
| if ((++$rows % 500) === 0) fflush($out); | |
| } | |
| } catch (Throwable $e) { | |
| fwrite($out, "\n# ERROR: " . $e->getMessage() . "\n"); | |
| } | |
| fclose($out); | |
| exit; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment