Skip to content

Instantly share code, notes, and snippets.

@inilim
Created November 16, 2025 21:35
Show Gist options
  • Select an option

  • Save inilim/38c22c06660ee19e423cea86067e36b9 to your computer and use it in GitHub Desktop.

Select an option

Save inilim/38c22c06660ee19e423cea86067e36b9 to your computer and use it in GitHub Desktop.
4.2# Парсинг LiveLib. Парсинг данных из файлов и занесение в БД. (PHP, Crawler, SQLite)
<?php
// Script bootstrap: runtime limits, project helpers, autoloader and DB location.
// Report every error level so parsing problems are visible during the run.
error_reporting(E_ALL);
// 0 removes the execution time limit.
set_time_limit(0);
// Etc/* zones use the POSIX sign convention, so 'Etc/GMT-3' means UTC+3.
date_default_timezone_set('Etc/GMT-3');
// Project helper functions (presumably fgc, sha_, am, dd, ... used further down — confirm).
require_once __DIR__ . '/functions.php';
# Helper functions for working with PDO SQLite
require_once __DIR__ . '/_INIL_connectLite.php';
// Presumably the Composer autoloader that provides the Crawler class imported below.
require_once __DIR__ . '/vendor/autoload.php';
// Raise the memory ceiling to 5024M; set after the includes, so this value is the one in effect.
ini_set('memory_limit', '5024M');
// Defined outside this file section; presumably starts a run timer — confirm.
timeRun();
use Symfony\Component\DomCrawler\Crawler;
// Database file used by the L_* helpers (relative path, resolved against the working directory).
L_INIL_DB::$pathToFileDB = 'BASE_livelib.db';
// Parsed book records keyed by the hash of the page URL.
$arr = [];
// Main pass: walk the `urls` table in batches, parse each saved page and
// insert the extracted book fields into `books`.
//
// NOTE(review): L_SqlStart, L_execCommitPack, createFolderTree, fgc, sha_,
// deCompress, isJson, jsonDecode, am, onlyInt, stlen, dd and dde are not
// defined in this section; what the comments below say about them is inferred
// from how they are called here and should be confirmed against their
// definitions.

$batchSize = 100;
$maxOffset = 50000000;

// The publisher field is a comma-separated list, but this publisher's name
// itself contains a comma; it is swapped for a token before splitting and
// restored afterwards.
$mifName = 'Манн, Иванов и Фербер';
$mifToken = '#09123986534';

$insertSql = 'INSERT INTO books (
title,
author,
[desc],
isbn,
lang,
count_pages,
y,
poster,
idUrl
)
VALUES (
:title,
:author,
:desc,
:isbn,
:lang,
:count_pages,
:y,
:poster,
:idUrl
);';

// A plain counter replaces range(), which materialised ~500k integers up front.
for ($step = 0; $step <= $maxOffset; $step += $batchSize) {
    // ORDER BY makes LIMIT/OFFSET paging deterministic; without it SQLite does
    // not guarantee row order, so pages could skip or repeat rows.
    $urls = L_SqlStart(
        'SELECT * FROM urls WHERE status = 1 ORDER BY id LIMIT ' . $batchSize . ' OFFSET ' . $step,
        [],
        2
    );
    if (count($urls) === 0) {
        break;
    }

    // Start every batch with an empty accumulator: the insert below runs once
    // per batch, so keeping earlier rows would hand them to it again.
    $arr = [];

    foreach ($urls as $url) {
        //echo $url['id'] . PHP_EOL;
        $path = createFolderTree($url['url'], 'livelib_pages', 2);
        // The URL hash is both the stored file name and the key of the record.
        $k = sha_($url['url']);
        $html = deCompress(fgc($path . $k . '.html'));

        $crw = new Crawler($html);

        $ldNodes = $crw->filter('script[type="application/ld+json"]');
        if ($ldNodes->count() === 0) {
            // No JSON-LD on the page: dump the row for inspection and skip it.
            dd($url);
            continue;
        }

        $json_ld = $ldNodes->text();
        if (!isJson($json_ld)) {
            // Second attempt: replace backslashes, which can make the payload invalid JSON.
            $json_ld = str_replace('\\', '|', $json_ld);
            if (!isJson($json_ld)) {
                dd($url);
                dde($json_ld);
                // dde() presumably terminates the script; if it returns, skip
                // the page instead of indexing into an undecodable payload.
                continue;
            }
        }
        $arr_ld = jsonDecode($json_ld);

        // Genres from JSON-LD: comma-separated, HTML-entity encoded.
        $genres = isset($arr_ld['genre'])
            ? am(explode(',', html_entity_decode($arr_ld['genre'])), fn($a) => trim($a))
            : [];

        // The literal string 'NULL' is the placeholder for a missing or blank author.
        $author = trim(html_entity_decode($arr_ld['author']['name'] ?? 'NULL'));
        if ($author === '') {
            $author = 'NULL';
        }

        // Publisher(s)
        $publisher = trim(html_entity_decode($arr_ld['publisher']['name'] ?? ''));
        $publisher = explode(',', str_replace($mifName, $mifToken, $publisher));
        $publisher = am($publisher, fn($a) => str_replace($mifToken, $mifName, trim($a)));

        $lang = '';
        $countPages = '';
        $year = '';

        $info_blick = $crw->filter('div.bc-info');
        if ($info_blick->count() > 0) {
            $wrappers = $info_blick->filter('div.bc-info__wrapper');

            // Genres and tags: link texts of the third wrapper, minus the
            // "Все теги" ("All tags") link.
            $tags = $wrappers->eq(2)->filter('a')->each(fn(Crawler $node) => trim($node->text()));
            $tags = array_filter($tags, fn($a) => !str_contains($a, 'Все теги'));
            // array_values() turns the de-duplicated result back into a gap-free list.
            $genres = array_values(array_unique(array_merge($genres, $tags)));

            // Additional edition details: one <p> per fact in the second wrapper.
            $lines = $wrappers->eq(1)->filter('p')->each(fn(Crawler $node) => trim($node->text()));
            foreach ($lines as $text) {
                // Publication year
                if (str_contains($text, 'Год издания:')) {
                    $year = onlyInt($text);
                }

                // Language
                if (str_contains($text, 'Язык:')) {
                    $text = str_replace('Язык:', '', $text);
                    $lang = trim($text);
                    // Anything longer than 15 is treated as not being a language name.
                    // NOTE(review): `stlen` is not a PHP built-in; presumably a
                    // length helper from functions.php — confirm it counts
                    // characters rather than bytes (Cyrillic is multi-byte in UTF-8).
                    if (stlen($lang) > 15) {
                        $lang = '';
                    }
                }

                // Page count: the first label found in the line decides which
                // pattern(s) are tried.
                if (str_contains($text, 'Количество страниц:')) {
                    $patterns = ['#Количество\ страниц\:\ ([0-9]{1,4})#u'];
                } elseif (str_contains($text, 'стр.')) {
                    // With and without a space before the unit; when both
                    // match, the later pattern wins.
                    $patterns = ['#([0-9]{1,4})\ стр#u', '#([0-9]{1,4})стр#u'];
                } elseif (str_contains($text, 'pages')) {
                    $patterns = ['#([0-9]{1,4})\ pages#'];
                } elseif (str_contains($text, 'Страниц:')) {
                    $patterns = ['#Страниц\:\ ([0-9]{1,4})#u'];
                } else {
                    $patterns = [];
                }
                foreach ($patterns as $pattern) {
                    if (preg_match($pattern, $text, $m) === 1) {
                        $countPages = $m[1];
                    }
                }
            }
        }

        // `??` keeps pages without image/name/description from raising warnings
        // under E_ALL and from passing null to html_entity_decode().
        $arr[$k] = [
            'poster' => $arr_ld['image'] ?? null,
            'idUrl' => $url['id'],
            'title' => html_entity_decode($arr_ld['name'] ?? ''),
            'desc' => html_entity_decode($arr_ld['description'] ?? ''),
            'isbn' => $arr_ld['isbn'] ?? '',
            'genre' => $genres,
            'author' => $author,
            'publisher' => $publisher,
            'lang' => $lang,
            'count_pages' => $countPages,
            'y' => $year,
        ];
    }

    // Nothing to insert when every page of the batch was skipped.
    if ($arr !== []) {
        // NOTE(review): the third argument is presumably the number of rows per
        // commit; 'genre' and 'publisher' are not referenced by the statement.
        L_execCommitPack($insertSql, $arr, 500);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment