Created
November 16, 2025 21:35
-
-
Save inilim/38c22c06660ee19e423cea86067e36b9 to your computer and use it in GitHub Desktop.
4.2# Парсинг LiveLib. Парсинг данных из файлов и занесение в БД. (PHP, Crawler, SQLite)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/**
 * LiveLib parsing, step 4.2: read previously saved book pages from disk,
 * extract book metadata and insert it into the SQLite `books` table.
 *
 * Flow, per batch of 100 rows from `urls` (status = 1):
 *   1. load the stored page for each URL and decompress it;
 *   2. read the schema.org JSON-LD block (title, description, ISBN, ...);
 *   3. enrich the record from the `div.bc-info` block (tags, year, language, pages);
 *   4. hand the collected rows to L_execCommitPack() for insertion.
 *
 * All helpers (L_SqlStart, L_execCommitPack, createFolderTree, fgc, sha_,
 * deCompress, isJson, jsonDecode, am, onlyInt, stlen, dd, dde, timeRun) are
 * defined outside this file; notes about their behaviour below are assumptions.
 */
error_reporting(E_ALL);
set_time_limit(0);
// In the tz database the Etc/GMT zones use inverted signs: Etc/GMT-3 is UTC+3.
date_default_timezone_set('Etc/GMT-3');

require_once __DIR__ . '/functions.php';
# Helper functions for working with PDO SQLite
require_once __DIR__ . '/_INIL_connectLite.php';
require_once __DIR__ . '/vendor/autoload.php';

ini_set('memory_limit', '5024M');
timeRun();

use Symfony\Component\DomCrawler\Crawler;

L_INIL_DB::$pathToFileDB = 'BASE_livelib.db';

// The publisher field is a comma-separated list, but this publisher's own name
// contains a comma. It is swapped for a token before splitting and restored after.
$mifName  = 'Манн, Иванов и Фербер';
$mifToken = '#09123986534';

// NOTE(review): every row passed to L_execCommitPack() also carries the 'genre'
// and 'publisher' arrays, which have no placeholder in this statement — confirm
// the helper ignores keys that are not bound.
$insertSql = <<<'SQL'
    INSERT INTO books (
    title,
    author,
    [desc],
    isbn,
    lang,
    count_pages,
    y,
    poster,
    idUrl
    )
    VALUES (
    :title,
    :author,
    :desc,
    :isbn,
    :lang,
    :count_pages,
    :y,
    :poster,
    :idUrl
    );
    SQL;

$arr = [];

// Page through the urls table 100 rows at a time. A plain counter is used instead
// of range(), which would materialise ~500k integers up front.
for($step = 0; $step <= 50000000; $step += 100)
{
    $urls = L_SqlStart('SELECT * FROM urls WHERE status = 1 LIMIT 100 OFFSET ' . $step, [], 2);
    if(count($urls) === 0)
    {
        break;
    }

    // Start every batch with an empty buffer: the insert below runs once per batch,
    // so keeping rows from earlier batches would submit them again each time.
    $arr = [];

    foreach($urls as $url)
    {
        $path = createFolderTree($url['url'], 'livelib_pages', 2);
        // The URL hash is both the stored file name and the key of the record.
        $k = sha_($url['url']);
        $html = deCompress(fgc($path . $k . '.html'));
        $crw = new Crawler($html);

        $ldNode = $crw->filter('script[type="application/ld+json"]');
        if($ldNode->count() === 0)
        {
            // No JSON-LD on the page: dump the row for inspection and skip it.
            dd($url);
            continue;
        }

        $json_ld = $ldNode->text();
        if(!isJson($json_ld))
        {
            // Retry with backslashes replaced; presumably stray backslashes form
            // invalid escape sequences in some pages — TODO confirm.
            $json_ld = str_replace('\\', '|', $json_ld);
            if(!isJson($json_ld))
            {
                dd($url);
                // NOTE(review): dde() presumably dumps and terminates the script — confirm.
                dde($json_ld);
            }
        }
        $arr_ld = jsonDecode($json_ld);

        // Missing JSON-LD keys fall back to defaults instead of raising
        // "undefined array key" warnings; the stored values stay the same.
        $book = [
            'poster' => $arr_ld['image'] ?? null,
            'idUrl'  => $url['id'],
            'title'  => html_entity_decode($arr_ld['name'] ?? ''),
            'desc'   => html_entity_decode($arr_ld['description'] ?? ''),
            'isbn'   => $arr_ld['isbn'] ?? '',
        ];

        $book['genre'] = isset($arr_ld['genre'])
            ? am(explode(',', html_entity_decode($arr_ld['genre'])), fn($a) => trim($a))
            : [];

        // The literal string 'NULL' marks a missing or empty author.
        $book['author'] = trim(html_entity_decode($arr_ld['author']['name'] ?? 'NULL'));
        if($book['author'] === '')
        {
            $book['author'] = 'NULL';
        }

        // Publisher
        $publisher = trim(html_entity_decode($arr_ld['publisher']['name'] ?? ''));
        $publisher = str_replace($mifName, $mifToken, $publisher);
        $book['publisher'] = am(
            explode(',', $publisher),
            fn($a) => str_replace($mifToken, $mifName, trim($a))
        );

        $book['lang'] = '';
        $book['count_pages'] = '';
        $book['y'] = '';

        $info_blick = $crw->filter('div.bc-info');
        if($info_blick->count() > 0)
        {
            $wrappers = $info_blick->filter('div.bc-info__wrapper');

            // Genres and tags: link texts of the third wrapper, minus the "all tags" link.
            $tags = $wrappers->eq(2)->filter('a')->each(fn(Crawler $node) => $node->text());
            $tags = array_filter($tags, fn($a) => !str_contains($a, 'Все теги'));
            $tags = am($tags, fn($a) => trim($a));
            $book['genre'] = array_unique(array_merge($book['genre'], $tags));

            // Additional information about the edition: paragraphs of the second wrapper.
            $details = $wrappers->eq(1)->filter('p')->each(fn(Crawler $node) => trim($node->text()));
            foreach($details as $text)
            {
                if(str_contains($text, 'Год издания:'))
                {
                    // presumably onlyInt() keeps only the digits of the line — verify.
                    $book['y'] = onlyInt($text);
                }

                if(str_contains($text, 'Язык:'))
                {
                    $text = str_replace('Язык:', '', $text);
                    $book['lang'] = trim($text);
                    // NOTE(review): stlen() is not a PHP built-in; presumably a length
                    // helper from functions.php (or a typo for strlen/mb_strlen) — confirm.
                    // Values longer than 15 are discarded.
                    if(stlen($book['lang']) > 15)
                    {
                        $book['lang'] = '';
                    }
                }

                // Pages: the count appears in several formats depending on the page.
                if(str_contains($text, 'Количество страниц:'))
                {
                    preg_match('#Количество\ страниц\:\ ([0-9]{1,4})#u', $text, $m);
                    if(isset($m[1]))
                    {
                        $book['count_pages'] = $m[1];
                    }
                }
                elseif(str_contains($text, 'стр.'))
                {
                    // Try "123 стр" first, then "123стр"; a match of the second form wins.
                    preg_match('#([0-9]{1,4})\ стр#u', $text, $m);
                    if(isset($m[1]))
                    {
                        $book['count_pages'] = $m[1];
                    }
                    preg_match('#([0-9]{1,4})стр#u', $text, $m);
                    if(isset($m[1]))
                    {
                        $book['count_pages'] = $m[1];
                    }
                }
                elseif(str_contains($text, 'pages'))
                {
                    preg_match('#([0-9]{1,4})\ pages#', $text, $m);
                    if(isset($m[1]))
                    {
                        $book['count_pages'] = $m[1];
                    }
                }
                elseif(str_contains($text, 'Страниц:'))
                {
                    preg_match('#Страниц\:\ ([0-9]{1,4})#u', $text, $m);
                    if(isset($m[1]))
                    {
                        $book['count_pages'] = $m[1];
                    }
                }
            }
        }

        $arr[$k] = $book;
    }

    // Insert the rows collected for this batch (third argument kept from the original: 500).
    L_execCommitPack($insertSql, $arr, 500);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment