Skip to content

Instantly share code, notes, and snippets.

@happyman
Created July 27, 2023 23:34
Show Gist options
  • Select an option

  • Save happyman/c75320c24a19b5029b05c51316bd6aa6 to your computer and use it in GitHub Desktop.

Select an option

Save happyman/c75320c24a19b5029b05c51316bd6aa6 to your computer and use it in GitHub Desktop.
Xuite blog 備份檔轉換成 hugo md 程式
<?php
require('vendor/autoload.php');
use League\HTMLToMarkdown\HtmlConverter;
/* Convert Xuite's "mt text" blog export into Hugo markdown (md) files.
   (NOTE(review): "mt" presumably refers to the Movable Type export format - confirm.)
requirement: https://github.com/thephpleague/html-to-markdown
Note: the output file name is blog_YYYYmmdd.md, based on the date, i.e. the DATE field
in the mt file. If two posts share the same day, this program has to be adapted to handle that.
md files are written to posts/
images are written to assets/
Copy posts/*.md to logseq/blog/pages
Put assets on your website; you must change $img_url below accordingly.
*/
// Where you intend to host the images. Used as a sprintf() template: the first %s
// receives the md5 of the original image link, the second %s the file extension.
$img_url='https://blog.happyman.idv.tw/assets/img_%s.%s';
// After the conversion, move the assets directory over there.
// Change this to your own blog URL; it is sent as the Referer header when downloading images.
$referrer="https://blog.xuite.net/happyman/blog1";
// Source file (the exported blog text).
$src = "../xuite_blog_export_6508.txt";
// Prefix for the posts/ output directory, for writing articles straight into the hugo tree;
// otherwise set $fpath = ""
$fpath = "../myblog/content/";
// Start converting: parse the export file into $blog, one element per article.
// Each element maps header field names (TITLE, DATE, ...) to their values, plus
// 'BODY' (the article text) and 'COMMENT' (a list of comment arrays).
$fp = fopen($src, "r");
if ($fp === false) {
    // Without this check the read loop below would operate on an invalid handle.
    throw new RuntimeException("cannot open source file: $src");
}
$blog = [];
// Parser state: 0 = nothing collected yet, 1 = article fields/body,
// >= 2 = inside the comment with index ($body - 2).
$body = 0;
$cur = 0;
// Loop on the fgets() result rather than feof(): a feof() loop runs one extra
// iteration with $line === false, which appends a stray "\n" and may create a
// phantom trailing article.
while (($line = fgets($fp)) !== false) {
    $line = trim($line);
    if ($line == '--------') {
        // End of an article: move on to the next one and collect its text.
        $cur++;
        $body = 1;
        continue;
    }
    if ($line == "----") { // start of body
        $body = 1;
        continue;
    }
    if ($line == "-----") { // start of (the next) comment
        $body++;
        continue;
    }
    // Index of the comment being read. Deriving it from $body on every line
    // avoids using an unset or stale index when a comment opens with free text.
    $cur1 = $body - 2;
    $is_field = preg_match("/^(\S+):(.*)$/", $line, $mat);
    if ($is_field && ($mat[1] == 'BODY' || $mat[1] == 'COMMENT')) {
        // Section marker lines carry no content of their own.
        continue;
    }
    if ($body >= 2) {
        // Inside a comment, only treat "KEY: value" as a field when it is EMAIL/IP
        // (which may be empty) or when the value starts with a space; anything else
        // is comment text that merely contains a colon.
        if ($is_field && ($mat[1] == 'EMAIL' || $mat[1] == 'IP' || (!empty($mat[2]) && $mat[2][0] == ' '))) {
            $blog[$cur]['COMMENT'][$cur1][$mat[1]] = trim($mat[2]);
        } else {
            if (!isset($blog[$cur]['COMMENT'][$cur1]['BODY'])) {
                $blog[$cur]['COMMENT'][$cur1]['BODY'] = '';
            }
            $blog[$cur]['COMMENT'][$cur1]['BODY'] .= $line . "\n";
        }
    } elseif ($is_field) {
        // NOTE(review): this also captures article body lines that happen to look
        // like "token:rest" (no whitespace before the first colon); they end up as
        // fields instead of body text. Left as-is because the exact export layout
        // is not visible here - confirm against a real export.
        $blog[$cur][$mat[1]] = trim($mat[2]);
    } elseif ($body == 1) {
        if (!isset($blog[$cur]['BODY'])) {
            $blog[$cur]['BODY'] = '';
        }
        $blog[$cur]['BODY'] .= $line . "\n";
    }
}
fclose($fp);
// Make sure the output directories exist. Checking is_dir() first keeps re-runs
// from emitting mkdir() warnings for directories created by an earlier run.
foreach (array($fpath . "posts/", "assets/") as $dir) {
    if (!is_dir($dir) && !mkdir($dir, 0755, true) && !is_dir($dir)) {
        throw new RuntimeException("cannot create directory: $dir");
    }
}
// 'use_autolinks' must be given as key => value; as a bare list element the option is never applied.
$converter = new HtmlConverter(array('strip_tags' => true, 'use_autolinks' => false));
// Escape a value for a YAML double-quoted scalar: only backslash and double quote
// need escaping there (addslashes() would also emit \' which YAML rejects).
$yaml_escape = function ($value) {
    return addcslashes((string) $value, "\"\\");
};
$i = 0;
$total = count($blog);
foreach ($blog as $article) {
    printf("\n--------\n%d / %d\n-----------\n", ++$i, $total);
    if (!isset($article['BODY']) || !isset($article['DATE']))
        continue;
    $ts = strtotime($article['DATE']);
    if ($ts === false) {
        // The date drives both the file name and the front matter; without it the post cannot be placed.
        printf("skip article: cannot parse DATE '%s'\n", $article['DATE']);
        continue;
    }
    // Strip HTML comments. The /s modifier lets "." span newlines, which matches the
    // same text as "(.|\s)*?" without the per-character alternation.
    $html = preg_replace('/<!--.*?-->/s', '', $article['BODY']);
    if ($html === null) {
        // preg_replace() reports failure with null; fall back to the unstripped body.
        $html = $article['BODY'];
    }
    $article['MD'] = $converter->convert($html);
    // Rewrite image links so they point at the local assets.
    $article['MD2'] = process_xuite_links($article['MD']);
    // Comments are part of the post's history, so they are appended to the article text.
    $comment = '';
    if (!empty($article['COMMENT'])) {
        $count = 1;
        $comment = "\n##### 留言:\n";
        // Output label overrides: the converted markdown is shown under the name BODY.
        $ckeys = ["MD2" => "BODY"];
        foreach ($article['COMMENT'] as $cmt) {
            if (empty($cmt['BODY']))
                continue;
            $cmt['MD'] = $converter->convert($cmt['BODY']);
            $cmt['MD2'] = process_xuite_links($cmt['MD']);
            print_r($cmt);
            $comment .= sprintf("- %d\n", $count++);
            foreach ($cmt as $key => $val) {
                // The raw and intermediate texts are superseded by MD2.
                if ($key == 'BODY' || $key == 'MD')
                    continue;
                if (!empty($val))
                    $comment .= sprintf("\t- %s: %s\n", isset($ckeys[$key]) ? $ckeys[$key] : $key, $val);
            }
        }
    }
    // Write the post.
    $fname = $fpath . "posts/blog_" . date("Ymd", $ts) . ".md";
    $fq = fopen($fname, "w");
    if ($fq === false) {
        printf("skip article: cannot write %s\n", $fname);
        continue;
    }
    /*
    Front matter layout:
    ---
    title: "First_post"
    date: 2023-07-27T08:54:03+08:00
    categories: ["test"]
    draft: true
    tags: ["..."]
    ---
    */
    $tags = !empty($article['COMMENT']) ? "留言" : "";
    $title = isset($article['TITLE']) ? $article['TITLE'] : '';
    $category = isset($article['CATEGORY']) ? $article['CATEGORY'] : '';
    $published = isset($article['STATUS']) && $article['STATUS'] == 'publish';
    // date() formats in the same default timezone strtotime() parsed with, so the
    // wall-clock time of the DATE field is preserved whatever that timezone is;
    // gmdate() would shift it to UTC while still labelling it +08:00.
    $stamp = date('Y-m-d\TH:i:s', $ts) . "+08:00";
    fwrite($fq,
        sprintf("---\ntitle: \"%s\"\ndate: %s\ncategories: [\"%s\"]\ndraft: %s\ntags: [\"%s\"]\n---\n",
            $yaml_escape($title), $stamp, $yaml_escape($category),
            $published ? "false" : "true", $tags));
    fwrite($fq, $article['MD2']);
    if (!empty($article['COMMENT'])) {
        fwrite($fq, $comment);
    }
    fclose($fq);
    // Give the file the post's own date as modification time.
    touch($fname, $ts);
    print_r($article);
}
/**
 * Unwrap images that are wrapped in a link to a Xuite "redir" page.
 *
 * Turns `[![alt](image)](...xuite.../redir)` into `![alt](image)` and leaves
 * everything else untouched.
 *
 * @param string $mdstr markdown text
 * @return string markdown with the redirect wrappers removed
 */
function remove_photo_link($mdstr)
{
    // Each group is confined to its own bracket/parenthesis pair (and to one line),
    // so a match can no longer start at one linked image and end at a later Xuite
    // link on the same line, swallowing everything in between.
    $pattern = '/\[!\[([^\]\n]*)\]\(([^)\n]*)\)\]\(([^)\n]*xuite[^)\n]*\/redir)\)/';
    if (!preg_match_all($pattern, $mdstr, $mat, PREG_SET_ORDER)) {
        // No wrapped image (or a regex error): nothing to rewrite.
        return $mdstr;
    }
    $map = array();
    foreach ($mat as $m) {
        $map[$m[0]] = sprintf("![%s](%s)", $m[1], $m[2]);
    }
    // debug: what is being replaced by what
    print_r(array(array_keys($map), array_values($map)));
    return strtr($mdstr, $map);
}
# Download the images hosted on Xuite before it's too late
/**
 * Download Xuite-hosted images referenced in the markdown and rewrite their links.
 *
 * Every `(http...jpg|png|gif)` link target that points at blog.xuite or photo.xuite
 * is saved as assets/img_<md5 of link>.<ext> (unless that file already exists and is
 * non-empty) and replaced in the text by the URL built from the global $img_url
 * template. Links whose download fails are left unchanged. The result is finally
 * passed through remove_photo_link().
 *
 * Reads the globals $referrer (sent as Referer header) and $img_url (sprintf template
 * taking the md5 and the extension).
 *
 * @param string $mdstr markdown text
 * @return string markdown with image links rewritten
 */
function process_xuite_links($mdstr)
{
    // Where the images will be hosted and which Referer to send: set these at the top of the script.
    global $referrer, $img_url;
    // A parenthesised http(s) URL ending in an image extension. The extension is a real
    // alternation (a [jpg|png|gif] class only matches one character) and the dot is escaped.
    $pattern = '/\((https?:\/\/[^\s()]+\.(?:jpe?g|png|gif))\)/i';
    // How often a download is attempted before the link is given up on.
    $max_attempts = 5;
    $opts = array(
        'http' => array(
            'method' => "GET",
            'timeout' => 1,
            'header' => "Accept-language: en\r\n" .
                "Referer: $referrer\r\n" .
                // check function.stream-context-create on php.net
                "User-Agent: Mozilla/5.0 (iPad; U; CPU OS 3_2 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 Mobile/7B334b Safari/531.21.102011-10-16 20:23:10\r\n" // i.e. An iPad
        )
    );
    $context = stream_context_create($opts);
    // original link => hosted URL
    $map = array();
    if (preg_match_all($pattern, $mdstr, $mat)) {
        foreach (array_unique($mat[1]) as $link) {
            $on_blog = strpos($link, "blog.xuite") !== false;
            $on_photo = strpos($link, "photo.xuite") !== false && preg_match('~[0-9]+~', $link);
            if (!$on_blog && !$on_photo)
                continue;
            if (strpos($link, "_r9009") !== false)
                continue;
            $ext = substr($link, strrpos($link, '.') + 1);
            $outfile = sprintf('assets/img_%s.%s', md5($link), $ext);
            if (!file_exists($outfile) || filesize($outfile) == 0) {
                printf("download %s => %s\n", $link, $outfile);
                $url = str_replace("http://", "https://", $link);
                $data = false;
                // Bounded retries: an image that no longer exists must not hang the conversion.
                for ($attempt = 1; $attempt <= $max_attempts; $attempt++) {
                    $data = file_get_contents($url, false, $context);
                    if ($data !== false && $data !== '')
                        break;
                    $data = false;
                    if ($attempt < $max_attempts) {
                        echo "retry...\n";
                        // sleep() only takes whole seconds (0.5 truncates to 0); usleep() gives the half second.
                        usleep(500000);
                    }
                }
                if ($data === false) {
                    printf("giving up on %s, link left unchanged\n", $link);
                    continue;
                }
                if (file_put_contents($outfile, $data) === false) {
                    printf("cannot write %s, link left unchanged\n", $outfile);
                    continue;
                }
            }
            // Only rewrite links whose asset actually exists locally.
            $map[$link] = sprintf($img_url, md5($link), $ext);
        }
        // debug
        print_r($mat);
    }
    if (!empty($map)) {
        // Literal replacement: the links are plain text, not regular expressions.
        $mdstr = strtr($mdstr, $map);
    }
    return remove_photo_link($mdstr);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment