Created
July 27, 2023 23:34
-
-
Save happyman/c75320c24a19b5029b05c51316bd6aa6 to your computer and use it in GitHub Desktop.
Xuite blog 備份檔轉換成 hugo md 程式
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| <?php | |
| require('vendor/autoload.php'); | |
| use League\HTMLToMarkdown\HtmlConverter; | |
| /* 將 xuite 的 mt text 轉換為 hugo 的 md 檔 | |
| requirement: https://github.com/thephpleague/html-to-markdown | |
| 注意: 檔名 blog_YYYYmmdd.md based on date, 也就是 mt 檔案中的 DATE 欄位. 若一天有兩篇則要另外改寫此程式處理. | |
| md 會放在 posts/ | |
| images 會放在 assets/ | |
| 將 posts/*.md 複製到 logseq/blog/pages | |
| 將 assets 放到 website 上, 必修改下列 img_url | |
| */ | |
// Exported backup file to read.
$src = '../xuite_blog_export_6508.txt';
// Prefix for the generated posts/ directory, e.g. the Hugo content directory.
// Use "" to have posts/ created in the current directory instead.
$fpath = '../myblog/content/';
// Where the images will be hosted. The two placeholders are filled with the
// md5 of the original image link and the file extension. After the conversion,
// move the local assets/ directory to this location.
$img_url = 'https://blog.happyman.idv.tw/assets/img_%s.%s';
// Address of your own blog; sent as the Referer header when images are downloaded.
$referrer = 'https://blog.xuite.net/happyman/blog1';
// Start converting: read the export file into $blog, one element per article.
//
// Separator lines drive a small state machine held in $body:
//   "--------"  advance to the next article ($cur++) and reset $body to 1
//   "----"      reset $body to 1 (article body text)
//   "-----"     increment $body; every value >= 2 denotes one comment of the
//               current article, stored at index $body - 2
// A "KEY:value" line is an article field while $body < 2. Inside a comment it
// is a comment field only for EMAIL, IP, or when the value starts with a
// space; every other line is appended to that comment's BODY text.
// NOTE(review): while $body is 1, a body line that itself looks like
// "word:text" (no whitespace before the colon) is stored as a field rather
// than as body text -- confirm against real export data.
$fp = fopen($src, 'r');
if ($fp === false) {
    fwrite(STDERR, "cannot open $src\n");
    exit(1);
}
$blog = [];
$body = 0;
$cur = 0;
// fgets() returns false at end of file; testing it directly avoids passing
// false to trim() on the final iteration.
while (($line = fgets($fp)) !== false) {
    $line = trim($line);
    if ($line === '--------') {
        $cur++;
        $body = 1;
        continue;
    }
    if ($line === '----') {
        $body = 1;
        continue;
    }
    if ($line === '-----') {
        $body++;
        continue;
    }
    $isField = preg_match('/^(\S+):(.*)$/', $line, $mat) === 1;
    // The "BODY:" / "COMMENT:" marker lines carry no data of their own.
    if ($isField && ($mat[1] === 'BODY' || $mat[1] === 'COMMENT')) {
        continue;
    }
    if ($body >= 2) {
        // Always derive the comment index from the state, so free-text lines
        // can never land in an undefined or stale slot.
        $cur1 = $body - 2;
        $isCommentField = $isField
            && ($mat[1] === 'EMAIL' || $mat[1] === 'IP' || (!empty($mat[2]) && $mat[2][0] === ' '));
        if ($isCommentField) {
            $blog[$cur]['COMMENT'][$cur1][$mat[1]] = trim($mat[2]);
        } else {
            $blog[$cur]['COMMENT'][$cur1]['BODY'] = ($blog[$cur]['COMMENT'][$cur1]['BODY'] ?? '') . $line . "\n";
        }
    } elseif ($isField) {
        $blog[$cur][$mat[1]] = trim($mat[2]);
    } elseif ($body === 1) {
        $blog[$cur]['BODY'] = ($blog[$cur]['BODY'] ?? '') . $line . "\n";
    }
}
fclose($fp);
// Create the output directories. mkdir() warns when the directory already
// exists, so only create what is missing; this keeps re-runs quiet.
foreach ([$fpath . 'posts/', 'assets/'] as $dir) {
    if (!is_dir($dir) && !mkdir($dir, 0755, true)) {
        fwrite(STDERR, "cannot create directory $dir\n");
        exit(1);
    }
}
// Options must be key => value pairs; 'use_autolinks' was previously passed
// as two positional elements and therefore never took effect.
$converter = new HtmlConverter(['strip_tags' => true, 'use_autolinks' => false]);
// Comment fields are listed under their own name, except MD2 which is shown as BODY.
$ckeys = ['MD2' => 'BODY'];
// File names written during this run, used to report same-day collisions.
$written = [];
$i = 0;
foreach ($blog as $article) {
    printf("\n--------\n%d / %d\n-----------\n", ++$i, count($blog));
    if (!isset($article['BODY']) || !isset($article['DATE'])) {
        continue;
    }
    $ts = strtotime($article['DATE']);
    if ($ts === false) {
        fwrite(STDERR, sprintf("skip article, cannot parse DATE: %s\n", $article['DATE']));
        continue;
    }
    // Drop HTML comments, then convert the HTML body to Markdown.
    $article['MD'] = $converter->convert(preg_replace('/<!--.*?-->/s', '', $article['BODY']));
    // Download the Xuite images and point the Markdown at the hosted copies.
    $article['MD2'] = process_xuite_links($article['MD']);
    // Comments are part of the post's history, so they are appended to the
    // post itself as a nested list. Comments without a body are skipped.
    $comment = '';
    $count = 0;
    foreach ($article['COMMENT'] ?? [] as $cmt) {
        if (empty($cmt['BODY'])) {
            continue;
        }
        $cmt['MD'] = $converter->convert($cmt['BODY']);
        $cmt['MD2'] = process_xuite_links($cmt['MD']);
        print_r($cmt);
        $comment .= sprintf("- %d\n", ++$count);
        foreach ($cmt as $key => $val) {
            // BODY and MD are intermediate forms; only MD2 is printed (as BODY).
            if ($key === 'BODY' || $key === 'MD' || empty($val)) {
                continue;
            }
            $comment .= sprintf("\t- %s: %s\n", $ckeys[$key] ?? $key, $val);
        }
    }
    if ($comment !== '') {
        $comment = "\n##### 留言:\n" . $comment;
    }

    // Write the post. The file name is derived from the article date only, so
    // a second article on the same day replaces the first; report that loudly.
    $fname = $fpath . 'posts/blog_' . date('Ymd', $ts) . '.md';
    if (isset($written[$fname])) {
        fwrite(STDERR, sprintf("warning: %s is overwritten by another article of the same day\n", $fname));
    }
    $written[$fname] = true;
    $fq = fopen($fname, 'w');
    if ($fq === false) {
        fwrite(STDERR, "cannot write $fname\n");
        continue;
    }
    // Hugo front matter. Inside YAML double quotes only the backslash and the
    // double quote need escaping (addslashes() would also emit \' which YAML
    // rejects). date() keeps the wall-clock time of DATE under any default
    // timezone, matching the fixed +08:00 suffix.
    fwrite($fq, sprintf(
        "---\ntitle: \"%s\"\ndate: %s\ncategories: [\"%s\"]\ndraft: %s\ntags: %s\n---\n",
        addcslashes($article['TITLE'] ?? '', '"\\'),
        date('Y-m-d\TH:i:s', $ts) . '+08:00',
        addcslashes($article['CATEGORY'] ?? '', '"\\'),
        (($article['STATUS'] ?? '') === 'publish') ? 'false' : 'true',
        // Posts that carry comments are tagged; others get an empty tag list.
        $comment !== '' ? '["留言"]' : '[]'
    ));
    fwrite($fq, $article['MD2']);
    if ($comment !== '') {
        fwrite($fq, $comment);
    }
    fclose($fq);
    // Give the file the article's date as its modification time.
    touch($fname, $ts);
    print_r($article);
}
/**
 * Unwrap images that are wrapped in a Xuite redirect link.
 *
 * Every occurrence of "[![alt](image)](...xuite.../redir)" is replaced by the
 * bare image "![alt](image)"; all other text is returned untouched. The pairs
 * being replaced are dumped with print_r() for debugging.
 *
 * @param string $mdstr Markdown text
 * @return string Markdown text with the redirect links removed
 */
function remove_photo_link($mdstr)
{
    $pattern = '/\[!\[(.*?)\]\((.*?)\)\]\((.*?xuite.*?\/redir)\)/';
    if (!preg_match_all($pattern, $mdstr, $mat)) {
        return $mdstr;
    }
    $match = [];
    $replace = [];
    foreach ($mat[0] as $i => $whole) {
        $match[] = $whole;
        // Keep the image itself (alt text and URL); only the outer link goes.
        $replace[] = sprintf('![%s](%s)', $mat[1][$i], $mat[2][$i]);
    }
    print_r([$match, $replace]);
    return str_replace($match, $replace, $mdstr);
}
/**
 * Download the Xuite-hosted images referenced in a Markdown string (before it
 * is too late) and rewrite the links to the location given by $img_url.
 *
 * Only link targets of the form "(http....jpg|jpeg|png|gif)" on blog.xuite, or
 * on photo.xuite with at least one digit in the URL, are handled; links
 * containing "_r9009" are skipped. Each image is stored as
 * assets/img_<md5 of link>.<ext> unless a non-empty copy already exists. A
 * link is rewritten only when its image is available locally; after a few
 * failed download attempts the link is left as it was. Finally the result is
 * passed through remove_photo_link().
 *
 * Reads the globals $referrer (sent as Referer header) and $img_url (sprintf
 * pattern taking the md5 and the extension). Expects assets/ to exist.
 *
 * @param string $mdstr Markdown text
 * @return string Markdown text with rewritten image links
 */
function process_xuite_links($mdstr)
{
    global $referrer, $img_url;

    // Parenthesised http(s) URL ending in a real image extension. The
    // extension must be an alternation: a character class would accept any
    // URL that merely ends in one of those letters.
    $pattern = '/\((http[:\/a-z0-9._]*?\.(?:jpe?g|png|gif))\)/';
    $maxTries = 5;

    if (!preg_match_all($pattern, $mdstr, $mat)) {
        return remove_photo_link($mdstr);
    }

    $context = stream_context_create([
        'http' => [
            'method' => 'GET',
            'timeout' => 1,
            'header' => "Accept-language: en\r\n" .
                "Referer: $referrer\r\n" .
                // Pretend to be an iPad browser.
                "User-Agent: Mozilla/5.0 (iPad; U; CPU OS 3_2 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 Mobile/7B334b Safari/531.21.102011-10-16 20:23:10\r\n",
        ],
    ]);

    // original link => hosted link
    $map = [];
    foreach (array_unique($mat[1]) as $link) {
        $onBlog = strpos($link, 'blog.xuite') !== false;
        $onPhoto = strpos($link, 'photo.xuite') !== false && preg_match('~[0-9]+~', $link);
        if ((!$onBlog && !$onPhoto) || strpos($link, '_r9009') !== false) {
            continue;
        }
        $ext = substr($link, strrpos($link, '.') + 1);
        $hash = md5($link);
        $outfile = sprintf('assets/img_%s.%s', $hash, $ext);
        if (!file_exists($outfile) || filesize($outfile) == 0) {
            printf("download %s => %s\n", $link, $outfile);
            $url = str_replace('http://', 'https://', $link);
            $data = false;
            // Bounded retries: an image that is gone for good must not hang
            // the whole conversion.
            for ($try = 1; $try <= $maxTries; $try++) {
                $data = file_get_contents($url, false, $context);
                if ($data !== false) {
                    break;
                }
                if ($try < $maxTries) {
                    echo "retry...\n";
                    // sleep() only takes whole seconds; 0.5 would become 0.
                    usleep(500000);
                }
            }
            if ($data === false || file_put_contents($outfile, $data) === false) {
                printf("giving up on %s, link left unchanged\n", $link);
                continue;
            }
        }
        $map[$link] = sprintf($img_url, $hash, $ext);
    }
    // debug
    print_r($mat);
    if (!empty($map)) {
        // Literal, single-pass replacement; the links are not regexes.
        $mdstr = strtr($mdstr, $map);
    }
    return remove_photo_link($mdstr);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment