for binder
{
"cells": [
{
"cell_type": "markdown",
"id": "04f91d55-f2b4-4e7d-8ef7-b9e5a65cb48c",
"metadata": {},
"source": [
| "## install \n", | |
| "requests,beautifulsoup4,deep-translator" | |
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ed08d1f3-0f3d-406f-9c28-07a15c598ffd",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"!python -m pip install requests beautifulsoup4\n",
"#!pip install googletrans==4.0.0-rc1\n",
"!pip install deep-translator"
]
},
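{
"cell_type": "markdown",
"id": "3f2a9c10-11aa-4b2b-8c3c-0a1b2c3d4e5f",
"metadata": {},
"source": [
"Optional sanity check (added sketch, not part of the original notebook): confirm the install by asking deep-translator which language codes it supports. `get_supported_languages` is part of the deep-translator API; the printed sample is informational only."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c4e2a1b3-22bb-4c3c-9d4d-1b2c3d4e5f60",
"metadata": {},
"outputs": [],
"source": [
"from deep_translator import GoogleTranslator\n",
"\n",
"# Map of language name -> code; 'ja' (used below) should appear among the values.\n",
"langs = GoogleTranslator().get_supported_languages(as_dict=True)\n",
"print(len(langs), 'languages, e.g.', list(langs.items())[:3])"
]
},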
{
"cell_type": "code",
"execution_count": null,
"id": "79d676c5-90f3-42ed-b552-6f4f1b443bc4",
"metadata": {},
"outputs": [],
"source": [
"url = 'https://www.independent.co.uk/voices/julian-assange-wife-stella-moris-extradition-wikileaks-b2106602.html'\n",
"lang = 'ja'"
]
},
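{
"cell_type": "markdown",
"id": "d5f3b2c4-33cc-4d4d-9e5e-2c3d4e5f6071",
"metadata": {},
"source": [
"Optional smoke test (added sketch): translate one short, arbitrary phrase into the `lang` chosen above before scraping the whole article. This call hits Google Translate, so it assumes network access."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e6a4c3d5-44dd-4e5e-8f6f-3d4e5f607182",
"metadata": {},
"outputs": [],
"source": [
"from deep_translator import GoogleTranslator\n",
"\n",
"# One-off translation into the target language set in the previous cell.\n",
"print(GoogleTranslator(source='auto', target=lang).translate('The trial continues tomorrow.'))"
]
},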
{
"cell_type": "code",
"execution_count": null,
"id": "8188ecad-7c27-4346-a201-6bc875a8965f",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
| "import requests,re\n", | |
| "from bs4 import BeautifulSoup\n", | |
| "from urllib.parse import urlparse\n", | |
| "\n", | |
| "#googletrans case:\n", | |
| "#from googletrans import Translator\n", | |
| "#translator = Translator()\n", | |
| "\n", | |
| "from deep_translator import GoogleTranslator\n", | |
| "#translated = GoogleTranslator(source='auto', target='de').translate(\"keep it up, you are awesome\") # output -> Weiter so, du bist großartig\n", | |
| "\n", | |
| "\n", | |
| "#url = 'https://www.independent.co.uk/voices/julian-assange-wife-stella-moris-extradition-wikileaks-b2106602.html'\n", | |
| "#lang = 'ja'\n", | |
| "uAgent = {'User-Agent': \"Mozilla/5.0 (Linux; Android 9) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36\",'Referer': 'https://www.google.com/'}\n", | |
| "se = requests.Session()\n", | |
| "res = se.get(url,headers=uAgent)\n", | |
| "sch = urlparse(res.url).scheme\n", | |
| "base = urlparse(res.url).netloc\n", | |
| "\n", | |
| "#import os\n", | |
| "#result = os.popen(\"curl -s \" + url).read()\n", | |
| "#sch = urlparse(url).scheme\n", | |
| "#base = urlparse(url).netloc\n", | |
| "#soup = BeautifulSoup(result, \"html.parser\")\n", | |
| "soup = BeautifulSoup(res.text, \"html.parser\")\n", | |
| "res.close()\n", | |
| "del se\n", | |
| "\n", | |
| "ptag_list_0 = soup.find_all('p')\n", | |
| "h6tag_list_0 = soup.find_all('h6')\n", | |
| "title_list_0 = soup.find_all('title')\n", | |
| "print(title_list_0[0].text)\n", | |
| "title = re.sub(r'\\s','_',title_list_0[0].text)\n", | |
| "\n", | |
| "link = soup.find_all('link')\n", | |
| "if len(link) > 0:\n", | |
| " for index,v in enumerate(link):\n", | |
| " if not v.has_attr('rel'):\n", | |
| " continue\n", | |
| "# print(index,v['rel'])\n", | |
| " if v['rel'] == [\"stylesheet\"]:\n", | |
| " #css location\n", | |
| " #print(type(v))\n", | |
| " if not v.has_attr('href'):\n", | |
| " #if ('href') in v:\n", | |
| " continue\n", | |
| "# print(v['href'])\n", | |
| " if (bool(re.match(r'^http',v['href']))==False):\n", | |
| " if (bool(re.match(r'^\\/',v['href']))==True):\n", | |
| " link[index]['href'] = sch + \"://\" + base + v['href']\n", | |
| " else:\n", | |
| " link[index]['href'] = sch + \"://\" + base + '/' +v['href']\n", | |
| " print(link[index]['href'])\n", | |
| "\n", | |
| "image = soup.find_all('img')\n", | |
| "if len(image) > 0:\n", | |
| " for index,im in enumerate(image):\n", | |
| "# continue\n", | |
| "# print(index,im)\n", | |
| " #if im['alt'] == \"Bellingcat\" or im['alt'] == \"GIJNlogo\":\n", | |
| " if not im.has_attr('src'):\n", | |
| " continue\n", | |
| " if (bool(re.match(r'^http',im['src']))==False):\n", | |
| " print(im['src'])\n", | |
| " # image[index]['src'] = 'https://www.bellingcat.com' + im['src']\n", | |
| " if (bool(re.match(r'^\\/',im['src']))==True):\n", | |
| " image[index]['src'] = sch + '://' + base + im['src']\n", | |
| " print(index,image[index]['src'])\n", | |
| " else:\n", | |
| " image[index]['src'] = sch + '://' + base + '/' + im['src']\n", | |
| " print(index,image[index]['src'])\n", | |
| "\n", | |
| "import time\n", | |
| "counter = 0\n", | |
| "def trans(list0,translator,counter):\n", | |
| "#def trans(list0,lang):\n", | |
| " link_list = []\n", | |
| " link_words_list = []\n", | |
| "\n", | |
| " for index,lines in enumerate(list0):\n", | |
| " counter2 = counter\n", | |
| " print()\n", | |
| " print(index, lines)\n", | |
| "# xxxx = lines.text.strip()\n", | |
| "\n", | |
| " #(?<=\\<p\\>)(.+)(?=\\<\\/p)\n", | |
| " #(\\w|,|\\.|\\&|\\=|;|([ —-]))+(?!([^<]*>))\n", | |
| "\n", | |
| " soup2 = BeautifulSoup(str(lines), \"html.parser\")\n", | |
| " a_link = soup2.find_all('a')\n", | |
| " newtag = []\n", | |
| " if len(a_link) > 0:\n", | |
| " for i,v in enumerate(a_link):\n", | |
| " #link_words = re.findall(r'\\b(\\w+?(?!([^<]*>)))\\b',str(v))\n", | |
| " link_href = v.get('href')\n", | |
| " if (bool(re.search(r'^http',link_href))==False):\n", | |
| " if (bool(re.search(r'^\\/',link_href))==True):\n", | |
| " link_href = sch + '://' + base + link_href\n", | |
| " else:\n", | |
| " link_href = sch + '://' + base + '/' + link_href\n", | |
| "\n", | |
| " link_words = v.text\n", | |
| " print()\n", | |
| " print(\"words\",link_words)\n", | |
| " print(\"a link:\",link_href)\n", | |
| " link_list.append(link_href)\n", | |
| " link_words_list.append(link_words)\n", | |
| "\n", | |
| " if len(link_words) > 0:\n", | |
| " tag = soup.new_tag('a',href= link_href)\n", | |
| " if link_words != '':\n", | |
| " tag.string = link_words\n", | |
| " elif link_words == False:\n", | |
| " tag.string = str(link_href)\n", | |
| " else:\n", | |
| " tag.string = str(link_href)\n", | |
| " newtag.append(tag)\n", | |
| "\n", | |
| " print(newtag)\n", | |
| "\n", | |
| "\n", | |
| " xxxx1 = re.finditer(r'((\\.|\\d|\\w|&|\\=|[ \\(\\)\\-;:,%#+…|\"“’‘”\\'&\\?\\!\\.])*(?!([^<]*>)))',str(lines))\n", | |
| " xxxx2 = \"\"\n", | |
| " for word in xxxx1:\n", | |
| " xxxx2 += word[1] + ' '\n", | |
| " print()\n", | |
| " print(xxxx2)\n", | |
| "\n", | |
| "# mark_words = []\n", | |
| "# mark_words2 = []\n", | |
| "#\n", | |
| "# link_addr = re.findall(r'(?<=href\\=\\\").+?(?=\\\")',str(lines))\n", | |
| "# if len(link_addr) > 0:\n", | |
| "# atag = re.findall(r'(?<=\\<a).+?(?=\\<\\/a)',str(lines))\n", | |
| "# print(atag)\n", | |
| "# for a_text in atag:\n", | |
| "# mark_words += re.findall(r'\\b(\\w+?(?!([^<]*>)))\\b',a_text)\n", | |
| "# for v in mark_words:\n", | |
| "# strvv = ' '.join(v)\n", | |
| "# mark_words2.append(strvv.strip())\n", | |
| "# print(\"words\",mark_words2)\n", | |
| "# print('link:',link_addr)\n", | |
| "\n", | |
| " xxxx3 = re.sub(r\"\\s{3,}\",' ',xxxx2.strip())\n", | |
| " \n", | |
| " print()\n", | |
| " print(xxxx3)\n", | |
| "\n", | |
| "# if(re.match(r'\\w|\\“',xxxx) != None ):\n", | |
| " if(re.match(r'\\w|\\“',xxxx3) != None ):\n", | |
| " print()\n", | |
| "# print(xxxx3)\n", | |
| " #pattern match\n", | |
| "# texts = re.sub(r'\\.\\s+','. ',xxxx)\n", | |
| "# texts = re.sub(r'\\s{2}',' \\'',texts)\n", | |
| " texts = re.sub(r'\\s{2,}',' \\'',xxxx3)\n", | |
| " texts = re.sub(r'\\.\\s+','. ',texts)\n", | |
| " texts = re.sub(r'\\?\\s+','? ',texts)\n", | |
| " texts = re.sub(r'\\!\\s+','! ',texts)\n", | |
| " texts = re.sub(r'\\,\\s+',', ',texts)\n", | |
| " print()\n", | |
| "# print(index, xxxx)\n", | |
| " print(index, texts)\n", | |
| " if len(newtag) > 0:\n", | |
| " for link_v in newtag:\n", | |
| " print('newtag text:',link_v.text)\n", | |
| " print('newtag val:',link_v)\n", | |
| " counter += 1\n", | |
| " try:\n", | |
| " texts = re.sub(rf\"{link_v.text}\",f\"𓃵☽𓃡☽✸✦✦{link_v.text}𓃡✦✦✧{counter}✧✸\",texts)\n", | |
| "# texts = re.sub(rf\"{link_v.text}\",'<span class=\"e;notranslate\"e;>' + f\"𓃵☽𓃡☽✸✦✦{link_v.text}𓃡✦✦✧{counter}✧✸\"+'</span>',texts)\n", | |
| " print('texts :',texts)\n", | |
| " except:\n", | |
| " print('error')\n", | |
| " texts = link_v.text\n", | |
| "\n", | |
| " try:\n", | |
| " print()\n", | |
| " print('translated:')\n", | |
| "# translator = GoogleTranslator(source='auto', target=lang)\n", | |
| " translated = translator.translate(text=texts)\n", | |
| " print(index, translated)\n", | |
| " #googletrans case:\n", | |
| "# translated = translator.translate(str(texts), dest=lang)\n", | |
| "# print(index, translated.text)\n", | |
| " print('______________________________')\n", | |
| "# list0[index].string = translated.text\n", | |
| " list0[index].string = translated\n", | |
| " if len(newtag) > 0:\n", | |
| " for link in newtag:\n", | |
| " counter2 += 1\n", | |
| " div = soup.new_tag('div')\n", | |
| " div.string = '✦link✧✸' + str(counter2) + ':'\n", | |
| " div.append(link)\n", | |
| " list0[index].append(div)\n", | |
| "\n", | |
| " except:\n", | |
| "# time.sleep(5)\n", | |
| " print('translated: fail')\n", | |
| "\n", | |
| " return link_list,link_words_list,soup\n", | |
| "\n", | |
| "#deep-translator case:\n", | |
| "translator = GoogleTranslator(source='auto', target=lang)\n", | |
| "links1,word1,soup = trans(h6tag_list_0,translator,counter)\n", | |
| "links2,word2,soup = trans(ptag_list_0,translator,counter)\n", | |
| "del translator\n", | |
| "#trans(ptag_list_0,lang)\n", | |
| "#trans(h6tag_list_0,lang)\n", | |
| "\n", | |
| "links3 = []\n", | |
| "if links1 != None and links2 != None:\n", | |
| " links3 = links1 + links2\n", | |
| "elif links1 != None:\n", | |
| " links3 = links1\n", | |
| "else:\n", | |
| " pass\n", | |
| "\n", | |
| "word3 = []\n", | |
| "if word1 != None and word2 != None:\n", | |
| " word3 = word1 + word2\n", | |
| "elif word1 != None:\n", | |
| " word3 = word1\n", | |
| "else:\n", | |
| " pass\n", | |
| "\n", | |
| "metatag = soup.new_tag('meta')\n", | |
| "metatag.attrs['charset'] = \"utf-8\"\n", | |
| "soup.head.append(metatag)\n", | |
| "\n", | |
| "#import os\n", | |
| "#filename = os.path.basename(url)\n", | |
| "filename = title[0:6] + '.html'\n", | |
| "\n", | |
| "with open(filename, \"wb\") as f_output:\n", | |
| " f_output.write(soup.prettify(\"utf-8\"))\n", | |
| "\n", | |
| "# 𓃵𓃡☽✸✦✦ 𓃡✦✦✧ ✧✸\n", | |
| "\n", | |
| "file = open(filename, \"r\", encoding='utf-8')\n", | |
| "line_list = file.readlines()\n", | |
| "newtext = \"\"\n", | |
| "re_pattern = re.compile(r\"(𓃡☽✸✦{2}\\S+?𓃡✦{2}✧\\d+?✧✸)\")\n", | |
| "for linebyline in line_list:\n", | |
| " temp_1 = []\n", | |
| " a_link_num = re.findall(r'𓃡☽✸✦{2}\\S+?𓃡✦{2}✧(\\d+?)✧✸',linebyline)\n", | |
| " if len(a_link_num) > 0:\n", | |
| " temp_0 = []\n", | |
| " line2 = linebyline\n", | |
| " for i,v in enumerate(a_link_num):\n", | |
| " if not v in temp_0:\n", | |
| " temp_0.append(v)\n", | |
| " print('a_link_num:',i,v)\n", | |
| " num = int(v)\n", | |
| "\n", | |
| " extract_words = re.finditer(r\"𓃡☽✸✦{2}(\\S+?)𓃡✦{2}✧\\d+?✧✸\",linebyline)\n", | |
| "\n", | |
| " if extract_words != None:\n", | |
| " if num < len(links3):\n", | |
| " for iew,w in enumerate(extract_words):\n", | |
| " ws = str(w.group()) #link_words ...translated word\n", | |
| " if not ws in temp_1:\n", | |
| " temp_1.append(ws)\n", | |
| " print(ws)\n", | |
| " matc = re.findall(re_pattern,line2)\n", | |
| " if len(matc) > 0:\n", | |
| " for ms in matc:\n", | |
| " if (ms.find(ws)) != -1:\n", | |
| "\n", | |
| " link_number = re.match(r'𓃡☽✸✦{2}\\S+?𓃡✦{2}✧(?P<number>\\d+?)✧✸',ws)\n", | |
| " #print('link_number:',link_number.groups()[0])\n", | |
| " # linl_number.groups()[0] == link_number.group('number')\n", | |
| " print('link_number:',link_number.group('number'))\n", | |
| " number = int(link_number.groups()[0])\n", | |
| " embed_link = str(links3[number-1])\n", | |
| " word = str(word3[number-1])\n", | |
| " print('non skipped')\n", | |
| " line2 = line2.replace(ws,f\"<a href={embed_link}>{ws}</a>\")\n", | |
| "\n", | |
| " else:\n", | |
| " print('skipped!!!')\n", | |
| " newtext += line2\n", | |
| " else:\n", | |
| " newtext += linebyline\n", | |
| " newtext = re.sub(r'𓃵|☽|✸|✦✦|𓃡☽|𓃡','',newtext)\n", | |
| "re.purge()\n", | |
| "file.close()\n", | |
| "\n", | |
| "with open('generated.html', \"w+\", encoding='utf-8') as file:\n", | |
| " file.write(newtext)\n", | |
| "# 𓃵𓃡☽✸✦✦ 𓃡✦✦✧ ✧✸" | |
| ] | |
| } | |
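,
{
"cell_type": "markdown",
"id": "f7b5d4e6-55ee-4f6f-9070-4e5f60718293",
"metadata": {},
"source": [
"Appendix (added sketches, not part of the original pipeline). First: `urllib.parse.urljoin` can replace the manual scheme/netloc concatenation used for stylesheets, images, and anchors above; the base URL here is hypothetical."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a8c6e5f7-66ff-4070-8181-5f6071829304",
"metadata": {},
"outputs": [],
"source": [
"from urllib.parse import urljoin\n",
"\n",
"# urljoin resolves '/path', 'path', and absolute URLs against the page URL\n",
"# in one call, instead of branching on a leading slash.\n",
"page_url = 'https://www.example.com/voices/article.html'  # hypothetical base\n",
"for href in ['/css/site.css', 'img/photo.jpg', 'https://cdn.example.com/a.js']:\n",
"    print(urljoin(page_url, href))"
]
},
{
"cell_type": "markdown",
"id": "09d7f608-7711-4181-9292-607182930415",
"metadata": {},
"source": [
"Second: the sentinel-marker round trip in miniature, with no network. A single-word link text is marked, the marker survives a (mock) translation unchanged, the href is looked up by its ✧N✧ number, and the glyphs are stripped; the ✧1✧ tag that remains mirrors the numbered `link✧N:` divs the notebook appends. Note the `\\S+?` extraction pattern only matches link text without spaces, which is also a limit of the pipeline above."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1ae80719-8822-4292-83a3-718293041526",
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"\n",
"text = 'Read the filing for details.'\n",
"# Mark the link text exactly as trans() does (counter value 1 here).\n",
"marked = re.sub(re.escape('filing'), '𓃵☽𓃡☽✸✦✦filing𓃡✦✦✧1✧✸', text)\n",
"print(marked)\n",
"\n",
"links3_demo = ['https://example.org/filing.pdf']  # hypothetical lookup table\n",
"m = re.search(r'𓃡☽✸✦{2}(\\S+?)𓃡✦{2}✧(\\d+?)✧✸', marked)\n",
"if m:\n",
"    # Re-attach the href by marker number, as the second pass does.\n",
"    ws, number = m.group(0), int(m.group(2))\n",
"    marked = marked.replace(ws, f'<a href=\"{links3_demo[number-1]}\">{ws}</a>')\n",
"print(re.sub(r'𓃵|☽|✸|✦✦|𓃡☽|𓃡', '', marked))"
]
}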
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}