Skip to content

Instantly share code, notes, and snippets.

@frydaykg
Created April 14, 2018 20:33
Show Gist options
  • Select an option

  • Save frydaykg/c0b20c02917fcd6d8913e15b0b273acf to your computer and use it in GitHub Desktop.

Select an option

Save frydaykg/c0b20c02917fcd6d8913e15b0b273acf to your computer and use it in GitHub Desktop.
Download books from mybook.ru
import requests
from itertools import product
import time
import re
import os
import subprocess
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
def saveBinary(session, link, filePath):
    """Download *link* with *session* and store the raw bytes at *filePath*.

    Missing parent directories are created. An already existing file is
    reported on stdout and then overwritten with the fresh download.

    Args:
        session: object exposing ``get(url)`` whose result has a ``content``
            bytes attribute (a ``requests`` session in this script).
        link: URL to fetch.
        filePath: destination path; may be a bare file name.

    Returns:
        The downloaded bytes (``resp.content``).
    """
    resp = session.get(link)
    # os.makedirs('') raises FileNotFoundError, so only create the parent
    # directory when the path actually has one.
    parent = os.path.dirname(filePath)
    if parent:
        os.makedirs(parent, exist_ok=True)
    if os.path.isfile(filePath):
        print(filePath, 'already exists')
    with open(filePath, 'wb') as f:
        f.write(resp.content)
    return resp.content
# Give up on a part after this many consecutive malformed responses.
_MAX_BAD_PAGE_ATTEMPTS = 5


def _find_group(pattern, text, what):
    """Return group 1 of the first *pattern* match in *text*.

    Raises:
        ValueError: *pattern* does not occur in *text*; the message names
            *what* so the failing lookup is obvious.
    """
    m = re.search(pattern, text)
    if m is None:
        raise ValueError('Cannot find %s in server response' % what)
    return m.group(1)


def _fetch_fragments(s, url_template, label, keep_first_head):
    """Download consecutively numbered HTML parts until the server returns 404.

    Args:
        s: session exposing ``get(url)``.
        url_template: URL with one ``%s`` placeholder for the part number.
        label: prefix of the progress message printed before each attempt.
        keep_first_head: when true, part 0 is kept from its very beginning;
            every other part contributes only the text after ``<body>``.

    Returns:
        List of text fragments, one per downloaded part, each with its last
        15 characters removed.

    Raises:
        ValueError: a part could not be decoded as UTF-8 or lacked ``<body>``
            on ``_MAX_BAD_PAGE_ATTEMPTS`` consecutive attempts.
    """
    fragments = []
    cid = 0
    while True:
        link = url_template % cid
        bad_attempts = 0
        while True:
            print(label + str(cid), link)
            try:
                resp = s.get(link)
                if resp.status_code == 404:
                    # First missing part number marks the end of the series.
                    return fragments
                # Decode the raw bytes directly instead of round-tripping
                # resp.text through ISO-8859-1, which fails as soon as the
                # server declares a charset.
                text = resp.content.decode('utf8')
                # The trailing 15 characters are dropped from every part --
                # presumably the closing "</body></html>" plus a newline
                # (TODO confirm); the caller appends the closing tags once.
                if cid == 0 and keep_first_head:
                    fragment = text[:-15]
                else:
                    fragment = text[text.index("<body>") + 6:-15]
            except requests.RequestException as e:
                # Network trouble is treated as transient: retry indefinitely.
                print('Error: ', e)
                time.sleep(0.5)
            except ValueError as e:
                # Undecodable bytes or a missing <body> tag. Retry a few
                # times, then surface the error rather than looping forever.
                bad_attempts += 1
                if bad_attempts >= _MAX_BAD_PAGE_ATTEMPTS:
                    raise
                print('Error: ', e)
                time.sleep(0.5)
            else:
                fragments.append(fragment)
                break
        cid += 1


def loadBook(s, link):
    """Download one book and store it as a single HTML file plus its assets.

    The reader page at ``link + 'reader/'`` is scraped for the storage hash,
    title, cover link and API link; the API response supplies the author
    name. All ``content<N>.html`` and ``contentnotes<N>.html`` parts are
    concatenated into ``html/<author>/<name>___<hash>/<name>.html``, and the
    stylesheets, fonts referenced by them, images and the cover are saved
    alongside. PDF conversion is currently disabled.

    Args:
        s: logged-in ``requests`` session.
        link: book page URL, ending with a slash.

    Raises:
        ValueError: an expected field is missing from a server response, or
            a content part stays malformed after several attempts.
    """
    print(link)
    # Certificate verification is switched off for the reader page request.
    resp = s.get(link + 'reader/', verify=False)
    reader_html = resp.text
    book_hash = _find_group(
        r'prefix": "/storage/public/books/[0-9a-f]{2}/[0-9a-f]{2}/([0-9a-f-]+?)/"',
        reader_html, 'book storage prefix')
    name = _find_group(r'"name": "([\w\W]+?)"', reader_html, 'book name')
    # Replace anything that is not a word character, dash, underscore, dot or
    # space so the title can be used as a file name.
    name = re.sub(r'[^\w\-_\. ]', '_', name)
    link_to_cover = "https://mybook.ru" + _find_group(
        r'"cover": "([\w\W]+?)"', reader_html, 'cover link')
    api_url = "https://mybook.ru" + _find_group(
        r'"book": "([\w\W]+?)"', reader_html, 'book API link')
    resp = s.get(api_url)
    # NOTE(review): unlike the title, the author name is used in paths
    # without sanitising -- confirm it never contains path-hostile characters.
    author_name = _find_group(r'"cover_name":"([\w\W]+?)"', resp.text, 'author name')

    print('Try to load book ' + name)
    tlink = 'https://mybook.ru/storage/public/books/%s/%s/%s/' % (
        book_hash[:2], book_hash[2:4], book_hash)
    fragments = _fetch_fragments(
        s, tlink + "content%s.html", 'Try to load page ', True)
    fragments += _fetch_fragments(
        s, tlink + "contentnotes%s.html", 'Try to load content note ', False)
    fragments.append("</body></html>")
    final = "".join(fragments)

    html_folder = "html"
    htmlname = name + ".html"
    bookHtmlFolder = os.path.join(html_folder, author_name, name + "___" + book_hash)
    path_to_html = os.path.join(bookHtmlFolder, htmlname)
    # Everything now lives in one file, so cross-part links lose their file
    # name and keep only what follows it.
    final = re.sub(r'href="content[0-9]+\.html', 'href="', final)
    final = re.sub(r'href="contentnotes[0-9]+\.html', 'href="', final)
    os.makedirs(bookHtmlFolder, exist_ok=True)
    with open(path_to_html, 'wb') as f:
        f.write(final.encode('utf8'))

    # links
    for m in re.finditer(r'<link [\w\W]+? href="([\w\W]+?)"', final):
        href = m.group(1)
        if href == 'unicode_fonts.css':
            continue
        print('Try to load link ' + href)
        text = saveBinary(s, tlink + href, os.path.join(bookHtmlFolder, href)).decode('utf8')
        # Also fetch the files the stylesheet references through url(...).
        for m2 in re.finditer(r'src: url\((.+)\)', text):
            resource = m2.group(1)
            print('Try to load link ' + resource)
            saveBinary(s, tlink + resource, os.path.join(bookHtmlFolder, resource))

    # images
    for m in re.finditer(r'<img[\w\W]+?src="([\w\.]+?)"', final):
        image = m.group(1)
        print('Try to load image ' + image)
        saveBinary(s, tlink + image, os.path.join(bookHtmlFolder, image))

    # cover
    path_to_cover = os.path.join(bookHtmlFolder, "book_cover.jpg")
    saveBinary(s, link_to_cover, path_to_cover)
    path_to_cover_html = os.path.join(bookHtmlFolder, "book_cover.html")
    with open(path_to_cover_html, 'w') as f:
        f.write('<html><body background="book_cover.jpg" style="background-repeat:no-repeat;background-position: center center;"></body></html>')

    # Create the per-author folder the PDF would be written into, not just
    # the top-level "pdf" directory.
    pdfFolder = "pdf"
    bookPdfFolder = os.path.join(pdfFolder, author_name)
    os.makedirs(bookPdfFolder, exist_ok=True)
    # PDF conversion is disabled. To enable it, run wkhtmltopdf:
    # subprocess.run(['wkhtmltopdf', 'cover', path_to_cover_html, path_to_html,
    #                 os.path.join(bookPdfFolder, name + '___' + book_hash + '.pdf')])
def getLinkFromAuthorPage(s, link):
    """Collect the book URLs listed on an author's paginated book list.

    Pages ``<link>/date-list/?page=N`` are requested for N = 1, 2, ... until
    a response no longer contains the ``<!-- Book list -->`` marker.

    Args:
        s: session exposing ``get(url)`` whose result has a ``text`` attribute.
        link: author page URL, with or without a trailing slash.

    Returns:
        List of absolute ``https://mybook.ru`` book URLs in page order. The
        list is not de-duplicated.
    """
    # Strip a trailing slash so the request URL never contains "//date-list/".
    page_url = link.rstrip('/') + "/date-list/?page="
    form_re = re.compile(r'<form id="readbook-form" action="([\w\W]+?)reader/"')
    links = []
    cid = 1
    while True:
        text = s.get(page_url + str(cid)).text
        if '<!-- Book list -->' not in text:
            return links
        links.extend('https://mybook.ru' + m.group(1) for m in form_re.finditer(text))
        cid += 1
# One session is shared by every request so the login cookies are reused.
s = requests.session()
# The credentials are empty placeholders; certificate verification is
# switched off for the login request.
s.post(
    'https://mybook.ru/account/login/',
    data={'username': '', 'password': ''},
    verify=False,
)

# Author page URLs whose books should be downloaded (empty by default).
auths = []
for auth in auths:
    # A set drops book links that were collected more than once.
    links = set(getLinkFromAuthorPage(s, auth))
    for link in links:
        loadBook(s, link)

# A single book can be fetched by calling loadBook directly, e.g.:
# loadBook(s, 'https://mybook.ru/author/yakov-isidorovich-perelman/101-golovolomka/')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment