frydaykg/mybook.py

## mybook.py
import requests
from itertools import product
import time
import re
import os
import subprocess
from requests.packages.urllib3.exceptions import InsecureRequestWarning

# Silence urllib3's InsecureRequestWarning: some requests in this script are
# sent with verify=False (TLS certificate verification turned off).
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)


def saveBinary(session, link, filePath):
    """Download *link* with *session* and write the response body to *filePath*.

    Missing parent directories are created. If the destination file already
    exists this is reported on stdout and the file is then overwritten.

    Args:
        session: Object with a ``get(url)`` method whose result exposes the
            raw body as ``content`` (bytes), e.g. a ``requests`` session.
        link: URL to download.
        filePath: Destination path; a bare file name (no directory part) is
            accepted.

    Returns:
        The downloaded bytes, exactly as written to disk.
    """
    resp = session.get(link)
    # os.path.dirname() is '' for a bare file name and os.makedirs('') raises,
    # so only create the folder when there is one.
    folder = os.path.dirname(filePath)
    if folder:
        os.makedirs(folder, exist_ok=True)
    if os.path.isfile(filePath):
        print(filePath, 'already exists')
    with open(filePath, 'wb') as f:
        f.write(resp.content)
    return resp.content

# Number of trailing characters cut from every downloaded page before it is
# merged. Presumably the closing "</body></html>" plus a newline -- TODO confirm.
_PAGE_TAIL_LENGTH = 15


def _search_group(pattern, text, what):
    """Return group 1 of the first match of *pattern* in *text*.

    Raises ValueError naming *what* when the pattern does not match, instead
    of failing later with an AttributeError on ``None``.
    """
    found = re.search(pattern, text)
    if found is None:
        raise ValueError('Could not find %s in the server response' % what)
    return found.group(1)


def _fetch_page(s, url, label):
    """Return the UTF-8 decoded body of *url*, or None if the server answers 404.

    Network failures and non-404 HTTP error statuses are printed and retried
    every 0.5 seconds. Only the request itself is retried; decoding happens
    outside the ``try`` so a malformed page cannot cause an endless retry.
    """
    while True:
        print(label, url)
        try:
            resp = s.get(url)
            if resp.status_code == 404:
                return None
            resp.raise_for_status()
        except requests.RequestException as e:
            print('Error: ', e)
            time.sleep(0.5)
            continue
        # Decode the raw bytes directly rather than round-tripping the guessed
        # text through ISO-8859-1, which fails when the guess was not Latin-1.
        return resp.content.decode('utf8')


def _collect_pages(s, tlink, stem, label, first_keeps_head):
    """Download ``<stem>0.html``, ``<stem>1.html``, ... until the first 404.

    Returns the list of page fragments with the page tail removed. Every
    fragment starts right after ``<body>``, except that page 0 keeps its
    head when *first_keeps_head* is true.
    """
    fragments = []
    cid = 0
    while True:
        url = '%s%s%s.html' % (tlink, stem, cid)
        text = _fetch_page(s, url, label + str(cid))
        if text is None:
            return fragments
        start = 0
        if not (first_keeps_head and cid == 0):
            body_at = text.find('<body>')
            if body_at < 0:
                raise ValueError('No <body> tag in ' + url)
            start = body_at + len('<body>')
        fragments.append(text[start:-_PAGE_TAIL_LENGTH])
        cid += 1


def loadBook(s, link):
    """Download one book and store it as a single HTML file plus its assets.

    The reader page at ``link + 'reader/'`` is scraped for the storage hash,
    the book name, the cover URL and an API URL; the API response supplies
    the author name. All ``content<N>.html`` pages followed by all
    ``contentnotes<N>.html`` pages are merged into
    ``html/<author>/<name>___<hash>/<name>.html``. Stylesheets (and the
    ``src: url(...)`` files they mention), images and the cover are saved
    next to it, and an empty ``pdf`` folder is created.

    Args:
        s: Session-like object providing ``get``; used for every request.
        link: Book page URL. ``'reader/'`` is appended directly, so it is
            expected to end with ``/``.

    Raises:
        ValueError: If an expected field is missing from a response or a
            downloaded page has no ``<body>`` tag.
    """
    print(link)
    reader_html = s.get(link + 'reader/', verify=False).text
    book_hash = _search_group(
        r'prefix": "/storage/public/books/[0-9a-f]{2}/[0-9a-f]{2}/([0-9a-f-]+?)/"',
        reader_html, 'storage prefix')

    # Replace everything that is not a word character, '-', '_', '.' or a
    # space, because the name is used as a folder and file name.
    name = re.sub(r'[^\w\-_\. ]', '_',
                  _search_group(r'"name": "([\w\W]+?)"', reader_html, 'book name'))
    link_to_cover = "https://mybook.ru" + _search_group(
        r'"cover": "([\w\W]+?)"', reader_html, 'cover link')
    api_url = "https://mybook.ru" + _search_group(
        r'"book": "([\w\W]+?)"', reader_html, 'book API link')
    author_name = _search_group(
        r'"cover_name":"([\w\W]+?)"', s.get(api_url).text, 'author name')

    print('Try to load book ' + name)

    tlink = 'https://mybook.ru/storage/public/books/%s/%s/%s/' % (
        book_hash[:2], book_hash[2:4], book_hash)

    # Page 0 keeps its <html>/<head>; later pages and notes contribute only
    # their body content, and the document is closed once at the end.
    parts = _collect_pages(s, tlink, 'content', 'Try to load page ', True)
    parts += _collect_pages(s, tlink, 'contentnotes', 'Try to load content note ', False)
    parts.append("</body></html>")
    final = ''.join(parts)

    bookHtmlFolder = os.path.join("html", author_name, name + "___" + book_hash)
    path_to_html = os.path.join(bookHtmlFolder, name + ".html")
    # Everything now lives in one file, so drop the page file name from links
    # and keep only whatever followed it (e.g. the fragment).
    final = re.sub(r'href="content[0-9]+\.html', 'href="', final)
    final = re.sub(r'href="contentnotes[0-9]+\.html', 'href="', final)

    os.makedirs(bookHtmlFolder, exist_ok=True)
    with open(path_to_html, 'wb') as f:
        f.write(final.encode('utf8'))

    # links: every <link href> except unicode_fonts.css, plus the files that
    # the downloaded stylesheet references through "src: url(...)".
    for m in re.finditer(r'<link [\w\W]+? href="([\w\W]+?)"', final):
        href = m.group(1)
        if href == 'unicode_fonts.css':
            continue
        print('Try to load link ' + href)
        css = saveBinary(s, tlink + href, os.path.join(bookHtmlFolder, href)).decode('utf8')
        for m2 in re.finditer(r'src: url\((.+)\)', css):
            print('Try to load link ' + m2.group(1))
            saveBinary(s, tlink + m2.group(1), os.path.join(bookHtmlFolder, m2.group(1)))

    # images
    for m in re.finditer(r'<img[\w\W]+?src="([\w\.]+?)"', final):
        print('Try to load image ' + m.group(1))
        saveBinary(s, tlink + m.group(1), os.path.join(bookHtmlFolder, m.group(1)))

    # cover: the image itself plus a tiny HTML page that shows it centered.
    saveBinary(s, link_to_cover, os.path.join(bookHtmlFolder, "book_cover.jpg"))
    with open(os.path.join(bookHtmlFolder, "book_cover.html"), 'w') as f:
        f.write('<html><body background="book_cover.jpg" style="background-repeat:no-repeat;background-position: center center;"></body></html>')

    # PDF conversion is disabled; only the top-level output folder is created.
    # The disabled step was:
    #   wkhtmltopdf cover <book_cover.html> <book html> pdf/<author>/<name>___<hash>.pdf
    os.makedirs("pdf", exist_ok=True)


def getLinkFromAuthorPage(s, link):
    """Collect the book URLs listed on an author's paginated "date-list" pages.

    Pages are requested as ``<link>/date-list/?page=1``, ``?page=2``, ...
    until a response no longer contains the ``<!-- Book list -->`` marker.

    Args:
        s: Session-like object whose ``get(url)`` result exposes ``text``.
        link: Author page URL, with or without a trailing ``/``.

    Returns:
        List of absolute book URLs (``https://mybook.ru`` + the form action
        with its trailing ``reader/`` removed), in page order. Duplicates are
        not removed.
    """
    # Strip a trailing slash so the request never contains "//date-list".
    page_url = link.rstrip('/') + "/date-list/?page="
    form_action = re.compile(r'<form id="readbook-form" action="([\w\W]+?)reader/"')
    links = []
    cid = 1
    while True:
        text = s.get(page_url + str(cid)).text
        if '<!-- Book list -->' not in text:
            return links
        links.extend('https://mybook.ru' + found.group(1)
                     for found in form_action.finditer(text))
        cid += 1

# Log in once; this same session object is handed to every call below.
# The credentials are intentionally left blank here and must be filled in.
s = requests.Session()
s.post(
    'https://mybook.ru/account/login/',
    data={'username': '', 'password': ''},
    verify=False,
)

# Author page URLs whose books should be downloaded (none by default).
auths = []

for auth in auths:
    # A set drops books that are listed more than once for the author.
    links = set(getLinkFromAuthorPage(s, auth))
    for link in links:
        loadBook(s, link)

# Example of downloading a single book directly:
# loadBook(s, 'https://mybook.ru/author/yakov-isidorovich-perelman/101-golovolomka/')
	import requests
	from itertools import product
	import time
	import re
	import os
	import subprocess
	from requests.packages.urllib3.exceptions import InsecureRequestWarning

	# Silence urllib3's InsecureRequestWarning: some requests in this script are
	# sent with verify=False (TLS certificate verification turned off).
	requests.packages.urllib3.disable_warnings(InsecureRequestWarning)


	def saveBinary(session, link, filePath):
	    """Download *link* with *session* and write the response body to *filePath*.

	    Missing parent directories are created. If the destination file already
	    exists this is reported on stdout and the file is then overwritten.

	    Args:
	        session: Object with a ``get(url)`` method whose result exposes the
	            raw body as ``content`` (bytes), e.g. a ``requests`` session.
	        link: URL to download.
	        filePath: Destination path; a bare file name (no directory part) is
	            accepted.

	    Returns:
	        The downloaded bytes, exactly as written to disk.
	    """
	    resp = session.get(link)
	    # os.path.dirname() is '' for a bare file name and os.makedirs('') raises,
	    # so only create the folder when there is one.
	    folder = os.path.dirname(filePath)
	    if folder:
	        os.makedirs(folder, exist_ok=True)
	    if os.path.isfile(filePath):
	        print(filePath, 'already exists')
	    with open(filePath, 'wb') as f:
	        f.write(resp.content)
	    return resp.content

	# Number of trailing characters cut from every downloaded page before it is
	# merged. Presumably the closing "</body></html>" plus a newline -- TODO confirm.
	_PAGE_TAIL_LENGTH = 15


	def _search_group(pattern, text, what):
	    """Return group 1 of the first match of *pattern* in *text*.

	    Raises ValueError naming *what* when the pattern does not match, instead
	    of failing later with an AttributeError on ``None``.
	    """
	    found = re.search(pattern, text)
	    if found is None:
	        raise ValueError('Could not find %s in the server response' % what)
	    return found.group(1)


	def _fetch_page(s, url, label):
	    """Return the UTF-8 decoded body of *url*, or None if the server answers 404.

	    Network failures and non-404 HTTP error statuses are printed and retried
	    every 0.5 seconds. Only the request itself is retried; decoding happens
	    outside the ``try`` so a malformed page cannot cause an endless retry.
	    """
	    while True:
	        print(label, url)
	        try:
	            resp = s.get(url)
	            if resp.status_code == 404:
	                return None
	            resp.raise_for_status()
	        except requests.RequestException as e:
	            print('Error: ', e)
	            time.sleep(0.5)
	            continue
	        # Decode the raw bytes directly rather than round-tripping the guessed
	        # text through ISO-8859-1, which fails when the guess was not Latin-1.
	        return resp.content.decode('utf8')


	def _collect_pages(s, tlink, stem, label, first_keeps_head):
	    """Download ``<stem>0.html``, ``<stem>1.html``, ... until the first 404.

	    Returns the list of page fragments with the page tail removed. Every
	    fragment starts right after ``<body>``, except that page 0 keeps its
	    head when *first_keeps_head* is true.
	    """
	    fragments = []
	    cid = 0
	    while True:
	        url = '%s%s%s.html' % (tlink, stem, cid)
	        text = _fetch_page(s, url, label + str(cid))
	        if text is None:
	            return fragments
	        start = 0
	        if not (first_keeps_head and cid == 0):
	            body_at = text.find('<body>')
	            if body_at < 0:
	                raise ValueError('No <body> tag in ' + url)
	            start = body_at + len('<body>')
	        fragments.append(text[start:-_PAGE_TAIL_LENGTH])
	        cid += 1


	def loadBook(s, link):
	    """Download one book and store it as a single HTML file plus its assets.

	    The reader page at ``link + 'reader/'`` is scraped for the storage hash,
	    the book name, the cover URL and an API URL; the API response supplies
	    the author name. All ``content<N>.html`` pages followed by all
	    ``contentnotes<N>.html`` pages are merged into
	    ``html/<author>/<name>___<hash>/<name>.html``. Stylesheets (and the
	    ``src: url(...)`` files they mention), images and the cover are saved
	    next to it, and an empty ``pdf`` folder is created.

	    Args:
	        s: Session-like object providing ``get``; used for every request.
	        link: Book page URL. ``'reader/'`` is appended directly, so it is
	            expected to end with ``/``.

	    Raises:
	        ValueError: If an expected field is missing from a response or a
	            downloaded page has no ``<body>`` tag.
	    """
	    print(link)
	    reader_html = s.get(link + 'reader/', verify=False).text
	    book_hash = _search_group(
	        r'prefix": "/storage/public/books/[0-9a-f]{2}/[0-9a-f]{2}/([0-9a-f-]+?)/"',
	        reader_html, 'storage prefix')

	    # Replace everything that is not a word character, '-', '_', '.' or a
	    # space, because the name is used as a folder and file name.
	    name = re.sub(r'[^\w\-_\. ]', '_',
	                  _search_group(r'"name": "([\w\W]+?)"', reader_html, 'book name'))
	    link_to_cover = "https://mybook.ru" + _search_group(
	        r'"cover": "([\w\W]+?)"', reader_html, 'cover link')
	    api_url = "https://mybook.ru" + _search_group(
	        r'"book": "([\w\W]+?)"', reader_html, 'book API link')
	    author_name = _search_group(
	        r'"cover_name":"([\w\W]+?)"', s.get(api_url).text, 'author name')

	    print('Try to load book ' + name)

	    tlink = 'https://mybook.ru/storage/public/books/%s/%s/%s/' % (
	        book_hash[:2], book_hash[2:4], book_hash)

	    # Page 0 keeps its <html>/<head>; later pages and notes contribute only
	    # their body content, and the document is closed once at the end.
	    parts = _collect_pages(s, tlink, 'content', 'Try to load page ', True)
	    parts += _collect_pages(s, tlink, 'contentnotes', 'Try to load content note ', False)
	    parts.append("</body></html>")
	    final = ''.join(parts)

	    bookHtmlFolder = os.path.join("html", author_name, name + "___" + book_hash)
	    path_to_html = os.path.join(bookHtmlFolder, name + ".html")
	    # Everything now lives in one file, so drop the page file name from links
	    # and keep only whatever followed it (e.g. the fragment).
	    final = re.sub(r'href="content[0-9]+\.html', 'href="', final)
	    final = re.sub(r'href="contentnotes[0-9]+\.html', 'href="', final)

	    os.makedirs(bookHtmlFolder, exist_ok=True)
	    with open(path_to_html, 'wb') as f:
	        f.write(final.encode('utf8'))

	    # links: every <link href> except unicode_fonts.css, plus the files that
	    # the downloaded stylesheet references through "src: url(...)".
	    for m in re.finditer(r'<link [\w\W]+? href="([\w\W]+?)"', final):
	        href = m.group(1)
	        if href == 'unicode_fonts.css':
	            continue
	        print('Try to load link ' + href)
	        css = saveBinary(s, tlink + href, os.path.join(bookHtmlFolder, href)).decode('utf8')
	        for m2 in re.finditer(r'src: url\((.+)\)', css):
	            print('Try to load link ' + m2.group(1))
	            saveBinary(s, tlink + m2.group(1), os.path.join(bookHtmlFolder, m2.group(1)))

	    # images
	    for m in re.finditer(r'<img[\w\W]+?src="([\w\.]+?)"', final):
	        print('Try to load image ' + m.group(1))
	        saveBinary(s, tlink + m.group(1), os.path.join(bookHtmlFolder, m.group(1)))

	    # cover: the image itself plus a tiny HTML page that shows it centered.
	    saveBinary(s, link_to_cover, os.path.join(bookHtmlFolder, "book_cover.jpg"))
	    with open(os.path.join(bookHtmlFolder, "book_cover.html"), 'w') as f:
	        f.write('<html><body background="book_cover.jpg" style="background-repeat:no-repeat;background-position: center center;"></body></html>')

	    # PDF conversion is disabled; only the top-level output folder is created.
	    # The disabled step was:
	    #   wkhtmltopdf cover <book_cover.html> <book html> pdf/<author>/<name>___<hash>.pdf
	    os.makedirs("pdf", exist_ok=True)


	def getLinkFromAuthorPage(s, link):
	    """Collect the book URLs listed on an author's paginated "date-list" pages.

	    Pages are requested as ``<link>/date-list/?page=1``, ``?page=2``, ...
	    until a response no longer contains the ``<!-- Book list -->`` marker.

	    Args:
	        s: Session-like object whose ``get(url)`` result exposes ``text``.
	        link: Author page URL, with or without a trailing ``/``.

	    Returns:
	        List of absolute book URLs (``https://mybook.ru`` + the form action
	        with its trailing ``reader/`` removed), in page order. Duplicates are
	        not removed.
	    """
	    # Strip a trailing slash so the request never contains "//date-list".
	    page_url = link.rstrip('/') + "/date-list/?page="
	    form_action = re.compile(r'<form id="readbook-form" action="([\w\W]+?)reader/"')
	    links = []
	    cid = 1
	    while True:
	        text = s.get(page_url + str(cid)).text
	        if '<!-- Book list -->' not in text:
	            return links
	        links.extend('https://mybook.ru' + found.group(1)
	                     for found in form_action.finditer(text))
	        cid += 1

	# Log in once; this same session object is handed to every call below.
	# The credentials are intentionally left blank here and must be filled in.
	s = requests.session()
	s.post('https://mybook.ru/account/login/', verify=False, data={'username': '', 'password': ''})

	# Author page URLs whose books should be downloaded (none by default).
	auths = []

	for auth in auths:
	    # A set drops books that are listed more than once for the author.
	    links = set(getLinkFromAuthorPage(s, auth))
	    for link in links:
	        loadBook(s, link)

	# Example of downloading a single book directly:
	# loadBook(s, 'https://mybook.ru/author/yakov-isidorovich-perelman/101-golovolomka/')
No results found