Skip to content

Instantly share code, notes, and snippets.

@LoboLofi
Last active October 6, 2020 23:56
Show Gist options
  • Select an option

  • Save LoboLofi/497d317c21bf8bb20a509a5a2b80d1e4 to your computer and use it in GitHub Desktop.

Select an option

Save LoboLofi/497d317c21bf8bb20a509a5a2b80d1e4 to your computer and use it in GitHub Desktop.
from bs4 import BeautifulSoup
import requests
import re
import sys
def download(fileUri, filename):
    '''
    Download a file with an HTTP GET request and save it to disk.

    The whole response body is held in memory before it is written, so this
    is only meant for small files (the original note puts the practical
    limit at around 10 MB).

    fileUri:  URL of the file to fetch; redirects are followed.
    filename: path of the local file to create or overwrite (binary mode).
    '''
    print("Download: " + fileUri)
    r = requests.get(fileUri, allow_redirects=True)
    # The context manager closes the handle even if the write fails, instead
    # of leaving the cleanup to the garbage collector.
    with open(filename, 'wb') as out:
        out.write(r.content)
def getUrlFrom(url, tag, att, http="https"):
    '''
    Download an HTML page from url and return the att values of its tags.

    url:  address of the page to download; redirects are followed.
    tag:  name of the tags to look for, e.g. 'img'.
    att:  attribute to read from each matching tag, e.g. 'src'.
    http: prefix the attribute value must start with, before "://"
          ("https" by default). It is inserted into a regular expression
          as-is.

    Returns a list with the att value of every tag whose att matches
    "^<http>://", with trailing whitespace stripped.
    '''
    r = requests.get(url, allow_redirects=True)
    soup = BeautifulSoup(r.content, features="lxml")
    # Compile the prefix pattern once and let BeautifulSoup apply it to att.
    prefix = re.compile("^" + http + "://")
    # find_all is the supported name; findAll is a deprecated alias.
    return [
        link.get(att).rstrip()
        for link in soup.find_all(tag, attrs={att: prefix})
    ]
def filterPngFiles(link, extension='.png'):
    '''
    Tell whether link ends with the wanted image extension.

    link:      URL or file name to test.
    extension: suffix to look for, '.png' by default. Pass '.jpg', or a
               tuple of suffixes, to select other image types.

    Returns True when link ends with extension and False otherwise. The
    comparison is case-sensitive.
    '''
    return link.endswith(extension)
def downloadManga(path, num, pathObjetive):
    '''
    Download every PNG image referenced by one chapter page.

    First, collect the src of all img tags found on the page at path.
    Next, keep only the links that carry the expected image extension.
    Last, download each remaining image into pathObjetive.

    path:         URL of the chapter page.
    num:          chapter number, used as the first part of each file name.
    pathObjetive: directory that receives the images; it is not created here.

    Files are named "<chapter>-<page>.png", both numbers zero-padded to three
    digits, with pages counted from 1 in the order the links were returned.
    '''
    print("Download from: " + path)
    listImg = [
        link for link in getUrlFrom(path, 'img', 'src')
        if filterPngFiles(link)
    ]
    if not listImg:
        # Sometimes the images are not served over https but over plain
        # http, so retry with that scheme. The fallback list must be filtered
        # too, otherwise every image on the page would be saved as a .png.
        listImg = [
            link for link in getUrlFrom(path, 'img', 'src', 'http')
            if filterPngFiles(link)
        ]
    # The remote file names are likely to be inconsistent, so they are
    # standardized: num is the chapter number and page is the page number.
    # The saved extension is fixed, so it must match the one filtered above.
    for page, im in enumerate(listImg, start=1):
        target = pathObjetive + "/" + '{0:03d}-{1:03d}.png'.format(num, page)
        download(im, target)
if __name__ == "__main__":
    # Usage: <script> <chapter-url-prefix> <output-directory>
    # The zero-padded chapter number and a trailing "/" are appended to the
    # prefix, for example:
    #   <script> https://readberserk.com/chapter/berserk-chapter- berserk
    if len(sys.argv) < 3:
        # Exit with a readable message instead of a bare IndexError.
        sys.exit("Usage: " + sys.argv[0] + " <chapter-url-prefix> <output-directory>")
    urlPrefix = sys.argv[1]
    outputDir = sys.argv[2]
    # Chapters 1 through 357 are requested, one page fetch per chapter.
    for chapter in range(1, 358):
        downloadManga(urlPrefix + '{0:03d}'.format(chapter) + '/', chapter, outputDir)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment