-
-
Save drelatgithub/6f2888c2109f29206786105f2d00db5e to your computer and use it in GitHub Desktop.
| ############################################################################### | |
| # | |
| # Docin document downloader | |
| # | |
| # Valid as of 2022-12-13 | |
| # | |
| ############################################################################### | |
| import argparse | |
| import os | |
| from types import SimpleNamespace | |
| import urllib.request | |
# Run-time settings read by the rest of this script. Both fields are
# placeholders here and are overwritten from the command-line arguments in the
# ``__main__`` section before any download starts.
conf = SimpleNamespace(
    docin_pid=0,
    output_dir="",
)
def download_image(pid):
    """Download the page images of a Docin document into ``conf.output_dir``.

    Pages are requested one at a time, starting at page 1, and each response
    is written to ``<page>.png`` inside ``conf.output_dir``. The loop stops at
    the first page for which the server answers with an HTTP error status.

    Args:
        pid: Docin document id (the number after "p-" in the document URL).

    Raises:
        urllib.error.URLError: for failures other than an HTTP error status
            (only ``HTTPError`` is treated as "no more pages").
    """
    # Imported here because the file only imports ``urllib.request``; naming
    # the exception explicitly avoids relying on that module pulling in
    # ``urllib.error`` as a side effect.
    from urllib.error import HTTPError

    # Create the target directory up front so the first page does not fail
    # with FileNotFoundError. An empty string means "current directory".
    if conf.output_dir:
        os.makedirs(conf.output_dir, exist_ok=True)

    page = 0
    while True:
        page += 1
        url = f"http://211.147.220.164/index.jsp?file={pid}&pageno={page}"
        target = os.path.join(conf.output_dir, f"{page}.png")
        try:
            urllib.request.urlretrieve(url, target)
        except HTTPError:
            # The first HTTP error status ends the download loop.
            break
        print("Page", page, "saved.")
if __name__ == "__main__":
    # Parse the arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("docin_pid", type=str, help="The number after \"p-\" in docin url")
    parser.add_argument("output_dir", type=str, help="The output directory")
    args = parser.parse_args()

    # Publish the parsed values through the shared settings object, which
    # download_image() reads for its output directory.
    conf.docin_pid = args.docin_pid
    conf.output_dir = args.output_dir

    # Do the work
    download_image(conf.docin_pid)
import os
import urllib.request
from PIL import Image
def download_images(pid, output_dir):
    """Download the page images of a Docin document and return their paths.

    Pages are requested one at a time, starting at page 1, and each response
    is written to ``<page>.png`` inside ``output_dir``. The loop stops at the
    first page for which the server answers with an HTTP error status.

    Args:
        pid: Docin document id (the number after "p-" in the document URL).
        output_dir: Directory that receives the page images; created if it
            does not exist yet.

    Returns:
        List of the saved file paths, in page order.

    Raises:
        urllib.error.URLError: for failures other than an HTTP error status
            (only ``HTTPError`` is treated as "no more pages").
    """
    # Imported here because the file only imports ``urllib.request``; naming
    # the exception explicitly avoids relying on that module pulling in
    # ``urllib.error`` as a side effect.
    from urllib.error import HTTPError

    # An empty string means "current directory", which needs no creation.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    images = []
    page = 1
    while True:
        file_path = os.path.join(output_dir, f"{page}.png")
        url = f"http://211.147.220.164/index.jsp?file={pid}&pageno={page}"
        try:
            # Only the network call is guarded, so unrelated errors are not
            # mistaken for the end of the document.
            urllib.request.urlretrieve(url, file_path)
        except HTTPError:
            print("Download completed or no more pages to fetch.")
            break
        print(f"Page {page} downloaded.")
        images.append(file_path)
        page += 1
    return images
def images_to_pdf(images, output_dir):
    """Combine page images into one multi-page PDF named ``document.pdf``.

    Args:
        images: Paths of the page images, in the order they should appear.
        output_dir: Directory in which ``document.pdf`` is written.

    Nothing is written when ``images`` yields no pages.
    """
    pages = []
    for image_path in images:
        # Open inside ``with`` so the source file handle is closed as soon as
        # the converted copy exists; convert() returns a new in-memory image.
        # NOTE(review): the RGB conversion is presumably needed because the
        # PDF writer rejects modes such as RGBA -- confirm against Pillow docs.
        with Image.open(image_path) as source:
            pages.append(source.convert('RGB'))

    if not pages:
        return

    pdf_path = os.path.join(output_dir, "document.pdf")
    first_page, *remaining_pages = pages
    first_page.save(pdf_path, save_all=True, append_images=remaining_pages)
    print(f"PDF saved at {pdf_path}")
# Script entry point. The guard must compare ``__name__`` with "__main__";
# the bare ``name == "main"`` form raises NameError at import time.
if __name__ == "__main__":
    pid = 2064621039  # Example PID, replace with the actual PID you want to use
    output_dir = "/content/drive/MyDrive/Data"  # Change to your desired output directory

    # exist_ok=True replaces the separate exists() check and its race window.
    os.makedirs(output_dir, exist_ok=True)

    images = download_images(pid, output_dir)
    if images:
        # Only build the PDF when at least one page was downloaded.
        images_to_pdf(images, output_dir)
Here is the new code.
https://www.docin.com/p-4780561796.html
这个链接没法使用,有办法吗?
Works in 2022, thanks!
(Append &width=1836&height=2376 to the request URL for higher-resolution images.)