Batch download pp.163.com
批量下载网易摄影 pp.163.com 的照片
使用方法:
先按需修改线程数和下载器,然后运行:
python 163pp.py [分辨率] URL1 URL2...
分辨率:
murl 中等
surl 小
lurl 似乎非常小(缩略图级别)
turl 比小还小
qurl 正方形
| #!/usr/bin/env python | |
| #coding:utf-8 | |
| # Author: Beining http://www.cnbeining.com/ cnbeining[at]gmail.com | |
| # Purpose: Batch download pp.163.com | |
| # Created: 03/04/2015 | |
| # License: GNU GPL 2.0 https://www.gnu.org/licenses/gpl-2.0.html | |
| import os | |
| import sys | |
| import unittest | |
| import urllib2 | |
| import logging | |
| import re | |
| from multiprocessing import Pool | |
| from multiprocessing.dummy import Pool as ThreadPool | |
| import getopt | |
| import subprocess | |
# NOTE(review): `global` at module scope is a no-op in Python; kept as-is
# for compatibility, but it has no effect here.
global DOWNLOAD_SOFTWARE, FAKE_HEADER, LOCATION_DIR, resolution
# Browser-like request headers so pp.163.com serves the normal page
# instead of blocking the script as a bot.
FAKE_HEADER = {
    'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.16 Safari/537.36',
    'Cache-Control': 'no-cache',
    'Pragma': 'no-cache'}
# Directory the script was launched from; each album gets a subfolder here.
LOCATION_DIR = os.getcwd()
# External download tool: one of 'wget', 'aria2c', 'curl', 'axel'.
DOWNLOAD_SOFTWARE = 'wget'
| #---------------------------------------------------------------------- | |
def page_reader(url):
    """str -> str

    Fetch *url* using the spoofed browser headers and return the raw
    response body.
    """
    req = urllib2.Request(url, headers=FAKE_HEADER)
    return urllib2.urlopen(req).read()
| #---------------------------------------------------------------------- | |
def page_parser(webpage):
    """str -> dict

    Parse an album page such as
    http://pp.163.com/daowuzhe123/pp/13424132.html and return a dict
    mapping photoId -> per-photo metadata dict (keys include 'photoId'
    and the resolution URLs murl/surl/lurl/turl/qurl).

    Side effects: creates a folder named after the album and chdirs
    into it, so subsequent downloads land there.
    """
    logging.info('Retriving purl...')
    # Scan the raw HTML for the line holding the photo-list JS URL.
    # NOTE(review): if no line contains 'purl', `purl` stays unbound and
    # the page_reader(purl) call below raises NameError.
    for i in webpage.split('\n'):
        if 'purl' in i:
            purl = 'http://' + i.strip()[6:-2]
    # Example purl:
    #http://s1.ph.126.net/WwP8GD1A3ocjPfENOdgrdQ==/192414543510075.js
    # The album title appears on a 'name:' line; pages are GBK-encoded
    # (str.decode here is Python 2 only).
    for i in webpage.split('\n'):
        if 'name:' in i:
            folder_name = i.decode('gbk').strip()[7:-2]
            print(folder_name)
            break
    try:
        os.mkdir(folder_name)
    except Exception:
        # Folder already exists from a previous run — reuse it.
        pass
    os.chdir(LOCATION_DIR + '/' + folder_name)
    purl_data = page_reader(purl)
    # The JS payload embeds a JSON-ish array: carve out '[{...}]' and
    # split it back into individual '{...}' object strings.
    purl_processed = purl_data.split('[{')[1].split('}]')[0].split('},{')
    purl_processed_list =['{' + i + '}' for i in purl_processed]
    # Quote bare identifiers (key:) so each object parses as a Python dict.
    pattern = r"([a-zA-Z_][a-zA-Z_0-9]*)\s*\:"
    repl = lambda match: '"{}":'.format(match.group(1))
    dict_big = {}
    for i in purl_processed_list:
        dict_this = {}
        # SECURITY NOTE(review): eval() on data fetched from the network
        # executes arbitrary code if the server/page is malicious;
        # ast.literal_eval or a JSON parser would be safer.
        dict_this = eval(re.sub(pattern, repl, i))
        photoId = dict_this['photoId']
        dict_big[photoId] = dict_this
    return dict_big
| #---------------------------------------------------------------------- | |
def download_video_link(args):
    """Download a single image with the configured external tool.

    *args* is a ``(filename, download_software, img_url)`` tuple (kept
    as one argument so the function plugs straight into ``Pool.map``;
    the Python-2-only tuple-parameter syntax is replaced by an explicit
    unpack).

    Raises ValueError for an unrecognised downloader name instead of
    the original UnboundLocalError on `cmd`.
    """
    filename, download_software, img_url = args
    logging.info('Downloading {filename}...'.format(filename = filename))
    # One command template per supported tool; {filename}/{img_url} are
    # filled in below.  (`curl -C -` = auto-detect resume offset; plain
    # `-C` would swallow `-o` as its argument.)
    templates = {
        'aria2c': 'aria2c -c -k1M --out {filename} "{img_url}"',
        'wget': 'wget -c -O {filename} "{img_url}"',
        'curl': 'curl -L -C - -o {filename} "{img_url}"',
        'axel': 'axel -o {filename} "{img_url}"',
    }
    try:
        cmd = templates[download_software]
    except KeyError:
        raise ValueError('Unknown downloader: {0}'.format(download_software))
    cmd = cmd.format(filename = filename, img_url = img_url)
    logging.debug(cmd)
    execute_cmd(cmd)
| #---------------------------------------------------------------------- | |
def execute_cmd(cmd):
    """Run *cmd* through the shell with its output suppressed.

    Returns the process exit status; a non-zero status is logged as a
    warning but never raised.
    """
    status = subprocess.call(
        cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if status != 0:
        logging.warning('ERROR')
    return status
| #---------------------------------------------------------------------- | |
def parse_list(img_dict, resolution):
    """dict -> list

    Convert the parsed photo metadata into ``(filename, downloader,
    img_url)`` work items.  *resolution* selects which size key
    (murl/surl/lurl/turl/qurl) to download.
    """
    tasks = []
    for photo in img_dict.values():
        path = photo[resolution]
        # The first character of the stored path picks the img<N> host shard.
        img_url = 'http://img' + path[0] + '.ph.126.net' + path[1:]
        filename = str(photo['photoId']) + '.' + path.split('.')[-1]
        tasks.append((filename, DOWNLOAD_SOFTWARE, img_url))
    return tasks
| #---------------------------------------------------------------------- | |
def downloader(down_list, workers = 5):
    """Fan the download tasks in *down_list* out over a thread pool."""
    from multiprocessing.dummy import Pool as ThreadPool
    pool = ThreadPool(int(workers))
    # Threads suit this I/O-bound work; map blocks until all finish.
    pool.map(download_video_link, down_list)
    pool.close()
    pool.join()
| #---------------------------------------------------------------------- | |
def main(link, resolution):
    """Download every photo of the album at *link* at *resolution*."""
    album_html = page_reader(link)
    photo_dict = page_parser(album_html)
    tasks = parse_list(photo_dict, resolution)
    downloader(tasks, 5)
if __name__=='__main__':
    # Guard against missing arguments (the original raised IndexError).
    if len(sys.argv) < 3:
        sys.exit('Usage: python 163pp.py [murl|surl|lurl|turl|qurl] URL1 URL2...')
    resolution = sys.argv[1]
    argv_list = sys.argv[2:]
    for link in argv_list:
        # page_parser chdirs into each album folder, so reset to the
        # starting directory before every album.
        os.chdir(LOCATION_DIR)
        main(link, resolution)
    print('Done!')