Skip to content

Instantly share code, notes, and snippets.

@ymkim92
Last active September 7, 2024 14:56
Show Gist options
  • Select an option

  • Save ymkim92/b7b744f1effccd8c3331509b0dba4f1c to your computer and use it in GitHub Desktop.

Select an option

Save ymkim92/b7b744f1effccd8c3331509b0dba4f1c to your computer and use it in GitHub Desktop.
podbbang에서 인기 에피소드를 추려서 다운로드 받는 Python script (using BeautifulSoup)
import re
import requests
from itertools import count
from pathlib import Path
from bs4 import BeautifulSoup
from clint.textui import progress
import sys
# Number of episode-list pages requested per channel (pages 1..SCAN_PAGES).
SCAN_PAGES = 2
# Upper bound on how many episodes the script entry point downloads.
NUM_DOWNLOADS = 17
# Characters deleted from episode titles before they are used as file names.
# The backslash in front of "." is spelled "\\" so the literal contains no
# invalid escape sequence; the resulting string value is unchanged.
CHARS_TO_REMOVE_FROM_FILENAME = "|\\?*<\":>+[]/'·\\.,()’"
# Module-level accumulator: get_list_by_like() appends to it and returns it.
episode_list = []
def get_list_by_like(pid):
    """Scrape the first SCAN_PAGES episode-list pages of one channel.

    For every ``li > dl`` element the title, like count and ``onclick``
    script are read. When the script matches ``'<digits>', '<type>/<subtype>'``
    and the type is not a video, a dict with the keys ``pid``, ``eid``,
    ``title`` (passed through get_android_filename) and ``like`` (int) is
    appended to the module-level ``episode_list``.

    Args:
        pid: Channel id substituted into the episode-list URL.

    Returns:
        The module-level ``episode_list``. The list is shared, so entries
        appended by earlier calls are still present. When an element lacks
        the expected attribute (KeyError), scanning stops and the episodes
        gathered so far are returned instead of ``None``, so callers can
        always sort/iterate the result.
    """
    for page in range(1, SCAN_PAGES + 1):
        url = 'http://www.podbbang.com/podbbangchnew/episode_list?id={pid}&page={page}'.format(pid=pid, page=page)
        print(url)
        response = requests.get(url)
        response.encoding = 'utf8'
        soup = BeautifulSoup(response.text, 'lxml')
        for dl_tag in soup.select('li > dl'):
            # Only the attribute lookups can raise KeyError, so the try body
            # is limited to the element parsing.
            try:
                title = dl_tag.find('dt')['title']
                new_title = get_android_filename(title)
                like = int(dl_tag.find('dd', class_='dd_heart').text.replace(',', ''))
                js = dl_tag['onclick']
            except KeyError:
                # A missing attribute is treated as the end of the listing.
                print('Ended')
                return episode_list

            matched = re.search(r"'(\d+)',\s*'(\w+/\w+)'", js)
            if not matched:
                continue
            eid, content_type = matched.groups()
            if 'video' in content_type:
                continue
            episode_list.append({
                'pid': pid,
                'eid': eid,
                'title': new_title,
                'like': like
            })
            print(content_type)
            # Sanity check kept from the original; note it is skipped under -O.
            assert 'audio' in content_type
    return episode_list
def get_list_by_keyword(pid):
    """Return the scraped episode list for channel *pid*.

    NOTE(review): despite the name, no keyword filtering is performed. The
    previous body was a line-for-line copy of get_list_by_like(), so this
    function now delegates to it rather than keeping two copies in sync.

    Args:
        pid: Channel id substituted into the episode-list URL.

    Returns:
        Whatever get_list_by_like(pid) returns.
    """
    return get_list_by_like(pid)
def mp3_download(pid, eid, title):
    """Download one episode into ``<title>.mp3`` in the current directory.

    The download is skipped when a file of that name already exists and its
    size equals the Content-Length reported by the server. A non-200 response
    is reported on stdout and nothing is written.

    Args:
        pid: Channel id, used in the download URL and the Referer header.
        eid: Episode id, used in the download URL.
        title: File name stem; ``.mp3`` is appended.

    Returns:
        None.
    """
    url = 'http://www.podbbang.com/download?pid={pid}&eid={eid}'.format(pid=pid, eid=eid)
    headers = {
        'Referer': 'http://www.podbbang.com/ch/{pid}'.format(pid=pid),
    }
    r = requests.get(url, headers=headers, stream=True)
    try:
        if r.status_code != 200:
            print('download failed. status code = {}'.format(r.status_code))
            return

        filepath = Path('{}.mp3'.format(title))
        # Content-Length may be absent or malformed; treat that as "size
        # unknown" instead of crashing on int(None).
        content_length = r.headers.get('content-length')
        if content_length is not None and content_length.strip().isdigit():
            total_length = int(content_length)
        else:
            total_length = None

        if (total_length is not None and filepath.exists()
                and filepath.stat().st_size == total_length):
            print('{} - File exists.'.format(title))
            return

        print('{} - Downloading'.format(title))
        chunk_size = 1024
        chunks = r.iter_content(chunk_size=chunk_size)
        if total_length is not None:
            # The progress bar needs an expected item count, which can only
            # be computed when the total size is known.
            expected_size = (total_length // chunk_size) + 1
            chunks = progress.bar(chunks, expected_size=expected_size)
        with filepath.open('wb') as f:
            for chunk in chunks:
                f.write(chunk)
    finally:
        # With stream=True the connection stays checked out until the
        # response is closed, including on the early-return paths above.
        r.close()
def get_android_filename(filename):
    """Return *filename* without any character listed in
    CHARS_TO_REMOVE_FROM_FILENAME."""
    return ''.join(
        ch for ch in filename if ch not in CHARS_TO_REMOVE_FROM_FILENAME
    )
# Channel display name -> numeric channel id; the script entry point looks a
# name up here and passes the id on as ``pid``.
podcast_dict = {
    '법륜스님': 1805,
    '신과함께': 15781,
    '매불쇼': 16898,
}
if __name__ == '__main__':
    # Select the channel to scan by its key in podcast_dict.
    # pname = '법륜스님'
    pname = '신과함께'
    # pname = '매불쇼'
    # Treat a None result from the scraper as "no episodes" so that sorted()
    # below does not raise TypeError.
    elist = get_list_by_like(podcast_dict[pname]) or []
    # Most-liked episodes first.
    newlist = sorted(elist, key=lambda k: k['like'], reverse=True)
    cnt = 1
    for ep in newlist:
        if NUM_DOWNLOADS < cnt:
            break
        # Episodes whose title contains 'P2-Live' are skipped and do not
        # count towards NUM_DOWNLOADS.
        if 'P2-Live' not in ep['title']:
            mp3_download(ep['pid'], ep['eid'], ep['title'])
            cnt += 1
@ymkim92
Copy link
Author

ymkim92 commented Mar 24, 2019

위 스크립트를 실행하기 위해서는 아래 4가지 모듈을 설치해야 합니다. (스크립트가 BeautifulSoup의 'lxml' 파서를 사용하므로 lxml도 필요합니다.)
pip install beautifulsoup4 requests clint lxml

본 스크립트는 아래 스크립트를 수정하여 작성했습니다.
https://gist.github.com/allieus/c374d78ee624541af10450ba4431138e

@ymkim92
Copy link
Author

ymkim92 commented Mar 24, 2019

스크립트 상단의 아래 두 변수 값을 바꿔서 동작을 조정하세요. (아래 값은 예시입니다.)
SCAN_PAGES = 5 # 얼마나 많은 페이지들을 대상으로 할 것인지?
NUM_DOWNLOADS = 10 # like(하트) 기준으로 상위 몇 개의 에피소드를 다운로드 할 지?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment