|
#!/usr/bin/env python2 |
|
# -*- coding: utf-8 -*- |
|
# vi:ts=4 sw=4 et |
|
|
|
import os |
|
import os.path |
|
import re |
|
|
|
|
|
# Root directory (relative to the current working directory) under which
# each picture gets its own sub-directory, named after the twitpic id.
GLOBAL_BASEDIR = 'twitpic'
|
|
|
|
|
def urlopen(url, referer=None):
    '''Opens `url` over HTTP and returns the response wrapped in
    contextlib.closing(), so it can be used in a `with` statement.

    A fixed, descriptive User-agent header is always sent. When `referer`
    is truthy it is sent as the Referer header.
    '''
    import contextlib
    import urllib2

    request_headers = {}
    request_headers['User-agent'] = "I just want to download Super Meat Boy pics and won't abuse the service"
    if referer:
        request_headers['Referer'] = referer

    request = urllib2.Request(url=url, headers=request_headers)
    response = urllib2.urlopen(request)
    return contextlib.closing(response)
|
|
|
|
|
class TwitpicPicture(object):
    '''Handles a picture page, like this: http://twitpic.com/4g0ahu

    Pages and the image are downloaded on first use and cached on disk
    under base_dir/<id>/; later accesses read the local copy.

    Available attributes/properties/methods:
      .id              = The twitpic id (AKA hash)
      .http_url        = URL for this twitpic
      .http_url_full   = URL for the full version of this twitpic
      .local_dir       = The local directory where the files are stored
      .local_html      = The local HTML file for the twitpic page
      .local_html_full = The local HTML file for the twitpic full page

    Metadata extracted from the page HTML (parsed lazily on first access):
      .date
      .caption
      .img_name
      .img_full_url   (only valid for a few minutes)
      .local_img_full (local path to the image)
    '''

    # The dot in the host name is escaped so that only twitpic.com matches,
    # and the id stops at '/', '?' or '#' so a query string or fragment is
    # never captured as part of the id.
    re_twitpic_url = re.compile(r'https?://twitpic\.com/(?P<id>[^/?#]+)(/full)?')
    # The dot before the extension is escaped; unescaped it matched any
    # character, so a path segment such as "/xxpng/" was taken as the name.
    re_img_name = re.compile(r'/(?P<name>[a-zA-Z0-9_.]+\.(gif|png|jpe?g))')

    base_dir = GLOBAL_BASEDIR

    def __init__(self, url_or_id):
        '''Receives either a full URL to the twitpic page, or just the hash.'''
        match = self.re_twitpic_url.match(url_or_id)
        if match:
            self.id = match.group('id')
        else:
            self.id = url_or_id

        # Metadata, filled in lazily by parse_page() / parse_page_full().
        self._date = None
        self._caption = None
        self._img_name = None
        self._img_full_url = None

    @property
    def http_url(self):
        '''URL of the normal picture page.'''
        return 'http://twitpic.com/{0}'.format(self.id)

    @property
    def http_url_full(self):
        '''URL of the "full" picture page.'''
        return 'http://twitpic.com/{0}/full'.format(self.id)

    @property
    def local_dir(self):
        '''Local directory holding every file of this picture.'''
        return os.path.join(self.base_dir, self.id)

    @property
    def local_html(self):
        '''Local path of the cached normal page.'''
        return os.path.join(self.local_dir, 'index.html')

    @property
    def local_html_full(self):
        '''Local path of the cached "full" page.'''
        return os.path.join(self.local_dir, 'full.html')

    @property
    def local_img_full(self):
        '''Local path of the image (triggers parsing of the full page).'''
        return os.path.join(self.local_dir, self.img_name)

    @property
    def date(self):
        '''Picture date as a datetime, parsed from the normal page.'''
        if self._date is None:
            self.parse_page()
        return self._date

    @property
    def caption(self):
        '''Picture caption text, parsed from the normal page.'''
        if self._caption is None:
            self.parse_page()
        return self._caption

    @property
    def img_name(self):
        '''Image file name, parsed from the full page.'''
        if self._img_name is None:
            self.parse_page_full()
        return self._img_name

    @property
    def img_full_url(self):
        '''Image URL as found in the full page.'''
        if self._img_full_url is None:
            self.parse_page_full()
        return self._img_full_url

    def mkdir(self):
        '''Creates the directory where everything will be saved, including
        any missing parent (such as base_dir).

        An already existing directory is not an error. Any other failure is
        re-raised, since nothing could be saved afterwards anyway.
        '''
        try:
            os.makedirs(self.local_dir)
        except OSError:
            # Best effort: only complain when the directory is really
            # missing after the attempt.
            if not os.path.isdir(self.local_dir):
                raise

    def open_page(self, which_one='index'):
        '''Loads a page (or image) either from the web or from local disk, and
        returns a file-like object opened in binary read mode.

        `which_one` is 'index' (normal page), 'full' (full page) or
        'img_full' (the image itself); anything else raises ValueError.

        When loading from the web, automatically saves a local copy and
        returns a file-like object for the local copy. The caller is
        responsible for closing the returned file.
        '''
        # Which page? The normal one, or the full one?
        if which_one == 'index':
            url = self.http_url
            local_file = self.local_html
            referer = None
        elif which_one == 'full':
            url = self.http_url_full
            local_file = self.local_html_full
            referer = None
        elif which_one == 'img_full':
            url = self.img_full_url
            local_file = self.local_img_full
            referer = self.http_url_full
        else:
            raise ValueError('Unknown page type "{0}"'.format(which_one))

        if not os.path.isfile(local_file):
            # Load from web (and save a local copy)
            import email.utils

            self.mkdir()
            with urlopen(url, referer) as u:
                # Read everything BEFORE creating the local file, so a failed
                # download never leaves an empty file that would later be
                # mistaken for a valid cached copy.
                data = u.read()
                date_str = u.headers.get('Last-Modified', None)

            with open(local_file, 'wb') as f:
                f.write(data)

            # Setting the local modified time to the header received from
            # server. parsedate_tz() returns None for unparsable dates.
            if date_str:
                date_tuple = email.utils.parsedate_tz(date_str)
                if date_tuple is not None:
                    timestamp = email.utils.mktime_tz(date_tuple)
                    os.utime(local_file, (timestamp, timestamp))

        # Load from local copy
        return open(local_file, 'rb')

    def open_page_full(self):
        '''Shortcut for open_page('full').'''
        return self.open_page(which_one='full')

    def open_img_full(self):
        '''Shortcut for open_page('img_full').'''
        return self.open_page(which_one='img_full')

    def parse_page(self):
        '''Parses the normal page, filling in the date and the caption.

        Also writes the caption to caption.txt inside local_dir (UTF-8
        encoded when it is not already a byte string). Returns the lxml
        root element of the page.
        '''
        import datetime
        import lxml.html
        from lxml.cssselect import CSSSelector

        # Close the cached file as soon as it has been parsed.
        with self.open_page() as page:
            root = lxml.html.parse(page).getroot()

        # Getting the photo date:
        # <p><span id="photo-info-name"> Team Meat</span>
        # April 2, 2011 </p>
        date_sel = CSSSelector('#photo-info-name')
        date_string = date_sel(root)[0].tail.strip()
        self._date = datetime.datetime.strptime(date_string, '%B %d, %Y')

        # Getting the caption (the photo text). `.text` is None when the
        # element has no leading text; treat that as an empty caption.
        caption_sel = CSSSelector('#view-photo-caption')
        caption_text = caption_sel(root)[0].text
        self._caption = (caption_text or '').strip()

        # The file is opened in binary mode, so text must be encoded
        # explicitly; relying on the implicit ASCII conversion fails for
        # any non-ASCII caption.
        caption_bytes = self._caption
        if not isinstance(caption_bytes, bytes):
            caption_bytes = caption_bytes.encode('utf-8')
        with open(os.path.join(self.local_dir, 'caption.txt'), 'wb') as f:
            f.write(caption_bytes)

        return root

    def parse_page_full(self):
        '''Parses the full page, filling in the image URL and image name.

        Raises ValueError when no image file name can be extracted from the
        image URL. Returns the lxml root element of the page.
        '''
        import lxml.html
        from lxml.cssselect import CSSSelector

        # Close the cached file as soon as it has been parsed.
        with self.open_page_full() as page:
            root = lxml.html.parse(page).getroot()

        # Getting the image name
        img_sel = CSSSelector('body > img')
        img_el = img_sel(root)[0]
        img_src = img_el.get('src')
        name_match = self.re_img_name.search(img_src or '')
        if name_match is None:
            raise ValueError(
                'Cannot find an image name in "{0}"'.format(img_src))
        # The caption is also available as image alternate text
        # img_alt = img_el.get('alt')

        self._img_full_url = img_src
        self._img_name = name_match.group('name')

        return root
|
|
|
|
|
def get_twitpic_pictures_from_profile_page(url):
    '''Iterator that receives a profile page URL and yields the absolute
    URL of each picture linked from it.

    Sample URLs:
      http://twitpic.com/photos/SuperMeatBoy
      http://twitpic.com/photos/SuperMeatBoy?page=2

    The page is downloaded and parsed completely before the first link is
    yielded, so the HTTP connection is not kept open while the caller
    processes each picture. Anchors without an href are skipped.
    '''
    import lxml.html
    from lxml.cssselect import CSSSelector

    with urlopen(url) as u:
        root = lxml.html.parse(u).getroot()
        root.make_links_absolute()

    pics_sel = CSSSelector('.user-photo a')
    for a in pics_sel(root):
        href = a.get('href')
        # An <a> without href would yield None, which is not a usable URL.
        if href:
            yield href
|
|
|
|
|
def process_one_url(url):
    '''Downloads (or reuses the cached copy of) everything about one
    picture: the normal page, the full page and the image itself.

    `url` is either a picture page URL or a bare twitpic id. Progress and
    the picture caption are printed to standard output.
    '''
    print('Processing {0}'.format(url))
    tp = TwitpicPicture(url)

    caption = tp.caption
    if not isinstance(caption, str) and hasattr(caption, 'encode'):
        # Python 2 `unicode` caption: formatting it into a byte string
        # would use the ASCII codec and fail on any non-ASCII character,
        # so encode it explicitly first.
        caption = caption.encode('utf-8')
    print(' Caption: {0}'.format(caption))

    # Each call makes sure the local copy exists. The returned file object
    # is not needed here, so close it right away instead of leaking it.
    for opener in (tp.open_page, tp.open_page_full, tp.open_img_full):
        opener().close()
|
|
|
|
|
def main(args):
    '''Processes every URL in `args` (a list of strings).

    A profile URL (http://twitpic.com/photos/<user>) is expanded into the
    pictures it links to; any other twitpic.com URL is treated as a single
    picture page. Anything else is reported as invalid and skipped.
    With no arguments, a short usage hint is printed and nothing else is
    done.
    '''
    if not args:
        print('Huh... Just pass all URLs as parameters...')
        print('TODO: write a nice parameter parsing code. Oh, and a proper help message.')
        return

    # The dots are escaped so only the real twitpic.com host is accepted.
    # The profile pattern must be tested first: it is a special case of
    # the generic one.
    re_profile = re.compile(r'https?://twitpic\.com/photos/.+')
    re_twitpic = re.compile(r'https?://twitpic\.com/.+')

    for url in args:
        if re_profile.match(url):
            print('Parsing {0}'.format(url))
            for linked_url in get_twitpic_pictures_from_profile_page(url):
                process_one_url(linked_url)
        elif re_twitpic.match(url):
            process_one_url(url)
        else:
            print('Invalid URL: {0}'.format(url))
|
|
|
# Script entry point: every command-line argument is handed to main()
# as a URL to process.
if __name__ == '__main__':
    import sys

    cli_args = sys.argv[1:]
    main(cli_args)