Skip to content

Instantly share code, notes, and snippets.

@denilsonsa
Created January 16, 2026 19:24
Show Gist options
  • Select an option

  • Save denilsonsa/3077d8fdd63b8261034009a58ec7542e to your computer and use it in GitHub Desktop.

Select an option

Save denilsonsa/3077d8fdd63b8261034009a58ec7542e to your computer and use it in GitHub Desktop.
download_twitpic.py

This is a small Python 2 script written in 2011 to download a bunch of images from the (defunct) website twitpic.com.

Back then, Twitter was really a microblogging platform that only allowed short text messages and nothing else. TwitPic was a third-party website to host user-submitted images with short URLs, and those URLs could be embedded into normal tweets.

Around that time, I was playing a lot of Super Meat Boy, and for some reason I wanted to download and view all the images posted in that Twitter/TwitPic account. So I wrote this script. I ran it like this:

./download_twitpic.py  http://twitpic.com/photos/SuperMeatBoy?page={1..9}

The script managed to download about 150 images and around 50MB.

Nowadays, this script is obsolete. Not only is the website gone, but the script would also need to be ported to Python 3. Regardless, I'm sharing it with the world, as it can be the basis (or the inspiration) for other scripts with similar purposes.

#!/usr/bin/env python2
# -*- coding: utf-8 -*-
# vi:ts=4 sw=4 et
import os
import os.path
import re
# Directory (relative to the current working directory) under which all
# downloaded pages, captions and images are stored.
GLOBAL_BASEDIR = 'twitpic'
def urlopen(url, referer=None):
    '''Opens `url` and returns a context-manager wrapper for the response.

    A fixed User-agent header is always sent; a Referer header is added
    only when `referer` is truthy.
    '''
    import contextlib
    import urllib2
    request_headers = {
        'User-agent': "I just want to download Super Meat Boy pics and won't abuse the service",
    }
    if referer:
        request_headers['Referer'] = referer
    request = urllib2.Request(url=url, headers=request_headers)
    # contextlib.closing() lets callers use "with urlopen(...) as u:".
    return contextlib.closing(urllib2.urlopen(request))
class TwitpicPicture(object):
    '''Handles a picture page, like this: http://twitpic.com/4g0ahu

    Available attributes/properties/methods:

    .id              = The twitpic id (AKA hash)
    .http_url        = URL for this twitpic
    .http_url_full   = URL for the full version of this twitpic
    .local_dir       = The local directory where the files are stored
    .local_html      = The local HTML file for the twitpic page
    .local_html_full = The local HTML file for the twitpic full page

    Metadata extracted from the page HTML:

    .date
    .caption
    .img_name
    .img_full_url (only valid for a few minutes)
    .local_img_full (local path to the image)
    '''

    # Dots are escaped so only a literal "twitpic.com" matches; the optional
    # "/full" suffix is excluded from the captured id.
    re_twitpic_url = re.compile(r'https?://twitpic\.com/(?P<id>[^/]+)(/full)?')
    # The extension's dot is escaped, so names like "foo_gif" no longer match.
    re_img_name = re.compile(r'/(?P<name>[a-zA-Z0-9_.]+\.(gif|png|jpe?g))')

    base_dir = GLOBAL_BASEDIR

    def __init__(self, url_or_id):
        '''Receives either a full URL to the twitpic page, or just the hash.'''
        match = self.re_twitpic_url.match(url_or_id)
        if match:
            self.id = match.group('id')
        else:
            self.id = url_or_id

        # Metadata caches, filled lazily by parse_page()/parse_page_full().
        self._date = None
        self._caption = None
        self._img_name = None
        self._img_full_url = None

    @property
    def http_url(self):
        return 'http://twitpic.com/{0}'.format(self.id)

    @property
    def http_url_full(self):
        return 'http://twitpic.com/{0}/full'.format(self.id)

    @property
    def local_dir(self):
        return os.path.join(self.base_dir, self.id)

    @property
    def local_html(self):
        return os.path.join(self.local_dir, 'index.html')

    @property
    def local_html_full(self):
        return os.path.join(self.local_dir, 'full.html')

    @property
    def local_img_full(self):
        # Note: accessing this may trigger a download of the "full" page,
        # because self.img_name is parsed from it.
        return os.path.join(self.local_dir, self.img_name)

    @property
    def date(self):
        if self._date is None:
            self.parse_page()
        return self._date

    @property
    def caption(self):
        if self._caption is None:
            self.parse_page()
        return self._caption

    @property
    def img_name(self):
        if self._img_name is None:
            self.parse_page_full()
        return self._img_name

    @property
    def img_full_url(self):
        if self._img_full_url is None:
            self.parse_page_full()
        return self._img_full_url

    def mkdir(self):
        '''Creates the directory where everything will be saved. Ignores any
        errors.
        '''
        try:
            # makedirs() also creates the base directory on the first run,
            # which a plain mkdir() could not do.
            os.makedirs(self.local_dir)
        except OSError:
            # Most likely the directory already exists.
            pass

    def open_page(self, which_one='index'):
        '''Loads a page (or image) either from the web or from local disk, and
        returns a file-like object.

        When loading from the web, automatically saves a local copy and returns
        a file-like object for the local copy.
        '''
        # Which page? The normal one, or the full one?
        if which_one == 'index':
            url = self.http_url
            local_file = self.local_html
            referer = None
        elif which_one == 'full':
            url = self.http_url_full
            local_file = self.local_html_full
            referer = None
        elif which_one == 'img_full':
            url = self.img_full_url
            local_file = self.local_img_full
            # The full image is fetched with the full page as Referer.
            referer = self.http_url_full
        else:
            raise ValueError('Unknown page type "{0}"'.format(which_one))

        if not os.path.isfile(local_file):
            # Load from web (and save a local copy)
            self.mkdir()
            with urlopen(url, referer) as u:
                with open(local_file, 'wb') as f:
                    f.write(u.read())
                # Setting the local modified time to the header received from
                # server
                import email.utils
                date_str = u.headers.get('Last-Modified', None)
                if date_str:
                    date_tuple = email.utils.parsedate_tz(date_str)
                    timestamp = email.utils.mktime_tz(date_tuple)
                    os.utime(local_file, (timestamp, timestamp))

        # Load from local copy
        return open(local_file, 'rb')

    def open_page_full(self):
        return self.open_page(which_one='full')

    def open_img_full(self):
        return self.open_page(which_one='img_full')

    def parse_page(self):
        '''Parses the "index" page, filling self._date and self._caption.

        Also saves the caption to a "caption.txt" file. Returns the lxml
        root element, in case the caller wants to extract something else.
        '''
        import datetime
        import lxml.html
        from lxml.cssselect import CSSSelector

        root = lxml.html.parse(self.open_page()).getroot()

        # Getting the photo date:
        # <p><span id="photo-info-name"> Team Meat</span>
        #     April 2, 2011 </p>
        date_sel = CSSSelector('#photo-info-name')
        date_string = date_sel(root)[0].tail.strip()
        self._date = datetime.datetime.strptime(date_string, '%B %d, %Y')

        # Getting the caption (the photo text)
        caption_sel = CSSSelector('#view-photo-caption')
        self._caption = caption_sel(root)[0].text.strip()
        with open(os.path.join(self.local_dir, 'caption.txt'), 'wb') as f:
            # Encode explicitly: writing a unicode caption to a binary file
            # would otherwise crash on non-ASCII characters.
            f.write(self._caption.encode('utf-8'))

        return root

    def parse_page_full(self):
        '''Parses the "full" page, filling self._img_name and
        self._img_full_url. Returns the lxml root element.
        '''
        import lxml.html
        from lxml.cssselect import CSSSelector

        root = lxml.html.parse(self.open_page_full()).getroot()

        # Getting the image name
        img_sel = CSSSelector('body > img')
        img_el = img_sel(root)[0]
        img_src = img_el.get('src')
        img_name = self.re_img_name.search(img_src).group('name')

        # The caption is also available as image alternate text
        # img_alt = img_el.get('alt')

        self._img_full_url = img_src
        self._img_name = img_name
        return root
def get_twitpic_pictures_from_profile_page(url):
    '''Generator that receives a profile/photos page URL and yields the
    absolute URL of each individual twitpic linked from it.

    Sample URLs:
    http://twitpic.com/photos/SuperMeatBoy
    http://twitpic.com/photos/SuperMeatBoy?page=2
    '''
    with urlopen(url) as response:
        import lxml.html
        from lxml.cssselect import CSSSelector
        document = lxml.html.parse(response).getroot()
        # Links on the page are relative; make them absolute before yielding.
        document.make_links_absolute()
        select_photo_links = CSSSelector('.user-photo a')
        for anchor in select_photo_links(document):
            yield anchor.get('href')
def process_one_url(url):
    '''Downloads everything for a single twitpic URL: both HTML pages, the
    caption and the full-size image.
    '''
    print 'Processing {0}'.format(url)
    tp = TwitpicPicture(url)
    # Accessing .caption forces the "index" page to be fetched and parsed.
    print ' Caption: {0}'.format(tp.caption)
    # Ensure local copies of both HTML pages and the full image exist.
    tp.open_page()
    tp.open_page_full()
    tp.open_img_full()
def main(args):
    '''Quick-and-dirty entry point. Each argument is either a profile
    "photos" page URL (every picture linked from it is downloaded) or the
    URL of a single twitpic.
    '''
    if len(args) == 0:
        print 'Huh... Just pass all URLs as parameters...'
        print 'TODO: write a nice parameter parsing code. Oh, and a proper help message.'
    import re
    # Profile pages must be tested first, because they also match the
    # generic re_twitpic pattern below.
    re_profile = re.compile(r'https?://twitpic.com/photos/.+')
    re_twitpic = re.compile(r'https?://twitpic.com/.+')
    for url in args:
        if re_profile.match(url):
            print 'Parsing {0}'.format(url)
            for linked_url in get_twitpic_pictures_from_profile_page(url):
                process_one_url(linked_url)
        elif re_twitpic.match(url):
            process_one_url(url)
        else:
            print 'Invalid URL: {0}'.format(url)
# Run only when executed as a script (not when imported as a module).
if __name__ == '__main__':
    import sys
    main(sys.argv[1:])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment