denilsonsa/README.md

## README.md

      
    Raw
  

              README.md
            
          
    This is a small Python 2 script written in 2011 to download a bunch of images from the (defunct) website twitpic.com.
Back then, Twitter was really a microblogging platform that only allowed short text messages and nothing else. TwitPic was a third-party website to host user-submitted images with short URLs, and those URLs could be embedded into normal tweets.
Around that time, I was playing a lot of Super Meat Boy, and for some reason I wanted to download and view all the images posted by the SuperMeatBoy Twitter/TwitPic account. So I wrote this script. I ran it like this:
./download_twitpic.py  http://twitpic.com/photos/SuperMeatBoy?page={1..9}
The script managed to download about 150 images, totalling around 50 MB.
Nowadays, this script is obsolete. Not only is the website gone, but the script would also need to be ported to Python 3. Regardless, I'm sharing it with the world, as it can serve as the basis (or the inspiration) for other scripts with similar purposes.

  
## download_twitpic.py
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
# vi:ts=4 sw=4 et

import os
import os.path
import re


# Root directory for all downloads. It is a relative path, so it is resolved
# against the current working directory. TwitpicPicture uses it as its
# base_dir and stores each picture in a sub-directory named after its id.
GLOBAL_BASEDIR = 'twitpic'


def urlopen(url, referer=None):
    '''Opens `url` with urllib2 and returns the response wrapped for `with`.

    Every request carries a fixed, descriptive User-agent header. A Referer
    header is added only when `referer` is truthy.

    The return value is the contextlib.closing() wrapper around the urllib2
    response, so `with urlopen(...) as u:` closes the response on exit.
    '''
    from contextlib import closing
    import urllib2

    request_headers = {
        'User-agent': "I just want to download Super Meat Boy pics and won't abuse the service",
    }
    if referer:
        request_headers['Referer'] = referer

    request = urllib2.Request(url=url, headers=request_headers)
    response = urllib2.urlopen(request)
    return closing(response)


class TwitpicPicture(object):
    '''Handles a picture page, like this: http://twitpic.com/4g0ahu

    Available attributes/properties/methods:
    .id              = The twitpic id (AKA hash)
    .http_url        = URL for this twitpic
    .http_url_full   = URL for the full version of this twitpic
    .local_dir       = The local directory where the files are stored
    .local_html      = The local HTML file for the twitpic page
    .local_html_full = The local HTML file for the twitpic full page

    Metadata extracted from the page HTML (fetched and parsed on first
    access, then cached on the instance):
    .date
    .caption
    .img_name
    .img_full_url  (only valid for a few minutes)
    .local_img_full  (local path to the image)

    '''

    # Dots are escaped so that only the literal host "twitpic.com" matches.
    # The id stops at "/", "?" or "#", which keeps a query string or fragment
    # out of the id (and therefore out of the local directory name).
    re_twitpic_url = re.compile(r'https?://twitpic\.com/(?P<id>[^/?#]+)(/full)?')
    # File name of the image inside its URL. The dot in front of the extension
    # is escaped so that it has to be a real dot.
    re_img_name = re.compile(r'/(?P<name>[a-zA-Z0-9_.]+\.(gif|png|jpe?g))')

    # Directory under which one sub-directory per twitpic id is created.
    base_dir = GLOBAL_BASEDIR

    def __init__(self, url_or_id):
        '''Receives either a full URL to the twitpic page, or just the hash.'''

        match = self.re_twitpic_url.match(url_or_id)
        if match:
            self.id = match.group('id')
        else:
            # Anything that is not a twitpic URL is taken verbatim as the id.
            self.id = url_or_id

        # Metadata, filled in lazily by parse_page() and parse_page_full().
        self._date = None
        self._caption = None
        self._img_name = None
        self._img_full_url = None

    @property
    def http_url(self):
        '''URL of the normal twitpic page.'''
        return 'http://twitpic.com/{0}'.format(self.id)

    @property
    def http_url_full(self):
        '''URL of the "full" twitpic page.'''
        return 'http://twitpic.com/{0}/full'.format(self.id)

    @property
    def local_dir(self):
        '''Local directory of this twitpic: base_dir joined with the id.'''
        return os.path.join(self.base_dir, self.id)

    @property
    def local_html(self):
        '''Local copy of the normal page.'''
        return os.path.join(self.local_dir, 'index.html')

    @property
    def local_html_full(self):
        '''Local copy of the "full" page.'''
        return os.path.join(self.local_dir, 'full.html')

    @property
    def local_img_full(self):
        '''Local path of the image. Reading it may trigger parse_page_full().'''
        return os.path.join(self.local_dir, self.img_name)

    @property
    def date(self):
        '''Photo date as a datetime.datetime, parsed on first access.'''
        if self._date is None:
            self.parse_page()
        return self._date

    @property
    def caption(self):
        '''Photo caption text, parsed on first access.'''
        if self._caption is None:
            self.parse_page()
        return self._caption

    @property
    def img_name(self):
        '''File name of the full image, parsed on first access.'''
        if self._img_name is None:
            self.parse_page_full()
        return self._img_name

    @property
    def img_full_url(self):
        '''URL of the full image, parsed on first access.'''
        if self._img_full_url is None:
            self.parse_page_full()
        return self._img_full_url

    def mkdir(self):
        '''Creates the directory where everything will be saved, together
        with any missing parent directory (such as base_dir itself).

        A directory that already exists is not an error. Any other failure
        is re-raised as OSError here, instead of showing up later as an
        unrelated error when a file inside the directory is opened.
        '''

        try:
            os.makedirs(self.local_dir)
        except OSError:
            # makedirs() also fails when the directory is already there,
            # which is the normal case on every run after the first one.
            if not os.path.isdir(self.local_dir):
                raise

    def open_page(self, which_one='index'):
        '''Loads a page (or image) either from the web or from local disk, and
        returns a file-like object.

        When loading from the web, automatically saves a local copy and returns
        a file-like object for the local copy. The caller is responsible for
        closing the returned file.

        which_one is 'index' (normal page), 'full' (full page) or 'img_full'
        (the image itself). Any other value raises ValueError.
        '''

        # Which page? The normal one, or the full one?
        if which_one == 'index':
            url = self.http_url
            local_file = self.local_html
            referer = None
        elif which_one == 'full':
            url = self.http_url_full
            local_file = self.local_html_full
            referer = None
        elif which_one == 'img_full':
            url = self.img_full_url
            local_file = self.local_img_full
            referer = self.http_url_full
        else:
            raise ValueError('Unknown page type "{0}"'.format(which_one))

        if not os.path.isfile(local_file):
            # Load from web (and save a local copy)
            self.mkdir()
            # Download into a temporary name first: an interrupted download
            # must not leave a truncated file that later passes for a valid
            # local copy.
            partial_file = local_file + '.part'
            with urlopen(url, referer) as u:
                with open(partial_file, 'wb') as f:
                    f.write(u.read())
                os.rename(partial_file, local_file)

                # Setting the local modified time to the header received from
                # server
                import email.utils
                date_str = u.headers.get('Last-Modified', None)
                if date_str:
                    date_tuple = email.utils.parsedate_tz(date_str)
                    # parsedate_tz() returns None for a header it cannot
                    # parse; the file then simply keeps its download time.
                    if date_tuple is not None:
                        timestamp = email.utils.mktime_tz(date_tuple)
                        os.utime(local_file, (timestamp, timestamp))

        # Load from local copy
        return open(local_file, 'rb')

    def open_page_full(self):
        '''Shortcut for open_page(which_one='full').'''
        return self.open_page(which_one='full')

    def open_img_full(self):
        '''Shortcut for open_page(which_one='img_full').'''
        return self.open_page(which_one='img_full')

    def parse_page(self):
        '''Parses the normal page and fills in the date and the caption.

        The caption is also written, UTF-8 encoded, to "caption.txt" inside
        local_dir.

        Returns the root element of the parsed HTML document.
        '''
        import datetime
        import lxml.html
        from lxml.cssselect import CSSSelector

        # parse() consumes the whole document, so the file is closed right
        # away instead of being left open until garbage collection.
        with self.open_page() as page:
            root = lxml.html.parse(page).getroot()

        # Getting the photo date:
        # <p><span id="photo-info-name"> Team Meat</span>
        #			  	April 2, 2011		  		</p>
        date_sel = CSSSelector('#photo-info-name')
        date_string = date_sel(root)[0].tail.strip()
        # NOTE: %B is the month name of the current locale.
        self._date = datetime.datetime.strptime(date_string, '%B %d, %Y')

        # Getting the caption (the photo text). An element without text has
        # .text set to None, which is treated as an empty caption.
        caption_sel = CSSSelector('#view-photo-caption')
        self._caption = (caption_sel(root)[0].text or '').strip()

        # The file is opened in binary mode, so a text (unicode) caption is
        # encoded explicitly; relying on the implicit ASCII conversion fails
        # for any caption with non-ASCII characters.
        caption_bytes = self._caption
        if not isinstance(caption_bytes, bytes):
            caption_bytes = caption_bytes.encode('utf-8')
        with open(os.path.join(self.local_dir, 'caption.txt'), 'wb') as f:
            f.write(caption_bytes)

        return root

    def parse_page_full(self):
        '''Parses the "full" page and fills in the image URL and file name.

        Returns the root element of the parsed HTML document.
        Raises ValueError when the image URL has no recognizable file name.
        '''
        import lxml.html
        from lxml.cssselect import CSSSelector

        with self.open_page_full() as page:
            root = lxml.html.parse(page).getroot()

        # Getting the image name
        img_sel = CSSSelector('body > img')
        img_el = img_sel(root)[0]
        img_src = img_el.get('src')
        name_match = self.re_img_name.search(img_src or '')
        if name_match is None:
            raise ValueError(
                'Cannot find an image file name in "{0}"'.format(img_src))
        # The caption is also available as image alternate text
        # img_alt = img_el.get('alt')

        self._img_full_url = img_src
        self._img_name = name_match.group('name')

        return root


def get_twitpic_pictures_from_profile_page(url):
    '''Iterator that receives a profile page and returns links to each
    individual picture page.

    Sample URLs:
    http://twitpic.com/photos/SuperMeatBoy
    http://twitpic.com/photos/SuperMeatBoy?page=2

    Yields the absolute href of every link matched by the CSS selector
    ".user-photo a". Links without an href attribute are skipped.
    '''

    import lxml.html
    from lxml.cssselect import CSSSelector

    # All links are collected while the response is open, and the response is
    # closed before the first link is handed out. Otherwise the connection
    # would stay open for as long as the caller takes to process every
    # picture (or forever, if the caller stops iterating early).
    with urlopen(url) as u:
        root = lxml.html.parse(u).getroot()
        root.make_links_absolute()

        pics_sel = CSSSelector('.user-photo a')
        hrefs = [a.get('href') for a in pics_sel(root)]

    for href in hrefs:
        if href:
            yield href


def process_one_url(url):
    '''Downloads everything for one twitpic: both pages and the image.

    `url` is passed to TwitpicPicture, so it may be a picture URL or a bare
    id. Progress and the caption are printed to standard output.
    '''
    # print with a single parenthesised argument behaves the same as the
    # Python 2 print statement and is also valid Python 3.
    print('Processing {0}'.format(url))
    tp = TwitpicPicture(url)

    caption = tp.caption
    try:
        caption_line = '  Caption: {0}'.format(caption)
    except UnicodeEncodeError:
        # Python 2: a unicode caption with non-ASCII characters cannot be
        # formatted into a byte string, so it is encoded explicitly.
        caption_line = '  Caption: {0}'.format(caption.encode('utf-8'))
    print(caption_line)

    # The open_* calls are only needed for their side effect (downloading and
    # saving a local copy), so each returned file is closed immediately.
    for opener in (tp.open_page, tp.open_page_full, tp.open_img_full):
        opener().close()


def main(args):
    '''Processes every URL in `args` (the command-line arguments).

    A profile URL (http://twitpic.com/photos/NAME) is expanded into the
    pictures linked from it; any other twitpic.com URL is handled as a single
    picture. Anything else is reported as invalid and skipped.
    '''
    if not args:
        print('Huh... Just pass all URLs as parameters...')
        print('TODO: write a nice parameter parsing code. Oh, and a proper help message.')
        return

    # The dots are escaped so that only the literal host "twitpic.com" is
    # accepted. The profile pattern is tested first because the generic
    # pattern matches profile URLs as well.
    re_profile = re.compile(r'https?://twitpic\.com/photos/.+')
    re_twitpic = re.compile(r'https?://twitpic\.com/.+')

    for url in args:
        if re_profile.match(url):
            print('Parsing {0}'.format(url))
            for linked_url in get_twitpic_pictures_from_profile_page(url):
                process_one_url(linked_url)
        elif re_twitpic.match(url):
            process_one_url(url)
        else:
            print('Invalid URL: {0}'.format(url))

if __name__ == '__main__':
    # Script entry point: every command-line argument after the program name
    # is handed to main() as a URL.
    from sys import argv
    main(argv[1:])
	#!/usr/bin/env python2
	# -*- coding: utf-8 -*-
	# vi:ts=4 sw=4 et

	import os
	import os.path
	import re


	# Root directory for all downloads. It is a relative path, so it is resolved
	# against the current working directory. TwitpicPicture uses it as its
	# base_dir and stores each picture in a sub-directory named after its id.
	GLOBAL_BASEDIR = 'twitpic'


	def urlopen(url, referer=None):
	    '''Opens `url` with urllib2 and returns the response wrapped for `with`.

	    Every request carries a fixed, descriptive User-agent header. A Referer
	    header is added only when `referer` is truthy.

	    The return value is the contextlib.closing() wrapper around the urllib2
	    response, so `with urlopen(...) as u:` closes the response on exit.
	    '''
	    import contextlib
	    import urllib2

	    headers = {
	        'User-agent': "I just want to download Super Meat Boy pics and won't abuse the service",
	    }
	    if referer:
	        headers['Referer'] = referer

	    return contextlib.closing(
	        urllib2.urlopen(
	            urllib2.Request(
	                url=url,
	                headers=headers
	            )
	        )
	    )


	class TwitpicPicture(object):
	    '''Handles a picture page, like this: http://twitpic.com/4g0ahu

	    Available attributes/properties/methods:
	    .id              = The twitpic id (AKA hash)
	    .http_url        = URL for this twitpic
	    .http_url_full   = URL for the full version of this twitpic
	    .local_dir       = The local directory where the files are stored
	    .local_html      = The local HTML file for the twitpic page
	    .local_html_full = The local HTML file for the twitpic full page

	    Metadata extracted from the page HTML (fetched and parsed on first
	    access, then cached on the instance):
	    .date
	    .caption
	    .img_name
	    .img_full_url  (only valid for a few minutes)
	    .local_img_full  (local path to the image)

	    '''

	    # Dots are escaped so that only the literal host "twitpic.com" matches.
	    # The id stops at "/", "?" or "#", which keeps a query string or fragment
	    # out of the id (and therefore out of the local directory name).
	    re_twitpic_url = re.compile(r'https?://twitpic\.com/(?P<id>[^/?#]+)(/full)?')
	    # File name of the image inside its URL. The extensions are alternatives
	    # separated by plain "|", and the dot in front of the extension is
	    # escaped so that it has to be a real dot.
	    re_img_name = re.compile(r'/(?P<name>[a-zA-Z0-9_.]+\.(gif|png|jpe?g))')

	    # Directory under which one sub-directory per twitpic id is created.
	    base_dir = GLOBAL_BASEDIR

	    def __init__(self, url_or_id):
	        '''Receives either a full URL to the twitpic page, or just the hash.'''

	        match = self.re_twitpic_url.match(url_or_id)
	        if match:
	            self.id = match.group('id')
	        else:
	            # Anything that is not a twitpic URL is taken verbatim as the id.
	            self.id = url_or_id

	        # Metadata, filled in lazily by parse_page() and parse_page_full().
	        self._date = None
	        self._caption = None
	        self._img_name = None
	        self._img_full_url = None

	    @property
	    def http_url(self):
	        '''URL of the normal twitpic page.'''
	        return 'http://twitpic.com/{0}'.format(self.id)

	    @property
	    def http_url_full(self):
	        '''URL of the "full" twitpic page.'''
	        return 'http://twitpic.com/{0}/full'.format(self.id)

	    @property
	    def local_dir(self):
	        '''Local directory of this twitpic: base_dir joined with the id.'''
	        return os.path.join(self.base_dir, self.id)

	    @property
	    def local_html(self):
	        '''Local copy of the normal page.'''
	        return os.path.join(self.local_dir, 'index.html')

	    @property
	    def local_html_full(self):
	        '''Local copy of the "full" page.'''
	        return os.path.join(self.local_dir, 'full.html')

	    @property
	    def local_img_full(self):
	        '''Local path of the image. Reading it may trigger parse_page_full().'''
	        return os.path.join(self.local_dir, self.img_name)

	    @property
	    def date(self):
	        '''Photo date as a datetime.datetime, parsed on first access.'''
	        if self._date is None:
	            self.parse_page()
	        return self._date

	    @property
	    def caption(self):
	        '''Photo caption text, parsed on first access.'''
	        if self._caption is None:
	            self.parse_page()
	        return self._caption

	    @property
	    def img_name(self):
	        '''File name of the full image, parsed on first access.'''
	        if self._img_name is None:
	            self.parse_page_full()
	        return self._img_name

	    @property
	    def img_full_url(self):
	        '''URL of the full image, parsed on first access.'''
	        if self._img_full_url is None:
	            self.parse_page_full()
	        return self._img_full_url

	    def mkdir(self):
	        '''Creates the directory where everything will be saved, together
	        with any missing parent directory (such as base_dir itself).

	        A directory that already exists is not an error. Any other failure
	        is re-raised as OSError here, instead of showing up later as an
	        unrelated error when a file inside the directory is opened.
	        '''

	        try:
	            os.makedirs(self.local_dir)
	        except OSError:
	            # makedirs() also fails when the directory is already there,
	            # which is the normal case on every run after the first one.
	            if not os.path.isdir(self.local_dir):
	                raise

	    def open_page(self, which_one='index'):
	        '''Loads a page (or image) either from the web or from local disk, and
	        returns a file-like object.

	        When loading from the web, automatically saves a local copy and returns
	        a file-like object for the local copy. The caller is responsible for
	        closing the returned file.

	        which_one is 'index' (normal page), 'full' (full page) or 'img_full'
	        (the image itself). Any other value raises ValueError.
	        '''

	        # Which page? The normal one, or the full one?
	        if which_one == 'index':
	            url = self.http_url
	            local_file = self.local_html
	            referer = None
	        elif which_one == 'full':
	            url = self.http_url_full
	            local_file = self.local_html_full
	            referer = None
	        elif which_one == 'img_full':
	            url = self.img_full_url
	            local_file = self.local_img_full
	            referer = self.http_url_full
	        else:
	            raise ValueError('Unknown page type "{0}"'.format(which_one))

	        if not os.path.isfile(local_file):
	            # Load from web (and save a local copy)
	            self.mkdir()
	            # Download into a temporary name first: an interrupted download
	            # must not leave a truncated file that later passes for a valid
	            # local copy.
	            partial_file = local_file + '.part'
	            with urlopen(url, referer) as u:
	                with open(partial_file, 'wb') as f:
	                    f.write(u.read())
	                os.rename(partial_file, local_file)

	                # Setting the local modified time to the header received from
	                # server
	                import email.utils
	                date_str = u.headers.get('Last-Modified', None)
	                if date_str:
	                    date_tuple = email.utils.parsedate_tz(date_str)
	                    # parsedate_tz() returns None for a header it cannot
	                    # parse; the file then simply keeps its download time.
	                    if date_tuple is not None:
	                        timestamp = email.utils.mktime_tz(date_tuple)
	                        os.utime(local_file, (timestamp, timestamp))

	        # Load from local copy
	        return open(local_file, 'rb')

	    def open_page_full(self):
	        '''Shortcut for open_page(which_one='full').'''
	        return self.open_page(which_one='full')

	    def open_img_full(self):
	        '''Shortcut for open_page(which_one='img_full').'''
	        return self.open_page(which_one='img_full')

	    def parse_page(self):
	        '''Parses the normal page and fills in the date and the caption.

	        The caption is also written, UTF-8 encoded, to "caption.txt" inside
	        local_dir.

	        Returns the root element of the parsed HTML document.
	        '''
	        import datetime
	        import lxml.html
	        from lxml.cssselect import CSSSelector

	        # parse() consumes the whole document, so the file is closed right
	        # away instead of being left open until garbage collection.
	        with self.open_page() as page:
	            root = lxml.html.parse(page).getroot()

	        # Getting the photo date:
	        # <p><span id="photo-info-name"> Team Meat</span>
	        # April 2, 2011 </p>
	        date_sel = CSSSelector('#photo-info-name')
	        date_string = date_sel(root)[0].tail.strip()
	        # NOTE: %B is the month name of the current locale.
	        self._date = datetime.datetime.strptime(date_string, '%B %d, %Y')

	        # Getting the caption (the photo text). An element without text has
	        # .text set to None, which is treated as an empty caption.
	        caption_sel = CSSSelector('#view-photo-caption')
	        self._caption = (caption_sel(root)[0].text or '').strip()

	        # The file is opened in binary mode, so a text (unicode) caption is
	        # encoded explicitly; relying on the implicit ASCII conversion fails
	        # for any caption with non-ASCII characters.
	        caption_bytes = self._caption
	        if not isinstance(caption_bytes, bytes):
	            caption_bytes = caption_bytes.encode('utf-8')
	        with open(os.path.join(self.local_dir, 'caption.txt'), 'wb') as f:
	            f.write(caption_bytes)

	        return root

	    def parse_page_full(self):
	        '''Parses the "full" page and fills in the image URL and file name.

	        Returns the root element of the parsed HTML document.
	        Raises ValueError when the image URL has no recognizable file name.
	        '''
	        import lxml.html
	        from lxml.cssselect import CSSSelector

	        with self.open_page_full() as page:
	            root = lxml.html.parse(page).getroot()

	        # Getting the image name
	        img_sel = CSSSelector('body > img')
	        img_el = img_sel(root)[0]
	        img_src = img_el.get('src')
	        name_match = self.re_img_name.search(img_src or '')
	        if name_match is None:
	            raise ValueError(
	                'Cannot find an image file name in "{0}"'.format(img_src))
	        # The caption is also available as image alternate text
	        # img_alt = img_el.get('alt')

	        self._img_full_url = img_src
	        self._img_name = name_match.group('name')

	        return root


	def get_twitpic_pictures_from_profile_page(url):
	    '''Iterator that receives a profile page and returns links to each
	    individual picture page.

	    Sample URLs:
	    http://twitpic.com/photos/SuperMeatBoy
	    http://twitpic.com/photos/SuperMeatBoy?page=2

	    Yields the absolute href of every link matched by the CSS selector
	    ".user-photo a". Links without an href attribute are skipped.
	    '''

	    import lxml.html
	    from lxml.cssselect import CSSSelector

	    # All links are collected while the response is open, and the response is
	    # closed before the first link is handed out. Otherwise the connection
	    # would stay open for as long as the caller takes to process every
	    # picture (or forever, if the caller stops iterating early).
	    with urlopen(url) as u:
	        root = lxml.html.parse(u).getroot()
	        root.make_links_absolute()

	        pics_sel = CSSSelector('.user-photo a')
	        hrefs = [a.get('href') for a in pics_sel(root)]

	    for href in hrefs:
	        if href:
	            yield href


	def process_one_url(url):
	    '''Downloads everything for one twitpic: both pages and the image.

	    `url` is passed to TwitpicPicture, so it may be a picture URL or a bare
	    id. Progress and the caption are printed to standard output.
	    '''
	    # print with a single parenthesised argument behaves the same as the
	    # Python 2 print statement and is also valid Python 3.
	    print('Processing {0}'.format(url))
	    tp = TwitpicPicture(url)

	    caption = tp.caption
	    try:
	        caption_line = '  Caption: {0}'.format(caption)
	    except UnicodeEncodeError:
	        # Python 2: a unicode caption with non-ASCII characters cannot be
	        # formatted into a byte string, so it is encoded explicitly.
	        caption_line = '  Caption: {0}'.format(caption.encode('utf-8'))
	    print(caption_line)

	    # The open_* calls are only needed for their side effect (downloading and
	    # saving a local copy), so each returned file is closed immediately.
	    for opener in (tp.open_page, tp.open_page_full, tp.open_img_full):
	        opener().close()


	def main(args):
	    '''Processes every URL in `args` (the command-line arguments).

	    A profile URL (http://twitpic.com/photos/NAME) is expanded into the
	    pictures linked from it; any other twitpic.com URL is handled as a single
	    picture. Anything else is reported as invalid and skipped.
	    '''
	    if not args:
	        print('Huh... Just pass all URLs as parameters...')
	        print('TODO: write a nice parameter parsing code. Oh, and a proper help message.')
	        return

	    # Imported locally so that this function does not depend on where the
	    # module-level imports ended up.
	    import re

	    # The dots are escaped so that only the literal host "twitpic.com" is
	    # accepted. The profile pattern is tested first because the generic
	    # pattern matches profile URLs as well.
	    re_profile = re.compile(r'https?://twitpic\.com/photos/.+')
	    re_twitpic = re.compile(r'https?://twitpic\.com/.+')

	    for url in args:
	        if re_profile.match(url):
	            print('Parsing {0}'.format(url))
	            for linked_url in get_twitpic_pictures_from_profile_page(url):
	                process_one_url(linked_url)
	        elif re_twitpic.match(url):
	            process_one_url(url)
	        else:
	            print('Invalid URL: {0}'.format(url))

	if __name__ == '__main__':
	    # Script entry point: every command-line argument after the program name
	    # is handed to main() as a URL.
	    import sys
	    main(sys.argv[1:])
No results found