Last active
May 7, 2016 19:06
-
-
Save llvtt/53799972737f1846c533 to your computer and use it in GitHub Desktop.
Stream Reddit posts into MongoDB
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| import datetime | |
| import optparse | |
| import sys | |
| import praw | |
| import praw.helpers | |
| import pymongo | |
| import requests.exceptions | |
| # Default MongoDB host and port; used as the defaults of the --mongo-host | |
| # and --mongo-port command-line options. | |
| MONGO_HOST = "localhost" | |
| MONGO_PORT = 27017 | |
| # String handed to praw.Reddit() when the Reddit client is created. | |
| USER_AGENT_STR = "reddit_to_mongo/0.1" | |
def _author_to_document(author):
    """Return the ``{"id", "name"}`` sub-document for *author*, or None if it is None."""
    if author is None:
        return None
    return {"id": author.id, "name": author.name}


def _comment_to_document(comment):
    """Build the sub-document stored for a single comment."""
    return {
        "text": comment.body,
        # A missing author (presumably a deleted account or comment) is
        # stored as None instead of raising AttributeError.
        "author": _author_to_document(comment.author),
        # NOTE(review): fromtimestamp() yields a naive *local-time* datetime,
        # while PyMongo treats naive datetimes as UTC -- confirm whether a
        # UTC conversion is intended here.
        "created": datetime.datetime.fromtimestamp(comment.created),
    }


def convert_to_document(post):
    """Convert a Reddit submission into a dict suitable for MongoDB.

    Only a handful of fields are kept: id (used as ``_id``), title, author,
    subreddit name, self text, creation date and, when the post reports any
    comments, a flattened ``comments`` list.

    Args:
        post: submission-like object exposing ``id``, ``title``, ``author``,
            ``subreddit.display_name``, ``selftext``, ``created``,
            ``num_comments`` and ``comments``.

    Returns:
        dict: the document. ``num_comments`` is the number of comments that
        were actually converted; the ``comments`` key is present only when
        ``post.num_comments > 0``.

    Raises:
        AttributeError: if the post itself lacks a field, e.g. ``post.author``
            is None. The streaming loop in this file catches this to skip
            deleted posts, so post-level access is deliberately left strict.
    """
    # Retrieve comments if there are any.
    comments = {}
    if post.num_comments > 0:
        comment_list = praw.helpers.flatten_tree(post.comments,
                                                 depth_first=True)
        comments = {"comments": [
            _comment_to_document(comment)
            for comment in comment_list
            # Entries without a body (e.g. PRAW's MoreComments placeholders
            # for replies that were not loaded) are skipped; previously one
            # such entry raised AttributeError and the whole post was lost.
            if hasattr(comment, "body")
        ]}

    # Just select a few fields for our purposes.
    post_doc = {
        "_id": post.id,
        "title": post.title,
        "author": {
            "id": post.author.id,
            "name": post.author.name,
        },
        "subreddit": post.subreddit.display_name,
        "text": post.selftext,
        # NOTE(review): same naive local-time caveat as for comments.
        "date": datetime.datetime.fromtimestamp(post.created),
        "num_comments": 0 if not comments else len(comments['comments']),
    }
    post_doc.update(comments)
    return post_doc
def main():
    """Stream new Reddit submissions into a MongoDB collection until Ctrl-C.

    Command-line options choose the MongoDB host and port and the target
    namespace, written as ``database.collection``. Posts that cannot be
    converted are reported on stderr and skipped; the stream keeps running.
    """
    parser = optparse.OptionParser()
    parser.add_option("--mongo-host", default=MONGO_HOST, dest='mongo_host',
                      help="Hostname where MongoDB is running.")
    parser.add_option("--mongo-port", default=MONGO_PORT, dest='mongo_port',
                      type=int, help="Port on which MongoDB is listening.")
    parser.add_option("--reddit-namespace", default='reddit.posts',
                      dest='reddit_namespace',
                      help="Reddit data namespace.")
    options, _args = parser.parse_args()

    # Validate the namespace up front: a value without a '.' used to fail
    # with an unhelpful ValueError while unpacking the split result.
    reddit_db, _sep, reddit_coll = options.reddit_namespace.partition('.')
    if not reddit_db or not reddit_coll:
        parser.error("--reddit-namespace must look like "
                     "'database.collection', got %r"
                     % options.reddit_namespace)

    mongodb = pymongo.MongoClient(options.mongo_host, options.mongo_port)
    collection = mongodb[reddit_db][reddit_coll]

    reddit = praw.Reddit(USER_AGENT_STR)

    try:
        # Get an infinite stream of all new posts to the front page.
        new_posts = praw.helpers.submission_stream(reddit, "all")
        for post in new_posts:
            # Only the conversion is guarded, so failures while saving or
            # printing are never mistaken for a deleted post.
            try:
                document = convert_to_document(post)
            except requests.exceptions.HTTPError as exc:
                # HTTP request to retrieve more information about the post
                # returned with a 4xx error.
                sys.stderr.write("skipped %s: HTTP error (%s)\n"
                                 % (post.id, exc))
                continue
            except AttributeError as exc:
                # Post or Comment may have been deleted between retrieving it
                # and accessing its fields.
                sys.stderr.write("skipped %s: missing field (%s)\n"
                                 % (post.id, exc))
                continue

            # Insert each post into MongoDB.
            # NOTE(review): Collection.save() is deprecated since PyMongo 3.0
            # and removed in 4.0; replace_one(..., upsert=True) is the
            # documented replacement -- confirm the PyMongo version in use
            # before switching.
            collection.save(document)
            print("processed: %s..." % post.title[:20])
    except KeyboardInterrupt:
        sys.exit(0)
    finally:
        # Release the MongoDB connection on every exit path.
        mongodb.close()


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment