Created
January 1, 2016 02:51
-
-
Save christinac/ee6e14f256956db07dcb to your computer and use it in GitHub Desktop.
How the book-extraction magic happens before all those books get dropped
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/python | |
| from datetime import datetime | |
| import json | |
| import re | |
| from xml.sax.saxutils import unescape | |
| import models | |
| from bookkit import Book, BookNotFoundException | |
| from utils.gmailservice import GmailService # modified version of https://developers.google.com/gmail/api/quickstart/python | |
class ResultSet(object):
    """Takes in a Gmail service (already constructed in the request) and
    collects a set of potential books for a user.

    Typical use is ``capture_and_extract()``: ``capture()`` pages through the
    search results and accumulates the raw thread dicts in ``self.data``;
    ``extract()`` asks the service for the corresponding messages and turns
    each successful response into a ``PotentialBook`` via ``list_callback``.
    """

    service = None
    user = None

    def __init__(self, service, user):
        """Wrap the raw Gmail ``service`` in a ``GmailService`` and remember ``user``."""
        self.service = GmailService(service=service)
        self.user = user
        # These lists must be per-instance. As class attributes they were
        # shared by every ResultSet, so results from one user's request
        # accumulated into the next one's.
        self.data = []
        self.potential_books = []
        super(ResultSet, self).__init__()

    def extend(self, data):
        """Append the items of ``data`` to the accumulated search results."""
        self.data.extend(data)

    def search_and_extend(self, query, page_token=None):
        """Run one page of ``query`` and store its threads.

        Returns the token for the next page, or ``None`` when the response
        carries no token.
        """
        threads = self.service.search(query, page_token=page_token)
        # A response with no matches may omit the 'threads' key entirely;
        # treat that as an empty page instead of raising KeyError.
        self.extend(threads.get('threads') or [])
        return threads.get('nextPageToken') or threads.get('pageToken')

    def capture_and_extract(self):
        """Run ``capture()`` followed by ``extract()``."""
        self.capture()
        self.extract()

    def capture(self):
        """Page through every result of the Amazon order query.

        Returns ``self.data``, the list of all collected thread dicts.
        """
        query = 'from:amazon.com ((-kindle subject:shipped) OR (kindle order))'
        next_token = self.search_and_extend(query)
        while next_token:
            next_token = self.search_and_extend(query, page_token=next_token)
        return self.data

    def extract(self):
        """Request the message for every collected id; responses arrive in ``list_callback``."""
        self.service.get_messages([x['id'] for x in self.data], self.list_callback)

    def list_callback(self, request_id, response, exception):
        """Handle one message response and record it as a ``PotentialBook``.

        Responses that carry an ``exception`` are skipped.
        NOTE(review): the (request_id, response, exception) signature looks
        like a batch-request callback -- confirm against GmailService.
        """
        if exception:
            return
        message = MessageExtractor(response)
        self.potential_books.append(PotentialBook(message.title(), message.date()))
        # NOTE(review): a skipped (failed) response is never counted, so this
        # condition cannot become true if any request errored -- confirm that
        # is intended before relying on it as the "all done" signal.
        if len(self.potential_books) == len(self.data):
            ## try to make books for the user in an async task
            pass  # body not present in this listing; placeholder keeps the block valid
class MessageExtractor(object):
    """Takes in a gmail message, finds a message date, and looks for a book title.

    ``message`` is the message dict; this class reads its ``internalDate``,
    ``payload['headers']`` and ``snippet`` entries.
    """

    message = None

    def __init__(self, message):
        self.message = message
        super(MessageExtractor, self).__init__()

    def date(self):
        """Return the message date as a naive local ``datetime``.

        ``internalDate`` is divided by 1000 before conversion, i.e. it is
        treated as a timestamp in milliseconds.
        """
        return datetime.fromtimestamp(float(self.message['internalDate']) / 1000)

    def title(self):
        """Return the extracted title with XML entities unescaped ('' if none found)."""
        return unescape(self._title())

    def _title(self):
        """Return the raw title from the first Subject header, or ''.

        Tried in order: a double-quoted span in the subject, the text after
        "Amazon.com order of " in the subject, then a double-quoted span in
        the message snippet. Always returns a string so that ``unescape`` in
        ``title()`` never receives ``None``.
        """
        payload = self.message.get('payload') or {}
        for header in payload.get('headers') or []:
            # Header field names are case-insensitive; compare exactly rather
            # than by substring so unrelated headers cannot match.
            if (header.get('name') or '').lower() != 'subject':
                continue
            subject = header.get('value') or ''
            snippet = self.message.get('snippet') or ''
            attempts = (
                (r'"(.*)"', subject),
                (r'Amazon\.com order of (.*)', subject),
                (r'"(.*)"', snippet),
            )
            for pattern, text in attempts:
                found = re.search(pattern, text)
                if found:
                    return found.group(1)
            return ''  # Empty string so unescape doesn't break, is caught elsewhere
        # No payload or no Subject header at all.
        return ''
class PotentialBook(object):
    """A title that's been extracted and now needs to be looked up on Amazon
    and maybe saved with a user.

    Holds the extracted ``title`` and the ``date`` of the message it came from.
    """

    title = None
    date = None

    def __init__(self, title, date):
        self.title = title
        self.date = date
        super(PotentialBook, self).__init__()

    def create(self, user):
        """Try to create the book for ``user``; do nothing when the title is empty.

        A ``BookNotFoundException`` raised during creation is deliberately
        ignored (best-effort: a title with no matching book is skipped).
        """
        if not self.title:
            return
        try:
            ## make the book and save it with the user
            pass  # creation logic is not present in this listing; placeholder keeps the block valid
        except BookNotFoundException:
            pass
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment