Script to fetch historic tweets containing a given hashtag and insert them into a database
Edit settings in config.json
| { | |
| "db_host" : "", | |
| "db_user" : "", | |
| "db_passwd" : "", | |
| "db_name" : "", | |
| "db_table" : "", | |
| "hashtag" : "" | |
| } |
| #!/usr/bin/env python | |
| """ | |
| Fetch historic tweets containing a given hashtag and insert them into a database | |
| Settings in config.json | |
| """ | |
| from twitterscraper import query_tweets | |
| import mysql.connector as mysql | |
| from datetime import datetime | |
| import json | |
| # load credentials | |
| try: | |
| with open('config.json') as f: | |
| config = json.load(f) | |
| except Exception as e: | |
| print(e) | |
| print('####################################################\n#') | |
| print('# Fetch historic tweets for #{} !\n#'.format(config['hashtag'])) | |
| print('####################################################') | |
| # create db connection | |
| try: | |
| db = mysql.connect( | |
| host=config['db_host'], | |
| user=config['db_user'], | |
| passwd=config['db_passwd'], | |
| database=config['db_name'] | |
| ) | |
| except mysql.Error: | |
| print("Error connecting to Database") | |
| quit() | |
| # create cursor instance | |
| cursor = db.cursor() | |
| # create table if it dosen't exist | |
| cursor.execute(f"""CREATE TABLE IF NOT EXISTS `{config['db_table']}` ( | |
| `id` INT(11) NOT NULL AUTO_INCREMENT, | |
| `user_id` bigint(20) NOT NULL, | |
| `tweet_id` bigint(20) NOT NULL, | |
| `tweet` mediumtext NOT NULL, | |
| `timestamp` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, | |
| PRIMARY KEY (`id`) | |
| ) ENGINE=InnoDB DEFAULT CHARSET=latin1 AUTO_INCREMENT=3 ; | |
| """) | |
| db.commit() | |
| # Get tweets | |
| tweets = query_tweets(config['hashtag'], 100000) | |
| # todo move limit to config | |
| print("Scraped {} tweets containing #{}\n".format( | |
| len(tweets), config['hashtag'])) | |
| # define the query | |
| query = 'INSERT INTO {} (id, user_id, tweet_id, tweet, timestamp) VALUES (NULL, %s, %s, %s, %s)'.format( | |
| config['db_table']) | |
| for t in tweets: | |
| # only save if the string is a hashtag not just a text string | |
| if('#{}'.format(config['hashtag']) in t.text.lower()): | |
| print('Inserting tweet {} from {} ( @{} )'.format( | |
| t.user_id, t.tweet_id, t.username, t.screen_name)) | |
| # convert the timestamp in order to allow it to be JSON encoded | |
| t.timestamp = datetime.strftime(t.timestamp, '%Y-%m-%d %H:%M:%S') | |
| values = (t.user_id, t.tweet_id, json.dumps(t.__dict__), t.timestamp) | |
| # execute the query | |
| cursor.execute(query, values) | |
| # commit to db | |
| db.commit() | |
| print(cursor.rowcount, "record inserted") |