Last active
January 31, 2021 21:51
-
-
Save iamvee/86f60f60f1a9376175a4aecb7c6b1746 to your computer and use it in GitHub Desktop.
tweets
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "reserved-journalism", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import os\n", | |
| "import re\n", | |
| "import tweepy\n", | |
| "import csv\n", | |
| "import pandas as pd\n", | |
| "import time\n", | |
| "import collections\n", | |
| "\n", | |
| "consumer_key = ''\n", | |
| "consumer_secret = ''\n", | |
| "\n", | |
| "access_token = ''\n", | |
| "access_token_secret = ''\n", | |
| "\n", | |
| "\n", | |
| "auth = tweepy.OAuthHandler(consumer_key, consumer_secret)\n", | |
| "auth.set_access_token(access_token, access_token_secret)\n", | |
| "api = tweepy.API(auth,wait_on_rate_limit=True)\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "suffering-capitol", | |
| "metadata": { | |
| "scrolled": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "hashtag = \"#hashtag\"\n", | |
| "lim = 100\n", | |
| "\n", | |
| "steps = lim\n", | |
| "\n", | |
| "current_time = time.ctime().replace(\" \",\"_\")\n", | |
| "dir_name = f\"./{hashtag[1:]}\"\n", | |
| "path = f\"./{hashtag[1:]}/{current_time}\"\n", | |
| "outfile = f\"{path}/out.csv\"\n", | |
| "outusers = f\"{path}/users.csv\"\n", | |
| "\n", | |
| "\n", | |
| "try:\n", | |
| " os.mkdir(f\"./{hashtag[1:]}\")\n", | |
| "except FileExistsError:\n", | |
| " print(\"directory exists\")\n", | |
| "finally:\n", | |
| " os.mkdir(path)\n", | |
| " \n", | |
| " \n", | |
| "csv_file = open(outfile, 'a')\n", | |
| "csv_writer_tweets = csv.writer(csv_file)\n", | |
| "csv_writer_tweets.writerow(\n", | |
| " ['created_at', 'id', 'id_str', 'user_id', 'screen name', 'account created at', 'text', 'truncated',\n", | |
| " 'in_reply_to_status_id', 'in_reply_to_status_id_str', \n", | |
| " 'in_reply_to_user_id', 'in_reply_to_user_id_str', 'in_reply_to_screen_name', \n", | |
| " 'geo', 'coordinates', 'place', 'contributors', \n", | |
| " 'is_quote_status', 'retweet_count', 'favorite_count', \n", | |
| " 'favorited', 'retweeted', 'lang'])\n", | |
| "\n", | |
| "csv_users = open(outusers, 'a')\n", | |
| "csv_writer_users = csv.writer(csv_users)\n", | |
| "csv_writer_users.writerow(\n", | |
| " ['id', 'id_str', 'name', 'screen_name', 'location', 'description', \n", | |
| " 'url','protected', 'followers_count', 'friends_count', \n", | |
| " 'listed_count', 'created_at', 'favourites_count', 'utc_offset', \n", | |
| " 'time_zone', 'geo_enabled', 'verified', 'statuses_count', 'lang', \n", | |
| " 'contributors_enabled', 'is_translator', 'is_translation_enabled', \n", | |
| " 'has_extended_profile', 'default_profile', 'default_profile_image', \n", | |
| " 'following', 'follow_request_sent', 'notifications', 'translator_type'])\n", | |
| "\n", | |
| "\n", | |
| "\n", | |
| "print(time.ctime())\n", | |
| "for i, tweet in enumerate(tweepy.Cursor(api.search,q=hashtag,count=lim).items()):\n", | |
| " if not tweet.user.id in users: \n", | |
| " csv_writer_users.writerow([\n", | |
| " tweet.user._json['id'], tweet.user._json['id_str'], tweet.user._json['name'],\n", | |
| " tweet.user._json['screen_name'], tweet.user._json['location'],\n", | |
| " tweet.user._json['description'], tweet.user._json['url'], \n", | |
| " tweet.user._json['protected'], tweet.user._json['followers_count'], \n", | |
| " tweet.user._json['friends_count'], tweet.user._json['listed_count'],\n", | |
| " tweet.user._json['created_at'], tweet.user._json['favourites_count'], \n", | |
| " tweet.user._json['utc_offset'], tweet.user._json['time_zone'], \n", | |
| " tweet.user._json['geo_enabled'], tweet.user._json['verified'], \n", | |
| " tweet.user._json['statuses_count'], tweet.user._json['lang'], \n", | |
| " tweet.user._json['contributors_enabled'], tweet.user._json['is_translator'], \n", | |
| " tweet.user._json['is_translation_enabled'], tweet.user._json['has_extended_profile'],\n", | |
| " tweet.user._json['default_profile'], tweet.user._json['default_profile_image'], \n", | |
| " tweet.user._json['following'], tweet.user._json['follow_request_sent'], \n", | |
| " tweet.user._json['notifications'], tweet.user._json['translator_type']\n", | |
| " ])\n", | |
| " \n", | |
| " csv_writer_tweets.writerow([\n", | |
| " tweet._json['created_at'], tweet._json['id'], tweet._json['id_str'], \n", | |
| " tweet.user._json['id'], tweet.user._json['screen_name'], str(tweet.user.created_at),\n", | |
| " tweet._json['text'], tweet._json['truncated'], tweet._json['in_reply_to_status_id'],\n", | |
| " tweet._json['in_reply_to_status_id_str'], tweet._json['in_reply_to_user_id'],\n", | |
| " tweet._json['in_reply_to_user_id_str'], tweet._json['in_reply_to_screen_name'], \n", | |
| " tweet._json['geo'], tweet._json['coordinates'], tweet._json['place'],\n", | |
| " tweet._json['contributors'], tweet._json['is_quote_status'], \n", | |
| " tweet._json['retweet_count'], tweet._json['favorite_count'], \n", | |
| " tweet._json['favorited'], tweet._json['retweeted'], tweet._json['lang']])\n", | |
| " if i > lim:\n", | |
| " print(f\"{i:8<}\", flush=True, end=\" \")\n", | |
| " lim += steps" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "enabling-manner", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "df_tweets = pd.read_csv(outfile)\n", | |
| "df_users = pd.read_csv(outusers)\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "undefined-documentary", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "df = df_tweets\n", | |
| "df[\"RT\"] = df[\"text\"].map(lambda x: x.startswith(\"RT\"))\n", | |
| "\n", | |
| "x = df[df[\"RT\"]==False][\"text\"].map(\n", | |
| " lambda x: re.sub(\"#\\S+\", \"\", x).replace('\\n', '')).map(\n", | |
| " lambda x: re.sub(\"@\\w+\", \"\", x)).map(\n", | |
| " lambda x: re.sub(\"https://t.co/\\S+\", \"\", x)).map(\n", | |
| " lambda x:re.sub(\"\\s\", \"\", x)).map(\n", | |
| " lambda x: x[:20])\n", | |
| "\n", | |
| "y = sorted(set(x))\n", | |
| "\n", | |
| "print(f\"uniq {len(y)}\\noriginal {len(df[df['RT']==False])}\\nall {len(df['RT'])}\")\n", | |
| "print(\"accounts \", len(set(df[\"screen name\"])))\n", | |
| "\n", | |
| "df.tail(1)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "rubber-operation", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "account_number = collections.Counter([x[:4] for x in set(df[\"account created at\"])])\n", | |
| "tweet_number = collections.Counter(df[\"account created at\"].map(lambda x: x[:4]))\n", | |
| "\n", | |
| "\n", | |
| "print(f\"{'year':>5}, {'accounts':>9}, {'tweets':>8}, {'tweet/account':15}\")\n", | |
| "\n", | |
| "for k in sorted(account_number.keys()):\n", | |
| " print(f\"{k:5}, {account_number[k]:9}, {tweet_number[k]:8}, {tweet_number[k]/ account_number[k]:12.1f}\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "frequent-immunology", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "sticky-expression", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "infinite-shoot", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.9.1" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 5 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment