Last active
January 31, 2021 21:51
-
-
Save iamvee/86f60f60f1a9376175a4aecb7c6b1746 to your computer and use it in GitHub Desktop.
tweets
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "functional-billy", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "\n", | |
| "import re\n", | |
| "import pandas as pd\n", | |
| "import collections\n", | |
| "import scipy \n", | |
| "import matplotlib\n", | |
| "import matplotlib.pyplot\n", | |
| "\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "remarkable-casting", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "df_tweets = pd.read_csv('./out.csv')\n", | |
| "df_users = pd.read_csv('./users.csv')\n", | |
| "\n", | |
| "# NOTE: df is an alias of df_tweets (not a copy), so the RT column is added to both.\n", | |
| "df = df_tweets\n", | |
| "df[\"RT\"] = df[\"text\"].map(lambda text: text.startswith(\"RT\"))\n", | |
| "\n", | |
| "\n", | |
| "def dedup_key(text):\n", | |
| "    \"\"\"Return a 30-character key used to group copy-pasted tweets.\"\"\"\n", | |
| "    # Raw strings keep \\S, \\w and \\s as regex escapes rather than invalid string escapes.\n", | |
| "    # Step order is significant: newlines are removed right after hashtags,\n", | |
| "    # before mentions and t.co links are stripped.\n", | |
| "    text = re.sub(r\"#\\S+\", \"\", text).replace('\\n', '')\n", | |
| "    text = re.sub(r\"@\\w+\", \"\", text)\n", | |
| "    text = re.sub(r\"https://t.co/\\S+\", \"\", text)\n", | |
| "    text = re.sub(r\"\\s\", \"\", text)\n", | |
| "    return text[:30]\n", | |
| "\n", | |
| "\n", | |
| "# x holds one key per non-retweet; y is the sorted list of distinct keys.\n", | |
| "is_original = ~df[\"RT\"]\n", | |
| "x = df.loc[is_original, \"text\"].map(dedup_key)\n", | |
| "y = sorted(set(x))\n", | |
| "\n", | |
| "data = {}\n", | |
| "data[\"original\"] = len(y)\n", | |
| "data[\"original+copy\"] = len(x)\n", | |
| "data[\"duplicated\"] = data[\"original+copy\"] - data[\"original\"]\n", | |
| "data[\"all tweets\"] = len(df)\n", | |
| "data[\"retweets\"] = data[\"all tweets\"] - data[\"original+copy\"]\n", | |
| "data[\"accounts\"] = len(set(df[\"screen name\"]))\n", | |
| "\n", | |
| "print(f\"\"\"\n", | |
| "{data[\"original\"]:>10} | original tweets (duplicated tweets excluded)\n", | |
| "{data[\"original+copy\"]:>10} | original tweets + (duplicated)\n", | |
| "{data[\"duplicated\"]:>10} | duplicated\n", | |
| "{data[\"retweets\"]:>10} | retweets\n", | |
| "{data[\"all tweets\"]:>10} | all tweets\n", | |
| "\n", | |
| "{data[\"accounts\"]:>10} | accounts\n", | |
| "{data[\"all tweets\"] / data[\"accounts\"]:>10.1f} | average tweets per account\n", | |
| "\n", | |
| " start : {min(df['created_at'])}\n", | |
| " stop : {max(df['created_at'])}\n", | |
| "\"\"\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "demanding-barrier", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "total = data[\"all tweets\"]\n", | |
| "# (key in data, label prefix) for each slice of the pie.\n", | |
| "slices = [\n", | |
| "    (\"original\", \"original\\n\"),\n", | |
| "    (\"duplicated\", \"duplicated \\n\"),\n", | |
| "    (\"retweets\", \"retweet\\n\"),\n", | |
| "]\n", | |
| "matplotlib.pyplot.pie(\n", | |
| "    [data[key] for key, _ in slices],\n", | |
| "    labels=[f\"{prefix} {100*data[key]/total:.2f} %\" for key, prefix in slices],\n", | |
| ");" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "seasonal-husband", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "created = df[\"account created at\"]\n", | |
| "# Both counters are keyed by the first four characters of the creation timestamp.\n", | |
| "# Accounts are counted from the distinct timestamps; tweets from every row.\n", | |
| "account_number = collections.Counter(stamp[:4] for stamp in set(created))\n", | |
| "tweet_number = collections.Counter(created.map(lambda stamp: stamp[:4]))\n", | |
| "\n", | |
| "print(f\"{'year':>5}, {'accounts':>9}, {'tweets':>8}, {'tweet/account':15}\")\n", | |
| "\n", | |
| "for year in sorted(account_number):\n", | |
| "    accounts, tweets = account_number[year], tweet_number[year]\n", | |
| "    print(f\"{year:5}, {accounts:9}, {tweets:8}, {tweets / accounts:12.1f}\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "christian-tonight", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "matplotlib.pyplot.figure(figsize=(15, 5))\n", | |
| "# Alternative series to inspect: df['friends_count'].\n", | |
| "followers = df_users['followers_count']\n", | |
| "followers.plot.kde(xlim=[0, 60000])\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "verified-bleeding", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# For each tweets-per-user value, count how many users have it, ordered by that value.\n", | |
| "tweets_per_user = df_tweets.groupby('user_id').count()[\"id\"]\n", | |
| "histogram = sorted(collections.Counter(tweets_per_user).items())\n", | |
| "# z is the marker size: tweets-per-user * number of users * 5.\n", | |
| "x, y, z = zip(*[(t, c, t * c * 5) for t, c in histogram])\n", | |
| "\n", | |
| "matplotlib.pyplot.figure(figsize=(15, 10))\n", | |
| "matplotlib.pyplot.scatter(x, y, s=z, alpha=0.5)\n", | |
| "matplotlib.pyplot.grid(True)\n", | |
| "\n", | |
| "# A single red marker at the middle of both ranges, sized as the sum of all the others.\n", | |
| "matplotlib.pyplot.scatter([max(x) // 2], [max(y) // 2], s=[sum(z)], c='r', alpha=0.3)\n", | |
| "\n", | |
| "ticks = range(0, max(x) + 10, 50)\n", | |
| "matplotlib.pyplot.xticks(ticks, [str(tick) for tick in ticks])\n", | |
| "# The trailing literal makes the cell display 1 instead of the xticks return value.\n", | |
| "1" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "absent-script", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "skip_first_n = 50\n", | |
| "\n", | |
| "# For each tweets-per-user value, count how many users have it, ordered by that value.\n", | |
| "vals = list(collections.Counter(df_tweets.groupby('user_id').count()[\"id\"]).items())\n", | |
| "svals = sorted(vals, key=lambda s: s[0])\n", | |
| "wvals = [(t, c, t * c) for t, c in svals]\n", | |
| "# Drop the first skip_first_n entries (the lowest tweets-per-user values) before plotting.\n", | |
| "x, y, z = zip(*wvals[skip_first_n:])\n", | |
| "\n", | |
| "matplotlib.pyplot.figure(figsize=(15, 10))\n", | |
| "\n", | |
| "matplotlib.pyplot.scatter(x, y, s=z, alpha=0.3)\n", | |
| "\n", | |
| "sumz = sum(z)\n", | |
| "for person in [1, 2, 3, 5, 7, 15, 30, 70, 150]:\n", | |
| "    # NOTE(review): the nesting was lost and has been restored by hand; this assumes only\n", | |
| "    # the large green marker is conditional and the dot + label are drawn for every value - confirm.\n", | |
| "    if person < 5:\n", | |
| "        matplotlib.pyplot.scatter([person], [sumz // person], s=[sumz], c='g', alpha=0.3)\n", | |
| "    matplotlib.pyplot.scatter([person], [sumz // person], s=[person], c='k', alpha=0.5)\n", | |
| "    matplotlib.pyplot.text(person, sumz // person, f\"{person:<3} tweets -> {sumz//person}\")\n", | |
| "\n", | |
| "matplotlib.pyplot.text(200, 500, f\"current \\n--> {sum(y)} ppl\")\n", | |
| "\n", | |
| "ticks = range(0, max(x) + 1, 50)\n", | |
| "matplotlib.pyplot.xticks(ticks, [str(tick) for tick in ticks])\n", | |
| "# The trailing literal makes the cell display 1 instead of the xticks return value.\n", | |
| "1" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "promising-dover", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Save the id column of df_users to a CSV file.\n", | |
| "user_ids = df_users['id']\n", | |
| "user_ids.to_csv('./engh_uid.csv')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "native-bearing", | |
| "metadata": {}, | |
| "source": [ | |
| "# dirty code " | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "exempt-workplace", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "%%bash\n", | |
| "# cat engh_ids\n", | |
| "# cat misazim_ids" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "independent-prison", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "def read_id_set(path):\n", | |
| "    \"\"\"Return the whitespace-separated tokens of a file as a set, minus the first and last token.\"\"\"\n", | |
| "    with open(path) as handle:\n", | |
| "        tokens = handle.read().split()\n", | |
| "    return set(tokens[1:-1])\n", | |
| "\n", | |
| "\n", | |
| "engh = read_id_set('engh_ids')\n", | |
| "misz = read_id_set('misazim_ids')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "desirable-jacob", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Ids present in both sets, then the ids exclusive to each one.\n", | |
| "intrs = engh & misz\n", | |
| "enghu = engh.difference(misz)\n", | |
| "miszu = misz.difference(engh)\n", | |
| "\n", | |
| "tuple(len(group) for group in (intrs, enghu, miszu))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "animated-owner", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Convert the id strings that appear only in engh to integers.\n", | |
| "engh_ids = list(map(int, enghu))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "executed-effort", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "in_engh_only = df_users['id'].isin(engh_ids)\n", | |
| "sx = list(df_users.loc[in_engh_only, \"created_at\"])\n", | |
| "# Key = last four characters + characters 4-6 of created_at (presumably year and month - confirm format).\n", | |
| "sy = [f\"{stamp[-4:]} {stamp[4:7]}\" for stamp in sx]\n", | |
| "# Most frequent keys first.\n", | |
| "collections.Counter(sy).most_common()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "shared-marking", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Display the created_at column of df_users.\n", | |
| "df_users[\"created_at\"]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "intellectual-sandwich", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.9.1" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 5 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment