download all your twitter anime girls!
# script 1: reorder timestamps on gallery-dl twitter downloads
# so that each tweet's images sort in posting order
import os
from collections import defaultdict

path = "/home/sanspapyrus683/Downloads/gallery-dl/twitter/SansPapyrus683"

# group every downloaded file by its tweet id;
# filenames look like {author}_{tweet id}_{image number}.{ext}
groups = defaultdict(list)
for i in os.listdir(path):
    start = i.rfind("_") + 1
    end = i.rfind(".")
    num = int(i[start:end])  # image number within the tweet
    author_end = i.rfind("_", 0, start - 1)
    author = i[:author_end]
    id_ = int(i[author_end + 1 : start - 1])  # tweet id
    groups[id_].append((author, num, i))
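# e.g. a (hypothetical) filename "some_artist_1234567890_2.jpg"
# parses to author="some_artist", id_=1234567890, num=2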
for id_, tweets in groups.items():
    tweets.sort(key=lambda t: t[1])
    all_authors = {t[0] for t in tweets}
    all_nums = [t[1] for t in tweets]
    # sanity checks: one author per tweet, image numbers are 1..n with no gaps
    assert len(all_authors) == 1
    assert all_nums == list(range(1, len(all_nums) + 1))
    if len(tweets) == 1:
        continue

    # hand this group's timestamps back out in reverse, so image 1 ends up
    # with the newest time and the set sorts correctly newest-first
    paths = [os.path.join(path, t[2]) for t in tweets]
    times = [(os.path.getmtime(p), os.path.getatime(p)) for p in paths]
    times.sort(reverse=True)
    if len(tweets) > 2:
        print(paths)
        print(times)
    for p, t in zip(paths, times):
        # os.utime expects (atime, mtime), so swap the stored (mtime, atime)
        os.utime(p, times=(t[1], t[0]))
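for reference, here is the same timestamp-reversal trick in isolation, as a minimal self-contained sketch (the directory and filenames are made up for the demo):

import os
import tempfile
import time

with tempfile.TemporaryDirectory() as d:
    # three fake images from one tweet; image 1 starts out oldest on disk
    paths = [os.path.join(d, f"artist_123_{n}.jpg") for n in (1, 2, 3)]
    for n, p in enumerate(paths):
        open(p, "w").close()
        t = time.time() - 10 * (3 - n)
        os.utime(p, times=(t, t))
    # reassign the same mtimes in descending order: image 1 becomes newest
    times = sorted((os.path.getmtime(p) for p in paths), reverse=True)
    for p, t in zip(paths, times):
        os.utime(p, times=(t, t))
    assert os.path.getmtime(paths[0]) >= os.path.getmtime(paths[-1])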
# script 2: pull retweeted media out of a twitter archive and
# re-download the originals where possible
import json
import os
import re
import shutil
import sys
from datetime import datetime, timedelta
from email import utils

import requests
def extract_id(tweet: str) -> int:
    # grab the tweet id out of a filename like {handle}_{id}_{num}.{ext}
    tweet = os.path.splitext(tweet)[0]
    first = tweet.rfind("_")
    second = tweet[:first].rfind("_")
    return int(tweet[second + 1 : first])
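# e.g. extract_id("some_artist_1234567890_2.jpg") == 1234567890
# (hypothetical filename, matching the good_media naming used below)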
def load_twt_obj(file: str) -> list:
    raw = open(file, encoding="utf8").read()
    # everything after the first "=" is plain json
    return json.loads(raw[raw.find("=") + 1 :])
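# tweets.js in a twitter archive is a js assignment, roughly
#   window.YTD.tweets.part0 = [ { "tweet": { ... } }, ... ]
# which is why load_twt_obj strips everything up to the "="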
# also add deleted-tweets.js if you're like that
tweets = load_twt_obj("data/tweets.js")

# merge deleted tweets' media into the main media folder
del_dir = "data/deleted_tweets_media"
gen_dir = "data/tweets_media"
for fn in os.listdir(del_dir):
    shutil.copy(os.path.join(del_dir, fn), gen_dir)

# ids of tweets whose media is already saved (dirs passed on the command line)
have_alr = set()
for store in sys.argv[1:]:
    for name in os.listdir(store):
        have_alr.add(extract_id(name))
# index the archive's media files, whose names look like {post id}-{media id}{ext}
# (after getting the actual images this isn't needed, but just in case)
all_raw_media = os.listdir(gen_dir)
all_media = {}
for i in all_raw_media:
    post_id = i[: i.find("-")]
    img_id = i[i.find("-") + 1 : i.rfind(".")]
    _, ext = os.path.splitext(i)
    if post_id not in all_media:
        all_media[post_id] = {}
    all_media[post_id][img_id] = ext
# sort them from oldest to newest
tweets.sort(key=lambda t: utils.parsedate_to_datetime(t["tweet"]["created_at"]))
handle_fmt = re.compile(r"RT @([^:]*):")
img_id_fmt = re.compile(r"http://pbs\.twimg\.com/media/([^.]*)\.")
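# handle_fmt pulls the original poster out of retweet text like
#   "RT @some_artist: ..." (hypothetical handle);
# img_id_fmt pulls the media id out of a pbs.twimg.com media url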
os.makedirs("good_media", exist_ok=True)
all_paths = []
print(f"alright, a total of {len(tweets)} tweets to go through. let's go!")
for v, t in enumerate(tweets):
    if (v + 1) % 100 == 0:
        print(f"at tweet #{v + 1}")
    t = t["tweet"]
    # only retweets are interesting; everything else gets skipped
    match = handle_fmt.match(t["full_text"])
    if match is None:
        continue
    handle = match.group(1)
    og_id = t["id"]
    if "media" not in t["entities"]:
        continue
    media = t["extended_entities"]["media"]
    # all media in one retweet should come from the same source tweet
    src_id = [m["source_status_id"] for m in media]
    assert len(set(src_id)) == 1  # just a sanity check
    src_id = int(src_id[0])
    if src_id in have_alr:
        continue
    curr_paths = []
    # quick hack to get videos to download: the pbs.twimg.com trick below only
    # works for photos, so videos are copied straight out of the archive
    vid = all_media[og_id]
    # most videos are standalone. there's one (1) tweet so far that violates this
    if ".mp4" in vid.values() and len(vid) == 1:
        vid_id = list(vid.keys())[0]
        stupid_path = os.path.join(gen_dir, f"{og_id}-{vid_id}.mp4")
        sigma_path = f"good_media/{handle}_{src_id}_1.mp4"
        shutil.copy(stupid_path, sigma_path)
        curr_paths.append(sigma_path)
    for img_at, m in enumerate(media):
        img_id = img_id_fmt.match(m["media_url"])
        # sometimes you have things like ext_tw_video_thumb or tweet_video_thumb
        if img_id is None:
            continue
        img_id = img_id.group(1)
        if img_id not in all_media.get(og_id, []):
            continue
        ext = all_media[og_id][img_id]
        sigma_path = f"good_media/{handle}_{src_id}_{img_at + 1}{ext}"
        stupid_path = os.path.join(gen_dir, f"{og_id}-{img_id}{ext}")
        # :orig asks twitter for the full-resolution original
        dl_url = f"http://pbs.twimg.com/media/{img_id}{ext}:orig"
        img_data = requests.get(dl_url).content
        if not img_data:
            # download came back empty, so fall back to the archive's copy
            shutil.copy(stupid_path, sigma_path)
        else:
            with open(sigma_path, "wb") as written:
                written.write(img_data)
        curr_paths.append(sigma_path)

    # store newest-first within the tweet so the final timestamp pass
    # leaves image 1 with the latest mtime
    all_paths.extend(reversed(curr_paths))
# finally, space the mtimes two seconds apart, newest tweet first,
# so the files sort by tweet date in a newest-first gallery view
now = datetime.now()
epoch = datetime(1970, 1, 1)
for v, p in enumerate(reversed(all_paths)):
    delta = (now - timedelta(seconds=2 * v) - epoch).total_seconds()
    os.utime(p, times=(delta, delta))
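usage note (my reading of the sys.argv loop above; the script name is hypothetical): run from the folder that contains data/, passing any directories of media you've already saved so their tweets get skipped, e.g. "python grab_media.py old_good_media/".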