download all your twitter anime girls!
# script 1: reorder timestamps on gallery-dl twitter downloads
# so that each tweet's images sort in posting order
import os
from collections import defaultdict

path = "/home/sanspapyrus683/Downloads/gallery-dl/twitter/SansPapyrus683"

# group every downloaded file by its tweet id;
# filenames look like {author}_{tweet id}_{image number}.{ext}
groups = defaultdict(list)
for i in os.listdir(path):
    start = i.rfind("_") + 1
    end = i.rfind(".")
    num = int(i[start:end])  # image number within the tweet
    author_end = i.rfind("_", 0, start - 1)
    author = i[:author_end]
    id_ = int(i[author_end + 1 : start - 1])  # tweet id
    groups[id_].append((author, num, i))
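# e.g. a (hypothetical) filename "some_artist_1234567890_2.jpg"
# parses to author="some_artist", id_=1234567890, num=2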
for id_, tweets in groups.items():
    tweets.sort(key=lambda t: t[1])
    all_authors = {t[0] for t in tweets}
    all_nums = [t[1] for t in tweets]
    # sanity checks: one author per tweet, image numbers are 1..n with no gaps
    assert len(all_authors) == 1
    assert all_nums == list(range(1, len(all_nums) + 1))
    if len(tweets) == 1:
        continue

    # hand this group's timestamps back out in reverse, so image 1 ends up
    # with the newest time and the set sorts correctly newest-first
    paths = [os.path.join(path, t[2]) for t in tweets]
    times = [(os.path.getmtime(p), os.path.getatime(p)) for p in paths]
    times.sort(reverse=True)
    if len(tweets) > 2:
        print(paths)
        print(times)
    for p, t in zip(paths, times):
        # os.utime expects (atime, mtime), so swap the stored (mtime, atime)
        os.utime(p, times=(t[1], t[0]))
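for reference, here is the same timestamp-reversal trick in isolation, as a minimal self-contained sketch (the directory and filenames are made up for the demo):

import os
import tempfile
import time

with tempfile.TemporaryDirectory() as d:
    # three fake images from one tweet; image 1 starts out oldest on disk
    paths = [os.path.join(d, f"artist_123_{n}.jpg") for n in (1, 2, 3)]
    for n, p in enumerate(paths):
        open(p, "w").close()
        t = time.time() - 10 * (3 - n)
        os.utime(p, times=(t, t))
    # reassign the same mtimes in descending order: image 1 becomes newest
    times = sorted((os.path.getmtime(p) for p in paths), reverse=True)
    for p, t in zip(paths, times):
        os.utime(p, times=(t, t))
    assert os.path.getmtime(paths[0]) >= os.path.getmtime(paths[-1])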
# script 2: pull retweeted media out of a twitter archive and
# re-download the originals where possible
import json
import os
import re
import shutil
import sys
from datetime import datetime, timedelta
from email import utils

import requests
def extract_id(tweet: str) -> int:
    # grab the tweet id out of a filename like {handle}_{id}_{num}.{ext}
    tweet = os.path.splitext(tweet)[0]
    first = tweet.rfind("_")
    second = tweet[:first].rfind("_")
    return int(tweet[second + 1 : first])
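# e.g. extract_id("some_artist_1234567890_2.jpg") == 1234567890
# (hypothetical filename, matching the good_media naming used below)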
def load_twt_obj(file: str) -> list:
    raw = open(file, encoding="utf8").read()
    # everything after the first "=" is plain json
    return json.loads(raw[raw.find("=") + 1 :])
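# tweets.js in a twitter archive is a js assignment, roughly
#   window.YTD.tweets.part0 = [ { "tweet": { ... } }, ... ]
# which is why load_twt_obj strips everything up to the "="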
# also add deleted-tweets.js if you're like that
tweets = load_twt_obj("data/tweets.js")

# merge deleted tweets' media into the main media folder
del_dir = "data/deleted_tweets_media"
gen_dir = "data/tweets_media"
for fn in os.listdir(del_dir):
    shutil.copy(os.path.join(del_dir, fn), gen_dir)

# ids of tweets whose media is already saved (dirs passed on the command line)
have_alr = set()
for store in sys.argv[1:]:
    for name in os.listdir(store):
        have_alr.add(extract_id(name))
# index the archive's media files, whose names look like {post id}-{media id}{ext}
# (after getting the actual images this isn't needed, but just in case)
all_raw_media = os.listdir(gen_dir)
all_media = {}
for i in all_raw_media:
    post_id = i[: i.find("-")]
    img_id = i[i.find("-") + 1 : i.rfind(".")]
    _, ext = os.path.splitext(i)
    if post_id not in all_media:
        all_media[post_id] = {}
    all_media[post_id][img_id] = ext
# sort them from oldest to newest
tweets.sort(key=lambda t: utils.parsedate_to_datetime(t["tweet"]["created_at"]))
handle_fmt = re.compile(r"RT @([^:]*):")
img_id_fmt = re.compile(r"http://pbs\.twimg\.com/media/([^.]*)\.")
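# handle_fmt pulls the original poster out of retweet text like
#   "RT @some_artist: ..." (hypothetical handle);
# img_id_fmt pulls the media id out of a pbs.twimg.com media url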
os.makedirs("good_media", exist_ok=True)
all_paths = []
print(f"alright, a total of {len(tweets)} tweets to go through. let's go!")
for v, t in enumerate(tweets):
    if (v + 1) % 100 == 0:
        print(f"at tweet #{v + 1}")
    t = t["tweet"]
    # only retweets are interesting; everything else gets skipped
    match = handle_fmt.match(t["full_text"])
    if match is None:
        continue
    handle = match.group(1)
    og_id = t["id"]
    if "media" not in t["entities"]:
        continue
    media = t["extended_entities"]["media"]
    # all media in one retweet should come from the same source tweet
    src_id = [m["source_status_id"] for m in media]
    assert len(set(src_id)) == 1  # just a sanity check
    src_id = int(src_id[0])
    if src_id in have_alr:
        continue
    curr_paths = []
    # quick hack to get videos to download: the pbs.twimg.com trick below only
    # works for photos, so videos are copied straight out of the archive
    vid = all_media[og_id]
    # most videos are standalone. there's one (1) tweet so far that violates this
    if ".mp4" in vid.values() and len(vid) == 1:
        vid_id = list(vid.keys())[0]
        stupid_path = os.path.join(gen_dir, f"{og_id}-{vid_id}.mp4")
        sigma_path = f"good_media/{handle}_{src_id}_1.mp4"
        shutil.copy(stupid_path, sigma_path)
        curr_paths.append(sigma_path)
    for img_at, m in enumerate(media):
        img_id = img_id_fmt.match(m["media_url"])
        # sometimes you have things like ext_tw_video_thumb or tweet_video_thumb
        if img_id is None:
            continue
        img_id = img_id.group(1)
        if img_id not in all_media.get(og_id, []):
            continue
        ext = all_media[og_id][img_id]
        sigma_path = f"good_media/{handle}_{src_id}_{img_at + 1}{ext}"
        stupid_path = os.path.join(gen_dir, f"{og_id}-{img_id}{ext}")
        # :orig asks twitter for the full-resolution original
        dl_url = f"http://pbs.twimg.com/media/{img_id}{ext}:orig"
        img_data = requests.get(dl_url).content
        if not img_data:
            # download came back empty, so fall back to the archive's copy
            shutil.copy(stupid_path, sigma_path)
        else:
            with open(sigma_path, "wb") as written:
                written.write(img_data)
        curr_paths.append(sigma_path)

    # store newest-first within the tweet so the final timestamp pass
    # leaves image 1 with the latest mtime
    all_paths.extend(reversed(curr_paths))
# finally, space the mtimes two seconds apart, newest tweet first,
# so the files sort by tweet date in a newest-first gallery view
now = datetime.now()
epoch = datetime(1970, 1, 1)
for v, p in enumerate(reversed(all_paths)):
    delta = (now - timedelta(seconds=2 * v) - epoch).total_seconds()
    os.utime(p, times=(delta, delta))
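usage note (my reading of the sys.argv loop above; the script name is hypothetical): run from the folder that contains data/, passing any directories of media you've already saved so their tweets get skipped, e.g. "python grab_media.py old_good_media/".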