CLI tool designed to parse, filter, and clean up your public interactions from the CSV export provided by LinkedIn
#!/usr/bin/env python3
# linkedin_export_parser_sweeper.py
# Version 0.2 – 2025-12-02
#
# CLI tool designed to parse, filter, and clean up your public LinkedIn
# interactions (comments, reactions, shares) based on the official CSV data
# export. Provides interactive triage, persistent SQLite-based whitelist/
# blacklist, and semi-automated comment removal through Safari integration.
#
# (c) 2025 dzogrim
# License: MIT
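#
# Typical invocations (the export directory path is illustrative; point the
# script at the unzipped folder from LinkedIn's data export):
#
#   # Dry run: summarize comments / reactions / shares (default 10 examples each)
#   python3 linkedin_export_parser_sweeper.py ~/Downloads/Complete_LinkedInDataExport_2025-12-01
#
#   # Interactions older than one year, URLs dumped to a file (type;date;url)
#   python3 linkedin_export_parser_sweeper.py ~/Downloads/Complete_LinkedInDataExport_2025-12-01 \
#       --older-than 365 --export-urls old_interactions.txt
#
#   # Semi-automatic comment triage via Safari (macOS only); decisions persist
#   # in the SQLite database (--db-path, default ~/.linkedin_cleanup.sqlite)
#   python3 linkedin_export_parser_sweeper.py ~/Downloads/Complete_LinkedInDataExport_2025-12-01 \
#       --run-mode safari-comments --limit 0
#
# Deleting the SQLite file resets all stored keep/ignored/deleted decisions;
# init_db() simply recreates it on the next run.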
import argparse
import csv
import sqlite3
import subprocess
import sys
from pathlib import Path
from datetime import datetime, timedelta
# May want to add Votes.csv and Rich_Media.csv
CSV_FILES = {
"comments": ["Comments.csv"],
"reactions": ["Reactions.csv"],
"shares": ["Shares.csv", "InstantReposts.csv"],
}
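# scan_export_dir() below only looks for the filenames above at the top level
# of the export directory, i.e. a layout roughly like this (hypothetical;
# not every file is present in every export):
#
#   Complete_LinkedInDataExport_2025-12-01/
#   ├── Comments.csv
#   ├── Reactions.csv
#   ├── Shares.csv
#   └── InstantReposts.csv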
# ---------- CSV helpers ----------
def pick_first(row, keys):
for k in keys:
if k in row and row[k]:
return row[k]
return None
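# Column names vary between LinkedIn export versions, so guess_url() and
# guess_text() first try a list of known headers, then fall back to a fuzzy
# match on the column name. A hypothetical Comments.csv row they can handle:
#   {"Date": "2024-05-12 09:30:12",
#    "Link": "https://www.linkedin.com/feed/update/urn:li:activity:1234567890/",
#    "Message": "Great post, thanks for sharing!"}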
def guess_url(row, rtype):
url = pick_first(
row,
[
"Permalink",
"URL",
"Url",
"Link",
"Share URL",
"Share Url",
"shareUrl",
"Original URL",
"Original Url",
],
)
if url:
return url
for k, v in row.items():
if v and "url" in k.lower():
return v
return None
def guess_text(row, rtype):
text = pick_first(
row,
[
"Comment",
"Comment Text",
"CommentBody",
"Message",
"Text",
"Share Text",
"Body",
"Title",
"Headline",
"Commentary",
"Share Commentary",
],
)
if text:
return text
for k, v in row.items():
kl = k.lower()
if v and any(
token in kl
for token in ["text", "title", "headline", "comment", "summary", "commentary"]
):
return v
return None
def summarize_row(row, rtype):
time = pick_first(
row,
["Created At", "CreatedAt", "createdAt", "Date", "Timestamp"],
)
url = guess_url(row, rtype)
text = guess_text(row, rtype)
if text and len(text) > 120:
text = text[:120] + "…"
return {
"type": rtype,
"time": time,
"url": url,
"text": text,
}
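# Every CSV row, whatever its original columns, is reduced to the same small
# dict (values illustrative):
#   {"type": "comments", "time": "2024-05-12 09:30:12",
#    "url": "https://www.linkedin.com/feed/update/urn:li:activity:1234567890/",
#    "text": "Great post, thanks for sharing!"}
# Columns that cannot be identified yield None; long text is cut at 120 chars.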
def read_csv(path: Path, rtype: str):
entries = []
with path.open("r", encoding="utf-8-sig", newline="") as f:
reader = csv.DictReader(f)
for row in reader:
entries.append(summarize_row(row, rtype))
return entries
def scan_export_dir(root: Path):
results = {"comments": [], "reactions": [], "shares": []}
for rtype, filenames in CSV_FILES.items():
for name in filenames:
path = root / name
if not path.is_file():
continue
entries = read_csv(path, rtype)
results[rtype].extend(entries)
return results
# ---------- date / filtering helpers ----------
def parse_linkedin_date(s):
if not s:
return None
for fmt in ("%Y-%m-%d %H:%M:%S", "%Y-%m-%d"):
try:
return datetime.strptime(s, fmt)
except ValueError:
pass
return None
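# Examples of the two accepted formats:
#   parse_linkedin_date("2024-05-12 09:30:12") -> datetime(2024, 5, 12, 9, 30, 12)
#   parse_linkedin_date("2024-05-12")          -> datetime(2024, 5, 12, 0, 0)
# Anything else returns None, and filter_older_than() then skips that entry
# (an interaction without a parseable date is never selected for cleanup).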
def filter_older_than(entries, days):
cutoff = datetime.now() - timedelta(days=days)
filtered = []
for e in entries:
d = parse_linkedin_date(e["time"])
if d and d < cutoff:
filtered.append(e)
return filtered
# ---------- SQLite store for actions (deleted / ignored / keep) ----------
def init_db(db_path: Path):
db_path.parent.mkdir(parents=True, exist_ok=True)
conn = sqlite3.connect(str(db_path))
conn.execute(
"""
CREATE TABLE IF NOT EXISTS ignored (
url TEXT PRIMARY KEY,
type TEXT,
added_at TEXT,
note TEXT
)
"""
)
conn.commit()
return conn
def load_ignored_urls(conn):
cur = conn.cursor()
cur.execute("SELECT url FROM ignored")
rows = cur.fetchall()
return {r[0] for r in rows}
def add_ignored(conn, entry, note):
if not entry.get("url"):
return
conn.execute(
"INSERT OR REPLACE INTO ignored(url, type, added_at, note) VALUES (?, ?, ?, ?)",
(
entry["url"],
entry.get("type") or "",
datetime.now().isoformat(timespec="seconds"),
note,
),
)
conn.commit()
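# add_ignored() is the only write path to the database. The note column keeps
# the reason a URL was stored ('keep', 'ignored', or 'deleted'), but main()
# filters on mere presence in the table, so all three states mean the same
# thing operationally: the URL is never proposed again.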
# ---------- main ----------
def main():
parser = argparse.ArgumentParser(
description=(
"Dry-run and semi-automatic cleanup of your LinkedIn public "
"interactions based on the CSV export."
)
)
parser.add_argument(
"export_dir",
type=Path,
help="Directory containing your LinkedIn export (Complete_LinkedInDataExport_*/).",
)
parser.add_argument(
"--limit",
type=int,
default=10,
help="Max number of examples to display per type (0 = all, default: 10).",
)
parser.add_argument(
"--older-than",
type=int,
default=None,
help="Only keep interactions older than N days.",
)
parser.add_argument(
"--export-urls",
type=Path,
default=None,
help="If set, write filtered interactions to this file (type;date;url).",
)
parser.add_argument(
"--run-mode",
choices=["safari-comments"],
default=None,
help="Semi-auto mode: 'safari-comments' opens comment URLs in Safari.",
)
parser.add_argument(
"--db-path",
type=Path,
default=Path("~/.linkedin_cleanup.sqlite"),
help="Path to the SQLite database (default: ~/.linkedin_cleanup.sqlite).",
)
args = parser.parse_args()
if not args.export_dir.is_dir():
print(f"{args.export_dir} is not a directory.")
return
# DB
db_path = args.db_path.expanduser()
conn = init_db(db_path)
ignored_urls = load_ignored_urls(conn)
# Read export
results = scan_export_dir(args.export_dir)
# Age filter
if args.older_than:
for rtype in results:
results[rtype] = filter_older_than(results[rtype], args.older_than)
# Filter against DB (anything already recorded, regardless of note)
for rtype in results:
results[rtype] = [
e for e in results[rtype]
if not (e.get("url") and e["url"] in ignored_urls)
]
# Optional URL export
if args.export_urls:
with args.export_urls.open("w", encoding="utf-8") as f:
for rtype in ["comments", "reactions", "shares"]:
for e in results[rtype]:
if not e["url"]:
continue
date = e["time"] or ""
f.write(f"{rtype};{date};{e['url']}\n")
print(f"Liste des URL exportée dans : {args.export_urls}")
# ---------- run-mode (Safari + DB: deleted/keep/ignored) ----------
if args.run_mode == "safari-comments":
# Must be running on macOS
if sys.platform != "darwin":
print("Error: 'safari-comments' run mode is only supported on macOS.")
print(f"Current platform detected: {sys.platform}")
return
comments = [e for e in results["comments"] if e.get("url")]
if args.limit and args.limit > 0:
comments = comments[: args.limit]
total = len(comments)
if not total:
print("No comments to process in this run mode.")
return
print(f"Run-mode safari-comments: {total} potential comment(s).")
print("For each comment:")
print(" y = open in Safari to potentially delete it")
print(" N = keep it and whitelist it (keep, never proposed again)")
print(" i = ignore permanently (stored as 'ignored', never proposed again)")
print(" q = quit this run mode\n")
for idx, e in enumerate(comments, 1):
print(f"[{idx}/{total}]")
if e["time"]:
print(f" Date : {e['time']}")
print(f" URL : {e['url']}")
if e["text"]:
print(f" Text : {e['text']}")
while True:
ans = input("Open in Safari? [y/N/i/q] ").strip().lower()
if ans in ("", "n"):
add_ignored(conn, e, note="keep")
print(" -> kept and stored as 'keep' (whitelist, not proposed again).\n")
break
elif ans == "i":
add_ignored(conn, e, note="ignored")
print(" -> stored as 'ignored' (not proposed again).\n")
break
elif ans == "q":
print("End of run-mode.")
return
elif ans == "y":
print(" -> opening in Safari…")
subprocess.run(["open", "-a", "Safari", e["url"]])
input("Delete (or not) the comment, then press Enter to continue... ")
while True:
ans2 = input(
"Was the comment deleted? [y/N=keep and never propose again] "
).strip().lower()
if ans2 in ("", "n"):
add_ignored(conn, e, note="keep")
print(" -> kept and stored as 'keep' (whitelist, not proposed again).\n")
break
elif ans2 == "y":
add_ignored(conn, e, note="deleted")
print(" -> stored as 'deleted' in DB (not proposed again).\n")
break
else:
print("Invalid answer (use 'y' or Enter).")
break
else:
print("Invalid choice.")
print("\nRun-mode safari-comments finished.")
return
# ---------- standard dry-run output ----------
total = sum(len(v) for v in results.values())
print("\n=== DRY-RUN SUMMARY ===")
print(f" Comments : {len(results['comments'])}")
print(f" Reactions : {len(results['reactions'])}")
print(f" Shares : {len(results['shares'])}")
print(f" TOTAL : {total}\n")
for rtype, label in [
("comments", "COMMENTS"),
("reactions", "REACTIONS / LIKES"),
("shares", "SHARES / REPOSTS"),
]:
entries = results[rtype]
if not entries:
continue
if args.limit and args.limit > 0:
to_show = entries[: args.limit]
header_suffix = f" (examples, max {args.limit})"
else:
to_show = entries
header_suffix = " (all examples)"
print(f"--- {label}{header_suffix} ---")
for i, e in enumerate(to_show, 1):
print(f"[{i}]")
if e["time"]:
print(f" Date : {e['time']}")
if e["url"]:
print(f" URL : {e['url']}")
if e["text"]:
print(f" Text : {e['text']}")
print()
print("Dry run finished. Nothing was modified on LinkedIn.")
if __name__ == "__main__":
main()