Created
October 19, 2025 18:18
-
-
Save lukacat10/42bb63f2ba3b4b4dcac1c507c9456108 to your computer and use it in GitHub Desktop.
Convert whatsapp exported chat text files to raindrop.io csv format
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| import re | |
| import sys | |
| import csv | |
| import argparse | |
| from pathlib import Path | |
| from datetime import datetime | |
| from zoneinfo import ZoneInfo | |
# Timestamp prefix of a WhatsApp export line. Matches lines like:
#   [19/12/2024, 23:51:30] Name: message ...
#   [19/12/2024, 23:51] Name: message ...
# Group 1 captures the day-first date, group 2 the time (seconds optional).
CREATED_RE = re.compile(
    r'^[^\[]*\['                    # tolerant of junk/BOM before '['
    r'(\d{1,2}/\d{1,2}/\d{4}),\s*'  # group 1: dd/mm/yyyy date
    r'([0-2]?\d:[0-5]?\d'           # group 2 starts: H:MM
    r'(?:[:.][0-5]?\d)?'            # optional :SS or .SS seconds
    r')\]'                          # closing bracket ends group 2
)
# Basic, robust URL finder (http/https); a URL ends at whitespace,
# quotes, or angle brackets.
URL_RE = re.compile(r'https?://[^\s\'"<>]+')
def parse_args():
    """Build and parse the command-line interface.

    Returns:
        argparse.Namespace with attributes: input_file, output_csv,
        created_format, tz, require_timestamp, dedupe.
    """
    parser = argparse.ArgumentParser(
        description="Extract URLs from chat export lines into CSV (url, created, note)."
    )
    parser.add_argument("input_file", help="Input text file")
    parser.add_argument("output_csv", help="Output CSV file")
    parser.add_argument(
        "--created-format",
        choices=["iso", "unix"],
        default="iso",
        help="Format of 'created' column (default: iso)",
    )
    parser.add_argument(
        "--tz",
        default="Asia/Ho_Chi_Minh",
        help="Timezone for interpreting timestamps (IANA name, default: Asia/Ho_Chi_Minh)",
    )
    parser.add_argument(
        "--require-timestamp",
        action="store_true",
        help="Only emit rows when a timestamp was parsed (otherwise skip)",
    )
    parser.add_argument(
        "--dedupe",
        action="store_true",
        help="De-duplicate identical (url, created, note) rows",
    )
    return parser.parse_args()
| def parse_created(created_match: re.Match, tz: ZoneInfo, out_fmt: str) -> str | None: | |
| """ | |
| Convert the matched day-first date & time to ISO 8601 or Unix seconds (string). | |
| Returns None if parsing fails. | |
| """ | |
| try: | |
| date_str = created_match.group(1) # dd/mm/yyyy | |
| time_str = created_match.group(2) # H:MM[:SS] | |
| # Normalize seconds separator if '.' used | |
| time_str = time_str.replace(".", ":") | |
| # Split date | |
| d, m, y = date_str.split("/") | |
| # Ensure two-digit day/month and seconds present for consistent parsing | |
| parts = time_str.split(":") | |
| if len(parts) == 2: | |
| hh, mm = parts | |
| ss = "00" | |
| else: | |
| hh, mm, ss = parts[:3] | |
| # Zero-pad | |
| hh = hh.zfill(2); mm = mm.zfill(2); ss = ss.zfill(2) | |
| d = d.zfill(2); m = m.zfill(2) | |
| dt = datetime(int(y), int(m), int(d), int(hh), int(mm), int(ss), tzinfo=tz) | |
| if out_fmt == "iso": | |
| return dt.isoformat() | |
| else: | |
| return str(int(dt.timestamp())) | |
| except Exception: | |
| return None | |
def main():
    """Read a chat export, extract URLs, and write a (url, created, note) CSV.

    `note` is the full original line the URL appeared on; `created` is the
    parsed timestamp (ISO 8601 or Unix seconds per --created-format), or
    empty when no timestamp could be parsed.
    """
    args = parse_args()
    tz = ZoneInfo(args.tz)

    input_path = Path(args.input_file)
    output_path = Path(args.output_csv)

    total_lines = 0
    lines_with_urls = 0
    timestamp_mismatches = 0
    rows_written = 0
    seen = set()  # (url, created, note) keys, used only with --dedupe
    rows = []

    with input_path.open("r", encoding="utf-8", errors="ignore") as f:
        for raw in f:
            total_lines += 1
            # Drop trailing newline, a possible BOM, and surrounding whitespace.
            line = raw.rstrip("\n").lstrip("\ufeff").strip()
            if not line:
                continue

            m = CREATED_RE.match(line)
            created = None
            if m:
                created = parse_created(m, tz, args.created_format)
                if created is None:
                    timestamp_mismatches += 1
            else:
                # Continuation lines / system messages have no timestamp prefix.
                timestamp_mismatches += 1

            urls = URL_RE.findall(line)
            if urls:
                lines_with_urls += 1
                # If requiring a timestamp, skip if we didn't parse it.
                # BUG FIX: original read `args.require-timestamp`, which Python
                # parses as the subtraction `args.require - timestamp` and
                # raises AttributeError at runtime; argparse stores the
                # `--require-timestamp` flag as `args.require_timestamp`.
                if args.require_timestamp and not created:
                    continue
                for u in urls:
                    row = {"url": u, "created": created or "", "note": line}
                    if args.dedupe:
                        key = (row["url"], row["created"], row["note"])
                        if key in seen:
                            continue
                        seen.add(key)
                    rows.append(row)
                    rows_written += 1

    with output_path.open("w", newline="", encoding="utf-8") as out:
        writer = csv.DictWriter(out, fieldnames=["url", "created", "note"])
        writer.writeheader()
        writer.writerows(rows)

    print(f"✅ Wrote {rows_written} rows to {output_path}")
    print(f"Total lines: {total_lines}")
    print(f"Lines with URLs: {lines_with_urls}")
    print(f"Timestamp mismatches: {timestamp_mismatches}")
    if args.created_format == "iso":
        print("Created format: ISO 8601 (e.g., 2024-12-19T23:51:30+07:00)")
    else:
        print("Created format: Unix seconds (e.g., 1734636690)")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment