Skip to content

Instantly share code, notes, and snippets.

@lukacat10
Created October 19, 2025 18:18
Show Gist options
  • Select an option

  • Save lukacat10/42bb63f2ba3b4b4dcac1c507c9456108 to your computer and use it in GitHub Desktop.

Select an option

Save lukacat10/42bb63f2ba3b4b4dcac1c507c9456108 to your computer and use it in GitHub Desktop.
Convert WhatsApp exported chat text files to raindrop.io CSV format
#!/usr/bin/env python3
import re
import sys
import csv
import argparse
from pathlib import Path
from datetime import datetime
from zoneinfo import ZoneInfo
# Leading bracketed timestamp of an exported chat line, e.g.:
#   [19/12/2024, 23:51:30] Name: message ...
#   [19/12/2024, 23:51] Name: message ...
# Group 1 captures the dd/mm/yyyy date, group 2 the H:MM[:SS] time.
CREATED_RE = re.compile(
    r"^[^\[]*\["                              # any non-'[' junk (e.g. a BOM), then '['
    r"(\d{1,2}/\d{1,2}/\d{4}),\s*"            # day-first date followed by a comma
    r"([0-2]?\d:[0-5]?\d(?:[:.][0-5]?\d)?)"   # H:MM with optional :SS or .SS
    r"\]"                                     # closing bracket
)

# http/https URLs; a URL ends at whitespace, a quote, or an angle bracket.
URL_RE = re.compile(r'https?://[^\s\'"<>]+')
def parse_args():
p = argparse.ArgumentParser(
description="Extract URLs from chat export lines into CSV (url, created, note)."
)
p.add_argument("input_file", help="Input text file")
p.add_argument("output_csv", help="Output CSV file")
p.add_argument(
"--created-format",
choices=["iso", "unix"],
default="iso",
help="Format of 'created' column (default: iso)"
)
p.add_argument(
"--tz",
default="Asia/Ho_Chi_Minh",
help="Timezone for interpreting timestamps (IANA name, default: Asia/Ho_Chi_Minh)"
)
p.add_argument(
"--require-timestamp",
action="store_true",
help="Only emit rows when a timestamp was parsed (otherwise skip)"
)
p.add_argument(
"--dedupe",
action="store_true",
help="De-duplicate identical (url, created, note) rows"
)
return p.parse_args()
def parse_created(created_match: re.Match, tz: ZoneInfo, out_fmt: str) -> str | None:
"""
Convert the matched day-first date & time to ISO 8601 or Unix seconds (string).
Returns None if parsing fails.
"""
try:
date_str = created_match.group(1) # dd/mm/yyyy
time_str = created_match.group(2) # H:MM[:SS]
# Normalize seconds separator if '.' used
time_str = time_str.replace(".", ":")
# Split date
d, m, y = date_str.split("/")
# Ensure two-digit day/month and seconds present for consistent parsing
parts = time_str.split(":")
if len(parts) == 2:
hh, mm = parts
ss = "00"
else:
hh, mm, ss = parts[:3]
# Zero-pad
hh = hh.zfill(2); mm = mm.zfill(2); ss = ss.zfill(2)
d = d.zfill(2); m = m.zfill(2)
dt = datetime(int(y), int(m), int(d), int(hh), int(mm), int(ss), tzinfo=tz)
if out_fmt == "iso":
return dt.isoformat()
else:
return str(int(dt.timestamp()))
except Exception:
return None
def main():
    """
    Extract every URL from a chat export and write the results to a CSV file.

    Reads the input file line by line, tries to parse a leading
    ``[dd/mm/yyyy, H:MM[:SS]]`` timestamp from each non-empty line, and emits
    one ``(url, created, note)`` row per URL found on that line. ``note`` is
    the whole stripped line and ``created`` is empty when no timestamp could
    be parsed. A short summary is printed to stdout at the end.
    """
    args = parse_args()
    tz = ZoneInfo(args.tz)
    input_path = Path(args.input_file)
    output_path = Path(args.output_csv)

    total_lines = 0
    lines_with_urls = 0
    timestamp_mismatches = 0
    seen = set()
    rows = []

    with input_path.open("r", encoding="utf-8", errors="ignore") as f:
        for raw in f:
            total_lines += 1
            line = raw.rstrip("\n").lstrip("\ufeff").strip()
            if not line:
                continue

            # A line counts as a mismatch when it has no timestamp prefix or
            # when the prefix does not form a valid date/time.
            m = CREATED_RE.match(line)
            created = parse_created(m, tz, args.created_format) if m else None
            if created is None:
                timestamp_mismatches += 1

            urls = URL_RE.findall(line)
            if not urls:
                continue
            lines_with_urls += 1

            # argparse stores "--require-timestamp" as `require_timestamp`;
            # `args.require-timestamp` would be parsed as a subtraction and
            # raise at runtime.
            if args.require_timestamp and not created:
                continue

            for u in urls:
                row = {"url": u, "created": created or "", "note": line}
                if args.dedupe:
                    key = (row["url"], row["created"], row["note"])
                    if key in seen:
                        continue
                    seen.add(key)
                rows.append(row)

    with output_path.open("w", newline="", encoding="utf-8") as out:
        writer = csv.DictWriter(out, fieldnames=["url", "created", "note"])
        writer.writeheader()
        writer.writerows(rows)

    rows_written = len(rows)
    print(f"✅ Wrote {rows_written} rows to {output_path}")
    print(f"Total lines: {total_lines}")
    print(f"Lines with URLs: {lines_with_urls}")
    print(f"Timestamp mismatches: {timestamp_mismatches}")
    if args.created_format == "iso":
        print("Created format: ISO 8601 (e.g., 2024-12-19T23:51:30+07:00)")
    else:
        print("Created format: Unix seconds (e.g., 1734636690)")
# Run the converter only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment