Created
October 19, 2025 18:18
-
-
Save lukacat10/42bb63f2ba3b4b4dcac1c507c9456108 to your computer and use it in GitHub Desktop.
Convert whatsapp exported chat text files to raindrop.io csv format
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| import re | |
| import sys | |
| import csv | |
| import argparse | |
| from pathlib import Path | |
| from datetime import datetime | |
| from zoneinfo import ZoneInfo | |
# Timestamp prefix of a WhatsApp export line. Matches lines like:
#   [19/12/2024, 23:51:30] Name: message ...
#   [19/12/2024, 23:51] Name: message ...
# Group 1 captures the day-first date, group 2 the time (seconds optional).
CREATED_RE = re.compile(
    r'^[^\[]*\['                    # tolerant of junk/BOM before '['
    r'(\d{1,2}/\d{1,2}/\d{4}),\s*'  # group 1: dd/mm/yyyy date
    r'([0-2]?\d:[0-5]?\d'           # group 2 starts: H:MM
    r'(?:[:.][0-5]?\d)?'            # optional :SS or .SS seconds
    r')\]'                          # closing bracket ends group 2
)
# Basic, robust URL finder (http/https); a URL ends at whitespace,
# quotes, or angle brackets.
URL_RE = re.compile(r'https?://[^\s\'"<>]+')
def parse_args():
    """Build and parse the command-line interface.

    Returns:
        argparse.Namespace with attributes: input_file, output_csv,
        created_format, tz, require_timestamp, dedupe.
    """
    parser = argparse.ArgumentParser(
        description="Extract URLs from chat export lines into CSV (url, created, note)."
    )
    parser.add_argument("input_file", help="Input text file")
    parser.add_argument("output_csv", help="Output CSV file")
    parser.add_argument(
        "--created-format",
        choices=["iso", "unix"],
        default="iso",
        help="Format of 'created' column (default: iso)",
    )
    parser.add_argument(
        "--tz",
        default="Asia/Ho_Chi_Minh",
        help="Timezone for interpreting timestamps (IANA name, default: Asia/Ho_Chi_Minh)",
    )
    parser.add_argument(
        "--require-timestamp",
        action="store_true",
        help="Only emit rows when a timestamp was parsed (otherwise skip)",
    )
    parser.add_argument(
        "--dedupe",
        action="store_true",
        help="De-duplicate identical (url, created, note) rows",
    )
    return parser.parse_args()
| def parse_created(created_match: re.Match, tz: ZoneInfo, out_fmt: str) -> str | None: | |
| """ | |
| Convert the matched day-first date & time to ISO 8601 or Unix seconds (string). | |
| Returns None if parsing fails. | |
| """ | |
| try: | |
| date_str = created_match.group(1) # dd/mm/yyyy | |
| time_str = created_match.group(2) # H:MM[:SS] | |
| # Normalize seconds separator if '.' used | |
| time_str = time_str.replace(".", ":") | |
| # Split date | |
| d, m, y = date_str.split("/") | |
| # Ensure two-digit day/month and seconds present for consistent parsing | |
| parts = time_str.split(":") | |
| if len(parts) == 2: | |
| hh, mm = parts | |
| ss = "00" | |
| else: | |
| hh, mm, ss = parts[:3] | |
| # Zero-pad | |
| hh = hh.zfill(2); mm = mm.zfill(2); ss = ss.zfill(2) | |
| d = d.zfill(2); m = m.zfill(2) | |
| dt = datetime(int(y), int(m), int(d), int(hh), int(mm), int(ss), tzinfo=tz) | |
| if out_fmt == "iso": | |
| return dt.isoformat() | |
| else: | |
| return str(int(dt.timestamp())) | |
| except Exception: | |
| return None | |
def main():
    """Read a chat export, extract URLs, and write a (url, created, note) CSV.

    `note` is the full original line the URL appeared on; `created` is the
    parsed timestamp (ISO 8601 or Unix seconds per --created-format), or
    empty when no timestamp could be parsed.
    """
    args = parse_args()
    tz = ZoneInfo(args.tz)

    input_path = Path(args.input_file)
    output_path = Path(args.output_csv)

    total_lines = 0
    lines_with_urls = 0
    timestamp_mismatches = 0
    rows_written = 0
    seen = set()  # (url, created, note) keys, used only with --dedupe
    rows = []

    with input_path.open("r", encoding="utf-8", errors="ignore") as f:
        for raw in f:
            total_lines += 1
            # Drop trailing newline, a possible BOM, and surrounding whitespace.
            line = raw.rstrip("\n").lstrip("\ufeff").strip()
            if not line:
                continue

            m = CREATED_RE.match(line)
            created = None
            if m:
                created = parse_created(m, tz, args.created_format)
                if created is None:
                    timestamp_mismatches += 1
            else:
                # Continuation lines / system messages have no timestamp prefix.
                timestamp_mismatches += 1

            urls = URL_RE.findall(line)
            if urls:
                lines_with_urls += 1
                # If requiring a timestamp, skip if we didn't parse it.
                # BUG FIX: original read `args.require-timestamp`, which Python
                # parses as the subtraction `args.require - timestamp` and
                # raises AttributeError at runtime; argparse stores the
                # `--require-timestamp` flag as `args.require_timestamp`.
                if args.require_timestamp and not created:
                    continue
                for u in urls:
                    row = {"url": u, "created": created or "", "note": line}
                    if args.dedupe:
                        key = (row["url"], row["created"], row["note"])
                        if key in seen:
                            continue
                        seen.add(key)
                    rows.append(row)
                    rows_written += 1

    with output_path.open("w", newline="", encoding="utf-8") as out:
        writer = csv.DictWriter(out, fieldnames=["url", "created", "note"])
        writer.writeheader()
        writer.writerows(rows)

    print(f"✅ Wrote {rows_written} rows to {output_path}")
    print(f"Total lines: {total_lines}")
    print(f"Lines with URLs: {lines_with_urls}")
    print(f"Timestamp mismatches: {timestamp_mismatches}")
    if args.created_format == "iso":
        print("Created format: ISO 8601 (e.g., 2024-12-19T23:51:30+07:00)")
    else:
        print("Created format: Unix seconds (e.g., 1734636690)")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment