florianm/create_test_data.py

## create_test_data.py
"""Utilities to generate test data."""

import argparse
import csv
import hashlib
import random
from datetime import datetime, timedelta
from pathlib import Path

import yaml


def random_string(max_len):
    """Build a random string of ASCII letters and spaces.

    The length is drawn uniformly from 1 to ``max_len`` (inclusive); each
    character is then sampled with replacement from the lowercase letters,
    the uppercase letters and the space character.
    """
    alphabet = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ "
    size = random.randint(1, max_len)
    picked = random.choices(alphabet, k=size)
    return "".join(picked)


def random_integer(min_val, max_val):
    """Draw an integer uniformly from ``min_val`` to ``max_val``, both inclusive."""
    drawn = random.randint(min_val, max_val)
    return drawn


def random_number(min_val, max_val):
    """Return a random number with one decimal place.

    The value is drawn uniformly from the multiples of 0.1 that lie between
    ``min_val`` and ``max_val`` (both inclusive).

    Args:
        min_val (int | float): Lower bound.
        max_val (int | float): Upper bound.

    Returns:
        float: A multiple of 0.1 within the bounds.

    Raises:
        ValueError: If no multiple of 0.1 lies within the bounds.
    """
    import math

    # random.randint() only accepts integers, so fractional bounds such as
    # 0.25 are snapped inwards to the nearest whole tenth. round() absorbs
    # binary floating-point noise in the scaled bound before snapping.
    low = math.ceil(round(10 * min_val, 6))
    high = math.floor(round(10 * max_val, 6))
    return random.randint(low, high) / 10


def random_date(min_date, max_date, date_format):
    """Pick a random day between two dates and format it as text.

    Both bounds are converted with ``str()`` and parsed as ``YYYY-MM-DD``.
    A whole number of days between zero and the distance separating the
    bounds is added to the lower bound. The placeholder format ``"Y-m-d"``
    is rendered as ``%Y-%m-%d``; any other ``date_format`` is handed to
    ``strftime`` as given.
    """
    iso_pattern = "%Y-%m-%d"
    start = datetime.strptime(str(min_date), iso_pattern)
    end = datetime.strptime(str(max_date), iso_pattern)
    span_days = (end - start).days
    picked = start + timedelta(days=random.randint(0, span_days))
    if date_format == "Y-m-d":
        return picked.strftime(iso_pattern)
    return picked.strftime(date_format)


def random_md5():
    """Return the hex MD5 digest of the text form of a random float."""
    seed_text = str(random.random())
    digest = hashlib.md5(seed_text.encode())
    return digest.hexdigest()


def random_bool():
    """Pick ``True`` or ``False`` at random."""
    outcomes = [True, False]
    return random.choice(outcomes)


def dirty_value(colspec):
    """Generate a value that intentionally fails the schema.

    Args:
        colspec (dict): A field specification. ``type`` is required; for
            string fields ``max`` is read as the maximum length (default 10).

    Returns:
        str: For ``integer``, ``date`` and ``md5`` fields a fixed text that
        cannot be parsed as that type; for ``string`` fields a random string
        that is always longer than ``max``; for every other type ``"DIRTY"``.
    """
    kind = colspec["type"]
    if kind == "string":
        # A string of random length up to max + 100 would frequently be
        # short enough to be valid, so the length is forced to exceed the
        # limit by 1 to 100 characters.
        alphabet = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ "
        length = colspec.get("max", 10) + random.randint(1, 100)
        return "".join(random.choices(alphabet, k=length))
    placeholders = {
        "integer": "not_an_int",
        "date": "not_a_date",
        "md5": "not_a_md5",
    }
    return placeholders.get(kind, "DIRTY")


def _clean_value(colspec):
    """Return a random value that conforms to the given field specification."""
    kind = colspec["type"]
    if kind == "md5":
        return random_md5()
    if kind == "string":
        return random_string(colspec.get("max", 100))
    if kind == "integer":
        return random_integer(colspec.get("min", 0), colspec.get("max", 1000))
    if kind == "number":
        return random_number(colspec.get("min", 0), colspec.get("max", 1000))
    if kind == "date":
        return random_date(
            colspec.get("min", "2000-01-01"),
            colspec.get("max", "2025-01-01"),
            colspec.get("format", "Y-m-d"),
        )
    if kind == "boolean":
        return random_bool()
    # Types without a generator produce an empty cell.
    return ""


def generate_test_csv(spec, add_dirt=False, nrow=1000, dest="test_data.csv"):
    """
    Generate a CSV of test data.

    This function generates a CSV file with test data containing
    the specified number of rows and columns as per specification.
    The header row holds the field names in schema order. Missing parent
    directories of ``dest`` are created.

    Args:
        spec (path): A path to a file with a frictionless data schema.
        add_dirt (bool): Whether to add values failing the schema. When set,
          each cell is replaced by a dirty value with probability 0.05.
          Default: False
        nrow (int): The number of rows to generate. Default: 1000
        dest (path): A path to write the resulting CSV to.

    Returns:
        The ``dest`` argument, unchanged.

    """
    spec_path = Path(spec)
    dest_path = Path(dest)

    # Read the frictionless data YAML schema; an empty file loads as None.
    with spec_path.open(encoding="utf-8") as yml_file:
        schema = yaml.safe_load(yml_file) or {}

    # Extract fields from the schema
    fields = schema.get("fields", [])
    columns = [field["name"] for field in fields]

    # Path.parent is a property and must be read from a Path instance.
    dest_path.parent.mkdir(parents=True, exist_ok=True)

    with dest_path.open("w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(columns)
        for _ in range(nrow):
            row = []
            for colspec in fields:
                # The dirt roll is only made when add_dirt is enabled.
                if add_dirt and random.random() < 0.05:
                    row.append(dirty_value(colspec))
                else:
                    row.append(_clean_value(colspec))
            writer.writerow(row)
    return dest


if __name__ == "__main__":
    # Command-line entry point: parse the options, generate the CSV and
    # report where it was written.
    parser = argparse.ArgumentParser(
        description="Generate test CSV data based on a frictionless data schema.",
    )
    parser.add_argument(
        "--spec",
        type=str,
        default="data/test_data.yml",
        help="Path to the frictionless data schema YAML file, annotated with ranges.",
    )
    parser.add_argument(
        "--add-dirt",
        action="store_true",
        help="Add dirty (invalid) values to the data.",
    )
    parser.add_argument(
        "--nrow",
        type=int,
        default=1000,
        help="Number of rows to generate.",
    )
    parser.add_argument(
        "--dest",
        type=str,
        # Must name a file: the former default "data" is the directory that
        # holds the default spec and cannot be opened for writing.
        default="data/test_data.csv",
        help="Destination file path for the generated CSV.",
    )

    args = parser.parse_args()

    csv_path = generate_test_csv(
        spec=args.spec,
        add_dirt=args.add_dirt,
        nrow=args.nrow,
        dest=args.dest,
    )
    print(f"Test data CSV generated at: {csv_path}")

## test_data.yml
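# Frictionless-style Table Schema covering the data types that
# create_test_data.py can generate. The `min`/`max` keys are generator
# annotations (value range; maximum length for strings) and `md5` is a
# custom type; they are not part of the standard Frictionless Table Schema.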
---
fields:
  - name: id
    type: md5
    description: Unique identifier (md5 hash)
  - name: integer_col
    type: integer
    description: Example integer value
    min: 10
    max: 1000
  - name: number_col
    type: number
    description: Example number (float) value
    min: 10
    max: 1000
  - name: boolean_col
    type: boolean
    description: Example boolean value
  - name: date_col
    type: date
    description: Example date value (YYYY-MM-DD)
    min: 2000-01-01
    max: 2026-01-01
    format: Y-m-d
  - name: string_col
    type: string
    description: Example string value
    max: 100
    # - name: time_col
    #   type: time
    #   description: Example time value (HH:MM:SS)
    # - name: datetime_col
    #   type: datetime
    #   description: Example datetime value (ISO 8601)
    # - name: year_col
    #   type: year
    #   description: Example year value (e.g., 2024)
    # - name: yearmonth_col
    #   type: yearmonth
    #   description: Example year and month value (YYYY-MM)
    # - name: duration_col
    #   type: duration
    #   description: Example duration value (PnYnMnDTnHnMnS)
    # - name: geopoint_col
    #   type: geopoint
    #   description: Example geopoint value (lat,lon)
    # - name: geojson_col
    #   type: geojson
    #   description: Example GeoJSON object
    # - name: object_col
    #   type: object
    #   description: Example object (JSON)
    # - name: array_col
    #   type: array
    #   description: Example array (list of values)
primaryKey: id
	"""Utilities to generate test data."""

	import argparse
	import csv
	import hashlib
	import random
	from datetime import datetime, timedelta
	from pathlib import Path

	import yaml


	def random_string(max_len):
	    """Return a random string of ASCII letters and spaces.

	    The length is drawn uniformly from 1 to ``max_len`` (inclusive).
	    """
	    length = random.randint(1, max_len)
	    return "".join(
	        random.choices(
	            "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ ",
	            k=length,
	        ),
	    )


	def random_integer(min_val, max_val):
	    """Return a random integer from ``min_val`` to ``max_val``, both inclusive."""
	    return random.randint(min_val, max_val)


	def random_number(min_val, max_val):
	    """Return a random number with one decimal place.

	    The value is drawn uniformly from the multiples of 0.1 that lie
	    between ``min_val`` and ``max_val`` (both inclusive).

	    Raises:
	        ValueError: If no multiple of 0.1 lies within the bounds.
	    """
	    import math

	    # random.randint() only accepts integers, so fractional bounds are
	    # snapped inwards to the nearest whole tenth. round() absorbs binary
	    # floating-point noise in the scaled bound before snapping.
	    low = math.ceil(round(10 * min_val, 6))
	    high = math.floor(round(10 * max_val, 6))
	    return random.randint(low, high) / 10


	def random_date(min_date, max_date, date_format):
	    """Return a random date between two ``YYYY-MM-DD`` bounds as text.

	    The placeholder format ``"Y-m-d"`` is rendered as ``%Y-%m-%d``; any
	    other ``date_format`` is passed to ``strftime`` as given.
	    """
	    min_dt = datetime.strptime(str(min_date), "%Y-%m-%d")
	    max_dt = datetime.strptime(str(max_date), "%Y-%m-%d")
	    delta = max_dt - min_dt
	    random_days = random.randint(0, delta.days)
	    dt = min_dt + timedelta(days=random_days)
	    return dt.strftime("%Y-%m-%d" if date_format == "Y-m-d" else date_format)


	def random_md5():
	    """Return the hex MD5 digest of the text form of a random float."""
	    return hashlib.md5(str(random.random()).encode()).hexdigest()


	def random_bool():
	    """Return a random boolean value."""
	    return random.choice([True, False])


	def dirty_value(colspec):
	    """Generate a value that intentionally fails the schema.

	    Args:
	        colspec (dict): A field specification. ``type`` is required; for
	            string fields ``max`` is read as the maximum length
	            (default 10).

	    Returns:
	        str: A fixed unparseable text for ``integer``, ``date`` and
	        ``md5`` fields, a random string always longer than ``max`` for
	        ``string`` fields, and ``"DIRTY"`` for every other type.
	    """
	    t = colspec["type"]
	    if t == "integer":
	        return "not_an_int"
	    if t == "date":
	        return "not_a_date"
	    if t == "md5":
	        return "not_a_md5"
	    if t == "string":
	        # A string of random length up to max + 100 would frequently be
	        # short enough to be valid, so the length is forced to exceed
	        # the limit by 1 to 100 characters.
	        alphabet = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ "
	        length = colspec.get("max", 10) + random.randint(1, 100)
	        return "".join(random.choices(alphabet, k=length))
	    return "DIRTY"


	def generate_test_csv(spec, add_dirt=False, nrow=1000, dest="test_data.csv"):
	    """
	    Generate a CSV of test data.

	    This function generates a CSV file with test data containing
	    the specified number of rows and columns as per specification.
	    The header row holds the field names in schema order. Missing parent
	    directories of ``dest`` are created.

	    Args:
	        spec (path): A path to a file with a frictionless data schema.
	        add_dirt (bool): Whether to add values failing the schema. When
	          set, each cell is replaced by a dirty value with probability
	          0.05. Default: False
	        nrow (int): The number of rows to generate. Default: 1000
	        dest (path): A path to write the resulting CSV to.

	    Returns:
	        The ``dest`` argument, unchanged.

	    """
	    spec_path = Path(spec)
	    dest_path = Path(dest)

	    # Read the frictionless data YAML schema; an empty file loads as None.
	    with spec_path.open(encoding="utf-8") as yml_file:
	        schema = yaml.safe_load(yml_file) or {}

	    # Extract fields from the schema
	    fields = schema.get("fields", [])
	    columns = [field["name"] for field in fields]
	    colspecs = fields

	    # Path.parent is a property and must be read from a Path instance.
	    dest_path.parent.mkdir(parents=True, exist_ok=True)

	    with dest_path.open("w", newline="", encoding="utf-8") as f:
	        writer = csv.writer(f)
	        writer.writerow(columns)
	        for _ in range(nrow):
	            row = []
	            for colspec in colspecs:
	                if add_dirt and random.random() < 0.05:
	                    val = dirty_value(colspec)
	                else:
	                    t = colspec["type"]
	                    if t == "md5":
	                        val = random_md5()
	                    elif t == "string":
	                        val = random_string(colspec.get("max", 100))
	                    elif t == "integer":
	                        val = random_integer(
	                            colspec.get("min", 0),
	                            colspec.get("max", 1000),
	                        )
	                    elif t == "number":
	                        val = random_number(
	                            colspec.get("min", 0),
	                            colspec.get("max", 1000),
	                        )
	                    elif t == "date":
	                        val = random_date(
	                            colspec.get("min", "2000-01-01"),
	                            colspec.get("max", "2025-01-01"),
	                            colspec.get("format", "Y-m-d"),
	                        )
	                    elif t == "boolean":
	                        val = random_bool()
	                    else:
	                        # Types without a generator produce an empty cell.
	                        val = ""
	                row.append(val)
	            writer.writerow(row)
	    return dest


	if __name__ == "__main__":
	    # Command-line entry point: parse the options, generate the CSV and
	    # report where it was written.
	    parser = argparse.ArgumentParser(
	        description="Generate test CSV data based on a frictionless data schema.",
	    )
	    parser.add_argument(
	        "--spec",
	        type=str,
	        default="data/test_data.yml",
	        help="Path to the frictionless data schema YAML file, annotated with ranges.",
	    )
	    parser.add_argument(
	        "--add-dirt",
	        action="store_true",
	        help="Add dirty (invalid) values to the data.",
	    )
	    parser.add_argument(
	        "--nrow",
	        type=int,
	        default=1000,
	        help="Number of rows to generate.",
	    )
	    parser.add_argument(
	        "--dest",
	        type=str,
	        # Must name a file: the former default "data" is the directory
	        # that holds the default spec and cannot be opened for writing.
	        default="data/test_data.csv",
	        help="Destination file path for the generated CSV.",
	    )

	    args = parser.parse_args()

	    csv_path = generate_test_csv(
	        spec=args.spec,
	        add_dirt=args.add_dirt,
	        nrow=args.nrow,
	        dest=args.dest,
	    )
	    print(f"Test data CSV generated at: {csv_path}")
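	# Frictionless-style Table Schema covering the data types that
	# create_test_data.py can generate. The `min`/`max` keys are generator
	# annotations (value range; maximum length for strings) and `md5` is a
	# custom type; they are not part of the standard Frictionless Table Schema.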
	---
	fields:
	  - name: id
	    type: md5
	    description: Unique identifier (md5 hash)
	  - name: integer_col
	    type: integer
	    description: Example integer value
	    min: 10
	    max: 1000
	  - name: number_col
	    type: number
	    description: Example number (float) value
	    min: 10
	    max: 1000
	  - name: boolean_col
	    type: boolean
	    description: Example boolean value
	  - name: date_col
	    type: date
	    description: Example date value (YYYY-MM-DD)
	    min: 2000-01-01
	    max: 2026-01-01
	    format: Y-m-d
	  - name: string_col
	    type: string
	    description: Example string value
	    max: 100
	# - name: time_col
	# type: time
	# description: Example time value (HH:MM:SS)
	# - name: datetime_col
	# type: datetime
	# description: Example datetime value (ISO 8601)
	# - name: year_col
	# type: year
	# description: Example year value (e.g., 2024)
	# - name: yearmonth_col
	# type: yearmonth
	# description: Example year and month value (YYYY-MM)
	# - name: duration_col
	# type: duration
	# description: Example duration value (PnYnMnDTnHnMnS)
	# - name: geopoint_col
	# type: geopoint
	# description: Example geopoint value (lat,lon)
	# - name: geojson_col
	# type: geojson
	# description: Example GeoJSON object
	# - name: object_col
	# type: object
	# description: Example object (JSON)
	# - name: array_col
	# type: array
	# description: Example array (list of values)
	primaryKey: id