Skip to content

Instantly share code, notes, and snippets.

@florianm
Created September 10, 2025 09:37
Show Gist options
  • Select an option

  • Save florianm/565d462221ac9b8a7305cfa4f6347d33 to your computer and use it in GitHub Desktop.

Select an option

Save florianm/565d462221ac9b8a7305cfa4f6347d33 to your computer and use it in GitHub Desktop.
"""Utilities to generate test data."""
import argparse
import csv
import hashlib
import math
import random
from datetime import datetime, timedelta
from pathlib import Path

import yaml
def random_string(max_len):
    """Build a random string of ASCII letters and spaces.

    Args:
        max_len: Upper bound (inclusive) on the length; the lower bound is 1.

    Returns:
        A string whose length is drawn uniformly from ``1..max_len``.
    """
    alphabet = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ "
    size = random.randint(1, max_len)
    picked = random.choices(alphabet, k=size)
    return "".join(picked)
def random_integer(min_val, max_val):
    """Draw a random integer from the closed interval ``[min_val, max_val]``."""
    drawn = random.randint(min_val, max_val)
    return drawn
def random_number(min_val, max_val):
    """Return a random number with one decimal place in ``[min_val, max_val]``.

    The bounds are scaled by ten and snapped inward to whole tenths, so float
    bounds (e.g. ``min: 0.5`` in a schema) work as well as integer ones.
    Integer bounds produce exactly the same ``random.randint`` call as before.

    Args:
        min_val: Lower bound (int or float), inclusive.
        max_val: Upper bound (int or float), inclusive.

    Returns:
        A float that is a multiple of 0.1 within the bounds.

    Raises:
        ValueError: If no multiple of 0.1 lies between the bounds
            (raised by ``random.randint`` on an empty range).
    """
    # round() absorbs binary floating-point noise in the scaled bound before
    # it is snapped to an integer number of tenths.
    low_tenths = math.ceil(round(10 * min_val, 6))
    high_tenths = math.floor(round(10 * max_val, 6))
    return random.randint(low_tenths, high_tenths) / 10
def random_date(min_date, max_date, date_format):
    """Return a random date between two bounds as a formatted string.

    Args:
        min_date: Earliest date (inclusive). Anything whose ``str()`` is in
            ``YYYY-MM-DD`` form, e.g. a string or a ``datetime.date``.
        max_date: Latest date (inclusive), same accepted forms as ``min_date``.
        date_format: Output format. ``"Y-m-d"`` and the Frictionless keywords
            ``"default"`` and ``"any"`` produce ISO ``YYYY-MM-DD``; any other
            value is used as a ``strftime`` pattern.

    Returns:
        The chosen date rendered as a string.

    Raises:
        ValueError: If a bound is not in ``YYYY-MM-DD`` form, or if
            ``max_date`` is earlier than ``min_date``.
    """
    lower = datetime.strptime(str(min_date), "%Y-%m-%d")
    upper = datetime.strptime(str(max_date), "%Y-%m-%d")
    offset_days = random.randint(0, (upper - lower).days)
    picked = lower + timedelta(days=offset_days)
    # "default"/"any" are keywords, not patterns: passing them to strftime
    # would emit the keyword itself instead of a date.
    if date_format in ("Y-m-d", "default", "any"):
        return picked.strftime("%Y-%m-%d")
    return picked.strftime(date_format)
def random_md5():
    """Produce the hex MD5 digest of a freshly drawn random float."""
    seed_text = str(random.random())
    digest = hashlib.md5(seed_text.encode())
    return digest.hexdigest()
def random_bool():
    """Pick ``True`` or ``False`` at random."""
    outcomes = [True, False]
    return random.choice(outcomes)
def dirty_value(colspec):
    """Generate a value that intentionally fails the schema.

    Args:
        colspec (dict): One field of the schema. ``type`` is required; for
            ``string`` fields the optional ``max`` is the longest valid length
            (100 when absent, the same fallback the generator uses).

    Returns:
        str: A fixed marker for ``integer``, ``date`` and ``md5`` fields, a
        string 1 to 100 characters longer than ``max`` for ``string`` fields,
        and ``"DIRTY"`` for every other type.
    """
    col_type = colspec["type"]
    markers = {
        "integer": "not_an_int",
        "date": "not_a_date",
        "md5": "not_a_md5",
    }
    if col_type in markers:
        return markers[col_type]
    if col_type == "string":
        limit = colspec.get("max", 100)
        # Start at limit + 1 so the string always exceeds the allowed
        # length; a length drawn from 1 upwards would usually be valid.
        length = random.randint(limit + 1, limit + 100)
        return "".join(
            random.choices(
                "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ ",
                k=length,
            ),
        )
    return "DIRTY"
def _random_value(colspec):
    """Return one random, schema-conforming value for the field ``colspec``."""
    col_type = colspec["type"]
    if col_type == "md5":
        return random_md5()
    if col_type == "string":
        return random_string(colspec.get("max", 100))
    if col_type == "integer":
        return random_integer(colspec.get("min", 0), colspec.get("max", 1000))
    if col_type == "number":
        return random_number(colspec.get("min", 0), colspec.get("max", 1000))
    if col_type == "date":
        return random_date(
            colspec.get("min", "2000-01-01"),
            colspec.get("max", "2025-01-01"),
            colspec.get("format", "Y-m-d"),
        )
    if col_type == "boolean":
        return random_bool()
    # Field types without a generator are written as empty cells.
    return ""


def generate_test_csv(spec, add_dirt=False, nrow=1000, dest="test_data.csv"):
    """
    Generate a CSV of test data.

    This function generates a CSV file with test data containing
    the specified number of rows and columns as per specification.

    Args:
        spec (path): A path to a YAML file with a frictionless data schema.
            Each entry under ``fields`` needs ``name`` and ``type``; optional
            ``min``, ``max`` and ``format`` keys steer the generated values.
        add_dirt (bool): Whether to add values failing the schema. When set,
            each cell is replaced by a dirty value with probability 0.05.
            Default: False
        nrow (int): The number of rows to generate. Default: 1000
        dest (path): A path to write the resulting CSV to. Missing parent
            directories are created.

    Returns:
        The ``dest`` argument, unchanged.
    """
    # Read the frictionless data YAML schema; an empty file loads as None.
    with Path(spec).open(encoding="utf-8") as yml_file:
        schema = yaml.safe_load(yml_file) or {}

    # Extract fields from the schema
    fields = schema.get("fields", [])
    columns = [field["name"] for field in fields]

    # Path.parent is a property of a Path instance, so build the Path first.
    dest_path = Path(dest)
    dest_path.parent.mkdir(parents=True, exist_ok=True)

    with dest_path.open("w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(columns)
        for _ in range(nrow):
            row = [
                dirty_value(colspec)
                if add_dirt and random.random() < 0.05
                else _random_value(colspec)
                for colspec in fields
            ]
            writer.writerow(row)
    return dest
if __name__ == "__main__":
    # Command-line entry point: parse the options, generate the CSV, and
    # report where it was written.
    parser = argparse.ArgumentParser(
        description="Generate test CSV data based on a frictionless data schema.",
    )
    parser.add_argument(
        "--spec",
        type=str,
        default="data/test_data.yml",
        help="Path to the frictionless data schema YAML file, annotated with ranges.",
    )
    parser.add_argument(
        "--add-dirt",
        action="store_true",
        help="Add dirty (invalid) values to the data.",
    )
    parser.add_argument(
        "--nrow",
        type=int,
        default=1000,
        help="Number of rows to generate.",
    )
    parser.add_argument(
        "--dest",
        type=str,
        # Must name a file: the bare "data" directory (which holds the default
        # spec) cannot be opened for writing.
        default="data/test_data.csv",
        help="Destination file path for the generated CSV.",
    )
    args = parser.parse_args()
    csv_path = generate_test_csv(
        spec=args.spec,
        add_dirt=args.add_dirt,
        nrow=args.nrow,
        dest=args.dest,
    )
    print(f"Test data CSV generated at: {csv_path}")
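# Frictionless Data Table Schema covering every field type the test data generator supports.
# `min`/`max` are range annotations read by the generator; the commented-out field types below are not generated yet.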
---
fields:
- name: id
type: md5
description: Unique identifier (md5 hash)
- name: integer_col
type: integer
description: Example integer value
min: 10
max: 1000
- name: number_col
type: number
description: Example number (float) value
min: 10
max: 1000
- name: boolean_col
type: boolean
description: Example boolean value
- name: date_col
type: date
description: Example date value (YYYY-MM-DD)
min: 2000-01-01
max: 2026-01-01
format: Y-m-d
- name: string_col
type: string
description: Example string value
max: 100
# - name: time_col
# type: time
# description: Example time value (HH:MM:SS)
# - name: datetime_col
# type: datetime
# description: Example datetime value (ISO 8601)
# - name: year_col
# type: year
# description: Example year value (e.g., 2024)
# - name: yearmonth_col
# type: yearmonth
# description: Example year and month value (YYYY-MM)
# - name: duration_col
# type: duration
# description: Example duration value (PnYnMnDTnHnMnS)
# - name: geopoint_col
# type: geopoint
# description: Example geopoint value (lat,lon)
# - name: geojson_col
# type: geojson
# description: Example GeoJSON object
# - name: object_col
# type: object
# description: Example object (JSON)
# - name: array_col
# type: array
# description: Example array (list of values)
primaryKey: id
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment