Created
September 10, 2025 09:37
-
-
Save florianm/565d462221ac9b8a7305cfa4f6347d33 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """Utilities to generate test data.""" | |
| import argparse | |
| import csv | |
| import hashlib | |
| import random | |
| from datetime import datetime, timedelta | |
| from pathlib import Path | |
| import yaml | |
def random_string(max_len):
    """Build a random string of ASCII letters and spaces.

    The length is drawn uniformly from 1 to ``max_len`` (both inclusive);
    every character is then drawn independently, with replacement, from the
    lower-case letters, the upper-case letters and the space character.
    """
    alphabet = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ "
    n_chars = random.randint(1, max_len)
    picked = random.choices(alphabet, k=n_chars)
    return "".join(picked)
def random_integer(min_val, max_val):
    """Draw a random integer from ``min_val`` to ``max_val``, both inclusive."""
    # randint(a, b) is defined as randrange(a, b + 1); spelled out here so the
    # inclusive upper bound is explicit.
    upper_exclusive = max_val + 1
    return random.randrange(min_val, upper_exclusive)
def random_number(min_val, max_val):
    """Return a random number with at most one decimal place.

    The bounds are scaled by ten, an integer is drawn between them, and the
    result is divided by ten again.

    Args:
        min_val (int | float): Lower bound, inclusive.
        max_val (int | float): Upper bound, inclusive.

    Returns:
        float: A value between ``min_val`` and ``max_val``.

    Raises:
        ValueError: If no tenth lies between the two bounds
            (raised by ``random.randint`` for an empty range).
    """
    import math

    # random.randint() requires integer arguments (floats raise TypeError on
    # Python 3.12+), so float bounds are snapped inwards to whole tenths.
    # For integer bounds this is a no-op.
    lowest_tenth = math.ceil(10 * min_val)
    highest_tenth = math.floor(10 * max_val)
    return random.randint(lowest_tenth, highest_tenth) / 10
def random_date(min_date, max_date, date_format):
    """Pick a random day between two dates and return it as text.

    ``min_date`` and ``max_date`` are converted with ``str()`` and parsed as
    ``YYYY-MM-DD``; both ends are inclusive. A ``date_format`` of ``"Y-m-d"``
    is rendered as ``%Y-%m-%d``; any other value is passed to ``strftime``
    unchanged.
    """
    iso_layout = "%Y-%m-%d"
    start = datetime.strptime(str(min_date), iso_layout)
    end = datetime.strptime(str(max_date), iso_layout)
    offset_days = random.randint(0, (end - start).days)
    chosen = start + timedelta(days=offset_days)
    if date_format == "Y-m-d":
        return chosen.strftime(iso_layout)
    return chosen.strftime(date_format)
def random_md5():
    """Produce the hexadecimal MD5 digest of a freshly drawn random float.

    The float from ``random.random()`` is converted to text, encoded and
    hashed; the 32-character hex digest is returned.
    """
    seed_text = str(random.random())
    digest = hashlib.md5(seed_text.encode())
    return digest.hexdigest()
def random_bool():
    """Pick ``True`` or ``False`` at random."""
    outcomes = [True, False]
    return random.choice(outcomes)
def dirty_value(colspec):
    """Generate a value that intentionally fails the schema.

    Args:
        colspec (dict): A field specification. ``type`` is required; for
            string fields ``max`` (default 10) is read as the maximum length.

    Returns:
        str: A fixed non-parsable placeholder for ``integer``, ``date`` and
        ``md5`` fields, an over-long random string for ``string`` fields,
        and ``"DIRTY"`` for every other type.
    """
    import string

    placeholders = {
        "integer": "not_an_int",
        "date": "not_a_date",
        "md5": "not_a_md5",
    }
    field_type = colspec["type"]
    if field_type == "string":
        # The string must be strictly longer than the allowed maximum,
        # otherwise it would not be "dirty". Exceed it by 1 to 100 characters.
        overflow = random.randint(1, 100)
        return "".join(
            random.choices(
                string.ascii_letters + " ",
                k=colspec.get("max", 10) + overflow,
            ),
        )
    return placeholders.get(field_type, "DIRTY")
def _clean_value(colspec):
    """Return a random value for one field, within its annotated range."""
    field_type = colspec["type"]
    if field_type == "md5":
        return random_md5()
    if field_type == "string":
        return random_string(colspec.get("max", 100))
    if field_type == "integer":
        return random_integer(colspec.get("min", 0), colspec.get("max", 1000))
    if field_type == "number":
        return random_number(colspec.get("min", 0), colspec.get("max", 1000))
    if field_type == "date":
        return random_date(
            colspec.get("min", "2000-01-01"),
            colspec.get("max", "2025-01-01"),
            colspec.get("format", "Y-m-d"),
        )
    if field_type == "boolean":
        return random_bool()
    # Types without a generator produce an empty cell.
    return ""


def generate_test_csv(spec, add_dirt=False, nrow=1000, dest="test_data.csv"):
    """
    Generate a CSV of test data.

    This function generates a CSV file with test data containing
    the specified number of rows and columns as per specification.
    The first row holds the field names from the schema.

    Args:
        spec (path): A path to a YAML file with a frictionless data schema.
            Each entry under ``fields`` needs ``name`` and ``type``;
            ``min``, ``max`` and ``format`` are optional.
        add_dirt (bool): Whether to add values failing the schema. When set,
            each cell is replaced by a dirty value with probability 0.05.
            Default: False
        nrow (int): The number of rows to generate. Default: 1000
        dest (path): A path to write the resulting CSV to. Missing parent
            directories are created.

    Returns:
        The ``dest`` argument, unchanged.
    """
    # Read the frictionless data YAML schema; an empty file yields no fields.
    with Path(spec).open(encoding="utf-8") as yml_file:
        schema = yaml.safe_load(yml_file) or {}

    # Extract fields from the schema
    fields = schema.get("fields", [])
    columns = [field["name"] for field in fields]

    # Path.parent is a property and mkdir/open are instance methods, so the
    # destination has to be wrapped in a Path before they are used.
    dest_path = Path(dest)
    dest_path.parent.mkdir(parents=True, exist_ok=True)

    with dest_path.open("w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(columns)
        for _ in range(nrow):
            writer.writerow(
                [
                    dirty_value(field)
                    if add_dirt and random.random() < 0.05
                    else _clean_value(field)
                    for field in fields
                ],
            )
    return dest
def build_arg_parser():
    """Build the command-line parser for the test data generator.

    Returns:
        argparse.ArgumentParser: Parser with the ``--spec``, ``--add-dirt``,
        ``--nrow`` and ``--dest`` options.
    """
    parser = argparse.ArgumentParser(
        description="Generate test CSV data based on a frictionless data schema.",
    )
    parser.add_argument(
        "--spec",
        type=str,
        default="data/test_data.yml",
        help="Path to the frictionless data schema YAML file, annotated with ranges.",
    )
    parser.add_argument(
        "--add-dirt",
        action="store_true",
        help="Add dirty (invalid) values to the data.",
    )
    parser.add_argument(
        "--nrow",
        type=int,
        default=1000,
        help="Number of rows to generate.",
    )
    parser.add_argument(
        "--dest",
        type=str,
        # Must be a file path: the previous default "data" is the directory
        # holding the default spec, so the CSV could not be written there.
        default="data/test_data.csv",
        help="Destination file path for the generated CSV.",
    )
    return parser


def main(argv=None):
    """Parse the command line, generate the CSV and print where it was written.

    Args:
        argv (list[str] | None): Arguments to parse; ``None`` makes argparse
            read ``sys.argv``.
    """
    args = build_arg_parser().parse_args(argv)
    csv_path = generate_test_csv(
        spec=args.spec,
        add_dirt=args.add_dirt,
        nrow=args.nrow,
        dest=args.dest,
    )
    print(f"Test data CSV generated at: {csv_path}")


if __name__ == "__main__":
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
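| # Frictionless Data Table Schema covering the field types the test data generator supports, annotated with min/max ranges (types not yet supported are commented out below) | |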
| --- | |
| fields: | |
| - name: id | |
| type: md5 | |
| description: Unique identifier (md5 hash) | |
| - name: integer_col | |
| type: integer | |
| description: Example integer value | |
| min: 10 | |
| max: 1000 | |
| - name: number_col | |
| type: number | |
| description: Example number (float) value | |
| min: 10 | |
| max: 1000 | |
| - name: boolean_col | |
| type: boolean | |
| description: Example boolean value | |
| - name: date_col | |
| type: date | |
| description: Example date value (YYYY-MM-DD) | |
| min: 2000-01-01 | |
| max: 2026-01-01 | |
| format: Y-m-d | |
| - name: string_col | |
| type: string | |
| description: Example string value | |
| max: 100 | |
| # - name: time_col | |
| # type: time | |
| # description: Example time value (HH:MM:SS) | |
| # - name: datetime_col | |
| # type: datetime | |
| # description: Example datetime value (ISO 8601) | |
| # - name: year_col | |
| # type: year | |
| # description: Example year value (e.g., 2024) | |
| # - name: yearmonth_col | |
| # type: yearmonth | |
| # description: Example year and month value (YYYY-MM) | |
| # - name: duration_col | |
| # type: duration | |
| # description: Example duration value (PnYnMnDTnHnMnS) | |
| # - name: geopoint_col | |
| # type: geopoint | |
| # description: Example geopoint value (lat,lon) | |
| # - name: geojson_col | |
| # type: geojson | |
| # description: Example GeoJSON object | |
| # - name: object_col | |
| # type: object | |
| # description: Example object (JSON) | |
| # - name: array_col | |
| # type: array | |
| # description: Example array (list of values) | |
| primaryKey: id |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment