mubbashar/judge_csv_column_type.py

## judge_csv_column_type.py
import pandas as pd
import re
import os


def guess_column_types(file_path, delimiter=',', has_headers=True):
    """Guess a coarse data type for every column of a CSV file.

    Each column is labelled ``'datetime64'`` when all of its non-missing
    values start with ``YYYY-MM-DD HH:MM:SS``, ``'date'`` when they all start
    with ``YYYY-MM-DD``, and otherwise with whatever
    ``pandas.api.types.infer_dtype`` reports for the column.

    Args:
        file_path: Path or file-like object handed straight to
            ``pandas.read_csv``.
        delimiter: Field separator passed to ``read_csv`` as ``sep``.
        has_headers: When true the first row supplies the column names;
            otherwise pandas numbers the columns.

    Returns:
        ``(True, {column: type_label})`` on success, or
        ``(False, error_message)`` when pandas raises ``ParserError``.
        Other exceptions (for example a missing file) propagate unchanged.
    """
    # Patterns are anchored at the start only (re.match semantics), so a value
    # with trailing characters such as fractional seconds still qualifies.
    datetime_pattern = re.compile(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}')
    date_pattern = re.compile(r'\d{4}-\d{2}-\d{2}')

    try:
        df = pd.read_csv(file_path, sep=delimiter, header=0 if has_headers else None)
    except pd.errors.ParserError as exc:
        # Bind the exception so its message can actually be returned.
        return (False, str(exc))

    column_types = {}
    for column in df.columns:
        # Missing cells are ignored here, mirroring skipna=True in the
        # fallback; otherwise str(nan) == 'nan' would veto a date column.
        present = df[column].dropna()
        # all() over nothing is True, so an empty column must be excluded
        # explicitly or it would be reported as 'datetime64'.
        has_values = not present.empty

        if has_values and all(datetime_pattern.match(str(value)) for value in present):
            inferred_type = 'datetime64'
        elif has_values and all(date_pattern.match(str(value)) for value in present):
            inferred_type = 'date'
        else:
            inferred_type = pd.api.types.infer_dtype(df[column], skipna=True)

        column_types[column] = inferred_type

    return (True, column_types)


# Placeholder path: point this at a real CSV file before running the script.
csv_file_path = 'your_file.csv'

success, result = guess_column_types(csv_file_path)
if not success:
    # On failure `result` holds the error text rather than the type mapping.
    print(f"Error: {result}")
else:
    print("Guessed column data types:")
    for column in result:
        dtype = result[column]
        print(f"{column}: {dtype}")
	import pandas as pd
	import re
	import os


	def guess_column_types(file_path, delimiter=',', has_headers=True):
		"""Guess a coarse data type for every column of a CSV file.

		Each column is labelled ``'datetime64'`` when all of its non-missing
		values start with ``YYYY-MM-DD HH:MM:SS``, ``'date'`` when they all
		start with ``YYYY-MM-DD``, and otherwise with whatever
		``pandas.api.types.infer_dtype`` reports for the column.

		Args:
			file_path: Path or file-like object handed straight to
				``pandas.read_csv``.
			delimiter: Field separator passed to ``read_csv`` as ``sep``.
			has_headers: When true the first row supplies the column names;
				otherwise pandas numbers the columns.

		Returns:
			``(True, {column: type_label})`` on success, or
			``(False, error_message)`` when pandas raises ``ParserError``.
			Other exceptions (for example a missing file) propagate unchanged.
		"""
		# Patterns are anchored at the start only (re.match semantics), so a
		# value with trailing characters still qualifies.
		datetime_pattern = re.compile(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}')
		date_pattern = re.compile(r'\d{4}-\d{2}-\d{2}')

		try:
			df = pd.read_csv(file_path, sep=delimiter, header=0 if has_headers else None)
		except pd.errors.ParserError as exc:
			# Bind the exception so its message can actually be returned.
			return (False, str(exc))

		column_types = {}
		for column in df.columns:
			# Missing cells are ignored here, mirroring skipna=True in the
			# fallback; otherwise str(nan) == 'nan' would veto a date column.
			present = df[column].dropna()
			# all() over nothing is True, so an empty column must be excluded
			# explicitly or it would be reported as 'datetime64'.
			has_values = not present.empty

			if has_values and all(datetime_pattern.match(str(value)) for value in present):
				inferred_type = 'datetime64'
			elif has_values and all(date_pattern.match(str(value)) for value in present):
				inferred_type = 'date'
			else:
				inferred_type = pd.api.types.infer_dtype(df[column], skipna=True)

			column_types[column] = inferred_type

		return (True, column_types)


	# Replace 'your_file.csv' with the actual path to your CSV file
	csv_file_path = 'your_file.csv'

	# guess_column_types returns (True, mapping) or (False, error_message).
	success, result = guess_column_types(csv_file_path)
	if success:
		print("Guessed column data types:")
		for column, dtype in result.items():
			print(f"{column}: {dtype}")
	else:
		print(f"Error: {result}")
No results found