Created
September 13, 2023 06:19
-
-
Save mubbashar/adf2d373d73bf191706778f03757a972 to your computer and use it in GitHub Desktop.
This will determine the data type of each column in a CSV file.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import pandas as pd | |
| import re | |
| import os | |
def guess_column_types(file_path, delimiter=',', has_headers=True):
    """Guess a data type label for every column of a CSV file.

    Args:
        file_path: Path of the CSV file (or any other source accepted by
            ``pandas.read_csv``).
        delimiter: Field separator, forwarded to ``pandas.read_csv`` as ``sep``.
        has_headers: When true, the first row supplies the column names;
            otherwise pandas numbers the columns itself.

    Returns:
        ``(True, column_types)`` on success, where ``column_types`` maps each
        column label to ``'datetime64'``, ``'date'`` or the label returned by
        ``pandas.api.types.infer_dtype``. ``(False, message)`` when pandas
        raises ``ParserError`` while reading the file. Any other exception
        (for example a missing file) propagates to the caller.
    """
    # Only the read itself can raise ParserError, so keep the try body minimal.
    try:
        df = pd.read_csv(file_path, sep=delimiter, header=0 if has_headers else None)
    except pd.errors.ParserError as e:
        # The exception must be bound here; an unbound `e` would turn every
        # parse failure into a NameError.
        return (False, str(e))

    # Compiled once instead of once per cell. `match` anchors at the start
    # only, so values with trailing text (e.g. fractional seconds) still count.
    datetime_pattern = re.compile(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}')
    date_pattern = re.compile(r'\d{4}-\d{2}-\d{2}')

    column_types = {}
    for column in df.columns:
        # Ignore missing cells, mirroring skipna=True in the fallback below;
        # otherwise a single blank cell ('nan') disqualifies a date column.
        values = [str(value) for value in df[column].dropna()]

        # `values and ...` guards against all() being vacuously true for a
        # column without any data.
        if values and all(datetime_pattern.match(value) for value in values):
            # "YYYY-MM-DD HH:MM:SS"
            inferred_type = 'datetime64'
        elif values and all(date_pattern.match(value) for value in values):
            # "YYYY-MM-DD"
            inferred_type = 'date'
        else:
            inferred_type = pd.api.types.infer_dtype(df[column], skipna=True)
        column_types[column] = inferred_type

    return (True, column_types)
# Example usage: point csv_file_path at the CSV file you want to inspect.
csv_file_path = 'your_file.csv'
success, result = guess_column_types(csv_file_path)

# On failure `result` holds the error message; on success it is the mapping
# of column label -> guessed type.
if not success:
    print(f"Error: {result}")
else:
    print("Guessed column data types:")
    for column, dtype in result.items():
        print(f"{column}: {dtype}")
Author
This is just a starting point. You are welcome to share the improved version :)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The `e` on line 36 is not defined anywhere. Moreover, the script does not do a great job of detecting comma-separated numbers like 3,665 as integers.