star26bsd/unscrew.py

## unscrew.py
#!/usr/bin/env python3

# Before running this script
# - export the attendance XLS from the NDS system.
# - convert it to CSV format using LibreOffice or a similar tool, UTF-8 encoded, comma-separated.
#
# DO NOT remove or change data and DO NOT change its name!
#
# Hosted on https://gist.github.com/star26bsd/b4b07339238e63b5deb033989d67bf84

import csv
import argparse
import os

def transform_attendance_csv(input_file, output_file):
    """
    Transforms a human-readable attendance CSV with metadata to a machine-optimized format.

    Expected CSV format:
    - Row 7 contains "Datum" and the training dates starting from column 5
    - Student data rows have format: Number, Name, Firstname, Function, Age, attendance_data...
    - Attendance is marked with 'K', 'J', 'X', or combinations like 'K, J' (only empty cells represent non-attendance)

    Args:
        input_file (str): Path to the input CSV file
        output_file (str): Path to the output CSV file
    """
    # Read input from the CSV file
    print(f"📖 Reading input file: {input_file}")
    with open(input_file, 'r', encoding='utf-8') as f:
        reader = list(csv.reader(f))
    print(f"   Total rows read: {len(reader)}")

    # Extract class number from filename (last number before file extension)
    import re
    filename = os.path.basename(input_file)
    filename_without_ext = os.path.splitext(filename)[0]
    # Find all numbers in the filename and take the last one
    numbers = re.findall(r'\d+', filename_without_ext)
    class_number = numbers[-1] if numbers else ""

    print(f"\n🔢 Extracting class number from filename...")
    print(f"   Filename: {filename}")
    print(f"   Found numbers: {numbers}")
    print(f"   Class number: {class_number or 'NOT FOUND'}")

    # Extract Angebot number from the first row, second column
    angebot_number = ""
    print(f"\n📋 Checking for Angebot number...")
    if len(reader) > 0 and len(reader[0]) > 1 and reader[0][0] == "Angebot":
        angebot_number = reader[0][1].strip()
        print(f"   Angebot: {angebot_number}")
    else:
        print(f"   Angebot: NOT FOUND (not in expected location)")

    # Find the dates row (should be row with "Datum" in first column)
    dates = []
    dates_row_index = -1

    print(f"\n📅 Searching for dates row...")
    for i, row in enumerate(reader):
        if len(row) > 0 and row[0] == "Datum":
            # Extract dates starting from column 5 (index 4)
            dates = [cell.strip() for cell in row[4:] if cell.strip() and "2025" in str(cell)]
            dates_row_index = i
            print(f"   Found 'Datum' at row {dates_row_index + 1}")
            print(f"   Extracted {len(dates)} dates: {dates[:3]}{'...' if len(dates) > 3 else ''}")
            break

    if not dates:
        raise ValueError("Could not find dates row. Expected row with 'Datum' in first column.")

    # Generate output filename based on class number
    if class_number and output_file == 'output.csv':  # Only modify if using default filename
        output_file = f'output_{class_number}.csv'

    output_records = []

    # Process student rows starting after the dates row
    in_participants_section = False
    students_processed = 0
    students_skipped = 0

    print(f"\n👥 Processing student rows...")
    for row_index, student_row in enumerate(reader):
        if row_index <= dates_row_index:
            continue

        if len(student_row) < 6:  # Need at least 6 columns for student data
            continue

        # Check for section headers to track which section we're in
        row_content = ''.join(student_row[:3]).strip()

        # Check if we're entering the participants section
        # Handle both "Teilnehmer/in" and "Teilnehmer/-in" formats
        if "Teilnehmer" in row_content and ("/-in" in row_content or "/in" in row_content):
            in_participants_section = True
            print(f"   ➡️  Entering PARTICIPANTS section (row {row_index + 1})")
            continue

        # Check if we're entering the instructors section or any other section
        # Handle both "Leiter/in" and "Leiter/-in" formats
        is_leiter = "Leiter" in row_content and ("/-in" in row_content or "/in" in row_content)
        is_teilnehmer = "Teilnehmer" in row_content and ("/-in" in row_content or "/in" in row_content)

        if is_leiter or (row_content.endswith(':') and not is_teilnehmer):
            if in_participants_section:
                print(f"   ⬅️  Leaving participants section - entering '{row_content}' (row {row_index + 1})")
            in_participants_section = False
            continue

        # Skip if we're not in the participants section
        if not in_participants_section:
            continue

        # Skip section headers, empty rows, and the column header row
        if (not row_content or
            row_content.endswith(':') or
            not any(c.isalpha() for c in row_content) or
            (student_row[1] == "Name" and student_row[2] == "Vorname")):  # Skip header row
            students_skipped += 1
            continue

        # Extract student information
        # Format: Number, LastName, FirstName, Birthday, Age, attendance_data...
        last_name = student_row[1].strip()
        first_name = student_row[2].strip()
        birthday_raw = student_row[3].strip() if len(student_row) > 3 else ""
        attendance_data = student_row[5:]  # Attendance starts from column 6 (index 5)

        # Convert birthday from DD.MM.YYYY to YYYY-MM-DD format
        birthday_formatted = ""
        if birthday_raw and "." in birthday_raw:
            try:
                # Split DD.MM.YYYY and rearrange to YYYY-MM-DD
                day, month, year = birthday_raw.split(".")
                birthday_formatted = f"{year}-{month.zfill(2)}-{day.zfill(2)}"
            except ValueError:
                print(f"   ⚠️  Birthday parsing failed for '{birthday_raw}' on row {row_index + 1}")
                birthday_formatted = ""  # If parsing fails, leave empty

        # Skip if no name data
        if not last_name and not first_name:
            students_skipped += 1
            continue

        # Check attendance for each date
        attendance_count_for_student = 0
        for i, date in enumerate(dates):
            if i < len(attendance_data):
                attendance = attendance_data[i].strip()
                # Check if attendance contains 'K', 'J', or 'X' (only empty cells represent non-attendance)
                if attendance and ('K' in attendance or 'J' in attendance or 'X' in attendance):
                    output_records.append([class_number, date, f"{first_name} {last_name}", birthday_formatted])
                    attendance_count_for_student += 1

        students_processed += 1
        if attendance_count_for_student > 0:
            print(f"   ✓ {first_name} {last_name}: {attendance_count_for_student} attendance records")

        # Progress update every 10 students
        if students_processed % 10 == 0:
            print(f"   📊 Processed {students_processed} students so far...")

    # Write to the new CSV file
    with open(output_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['class_number', 'training_date', 'student_name', 'birthday'])
        writer.writerows(output_records)

    print(f"\n" + "="*60)
    print(f"📊 TRANSFORMATION SUMMARY")
    print(f"="*60)
    print(f"   Class number: {class_number}")
    print(f"   Training dates found: {len(dates)}")
    print(f"   Students processed: {students_processed}")
    print(f"   Students skipped: {students_skipped}")
    print(f"   Total attendance records: {len(output_records)}")
    print(f"   Output file: {output_file}")
    print(f"="*60)
    print(f"\nTransformation complete! {len(output_records)} attendance records written to {output_file}")

    # Return the actual output filename used
    return output_file

# Main execution
if __name__ == "__main__":
    # Set up command line argument parsing
    parser = argparse.ArgumentParser(description='Transform attendance CSV from human-readable to machine-optimized format.')
    parser.add_argument('-f', '--file',
                       required=True,
                       help='Input CSV file path (required)')
    parser.add_argument('-o', '--output',
                       default='output.csv',
                       help='Output CSV file path (default: output.csv, will be modified to include Angebot number)')

    args = parser.parse_args()

    input_filename = args.file
    output_filename = args.output

    try:
        actual_output_filename = transform_attendance_csv(input_filename, output_filename)
        print(f"Successfully transformed {input_filename} to {actual_output_filename}")
    except FileNotFoundError:
        print(f"Error: Could not find input file '{input_filename}'")
        print("Please ensure the CSV file exists and the path is correct.")
        print("Available CSV files in current directory:")
        # Try to find available CSV files
        csv_files = [f for f in os.listdir('.') if f.endswith('.csv')]
        for csv_file in csv_files:
            print(f"  - {csv_file}")
    except Exception as e:
        print(f"An error occurred: {e}")
	#!/usr/bin/env python3

	# Before running this script
	# - export the attendance XLS from the NDS system.
	# - convert it to CSV format using LibreOffice or a similar tool, UTF-8 encoded, comma-separated.
	#
	# DO NOT remove or change data and DO NOT change its name!
	#
	# Hosted on https://gist.github.com/star26bsd/b4b07339238e63b5deb033989d67bf84

	import csv
	import argparse
	import os

	def transform_attendance_csv(input_file, output_file):
	"""
	Transforms a human-readable attendance CSV with metadata to a machine-optimized format.

	Expected CSV format:
	- Row 7 contains "Datum" and the training dates starting from column 5
	- Student data rows have format: Number, Name, Firstname, Function, Age, attendance_data...
	- Attendance is marked with 'K', 'J', 'X', or combinations like 'K, J' (only empty cells represent non-attendance)

	Args:
	input_file (str): Path to the input CSV file
	output_file (str): Path to the output CSV file
	"""
	# Read input from the CSV file
	print(f"📖 Reading input file: {input_file}")
	with open(input_file, 'r', encoding='utf-8') as f:
	reader = list(csv.reader(f))
	print(f" Total rows read: {len(reader)}")

	# Extract class number from filename (last number before file extension)
	import re
	filename = os.path.basename(input_file)
	filename_without_ext = os.path.splitext(filename)[0]
	# Find all numbers in the filename and take the last one
	numbers = re.findall(r'\d+', filename_without_ext)
	class_number = numbers[-1] if numbers else ""

	print(f"\n🔢 Extracting class number from filename...")
	print(f" Filename: {filename}")
	print(f" Found numbers: {numbers}")
	print(f" Class number: {class_number or 'NOT FOUND'}")

	# Extract Angebot number from the first row, second column
	angebot_number = ""
	print(f"\n📋 Checking for Angebot number...")
	if len(reader) > 0 and len(reader[0]) > 1 and reader[0][0] == "Angebot":
	angebot_number = reader[0][1].strip()
	print(f" Angebot: {angebot_number}")
	else:
	print(f" Angebot: NOT FOUND (not in expected location)")

	# Find the dates row (should be row with "Datum" in first column)
	dates = []
	dates_row_index = -1

	print(f"\n📅 Searching for dates row...")
	for i, row in enumerate(reader):
	if len(row) > 0 and row[0] == "Datum":
	# Extract dates starting from column 5 (index 4)
	dates = [cell.strip() for cell in row[4:] if cell.strip() and "2025" in str(cell)]
	dates_row_index = i
	print(f" Found 'Datum' at row {dates_row_index + 1}")
	print(f" Extracted {len(dates)} dates: {dates[:3]}{'...' if len(dates) > 3 else ''}")
	break

	if not dates:
	raise ValueError("Could not find dates row. Expected row with 'Datum' in first column.")

	# Generate output filename based on class number
	if class_number and output_file == 'output.csv': # Only modify if using default filename
	output_file = f'output_{class_number}.csv'

	output_records = []

	# Process student rows starting after the dates row
	in_participants_section = False
	students_processed = 0
	students_skipped = 0

	print(f"\n👥 Processing student rows...")
	for row_index, student_row in enumerate(reader):
	if row_index <= dates_row_index:
	continue

	if len(student_row) < 6: # Need at least 6 columns for student data
	continue

	# Check for section headers to track which section we're in
	row_content = ''.join(student_row[:3]).strip()

	# Check if we're entering the participants section
	# Handle both "Teilnehmer/in" and "Teilnehmer/-in" formats
	if "Teilnehmer" in row_content and ("/-in" in row_content or "/in" in row_content):
	in_participants_section = True
	print(f" ➡️ Entering PARTICIPANTS section (row {row_index + 1})")
	continue

	# Check if we're entering the instructors section or any other section
	# Handle both "Leiter/in" and "Leiter/-in" formats
	is_leiter = "Leiter" in row_content and ("/-in" in row_content or "/in" in row_content)
	is_teilnehmer = "Teilnehmer" in row_content and ("/-in" in row_content or "/in" in row_content)

	if is_leiter or (row_content.endswith(':') and not is_teilnehmer):
	if in_participants_section:
	print(f" ⬅️ Leaving participants section - entering '{row_content}' (row {row_index + 1})")
	in_participants_section = False
	continue

	# Skip if we're not in the participants section
	if not in_participants_section:
	continue

	# Skip section headers, empty rows, and the column header row
	if (not row_content or
	row_content.endswith(':') or
	not any(c.isalpha() for c in row_content) or
	(student_row[1] == "Name" and student_row[2] == "Vorname")): # Skip header row
	students_skipped += 1
	continue

	# Extract student information
	# Format: Number, LastName, FirstName, Birthday, Age, attendance_data...
	last_name = student_row[1].strip()
	first_name = student_row[2].strip()
	birthday_raw = student_row[3].strip() if len(student_row) > 3 else ""
	attendance_data = student_row[5:] # Attendance starts from column 6 (index 5)

	# Convert birthday from DD.MM.YYYY to YYYY-MM-DD format
	birthday_formatted = ""
	if birthday_raw and "." in birthday_raw:
	try:
	# Split DD.MM.YYYY and rearrange to YYYY-MM-DD
	day, month, year = birthday_raw.split(".")
	birthday_formatted = f"{year}-{month.zfill(2)}-{day.zfill(2)}"
	except ValueError:
	print(f" ⚠️ Birthday parsing failed for '{birthday_raw}' on row {row_index + 1}")
	birthday_formatted = "" # If parsing fails, leave empty

	# Skip if no name data
	if not last_name and not first_name:
	students_skipped += 1
	continue

	# Check attendance for each date
	attendance_count_for_student = 0
	for i, date in enumerate(dates):
	if i < len(attendance_data):
	attendance = attendance_data[i].strip()
	# Check if attendance contains 'K', 'J', or 'X' (only empty cells represent non-attendance)
	if attendance and ('K' in attendance or 'J' in attendance or 'X' in attendance):
	output_records.append([class_number, date, f"{first_name} {last_name}", birthday_formatted])
	attendance_count_for_student += 1

	students_processed += 1
	if attendance_count_for_student > 0:
	print(f" ✓ {first_name} {last_name}: {attendance_count_for_student} attendance records")

	# Progress update every 10 students
	if students_processed % 10 == 0:
	print(f" 📊 Processed {students_processed} students so far...")

	# Write to the new CSV file
	with open(output_file, 'w', newline='', encoding='utf-8') as f:
	writer = csv.writer(f)
	writer.writerow(['class_number', 'training_date', 'student_name', 'birthday'])
	writer.writerows(output_records)

	print(f"\n" + "="*60)
	print(f"📊 TRANSFORMATION SUMMARY")
	print(f"="*60)
	print(f" Class number: {class_number}")
	print(f" Training dates found: {len(dates)}")
	print(f" Students processed: {students_processed}")
	print(f" Students skipped: {students_skipped}")
	print(f" Total attendance records: {len(output_records)}")
	print(f" Output file: {output_file}")
	print(f"="*60)
	print(f"\nTransformation complete! {len(output_records)} attendance records written to {output_file}")

	# Return the actual output filename used
	return output_file

	# Main execution
	if __name__ == "__main__":
	# Set up command line argument parsing
	parser = argparse.ArgumentParser(description='Transform attendance CSV from human-readable to machine-optimized format.')
	parser.add_argument('-f', '--file',
	required=True,
	help='Input CSV file path (required)')
	parser.add_argument('-o', '--output',
	default='output.csv',
	help='Output CSV file path (default: output.csv, will be modified to include Angebot number)')

	args = parser.parse_args()

	input_filename = args.file
	output_filename = args.output

	try:
	actual_output_filename = transform_attendance_csv(input_filename, output_filename)
	print(f"Successfully transformed {input_filename} to {actual_output_filename}")
	except FileNotFoundError:
	print(f"Error: Could not find input file '{input_filename}'")
	print("Please ensure the CSV file exists and the path is correct.")
	print("Available CSV files in current directory:")
	# Try to find available CSV files
	csv_files = [f for f in os.listdir('.') if f.endswith('.csv')]
	for csv_file in csv_files:
	print(f" - {csv_file}")
	except Exception as e:
	print(f"An error occurred: {e}")
No results found