Skip to content

Instantly share code, notes, and snippets.

@ChuckJonas
Created December 29, 2025 22:49
Show Gist options
  • Select an option

  • Save ChuckJonas/4ffb0743de0c20a61394a6b956f6a8d0 to your computer and use it in GitHub Desktop.

Select an option

Save ChuckJonas/4ffb0743de0c20a61394a6b956f6a8d0 to your computer and use it in GitHub Desktop.
-- duckdb version of https://datapythonista.me/blog/pandas-with-hundreds-of-millions-of-rows
-- Top 5 origin airports by mean departure delay (hours), 1987–2008
-- Works with globbed year files even if other columns drift types across years.
SET threads = 8;
SET enable_progress_bar = true;
WITH raw AS (
SELECT
Origin,
CRSDepTime,
DepTime
FROM read_csv(
'./????.csv',
header = true,
union_by_name = true, -- helpful if some years have slightly different columns
all_varchar = true, -- prevents schema mismatch across files
delim = ',',
quote = '"',
escape = '"'
)
),
flights AS (
SELECT
Origin,
TRY_CAST(CRSDepTime AS INTEGER) AS CRSDepTime,
TRY_CAST(DepTime AS INTEGER) AS DepTime
FROM raw
WHERE
Origin IS NOT NULL
AND CRSDepTime IS NOT NULL
AND DepTime IS NOT NULL
AND TRY_CAST(CRSDepTime AS INTEGER) IS NOT NULL
AND TRY_CAST(DepTime AS INTEGER) IS NOT NULL
-- Validate HHMM
AND TRY_CAST(CRSDepTime AS INTEGER) BETWEEN 0 AND 2359
AND TRY_CAST(DepTime AS INTEGER) BETWEEN 0 AND 2359
AND (TRY_CAST(CRSDepTime AS INTEGER) % 100) < 60
AND (TRY_CAST(DepTime AS INTEGER) % 100) < 60
),
delays AS (
SELECT
Origin,
(
(
(CAST(FLOOR(DepTime / 100) AS INTEGER) * 60 + (DepTime % 100))
- (CAST(FLOOR(CRSDepTime / 100) AS INTEGER) * 60 + (CRSDepTime % 100))
) / 60.0
) AS delay_hours_raw
FROM flights
)
SELECT
Origin,
AVG(
CASE
-- keep the blog’s “overnight fix” behavior
WHEN delay_hours_raw > -2 THEN delay_hours_raw
ELSE 24 - delay_hours_raw
END
) AS mean_delay_hours,
COUNT(*) AS n_flights
FROM delays
GROUP BY Origin
ORDER BY mean_delay_hours DESC
LIMIT 5;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment