Created
December 29, 2025 22:49
-
-
Save ChuckJonas/4ffb0743de0c20a61394a6b956f6a8d0 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| -- duckdb version of https://datapythonista.me/blog/pandas-with-hundreds-of-millions-of-rows | |
| -- Top 5 origin airports by mean departure delay (hours), 1987–2008 | |
| -- Works with globbed year files even if other columns drift types across years. | |
| SET threads = 8; | |
| SET enable_progress_bar = true; | |
| WITH raw AS ( | |
| SELECT | |
| Origin, | |
| CRSDepTime, | |
| DepTime | |
| FROM read_csv( | |
| './????.csv', | |
| header = true, | |
| union_by_name = true, -- helpful if some years have slightly different columns | |
| all_varchar = true, -- prevents schema mismatch across files | |
| delim = ',', | |
| quote = '"', | |
| escape = '"' | |
| ) | |
| ), | |
| flights AS ( | |
| SELECT | |
| Origin, | |
| TRY_CAST(CRSDepTime AS INTEGER) AS CRSDepTime, | |
| TRY_CAST(DepTime AS INTEGER) AS DepTime | |
| FROM raw | |
| WHERE | |
| Origin IS NOT NULL | |
| AND CRSDepTime IS NOT NULL | |
| AND DepTime IS NOT NULL | |
| AND TRY_CAST(CRSDepTime AS INTEGER) IS NOT NULL | |
| AND TRY_CAST(DepTime AS INTEGER) IS NOT NULL | |
| -- Validate HHMM | |
| AND TRY_CAST(CRSDepTime AS INTEGER) BETWEEN 0 AND 2359 | |
| AND TRY_CAST(DepTime AS INTEGER) BETWEEN 0 AND 2359 | |
| AND (TRY_CAST(CRSDepTime AS INTEGER) % 100) < 60 | |
| AND (TRY_CAST(DepTime AS INTEGER) % 100) < 60 | |
| ), | |
| delays AS ( | |
| SELECT | |
| Origin, | |
| ( | |
| ( | |
| (CAST(FLOOR(DepTime / 100) AS INTEGER) * 60 + (DepTime % 100)) | |
| - (CAST(FLOOR(CRSDepTime / 100) AS INTEGER) * 60 + (CRSDepTime % 100)) | |
| ) / 60.0 | |
| ) AS delay_hours_raw | |
| FROM flights | |
| ) | |
| SELECT | |
| Origin, | |
| AVG( | |
| CASE | |
| -- keep the blog’s “overnight fix” behavior | |
| WHEN delay_hours_raw > -2 THEN delay_hours_raw | |
| ELSE 24 - delay_hours_raw | |
| END | |
| ) AS mean_delay_hours, | |
| COUNT(*) AS n_flights | |
| FROM delays | |
| GROUP BY Origin | |
| ORDER BY mean_delay_hours DESC | |
| LIMIT 5; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment