Skip to content

Instantly share code, notes, and snippets.

@cmpadden
Last active August 11, 2024 21:57
Show Gist options
  • Select an option

  • Save cmpadden/becfcdac9198cbd39b0918675edbd738 to your computer and use it in GitHub Desktop.

Select an option

Save cmpadden/becfcdac9198cbd39b0918675edbd738 to your computer and use it in GitHub Desktop.
DuckDB Yellow Taxi Benchmark
-- Computes the NYC zones that have the most (relative) yellow cab disputes
--
-- It is expected that the NYC yellow trip data set has been downloaded to `./data/nyc_yellow_tripdata`
--
-- https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page
--
-- USAGE
-- $ du -sh data/nyc_yellow_tripdata/
-- 1.1G data/nyc_yellow_tripdata/ $ duckdb < nyc-yellow-taxi-duckdb.sql
--
create table trips as (
select
*
from './data/nyc_yellow_tripdata/*.parquet'
);
create table taxi_zone_lookup as (
select
*
from './data/nyc_yellow_tripdata/taxi_zone_lookup.csv'
);
-- why is this resulting in the counts being all the same?
with trips_with_zone as (
select
Borough as dropoff_borough,
Zone as dropoff_zone,
payment_type
from trips
join taxi_zone_lookup on DOLocationID = LocationID
where dropoff_borough not in ('Unknown', 'N/A')
)
select
dropoff_borough,
dropoff_zone,
count(*) as num_rides,
sum(case when payment_type = 4 then 1 else 0 end) as num_disputed_rides,
round(num_disputed_rides / num_rides, 6) as pct_disputed
from trips_with_zone
group by all
having num_disputed_rides > 0
order by pct_disputed desc
limit 25;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment