This file contains the SQL for the March 2019 Hashmap + Snowflake Data for Breakfast workshop demonstrations.
// ===========================================================================
// Welcome!
//
// Log in to your Snowflake instance, create a new worksheet, then
// copy and paste the contents of this file into your worksheet.
// ===========================================================================

// ===========================================================================
// PART 1 - GETTING COMFORTABLE IN SNOWFLAKE
//
// In this first part, we'll get to know some basic parts of Snowflake
// and play with some demo data.
// ===========================================================================

// Start by changing context so that you're the ACCOUNTADMIN
USE ROLE ACCOUNTADMIN;

// Create a resource monitor so we don't spend too much
CREATE RESOURCE MONITOR IF NOT EXISTS
  SNOWFLAKE_DEMO_MONITOR
WITH
  CREDIT_QUOTA = 200
  TRIGGERS
    ON 50 PERCENT DO NOTIFY
    ON 90 PERCENT DO SUSPEND
    ON 100 PERCENT DO SUSPEND_IMMEDIATE;

ALTER ACCOUNT SET RESOURCE_MONITOR = SNOWFLAKE_DEMO_MONITOR;
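// (Optional) Sanity-check the monitor: SHOW RESOURCE MONITORS reports the
// quota, credits used, and trigger thresholds. A minimal check, assuming the
// ACCOUNTADMIN role from above.
SHOW RESOURCE MONITORS LIKE 'SNOWFLAKE_DEMO_MONITOR';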
// Create a warehouse
CREATE WAREHOUSE IF NOT EXISTS
  SNOWFLAKE_DEMO_WAREHOUSE
WITH
  WAREHOUSE_SIZE = XSMALL
  WAREHOUSE_TYPE = STANDARD
  AUTO_SUSPEND = 60
  AUTO_RESUME = TRUE
  INITIALLY_SUSPENDED = TRUE;
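// (Optional) Confirm the warehouse exists and starts out suspended;
// SHOW WAREHOUSES reports state, size, and the suspend/resume settings.
SHOW WAREHOUSES LIKE 'SNOWFLAKE_DEMO_WAREHOUSE';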
// ===========================================================================
// PART 1a - Setting the worksheet context
//
// Run each query and notice the context change in the upper right corner of
// the workbook.
//
// Docs: https://docs.snowflake.net/manuals/user-guide/snowflake-manager.html#worksheet-page
// ===========================================================================

// Set the warehouse for the current session
USE WAREHOUSE SNOWFLAKE_DEMO_WAREHOUSE;

// Set the database and schema for the current session
USE SNOWFLAKE_SAMPLE_DATA.TPCH_SF100;
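// (Optional) Verify the session context programmatically (note that running
// any query will auto-resume the suspended warehouse):
SELECT CURRENT_ROLE(), CURRENT_WAREHOUSE(), CURRENT_DATABASE(), CURRENT_SCHEMA();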
// ===========================================================================
// PART 1b - Evaluating a query
//
// Run the query below and take a look at the metrics, time taken, and
// query profile.
//
// Docs: https://docs.snowflake.net/manuals/user-guide/ui-query-profile.html
// ===========================================================================

// Pricing Summary Report Query
// This query reports the amount of business that was billed, shipped, and returned.
SELECT
  l_returnflag,
  l_linestatus,
  SUM(l_quantity) AS sum_qty,
  SUM(l_extendedprice) AS sum_base_price,
  SUM(l_extendedprice * (1 - l_discount)) AS sum_disc_price,
  SUM(l_extendedprice * (1 - l_discount) * (1 + l_tax)) AS sum_charge,
  AVG(l_quantity) AS avg_qty,
  AVG(l_extendedprice) AS avg_price,
  AVG(l_discount) AS avg_disc,
  COUNT(*) AS count_order
FROM
  SNOWFLAKE_SAMPLE_DATA.TPCH_SF100.LINEITEM
WHERE
  l_receiptdate <= DATEADD(DAY, -90, TO_DATE('1998-12-01'))
GROUP BY
  l_returnflag,
  l_linestatus
ORDER BY
  l_returnflag,
  l_linestatus;
// ===========================================================================
// PART 1c - Demonstrating the Persisted Query Results feature
//
// Query results are persisted for 24 hours. This part demonstrates how the
// feature works and how it can be used to avoid recomputing queries.
//
// Docs: https://docs.snowflake.net/manuals/user-guide/querying-persisted-results.html
// ===========================================================================
// Stop the warehouse
ALTER WAREHOUSE "SNOWFLAKE_DEMO_WAREHOUSE" SUSPEND;

// Run the query again and notice that the warehouse is still stopped.
// Also notice the query profile and the time it took to run.
SELECT
  l_returnflag,
  l_linestatus,
  SUM(l_quantity) AS sum_qty,
  SUM(l_extendedprice) AS sum_base_price,
  SUM(l_extendedprice * (1 - l_discount)) AS sum_disc_price,
  SUM(l_extendedprice * (1 - l_discount) * (1 + l_tax)) AS sum_charge,
  AVG(l_quantity) AS avg_qty,
  AVG(l_extendedprice) AS avg_price,
  AVG(l_discount) AS avg_disc,
  COUNT(*) AS count_order
FROM
  SNOWFLAKE_SAMPLE_DATA.TPCH_SF100.LINEITEM
WHERE
  l_receiptdate <= DATEADD(DAY, -90, TO_DATE('1998-12-01'))
GROUP BY
  l_returnflag,
  l_linestatus
ORDER BY
  l_returnflag,
  l_linestatus;
// Go to the Warehouses tab and notice that the warehouse did not resume,
// then open the query profile to see that the result came from QUERY RESULT REUSE.
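// (Optional) The persisted result can also be queried directly with
// RESULT_SCAN. A minimal sketch; note that RESULT_SCAN itself runs a query,
// which may resume the warehouse.
SELECT * FROM TABLE(RESULT_SCAN(LAST_QUERY_ID()));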
// ===========================================================================
// PART 1d - Vertically scaling a warehouse
//
// In this part we will vertically scale a warehouse (increase its t-shirt
// size) to show the increase in performance.
//
// Docs: https://docs.snowflake.net/manuals/user-guide/warehouses-tasks.html#resizing-a-warehouse
// ===========================================================================

// Increase the size of the warehouse
ALTER WAREHOUSE "SNOWFLAKE_DEMO_WAREHOUSE" SET WAREHOUSE_SIZE = 'LARGE';
// Make sure you don't get the cached result
ALTER SESSION SET USE_CACHED_RESULT = FALSE;

// Run the same query again and notice that it runs faster
SELECT
  l_returnflag,
  l_linestatus,
  SUM(l_quantity) AS sum_qty,
  SUM(l_extendedprice) AS sum_base_price,
  SUM(l_extendedprice * (1 - l_discount)) AS sum_disc_price,
  SUM(l_extendedprice * (1 - l_discount) * (1 + l_tax)) AS sum_charge,
  AVG(l_quantity) AS avg_qty,
  AVG(l_extendedprice) AS avg_price,
  AVG(l_discount) AS avg_disc,
  COUNT(*) AS count_order
FROM
  SNOWFLAKE_SAMPLE_DATA.TPCH_SF100.LINEITEM
WHERE
  l_receiptdate <= DATEADD(DAY, -90, TO_DATE('1998-12-01'))
GROUP BY
  l_returnflag,
  l_linestatus
ORDER BY
  l_returnflag,
  l_linestatus;
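// (Optional) Compare the two runs side by side via the session query
// history. A sketch; TOTAL_ELAPSED_TIME is reported in milliseconds.
SELECT
  QUERY_TEXT,
  WAREHOUSE_SIZE,
  TOTAL_ELAPSED_TIME
FROM
  TABLE(INFORMATION_SCHEMA.QUERY_HISTORY_BY_SESSION())
WHERE
  QUERY_TEXT ILIKE 'SELECT%l_returnflag%'
ORDER BY
  START_TIME DESC
LIMIT 5;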
// Reset warehouse size and cached result behavior

// Reduce the size of the warehouse
ALTER WAREHOUSE "SNOWFLAKE_DEMO_WAREHOUSE" SET WAREHOUSE_SIZE = 'XSMALL';

// Re-enable cached results
ALTER SESSION SET USE_CACHED_RESULT = TRUE;
// ===========================================================================
// PART 1e - Auto-Suspend and Auto-Resume
//
// In this part we will show the effect of auto-suspend and auto-resume.
// Up until now our warehouse has had both enabled; let's take a look at
// what happens when AUTO_RESUME is not set to TRUE.
//
// Docs: https://community.snowflake.com/s/article/faq-what-is-the-behavior-of-a-warehouse-in-auto-resume-mode
// ===========================================================================
// Go ahead and modify the warehouse to turn off auto-resume, then suspend it
ALTER WAREHOUSE "SNOWFLAKE_DEMO_WAREHOUSE" SET AUTO_RESUME = FALSE;
ALTER WAREHOUSE "SNOWFLAKE_DEMO_WAREHOUSE" SUSPEND;
// Run the same query again and notice the error message thrown
SELECT
  l_returnflag,
  l_linestatus,
  SUM(l_quantity) AS sum_qty,
  SUM(l_extendedprice) AS sum_base_price,
  SUM(l_extendedprice * (1 - l_discount)) AS sum_disc_price,
  SUM(l_extendedprice * (1 - l_discount) * (1 + l_tax)) AS sum_charge,
  AVG(l_quantity) AS avg_qty,
  AVG(l_extendedprice) AS avg_price,
  AVG(l_discount) AS avg_disc,
  COUNT(*) AS count_order
FROM
  SNOWFLAKE_SAMPLE_DATA.TPCH_SF100.LINEITEM
WHERE
  l_receiptdate <= DATEADD(DAY, -90, TO_DATE('1998-12-01'))
GROUP BY
  l_returnflag,
  l_linestatus
ORDER BY
  l_returnflag,
  l_linestatus;
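// With auto-resume off, the warehouse must be started manually before
// queries can run against it:
ALTER WAREHOUSE "SNOWFLAKE_DEMO_WAREHOUSE" RESUME;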
// Go ahead and modify the warehouse to turn auto-resume back on
ALTER WAREHOUSE "SNOWFLAKE_DEMO_WAREHOUSE" SET AUTO_RESUME = TRUE;
// ===========================================================================
// PART 1f - Local Disk Cache
//
// In this part we will look at the local disk cache: data read from remote
// storage is cached on the warehouse's local disk, and that cache is lost
// when the warehouse is suspended or shut down.
//
// Docs: https://community.snowflake.com/s/article/Caching-in-Snowflake-Data-Warehouse
// ===========================================================================
// Set the auto-suspend to 1 hour to avoid any unexpected suspends
ALTER WAREHOUSE "SNOWFLAKE_DEMO_WAREHOUSE" SET AUTO_SUSPEND = 3600;

// Make sure you don't get the cached result
ALTER SESSION SET USE_CACHED_RESULT = FALSE;
// Run our big query.
SELECT
  l_returnflag,
  l_linestatus,
  SUM(l_quantity) AS sum_qty,
  SUM(l_extendedprice) AS sum_base_price,
  SUM(l_extendedprice * (1 - l_discount)) AS sum_disc_price,
  SUM(l_extendedprice * (1 - l_discount) * (1 + l_tax)) AS sum_charge,
  AVG(l_quantity) AS avg_qty,
  AVG(l_extendedprice) AS avg_price,
  AVG(l_discount) AS avg_disc,
  COUNT(*) AS count_order
FROM
  SNOWFLAKE_SAMPLE_DATA.TPCH_SF100.LINEITEM
WHERE
  l_receiptdate <= DATEADD(DAY, -60, TO_DATE('1998-12-01'))
GROUP BY
  l_returnflag,
  l_linestatus
ORDER BY
  l_returnflag,
  l_linestatus;
// Check out the query profile and notice the remote disk I/O
// (specifically, "Percentage scanned from cache"). Then modify
// the query to run over the same data with a different predicate.
SELECT
  l_returnflag,
  l_linestatus,
  SUM(l_quantity) AS sum_qty,
  SUM(l_extendedprice) AS sum_base_price,
  SUM(l_extendedprice * (1 - l_discount)) AS sum_disc_price,
  SUM(l_extendedprice * (1 - l_discount) * (1 + l_tax)) AS sum_charge,
  AVG(l_quantity) AS avg_qty,
  AVG(l_extendedprice) AS avg_price,
  AVG(l_discount) AS avg_disc,
  COUNT(*) AS count_order
FROM
  SNOWFLAKE_SAMPLE_DATA.TPCH_SF100.LINEITEM
WHERE
  l_receiptdate <= DATEADD(DAY, -30, TO_DATE('1998-12-01'))
GROUP BY
  l_returnflag,
  l_linestatus
ORDER BY
  l_returnflag,
  l_linestatus;
// Check out the query profile again, specifically the remote disk I/O.
// This behavior matters when designing your warehouse architecture, and
// it is heavily dependent on your use case.

// Let's reset everything
ALTER WAREHOUSE "SNOWFLAKE_DEMO_WAREHOUSE" SET AUTO_SUSPEND = 60;
ALTER SESSION SET USE_CACHED_RESULT = TRUE;
// ===========================================================================
// PART 1g - Semi-Structured Data
//
// In this part we will look at querying semi-structured JSON data directly.
//
// Docs: https://docs.snowflake.net/manuals/user-guide/semistructured-concepts.html
// ===========================================================================

// Let's shift to some weather data.
// How many weather records are there?
SELECT COUNT(*) FROM weather.weather_14_total;
// View the semi-structured data directly in the table
SELECT
  *
FROM
  weather.weather_14_total
LIMIT 10;

// Select JSON elements directly from the JSON document in column V
SELECT
  V:time,
  V:city.name,
  V:city.country,
  V:main.temp,
  V:weather
FROM
  weather.weather_14_total
LIMIT 10;
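// (Optional) Arrays inside the document can be exploded with LATERAL FLATTEN.
// A minimal sketch, assuming V:weather is an array of condition objects with
// a "main" field (as in the OpenWeatherMap format):
SELECT
  w.V:city.name::string AS city,
  f.value:main::string AS condition
FROM
  weather.weather_14_total w,
  LATERAL FLATTEN(input => w.V:weather) f
LIMIT 10;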
// ===========================================================================
// PART 1h - Data Pruning
//
// In this part we want to look at data pruning, and how micro-partitions
// play into query optimization.
//
// Details on micro-partitions and clustering:
// https://support.snowflake.net/s/article/understanding-micro-partitions-and-data-clustering
// ===========================================================================
// Speed up the next few queries
ALTER WAREHOUSE "SNOWFLAKE_DEMO_WAREHOUSE" SET WAREHOUSE_SIZE = 'LARGE';

// View all weather records for the past 2 weeks
SELECT
  *
FROM
  weather.weather_14_total
WHERE
  T >= DATEADD(DAY, -14, CURRENT_TIMESTAMP);

// Go to the query profile and look at the total number of partitions
// and the partitions that were actually scanned. Now execute the next
// query.

// View all weather records for the past week (pruning)
SELECT
  *
FROM
  weather.weather_14_total
WHERE
  T >= DATEADD(DAY, -7, CURRENT_TIMESTAMP);

// Notice how the number of partitions scanned decreased
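// (Optional) Clustering metadata for the table can be inspected directly.
// A sketch, assuming the function is permitted on the shared sample table:
SELECT SYSTEM$CLUSTERING_INFORMATION('weather.weather_14_total', '(T)');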
// Now we will create a UDF to convert temperatures from Kelvin to Fahrenheit
CREATE OR REPLACE FUNCTION
  UTIL_DB.PUBLIC.convert_fahrenheit(t NUMBER)
RETURNS NUMBER
COMMENT = 'Convert from Kelvin to Fahrenheit'
AS '(t - 273.15) * 1.8000 + 32.00';
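// (Optional) Quick sanity check: 273.15 K is the freezing point of water,
// so this should return 32.
SELECT UTIL_DB.PUBLIC.convert_fahrenheit(273.15);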
// View a sample of the values in the weather data, with casts and
// temperature conversion to Fahrenheit
SELECT
  V:time::timestamp time,
  V:city.name::string city,
  V:city.country country,
  UTIL_DB.PUBLIC.convert_fahrenheit(V:main.temp::number(38,4)) temp
FROM
  weather.weather_14_total
LIMIT 10;

// View the high temps by city in descending order for the last 24 hours
SELECT
  V:city.name::string city,
  V:city.country::string country,
  MAX(UTIL_DB.PUBLIC.convert_fahrenheit(V:main.temp::number(38,4))) AS high
FROM
  weather.weather_14_total
WHERE
  T >= DATEADD('DAY', -1, CURRENT_TIMESTAMP)
GROUP BY
  1, 2
ORDER BY
  3 DESC;
// Clean up the resources we created for this demo
DROP WAREHOUSE IF EXISTS SNOWFLAKE_DEMO_WAREHOUSE;
DROP FUNCTION IF EXISTS UTIL_DB.PUBLIC.convert_fahrenheit(number);

// ===========================================================================
// ===========================================================================
// PART 2 - SEATTLE LIBRARY DATA
//
// For this next section, let's load some external data and do some
// data cleaning and exploration with Kaggle data for Seattle Public Library
// checkouts.
// ===========================================================================
// Let's create a new warehouse for this work
CREATE WAREHOUSE IF NOT EXISTS
  LIBRARY_DEMO_WAREHOUSE
WITH
  WAREHOUSE_SIZE = 'MEDIUM'
  WAREHOUSE_TYPE = 'STANDARD'
  AUTO_SUSPEND = 600
  AUTO_RESUME = TRUE;

// Use the warehouse
USE WAREHOUSE LIBRARY_DEMO_WAREHOUSE;

// Create the workshop database
CREATE OR REPLACE DATABASE SNOWFLAKE_WORKSHOP;

// Create the library schema
CREATE SCHEMA IF NOT EXISTS "SNOWFLAKE_WORKSHOP"."LIBRARY";

// Use the schema
USE SCHEMA "SNOWFLAKE_WORKSHOP"."LIBRARY";

// Create the checkouts table
CREATE OR REPLACE TABLE
  SNOWFLAKE_WORKSHOP.LIBRARY.CHECKOUTS
(
  BibNum NUMBER,
  ItemBarcode NUMBER,
  ItemType VARCHAR(30),
  Collection VARCHAR(30),
  CallNumber VARCHAR(70),
  CheckoutDateTime VARCHAR(30)
);

// Create the inventory table
CREATE OR REPLACE TABLE
  SNOWFLAKE_WORKSHOP.LIBRARY.INVENTORY
(
  BibNum NUMBER,
  Title VARCHAR(1024),
  Author VARCHAR(1024),
  ISBN VARCHAR(1024),
  PublicationYear VARCHAR(1024),
  Publisher VARCHAR(1024),
  Subjects VARCHAR(1024),
  ItemType VARCHAR(10),
  ItemCollection VARCHAR(10),
  FloatingItem VARCHAR(10),
  ItemLocation VARCHAR(10),
  ReportDate VARCHAR(35),
  ItemCount NUMBER
);
// Create the file format
CREATE OR REPLACE FILE FORMAT
  SNOWFLAKE_WORKSHOP.LIBRARY.LOAD_CSV_FORMAT
WITH
  TYPE = 'CSV'
  FIELD_DELIMITER = ','
  FIELD_OPTIONALLY_ENCLOSED_BY = '"'
  SKIP_HEADER = 1;

// Create the stage LIBRARY_LOAD
CREATE OR REPLACE STAGE
  SNOWFLAKE_WORKSHOP.LIBRARY.LIBRARY_LOAD
WITH
  URL = 's3://hm-snowflake/library'
  CREDENTIALS = (
    AWS_KEY_ID = '<SEE GOOGLE DOC FOR SECRET KEY>'
    AWS_SECRET_KEY = '<SEE GOOGLE DOC FOR SECRET KEY>'
  );

// Show the created stages
SHOW STAGES;

// List the files in the stage
LIST @SNOWFLAKE_WORKSHOP.LIBRARY.LIBRARY_LOAD;
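// (Optional) You can peek at the raw staged files before loading; $1, $2, ...
// address the CSV columns by position. A minimal sketch:
SELECT
  t.$1, t.$2, t.$3
FROM
  @SNOWFLAKE_WORKSHOP.LIBRARY.LIBRARY_LOAD
    (FILE_FORMAT => 'SNOWFLAKE_WORKSHOP.LIBRARY.LOAD_CSV_FORMAT') t
LIMIT 10;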
// Copy inventory dry run.
// The "1%" in ON_ERROR means a file is skipped if more than 1% of the
// rows it contains fail to load.
COPY INTO
  SNOWFLAKE_WORKSHOP.LIBRARY.INVENTORY
FROM
  @SNOWFLAKE_WORKSHOP.LIBRARY.LIBRARY_LOAD
PATTERN = '.*Library_Collection.*'
FILE_FORMAT = (FORMAT_NAME = SNOWFLAKE_WORKSHOP.LIBRARY.LOAD_CSV_FORMAT)
ON_ERROR = 'skip_file_1%'
VALIDATION_MODE = RETURN_100_ROWS;
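// (Optional) To see the problem rows themselves rather than a sample of
// parsed rows, validate with RETURN_ERRORS instead:
COPY INTO
  SNOWFLAKE_WORKSHOP.LIBRARY.INVENTORY
FROM
  @SNOWFLAKE_WORKSHOP.LIBRARY.LIBRARY_LOAD
PATTERN = '.*Library_Collection.*'
FILE_FORMAT = (FORMAT_NAME = SNOWFLAKE_WORKSHOP.LIBRARY.LOAD_CSV_FORMAT)
VALIDATION_MODE = RETURN_ERRORS;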
// Copy inventory data
COPY INTO
  SNOWFLAKE_WORKSHOP.LIBRARY.INVENTORY
FROM
  @SNOWFLAKE_WORKSHOP.LIBRARY.LIBRARY_LOAD
PATTERN = '.*Library_Collection.*'
FILE_FORMAT = (FORMAT_NAME = SNOWFLAKE_WORKSHOP.LIBRARY.LOAD_CSV_FORMAT)
ON_ERROR = 'skip_file_1%'
TRUNCATECOLUMNS = TRUE;

// Get inventory count
SELECT COUNT(*) FROM SNOWFLAKE_WORKSHOP.LIBRARY.INVENTORY;

// Copy checkouts dry run
COPY INTO
  SNOWFLAKE_WORKSHOP.LIBRARY.CHECKOUTS
FROM
  @SNOWFLAKE_WORKSHOP.LIBRARY.LIBRARY_LOAD
PATTERN = '.*Checkouts_By_Title_Data_Lens_20.*'
FILE_FORMAT = (FORMAT_NAME = SNOWFLAKE_WORKSHOP.LIBRARY.LOAD_CSV_FORMAT)
ON_ERROR = 'skip_file_1%'
VALIDATION_MODE = RETURN_100_ROWS;

// Copy checkouts data
COPY INTO
  SNOWFLAKE_WORKSHOP.LIBRARY.CHECKOUTS
FROM
  @SNOWFLAKE_WORKSHOP.LIBRARY.LIBRARY_LOAD
PATTERN = '.*Checkouts_By_Title_Data_Lens_20.*'
FILE_FORMAT = (FORMAT_NAME = SNOWFLAKE_WORKSHOP.LIBRARY.LOAD_CSV_FORMAT)
ON_ERROR = 'skip_file_1%'
TRUNCATECOLUMNS = TRUE;

// Confirm checkouts data loaded by grabbing the count
SELECT COUNT(*) FROM SNOWFLAKE_WORKSHOP.LIBRARY.CHECKOUTS;
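// (Optional) Inspect the load results via the COPY_HISTORY table function,
// which reports per-file row counts, errors, and skipped files. A sketch:
SELECT
  FILE_NAME, STATUS, ROW_COUNT, ERROR_COUNT
FROM
  TABLE(INFORMATION_SCHEMA.COPY_HISTORY(
    TABLE_NAME => 'SNOWFLAKE_WORKSHOP.LIBRARY.CHECKOUTS',
    START_TIME => DATEADD(HOUR, -1, CURRENT_TIMESTAMP())));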
// With data loaded, let's do some exploring.
// Frequency analysis for checkouts by day of the week
SELECT
  COUNT(*) AS CNT,
  DAYNAME(TO_DATE(SPLIT_PART(CHECKOUTDATETIME, ' ', 0))) AS DAY
FROM
  SNOWFLAKE_WORKSHOP.LIBRARY.CHECKOUTS
GROUP BY
  DAY;
// Which checkout items are the most popular?
WITH TOP_20_ITEMS AS (
  SELECT
    COUNT(*) AS CNT,
    C.BIBNUM
  FROM
    SNOWFLAKE_WORKSHOP.LIBRARY.CHECKOUTS C
    INNER JOIN SNOWFLAKE_WORKSHOP.LIBRARY.INVENTORY I
      ON C.BIBNUM = I.BIBNUM
  GROUP BY
    C.BIBNUM
  ORDER BY
    CNT DESC
  LIMIT 20
)
SELECT DISTINCT
  A.CNT,
  A.BIBNUM,
  B.TITLE,
  B.AUTHOR
FROM
  TOP_20_ITEMS A
  INNER JOIN SNOWFLAKE_WORKSHOP.LIBRARY.INVENTORY B
    ON A.BIBNUM = B.BIBNUM
ORDER BY
  A.CNT DESC;
// How does popularity change over time? Let's find the top items
// for each year.
CREATE OR REPLACE TEMPORARY TABLE
  SNOWFLAKE_WORKSHOP.LIBRARY.CHECKOUTS_BY_YEAR
AS (
  SELECT DISTINCT
    COUNT(*) AS CHECKOUT_COUNT,
    YEAR(TO_DATE(SPLIT_PART(C.CHECKOUTDATETIME, ' ', 0))) AS CHECKOUT_YEAR,
    I.TITLE AS TITLE,
    I.AUTHOR AS AUTHOR
  FROM
    SNOWFLAKE_WORKSHOP.LIBRARY.CHECKOUTS C
    INNER JOIN SNOWFLAKE_WORKSHOP.LIBRARY.INVENTORY I
      ON C.BIBNUM = I.BIBNUM
  GROUP BY
    CHECKOUT_YEAR, AUTHOR, TITLE
);

// Using our handy temporary table, let's get the top items by year
WITH RANKED_CHECKOUTS_BY_YEAR AS (
  SELECT
    *,
    RANK() OVER (
      PARTITION BY CHECKOUT_YEAR
      ORDER BY CHECKOUT_COUNT DESC
    ) AS RANK
  FROM
    SNOWFLAKE_WORKSHOP.LIBRARY.CHECKOUTS_BY_YEAR
)
SELECT * FROM RANKED_CHECKOUTS_BY_YEAR WHERE RANK = 1 ORDER BY CHECKOUT_YEAR DESC;
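// (Optional) Snowflake's QUALIFY clause can express the same filter without
// the CTE. A sketch of the equivalent query:
SELECT
  *,
  RANK() OVER (PARTITION BY CHECKOUT_YEAR ORDER BY CHECKOUT_COUNT DESC) AS RANK
FROM
  SNOWFLAKE_WORKSHOP.LIBRARY.CHECKOUTS_BY_YEAR
QUALIFY RANK = 1
ORDER BY CHECKOUT_YEAR DESC;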
// Now, let's grab some smaller data and do some time traveling!

// Size our warehouse down
ALTER WAREHOUSE IF EXISTS
  LIBRARY_DEMO_WAREHOUSE
SET
  WAREHOUSE_SIZE = 'XSMALL';

// Create the dictionary table
CREATE OR REPLACE TABLE
  SNOWFLAKE_WORKSHOP.LIBRARY.DICTIONARY
(
  Code VARCHAR(20),
  Description VARCHAR(1024),
  CodeType VARCHAR(30),
  Format_Group VARCHAR(30),
  Format_Subgroup VARCHAR(50),
  Cat_Group VARCHAR(50),
  Cat_Subgroup VARCHAR(50)
);

// Copy the dictionary data
COPY INTO
  SNOWFLAKE_WORKSHOP.LIBRARY.DICTIONARY
FROM
  @SNOWFLAKE_WORKSHOP.LIBRARY.LIBRARY_LOAD
PATTERN = '.*Integrated_Library_System.*'
FILE_FORMAT = (FORMAT_NAME = SNOWFLAKE_WORKSHOP.LIBRARY.LOAD_CSV_FORMAT)
ON_ERROR = 'skip_file_1%';

// Should be 555. MAKE SURE TO COPY THE QUERY ID FOR THIS QUERY
SELECT COUNT(*) FROM SNOWFLAKE_WORKSHOP.LIBRARY.DICTIONARY;
// Store the query ID for time travel
SET QUERY_ID = (
  SELECT QUERY_ID
  FROM TABLE(INFORMATION_SCHEMA.QUERY_HISTORY_BY_SESSION())
  LIMIT 1 OFFSET 2
);
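// (Optional) Confirm the captured query ID
SELECT $QUERY_ID;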
// Insert some bad data
INSERT INTO
  SNOWFLAKE_WORKSHOP.LIBRARY.DICTIONARY
  (CODE, CODETYPE, FORMAT_GROUP, FORMAT_SUBGROUP, CAT_GROUP, CAT_SUBGROUP)
VALUES
  ('badcode', 'mistake', 'oops', 'wrong', 'badcat', 'badsubcat'),
  ('badcode2', 'mistake2', 'oops2', 'wrong2', 'badcat2', 'badsubcat2');

// Should be 557
SELECT COUNT(*) FROM SNOWFLAKE_WORKSHOP.LIBRARY.DICTIONARY;
// Time travel back using the query ID that you recorded above
SELECT COUNT(*) FROM SNOWFLAKE_WORKSHOP.LIBRARY.DICTIONARY AT(STATEMENT => '<INSERT YOUR QUERY ID HERE>');

// Or use the query ID variable we set earlier
SELECT COUNT(*) FROM SNOWFLAKE_WORKSHOP.LIBRARY.DICTIONARY AT(STATEMENT => $QUERY_ID);
// Create a clone of the table as it existed before the bad data was
// inserted, using the query ID we recorded
CREATE OR REPLACE TABLE
  SNOWFLAKE_WORKSHOP.LIBRARY.DICTIONARY_CLONE
CLONE
  SNOWFLAKE_WORKSHOP.LIBRARY.DICTIONARY
  AT(STATEMENT => $QUERY_ID);
// Destroy the table
DROP TABLE SNOWFLAKE_WORKSHOP.LIBRARY.DICTIONARY;

// Rename the clone
ALTER TABLE SNOWFLAKE_WORKSHOP.LIBRARY.DICTIONARY_CLONE RENAME TO SNOWFLAKE_WORKSHOP.LIBRARY.DICTIONARY;

// See the history of changes for the table
SHOW TABLES HISTORY LIKE '%DICTIONARY%';
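// (Optional) Time travel can also restore a dropped table directly with
// UNDROP; not run here, since the renamed clone now occupies the name:
// UNDROP TABLE SNOWFLAKE_WORKSHOP.LIBRARY.DICTIONARY;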
// Confirm the bad data was removed (the count should be 555 again)
SELECT COUNT(*) FROM DICTIONARY;

// Cleanup
DROP WAREHOUSE IF EXISTS LIBRARY_DEMO_WAREHOUSE;
DROP DATABASE IF EXISTS SNOWFLAKE_WORKSHOP;
// ===========================================================================
// ===========================================================================
// PART 3 - STITCH ETL
//
// For this next section, let's use the Talend Stitch + Snowflake integration
// to ingest some GitHub data.
// ===========================================================================
// See the Google docs for instructions on the Stitch integration.
//
// From here, feel free to use the skills you've learned today to explore
// the GitHub data (I wonder which users were the top contributors each month).
// ===========================================================================

// FINAL CLEANUP
DROP RESOURCE MONITOR IF EXISTS SNOWFLAKE_DEMO_MONITOR;