This file contains the SQL for the March 2019 Hashmap + Snowflake Data for Breakfast workshop demonstrations.
// ===========================================================================
// Welcome!
//
// Log in to your Snowflake instance, create a new worksheet, then
// copy and paste the contents of this file into your worksheet.
// ===========================================================================

// ===========================================================================
// PART 1 - GETTING COMFORTABLE IN SNOWFLAKE
//
// In this first part, we'll get to know some basic parts of Snowflake
// and play with some demo data.
// ===========================================================================

// Start by changing context so that you're the ACCOUNTADMIN
USE ROLE ACCOUNTADMIN;

// Create a resource monitor so we don't spend too much
CREATE RESOURCE MONITOR IF NOT EXISTS
  SNOWFLAKE_DEMO_MONITOR
WITH
  CREDIT_QUOTA = 200
  TRIGGERS
    ON 50 PERCENT DO NOTIFY
    ON 90 PERCENT DO SUSPEND
    ON 100 PERCENT DO SUSPEND_IMMEDIATE;

ALTER ACCOUNT SET RESOURCE_MONITOR = SNOWFLAKE_DEMO_MONITOR;
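// (Optional) Sanity-check the monitor: SHOW RESOURCE MONITORS reports the
// quota, credits used, and trigger thresholds. A minimal check, assuming the
// ACCOUNTADMIN role from above.
SHOW RESOURCE MONITORS LIKE 'SNOWFLAKE_DEMO_MONITOR';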
// Create a warehouse
CREATE WAREHOUSE IF NOT EXISTS
  SNOWFLAKE_DEMO_WAREHOUSE
WITH
  WAREHOUSE_SIZE = XSMALL
  WAREHOUSE_TYPE = STANDARD
  AUTO_SUSPEND = 60
  AUTO_RESUME = TRUE
  INITIALLY_SUSPENDED = TRUE;
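// (Optional) Confirm the warehouse exists and starts out suspended;
// SHOW WAREHOUSES reports state, size, and the suspend/resume settings.
SHOW WAREHOUSES LIKE 'SNOWFLAKE_DEMO_WAREHOUSE';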
// ===========================================================================
// PART 1a - Setting the worksheet context
//
// Run each query and notice the context change in the upper right corner of
// the workbook.
//
// Docs: https://docs.snowflake.net/manuals/user-guide/snowflake-manager.html#worksheet-page
// ===========================================================================

// Set the warehouse for the current session
USE WAREHOUSE SNOWFLAKE_DEMO_WAREHOUSE;

// Set the database and schema for the current session
USE SNOWFLAKE_SAMPLE_DATA.TPCH_SF100;
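// (Optional) Verify the session context programmatically (note that running
// any query will auto-resume the suspended warehouse):
SELECT CURRENT_ROLE(), CURRENT_WAREHOUSE(), CURRENT_DATABASE(), CURRENT_SCHEMA();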
// ===========================================================================
// PART 1b - Evaluating a query
//
// Run the query below and take a look at the metrics, time taken, and
// query profile.
//
// Docs: https://docs.snowflake.net/manuals/user-guide/ui-query-profile.html
// ===========================================================================

// Pricing Summary Report Query
// This query reports the amount of business that was billed, shipped, and returned.
SELECT
  l_returnflag,
  l_linestatus,
  SUM(l_quantity) AS sum_qty,
  SUM(l_extendedprice) AS sum_base_price,
  SUM(l_extendedprice * (1 - l_discount)) AS sum_disc_price,
  SUM(l_extendedprice * (1 - l_discount) * (1 + l_tax)) AS sum_charge,
  AVG(l_quantity) AS avg_qty,
  AVG(l_extendedprice) AS avg_price,
  AVG(l_discount) AS avg_disc,
  COUNT(*) AS count_order
FROM
  SNOWFLAKE_SAMPLE_DATA.TPCH_SF100.LINEITEM
WHERE
  l_receiptdate <= DATEADD(DAY, -90, TO_DATE('1998-12-01'))
GROUP BY
  l_returnflag,
  l_linestatus
ORDER BY
  l_returnflag,
  l_linestatus;
// ===========================================================================
// PART 1c - Demonstrating the Persisted Query Results feature
//
// Query results are persisted for 24 hours. This part demonstrates how the
// feature works and how it can be used to avoid recomputing queries.
//
// Docs: https://docs.snowflake.net/manuals/user-guide/querying-persisted-results.html
// ===========================================================================
// Stop the warehouse
ALTER WAREHOUSE "SNOWFLAKE_DEMO_WAREHOUSE" SUSPEND;

// Run the query again and notice that the warehouse is still stopped.
// Also notice the query profile and the time it took to run.
SELECT
  l_returnflag,
  l_linestatus,
  SUM(l_quantity) AS sum_qty,
  SUM(l_extendedprice) AS sum_base_price,
  SUM(l_extendedprice * (1 - l_discount)) AS sum_disc_price,
  SUM(l_extendedprice * (1 - l_discount) * (1 + l_tax)) AS sum_charge,
  AVG(l_quantity) AS avg_qty,
  AVG(l_extendedprice) AS avg_price,
  AVG(l_discount) AS avg_disc,
  COUNT(*) AS count_order
FROM
  SNOWFLAKE_SAMPLE_DATA.TPCH_SF100.LINEITEM
WHERE
  l_receiptdate <= DATEADD(DAY, -90, TO_DATE('1998-12-01'))
GROUP BY
  l_returnflag,
  l_linestatus
ORDER BY
  l_returnflag,
  l_linestatus;
// Go to the Warehouses tab and notice that the warehouse did not resume,
// then open the query profile to see that the result came from QUERY RESULT REUSE.
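// (Optional) The persisted result can also be queried directly with
// RESULT_SCAN. A minimal sketch; note that RESULT_SCAN itself runs a query,
// which may resume the warehouse.
SELECT * FROM TABLE(RESULT_SCAN(LAST_QUERY_ID()));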
// ===========================================================================
// PART 1d - Vertically scaling a warehouse
//
// In this part we will vertically scale a warehouse (increase its t-shirt
// size) to show the increase in performance.
//
// Docs: https://docs.snowflake.net/manuals/user-guide/warehouses-tasks.html#resizing-a-warehouse
// ===========================================================================

// Increase the size of the warehouse
ALTER WAREHOUSE "SNOWFLAKE_DEMO_WAREHOUSE" SET WAREHOUSE_SIZE = 'LARGE';
// Make sure you don't get the cached result
ALTER SESSION SET USE_CACHED_RESULT = FALSE;

// Run the same query again and notice that it runs faster
SELECT
  l_returnflag,
  l_linestatus,
  SUM(l_quantity) AS sum_qty,
  SUM(l_extendedprice) AS sum_base_price,
  SUM(l_extendedprice * (1 - l_discount)) AS sum_disc_price,
  SUM(l_extendedprice * (1 - l_discount) * (1 + l_tax)) AS sum_charge,
  AVG(l_quantity) AS avg_qty,
  AVG(l_extendedprice) AS avg_price,
  AVG(l_discount) AS avg_disc,
  COUNT(*) AS count_order
FROM
  SNOWFLAKE_SAMPLE_DATA.TPCH_SF100.LINEITEM
WHERE
  l_receiptdate <= DATEADD(DAY, -90, TO_DATE('1998-12-01'))
GROUP BY
  l_returnflag,
  l_linestatus
ORDER BY
  l_returnflag,
  l_linestatus;
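// (Optional) Compare the two runs side by side via the session query
// history. A sketch; TOTAL_ELAPSED_TIME is reported in milliseconds.
SELECT
  QUERY_TEXT,
  WAREHOUSE_SIZE,
  TOTAL_ELAPSED_TIME
FROM
  TABLE(INFORMATION_SCHEMA.QUERY_HISTORY_BY_SESSION())
WHERE
  QUERY_TEXT ILIKE 'SELECT%l_returnflag%'
ORDER BY
  START_TIME DESC
LIMIT 5;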
// Reset warehouse size and cached result behavior

// Reduce the size of the warehouse
ALTER WAREHOUSE "SNOWFLAKE_DEMO_WAREHOUSE" SET WAREHOUSE_SIZE = 'XSMALL';

// Re-enable cached results
ALTER SESSION SET USE_CACHED_RESULT = TRUE;
// ===========================================================================
// PART 1e - Auto-Suspend and Auto-Resume
//
// In this part we will show the effect of auto-suspend and auto-resume.
// Up until now our warehouse has had both enabled; let's take a look at
// what happens when AUTO_RESUME is not set to TRUE.
//
// Docs: https://community.snowflake.com/s/article/faq-what-is-the-behavior-of-a-warehouse-in-auto-resume-mode
// ===========================================================================
// Go ahead and modify the warehouse to turn off auto-resume, then suspend it
ALTER WAREHOUSE "SNOWFLAKE_DEMO_WAREHOUSE" SET AUTO_RESUME = FALSE;
ALTER WAREHOUSE "SNOWFLAKE_DEMO_WAREHOUSE" SUSPEND;
// Run the same query again and notice the error message thrown
SELECT
  l_returnflag,
  l_linestatus,
  SUM(l_quantity) AS sum_qty,
  SUM(l_extendedprice) AS sum_base_price,
  SUM(l_extendedprice * (1 - l_discount)) AS sum_disc_price,
  SUM(l_extendedprice * (1 - l_discount) * (1 + l_tax)) AS sum_charge,
  AVG(l_quantity) AS avg_qty,
  AVG(l_extendedprice) AS avg_price,
  AVG(l_discount) AS avg_disc,
  COUNT(*) AS count_order
FROM
  SNOWFLAKE_SAMPLE_DATA.TPCH_SF100.LINEITEM
WHERE
  l_receiptdate <= DATEADD(DAY, -90, TO_DATE('1998-12-01'))
GROUP BY
  l_returnflag,
  l_linestatus
ORDER BY
  l_returnflag,
  l_linestatus;
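// With auto-resume off, the warehouse must be started manually before
// queries can run against it:
ALTER WAREHOUSE "SNOWFLAKE_DEMO_WAREHOUSE" RESUME;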
// Go ahead and modify the warehouse to turn auto-resume back on
ALTER WAREHOUSE "SNOWFLAKE_DEMO_WAREHOUSE" SET AUTO_RESUME = TRUE;
// ===========================================================================
// PART 1f - Local Disk Cache
//
// In this part we will look at the local disk cache: data read from remote
// storage is cached on the warehouse's local disk, and that cache is lost
// when the warehouse is suspended or shut down.
//
// Docs: https://community.snowflake.com/s/article/Caching-in-Snowflake-Data-Warehouse
// ===========================================================================
// Set the auto-suspend to 1 hour to avoid any unexpected suspends
ALTER WAREHOUSE "SNOWFLAKE_DEMO_WAREHOUSE" SET AUTO_SUSPEND = 3600;

// Make sure you don't get the cached result
ALTER SESSION SET USE_CACHED_RESULT = FALSE;
// Run our big query.
SELECT
  l_returnflag,
  l_linestatus,
  SUM(l_quantity) AS sum_qty,
  SUM(l_extendedprice) AS sum_base_price,
  SUM(l_extendedprice * (1 - l_discount)) AS sum_disc_price,
  SUM(l_extendedprice * (1 - l_discount) * (1 + l_tax)) AS sum_charge,
  AVG(l_quantity) AS avg_qty,
  AVG(l_extendedprice) AS avg_price,
  AVG(l_discount) AS avg_disc,
  COUNT(*) AS count_order
FROM
  SNOWFLAKE_SAMPLE_DATA.TPCH_SF100.LINEITEM
WHERE
  l_receiptdate <= DATEADD(DAY, -60, TO_DATE('1998-12-01'))
GROUP BY
  l_returnflag,
  l_linestatus
ORDER BY
  l_returnflag,
  l_linestatus;
// Check out the query profile and notice the remote disk I/O
// (specifically, "Percentage scanned from cache"). Then modify
// the query to run over the same data with a different predicate.
SELECT
  l_returnflag,
  l_linestatus,
  SUM(l_quantity) AS sum_qty,
  SUM(l_extendedprice) AS sum_base_price,
  SUM(l_extendedprice * (1 - l_discount)) AS sum_disc_price,
  SUM(l_extendedprice * (1 - l_discount) * (1 + l_tax)) AS sum_charge,
  AVG(l_quantity) AS avg_qty,
  AVG(l_extendedprice) AS avg_price,
  AVG(l_discount) AS avg_disc,
  COUNT(*) AS count_order
FROM
  SNOWFLAKE_SAMPLE_DATA.TPCH_SF100.LINEITEM
WHERE
  l_receiptdate <= DATEADD(DAY, -30, TO_DATE('1998-12-01'))
GROUP BY
  l_returnflag,
  l_linestatus
ORDER BY
  l_returnflag,
  l_linestatus;
// Check out the query profile again, specifically the remote disk I/O.
// This behavior matters when designing your warehouse architecture, and
// it is heavily dependent on your use case.

// Let's reset everything
ALTER WAREHOUSE "SNOWFLAKE_DEMO_WAREHOUSE" SET AUTO_SUSPEND = 60;
ALTER SESSION SET USE_CACHED_RESULT = TRUE;
// ===========================================================================
// PART 1g - Semi-Structured Data
//
// In this part we will look at querying semi-structured JSON data directly.
//
// Docs: https://docs.snowflake.net/manuals/user-guide/semistructured-concepts.html
// ===========================================================================

// Let's shift to some weather data.
// How many weather records are there?
SELECT COUNT(*) FROM weather.weather_14_total;
// View the semi-structured data directly in the table
SELECT
  *
FROM
  weather.weather_14_total
LIMIT 10;

// Select JSON elements directly from the JSON document in column V
SELECT
  V:time,
  V:city.name,
  V:city.country,
  V:main.temp,
  V:weather
FROM
  weather.weather_14_total
LIMIT 10;
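// (Optional) Arrays inside the document can be exploded with LATERAL FLATTEN.
// A minimal sketch, assuming V:weather is an array of condition objects with
// a "main" field (as in the OpenWeatherMap format):
SELECT
  w.V:city.name::string AS city,
  f.value:main::string AS condition
FROM
  weather.weather_14_total w,
  LATERAL FLATTEN(input => w.V:weather) f
LIMIT 10;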
// ===========================================================================
// PART 1h - Data Pruning
//
// In this part we want to look at data pruning, and how micro-partitions
// play into query optimization.
//
// Details on micro-partitions and clustering:
// https://support.snowflake.net/s/article/understanding-micro-partitions-and-data-clustering
// ===========================================================================
// Speed up the next few queries
ALTER WAREHOUSE "SNOWFLAKE_DEMO_WAREHOUSE" SET WAREHOUSE_SIZE = 'LARGE';

// View all weather records for the past 2 weeks
SELECT
  *
FROM
  weather.weather_14_total
WHERE
  T >= DATEADD(DAY, -14, CURRENT_TIMESTAMP);

// Go to the query profile and look at the total number of partitions
// and the partitions that were actually scanned. Now execute the next
// query.

// View all weather records for the past week (pruning)
SELECT
  *
FROM
  weather.weather_14_total
WHERE
  T >= DATEADD(DAY, -7, CURRENT_TIMESTAMP);

// Notice how the number of partitions scanned decreased
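// (Optional) Clustering metadata for the table can be inspected directly.
// A sketch, assuming the function is permitted on the shared sample table:
SELECT SYSTEM$CLUSTERING_INFORMATION('weather.weather_14_total', '(T)');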
// Now we will create a UDF to convert temperatures from Kelvin to Fahrenheit
CREATE OR REPLACE FUNCTION
  UTIL_DB.PUBLIC.convert_fahrenheit(t NUMBER)
RETURNS NUMBER
COMMENT = 'Convert from Kelvin to Fahrenheit'
AS '(t - 273.15) * 1.8000 + 32.00';
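// (Optional) Quick sanity check: 273.15 K is the freezing point of water,
// so this should return 32.
SELECT UTIL_DB.PUBLIC.convert_fahrenheit(273.15);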
// View a sample of the values in the weather data, with casts and
// temperature conversion to Fahrenheit
SELECT
  V:time::timestamp time,
  V:city.name::string city,
  V:city.country country,
  UTIL_DB.PUBLIC.convert_fahrenheit(V:main.temp::number(38,4)) temp
FROM
  weather.weather_14_total
LIMIT 10;

// View the high temps by city in descending order for the last 24 hours
SELECT
  V:city.name::string city,
  V:city.country::string country,
  MAX(UTIL_DB.PUBLIC.convert_fahrenheit(V:main.temp::number(38,4))) AS high
FROM
  weather.weather_14_total
WHERE
  T >= DATEADD('DAY', -1, CURRENT_TIMESTAMP)
GROUP BY
  1, 2
ORDER BY
  3 DESC;
// Clean up the resources we created for this demo
DROP WAREHOUSE IF EXISTS SNOWFLAKE_DEMO_WAREHOUSE;
DROP FUNCTION IF EXISTS UTIL_DB.PUBLIC.convert_fahrenheit(number);

// ===========================================================================
// ===========================================================================
// PART 2 - SEATTLE LIBRARY DATA
//
// For this next section, let's load some external data and do some
// data cleaning and exploration with Kaggle data for Seattle Public Library
// checkouts.
// ===========================================================================
// Let's create a new warehouse for this work
CREATE WAREHOUSE IF NOT EXISTS
  LIBRARY_DEMO_WAREHOUSE
WITH
  WAREHOUSE_SIZE = 'MEDIUM'
  WAREHOUSE_TYPE = 'STANDARD'
  AUTO_SUSPEND = 600
  AUTO_RESUME = TRUE;

// Use the warehouse
USE WAREHOUSE LIBRARY_DEMO_WAREHOUSE;

// Create the workshop database
CREATE OR REPLACE DATABASE SNOWFLAKE_WORKSHOP;

// Create the library schema
CREATE SCHEMA IF NOT EXISTS "SNOWFLAKE_WORKSHOP"."LIBRARY";

// Use the schema
USE SCHEMA "SNOWFLAKE_WORKSHOP"."LIBRARY";

// Create the checkouts table
CREATE OR REPLACE TABLE
  SNOWFLAKE_WORKSHOP.LIBRARY.CHECKOUTS
(
  BibNum NUMBER,
  ItemBarcode NUMBER,
  ItemType VARCHAR(30),
  Collection VARCHAR(30),
  CallNumber VARCHAR(70),
  CheckoutDateTime VARCHAR(30)
);

// Create the inventory table
CREATE OR REPLACE TABLE
  SNOWFLAKE_WORKSHOP.LIBRARY.INVENTORY
(
  BibNum NUMBER,
  Title VARCHAR(1024),
  Author VARCHAR(1024),
  ISBN VARCHAR(1024),
  PublicationYear VARCHAR(1024),
  Publisher VARCHAR(1024),
  Subjects VARCHAR(1024),
  ItemType VARCHAR(10),
  ItemCollection VARCHAR(10),
  FloatingItem VARCHAR(10),
  ItemLocation VARCHAR(10),
  ReportDate VARCHAR(35),
  ItemCount NUMBER
);
// Create the file format
CREATE OR REPLACE FILE FORMAT
  SNOWFLAKE_WORKSHOP.LIBRARY.LOAD_CSV_FORMAT
WITH
  TYPE = 'CSV'
  FIELD_DELIMITER = ','
  FIELD_OPTIONALLY_ENCLOSED_BY = '"'
  SKIP_HEADER = 1;

// Create the stage LIBRARY_LOAD
CREATE OR REPLACE STAGE
  SNOWFLAKE_WORKSHOP.LIBRARY.LIBRARY_LOAD
WITH
  URL = 's3://hm-snowflake/library'
  CREDENTIALS = (
    AWS_KEY_ID = '<SEE GOOGLE DOC FOR SECRET KEY>'
    AWS_SECRET_KEY = '<SEE GOOGLE DOC FOR SECRET KEY>'
  );

// Show the created stages
SHOW STAGES;

// List the files in the stage
LIST @SNOWFLAKE_WORKSHOP.LIBRARY.LIBRARY_LOAD;
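// (Optional) You can peek at the raw staged files before loading; $1, $2, ...
// address the CSV columns by position. A minimal sketch:
SELECT
  t.$1, t.$2, t.$3
FROM
  @SNOWFLAKE_WORKSHOP.LIBRARY.LIBRARY_LOAD
    (FILE_FORMAT => 'SNOWFLAKE_WORKSHOP.LIBRARY.LOAD_CSV_FORMAT') t
LIMIT 10;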
// Copy inventory dry run.
// The "1%" in ON_ERROR means a file is skipped if more than 1% of the
// rows it contains fail to load.
COPY INTO
  SNOWFLAKE_WORKSHOP.LIBRARY.INVENTORY
FROM
  @SNOWFLAKE_WORKSHOP.LIBRARY.LIBRARY_LOAD
PATTERN = '.*Library_Collection.*'
FILE_FORMAT = (FORMAT_NAME = SNOWFLAKE_WORKSHOP.LIBRARY.LOAD_CSV_FORMAT)
ON_ERROR = 'skip_file_1%'
VALIDATION_MODE = RETURN_100_ROWS;
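// (Optional) To see the problem rows themselves rather than a sample of
// parsed rows, validate with RETURN_ERRORS instead:
COPY INTO
  SNOWFLAKE_WORKSHOP.LIBRARY.INVENTORY
FROM
  @SNOWFLAKE_WORKSHOP.LIBRARY.LIBRARY_LOAD
PATTERN = '.*Library_Collection.*'
FILE_FORMAT = (FORMAT_NAME = SNOWFLAKE_WORKSHOP.LIBRARY.LOAD_CSV_FORMAT)
VALIDATION_MODE = RETURN_ERRORS;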
// Copy inventory data
COPY INTO
  SNOWFLAKE_WORKSHOP.LIBRARY.INVENTORY
FROM
  @SNOWFLAKE_WORKSHOP.LIBRARY.LIBRARY_LOAD
PATTERN = '.*Library_Collection.*'
FILE_FORMAT = (FORMAT_NAME = SNOWFLAKE_WORKSHOP.LIBRARY.LOAD_CSV_FORMAT)
ON_ERROR = 'skip_file_1%'
TRUNCATECOLUMNS = TRUE;

// Get inventory count
SELECT COUNT(*) FROM SNOWFLAKE_WORKSHOP.LIBRARY.INVENTORY;

// Copy checkouts dry run
COPY INTO
  SNOWFLAKE_WORKSHOP.LIBRARY.CHECKOUTS
FROM
  @SNOWFLAKE_WORKSHOP.LIBRARY.LIBRARY_LOAD
PATTERN = '.*Checkouts_By_Title_Data_Lens_20.*'
FILE_FORMAT = (FORMAT_NAME = SNOWFLAKE_WORKSHOP.LIBRARY.LOAD_CSV_FORMAT)
ON_ERROR = 'skip_file_1%'
VALIDATION_MODE = RETURN_100_ROWS;

// Copy checkouts data
COPY INTO
  SNOWFLAKE_WORKSHOP.LIBRARY.CHECKOUTS
FROM
  @SNOWFLAKE_WORKSHOP.LIBRARY.LIBRARY_LOAD
PATTERN = '.*Checkouts_By_Title_Data_Lens_20.*'
FILE_FORMAT = (FORMAT_NAME = SNOWFLAKE_WORKSHOP.LIBRARY.LOAD_CSV_FORMAT)
ON_ERROR = 'skip_file_1%'
TRUNCATECOLUMNS = TRUE;

// Confirm checkouts data loaded by grabbing the count
SELECT COUNT(*) FROM SNOWFLAKE_WORKSHOP.LIBRARY.CHECKOUTS;
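// (Optional) Inspect the load results via the COPY_HISTORY table function,
// which reports per-file row counts, errors, and skipped files. A sketch:
SELECT
  FILE_NAME, STATUS, ROW_COUNT, ERROR_COUNT
FROM
  TABLE(INFORMATION_SCHEMA.COPY_HISTORY(
    TABLE_NAME => 'SNOWFLAKE_WORKSHOP.LIBRARY.CHECKOUTS',
    START_TIME => DATEADD(HOUR, -1, CURRENT_TIMESTAMP())));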
// With data loaded, let's do some exploring.
// Frequency analysis for checkouts by day of the week
SELECT
  COUNT(*) AS CNT,
  DAYNAME(TO_DATE(SPLIT_PART(CHECKOUTDATETIME, ' ', 0))) AS DAY
FROM
  SNOWFLAKE_WORKSHOP.LIBRARY.CHECKOUTS
GROUP BY
  DAY;
// Which checkout items are the most popular?
WITH TOP_20_ITEMS AS (
  SELECT
    COUNT(*) AS CNT,
    C.BIBNUM
  FROM
    SNOWFLAKE_WORKSHOP.LIBRARY.CHECKOUTS C
    INNER JOIN SNOWFLAKE_WORKSHOP.LIBRARY.INVENTORY I
      ON C.BIBNUM = I.BIBNUM
  GROUP BY
    C.BIBNUM
  ORDER BY
    CNT DESC
  LIMIT 20
)
SELECT DISTINCT
  A.CNT,
  A.BIBNUM,
  B.TITLE,
  B.AUTHOR
FROM
  TOP_20_ITEMS A
  INNER JOIN SNOWFLAKE_WORKSHOP.LIBRARY.INVENTORY B
    ON A.BIBNUM = B.BIBNUM
ORDER BY
  A.CNT DESC;
// How does popularity change over time? Let's find the top items
// for each year.
CREATE OR REPLACE TEMPORARY TABLE
  SNOWFLAKE_WORKSHOP.LIBRARY.CHECKOUTS_BY_YEAR
AS (
  SELECT DISTINCT
    COUNT(*) AS CHECKOUT_COUNT,
    YEAR(TO_DATE(SPLIT_PART(C.CHECKOUTDATETIME, ' ', 0))) AS CHECKOUT_YEAR,
    I.TITLE AS TITLE,
    I.AUTHOR AS AUTHOR
  FROM
    SNOWFLAKE_WORKSHOP.LIBRARY.CHECKOUTS C
    INNER JOIN SNOWFLAKE_WORKSHOP.LIBRARY.INVENTORY I
      ON C.BIBNUM = I.BIBNUM
  GROUP BY
    CHECKOUT_YEAR, AUTHOR, TITLE
);

// Using our handy temporary table, let's get the top items by year
WITH RANKED_CHECKOUTS_BY_YEAR AS (
  SELECT
    *,
    RANK() OVER (
      PARTITION BY CHECKOUT_YEAR
      ORDER BY CHECKOUT_COUNT DESC
    ) AS RANK
  FROM
    SNOWFLAKE_WORKSHOP.LIBRARY.CHECKOUTS_BY_YEAR
)
SELECT * FROM RANKED_CHECKOUTS_BY_YEAR WHERE RANK = 1 ORDER BY CHECKOUT_YEAR DESC;
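// (Optional) Snowflake's QUALIFY clause can express the same filter without
// the CTE. A sketch of the equivalent query:
SELECT
  *,
  RANK() OVER (PARTITION BY CHECKOUT_YEAR ORDER BY CHECKOUT_COUNT DESC) AS RANK
FROM
  SNOWFLAKE_WORKSHOP.LIBRARY.CHECKOUTS_BY_YEAR
QUALIFY RANK = 1
ORDER BY CHECKOUT_YEAR DESC;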
// Now, let's grab some smaller data and do some time traveling!

// Size our warehouse down
ALTER WAREHOUSE IF EXISTS
  LIBRARY_DEMO_WAREHOUSE
SET
  WAREHOUSE_SIZE = 'XSMALL';

// Create the dictionary table
CREATE OR REPLACE TABLE
  SNOWFLAKE_WORKSHOP.LIBRARY.DICTIONARY
(
  Code VARCHAR(20),
  Description VARCHAR(1024),
  CodeType VARCHAR(30),
  Format_Group VARCHAR(30),
  Format_Subgroup VARCHAR(50),
  Cat_Group VARCHAR(50),
  Cat_Subgroup VARCHAR(50)
);

// Copy the dictionary data
COPY INTO
  SNOWFLAKE_WORKSHOP.LIBRARY.DICTIONARY
FROM
  @SNOWFLAKE_WORKSHOP.LIBRARY.LIBRARY_LOAD
PATTERN = '.*Integrated_Library_System.*'
FILE_FORMAT = (FORMAT_NAME = SNOWFLAKE_WORKSHOP.LIBRARY.LOAD_CSV_FORMAT)
ON_ERROR = 'skip_file_1%';

// Should be 555. MAKE SURE TO COPY THE QUERY ID FOR THIS QUERY
SELECT COUNT(*) FROM SNOWFLAKE_WORKSHOP.LIBRARY.DICTIONARY;
// Store the query ID for time travel
SET QUERY_ID = (
  SELECT QUERY_ID
  FROM TABLE(INFORMATION_SCHEMA.QUERY_HISTORY_BY_SESSION())
  LIMIT 1 OFFSET 2
);
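// (Optional) Confirm the captured query ID
SELECT $QUERY_ID;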
// Insert some bad data
INSERT INTO
  SNOWFLAKE_WORKSHOP.LIBRARY.DICTIONARY
  (CODE, CODETYPE, FORMAT_GROUP, FORMAT_SUBGROUP, CAT_GROUP, CAT_SUBGROUP)
VALUES
  ('badcode', 'mistake', 'oops', 'wrong', 'badcat', 'badsubcat'),
  ('badcode2', 'mistake2', 'oops2', 'wrong2', 'badcat2', 'badsubcat2');

// Should be 557
SELECT COUNT(*) FROM SNOWFLAKE_WORKSHOP.LIBRARY.DICTIONARY;
// Time travel back using the query ID that you recorded above
SELECT COUNT(*) FROM SNOWFLAKE_WORKSHOP.LIBRARY.DICTIONARY AT(STATEMENT => '<INSERT YOUR QUERY ID HERE>');

// Or use the query ID variable we set earlier
SELECT COUNT(*) FROM SNOWFLAKE_WORKSHOP.LIBRARY.DICTIONARY AT(STATEMENT => $QUERY_ID);
// Create a clone of the table as it existed before the bad data was
// inserted, using the query ID we recorded
CREATE OR REPLACE TABLE
  SNOWFLAKE_WORKSHOP.LIBRARY.DICTIONARY_CLONE
CLONE
  SNOWFLAKE_WORKSHOP.LIBRARY.DICTIONARY
  AT(STATEMENT => $QUERY_ID);
// Destroy the table
DROP TABLE SNOWFLAKE_WORKSHOP.LIBRARY.DICTIONARY;

// Rename the clone
ALTER TABLE SNOWFLAKE_WORKSHOP.LIBRARY.DICTIONARY_CLONE RENAME TO SNOWFLAKE_WORKSHOP.LIBRARY.DICTIONARY;

// See the history of changes for the table
SHOW TABLES HISTORY LIKE '%DICTIONARY%';
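// (Optional) Time travel can also restore a dropped table directly with
// UNDROP; not run here, since the renamed clone now occupies the name:
// UNDROP TABLE SNOWFLAKE_WORKSHOP.LIBRARY.DICTIONARY;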
// Confirm the bad data was removed (the count should be 555 again)
SELECT COUNT(*) FROM DICTIONARY;

// Cleanup
DROP WAREHOUSE IF EXISTS LIBRARY_DEMO_WAREHOUSE;
DROP DATABASE IF EXISTS SNOWFLAKE_WORKSHOP;
// ===========================================================================
// ===========================================================================
// PART 3 - STITCH ETL
//
// For this next section, let's use the Talend Stitch + Snowflake integration
// to ingest some GitHub data.
// ===========================================================================
// See the Google docs for instructions on the Stitch integration.
//
// From here, feel free to use the skills you've learned today to explore
// the GitHub data (I wonder which users were the top contributors each month).
// ===========================================================================

// FINAL CLEANUP
DROP RESOURCE MONITOR IF EXISTS SNOWFLAKE_DEMO_MONITOR;