Last active
March 26, 2019 21:44
-
-
Save randypitcherii/9ce93433b17f65aa9ed314d8bb842946 to your computer and use it in GitHub Desktop.
Full guided SQL for the Talend Stitch + Snowflake blog post
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // =========================================================================== | |
| // Welcome! | |
| // | |
| // Log in to your Snowflake instance, create a new worksheet, then | |
| // copy and paste the contents of this file into your worksheet. | |
| // | |
| // Make sure you're using the ACCOUNTADMIN role before we get started | |
| // (see the context in the upper right corner of your worksheet) | |
| // | |
| // Next, head over to the Partner Connect page (link in the top right of the snowflake toolbar). | |
| // | |
| // We're going to start by setting up a Talend Stitch account using the built in Snowflake connection. | |
| // This will give us everything we need to start securely ingesting data. | |
| // =========================================================================== | |
| // =========================================================================== | |
| // Next, let's manually setup PowerBI similarly to how Partner Connect set | |
| // Talend Stitch up for us before. | |
| // =========================================================================== | |
| // Create the DB, user, role, and warehouse that powerbi will use to securely connect | |
| CREATE DATABASE IF NOT EXISTS POWERBI; | |
| CREATE USER IF NOT EXISTS POWERBI_USER_ACCOUNT PASSWORD="my super cool password." MUST_CHANGE_PASSWORD=false; // use your own password, dummy | |
| CREATE ROLE IF NOT EXISTS POWERBI_ROLE; | |
| CREATE WAREHOUSE IF NOT EXISTS POWERBI_WH WAREHOUSE_SIZE=XSMALL; | |
| // Attach permissions | |
| GRANT USAGE ON DATABASE POWERBI TO ROLE POWERBI_ROLE; | |
| GRANT USAGE ON WAREHOUSE POWERBI_WH TO ROLE POWERBI_ROLE; | |
| ALTER USER POWERBI_USER_ACCOUNT SET DEFAULT_ROLE=POWERBI_ROLE; // you must do this because the PowerBI connector doesn't let you specify a role. | |
| // =========================================================================== | |
| // =========================================================================== | |
| // Now, setup ingestion from the Stitch side of things and wait for data to flow. | |
| // | |
| // Let's explore the raw data. | |
| // =========================================================================== | |
| // Initial look at data | |
| SELECT * FROM PC_STITCH_DB.AWS_S3_CSV_BUCKET.SPOTIFY_DATA LIMIT 10; | |
| // let's make the table easier to use in queries | |
| SET SPOTIFY = 'PC_STITCH_DB.AWS_S3_CSV_BUCKET.SPOTIFY_DATA'; | |
| SELECT * FROM TABLE($SPOTIFY) LIMIT 10; | |
| // we can also set context to do this | |
| USE DATABASE PC_STITCH_DB; | |
| USE SCHEMA AWS_S3_CSV_BUCKET; | |
| SELECT * FROM SPOTIFY_DATA LIMIT 10; | |
| // NOTE: I prefer the variable approach for worksheets | |
| // where you'll be in multiple databases. It prevents mistakes | |
| // when running different parts of your worksheet | |
| // let's find out what the top songs are | |
| SELECT * FROM TABLE($SPOTIFY) ORDER BY STREAMS DESC LIMIT 10; | |
| // let's see what date ranges we're working with | |
| SELECT | |
| MIN(DATE) AS startDate, | |
| MAX(DATE) AS endDate | |
| FROM | |
| TABLE($SPOTIFY); | |
| // looks like we're essentially playing with top daily streams for all of 2017. | |
| // Let's see which artist had the most streams over the whole data set | |
| SELECT | |
| ARTIST, SUM(STREAMS) as streamCount | |
| FROM | |
| TABLE($SPOTIFY) | |
| GROUP BY | |
| ARTIST | |
| ORDER BY | |
| streamCount DESC | |
| LIMIT 10; | |
| // hmm, these numbers seem high. Maybe we need to explore those region codes | |
| SELECT DISTINCT REGION FROM TABLE($SPOTIFY); | |
| // maybe we should just use global? I think aggregating with regions AND global | |
| // is resulting in double counting. Lets find out using Drake's streams. | |
| SET globalDrakeSteams = (SELECT SUM(STREAMS) FROM TABLE($SPOTIFY) WHERE ARTIST='Drake' AND REGION='global'); | |
| SET sumOfRegionalDrakeStreams = (SELECT SUM(STREAMS) FROM TABLE($SPOTIFY) WHERE ARTIST='Drake' AND REGION!='global'); | |
| SELECT $globalDrakeSteams, $sumOfRegionalDrakeStreams; | |
| // what about Kendrick Lamar ("DAMN." came out in the spring of 2017 and it is a masterpiece) | |
| SET globalKendrickSteams = (SELECT SUM(STREAMS) FROM TABLE($SPOTIFY) WHERE ARTIST='Kendrick Lamar' AND REGION='global'); | |
| SET sumOfRegionalKendrickStreams = (SELECT SUM(STREAMS) FROM TABLE($SPOTIFY) WHERE ARTIST='Kendrick Lamar' AND REGION!='global'); | |
| SELECT $globalKendrickSteams, $sumOfRegionalKendrickStreams; | |
| // it's not exact, but close enough to convince me to filter by global from now on. (check with other artists if you like) | |
| // The easiest way to do that is with a view. Let's make one now | |
| CREATE VIEW IF NOT EXISTS | |
| PC_STITCH_DB.AWS_S3_CSV_BUCKET.SPOTIFY_GLOBAL | |
| AS | |
| SELECT * FROM TABLE($SPOTIFY) WHERE REGION='global'; | |
| // Now grab the view name as a variable | |
| SET SPOTIFY_GLOBAL = 'PC_STITCH_DB.AWS_S3_CSV_BUCKET.SPOTIFY_GLOBAL'; | |
| // Cool, now we're working with some cleaner data. | |
| // Let's revisit our top artists. | |
| SELECT | |
| ARTIST, SUM(STREAMS) as streamCount | |
| FROM | |
| TABLE($SPOTIFY_GLOBAL) | |
| GROUP BY | |
| ARTIST | |
| ORDER BY | |
| streamCount DESC | |
| LIMIT 10; | |
| // now top songs | |
| SELECT | |
| ARTIST, TRACK_NAME, SUM(STREAMS) as streamCount | |
| FROM | |
| TABLE($SPOTIFY_GLOBAL) | |
| GROUP BY | |
| ARTIST, TRACK_NAME | |
| ORDER BY | |
| streamCount DESC | |
| LIMIT 10; | |
| // wow, Ed Sheeran was killing it in 2017. Good for him. | |
| // | |
| // I think we have enough context to maybe build a good BI table... | |
| // =========================================================================== | |
| // =========================================================================== | |
| // Now, let's send some clean, BI-ready data to our PowerBI DB. | |
| // | |
| // Our goal from here will be to transform the data in PC_STITCH_DB to a clean, | |
| // BI-ready form inside of the POWERBI database for analytics consumption | |
| // =========================================================================== | |
| // let's create our spotify schema in the powerbi db | |
| CREATE SCHEMA IF NOT EXISTS POWERBI.SPOTIFY; | |
| // I think it would be fun to see the top 5 songs every | |
| // month for US streaming. | |
| // start by creating a temp table with stream counts for each song by month. | |
| // We'll filter to only US data and we'll ignore the partial January 2018 data | |
| CREATE OR REPLACE TEMPORARY TABLE | |
| POWERBI.SPOTIFY.US_STREAMCOUNTS_BY_MONTH | |
| AS ( | |
| SELECT | |
| ARTIST, | |
| TRACK_NAME, | |
| SUM(STREAMS) AS STREAM_COUNT, | |
| DATE_TRUNC(MONTH, DATE) AS MONTH | |
| FROM | |
| TABLE($SPOTIFY) | |
| WHERE | |
| REGION='us' AND YEAR(DATE) = 2017 | |
| GROUP BY | |
| MONTH, ARTIST, TRACK_NAME | |
| ); | |
| // now we'll get song ranks within each month using a nested | |
| // query with a window function | |
| CREATE OR REPLACE TABLE | |
| POWERBI.SPOTIFY.US_MONTHLY_TOP_5_SONGS | |
| AS ( | |
| SELECT * FROM | |
| ( | |
| SELECT | |
| *, | |
| RANK() OVER (PARTITION BY MONTH ORDER BY STREAM_COUNT DESC) AS MONTHLY_RANK | |
| FROM | |
| POWERBI.SPOTIFY.US_STREAMCOUNTS_BY_MONTH | |
| ) | |
| WHERE | |
| MONTHLY_RANK <= 5 | |
| ); | |
| // now that we're finished with all that messiness, let's grant read access | |
| // on our new schema and table to our powerbi user | |
| GRANT USAGE ON SCHEMA POWERBI.SPOTIFY TO ROLE POWERBI_ROLE; | |
| GRANT SELECT ON ALL TABLES IN SCHEMA POWERBI.SPOTIFY TO ROLE POWERBI_ROLE; | |
| // Great job! You should be able to make some cool visuazliations in PBI now. | |
| // =========================================================================== |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment