Skip to content

Instantly share code, notes, and snippets.

@randypitcherii
Last active March 26, 2019 21:44
Show Gist options
  • Select an option

  • Save randypitcherii/9ce93433b17f65aa9ed314d8bb842946 to your computer and use it in GitHub Desktop.

Select an option

Save randypitcherii/9ce93433b17f65aa9ed314d8bb842946 to your computer and use it in GitHub Desktop.
Full guided SQL for the Talend Stitch + Snowflake blog post
// ===========================================================================
// Welcome!
//
// Log in to your Snowflake instance, create a new worksheet, then
// copy and paste the contents of this file into your worksheet.
//
// Make sure you're using the ACCOUNTADMIN role before we get started
// (see the context in the upper right corner of your worksheet)
//
// Next, head over to the Partner Connect page (link in the top right of the snowflake toolbar).
//
// We're going to start by setting up a Talend Stitch account using the built in Snowflake connection.
// This will give us everything we need to start securely ingesting data.
// ===========================================================================
// ===========================================================================
// Next, let's manually setup PowerBI similarly to how Partner Connect set
// Talend Stitch up for us before.
// ===========================================================================
// Create the DB, user, role, and warehouse that powerbi will use to securely connect
CREATE DATABASE IF NOT EXISTS POWERBI;
CREATE USER IF NOT EXISTS POWERBI_USER_ACCOUNT PASSWORD="my super cool password." MUST_CHANGE_PASSWORD=false; // use your own password, dummy
CREATE ROLE IF NOT EXISTS POWERBI_ROLE;
CREATE WAREHOUSE IF NOT EXISTS POWERBI_WH WAREHOUSE_SIZE=XSMALL;
// Attach permissions
GRANT USAGE ON DATABASE POWERBI TO ROLE POWERBI_ROLE;
GRANT USAGE ON WAREHOUSE POWERBI_WH TO ROLE POWERBI_ROLE;
ALTER USER POWERBI_USER_ACCOUNT SET DEFAULT_ROLE=POWERBI_ROLE; // you must do this because the PowerBI connector doesn't let you specify a role.
// ===========================================================================
// ===========================================================================
// Now, setup ingestion from the Stitch side of things and wait for data to flow.
//
// Let's explore the raw data.
// ===========================================================================
// Initial look at data
SELECT * FROM PC_STITCH_DB.AWS_S3_CSV_BUCKET.SPOTIFY_DATA LIMIT 10;
// let's make the table easier to use in queries
SET SPOTIFY = 'PC_STITCH_DB.AWS_S3_CSV_BUCKET.SPOTIFY_DATA';
SELECT * FROM TABLE($SPOTIFY) LIMIT 10;
// we can also set context to do this
USE DATABASE PC_STITCH_DB;
USE SCHEMA AWS_S3_CSV_BUCKET;
SELECT * FROM SPOTIFY_DATA LIMIT 10;
// NOTE: I prefer the variable approach for worksheets
// where you'll be in multiple databases. It prevents mistakes
// when running different parts of your worksheet
// let's find out what the top songs are
SELECT * FROM TABLE($SPOTIFY) ORDER BY STREAMS DESC LIMIT 10;
// let's see what date ranges we're working with
SELECT
MIN(DATE) AS startDate,
MAX(DATE) AS endDate
FROM
TABLE($SPOTIFY);
// looks like we're essentially playing with top daily streams for all of 2017.
// Let's see which artist had the most streams over the whole data set
SELECT
ARTIST, SUM(STREAMS) as streamCount
FROM
TABLE($SPOTIFY)
GROUP BY
ARTIST
ORDER BY
streamCount DESC
LIMIT 10;
// hmm, these numbers seem high. Maybe we need to explore those region codes
SELECT DISTINCT REGION FROM TABLE($SPOTIFY);
// maybe we should just use global? I think aggregating with regions AND global
// is resulting in double counting. Lets find out using Drake's streams.
SET globalDrakeSteams = (SELECT SUM(STREAMS) FROM TABLE($SPOTIFY) WHERE ARTIST='Drake' AND REGION='global');
SET sumOfRegionalDrakeStreams = (SELECT SUM(STREAMS) FROM TABLE($SPOTIFY) WHERE ARTIST='Drake' AND REGION!='global');
SELECT $globalDrakeSteams, $sumOfRegionalDrakeStreams;
// what about Kendrick Lamar ("DAMN." came out in the spring of 2017 and it is a masterpiece)
SET globalKendrickSteams = (SELECT SUM(STREAMS) FROM TABLE($SPOTIFY) WHERE ARTIST='Kendrick Lamar' AND REGION='global');
SET sumOfRegionalKendrickStreams = (SELECT SUM(STREAMS) FROM TABLE($SPOTIFY) WHERE ARTIST='Kendrick Lamar' AND REGION!='global');
SELECT $globalKendrickSteams, $sumOfRegionalKendrickStreams;
// it's not exact, but close enough to convince me to filter by global from now on. (check with other artists if you like)
// The easiest way to do that is with a view. Let's make one now
CREATE VIEW IF NOT EXISTS
PC_STITCH_DB.AWS_S3_CSV_BUCKET.SPOTIFY_GLOBAL
AS
SELECT * FROM TABLE($SPOTIFY) WHERE REGION='global';
// Now grab the view name as a variable
SET SPOTIFY_GLOBAL = 'PC_STITCH_DB.AWS_S3_CSV_BUCKET.SPOTIFY_GLOBAL';
// Cool, now we're working with some cleaner data.
// Let's revisit our top artists.
SELECT
ARTIST, SUM(STREAMS) as streamCount
FROM
TABLE($SPOTIFY_GLOBAL)
GROUP BY
ARTIST
ORDER BY
streamCount DESC
LIMIT 10;
// now top songs
SELECT
ARTIST, TRACK_NAME, SUM(STREAMS) as streamCount
FROM
TABLE($SPOTIFY_GLOBAL)
GROUP BY
ARTIST, TRACK_NAME
ORDER BY
streamCount DESC
LIMIT 10;
// wow, Ed Sheeran was killing it in 2017. Good for him.
//
// I think we have enough context to maybe build a good BI table...
// ===========================================================================
// ===========================================================================
// Now, let's send some clean, BI-ready data to our PowerBI DB.
//
// Our goal from here will be to transform the data in PC_STITCH_DB to a clean,
// BI-ready form inside of the POWERBI database for analytics consumption
// ===========================================================================
// let's create our spotify schema in the powerbi db
CREATE SCHEMA IF NOT EXISTS POWERBI.SPOTIFY;
// I think it would be fun to see the top 5 songs every
// month for US streaming.
// start by creating a temp table with stream counts for each song by month.
// We'll filter to only US data and we'll ignore the partial January 2018 data
CREATE OR REPLACE TEMPORARY TABLE
POWERBI.SPOTIFY.US_STREAMCOUNTS_BY_MONTH
AS (
SELECT
ARTIST,
TRACK_NAME,
SUM(STREAMS) AS STREAM_COUNT,
DATE_TRUNC(MONTH, DATE) AS MONTH
FROM
TABLE($SPOTIFY)
WHERE
REGION='us' AND YEAR(DATE) = 2017
GROUP BY
MONTH, ARTIST, TRACK_NAME
);
// now we'll get song ranks within each month using a nested
// query with a window function
CREATE OR REPLACE TABLE
POWERBI.SPOTIFY.US_MONTHLY_TOP_5_SONGS
AS (
SELECT * FROM
(
SELECT
*,
RANK() OVER (PARTITION BY MONTH ORDER BY STREAM_COUNT DESC) AS MONTHLY_RANK
FROM
POWERBI.SPOTIFY.US_STREAMCOUNTS_BY_MONTH
)
WHERE
MONTHLY_RANK <= 5
);
// now that we're finished with all that messiness, let's grant read access
// on our new schema and table to our powerbi user
GRANT USAGE ON SCHEMA POWERBI.SPOTIFY TO ROLE POWERBI_ROLE;
GRANT SELECT ON ALL TABLES IN SCHEMA POWERBI.SPOTIFY TO ROLE POWERBI_ROLE;
// Great job! You should be able to make some cool visuazliations in PBI now.
// ===========================================================================
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment