Created
August 21, 2024 21:15
-
-
Save pavlokurochka/0127af681839bcd4ad93e88633767488 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "i4bUhtWPH9nE" | |
| }, | |
| "source": [ | |
| "#Homework\n", | |
| "\n", | |
| "Write a python script that will load data from the SpaceX API into DuckDB using dlt.\n", | |
| "\n", | |
| "Use:\n", | |
| "- @dlt.source\n", | |
| "- @dlt.resource\n", | |
| "- @dlt.transformer\n", | |
| "\n", | |
| "SpaceX API URL: https://api.spacexdata.com\n", | |
| "\n", | |
| "Docs: https://github.com/r-spacex/SpaceX-API/blob/master/docs/README.md\n", | |
| "\n", | |
| "Endpoints for loading:\n", | |
| "- launches\n", | |
| "- rockets\n", | |
| "- crew" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "EoBQ35-j7oTB" | |
| }, | |
| "source": [] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "-cFnLAMpIBNF" | |
| }, | |
| "source": [ | |
| "# Install dlt with duckdb extention" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "metadata": { | |
| "id": "4uVKDZXdGkz3" | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "%%capture\n", | |
| "!pip install dlt[duckdb]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "ZqBbuLiLIGwx" | |
| }, | |
| "source": [ | |
| "# Play with SpaceX API" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "fHVNIdclQZzT", | |
| "outputId": "037846a4-d6e3-48ad-c997-85fcdf47b0e4" | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "{'name': 'Robert Behnken',\n", | |
| " 'agency': 'NASA',\n", | |
| " 'image': 'https://imgur.com/0smMgMH.png',\n", | |
| " 'wikipedia': 'https://en.wikipedia.org/wiki/Robert_L._Behnken',\n", | |
| " 'launches': ['5eb87d46ffd86e000604b388'],\n", | |
| " 'status': 'active',\n", | |
| " 'id': '5ebf1a6e23a9a60006e03a7a'}" | |
| ] | |
| }, | |
| "execution_count": 2, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "import requests\n", | |
| "# endpoint = 'launches'\n", | |
| "# endpoint = 'rockets'\n", | |
| "endpoint = 'crew'\n", | |
| "response = requests.get(f\"https://api.spacexdata.com/v4/{endpoint}\")\n", | |
| "response.json()[0]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "cMZP5POeIKAW" | |
| }, | |
| "source": [ | |
| "# Helper\n", | |
| "Run the cell and ignore it." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "metadata": { | |
| "id": "nZIBJEKrWpTm" | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "from dlt.common.pipeline import LoadInfo\n", | |
| "\n", | |
| "def assert_load_info(info: LoadInfo, expected_load_packages: int = 1) -> None:\n", | |
| " \"\"\"Asserts that expected number of packages was loaded and there are no failed jobs\"\"\"\n", | |
| " assert len(info.loads_ids) == expected_load_packages\n", | |
| " # all packages loaded\n", | |
| " assert all(package.state == \"loaded\" for package in info.load_packages) is True\n", | |
| " # no failed jobs in any of the packages\n", | |
| " info.raise_on_failed_jobs()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "6kM1y10aXf2S" | |
| }, | |
| "source": [ | |
| "# Task 1\n", | |
| "\n", | |
| "\n", | |
| "Create a pipeline for SpaceX API, for the next endpoints: launches, rockets, crew.\n", | |
| "\n", | |
| "- Fill the empty lines in the functions below.\n", | |
| "- `get_rockets` resource should have `table_name=rockets`.\n", | |
| "- Create a [resource](https://dlthub.com/docs/general-usage/resource#declare-a-resource) for the `crew` endpoint from scratch.\n", | |
| "- [Run the pipeline](https://dlthub.com/docs/walkthroughs/run-a-pipeline) without errors." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "2He2O3XNGqnB", | |
| "outputId": "c770da58-6554-4854-c5b5-5df531e2b119" | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Pipeline spacex_with_source load step completed in 1.41 seconds\n", | |
| "1 load package(s) were loaded to destination duckdb and into dataset spacex_data\n", | |
| "The duckdb destination used duckdb:///d:\\dlthub-education\\workshops\\workshop-august-2024\\part1\\spacex_with_source.duckdb location to store data\n", | |
| "Load package 1724268438.7316127 is LOADED and contains no failed jobs\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# import time\n", | |
| "\n", | |
| "import dlt\n", | |
| "import requests\n", | |
| "\n", | |
| "\n", | |
| "@dlt.resource(table_name=\"launches\", write_disposition='replace')\n", | |
| "def get_launches():\n", | |
| " response = requests.get(f\"https://api.spacexdata.com/v4/launches\")\n", | |
| " response.raise_for_status()\n", | |
| " yield response.json()\n", | |
| "\n", | |
| "\n", | |
| "@dlt.resource(table_name=\"rockets\")\n", | |
| "def get_rockets():\n", | |
| " response = requests.get(f\"https://api.spacexdata.com/v4/rockets\")\n", | |
| " response.raise_for_status()\n", | |
| " yield response.json()\n", | |
| "\n", | |
| "@dlt.resource(table_name=\"crew\")\n", | |
| "def get_crew():\n", | |
| " response = requests.get(f\"https://api.spacexdata.com/v4/crew\")\n", | |
| " response.raise_for_status()\n", | |
| " yield response.json()\n", | |
| "\n", | |
| "\n", | |
| "pipeline = dlt.pipeline(\n", | |
| " pipeline_name='spacex_with_source',\n", | |
| " destination='duckdb',\n", | |
| " dataset_name='spacex_data',\n", | |
| " # dev_mode=True,\n", | |
| ")\n", | |
| "\n", | |
| "load_info = pipeline.run([get_launches(), get_rockets(), get_crew()])\n", | |
| "# load_info = pipeline.run(spacex_source())\n", | |
| "print(load_info)\n", | |
| "assert_load_info(load_info)\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "JcIaeCd3Yqk9" | |
| }, | |
| "source": [ | |
| "Run the code below and\n", | |
| "## Answer the Question:\n", | |
| "- What weight (kg) has the heighest (meters) rocket?\n", | |
| "\n", | |
| "\n", | |
| "118 meters" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "MZll2qr_O9wg", | |
| "outputId": "67915ea1-63cc-4063-d619-9baaab8ffbe3" | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>height__meters</th>\n", | |
| " <th>height__feet</th>\n", | |
| " <th>diameter__meters</th>\n", | |
| " <th>diameter__feet</th>\n", | |
| " <th>mass__kg</th>\n", | |
| " <th>mass__lb</th>\n", | |
| " <th>first_stage__thrust_sea_level__k_n</th>\n", | |
| " <th>first_stage__thrust_sea_level__lbf</th>\n", | |
| " <th>first_stage__thrust_vacuum__k_n</th>\n", | |
| " <th>first_stage__thrust_vacuum__lbf</th>\n", | |
| " <th>...</th>\n", | |
| " <th>country</th>\n", | |
| " <th>company</th>\n", | |
| " <th>wikipedia</th>\n", | |
| " <th>description</th>\n", | |
| " <th>id</th>\n", | |
| " <th>_dlt_load_id</th>\n", | |
| " <th>_dlt_id</th>\n", | |
| " <th>height__feet__v_double</th>\n", | |
| " <th>engines__thrust_to_weight__v_double</th>\n", | |
| " <th>landing_legs__material</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>118.0</td>\n", | |
| " <td>387</td>\n", | |
| " <td>9.0</td>\n", | |
| " <td>30.0</td>\n", | |
| " <td>1335000</td>\n", | |
| " <td>2943000</td>\n", | |
| " <td>128000</td>\n", | |
| " <td>28775544</td>\n", | |
| " <td>138000</td>\n", | |
| " <td>31023634</td>\n", | |
| " <td>...</td>\n", | |
| " <td>United States</td>\n", | |
| " <td>SpaceX</td>\n", | |
| " <td>https://en.wikipedia.org/wiki/SpaceX_Starship</td>\n", | |
| " <td>Starship and Super Heavy Rocket represent a fu...</td>\n", | |
| " <td>5e9d0d96eda699382d09d1ee</td>\n", | |
| " <td>1724268438.7316127</td>\n", | |
| " <td>nZWX/94P/ZsUtg</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>stainless steel</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "<p>1 rows × 58 columns</p>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " height__meters height__feet diameter__meters diameter__feet mass__kg \\\n", | |
| "0 118.0 387 9.0 30.0 1335000 \n", | |
| "\n", | |
| " mass__lb first_stage__thrust_sea_level__k_n \\\n", | |
| "0 2943000 128000 \n", | |
| "\n", | |
| " first_stage__thrust_sea_level__lbf first_stage__thrust_vacuum__k_n \\\n", | |
| "0 28775544 138000 \n", | |
| "\n", | |
| " first_stage__thrust_vacuum__lbf ... country company \\\n", | |
| "0 31023634 ... United States SpaceX \n", | |
| "\n", | |
| " wikipedia \\\n", | |
| "0 https://en.wikipedia.org/wiki/SpaceX_Starship \n", | |
| "\n", | |
| " description \\\n", | |
| "0 Starship and Super Heavy Rocket represent a fu... \n", | |
| "\n", | |
| " id _dlt_load_id _dlt_id \\\n", | |
| "0 5e9d0d96eda699382d09d1ee 1724268438.7316127 nZWX/94P/ZsUtg \n", | |
| "\n", | |
| " height__feet__v_double engines__thrust_to_weight__v_double \\\n", | |
| "0 NaN NaN \n", | |
| "\n", | |
| " landing_legs__material \n", | |
| "0 stainless steel \n", | |
| "\n", | |
| "[1 rows x 58 columns]" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "import duckdb\n", | |
| "# from google.colab import data_table\n", | |
| "# data_table.enable_dataframe_formatter()\n", | |
| "\n", | |
| "# a database '<pipeline_name>.duckdb' was created in working directory so just connect to it\n", | |
| "conn = duckdb.connect(f\"{pipeline.pipeline_name}.duckdb\")\n", | |
| "conn.sql(f\"SET search_path = '{pipeline.dataset_name}'\")\n", | |
| "stats_table = conn.sql(\"SELECT * FROM rockets order by height__meters desc limit 1\").df()\n", | |
| "display(stats_table)\n", | |
| "# stats_table.info()\n", | |
| "# conn.sql('show all tables')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "sql =\"\"\" select date_utc, date_unix, date_local, success, \t_dlt_load_id from launches order by date_utc \n", | |
| "\"\"\"\n", | |
| "stats_table = conn.sql(sql)\n", | |
| "display(stats_table)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 18, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "^C\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "!dlt pipeline spacex_with_source show" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "TGvg6pBDBjLk", | |
| "outputId": "28237b77-fb88-46cc-b1ad-2209706fe1c5" | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Found pipeline spacex_with_source in C:\\Users\\kuroc\\.dlt\\pipelines\n", | |
| "Run started at 2024-08-21 19:27:18.407188+00:00 and COMPLETED in 3.86 seconds with 4 steps.\n", | |
| "Step extract COMPLETED in 1.32 seconds.\n", | |
| "\n", | |
| "Load package 1724268438.7316127 is EXTRACTED and NOT YET LOADED to the destination and contains no failed jobs\n", | |
| "\n", | |
| "Step normalize COMPLETED in 0.70 seconds.\n", | |
| "Normalized data for the following tables:\n", | |
| "- _dlt_pipeline_state: 1 row(s)\n", | |
| "- crew: 30 row(s)\n", | |
| "- crew__launches: 30 row(s)\n", | |
| "- launches: 205 row(s)\n", | |
| "- launches__failures: 5 row(s)\n", | |
| "- launches__payloads: 222 row(s)\n", | |
| "- launches__cores: 215 row(s)\n", | |
| "- launches__ships: 342 row(s)\n", | |
| "- launches__capsules: 35 row(s)\n", | |
| "- launches__links__flickr__original: 848 row(s)\n", | |
| "- launches__fairings__ships: 75 row(s)\n", | |
| "- launches__crew: 30 row(s)\n", | |
| "- rockets: 4 row(s)\n", | |
| "- rockets__payload_weights: 11 row(s)\n", | |
| "- rockets__flickr_images: 16 row(s)\n", | |
| "\n", | |
| "Load package 1724268438.7316127 is NORMALIZED and NOT YET LOADED to the destination and contains no failed jobs\n", | |
| "\n", | |
| "Step load COMPLETED in 1.53 seconds.\n", | |
| "Pipeline spacex_with_source load step completed in 1.41 seconds\n", | |
| "1 load package(s) were loaded to destination duckdb and into dataset spacex_data\n", | |
| "The duckdb destination used duckdb:///d:\\dlthub-education\\workshops\\workshop-august-2024\\part1\\spacex_with_source.duckdb location to store data\n", | |
| "Load package 1724268438.7316127 is LOADED and contains no failed jobs\n", | |
| "\n", | |
| "Step run COMPLETED in 3.85 seconds.\n", | |
| "Pipeline spacex_with_source load step completed in 1.41 seconds\n", | |
| "1 load package(s) were loaded to destination duckdb and into dataset spacex_data\n", | |
| "The duckdb destination used duckdb:///d:\\dlthub-education\\workshops\\workshop-august-2024\\part1\\spacex_with_source.duckdb location to store data\n", | |
| "Load package 1724268438.7316127 is LOADED and contains no failed jobs\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "!dlt pipeline spacex_with_source trace" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "Q9gObEHqZpB0" | |
| }, | |
| "source": [ | |
| "# Task 2\n", | |
| "- Add pagination, read [SpaceX API doc](https://github.com/r-spacex/SpaceX-API/blob/master/docs/queries.md).\n", | |
| "- Combine all resources in one [source](https://dlthub.com/docs/general-usage/source) and the pipeline with `@dlt.source`.\n", | |
| "- Add incremental loading for resource `get_launches` using `merge` write disposition, `id` as a primary key and `dlt.sources.incremental`.\n", | |
| "- Run the pipeline [only with](https://dlthub.com/docs/general-usage/source#access-and-select-resources-to-load) `get_launches` resource.\n", | |
| "\n", | |
| "Read more about [incremental loading](https://dlthub.com/docs/general-usage/incremental-loading)." | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "szRR1feH89yc" | |
| }, | |
| "source": [ | |
| "## Try post method to query SpaceX API\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "zpGww6Zx7gG_", | |
| "outputId": "94cf3fea-90f2-4fc3-b01a-8eb0eae66f0c" | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "import requests\n", | |
| "# rc = requests.delete(f\"https://api.spacexdata.com/admin/cache\")\n", | |
| "# rc.raise_for_status()\n", | |
| "# print(rc)\n", | |
| "response = requests.post(\n", | |
| " \"https://api.spacexdata.com/v5/launches/query\",\n", | |
| " json={\n", | |
| " \"query\": {\n", | |
| " \"date_utc\": {\n", | |
| " \"$gt\": \"2006-01-01T00:00:00.000Z\",\n", | |
| " }, },\n", | |
| " \"options\": {\n", | |
| " \"page\": 21,\n", | |
| " # \"offset\": 2,\n", | |
| " \"limit\": 10,\n", | |
| " \"pagination\": True\n", | |
| " } \n", | |
| " }\n", | |
| " )\n", | |
| "response_json = response.json()\n", | |
| "for k,v in response_json.items():\n", | |
| " if k != 'docs':\n", | |
| " print(f'{k}:{v}')\n", | |
| "print(response.json()[\"pagingCounter\"])\n", | |
| "for doc in response.json()[\"docs\"]:\n", | |
| " print(doc[\"id\"])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "5nzHZjbg-AO8" | |
| }, | |
| "source": [ | |
| "Use code above to make your launches resource incremental." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "h1MdG6V1X1rW", | |
| "outputId": "9e183e30-dbc8-43c0-f74b-ec0fda060e1b" | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# !dlt pipeline spacex_with_source_inc drop --drop-all" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "zbHs4vAcP9en", | |
| "outputId": "c2951371-841a-491e-dfab-6de34d7a039b" | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Pipeline spacex_with_source_inc load step completed in 2.62 seconds\n", | |
| "1 load package(s) were loaded to destination duckdb and into dataset spacex_data_inc\n", | |
| "The duckdb destination used duckdb:///d:\\dlthub-education\\workshops\\workshop-august-2024\\part1\\spacex_with_source_inc.duckdb location to store data\n", | |
| "Load package 1724273951.0527215 is LOADED and contains no failed jobs\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "import dlt\n", | |
| "import requests\n", | |
| "\n", | |
| "\n", | |
| "def get_launches_page(page_no:int) ->dict:\n", | |
| " url = \"https://api.spacexdata.com/v4/launches/query\"\n", | |
| " payload = {\n", | |
| " \"query\": {\n", | |
| " \"date_utc\": {\n", | |
| " \"$gt\": \"2006-01-01T00:00:00.000Z\",\n", | |
| " }, },\n", | |
| " \"options\": {\n", | |
| " \"page\": page_no,\n", | |
| " \"limit\": 10,\n", | |
| " \"pagination\": True\n", | |
| " } \n", | |
| " }\n", | |
| "\n", | |
| " response = requests.post(url, json=payload)\n", | |
| " return response.json()\n", | |
| "\n", | |
| "\n", | |
| "@dlt.resource(table_name=\"launches\", primary_key=\"id\", write_disposition=\"merge\")\n", | |
| "def get_launches():\n", | |
| " page = 1\n", | |
| " while page:\n", | |
| " result = get_launches_page(page)\n", | |
| " yield result['docs']\n", | |
| " page = result['nextPage']\n", | |
| "\n", | |
| "\n", | |
| "@dlt.resource(table_name=\"rockets\", primary_key=\"id\", write_disposition=\"merge\")\n", | |
| "def get_rockets():\n", | |
| " response = requests.get(f\"https://api.spacexdata.com/v4/rockets\")\n", | |
| " response.raise_for_status()\n", | |
| " yield response.json()\n", | |
| "\n", | |
| "@dlt.resource(table_name=\"crew\", primary_key=\"id\", write_disposition=\"merge\")\n", | |
| "def get_crew():\n", | |
| " response = requests.get(f\"https://api.spacexdata.com/v4/crew\")\n", | |
| " response.raise_for_status()\n", | |
| " yield response.json()\n", | |
| "\n", | |
| "\n", | |
| "@dlt.source\n", | |
| "def spacex_source():\n", | |
| " return get_launches, get_rockets, get_crew\n", | |
| "\n", | |
| "\n", | |
| "\n", | |
| "pipeline = dlt.pipeline(\n", | |
| " pipeline_name='spacex_with_source_inc',\n", | |
| " destination='duckdb',\n", | |
| " dataset_name='spacex_data_inc',\n", | |
| " # dev_mode=True,\n", | |
| ")\n", | |
| "\n", | |
| "# data = spacex_source().# put your code here\n", | |
| "data = spacex_source()\n", | |
| "\n", | |
| "load_info = pipeline.run(data)\n", | |
| "print(load_info)\n", | |
| "assert_load_info(load_info)\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Pipeline spacex_with_source_inc load step completed in 2.37 seconds\n", | |
| "1 load package(s) were loaded to destination duckdb and into dataset spacex_data_inc\n", | |
| "The duckdb destination used duckdb:///d:\\dlthub-education\\workshops\\workshop-august-2024\\part1\\spacex_with_source_inc.duckdb location to store data\n", | |
| "Load package 1724273972.3240838 is LOADED and contains no failed jobs\n" | |
| ] | |
| }, | |
| { | |
| "ename": "AssertionError", | |
| "evalue": "", | |
| "output_type": "error", | |
| "traceback": [ | |
| "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", | |
| "\u001b[1;31mAssertionError\u001b[0m Traceback (most recent call last)", | |
| "Cell \u001b[1;32mIn[3], line 4\u001b[0m\n\u001b[0;32m 2\u001b[0m load_info \u001b[38;5;241m=\u001b[39m pipeline\u001b[38;5;241m.\u001b[39mrun(data)\n\u001b[0;32m 3\u001b[0m \u001b[38;5;28mprint\u001b[39m(load_info)\n\u001b[1;32m----> 4\u001b[0m assert_load_info(load_info, expected_load_packages\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m)\n", | |
| "Cell \u001b[1;32mIn[1], line 5\u001b[0m, in \u001b[0;36massert_load_info\u001b[1;34m(info, expected_load_packages)\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21massert_load_info\u001b[39m(info: LoadInfo, expected_load_packages: \u001b[38;5;28mint\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m 4\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Asserts that expected number of packages was loaded and there are no failed jobs\"\"\"\u001b[39;00m\n\u001b[1;32m----> 5\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(info\u001b[38;5;241m.\u001b[39mloads_ids) \u001b[38;5;241m==\u001b[39m expected_load_packages\n\u001b[0;32m 6\u001b[0m \u001b[38;5;66;03m# all packages loaded\u001b[39;00m\n\u001b[0;32m 7\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mall\u001b[39m(package\u001b[38;5;241m.\u001b[39mstate \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mloaded\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m package \u001b[38;5;129;01min\u001b[39;00m info\u001b[38;5;241m.\u001b[39mload_packages) \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m\n", | |
| "\u001b[1;31mAssertionError\u001b[0m: " | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# Run the pipeline one more time, it should load no data\n", | |
| "load_info = pipeline.run(data)\n", | |
| "print(load_info)\n", | |
| "assert_load_info(load_info, expected_load_packages=0)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 10, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "[LoadPackageInfo(load_id='1724273972.3240838', package_path='C:\\\\Users\\\\kuroc\\\\.dlt\\\\pipelines\\\\spacex_with_source_inc\\\\load\\\\loaded\\\\1724273972.3240838', state='loaded', schema=Schema spacex_source at 1666898050128, schema_update={}, completed_at=DateTime(2024, 8, 21, 20, 59, 39, 854187, tzinfo=Timezone('UTC')), jobs={'failed_jobs': [], 'new_jobs': [], 'completed_jobs': [LoadJobInfo(state='completed_jobs', file_path='C:\\\\Users\\\\kuroc\\\\.dlt\\\\pipelines\\\\spacex_with_source_inc\\\\load\\\\loaded\\\\1724273972.3240838\\\\completed_jobs\\\\crew.5cce237413.0.sql', file_size=1604, created_at=DateTime(2024, 8, 21, 20, 59, 38, 731119, tzinfo=Timezone('UTC')), elapsed=1.123067855834961, job_file_info=ParsedLoadJobFileName(table_name='crew', file_id='5cce237413', retry_count=0, file_format='sql'), failed_message=None), LoadJobInfo(state='completed_jobs', file_path='C:\\\\Users\\\\kuroc\\\\.dlt\\\\pipelines\\\\spacex_with_source_inc\\\\load\\\\loaded\\\\1724273972.3240838\\\\completed_jobs\\\\crew.ba38ecf79e.0.insert_values', file_size=1823, created_at=DateTime(2024, 8, 21, 20, 59, 37, 36967, tzinfo=Timezone('UTC')), elapsed=2.8172202110290527, job_file_info=ParsedLoadJobFileName(table_name='crew', file_id='ba38ecf79e', retry_count=0, file_format='insert_values'), failed_message=None), LoadJobInfo(state='completed_jobs', file_path='C:\\\\Users\\\\kuroc\\\\.dlt\\\\pipelines\\\\spacex_with_source_inc\\\\load\\\\loaded\\\\1724273972.3240838\\\\completed_jobs\\\\crew__launches.e3a8308192.0.insert_values', file_size=1131, created_at=DateTime(2024, 8, 21, 20, 59, 37, 37967, tzinfo=Timezone('UTC')), elapsed=2.8162198066711426, job_file_info=ParsedLoadJobFileName(table_name='crew__launches', file_id='e3a8308192', retry_count=0, file_format='insert_values'), failed_message=None), LoadJobInfo(state='completed_jobs', 
file_path='C:\\\\Users\\\\kuroc\\\\.dlt\\\\pipelines\\\\spacex_with_source_inc\\\\load\\\\loaded\\\\1724273972.3240838\\\\completed_jobs\\\\launches.400f29ce85.0.sql', file_size=6727, created_at=DateTime(2024, 8, 21, 20, 59, 38, 745029, tzinfo=Timezone('UTC')), elapsed=1.1091575622558594, job_file_info=ParsedLoadJobFileName(table_name='launches', file_id='400f29ce85', retry_count=0, file_format='sql'), failed_message=None), LoadJobInfo(state='completed_jobs', file_path='C:\\\\Users\\\\kuroc\\\\.dlt\\\\pipelines\\\\spacex_with_source_inc\\\\load\\\\loaded\\\\1724273972.3240838\\\\completed_jobs\\\\launches.d0ef4b266f.0.insert_values', file_size=49294, created_at=DateTime(2024, 8, 21, 20, 59, 37, 54966, tzinfo=Timezone('UTC')), elapsed=2.7992208003997803, job_file_info=ParsedLoadJobFileName(table_name='launches', file_id='d0ef4b266f', retry_count=0, file_format='insert_values'), failed_message=None), LoadJobInfo(state='completed_jobs', file_path='C:\\\\Users\\\\kuroc\\\\.dlt\\\\pipelines\\\\spacex_with_source_inc\\\\load\\\\loaded\\\\1724273972.3240838\\\\completed_jobs\\\\launches__capsules.955bd50942.0.insert_values', file_size=1380, created_at=DateTime(2024, 8, 21, 20, 59, 37, 68967, tzinfo=Timezone('UTC')), elapsed=2.7852203845977783, job_file_info=ParsedLoadJobFileName(table_name='launches__capsules', file_id='955bd50942', retry_count=0, file_format='insert_values'), failed_message=None), LoadJobInfo(state='completed_jobs', file_path='C:\\\\Users\\\\kuroc\\\\.dlt\\\\pipelines\\\\spacex_with_source_inc\\\\load\\\\loaded\\\\1724273972.3240838\\\\completed_jobs\\\\launches__cores.cb595e2e6c.0.insert_values', file_size=7820, created_at=DateTime(2024, 8, 21, 20, 59, 37, 63966, tzinfo=Timezone('UTC')), elapsed=2.7902207374572754, job_file_info=ParsedLoadJobFileName(table_name='launches__cores', file_id='cb595e2e6c', retry_count=0, file_format='insert_values'), failed_message=None), LoadJobInfo(state='completed_jobs', 
file_path='C:\\\\Users\\\\kuroc\\\\.dlt\\\\pipelines\\\\spacex_with_source_inc\\\\load\\\\loaded\\\\1724273972.3240838\\\\completed_jobs\\\\launches__crew.7d4c3153ea.0.insert_values', file_size=1016, created_at=DateTime(2024, 8, 21, 20, 59, 37, 84966, tzinfo=Timezone('UTC')), elapsed=2.7692208290100098, job_file_info=ParsedLoadJobFileName(table_name='launches__crew', file_id='7d4c3153ea', retry_count=0, file_format='insert_values'), failed_message=None), LoadJobInfo(state='completed_jobs', file_path='C:\\\\Users\\\\kuroc\\\\.dlt\\\\pipelines\\\\spacex_with_source_inc\\\\load\\\\loaded\\\\1724273972.3240838\\\\completed_jobs\\\\launches__failures.9834748480.0.insert_values', file_size=598, created_at=DateTime(2024, 8, 21, 20, 59, 37, 54966, tzinfo=Timezone('UTC')), elapsed=2.7992208003997803, job_file_info=ParsedLoadJobFileName(table_name='launches__failures', file_id='9834748480', retry_count=0, file_format='insert_values'), failed_message=None), LoadJobInfo(state='completed_jobs', file_path='C:\\\\Users\\\\kuroc\\\\.dlt\\\\pipelines\\\\spacex_with_source_inc\\\\load\\\\loaded\\\\1724273972.3240838\\\\completed_jobs\\\\launches__fairings__ships.e3559c3337.0.insert_values', file_size=2187, created_at=DateTime(2024, 8, 21, 20, 59, 37, 84966, tzinfo=Timezone('UTC')), elapsed=2.7692208290100098, job_file_info=ParsedLoadJobFileName(table_name='launches__fairings__ships', file_id='e3559c3337', retry_count=0, file_format='insert_values'), failed_message=None), LoadJobInfo(state='completed_jobs', file_path='C:\\\\Users\\\\kuroc\\\\.dlt\\\\pipelines\\\\spacex_with_source_inc\\\\load\\\\loaded\\\\1724273972.3240838\\\\completed_jobs\\\\launches__links__flickr__original.8d735c2a5d.0.insert_values', file_size=28801, created_at=DateTime(2024, 8, 21, 20, 59, 37, 82966, tzinfo=Timezone('UTC')), elapsed=2.771221399307251, job_file_info=ParsedLoadJobFileName(table_name='launches__links__flickr__original', file_id='8d735c2a5d', retry_count=0, file_format='insert_values'), 
failed_message=None), LoadJobInfo(state='completed_jobs', file_path='C:\\\\Users\\\\kuroc\\\\.dlt\\\\pipelines\\\\spacex_with_source_inc\\\\load\\\\loaded\\\\1724273972.3240838\\\\completed_jobs\\\\launches__payloads.940efbbd0a.0.insert_values', file_size=7410, created_at=DateTime(2024, 8, 21, 20, 59, 37, 57967, tzinfo=Timezone('UTC')), elapsed=2.796220302581787, job_file_info=ParsedLoadJobFileName(table_name='launches__payloads', file_id='940efbbd0a', retry_count=0, file_format='insert_values'), failed_message=None), LoadJobInfo(state='completed_jobs', file_path='C:\\\\Users\\\\kuroc\\\\.dlt\\\\pipelines\\\\spacex_with_source_inc\\\\load\\\\loaded\\\\1724273972.3240838\\\\completed_jobs\\\\launches__ships.48f06a62e8.0.insert_values', file_size=7812, created_at=DateTime(2024, 8, 21, 20, 59, 37, 67966, tzinfo=Timezone('UTC')), elapsed=2.7862212657928467, job_file_info=ParsedLoadJobFileName(table_name='launches__ships', file_id='48f06a62e8', retry_count=0, file_format='insert_values'), failed_message=None), LoadJobInfo(state='completed_jobs', file_path='C:\\\\Users\\\\kuroc\\\\.dlt\\\\pipelines\\\\spacex_with_source_inc\\\\load\\\\loaded\\\\1724273972.3240838\\\\completed_jobs\\\\rockets.2de251ba47.0.insert_values', file_size=1768, created_at=DateTime(2024, 8, 21, 20, 59, 37, 85969, tzinfo=Timezone('UTC')), elapsed=2.7682178020477295, job_file_info=ParsedLoadJobFileName(table_name='rockets', file_id='2de251ba47', retry_count=0, file_format='insert_values'), failed_message=None), LoadJobInfo(state='completed_jobs', file_path='C:\\\\Users\\\\kuroc\\\\.dlt\\\\pipelines\\\\spacex_with_source_inc\\\\load\\\\loaded\\\\1724273972.3240838\\\\completed_jobs\\\\rockets.55ada50fa2.0.sql', file_size=5027, created_at=DateTime(2024, 8, 21, 20, 59, 38, 752026, tzinfo=Timezone('UTC')), elapsed=1.1021606922149658, job_file_info=ParsedLoadJobFileName(table_name='rockets', file_id='55ada50fa2', retry_count=0, file_format='sql'), failed_message=None), LoadJobInfo(state='completed_jobs', 
file_path='C:\\\\Users\\\\kuroc\\\\.dlt\\\\pipelines\\\\spacex_with_source_inc\\\\load\\\\loaded\\\\1724273972.3240838\\\\completed_jobs\\\\rockets__flickr_images.d0d60568c4.0.insert_values', file_size=808, created_at=DateTime(2024, 8, 21, 20, 59, 37, 87968, tzinfo=Timezone('UTC')), elapsed=2.766218900680542, job_file_info=ParsedLoadJobFileName(table_name='rockets__flickr_images', file_id='d0d60568c4', retry_count=0, file_format='insert_values'), failed_message=None), LoadJobInfo(state='completed_jobs', file_path='C:\\\\Users\\\\kuroc\\\\.dlt\\\\pipelines\\\\spacex_with_source_inc\\\\load\\\\loaded\\\\1724273972.3240838\\\\completed_jobs\\\\rockets__payload_weights.6be142fa53.0.insert_values', file_size=561, created_at=DateTime(2024, 8, 21, 20, 59, 37, 86970, tzinfo=Timezone('UTC')), elapsed=2.7672173976898193, job_file_info=ParsedLoadJobFileName(table_name='rockets__payload_weights', file_id='6be142fa53', retry_count=0, file_format='insert_values'), failed_message=None)], 'started_jobs': []})]\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "print(load_info.load_packages)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "RjKdycjuWL4F", | |
| "outputId": "7fc9d393-b67d-4580-93b6-c2533dd88ce9" | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Found pipeline spacex_with_source_inc in C:\\Users\\kuroc\\.dlt\\pipelines\n", | |
| "Run started at 2024-08-21 20:59:31.991808+00:00 and COMPLETED in 7.93 seconds with 4 steps.\n", | |
| "Step extract COMPLETED in 4.59 seconds.\n", | |
| "\n", | |
| "Load package 1724273972.3240838 is EXTRACTED and NOT YET LOADED to the destination and contains no failed jobs\n", | |
| "\n", | |
| "Step normalize COMPLETED in 0.78 seconds.\n", | |
| "Normalized data for the following tables:\n", | |
| "- crew: 30 row(s)\n", | |
| "- crew__launches: 30 row(s)\n", | |
| "- launches: 205 row(s)\n", | |
| "- launches__failures: 5 row(s)\n", | |
| "- launches__payloads: 222 row(s)\n", | |
| "- launches__cores: 215 row(s)\n", | |
| "- launches__ships: 342 row(s)\n", | |
| "- launches__capsules: 35 row(s)\n", | |
| "- launches__links__flickr__original: 848 row(s)\n", | |
| "- launches__fairings__ships: 75 row(s)\n", | |
| "- launches__crew: 30 row(s)\n", | |
| "- rockets: 4 row(s)\n", | |
| "- rockets__payload_weights: 11 row(s)\n", | |
| "- rockets__flickr_images: 16 row(s)\n", | |
| "\n", | |
| "Load package 1724273972.3240838 is NORMALIZED and NOT YET LOADED to the destination and contains no failed jobs\n", | |
| "\n", | |
| "Step load COMPLETED in 2.47 seconds.\n", | |
| "Pipeline spacex_with_source_inc load step completed in 2.37 seconds\n", | |
| "1 load package(s) were loaded to destination duckdb and into dataset spacex_data_inc\n", | |
| "The duckdb destination used duckdb:///d:\\dlthub-education\\workshops\\workshop-august-2024\\part1\\spacex_with_source_inc.duckdb location to store data\n", | |
| "Load package 1724273972.3240838 is LOADED and contains no failed jobs\n", | |
| "\n", | |
| "Step run COMPLETED in 7.93 seconds.\n", | |
| "Pipeline spacex_with_source_inc load step completed in 2.37 seconds\n", | |
| "1 load package(s) were loaded to destination duckdb and into dataset spacex_data_inc\n", | |
| "The duckdb destination used duckdb:///d:\\dlthub-education\\workshops\\workshop-august-2024\\part1\\spacex_with_source_inc.duckdb location to store data\n", | |
| "Load package 1724273972.3240838 is LOADED and contains no failed jobs\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "!dlt pipeline spacex_with_source_inc trace" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "!dlt pipeline spacex_with_source_inc show" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "j2yWGz58jTQX" | |
| }, | |
| "source": [ | |
| "## Answer the Question:\n", | |
| "What rocket was launched between 2022-11-01 and 2022-11-02?\n", | |
| "\n", | |
| " Falcon Heavy" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 45, | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 1000 | |
| }, | |
| "id": "It2RISJWxVQu", | |
| "outputId": "91c9fdd3-0dae-4339-98c9-65bd1d9ece1b" | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "┌──────────────┬──────────────────────────┬──────────────────────────┐\n", | |
| "│ rocket_name │ date_utc │ rocket │\n", | |
| "│ varchar │ timestamp with time zone │ varchar │\n", | |
| "├──────────────┼──────────────────────────┼──────────────────────────┤\n", | |
| "│ Falcon Heavy │ 2022-11-01 06:41:00-07 │ 5e9d0d95eda69974db09d1ed │\n", | |
| "└──────────────┴──────────────────────────┴──────────────────────────┘" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "import duckdb\n", | |
| "# from google.colab import data_table\n", | |
| "# data_table.enable_dataframe_formatter()\n", | |
| "\n", | |
| "# a database '<pipeline_name>.duckdb' was created in working directory so just connect to it\n", | |
| "conn = duckdb.connect(f\"{pipeline.pipeline_name}.duckdb\")\n", | |
| "conn.sql(f\"SET search_path = '{pipeline.dataset_name}'\")\n", | |
| "# sql = \"\"\"\n", | |
| "# SELECT r.name rocket_name , l.* EXCLUDE (fairings__reused\n", | |
| "# ,\tfairings__recovery_attempt\n", | |
| "# ,\tfairings__recovered\n", | |
| "# ,\tlinks__patch__small\n", | |
| "# ,\tlinks__patch__large\n", | |
| "# ,\tlinks__webcast\n", | |
| "# ,\tlinks__youtube_id\n", | |
| "# ,\tlinks__article\n", | |
| "# ,\tlinks__wikipedia\n", | |
| "# ,\t_dlt_load_id\n", | |
| "# ,\t_dlt_id\n", | |
| "# ,\tlinks__presskit\n", | |
| "# ,\tlinks__reddit__launch\n", | |
| "# ,\tlinks__reddit__media\n", | |
| "# ,\tlinks__reddit__recovery\n", | |
| "# ,\tlinks__reddit__campaign\n", | |
| "\n", | |
| "# ) FROM launches l\n", | |
| "# join rockets r on r.id = l.rocket\n", | |
| "# WHERE\n", | |
| "# l.date_utc BETWEEN '2022-11-01 00:00:00+00:00' AND '2022-11-02 00:00:00+00:00'\n", | |
| "# \"\"\"\n", | |
| "\n", | |
| "sql = \"\"\"\n", | |
| "SELECT\n", | |
| "r.name rocket_name , \n", | |
| "l.date_utc , l.rocket \n", | |
| "/* , l.* \n", | |
| " EXCLUDE (fairings__reused\n", | |
| ",\tfairings__recovery_attempt\n", | |
| ",\tfairings__recovered\n", | |
| ",\tlinks__patch__small\n", | |
| ",\tlinks__patch__large\n", | |
| ",\tlinks__webcast\n", | |
| ",\tlinks__youtube_id\n", | |
| ",\tlinks__article\n", | |
| ",\tlinks__wikipedia\n", | |
| ",\t_dlt_load_id\n", | |
| ",\t_dlt_id \n", | |
| ",\tlinks__presskit\n", | |
| ",\tlinks__reddit__launch\n", | |
| ",\tlinks__reddit__media\n", | |
| ",\tlinks__reddit__recovery\n", | |
| ",\tlinks__reddit__campaign) */\n", | |
| "FROM launches l\n", | |
| " join rockets r on r.id = l.rocket \n", | |
| "WHERE l.date_utc BETWEEN '2022-11-01 00:00:00+00:00'::timestamp AND '2022-11-02 00:00:00+00:00'::timestamp \n", | |
| "\"\"\"\n", | |
| "stats_table = conn.sql(sql)\n", | |
| "# stats_table.df().astype('str').to_excel('launches1.xlsx', index=False)\n", | |
| "display(stats_table)\n", | |
| "# stats_table.info()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 29, | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 162 | |
| }, | |
| "id": "Vwqah6PcMik_", | |
| "outputId": "43efd314-c52e-4220-ee86-5c206e3784f8" | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# stats_table = conn.sql(\"SELECT name FROM rockets where id in ('5e9d0d95eda69974db09d1ed', '5e9d0d95eda69973a809d1ec') \").df()\n", | |
| "# display(stats_table)\n", | |
| "# stats_table.info()\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "diK6iTnA-gKz" | |
| }, | |
| "source": [ | |
| "# Task 3: Get payloads of launches\n", | |
| "\n", | |
| "Use `@dlt.transformer` to get additional info for your data.\n", | |
| "\n", | |
| "Read more about dlt [transformers](https://dlthub.com/docs/general-usage/resource#process-resources-with-dlttransformer)." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 31, | |
| "metadata": { | |
| "id": "2pyeJXUB7MbR" | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Pipeline spacex_with_source_tr load step completed in 1.34 seconds\n", | |
| "1 load package(s) were loaded to destination duckdb and into dataset spacex_data_tr_20240821025609\n", | |
| "The duckdb destination used duckdb:///d:\\dlthub-education\\workshops\\workshop-august-2024\\part1\\spacex_with_source_tr.duckdb location to store data\n", | |
| "Load package 1724208969.262194 is LOADED and contains no failed jobs\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "import dlt\n", | |
| "import requests\n", | |
| "\n", | |
| "\n", | |
| "@dlt.resource(table_name=\"launches\")\n", | |
| "def get_launches():\n", | |
| " response = requests.get(f\"https://api.spacexdata.com/v4/launches\")\n", | |
| " response.raise_for_status()\n", | |
| " yield response.json()\n", | |
| "\n", | |
| "\n", | |
| "\n", | |
| "@dlt.transformer(data_from=get_launches, table_name=\"payloads\")\n", | |
| "def get_payloads(launches):\n", | |
| " response = requests.post(\n", | |
| " \"https://api.spacexdata.com/v4/payloads/query\",\n", | |
| " json= { \"query\": {\n", | |
| " \"launch\": launch['id'] },\n", | |
| " \"options\": {\n", | |
| " # \"page\": 1,\n", | |
| " # \"offset\": 2,\n", | |
| " # \"limit\": 10,\n", | |
| " \"pagination\": False\n", | |
| " }\n", | |
| " }\n", | |
| " )\n", | |
| "\n", | |
| " # make the request and check if succeeded\n", | |
| " response.raise_for_status()\n", | |
| " yield response.json()\n", | |
| "\n", | |
| "\n", | |
| "pipeline = dlt.pipeline(\n", | |
| " pipeline_name='spacex_with_source_tr',\n", | |
| " destination='duckdb',\n", | |
| " dataset_name='spacex_data_tr',\n", | |
| " dev_mode=True,\n", | |
| ")\n", | |
| "\n", | |
| "data = get_launches | get_payloads\n", | |
| "\n", | |
| "load_info = pipeline.run(data())\n", | |
| "print(load_info)\n", | |
| "assert_load_info(load_info)\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 9, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Found pipeline spacex_with_source_tr in C:\\Users\\kuroc\\.dlt\\pipelines\n", | |
| "Run started at 2024-08-21 02:56:09.224991+00:00 and COMPLETED in 38.64 seconds with 4 steps.\n", | |
| "Step extract COMPLETED in 36.71 seconds.\n", | |
| "\n", | |
| "Load package 1724208969.262194 is EXTRACTED and NOT YET LOADED to the destination and contains no failed jobs\n", | |
| "\n", | |
| "Step normalize COMPLETED in 0.52 seconds.\n", | |
| "Normalized data for the following tables:\n", | |
| "- _dlt_pipeline_state: 1 row(s)\n", | |
| "- payloads: 205 row(s)\n", | |
| "- payloads__docs: 221 row(s)\n", | |
| "- payloads__docs__customers: 231 row(s)\n", | |
| "- payloads__docs__nationalities: 217 row(s)\n", | |
| "- payloads__docs__manufacturers: 205 row(s)\n", | |
| "- payloads__docs__norad_ids: 906 row(s)\n", | |
| "\n", | |
| "Load package 1724208969.262194 is NORMALIZED and NOT YET LOADED to the destination and contains no failed jobs\n", | |
| "\n", | |
| "Step load COMPLETED in 1.40 seconds.\n", | |
| "Pipeline spacex_with_source_tr load step completed in 1.34 seconds\n", | |
| "1 load package(s) were loaded to destination duckdb and into dataset spacex_data_tr_20240821025609\n", | |
| "The duckdb destination used duckdb:///d:\\dlthub-education\\workshops\\workshop-august-2024\\part1\\spacex_with_source_tr.duckdb location to store data\n", | |
| "Load package 1724208969.262194 is LOADED and contains no failed jobs\n", | |
| "\n", | |
| "Step run COMPLETED in 38.64 seconds.\n", | |
| "Pipeline spacex_with_source_tr load step completed in 1.34 seconds\n", | |
| "1 load package(s) were loaded to destination duckdb and into dataset spacex_data_tr_20240821025609\n", | |
| "The duckdb destination used duckdb:///d:\\dlthub-education\\workshops\\workshop-august-2024\\part1\\spacex_with_source_tr.duckdb location to store data\n", | |
| "Load package 1724208969.262194 is LOADED and contains no failed jobs\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "!dlt pipeline spacex_with_source_tr trace" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "^C\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "!dlt pipeline spacex_with_source_tr show" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 9, | |
| "metadata": { | |
| "id": "QeCYG3VKg--W" | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "┌──────────────────────┬──────────────────────┬──────────────────────┬───┬──────────────────────┬───────────┐\n", | |
| "│ database │ schema │ name │ … │ column_types │ temporary │\n", | |
| "│ varchar │ varchar │ varchar │ │ varchar[] │ boolean │\n", | |
| "├──────────────────────┼──────────────────────┼──────────────────────┼───┼──────────────────────┼───────────┤\n", | |
| "│ spacex_with_source… │ spacex_data_inc │ _dlt_loads │ … │ [VARCHAR, VARCHAR,… │ false │\n", | |
| "│ spacex_with_source… │ spacex_data_inc │ _dlt_pipeline_state │ … │ [BIGINT, BIGINT, V… │ false │\n", | |
| "│ spacex_with_source… │ spacex_data_inc │ _dlt_version │ … │ [BIGINT, BIGINT, T… │ false │\n", | |
| "│ spacex_with_source… │ spacex_data_inc │ crew │ … │ [VARCHAR, VARCHAR,… │ false │\n", | |
| "│ spacex_with_source… │ spacex_data_inc │ crew__launches │ … │ [VARCHAR, VARCHAR,… │ false │\n", | |
| "│ spacex_with_source… │ spacex_data_inc │ launches │ … │ [BOOLEAN, BOOLEAN,… │ false │\n", | |
| "│ spacex_with_source… │ spacex_data_inc │ launches__capsules │ … │ [VARCHAR, VARCHAR,… │ false │\n", | |
| "│ spacex_with_source… │ spacex_data_inc │ launches__cores │ … │ [VARCHAR, BIGINT, … │ false │\n", | |
| "│ spacex_with_source… │ spacex_data_inc │ launches__crew │ … │ [VARCHAR, VARCHAR,… │ false │\n", | |
| "│ spacex_with_source… │ spacex_data_inc │ launches__failures │ … │ [BIGINT, VARCHAR, … │ false │\n", | |
| "│ · │ · │ · │ · │ · │ · │\n", | |
| "│ · │ · │ · │ · │ · │ · │\n", | |
| "│ · │ · │ · │ · │ · │ · │\n", | |
| "│ spacex_with_source… │ spacex_data_inc_st… │ launches__cores │ … │ [VARCHAR, BIGINT, … │ false │\n", | |
| "│ spacex_with_source… │ spacex_data_inc_st… │ launches__crew │ … │ [VARCHAR, VARCHAR,… │ false │\n", | |
| "│ spacex_with_source… │ spacex_data_inc_st… │ launches__failures │ … │ [BIGINT, VARCHAR, … │ false │\n", | |
| "│ spacex_with_source… │ spacex_data_inc_st… │ launches__fairings… │ … │ [VARCHAR, VARCHAR,… │ false │\n", | |
| "│ spacex_with_source… │ spacex_data_inc_st… │ launches__links__f… │ … │ [VARCHAR, VARCHAR,… │ false │\n", | |
| "│ spacex_with_source… │ spacex_data_inc_st… │ launches__payloads │ … │ [VARCHAR, VARCHAR,… │ false │\n", | |
| "│ spacex_with_source… │ spacex_data_inc_st… │ launches__ships │ … │ [VARCHAR, VARCHAR,… │ false │\n", | |
| "│ spacex_with_source… │ spacex_data_inc_st… │ rockets │ … │ [DOUBLE, BIGINT, D… │ false │\n", | |
| "│ spacex_with_source… │ spacex_data_inc_st… │ rockets__flickr_im… │ … │ [VARCHAR, VARCHAR,… │ false │\n", | |
| "│ spacex_with_source… │ spacex_data_inc_st… │ rockets__payload_w… │ … │ [VARCHAR, VARCHAR,… │ false │\n", | |
| "├──────────────────────┴──────────────────────┴──────────────────────┴───┴──────────────────────┴───────────┤\n", | |
| "│ 32 rows (20 shown) 6 columns (5 shown) │\n", | |
| "└───────────────────────────────────────────────────────────────────────────────────────────────────────────┘" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "import duckdb\n", | |
| "# from google.colab import data_table\n", | |
| "# data_table.enable_dataframe_formatter()\n", | |
| "\n", | |
| "# a database '<pipeline_name>.duckdb' was created in working directory so just connect to it\n", | |
| "conn = duckdb.connect(f\"{pipeline.pipeline_name}.duckdb\")\n", | |
| "conn.sql(f\"SET search_path = '{pipeline.dataset_name}'\")\n", | |
| "# list all tables\n", | |
| "display(conn.sql(\"DESCRIBE\"))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "qUll84tQi1Ij" | |
| }, | |
| "source": [ | |
| "Run the code below, then\n", | |
| "\n", | |
| "## Answer the Question:\n", | |
| "\n", | |
| "What orbital regime does the satellite \"FalconSAT-2\" (launch id: 5eb87cd9ffd86e000604b32a) have?\n", | |
| "\n", | |
| "low-earth" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 38, | |
| "metadata": { | |
| "id": "lkJz8HsrOYtj" | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>regime</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>low-earth</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " regime\n", | |
| "0 low-earth" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "stats_table = conn.sql(\"SELECT regime FROM payloads__docs where name ='FalconSAT-2' and launch = '5eb87cd9ffd86e000604b32a' \").df()\n", | |
| "display(stats_table)" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "provenance": [] | |
| }, | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.11.7" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 0 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment