Skip to content

Instantly share code, notes, and snippets.

@sa-
Created October 5, 2021 08:52
Show Gist options
  • Select an option

  • Save sa-/09c8a347d0dbe864d29d1f8f66eb8e88 to your computer and use it in GitHub Desktop.

Select an option

Save sa-/09c8a347d0dbe864d29d1f8f66eb8e88 to your computer and use it in GitHub Desktop.
Spylon + jupyter kernel gateway container
# Image: Spark 3.1.2 + the spylon (Scala) Jupyter kernel, served through
# Jupyter Kernel Gateway and run as the non-root user `service_user`.
#
# The Debian release is pinned because openjdk-11-jre (installed below) is
# only packaged for some releases; the floating `3.8-slim` tag does not fix
# which release the build gets.
FROM python:3.8-slim-bullseye

# Scratch directory for the downloads below; it is emptied at the end of the
# same RUN so nothing from here persists in the layer.
WORKDIR /tmp

ENV SPARK_HOME=/opt/spark-3.1.2-bin-hadoop3.2

# OS packages, Spark, the GCS Hadoop connector and the service user.
# (Julia itself is not installed by this image; only JULIA_NUM_THREADS is
# exported in .zshrc further down.)
# - Spark is fetched from archive.apache.org, the permanent home for releases
#   that are no longer current; the dlcdn mirror only carries current ones.
# - NOTE(review): the GCS connector URL is an unpinned "latest" artifact, so
#   rebuilds may pick up a different jar — consider pinning a version.
# - NOTE(review): passwordless sudo gives service_user full root inside the
#   container; kept for backward compatibility with existing usage.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
        cmake curl git gnupg htop openjdk-11-jre sudo tmux unzip vim wget zip zsh \
    && wget --no-verbose https://archive.apache.org/dist/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz \
    && tar -xf spark-3.1.2-bin-hadoop3.2.tgz -C /opt \
    && rm spark-3.1.2-bin-hadoop3.2.tgz \
    && ln -sf "$SPARK_HOME"/bin/* /usr/local/bin/ \
    && wget --no-verbose -P "$SPARK_HOME/jars" https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-hadoop3-latest.jar \
    && useradd -ms /bin/bash service_user \
    && usermod -aG sudo service_user \
    && echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers \
    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

USER service_user
WORKDIR /home/service_user

# ~/.local/bin receives the pip-installed entry points (jupyter, papermill);
# google-cloud-sdk/bin provides gcloud, which entrypoint.bash calls.
ENV PATH=/home/service_user/.local/bin:/home/service_user/google-cloud-sdk/bin:$PATH

# Python tooling, the spylon kernel spec, and the Google Cloud SDK.
# - --no-cache-dir keeps pip's download cache out of the layer.
# - curl -f fails the build on an HTTP error instead of saving an error page.
# - install.sh --quiet disables interactive prompts during the build.
# - printf is used for the tmux config so the "\n" is interpreted regardless
#   of which shell's `echo` is in effect.
RUN pip install --no-cache-dir \
        fsspec gcsfs jupyter_kernel_gateway jupyterlab 'papermill[all]' psutil spylon-kernel \
    && python -m spylon_kernel install --user \
    && echo "export JULIA_NUM_THREADS=\$(nproc --all)" >> "$HOME/.zshrc" \
    && curl -fsSLO https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-sdk-353.0.0-linux-x86_64.tar.gz \
    && tar -xf google-cloud-sdk-353.0.0-linux-x86_64.tar.gz \
    && rm google-cloud-sdk-353.0.0-linux-x86_64.tar.gz \
    && ./google-cloud-sdk/install.sh --quiet \
    && printf 'set -g mouse on\nset -g default-shell /bin/bash\n' >> ~/.tmux.conf

# Files are chowned to service_user; entrypoint.bash appends to
# spark-defaults.conf at start-up, so that file must be writable by that user.
COPY --chown=service_user entrypoint.bash /home/service_user/
COPY --chown=service_user jupyter_kernel_gateway_config.py /home/service_user/.jupyter/
COPY --chown=service_user spark-defaults.conf $SPARK_HOME/conf/

# 8890 is the port set in jupyter_kernel_gateway_config.py.
EXPOSE 8080 8888 8890

# SHELL needs the "-c" flag so that any shell-form instruction added after
# this point is passed to zsh as a command string.
SHELL [ "/bin/zsh", "-c" ]

# Absolute script path so the entrypoint does not depend on WORKDIR.
ENTRYPOINT [ "/bin/bash", "/home/service_user/entrypoint.bash" ]
#! /bin/bash
# Container entrypoint: optionally authenticates gcloud with a service
# account, records a Spark driver memory setting derived from the machine's
# RAM, then starts Jupyter Kernel Gateway.

# Activate the service account only when a key file path was provided.
# The variable is quoted so paths containing spaces survive word splitting.
if [ -n "$GOOGLE_APPLICATION_CREDENTIALS" ]; then
    gcloud auth activate-service-account --key-file="$GOOGLE_APPLICATION_CREDENTIALS"
fi

# Append spark.driver.memory = 90% of total RAM minus 2 GiB, in whole GiB.
# The value is clamped to at least 1g: on hosts with roughly 3 GiB of RAM or
# less the formula truncates to 0 (or goes negative), which is not a usable
# memory size. The leading "\n" protects against a conf file that does not
# end with a newline.
python -c 'import psutil; print(f"\nspark.driver.memory {max(1, int(psutil.virtual_memory().total / 1024**3 * 0.9 - 2))}g")' \
    >> "$SPARK_HOME/conf/spark-defaults.conf"

# exec replaces this shell with the gateway so it receives container stop
# signals directly; "$@" forwards any arguments given to the container.
exec jupyter kernelgateway "$@"
# Configuration file for jupyter-kernel-gateway.
#------------------------------------------------------------------------------
# Application(SingletonConfigurable) configuration
#------------------------------------------------------------------------------
## This is an application.
## The date format used by logging formatters for %(asctime)s
# Default: '%Y-%m-%d %H:%M:%S'
# c.Application.log_datefmt = '%Y-%m-%d %H:%M:%S'
## The Logging format template
# Default: '[%(name)s]%(highlevel)s %(message)s'
# c.Application.log_format = '[%(name)s]%(highlevel)s %(message)s'
## Set the log level by value or name.
# Choices: any of [0, 10, 20, 30, 40, 50, 'DEBUG', 'INFO', 'WARN', 'ERROR', 'CRITICAL']
# Default: 30
# c.Application.log_level = 30
## Instead of starting the Application, dump configuration to stdout
# Default: False
# c.Application.show_config = False
## Instead of starting the Application, dump configuration to stdout (as JSON)
# Default: False
# c.Application.show_config_json = False
#------------------------------------------------------------------------------
# JupyterApp(Application) configuration
#------------------------------------------------------------------------------
## Base class for Jupyter applications
## Answer yes to any prompts.
# Default: False
# c.JupyterApp.answer_yes = False
## Full path of a config file.
# Default: ''
# c.JupyterApp.config_file = ''
## Specify a config file to load.
# Default: ''
# c.JupyterApp.config_file_name = ''
## Generate default config file.
# Default: False
# c.JupyterApp.generate_config = False
## The date format used by logging formatters for %(asctime)s
# See also: Application.log_datefmt
# c.JupyterApp.log_datefmt = '%Y-%m-%d %H:%M:%S'
## The Logging format template
# See also: Application.log_format
# c.JupyterApp.log_format = '[%(name)s]%(highlevel)s %(message)s'
## Set the log level by value or name.
# See also: Application.log_level
# c.JupyterApp.log_level = 30
## Instead of starting the Application, dump configuration to stdout
# See also: Application.show_config
# c.JupyterApp.show_config = False
## Instead of starting the Application, dump configuration to stdout (as JSON)
# See also: Application.show_config_json
# c.JupyterApp.show_config_json = False
#------------------------------------------------------------------------------
# KernelGatewayApp(JupyterApp) configuration
#------------------------------------------------------------------------------
## Application that provisions Jupyter kernels and proxies HTTP/Websocket traffic
# to the kernels.
#
# - reads command line and environment variable settings - initializes managers
# and routes - creates a Tornado HTTP server - starts the Tornado event loop
## Sets the Access-Control-Allow-Credentials header. (KG_ALLOW_CREDENTIALS env
# var)
# Default: ''
# c.KernelGatewayApp.allow_credentials = ''
## Sets the Access-Control-Allow-Headers header. (KG_ALLOW_HEADERS env var)
# Default: ''
# c.KernelGatewayApp.allow_headers = ''
## Sets the Access-Control-Allow-Methods header. (KG_ALLOW_METHODS env var)
# Default: ''
# c.KernelGatewayApp.allow_methods = ''
## Sets the Access-Control-Allow-Origin header. (KG_ALLOW_ORIGIN env var)
# Default: ''
# c.KernelGatewayApp.allow_origin = ''
## Answer yes to any prompts.
# See also: JupyterApp.answer_yes
# c.KernelGatewayApp.answer_yes = False
## Controls which API to expose, that of a Jupyter notebook server, the seed
# notebook's, or one provided by another module, respectively using values
# 'kernel_gateway.jupyter_websocket', 'kernel_gateway.notebook_http', or another
# fully qualified module name (KG_API env var)
# Default: 'kernel_gateway.jupyter_websocket'
# c.KernelGatewayApp.api = 'kernel_gateway.jupyter_websocket'
## Authorization token required for all requests (KG_AUTH_TOKEN env var)
# Default: ''
# c.KernelGatewayApp.auth_token = ''
## The base path for mounting all API resources (KG_BASE_URL env var)
# Default: '/'
# c.KernelGatewayApp.base_url = '/'
## The full path to an SSL/TLS certificate file. (KG_CERTFILE env var)
# Default: None
# c.KernelGatewayApp.certfile = None
## The full path to a certificate authority certificate for SSL/TLS client
# authentication. (KG_CLIENT_CA env var)
# Default: None
# c.KernelGatewayApp.client_ca = None
## Full path of a config file.
# See also: JupyterApp.config_file
# c.KernelGatewayApp.config_file = ''
## Specify a config file to load.
# See also: JupyterApp.config_file_name
# c.KernelGatewayApp.config_file_name = ''
## Default kernel name when spawning a kernel (KG_DEFAULT_KERNEL_NAME env var)
# Default: ''
# c.KernelGatewayApp.default_kernel_name = ''
## Environment variables allowed to be inherited from the spawning process by the
# kernel
# Default: []
# One entry per line so additions and removals show up as single-line diffs.
c.KernelGatewayApp.env_process_whitelist = [
    "HOME",
    "SPARK_HOME",
    "JULIA_NUM_THREADS",
    "GOOGLE_APPLICATION_CREDENTIALS",
]
## Sets the Access-Control-Expose-Headers header. (KG_EXPOSE_HEADERS env var)
# Default: ''
# c.KernelGatewayApp.expose_headers = ''
## Override any kernel name specified in a notebook or request
# (KG_FORCE_KERNEL_NAME env var)
# Default: ''
# c.KernelGatewayApp.force_kernel_name = ''
## Generate default config file.
# See also: JupyterApp.generate_config
# c.KernelGatewayApp.generate_config = False
## IP address on which to listen (KG_IP env var)
# Default: '127.0.0.1'
# Overridden to listen on all interfaces instead of loopback only.
# NOTE(review): c.KernelGatewayApp.auth_token is left commented out in this
# file, so this presumably exposes the gateway without a token — confirm that
# access is restricted elsewhere.
c.KernelGatewayApp.ip = '0.0.0.0'
## The kernel manager class to use.
# Default: 'kernel_gateway.services.kernels.manager.SeedingMappingKernelManager'
# c.KernelGatewayApp.kernel_manager_class = 'kernel_gateway.services.kernels.manager.SeedingMappingKernelManager'
## The kernel spec manager class to use. Should be a subclass of
# `jupyter_client.kernelspec.KernelSpecManager`.
# Default: 'jupyter_client.kernelspec.KernelSpecManager'
# c.KernelGatewayApp.kernel_spec_manager_class = 'jupyter_client.kernelspec.KernelSpecManager'
## The full path to a private key file for usage with SSL/TLS. (KG_KEYFILE env
# var)
# Default: None
# c.KernelGatewayApp.keyfile = None
## The date format used by logging formatters for %(asctime)s
# See also: Application.log_datefmt
# c.KernelGatewayApp.log_datefmt = '%Y-%m-%d %H:%M:%S'
## The Logging format template
# See also: Application.log_format
# c.KernelGatewayApp.log_format = '[%(name)s]%(highlevel)s %(message)s'
## Set the log level by value or name.
# See also: Application.log_level
# c.KernelGatewayApp.log_level = 30
## Sets the Access-Control-Max-Age header. (KG_MAX_AGE env var)
# Default: ''
# c.KernelGatewayApp.max_age = ''
## Limits the number of kernel instances allowed to run by this gateway.
# Unbounded by default. (KG_MAX_KERNELS env var)
# Default: None
# c.KernelGatewayApp.max_kernels = None
## Port on which to listen (KG_PORT env var)
# Default: 8888
# Overridden from the default; 8890 is one of the ports the accompanying
# Dockerfile lists in EXPOSE.
c.KernelGatewayApp.port = 8890
## Number of ports to try if the specified port is not available (KG_PORT_RETRIES
# env var)
# Default: 50
# c.KernelGatewayApp.port_retries = 50
## Number of kernels to prespawn using the default language. No prespawn by
# default. (KG_PRESPAWN_COUNT env var)
# Default: None
# c.KernelGatewayApp.prespawn_count = None
## Runs the notebook (.ipynb) at the given URI on every kernel launched. No seed
# by default. (KG_SEED_URI env var)
# Default: None
# c.KernelGatewayApp.seed_uri = None
## Instead of starting the Application, dump configuration to stdout
# See also: Application.show_config
# c.KernelGatewayApp.show_config = False
## Instead of starting the Application, dump configuration to stdout (as JSON)
# See also: Application.show_config_json
# c.KernelGatewayApp.show_config_json = False
## Sets the SSL version to use for the web socket connection. (KG_SSL_VERSION env
# var)
# Default: None
# c.KernelGatewayApp.ssl_version = None
## Use x-* header values for overriding the remote-ip, useful when application is
# behind a proxy. (KG_TRUST_XHEADERS env var)
# Default: False
# c.KernelGatewayApp.trust_xheaders = False
#------------------------------------------------------------------------------
# NotebookHTTPPersonality(LoggingConfigurable) configuration
#------------------------------------------------------------------------------
## Personality for notebook-http support, creating REST endpoints based on the
# notebook's annotated cells
## Optional API to download the notebook source code in notebook-http mode,
# defaults to not allow
# Default: False
# c.NotebookHTTPPersonality.allow_notebook_download = False
## Determines which module is used to parse the notebook for endpoints and
# documentation. Valid module names include
# 'kernel_gateway.notebook_http.cell.parser' and
# 'kernel_gateway.notebook_http.swagger.parser'. (KG_CELL_PARSER env var)
# Default: 'kernel_gateway.notebook_http.cell.parser'
# c.NotebookHTTPPersonality.cell_parser = 'kernel_gateway.notebook_http.cell.parser'
## Maps kernel language to code comment syntax
# Default: {'scala': '//', None: '#'}
# c.NotebookHTTPPersonality.comment_prefix = {'scala': '//', None: '#'}
## Serve static files on disk in the given path as /public, defaults to not serve
# Default: None
# c.NotebookHTTPPersonality.static_path = None
#------------------------------------------------------------------------------
# JupyterWebsocketPersonality(LoggingConfigurable) configuration
#------------------------------------------------------------------------------
## Personality for standard websocket functionality, registering endpoints that
# are part of the Jupyter Kernel Gateway API
## Environment variables allowed to be set when a client requests a new kernel
# Default: []
# c.JupyterWebsocketPersonality.env_whitelist = []
## Permits listing of the running kernels using API endpoints /api/kernels and
# /api/sessions (KG_LIST_KERNELS env var). Note: Jupyter Notebook allows this by
# default but kernel gateway does not.
# Default: False
# c.JupyterWebsocketPersonality.list_kernels = False
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Default system properties included when running spark-submit.
# This is useful for setting default environmental settings.
# Example:
# spark.master spark://master:7077
# spark.eventLog.enabled true
# spark.eventLog.dir hdfs://namenode:8021/directory
# spark.serializer org.apache.spark.serializer.KryoSerializer
# spark.driver.memory 5g
# Extra JVM option passed to the driver by default.
# NOTE(review): presumably needed for Netty/Arrow reflection access on the
# Java 11 runtime the image installs (openjdk-11-jre) — confirm.
spark.driver.defaultJavaOptions -Dio.netty.tryReflectionSetAccessible=true
# NOTE(review): 50g may exceed the spark.driver.memory value that
# entrypoint.bash appends to this file at container start — confirm intended.
spark.driver.maxResultSize 50g
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment