Goals: add links to clear, well-reasoned explanations of how things work. No hype and, where possible, no vendor content. Practical first-hand accounts of running models in production are especially welcome.
| #!/usr/bin/env -S uv run --script | |
| # /// script | |
| # dependencies = [ | |
| # "pyarrow", | |
| # "fsspec", | |
| # "click", | |
| # "s3fs", | |
| # "gcsfs", | |
| # ] | |
| # /// |
| import torch | |
| import torch._inductor.config | |
| import time | |
| torch._inductor.config.triton.cudagraphs = False | |
| torch.set_float32_matmul_precision('high') | |
| def bench(f, name=None, iters=100, warmup=5, display=True, profile=False): | |
| for _ in range(warmup): | |
| f() |
| from matplotlib import pyplot | |
| import random | |
| import time | |
| pyplot.style.use("ggplot") | |
| now = time.time() | |
| def generate_user(censor=now): | |
| # Pick some point in time the user was created | |
| t_created = t = now - random.random() * 1e7 |
| library("tidyverse") | |
| library("sparklyr") | |
| library("sparklyr.nested") | |
| library("cowplot") | |
| library("ggsci") | |
| #Spark config | |
| config <- spark_config() | |
| # Allow access to GCP datasets |
If you were to give recommendations to your "little brother/sister" on things that they need to do to become a data scientist, what would those things be?
I think the "Data Science Venn Diagram" (http://drewconway.com/zia/2013/3/26/the-data-science-venn-diagram) is a great place to start. You need three things to be a good data scientist:
| import luigi | |
| import time | |
| class TimeTaskMixin(object): | |
| ''' | |
| A mixin that, when added to a Luigi task, will print out | |
| the task's execution time to standard out when the task is | |
| finished | |
| ''' | |
| @luigi.Task.event_handler(luigi.Event.PROCESSING_TIME) |
| # Save the current SSH agent socket behind a stable, per-host symlink so | |
| # that long-lived screen/tmux sessions can keep reaching a live agent | |
| # after the original forwarded socket path goes stale. | |
| # NOTE(review): presumably paired elsewhere with SSH_AUTH_SOCK pointing | |
| # at the symlink (e.g. in .screenrc/.tmux.conf) — confirm. | |
| _ssh_auth_save() { | |
| ln -sf "$SSH_AUTH_SOCK" "$HOME/.ssh/ssh-auth-sock.$HOSTNAME" | |
| } | |
| # Refresh the symlink, then (re)export HOSTNAME, then launch the | |
| # multiplexer. NOTE(review): the symlink is created BEFORE HOSTNAME is | |
| # re-exported here, so it uses the pre-existing HOSTNAME value — verify | |
| # that is the intended ordering. | |
| alias screen='_ssh_auth_save ; export HOSTNAME=$(hostname) ; screen' | |
| alias tmux='_ssh_auth_save ; export HOSTNAME=$(hostname) ; tmux' |
| # Intro | |
| extremely simple and unsophisticated cross-process data sharing | |
| supports one read-write master process and an arbitrary number of read-only processes | |
| please consider using pickle/cPickle/ctypes to store complex data | |
| # References |