These functions are exactly equivalent:

| Function | Alias | Reference |
|---|---|---|
| filter | where | pyspark.sql.DataFrame.filter |
| drop_duplicates | dropDuplicates | pyspark.sql.DataFrame.drop_duplicates |
| avg | mean | pyspark.sql.GroupedData.avg |
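For example, the following pairs of calls produce the same results. This is a minimal sketch assuming an existing `SparkSession` named `spark`; the sample data is made up for illustration:

```python
# Minimal sketch assuming an existing SparkSession named `spark`.
df = spark.createDataFrame([(1, "a"), (2, "b"), (2, "b")], ["id", "label"])

# filter / where are aliases of each other
df.filter(df.id > 1).show()
df.where(df.id > 1).show()

# drop_duplicates / dropDuplicates are aliases of each other
df.drop_duplicates().show()
df.dropDuplicates().show()

# avg / mean are aliases on grouped data
df.groupBy("label").avg("id").show()
df.groupBy("label").mean("id").show()
```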
| """ | |
| Python script for batch geocoding of addresses using the Google Geocoding API. | |
| This script allows for massive lists of addresses to be geocoded for free by pausing when the | |
| geocoder hits the free rate limit set by Google (2500 per day). If you have an API key for paid | |
| geocoding from Google, set it in the API key section. | |
| Addresses for geocoding can be specified in a list of strings "addresses". In this script, addresses | |
| come from a csv file with a column "Address". Adjust the code to your own requirements as needed. | |
| After every 500 successul geocode operations, a temporary file with results is recorded in case of | |
| script failure / loss of connection later. | |
| Addresses and data are held in memory, so this script may need to be adjusted to process files line |
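A minimal sketch of the main loop this docstring describes might look like the following. The `geocode_address` helper, the back-off duration, and the file names are assumptions for illustration, not the original script:

```python
import csv
import json
import time

import requests  # assumed HTTP client for calling the Geocoding API

API_KEY = ""  # optional paid key; leave empty for the free tier
GEOCODE_URL = "https://maps.googleapis.com/maps/api/geocode/json"

def geocode_address(address):
    """Hypothetical helper: return the raw Geocoding API response for one address."""
    params = {"address": address}
    if API_KEY:
        params["key"] = API_KEY
    return requests.get(GEOCODE_URL, params=params).json()

# Addresses come from a CSV file with a column "Address", as described above.
with open("addresses.csv", newline="") as f:
    addresses = [row["Address"] for row in csv.DictReader(f)]

results = []
for i, address in enumerate(addresses, start=1):
    response = geocode_address(address)
    # Pause when the free daily quota is exhausted, then retry the same address.
    while response.get("status") == "OVER_QUERY_LIMIT":
        time.sleep(30 * 60)  # assumed back-off: wait 30 minutes before retrying
        response = geocode_address(address)
    results.append(response)
    # Periodically persist partial results in case the script dies later.
    if i % 500 == 0:
        with open("temp_results_{}.json".format(i), "w") as out:
            json.dump(results, out)
```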
```python
import numpy as np

EPSILON = 1e-10

def _error(actual: np.ndarray, predicted: np.ndarray):
    """ Simple error """
    return actual - predicted
```
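Other metrics can be built on top of this helper. A hedged sketch, where the metric functions below are illustrative additions rather than part of the original module:

```python
import numpy as np

def mse(actual: np.ndarray, predicted: np.ndarray):
    """Mean squared error, built on the _error helper defined above."""
    return np.mean(np.square(_error(actual, predicted)))

def rmse(actual: np.ndarray, predicted: np.ndarray):
    """Root mean squared error."""
    return np.sqrt(mse(actual, predicted))

# Example usage with small arrays
actual = np.array([1.0, 2.0, 3.0])
predicted = np.array([1.1, 1.9, 3.2])
print(rmse(actual, predicted))
```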
```python
import boto3

# Allow an EventBridge (CloudWatch Events) rule to invoke the `create_lab` Lambda function.
lambda_client = boto3.client('lambda', region_name='REGION_NAME')

lambda_client.add_permission(
    FunctionName='create_lab',
    StatementId='AWSEventsRule',
    Action='lambda:InvokeFunction',
    Principal='events.amazonaws.com',
    SourceArn='arn:aws:events:REGION_NAME:ACCOUNT_NUMBER:rule/*',
)
```
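The permission above only lets EventBridge invoke the function; the rule and its target still have to exist. A hedged sketch of that side, with the rule name and schedule chosen purely for illustration:

```python
import boto3

events_client = boto3.client('events', region_name='REGION_NAME')

# Hypothetical rule name and schedule; adjust to your own setup.
events_client.put_rule(
    Name='create_lab_schedule',
    ScheduleExpression='rate(1 hour)',
    State='ENABLED',
)

# Point the rule at the Lambda function that was granted permission above.
events_client.put_targets(
    Rule='create_lab_schedule',
    Targets=[{
        'Id': 'create-lab-target',
        'Arn': 'arn:aws:lambda:REGION_NAME:ACCOUNT_NUMBER:function:create_lab',
    }],
)
```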
```python
# Import all functions
from pyspark.sql import SQLContext
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import unix_timestamp, to_date, date_format, month, year, dayofyear, dayofweek, col
from pyspark.sql.types import TimestampType
```
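A short sketch of how these imports are typically used together, building on the imports above. The sample data and column names are assumptions for illustration:

```python
spark = SparkSession.builder.appName("date-features").getOrCreate()

# Hypothetical input: a string column "event_time" in "yyyy-MM-dd HH:mm:ss" format.
df = spark.createDataFrame(
    [("2021-03-15 10:30:00",), ("2021-12-01 08:00:00",)],
    ["event_time"],
)

df = (
    df.withColumn("ts", unix_timestamp(col("event_time")).cast(TimestampType()))
      .withColumn("date", to_date(col("ts")))
      .withColumn("month", month(col("ts")))
      .withColumn("year", year(col("ts")))
      .withColumn("day_of_year", dayofyear(col("ts")))
      .withColumn("day_of_week", dayofweek(col("ts")))
      .withColumn("weekday_name", date_format(col("ts"), "EEEE"))
)

df.show()
```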
```bash
#!/bin/bash
########################################################################################
# START, STOP or STATUS                                                                #
# ----------------------------                                                         #
# This script is intended to help you start, stop or get the IP address of            #
# the currently running EC2 instance.                                                 #
# It requires you to first configure your AWS CLI manually, to ensure safety.         #
#                                                                                      #
# Please read the code to ensure that it does not cause any security issues.          #
#                                                                                      #
```
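The same start / stop / status flow can also be sketched in Python with boto3 rather than the AWS CLI. This is not the original bash script; the region and instance ID below are placeholders:

```python
import boto3

ec2 = boto3.client('ec2', region_name='REGION_NAME')
INSTANCE_ID = 'i-0123456789abcdef0'  # placeholder instance ID

def start():
    """Start the instance."""
    ec2.start_instances(InstanceIds=[INSTANCE_ID])

def stop():
    """Stop the instance."""
    ec2.stop_instances(InstanceIds=[INSTANCE_ID])

def status():
    """Print the instance state and its public IP address, if any."""
    reservations = ec2.describe_instances(InstanceIds=[INSTANCE_ID])['Reservations']
    instance = reservations[0]['Instances'][0]
    state = instance['State']['Name']
    ip = instance.get('PublicIpAddress', 'no public IP')
    print(f"{INSTANCE_ID}: {state} ({ip})")
```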
```python
# A simple cheat sheet of Spark DataFrame syntax
# Current for Spark 1.6.1

# Import statements
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Creating DataFrames
df = sqlContext.createDataFrame([(1, 4), (2, 5), (3, 6)], ["A", "B"])  # from manual data
```
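A few basic operations on the DataFrame created above, added here as a hedged continuation of the cheat sheet rather than part of the original:

```python
# Selecting and filtering
df.select("A").show()
df.filter(df.B > 4).show()

# Adding a derived column and aggregating
df.withColumn("A_plus_B", df.A + df.B).show()
df.groupBy().avg("B").show()
```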
```bash
#!/bin/bash
##################################################################################
# ----------------------------------------------------------------
# THIS SCRIPT WILL HELP YOU AUTOMATE THE DOCKER INSTALLATION STEPS
# ----------------------------------------------------------------
# Tested on an AWS EC2 instance.
#
# AUTHOR:
```