#!/usr/bin/env python3
"""
ECR Image Updater Script
Pulls Docker images from AWS ECR, adds pip install requests, and pushes with a new tag
"""
import subprocess
import sys
import re
import tempfile
bundle:
  name: portfolio-analyser

targets:
  dev:
    default: true
    workspace:
      host: https://<your workspace URL>.cloud.databricks.com  # your workspace URL
    mode: development
https://credentials.databricks.com/9698dfe8-8cf0-4c50-a905-22f8f7c1f3a4#acc.BtECliDT
# Create the customers DataFrame
customers_data = [(101, "Lalit"), (102, "Neha")]
customers = spark.createDataFrame(customers_data, ["customer_id", "customer_name"])

# Create the orders DataFrame
orders_data = [(1, 101, 250), (2, 102, 150), (3, 0, 100), (4, None, 200)]
orders = spark.createDataFrame(orders_data, ["order_id", "customer_id", "order_amount"])
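The deliberately mismatched customer_ids (0 and None) suggest this data is set up to demonstrate join behavior. A minimal pure-Python sketch of the left-join semantics (the `left_join_orders` helper is hypothetical, standing in for `orders.join(customers, "customer_id", "left")`):

```python
customers_data = [(101, "Lalit"), (102, "Neha")]
orders_data = [(1, 101, 250), (2, 102, 150), (3, 0, 100), (4, None, 200)]

def left_join_orders(orders, customers):
    """Left-join orders to customers on customer_id; unmatched ids get None."""
    names = dict(customers)  # customer_id -> customer_name
    return [
        (order_id, cust_id, amount, names.get(cust_id))
        for order_id, cust_id, amount in orders
    ]

joined = left_join_orders(orders_data, customers_data)
for row in joined:
    print(row)
```

Orders 3 and 4 survive the join but carry `None` for `customer_name`, which is exactly what a PySpark left join would produce for them.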
def handler(event, context):
    print("Received event: " + str(event))
    try:
        # Extract the file name from the event data
        file_name = event['Records'][0]['s3']['object']['key']
        print("File name: " + file_name)
    except (KeyError, IndexError) as e:
        print(f"Error extracting file name: {e}")
        return {
            "statusCode": 400,
            "body": "Malformed S3 event: missing object key"
        }
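For reference, the nested lookup in the handler matches the shape of an S3 event notification. A trimmed, hypothetical event showing only the fields the handler reads:

```python
# Hypothetical S3 put-event, reduced to the fields the handler accesses.
sample_event = {
    "Records": [
        {"s3": {"object": {"key": "uploads/report.csv"}}}
    ]
}

file_name = sample_event["Records"][0]["s3"]["object"]["key"]
print(file_name)  # uploads/report.csv
```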
1: Test and Quality
   Unit Test: individual transformations - Pytest
   Schema Test: ensures correct column names & types - Great Expectations
   Data Quality: checks for missing values & duplicates - Pandas or PySpark
   Integration Test: validates data flow from source to destination - Pytest, temp files
2: Failure, Visibility and Retry
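The "Unit Test" row above can be sketched concretely. A pytest-style test for a small transformation; the function and test names are illustrative, not from any pipeline in this document:

```python
# Hypothetical transformation: deduplicate rows by order_id.
def drop_duplicate_orders(rows):
    """Remove rows with duplicate order_ids, keeping the first occurrence."""
    seen, out = set(), []
    for row in rows:
        if row[0] not in seen:
            seen.add(row[0])
            out.append(row)
    return out

# Pytest discovers functions named test_*; plain assert is all it needs.
def test_drop_duplicate_orders():
    rows = [(1, 250), (2, 150), (1, 250)]
    assert drop_duplicate_orders(rows) == [(1, 250), (2, 150)]

test_drop_duplicate_orders()
```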
CREATE VIEW Combined_Portfolio AS
SELECT
    portfolio_id,
    amount,
    'Portfolio_two' AS source_table
FROM
    Portfolio_two
UNION ALL
SELECT
    t1.portfolio_id,
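The UNION ALL pattern behind this view can be checked end to end with an in-memory SQLite sketch; the rows and the second table's name are made up for illustration:

```python
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE Portfolio_one (portfolio_id INTEGER, amount REAL)")
conn.execute("CREATE TABLE Portfolio_two (portfolio_id INTEGER, amount REAL)")
conn.execute("INSERT INTO Portfolio_one VALUES (1, 100.0)")
conn.execute("INSERT INTO Portfolio_two VALUES (2, 200.0)")

# Tag each branch with its source table, then stack the rows (duplicates kept).
conn.execute("""
    CREATE VIEW Combined_Portfolio AS
    SELECT portfolio_id, amount, 'Portfolio_one' AS source_table FROM Portfolio_one
    UNION ALL
    SELECT portfolio_id, amount, 'Portfolio_two' AS source_table FROM Portfolio_two
""")
rows = conn.execute(
    "SELECT * FROM Combined_Portfolio ORDER BY portfolio_id"
).fetchall()
print(rows)
```

Unlike plain UNION, UNION ALL does not deduplicate, so every source row appears exactly once per branch it came from.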
1. Partitioned Writes:

# Option A: write the DataFrame partitioned by Product and Date directly to S3
df.write.mode("overwrite").partitionBy("Product", "Date").csv(output_s3_base_path, header=True)

# Option B: keep Product as a regular column and partition by Date only
df = df.select("Product", "Date", "Amount")
# Write the DataFrame partitioned only by Date
df.write.mode("overwrite").partitionBy("Date").csv(output_s3_base_path, header=True)
import sys
from awsglue.context import GlueContext
from pyspark.context import SparkContext
from awsglue.dynamicframe import DynamicFrame

# Initialize Glue context
sc = SparkContext()
glueContext = GlueContext(sc)

# Read the CSV file from S3 (bucket and prefix below are placeholders)
dyf = glueContext.create_dynamic_frame.from_options(
    connection_type="s3",
    connection_options={"paths": ["s3://<bucket>/<prefix>/"]},
    format="csv",
    format_options={"withHeader": True},
)
import asyncio

# 1: Coroutines:
"""
Coroutines are functions that can pause and resume their execution.
They are defined using the async def syntax. Inside another coroutine you call
one with await; at the top level you run one with asyncio.run().
"""
async def my_coroutine():
    print("Hello")
    await asyncio.sleep(1)
    print("World")

asyncio.run(my_coroutine())
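A short follow-on sketch: running two coroutines concurrently with asyncio.gather, which schedules them together and returns their results in argument order (the fetch name and delays are illustrative):

```python
import asyncio

async def fetch(label, delay):
    # Simulate I/O with a non-blocking sleep, then return the label.
    await asyncio.sleep(delay)
    return label

async def main():
    # Both coroutines run concurrently; total wall time is ~max(delays).
    return await asyncio.gather(fetch("a", 0.01), fetch("b", 0.0))

results = asyncio.run(main())
print(results)  # ['a', 'b'] -- gather preserves argument order, not finish order
```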