Skip to content

Instantly share code, notes, and snippets.

Raki (@mdrakiburrahman) — 💤 living the data dream
View GitHub Profile
"""Fabric Lakehouse ODBC query runner.
Loads query definitions from a YAML file, builds a WHERE clause from caller-
supplied scope (months, services, teams, severities), and executes queries
in parallel via ThreadPoolExecutor — all sharing a single ODBC connection
(``ReuseSession=true``) so only one Livy session is created.
A warm-up ``SELECT 1`` runs first to ensure the Livy session is alive.
Then all real queries fire in parallel using cursors from the same connection.
"""
"""Fuzzy title bucketing: TF-IDF clustering → Soundex rebalancing.
Groups incident titles that are semantically similar (e.g. same alert
with a different region suffix, or "errors" vs "failures" variants) into a
single bucket label.
Pipeline
--------
1. **Normalise** — strip bracketed prefixes (``[topic=…]``), quoted strings,
``Region: …`` labels, UUIDs, IPs, timestamps, and uppercase region codes.
package me.rakirahman.quality.table.deltalake
import me.rakirahman.config.DeltaLakeConfiguration
import me.rakirahman.logging.level.LoggingConstants
import me.rakirahman.metastore.MetastoreOperations
import me.rakirahman.quality.deequ.repository.metric.spark.table.DataQualityMetadata
import me.rakirahman.quality.table.TableAnalyzer
import io.delta.tables._
package me.rakirahman.connection.fabric.sql
import me.rakirahman.connection.fabric.MetadataManager
import me.rakirahman.feeds.authentication.jwt.JwtScopeExtensions._
import me.rakirahman.feeds.authentication.jwt.JwtScopes
import com.azure.core.credential.{TokenCredential, TokenRequestContext}
import org.apache.http.client.methods.{HttpGet, HttpPost}
import org.apache.http.entity.{ContentType, StringEntity}
{
"eventTime":"2026-01-09T23:18:22.634Z",
"producer":"https://github.com/OpenLineage/OpenLineage/tree/1.23.0/integration/spark",
"schemaURL":"https://openlineage.io/spec/2-0-2/OpenLineage.json#/$defs/RunEvent",
"eventType":"START",
"run":{
"runId":"019ba50d-c8ab-798b-86ae-e437366a5a3f",
"facets":{
"parent":{
"_producer":"https://github.com/OpenLineage/OpenLineage/tree/1.23.0/integration/spark",
@mdrakiburrahman
mdrakiburrahman / otel-delta.dbml
Created May 19, 2025 21:40
OpenTelemetry schema in Delta Lake representation
Table "columnar"."log" {
"id" varchar [pk]
"event_year_date" varchar
"region" varchar
"service_name" varchar
"time_utc" datetime2
"observed_time_utc" datetime2
"scope_name" varchar
"scope_version" varchar
"trace_id" varchar
package microsoft.opentelemetrycollector.otlp.json.v1
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.StructType
// @formatter:off
/** Represents a schema for a Logs payload.
*
* @param spark
* The SparkSession for schema inference.
IFS='|' read -ra JAR_ARRAY <<<"$SYNAPSE_WORKSPACE_PACKAGE_UPLOAD_PIPE_DELIMITED"
for JAR in "${JAR_ARRAY[@]}"; do
export PACKAGE_NAME=$(echo "${JAR}" | awk -F'/' '{print $NF}')
echo "Checking if package ${PACKAGE_NAME} exists in ${SYNAPSE_WORKSPACE_NAME}"
export PACKAGE_ID_EXISTING=$(az synapse workspace-package show --workspace-name "$SYNAPSE_WORKSPACE_NAME" --name "$PACKAGE_NAME" --query "id" -o tsv)
export PACKAGE_EXISTS=false
if [ -z "$PACKAGE_ID_EXISTING" ]; then
echo "Package ${PACKAGE_NAME} does not exist in ${SYNAPSE_WORKSPACE_NAME}."
else

Pictures and stuff.

graph TD
subgraph SqlServerInstancePrimary1
createEndpoint1
grantConnectOnEndpoint1
createAvailabilityGroup1
end
subgraph SqlServerInstanceSecondary2
createEndpoint2