Skip to content

Instantly share code, notes, and snippets.

@dzogrim
Created December 2, 2025 15:17
Show Gist options
  • Select an option

  • Save dzogrim/5382a0afa0b08d0d2ce7f60162fcbd23 to your computer and use it in GitHub Desktop.

Select an option

Save dzogrim/5382a0afa0b08d0d2ce7f60162fcbd23 to your computer and use it in GitHub Desktop.
This custom script compares and filters two related inventory CSV files.
#!/usr/bin/env bash
# -----------------------------------------------------------------------------
# cross-filter-inventory.sh
# -----------------------------------------------------------------------------
# Description:
# This custom script compares and filters two related inventory CSV files:
# 1) An anonymized "updates" CSV (e.g.: inventory-anon-updates.csv)
# 2) A confidential detailed inventory CSV (e.g.: inventory-detailed.csv)
#
# It filters out:
# - Lines that are not marked "OK" (case-insensitive) in the first column
# of the detailed inventory.
# - Entries whose H/W "TAG" already exists in the updates CSV
# (with TAG normalization removes leading zeros: 01218 → 1218).
# - Entries whose last name already appears in the updates CSV.
# - Entries whose last name contains specific keywords listed in the
# FILTER_NAMES array (case-insensitive).
#
# Remaining valid rows are printed to stdout.
# Before printing, the CSV is "compacted":
# - Any column that is empty for all remaining rows is removed.
#
# With the `--truncate` flag, the first and last columns of the compacted
# CSV output are removed (e.g. drop column 1 "OK" and the last field).
# -----------------------------------------------------------------------------
# Usage:
# ./cross-filter-inventory.sh [--truncate] <updates.csv> <inventory.csv>
#
# Examples that generates a new cleaned-up CSV file:
# ./cross-filter-inventory.sh inventory-anon-updates.csv \
# inventory-detailed.csv > filtered.csv
#
# ./cross-filter-inventory.sh --truncate inventory-anon-updates.csv \
# inventory-detailed.csv > filtered_clean.csv
# -----------------------------------------------------------------------------
# Output:
# - Filtered CSV rows on stdout.
# - Final line count on stderr.
# -----------------------------------------------------------------------------
# Requirements:
# - Bash 4+
# - AWK (POSIX awk or GNU awk)
# -----------------------------------------------------------------------------
# MIT License
#
# Copyright (c) 2024-2025 Sébastien L.
# -----------------------------------------------------------------------------
set -euo pipefail
# -----------------------------------------------------------------------------
# Configuration
# -----------------------------------------------------------------------------
# Keywords that should cause a row to be dropped if found in owner or last name
FILTER_NAMES=(
stock
RECOVERY
majeur
)
# -----------------------------------------------------------------------------
# Functions
# -----------------------------------------------------------------------------
usage() {
cat >&2 <<EOF
Usage: $0 [--truncate] <updates.csv> <detailed-inventory.csv>
Compare an anonymized "updates" CSV with a detailed inventory CSV and print
only the 'missing' entries from the detailed inventory that are not already
present in the updates (based on TAG and last name).
Options:
--truncate Remove first and last columns from the compacted CSV output.
-h, --help Show this help message and exit.
EOF
}
# -----------------------------------------------------------------------------
# Argument parsing
# -----------------------------------------------------------------------------
TRUNCATE="false"
while (( $# > 0 )); do
case "$1" in
--truncate)
TRUNCATE="true"
shift
;;
-h|--help)
usage
exit 0
;;
--)
shift
break
;;
-*)
echo "Error: unknown option: $1" >&2
usage
exit 1
;;
*)
# first non-option: break to treat remaining as positional args
break
;;
esac
done
if (( $# != 2 )); then
echo "Error: expected 2 arguments, got $#." >&2
usage
exit 1
fi
upd=$1 # updates CSV (anonymous)
inv=$2 # detailed inventory CSV
# Flatten FILTER_NAMES into a single space-separated string for AWK.
# Using "${FILTER_NAMES[*]}" preserves words, separated by spaces.
FILTERS_STR=${FILTER_NAMES[*]:-}
# -----------------------------------------------------------------------------
# Core AWK logic
# -----------------------------------------------------------------------------
awk -v filters="$FILTERS_STR" -v truncate="$TRUNCATE" -F',' '
BEGIN {
FS = ","
OFS = ","
printed = 0 # number of accepted lines
line_count = 0 # number of stored lines
max_nf = 0 # max number of fields seen
# build a table of forbidden words from FILTER_NAMES
n = split(filters, f, " ")
for (i = 1; i <= n; i++) {
if (f[i] == "") {
continue
}
bad[toupper(f[i])] = 1
}
}
# -------------------------------------------------------------------
# First file: updates CSV (anonymous "reference" entries)
# NR == FNR means: still reading the first input file
# -------------------------------------------------------------------
NR == FNR {
# Skip header line
if (FNR == 1) {
next
}
# Column 1: TAG
tag = $1
sub(/^0+/, "", tag) # normalize: remove leading zeros
if (tag != "") {
done_tag[tag] = 1
}
# Column 6: last name (in your input format)
lname = toupper($6)
gsub(/^ +| +$/, "", lname)
if (lname != "") {
done_name[lname] = 1
}
next
}
# -------------------------------------------------------------------
# Second file: detailed inventory CSV (candidate entries)
# -------------------------------------------------------------------
# Skip header line of the second file
FNR == 1 {
next
}
# Keep only lines with OK (case-insensitive) in the first column
toupper($1) != "OK" {
next
}
{
# Column 4: TAG in the detailed inventory
itag = $4
sub(/^0+/, "", itag) # normalize TAG here as well
# Column 2: owner string, we infer last name from it
owner = $2
up_owner = toupper(owner)
split(up_owner, a, /[[:space:]]+/)
own_lname = a[length(a)] # last token considered as last name
# -----------------------------------------------------------------
# 1) Filter using FILTER_NAMES (keywords in owner or last name)
# -----------------------------------------------------------------
for (k in bad) {
if (k == "") {
continue
}
if (index(up_owner, k) > 0 || index(own_lname, k) > 0) {
next
}
}
# -----------------------------------------------------------------
# 2) Filter using updates data (TAG or last name already present)
# -----------------------------------------------------------------
if (itag in done_tag) {
next
}
if (own_lname in done_name) {
next
}
# -----------------------------------------------------------------
# 3) Line accepted: store it and track non-empty columns
# -----------------------------------------------------------------
printed++
line_count++
lines[line_count] = $0
if (NF > max_nf) {
max_nf = NF
}
for (i = 1; i <= NF; i++) {
tmp = $i
gsub(/^ +| +$/, "", tmp)
if (tmp != "") {
non_empty[i] = 1 # mark this column as having data
}
}
}
END {
# -----------------------------------------------------------------
# Build the list of columns that are not globally empty
# -----------------------------------------------------------------
keep_count = 0
for (i = 1; i <= max_nf; i++) {
if (i in non_empty) {
keep_count++
keep[keep_count] = i
}
}
# -----------------------------------------------------------------
# Optional truncation of first and last *output* columns
# -----------------------------------------------------------------
start = 1
end = keep_count
if (truncate == "true" && keep_count > 1) {
start = 2
end = keep_count - 1
}
# -----------------------------------------------------------------
# Reprint all stored lines with only the selected columns
# -----------------------------------------------------------------
for (idx = 1; idx <= line_count; idx++) {
n = split(lines[idx], f, FS)
out = ""
for (k = start; k <= end; k++) {
col = keep[k]
val = (col <= n ? f[col] : "")
if (k > start) {
out = out OFS
}
out = out val
}
print out
}
# -----------------------------------------------------------------
# Print the number of lines to stderr (does NOT pollute a new CSV)
# -----------------------------------------------------------------
# blue = 34, yellow = 33, reset = 0
print "\033[1;34mNumber of printed lines:\033[0m \033[1;33m" printed "\033[0m" > "/dev/stderr"
}
' "$upd" "$inv"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment