dzogrim/cross-filter-inventory.sh

## cross-filter-inventory.sh
#!/usr/bin/env bash
# -----------------------------------------------------------------------------
#  cross-filter-inventory.sh
# -----------------------------------------------------------------------------
#  Description:
#    This custom script compares and filters two related inventory CSV files:
#      1) An anonymized "updates" CSV (e.g.: inventory-anon-updates.csv)
#      2) A confidential detailed inventory CSV (e.g.: inventory-detailed.csv)
#
#    It filters out:
#      - Lines that are not marked "OK" (case-insensitive) in the first column
#        of the detailed inventory.
#      - Entries whose H/W "TAG" already exists in the updates CSV
#        (with TAG normalization removes leading zeros: 01218 → 1218).
#      - Entries whose last name already appears in the updates CSV.
#      - Entries whose last name contains specific keywords listed in the
#        FILTER_NAMES array (case-insensitive).
#
#    Remaining valid rows are printed to stdout.
#    Before printing, the CSV is "compacted":
#      - Any column that is empty for all remaining rows is removed.
#
#    With the `--truncate` flag, the first and last columns of the compacted
#    CSV output are removed (e.g. drop column 1 "OK" and the last field).
# -----------------------------------------------------------------------------
#  Usage:
#      ./cross-filter-inventory.sh [--truncate] <updates.csv> <inventory.csv>
#
#    Examples that generates a new cleaned-up CSV file:
#      ./cross-filter-inventory.sh inventory-anon-updates.csv \
#                            inventory-detailed.csv > filtered.csv
#
#      ./cross-filter-inventory.sh --truncate inventory-anon-updates.csv \
#                            inventory-detailed.csv > filtered_clean.csv
# -----------------------------------------------------------------------------
#  Output:
#      - Filtered CSV rows on stdout.
#      - Final line count on stderr.
# -----------------------------------------------------------------------------
#  Requirements:
#      - Bash 4+
#      - AWK (POSIX awk or GNU awk)
# -----------------------------------------------------------------------------
#  MIT License
#
#  Copyright (c) 2024-2025 Sébastien L.
# -----------------------------------------------------------------------------
set -euo pipefail

# -----------------------------------------------------------------------------
# Configuration
# -----------------------------------------------------------------------------
# Keywords that should cause a row to be dropped if found in owner or last name
FILTER_NAMES=(
  stock
  RECOVERY
  majeur
)

# -----------------------------------------------------------------------------
# Functions
# -----------------------------------------------------------------------------
usage() {
  cat >&2 <<EOF
Usage: $0 [--truncate] <updates.csv> <detailed-inventory.csv>

Compare an anonymized "updates" CSV with a detailed inventory CSV and print
only the 'missing' entries from the detailed inventory that are not already
present in the updates (based on TAG and last name).

Options:
  --truncate    Remove first and last columns from the compacted CSV output.
  -h, --help    Show this help message and exit.
EOF
}

# -----------------------------------------------------------------------------
# Argument parsing
# -----------------------------------------------------------------------------
TRUNCATE="false"
while (( $# > 0 )); do
  case "$1" in
    --truncate)
      TRUNCATE="true"
      shift
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    --)
      shift
      break
      ;;
    -*)
      echo "Error: unknown option: $1" >&2
      usage
      exit 1
      ;;
    *)
      # first non-option: break to treat remaining as positional args
      break
      ;;
  esac
done

if (( $# != 2 )); then
  echo "Error: expected 2 arguments, got $#." >&2
  usage
  exit 1
fi

upd=$1   # updates CSV (anonymous)
inv=$2   # detailed inventory CSV

# Flatten FILTER_NAMES into a single space-separated string for AWK.
# Using "${FILTER_NAMES[*]}" preserves words, separated by spaces.
FILTERS_STR=${FILTER_NAMES[*]:-}

# -----------------------------------------------------------------------------
# Core AWK logic
# -----------------------------------------------------------------------------
awk -v filters="$FILTERS_STR" -v truncate="$TRUNCATE" -F',' '
BEGIN {
  FS  = ","
  OFS = ","

  printed   = 0     # number of accepted lines
  line_count = 0    # number of stored lines
  max_nf     = 0    # max number of fields seen

  # build a table of forbidden words from FILTER_NAMES
  n = split(filters, f, " ")
  for (i = 1; i <= n; i++) {
    if (f[i] == "") {
      continue
    }
    bad[toupper(f[i])] = 1
  }
}

# -------------------------------------------------------------------
# First file: updates CSV (anonymous "reference" entries)
# NR == FNR means: still reading the first input file
# -------------------------------------------------------------------
NR == FNR {
  # Skip header line
  if (FNR == 1) {
    next
  }

  # Column 1: TAG
  tag = $1
  sub(/^0+/, "", tag)       # normalize: remove leading zeros

  if (tag != "") {
    done_tag[tag] = 1
  }

  # Column 6: last name (in your input format)
  lname = toupper($6)
  gsub(/^ +| +$/, "", lname)

  if (lname != "") {
    done_name[lname] = 1
  }

  next
}

# -------------------------------------------------------------------
# Second file: detailed inventory CSV (candidate entries)
# -------------------------------------------------------------------
# Skip header line of the second file
FNR == 1 {
  next
}

# Keep only lines with OK (case-insensitive) in the first column
toupper($1) != "OK" {
  next
}

{
  # Column 4: TAG in the detailed inventory
  itag = $4
  sub(/^0+/, "", itag)      # normalize TAG here as well

  # Column 2: owner string, we infer last name from it
  owner    = $2
  up_owner = toupper(owner)
  split(up_owner, a, /[[:space:]]+/)
  own_lname = a[length(a)] # last token considered as last name

  # -----------------------------------------------------------------
  # 1) Filter using FILTER_NAMES (keywords in owner or last name)
  # -----------------------------------------------------------------
  for (k in bad) {
    if (k == "") {
      continue
    }
    if (index(up_owner, k) > 0 || index(own_lname, k) > 0) {
      next
    }
  }

  # -----------------------------------------------------------------
  # 2) Filter using updates data (TAG or last name already present)
  # -----------------------------------------------------------------
  if (itag in done_tag) {
    next
  }
  if (own_lname in done_name) {
    next
  }

  # -----------------------------------------------------------------
  # 3) Line accepted: store it and track non-empty columns
  # -----------------------------------------------------------------
  printed++
  line_count++
  lines[line_count] = $0

  if (NF > max_nf) {
    max_nf = NF
  }

  for (i = 1; i <= NF; i++) {
    tmp = $i
    gsub(/^ +| +$/, "", tmp)
    if (tmp != "") {
      non_empty[i] = 1   # mark this column as having data
    }
  }
}

END {
  # -----------------------------------------------------------------
  # Build the list of columns that are not globally empty
  # -----------------------------------------------------------------
  keep_count = 0
  for (i = 1; i <= max_nf; i++) {
    if (i in non_empty) {
      keep_count++
      keep[keep_count] = i
    }
  }

  # -----------------------------------------------------------------
  # Optional truncation of first and last *output* columns
  # -----------------------------------------------------------------
  start = 1
  end   = keep_count

  if (truncate == "true" && keep_count > 1) {
    start = 2
    end   = keep_count - 1
  }

  # -----------------------------------------------------------------
  # Reprint all stored lines with only the selected columns
  # -----------------------------------------------------------------
  for (idx = 1; idx <= line_count; idx++) {
    n = split(lines[idx], f, FS)
    out = ""

    for (k = start; k <= end; k++) {
      col = keep[k]
      val = (col <= n ? f[col] : "")

      if (k > start) {
        out = out OFS
      }
      out = out val
    }

    print out
  }

  # -----------------------------------------------------------------
  # Print the number of lines to stderr (does NOT pollute a new CSV)
  # -----------------------------------------------------------------
  # blue = 34, yellow = 33, reset = 0
  print "\033[1;34mNumber of printed lines:\033[0m \033[1;33m" printed "\033[0m" > "/dev/stderr"
}
' "$upd" "$inv"
	#!/usr/bin/env bash
	# -----------------------------------------------------------------------------
	# cross-filter-inventory.sh
	# -----------------------------------------------------------------------------
	# Description:
	# This custom script compares and filters two related inventory CSV files:
	# 1) An anonymized "updates" CSV (e.g.: inventory-anon-updates.csv)
	# 2) A confidential detailed inventory CSV (e.g.: inventory-detailed.csv)
	#
	# It filters out:
	# - Lines that are not marked "OK" (case-insensitive) in the first column
	# of the detailed inventory.
	# - Entries whose H/W "TAG" already exists in the updates CSV
	# (with TAG normalization removes leading zeros: 01218 → 1218).
	# - Entries whose last name already appears in the updates CSV.
	# - Entries whose last name contains specific keywords listed in the
	# FILTER_NAMES array (case-insensitive).
	#
	# Remaining valid rows are printed to stdout.
	# Before printing, the CSV is "compacted":
	# - Any column that is empty for all remaining rows is removed.
	#
	# With the `--truncate` flag, the first and last columns of the compacted
	# CSV output are removed (e.g. drop column 1 "OK" and the last field).
	# -----------------------------------------------------------------------------
	# Usage:
	# ./cross-filter-inventory.sh [--truncate] <updates.csv> <inventory.csv>
	#
	# Examples that generates a new cleaned-up CSV file:
	# ./cross-filter-inventory.sh inventory-anon-updates.csv \
	# inventory-detailed.csv > filtered.csv
	#
	# ./cross-filter-inventory.sh --truncate inventory-anon-updates.csv \
	# inventory-detailed.csv > filtered_clean.csv
	# -----------------------------------------------------------------------------
	# Output:
	# - Filtered CSV rows on stdout.
	# - Final line count on stderr.
	# -----------------------------------------------------------------------------
	# Requirements:
	# - Bash 4+
	# - AWK (POSIX awk or GNU awk)
	# -----------------------------------------------------------------------------
	# MIT License
	#
	# Copyright (c) 2024-2025 Sébastien L.
	# -----------------------------------------------------------------------------
	set -euo pipefail

	# -----------------------------------------------------------------------------
	# Configuration
	# -----------------------------------------------------------------------------
	# Keywords that should cause a row to be dropped if found in owner or last name
	FILTER_NAMES=(
	stock
	RECOVERY
	majeur
	)

	# -----------------------------------------------------------------------------
	# Functions
	# -----------------------------------------------------------------------------
	usage() {
	cat >&2 <<EOF
	Usage: $0 [--truncate] <updates.csv> <detailed-inventory.csv>

	Compare an anonymized "updates" CSV with a detailed inventory CSV and print
	only the 'missing' entries from the detailed inventory that are not already
	present in the updates (based on TAG and last name).

	Options:
	--truncate Remove first and last columns from the compacted CSV output.
	-h, --help Show this help message and exit.
	EOF
	}

	# -----------------------------------------------------------------------------
	# Argument parsing
	# -----------------------------------------------------------------------------
	TRUNCATE="false"
	while (( $# > 0 )); do
	case "$1" in
	--truncate)
	TRUNCATE="true"
	shift
	;;
	-h\|--help)
	usage
	exit 0
	;;
	--)
	shift
	break
	;;
	-*)
	echo "Error: unknown option: $1" >&2
	usage
	exit 1
	;;
	*)
	# first non-option: break to treat remaining as positional args
	break
	;;
	esac
	done

	if (( $# != 2 )); then
	echo "Error: expected 2 arguments, got $#." >&2
	usage
	exit 1
	fi

	upd=$1 # updates CSV (anonymous)
	inv=$2 # detailed inventory CSV

	# Flatten FILTER_NAMES into a single space-separated string for AWK.
	# Using "${FILTER_NAMES[*]}" preserves words, separated by spaces.
	FILTERS_STR=${FILTER_NAMES[*]:-}

	# -----------------------------------------------------------------------------
	# Core AWK logic
	# -----------------------------------------------------------------------------
	awk -v filters="$FILTERS_STR" -v truncate="$TRUNCATE" -F',' '
	BEGIN {
	FS = ","
	OFS = ","

	printed = 0 # number of accepted lines
	line_count = 0 # number of stored lines
	max_nf = 0 # max number of fields seen

	# build a table of forbidden words from FILTER_NAMES
	n = split(filters, f, " ")
	for (i = 1; i <= n; i++) {
	if (f[i] == "") {
	continue
	}
	bad[toupper(f[i])] = 1
	}
	}

	# -------------------------------------------------------------------
	# First file: updates CSV (anonymous "reference" entries)
	# NR == FNR means: still reading the first input file
	# -------------------------------------------------------------------
	NR == FNR {
	# Skip header line
	if (FNR == 1) {
	next
	}

	# Column 1: TAG
	tag = $1
	sub(/^0+/, "", tag) # normalize: remove leading zeros

	if (tag != "") {
	done_tag[tag] = 1
	}

	# Column 6: last name (in your input format)
	lname = toupper($6)
	gsub(/^ +\| +$/, "", lname)

	if (lname != "") {
	done_name[lname] = 1
	}

	next
	}

	# -------------------------------------------------------------------
	# Second file: detailed inventory CSV (candidate entries)
	# -------------------------------------------------------------------
	# Skip header line of the second file
	FNR == 1 {
	next
	}

	# Keep only lines with OK (case-insensitive) in the first column
	toupper($1) != "OK" {
	next
	}

	{
	# Column 4: TAG in the detailed inventory
	itag = $4
	sub(/^0+/, "", itag) # normalize TAG here as well

	# Column 2: owner string, we infer last name from it
	owner = $2
	up_owner = toupper(owner)
	split(up_owner, a, /[[:space:]]+/)
	own_lname = a[length(a)] # last token considered as last name

	# -----------------------------------------------------------------
	# 1) Filter using FILTER_NAMES (keywords in owner or last name)
	# -----------------------------------------------------------------
	for (k in bad) {
	if (k == "") {
	continue
	}
	if (index(up_owner, k) > 0 \|\| index(own_lname, k) > 0) {
	next
	}
	}

	# -----------------------------------------------------------------
	# 2) Filter using updates data (TAG or last name already present)
	# -----------------------------------------------------------------
	if (itag in done_tag) {
	next
	}
	if (own_lname in done_name) {
	next
	}

	# -----------------------------------------------------------------
	# 3) Line accepted: store it and track non-empty columns
	# -----------------------------------------------------------------
	printed++
	line_count++
	lines[line_count] = $0

	if (NF > max_nf) {
	max_nf = NF
	}

	for (i = 1; i <= NF; i++) {
	tmp = $i
	gsub(/^ +\| +$/, "", tmp)
	if (tmp != "") {
	non_empty[i] = 1 # mark this column as having data
	}
	}
	}

	END {
	# -----------------------------------------------------------------
	# Build the list of columns that are not globally empty
	# -----------------------------------------------------------------
	keep_count = 0
	for (i = 1; i <= max_nf; i++) {
	if (i in non_empty) {
	keep_count++
	keep[keep_count] = i
	}
	}

	# -----------------------------------------------------------------
	# Optional truncation of first and last output columns
	# -----------------------------------------------------------------
	start = 1
	end = keep_count

	if (truncate == "true" && keep_count > 1) {
	start = 2
	end = keep_count - 1
	}

	# -----------------------------------------------------------------
	# Reprint all stored lines with only the selected columns
	# -----------------------------------------------------------------
	for (idx = 1; idx <= line_count; idx++) {
	n = split(lines[idx], f, FS)
	out = ""

	for (k = start; k <= end; k++) {
	col = keep[k]
	val = (col <= n ? f[col] : "")

	if (k > start) {
	out = out OFS
	}
	out = out val
	}

	print out
	}

	# -----------------------------------------------------------------
	# Print the number of lines to stderr (does NOT pollute a new CSV)
	# -----------------------------------------------------------------
	# blue = 34, yellow = 33, reset = 0
	print "\033[1;34mNumber of printed lines:\033[0m \033[1;33m" printed "\033[0m" > "/dev/stderr"
	}
	' "$upd" "$inv"
No results found