Created
December 2, 2025 15:17
-
-
Save dzogrim/5382a0afa0b08d0d2ce7f60162fcbd23 to your computer and use it in GitHub Desktop.
This custom script compares and filters two related inventory CSV files.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env bash | |
| # ----------------------------------------------------------------------------- | |
| # cross-filter-inventory.sh | |
| # ----------------------------------------------------------------------------- | |
| # Description: | |
| # This custom script compares and filters two related inventory CSV files: | |
| # 1) An anonymized "updates" CSV (e.g.: inventory-anon-updates.csv) | |
| # 2) A confidential detailed inventory CSV (e.g.: inventory-detailed.csv) | |
| # | |
| # It filters out: | |
| # - Lines that are not marked "OK" (case-insensitive) in the first column | |
| # of the detailed inventory. | |
| # - Entries whose H/W "TAG" already exists in the updates CSV | |
| #       (TAG normalization removes leading zeros: 01218 → 1218). | |
| # - Entries whose last name already appears in the updates CSV. | |
| # - Entries whose last name contains specific keywords listed in the | |
| # FILTER_NAMES array (case-insensitive). | |
| # | |
| # Remaining valid rows are printed to stdout. | |
| # Before printing, the CSV is "compacted": | |
| # - Any column that is empty for all remaining rows is removed. | |
| # | |
| # With the `--truncate` flag, the first and last columns of the compacted | |
| # CSV output are removed (e.g. drop column 1 "OK" and the last field). | |
| # ----------------------------------------------------------------------------- | |
| # Usage: | |
| # ./cross-filter-inventory.sh [--truncate] <updates.csv> <inventory.csv> | |
| # | |
| # Examples that generate a new cleaned-up CSV file: | |
| # ./cross-filter-inventory.sh inventory-anon-updates.csv \ | |
| # inventory-detailed.csv > filtered.csv | |
| # | |
| # ./cross-filter-inventory.sh --truncate inventory-anon-updates.csv \ | |
| # inventory-detailed.csv > filtered_clean.csv | |
| # ----------------------------------------------------------------------------- | |
| # Output: | |
| # - Filtered CSV rows on stdout. | |
| # - Final line count on stderr. | |
| # ----------------------------------------------------------------------------- | |
| # Requirements: | |
| # - Bash 4+ | |
| # - AWK (POSIX awk or GNU awk) | |
| # ----------------------------------------------------------------------------- | |
| # MIT License | |
| # | |
| # Copyright (c) 2024-2025 Sébastien L. | |
| # ----------------------------------------------------------------------------- | |
| set -euo pipefail | |
| # ----------------------------------------------------------------------------- | |
| # Configuration | |
| # ----------------------------------------------------------------------------- | |
| # Keywords that should cause a row to be dropped if found in owner or last name | |
| FILTER_NAMES=( | |
| stock | |
| RECOVERY | |
| majeur | |
| ) | |
| # ----------------------------------------------------------------------------- | |
| # Functions | |
| # ----------------------------------------------------------------------------- | |
| usage() { | |
| cat >&2 <<EOF | |
| Usage: $0 [--truncate] <updates.csv> <detailed-inventory.csv> | |
| Compare an anonymized "updates" CSV with a detailed inventory CSV and print | |
| only the 'missing' entries from the detailed inventory that are not already | |
| present in the updates (based on TAG and last name). | |
| Options: | |
| --truncate Remove first and last columns from the compacted CSV output. | |
| -h, --help Show this help message and exit. | |
| EOF | |
| } | |
| # ----------------------------------------------------------------------------- | |
| # Argument parsing | |
| # ----------------------------------------------------------------------------- | |
| TRUNCATE="false" | |
| while (( $# > 0 )); do | |
| case "$1" in | |
| --truncate) | |
| TRUNCATE="true" | |
| shift | |
| ;; | |
| -h|--help) | |
| usage | |
| exit 0 | |
| ;; | |
| --) | |
| shift | |
| break | |
| ;; | |
| -*) | |
| echo "Error: unknown option: $1" >&2 | |
| usage | |
| exit 1 | |
| ;; | |
| *) | |
| # first non-option: break to treat remaining as positional args | |
| break | |
| ;; | |
| esac | |
| done | |
| if (( $# != 2 )); then | |
| echo "Error: expected 2 arguments, got $#." >&2 | |
| usage | |
| exit 1 | |
| fi | |
| upd=$1 # updates CSV (anonymous) | |
| inv=$2 # detailed inventory CSV | |
| # Flatten FILTER_NAMES into a single space-separated string for AWK. | |
| # Using "${FILTER_NAMES[*]}" preserves words, separated by spaces. | |
| FILTERS_STR=${FILTER_NAMES[*]:-} | |
| # ----------------------------------------------------------------------------- | |
| # Core AWK logic | |
| # ----------------------------------------------------------------------------- | |
| awk -v filters="$FILTERS_STR" -v truncate="$TRUNCATE" -F',' ' | |
| BEGIN { | |
| FS = "," | |
| OFS = "," | |
| printed = 0 # number of accepted lines | |
| line_count = 0 # number of stored lines | |
| max_nf = 0 # max number of fields seen | |
| # build a table of forbidden words from FILTER_NAMES | |
| n = split(filters, f, " ") | |
| for (i = 1; i <= n; i++) { | |
| if (f[i] == "") { | |
| continue | |
| } | |
| bad[toupper(f[i])] = 1 | |
| } | |
| } | |
| # ------------------------------------------------------------------- | |
| # First file: updates CSV (anonymous "reference" entries) | |
| # NR == FNR means: still reading the first input file | |
| # ------------------------------------------------------------------- | |
| NR == FNR { | |
| # Skip header line | |
| if (FNR == 1) { | |
| next | |
| } | |
| # Column 1: TAG | |
| tag = $1 | |
| sub(/^0+/, "", tag) # normalize: remove leading zeros | |
| if (tag != "") { | |
| done_tag[tag] = 1 | |
| } | |
| # Column 6: last name (in your input format) | |
| lname = toupper($6) | |
| gsub(/^ +| +$/, "", lname) | |
| if (lname != "") { | |
| done_name[lname] = 1 | |
| } | |
| next | |
| } | |
| # ------------------------------------------------------------------- | |
| # Second file: detailed inventory CSV (candidate entries) | |
| # ------------------------------------------------------------------- | |
| # Skip header line of the second file | |
| FNR == 1 { | |
| next | |
| } | |
| # Keep only lines with OK (case-insensitive) in the first column | |
| toupper($1) != "OK" { | |
| next | |
| } | |
| { | |
| # Column 4: TAG in the detailed inventory | |
| itag = $4 | |
| sub(/^0+/, "", itag) # normalize TAG here as well | |
| # Column 2: owner string, we infer last name from it | |
| owner = $2 | |
| up_owner = toupper(owner) | |
| split(up_owner, a, /[[:space:]]+/) | |
| own_lname = a[length(a)] # last token considered as last name | |
| # ----------------------------------------------------------------- | |
| # 1) Filter using FILTER_NAMES (keywords in owner or last name) | |
| # ----------------------------------------------------------------- | |
| for (k in bad) { | |
| if (k == "") { | |
| continue | |
| } | |
| if (index(up_owner, k) > 0 || index(own_lname, k) > 0) { | |
| next | |
| } | |
| } | |
| # ----------------------------------------------------------------- | |
| # 2) Filter using updates data (TAG or last name already present) | |
| # ----------------------------------------------------------------- | |
| if (itag in done_tag) { | |
| next | |
| } | |
| if (own_lname in done_name) { | |
| next | |
| } | |
| # ----------------------------------------------------------------- | |
| # 3) Line accepted: store it and track non-empty columns | |
| # ----------------------------------------------------------------- | |
| printed++ | |
| line_count++ | |
| lines[line_count] = $0 | |
| if (NF > max_nf) { | |
| max_nf = NF | |
| } | |
| for (i = 1; i <= NF; i++) { | |
| tmp = $i | |
| gsub(/^ +| +$/, "", tmp) | |
| if (tmp != "") { | |
| non_empty[i] = 1 # mark this column as having data | |
| } | |
| } | |
| } | |
| END { | |
| # ----------------------------------------------------------------- | |
| # Build the list of columns that are not globally empty | |
| # ----------------------------------------------------------------- | |
| keep_count = 0 | |
| for (i = 1; i <= max_nf; i++) { | |
| if (i in non_empty) { | |
| keep_count++ | |
| keep[keep_count] = i | |
| } | |
| } | |
| # ----------------------------------------------------------------- | |
| # Optional truncation of first and last *output* columns | |
| # ----------------------------------------------------------------- | |
| start = 1 | |
| end = keep_count | |
| if (truncate == "true" && keep_count > 1) { | |
| start = 2 | |
| end = keep_count - 1 | |
| } | |
| # ----------------------------------------------------------------- | |
| # Reprint all stored lines with only the selected columns | |
| # ----------------------------------------------------------------- | |
| for (idx = 1; idx <= line_count; idx++) { | |
| n = split(lines[idx], f, FS) | |
| out = "" | |
| for (k = start; k <= end; k++) { | |
| col = keep[k] | |
| val = (col <= n ? f[col] : "") | |
| if (k > start) { | |
| out = out OFS | |
| } | |
| out = out val | |
| } | |
| print out | |
| } | |
| # ----------------------------------------------------------------- | |
| # Print the number of lines to stderr (does NOT pollute a new CSV) | |
| # ----------------------------------------------------------------- | |
| # blue = 34, yellow = 33, reset = 0 | |
| print "\033[1;34mNumber of printed lines:\033[0m \033[1;33m" printed "\033[0m" > "/dev/stderr" | |
| } | |
| ' "$upd" "$inv" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment