Skip to content

Instantly share code, notes, and snippets.

@waylan
Last active September 24, 2025 20:02
Show Gist options
  • Select an option

  • Save waylan/6ce435230c2e9749f60e1b3cff4d675b to your computer and use it in GitHub Desktop.

Select an option

Save waylan/6ce435230c2e9749f60e1b3cff4d675b to your computer and use it in GitHub Desktop.
Compare two excel files and report differences in the records.
#!/usr/bin/env python3
# /// script
# dependencies = [
# "pandas",
# "openpyxl",
# "click",
# ]
# ///
'''
Compare two excel files and report differences in the records.
Updated: 2025-09-24
Author: Waylan Limberg
'''
import click
import datetime
import pandas as pd
class Mutex(click.Option):
def __init__(self, *args, **kwargs):
self.not_required_if:list = kwargs.pop("not_required_if")
assert self.not_required_if, "'not_required_if' parameter required"
super(Mutex, self).__init__(*args, **kwargs)
def handle_parse_result(self, ctx, opts, args):
current_opt:bool = self.name in opts
for mutex_opt in self.not_required_if:
if mutex_opt in opts:
if current_opt:
raise click.UsageError(f"Illegal usage: '{self.name}' is mutually exclusive with '{mutex_opt}'.")
else:
self.required = False
return super(Mutex, self).handle_parse_result(ctx, opts, args)
@click.command()
@click.option('-o', '--old', type=click.File('rb'), required=True, help='Old file to compare against.')
@click.option('-n', '--new', type=click.File('rb'), required=True, help='New file to compare against.')
@click.option(
'-k', '--key-column', type=click.STRING, required=True, cls=Mutex, not_required_if=['list_columns'],
help='Name of column to use as unique identifier.'
)
@click.option(
'-c', '--compare-column', type=click.STRING, cls=Mutex, not_required_if=['list_columns'],
help='Name of column to compare for changes.'
)
@click.option('-l', '--list-columns', is_flag=True, help='Show list of columns and exit.')
@click.help_option('-h', '--help')
@click.argument('filename', type=click.STRING, default=None, required=False)
def main(old, new, key_column, compare_column, list_columns, filename):
'''
Compare two excel files and report the differences.
Differences are reported in FILENAME, which should be different from
either the old or new file. If FILENAME is not provided, then a filename
is generated based in the current date.
The -k/--key-column option defines the column which should be unique for
each record. This would usually be a column which contains an
identification number or similar data.
If the -c/--compare-column option is provided, changed records only
compares the provided column. Otherwise, all columns are compared.
It is generally expected that both files will contain the same columns of
data with the same column names. If a column is in only one of the files,
it will be ignored. To see a list of columns for each file, use the
-l/--list-columns option. No output file is generated when the
-l/--list-columns option is used.
The output file contains 3 sheets:
New Records:
This sheet contains all rows which are only found in the new file.
Removed Records:
This sheet contains all rows which are only found in the old file.
Changed Records:
This sheet contains rows which were found in both files, but have
different data.
'''
try:
click.echo('Loading old file... ', nl=False)
old = pd.read_excel(old)
except Exception as e:
click.echo(f'\nFailed to open old file: {e}', err=True)
else:
click.echo('Success!')
try:
click.echo('Loading new file... ', nl=False)
new = pd.read_excel(new)
except Exception as e:
click.echo(f'\nFailed to open new file: {e}', err=True)
else:
click.echo('Success!')
if list_columns:
# Display list of columns and exit
click.echo(f'\nOld columns: {list(old.columns)}')
click.echo(f'New columns: {list(new.columns)}')
return
filename = filename or f'diff{ datetime.datetime.now().strftime("%Y%m%d") }.xlsx'
with pd.ExcelWriter(f'{filename}') as writer:
# Compare with old results and export additions, subtractions, and transfers.
m = old.merge(new, on=[key_column], how='outer', suffixes=['_old', '_new'], indicator=True)
# Additions
add = m.loc[m['_merge'] == 'right_only']
add = add.drop(columns=[x for x in add.columns if x.endswith('_old')] + ['_merge'])
add = add.rename(columns={x: x[:-4] for x in add.columns if x.endswith('_new')})
click.echo(f' {len(add)} new record{"s" if not len(add) == 1 else ""} found.')
add.to_excel(writer, sheet_name='New Records', index=False)
# Subtractions
sub = m.loc[m['_merge'] == 'left_only']
sub = sub.drop(columns=[x for x in sub.columns if x.endswith('_new')] + ['_merge'])
sub = sub.rename(columns={x: x[:-4] for x in sub.columns if x.endswith('_old')})
click.echo(f' {len(sub)} removed record{"s" if not len(sub) == 1 else ""} found.')
sub.to_excel(writer, sheet_name='Removed Records', index=False)
# Changes
if compare_column:
exp = f'`{compare_column}_old` != `{compare_column}_new`'
else:
exp = ' | '.join(
f'(`{col}_old` != `{col}_new`)' for col in old.columns
if f'{col}_old' in m.columns and f'{col}_new' in m.columns
)
chng = m.query(f'(_merge == "both") & ({exp})')
chng = chng.drop(columns=['_merge'])
if compare_column:
# Drop 'old' columns except compare_column
chng = chng.drop(
columns=[x for x in chng.columns if x.endswith('_old') and not x.startswith(compare_column)]
)
# Remove `_new` from column names except compare_column
chng = chng.rename(
columns={x: x[:-4] for x in chng.columns if x.endswith('_new') and not x.startswith(compare_column)}
)
# Reorder columns so that compare_column_old is directly to the left of compare_column_new
columns = list(chng.columns)
old = columns.pop(columns.index(f'{compare_column}_old'))
columns.insert(columns.index(f'{compare_column}_new'), old)
chng = chng[columns]
click.echo(f' {len(chng)} changed record{"s" if not len(chng) == 1 else ""} found.')
chng.to_excel(writer, sheet_name='Changed Records', index=False)
click.echo(f'Results saved to "{filename}"')
if __name__ == '__main__':
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment