Last active
September 24, 2025 20:02
-
-
Save waylan/6ce435230c2e9749f60e1b3cff4d675b to your computer and use it in GitHub Desktop.
Compare two excel files and report differences in the records.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| # /// script | |
| # dependencies = [ | |
| # "pandas", | |
| # "openpyxl", | |
| # "click", | |
| # ] | |
| # /// | |
| ''' | |
| Compare two excel files and report differences in the records. | |
| Updated: 2025-09-24 | |
| Author: Waylan Limberg | |
| ''' | |
| import click | |
| import datetime | |
| import pandas as pd | |
| class Mutex(click.Option): | |
| def __init__(self, *args, **kwargs): | |
| self.not_required_if:list = kwargs.pop("not_required_if") | |
| assert self.not_required_if, "'not_required_if' parameter required" | |
| super(Mutex, self).__init__(*args, **kwargs) | |
| def handle_parse_result(self, ctx, opts, args): | |
| current_opt:bool = self.name in opts | |
| for mutex_opt in self.not_required_if: | |
| if mutex_opt in opts: | |
| if current_opt: | |
| raise click.UsageError(f"Illegal usage: '{self.name}' is mutually exclusive with '{mutex_opt}'.") | |
| else: | |
| self.required = False | |
| return super(Mutex, self).handle_parse_result(ctx, opts, args) | |
| @click.command() | |
| @click.option('-o', '--old', type=click.File('rb'), required=True, help='Old file to compare against.') | |
| @click.option('-n', '--new', type=click.File('rb'), required=True, help='New file to compare against.') | |
| @click.option( | |
| '-k', '--key-column', type=click.STRING, required=True, cls=Mutex, not_required_if=['list_columns'], | |
| help='Name of column to use as unique identifier.' | |
| ) | |
| @click.option( | |
| '-c', '--compare-column', type=click.STRING, cls=Mutex, not_required_if=['list_columns'], | |
| help='Name of column to compare for changes.' | |
| ) | |
| @click.option('-l', '--list-columns', is_flag=True, help='Show list of columns and exit.') | |
| @click.help_option('-h', '--help') | |
| @click.argument('filename', type=click.STRING, default=None, required=False) | |
| def main(old, new, key_column, compare_column, list_columns, filename): | |
| ''' | |
| Compare two excel files and report the differences. | |
| Differences are reported in FILENAME, which should be different from | |
| either the old or new file. If FILENAME is not provided, then a filename | |
| is generated based in the current date. | |
| The -k/--key-column option defines the column which should be unique for | |
| each record. This would usually be a column which contains an | |
| identification number or similar data. | |
| If the -c/--compare-column option is provided, changed records only | |
| compares the provided column. Otherwise, all columns are compared. | |
| It is generally expected that both files will contain the same columns of | |
| data with the same column names. If a column is in only one of the files, | |
| it will be ignored. To see a list of columns for each file, use the | |
| -l/--list-columns option. No output file is generated when the | |
| -l/--list-columns option is used. | |
| The output file contains 3 sheets: | |
| New Records: | |
| This sheet contains all rows which are only found in the new file. | |
| Removed Records: | |
| This sheet contains all rows which are only found in the old file. | |
| Changed Records: | |
| This sheet contains rows which were found in both files, but have | |
| different data. | |
| ''' | |
| try: | |
| click.echo('Loading old file... ', nl=False) | |
| old = pd.read_excel(old) | |
| except Exception as e: | |
| click.echo(f'\nFailed to open old file: {e}', err=True) | |
| else: | |
| click.echo('Success!') | |
| try: | |
| click.echo('Loading new file... ', nl=False) | |
| new = pd.read_excel(new) | |
| except Exception as e: | |
| click.echo(f'\nFailed to open new file: {e}', err=True) | |
| else: | |
| click.echo('Success!') | |
| if list_columns: | |
| # Display list of columns and exit | |
| click.echo(f'\nOld columns: {list(old.columns)}') | |
| click.echo(f'New columns: {list(new.columns)}') | |
| return | |
| filename = filename or f'diff{ datetime.datetime.now().strftime("%Y%m%d") }.xlsx' | |
| with pd.ExcelWriter(f'{filename}') as writer: | |
| # Compare with old results and export additions, subtractions, and transfers. | |
| m = old.merge(new, on=[key_column], how='outer', suffixes=['_old', '_new'], indicator=True) | |
| # Additions | |
| add = m.loc[m['_merge'] == 'right_only'] | |
| add = add.drop(columns=[x for x in add.columns if x.endswith('_old')] + ['_merge']) | |
| add = add.rename(columns={x: x[:-4] for x in add.columns if x.endswith('_new')}) | |
| click.echo(f' {len(add)} new record{"s" if not len(add) == 1 else ""} found.') | |
| add.to_excel(writer, sheet_name='New Records', index=False) | |
| # Subtractions | |
| sub = m.loc[m['_merge'] == 'left_only'] | |
| sub = sub.drop(columns=[x for x in sub.columns if x.endswith('_new')] + ['_merge']) | |
| sub = sub.rename(columns={x: x[:-4] for x in sub.columns if x.endswith('_old')}) | |
| click.echo(f' {len(sub)} removed record{"s" if not len(sub) == 1 else ""} found.') | |
| sub.to_excel(writer, sheet_name='Removed Records', index=False) | |
| # Changes | |
| if compare_column: | |
| exp = f'`{compare_column}_old` != `{compare_column}_new`' | |
| else: | |
| exp = ' | '.join( | |
| f'(`{col}_old` != `{col}_new`)' for col in old.columns | |
| if f'{col}_old' in m.columns and f'{col}_new' in m.columns | |
| ) | |
| chng = m.query(f'(_merge == "both") & ({exp})') | |
| chng = chng.drop(columns=['_merge']) | |
| if compare_column: | |
| # Drop 'old' columns except compare_column | |
| chng = chng.drop( | |
| columns=[x for x in chng.columns if x.endswith('_old') and not x.startswith(compare_column)] | |
| ) | |
| # Remove `_new` from column names except compare_column | |
| chng = chng.rename( | |
| columns={x: x[:-4] for x in chng.columns if x.endswith('_new') and not x.startswith(compare_column)} | |
| ) | |
| # Reorder columns so that compare_column_old is directly to the left of compare_column_new | |
| columns = list(chng.columns) | |
| old = columns.pop(columns.index(f'{compare_column}_old')) | |
| columns.insert(columns.index(f'{compare_column}_new'), old) | |
| chng = chng[columns] | |
| click.echo(f' {len(chng)} changed record{"s" if not len(chng) == 1 else ""} found.') | |
| chng.to_excel(writer, sheet_name='Changed Records', index=False) | |
| click.echo(f'Results saved to "{filename}"') | |
| if __name__ == '__main__': | |
| main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment