waylan/comparexl.py

## comparexl.py
#!/usr/bin/env python3
# /// script
# dependencies = [
#   "pandas",
#   "openpyxl",
#   "click",
# ]
# ///

'''
Compare two excel files and report differences in the records.

Updated: 2025-09-24

Author: Waylan Limberg

'''

import click
import datetime
import pandas as pd


class Mutex(click.Option):
    def __init__(self, *args, **kwargs):
        self.not_required_if:list = kwargs.pop("not_required_if")
        assert self.not_required_if, "'not_required_if' parameter required"
        super(Mutex, self).__init__(*args, **kwargs)

    def handle_parse_result(self, ctx, opts, args):
        current_opt:bool = self.name in opts
        for mutex_opt in self.not_required_if:
            if mutex_opt in opts:
                if current_opt:
                    raise click.UsageError(f"Illegal usage: '{self.name}' is mutually exclusive with '{mutex_opt}'.")
                else:
                    self.required = False
        return super(Mutex, self).handle_parse_result(ctx, opts, args)


@click.command()
@click.option('-o', '--old', type=click.File('rb'), required=True, help='Old file to compare against.')
@click.option('-n', '--new', type=click.File('rb'), required=True, help='New file to compare against.')
@click.option(
    '-k', '--key-column', type=click.STRING, required=True, cls=Mutex, not_required_if=['list_columns'],
    help='Name of column to use as unique identifier.'
)
@click.option(
    '-c', '--compare-column', type=click.STRING, cls=Mutex, not_required_if=['list_columns'],
    help='Name of column to compare for changes.'
)
@click.option('-l', '--list-columns', is_flag=True, help='Show list of columns and exit.')
@click.help_option('-h', '--help')
@click.argument('filename', type=click.STRING, default=None, required=False)
def main(old, new, key_column, compare_column, list_columns, filename):
    '''
    Compare two excel files and report the differences.

    Differences are reported in FILENAME, which should be different from
    either the old or new file. If FILENAME is not provided, then a filename
    is generated based in the current date.

    The -k/--key-column option defines the column which should be unique for
    each record. This would usually be a column which contains an
    identification number or similar data.

    If the -c/--compare-column option is provided, changed records only
    compares the provided column. Otherwise, all columns are compared.

    It is generally expected that both files will contain the same columns of
    data with the same column names. If a column is in only one of the files,
    it will be ignored. To see a list of columns for each file, use the
    -l/--list-columns option. No output file is generated when the
    -l/--list-columns option is used.

    The output file contains 3 sheets:

    New Records:

        This sheet contains all rows which are only found in the new file.

    Removed Records:

        This sheet contains all rows which are only found in the old file.

    Changed Records:

        This sheet contains rows which were found in both files, but have
        different data.

    '''

    try:
        click.echo('Loading old file... ', nl=False)
        old = pd.read_excel(old)
    except Exception as e:
        click.echo(f'\nFailed to open old file: {e}', err=True)
    else:
        click.echo('Success!')


    try:
        click.echo('Loading new file... ', nl=False)
        new = pd.read_excel(new)
    except Exception as e:
        click.echo(f'\nFailed to open new file: {e}', err=True)
    else:
        click.echo('Success!')

    if list_columns:
        # Display list of columns and exit
        click.echo(f'\nOld columns: {list(old.columns)}')
        click.echo(f'New columns: {list(new.columns)}')
        return

    filename = filename or f'diff{ datetime.datetime.now().strftime("%Y%m%d") }.xlsx'
    with pd.ExcelWriter(f'{filename}') as writer:
        # Compare with old results and export additions, subtractions, and transfers.
        m = old.merge(new, on=[key_column], how='outer', suffixes=['_old', '_new'], indicator=True)

        # Additions
        add = m.loc[m['_merge'] == 'right_only']
        add = add.drop(columns=[x for x in add.columns if x.endswith('_old')] + ['_merge'])
        add = add.rename(columns={x: x[:-4] for x in add.columns if x.endswith('_new')})
        click.echo(f'    {len(add)} new record{"s" if not len(add) == 1 else ""} found.')
        add.to_excel(writer, sheet_name='New Records', index=False)

        # Subtractions
        sub = m.loc[m['_merge'] == 'left_only']
        sub = sub.drop(columns=[x for x in sub.columns if x.endswith('_new')] + ['_merge'])
        sub = sub.rename(columns={x: x[:-4] for x in sub.columns if x.endswith('_old')})
        click.echo(f'    {len(sub)} removed record{"s" if not len(sub) == 1 else ""} found.')
        sub.to_excel(writer, sheet_name='Removed Records', index=False)

        # Changes
        if compare_column:
            exp = f'`{compare_column}_old` != `{compare_column}_new`'
        else:
            exp = ' | '.join(
                f'(`{col}_old` != `{col}_new`)' for col in old.columns
                    if f'{col}_old' in m.columns and f'{col}_new' in m.columns
            )
        chng = m.query(f'(_merge == "both") & ({exp})')
        chng = chng.drop(columns=['_merge'])
        if compare_column:
            # Drop 'old' columns except compare_column
            chng = chng.drop(
                columns=[x for x in chng.columns if x.endswith('_old') and not x.startswith(compare_column)]
            )
            # Remove `_new` from column names except compare_column
            chng = chng.rename(
                columns={x: x[:-4] for x in chng.columns if x.endswith('_new') and not x.startswith(compare_column)}
            )
            # Reorder columns so that compare_column_old is directly to the left of compare_column_new
            columns = list(chng.columns)
            old = columns.pop(columns.index(f'{compare_column}_old'))
            columns.insert(columns.index(f'{compare_column}_new'), old)
            chng = chng[columns]
        click.echo(f'    {len(chng)} changed record{"s" if not len(chng) == 1 else ""} found.')
        chng.to_excel(writer, sheet_name='Changed Records', index=False)

    click.echo(f'Results saved to "{filename}"')

if __name__ == '__main__':
    main()
	#!/usr/bin/env python3
	# /// script
	# dependencies = [
	# "pandas",
	# "openpyxl",
	# "click",
	# ]
	# ///

	'''
	Compare two excel files and report differences in the records.

	Updated: 2025-09-24

	Author: Waylan Limberg

	'''

	import click
	import datetime
	import pandas as pd


	class Mutex(click.Option):
	def __init__(self, args, *kwargs):
	self.not_required_if:list = kwargs.pop("not_required_if")
	assert self.not_required_if, "'not_required_if' parameter required"
	super(Mutex, self).__init__(args, *kwargs)

	def handle_parse_result(self, ctx, opts, args):
	current_opt:bool = self.name in opts
	for mutex_opt in self.not_required_if:
	if mutex_opt in opts:
	if current_opt:
	raise click.UsageError(f"Illegal usage: '{self.name}' is mutually exclusive with '{mutex_opt}'.")
	else:
	self.required = False
	return super(Mutex, self).handle_parse_result(ctx, opts, args)


	@click.command()
	@click.option('-o', '--old', type=click.File('rb'), required=True, help='Old file to compare against.')
	@click.option('-n', '--new', type=click.File('rb'), required=True, help='New file to compare against.')
	@click.option(
	'-k', '--key-column', type=click.STRING, required=True, cls=Mutex, not_required_if=['list_columns'],
	help='Name of column to use as unique identifier.'
	)
	@click.option(
	'-c', '--compare-column', type=click.STRING, cls=Mutex, not_required_if=['list_columns'],
	help='Name of column to compare for changes.'
	)
	@click.option('-l', '--list-columns', is_flag=True, help='Show list of columns and exit.')
	@click.help_option('-h', '--help')
	@click.argument('filename', type=click.STRING, default=None, required=False)
	def main(old, new, key_column, compare_column, list_columns, filename):
	'''
	Compare two excel files and report the differences.

	Differences are reported in FILENAME, which should be different from
	either the old or new file. If FILENAME is not provided, then a filename
	is generated based in the current date.

	The -k/--key-column option defines the column which should be unique for
	each record. This would usually be a column which contains an
	identification number or similar data.

	If the -c/--compare-column option is provided, changed records only
	compares the provided column. Otherwise, all columns are compared.

	It is generally expected that both files will contain the same columns of
	data with the same column names. If a column is in only one of the files,
	it will be ignored. To see a list of columns for each file, use the
	-l/--list-columns option. No output file is generated when the
	-l/--list-columns option is used.

	The output file contains 3 sheets:

	New Records:

	This sheet contains all rows which are only found in the new file.

	Removed Records:

	This sheet contains all rows which are only found in the old file.

	Changed Records:

	This sheet contains rows which were found in both files, but have
	different data.

	'''

	try:
	click.echo('Loading old file... ', nl=False)
	old = pd.read_excel(old)
	except Exception as e:
	click.echo(f'\nFailed to open old file: {e}', err=True)
	else:
	click.echo('Success!')


	try:
	click.echo('Loading new file... ', nl=False)
	new = pd.read_excel(new)
	except Exception as e:
	click.echo(f'\nFailed to open new file: {e}', err=True)
	else:
	click.echo('Success!')

	if list_columns:
	# Display list of columns and exit
	click.echo(f'\nOld columns: {list(old.columns)}')
	click.echo(f'New columns: {list(new.columns)}')
	return

	filename = filename or f'diff{ datetime.datetime.now().strftime("%Y%m%d") }.xlsx'
	with pd.ExcelWriter(f'{filename}') as writer:
	# Compare with old results and export additions, subtractions, and transfers.
	m = old.merge(new, on=[key_column], how='outer', suffixes=['_old', '_new'], indicator=True)

	# Additions
	add = m.loc[m['_merge'] == 'right_only']
	add = add.drop(columns=[x for x in add.columns if x.endswith('_old')] + ['_merge'])
	add = add.rename(columns={x: x[:-4] for x in add.columns if x.endswith('_new')})
	click.echo(f' {len(add)} new record{"s" if not len(add) == 1 else ""} found.')
	add.to_excel(writer, sheet_name='New Records', index=False)

	# Subtractions
	sub = m.loc[m['_merge'] == 'left_only']
	sub = sub.drop(columns=[x for x in sub.columns if x.endswith('_new')] + ['_merge'])
	sub = sub.rename(columns={x: x[:-4] for x in sub.columns if x.endswith('_old')})
	click.echo(f' {len(sub)} removed record{"s" if not len(sub) == 1 else ""} found.')
	sub.to_excel(writer, sheet_name='Removed Records', index=False)

	# Changes
	if compare_column:
	exp = f'`{compare_column}_old` != `{compare_column}_new`'
	else:
	exp = ' \| '.join(
	f'(`{col}_old` != `{col}_new`)' for col in old.columns
	if f'{col}_old' in m.columns and f'{col}_new' in m.columns
	)
	chng = m.query(f'(_merge == "both") & ({exp})')
	chng = chng.drop(columns=['_merge'])
	if compare_column:
	# Drop 'old' columns except compare_column
	chng = chng.drop(
	columns=[x for x in chng.columns if x.endswith('_old') and not x.startswith(compare_column)]
	)
	# Remove `_new` from column names except compare_column
	chng = chng.rename(
	columns={x: x[:-4] for x in chng.columns if x.endswith('_new') and not x.startswith(compare_column)}
	)
	# Reorder columns so that compare_column_old is directly to the left of compare_column_new
	columns = list(chng.columns)
	old = columns.pop(columns.index(f'{compare_column}_old'))
	columns.insert(columns.index(f'{compare_column}_new'), old)
	chng = chng[columns]
	click.echo(f' {len(chng)} changed record{"s" if not len(chng) == 1 else ""} found.')
	chng.to_excel(writer, sheet_name='Changed Records', index=False)

	click.echo(f'Results saved to "{filename}"')

	if __name__ == '__main__':
	main()
No results found