Below is a Python script that uses the ydata-profiling library to create a report comparing two CSV files. This is proving useful for verifying the similarity between original tables and their counterparts migrated to a new platform.
Inspired by this tool, which uses sweetviz to create a summary report of a single CSV file.
Usage:
uv run compare_csv.py file_path01.csv file_path02.csv

# /// script
# requires-python = "==3.13"
# dependencies = [
# "pandas",
# "ydata_profiling",
# "click",
# "setuptools",
# "standard-imghdr",
# "legacy-cgi",
# ]
# ///
# Compare two CSV files
# a command line application to compare two CSV files
# uses the ydata_profiling library to visualize the differences
# uses the click library to create the command line interface
# usage: uv run compare_csv.py original.csv new.csv [--save_location DIR] [--report_name NAME]
import pandas as pd
from ydata_profiling import ProfileReport
import click
import uuid
from pathlib import Path
import webbrowser
def _profile_csv(file_path):
    '''Build a ydata-profiling report for a single CSV file.

    Args:
        file_path: Path to the CSV file (str or path-like).

    Returns:
        A ProfileReport for the file's contents, titled with the file's stem.

    Raises:
        ValueError: If the path does not have a ``.csv`` extension
            (compared case-insensitively).
    '''
    path = Path(file_path)
    # Compare the suffix case-insensitively so files such as TABLE.CSV are accepted.
    if path.suffix.lower() != '.csv':
        raise ValueError(f'The file path must refer to a CSV file. Path provided: {file_path}')
    df = pd.read_csv(path)
    # Lowercase the column names; presumably so headers that differ only in
    # case between the two platforms still line up in the comparison.
    df.columns = [col.lower() for col in df.columns]
    # NOTE(review): two inputs with the same stem get the same report title;
    # confirm the comparison report still tells the datasets apart.
    return ProfileReport(df, title=path.stem)


@click.command()
# dir_okay=False makes click reject directories up front instead of failing later in read_csv
@click.argument('original_file', type=click.Path(exists=True, dir_okay=False))
@click.argument('new_file', type=click.Path(exists=True, dir_okay=False))
# save folder as an option, with the current user's Downloads folder as default
@click.option('--save_location', default=str(Path.home() / 'Downloads'), show_default=True, help='Output folder for the report')
# the name for the report as an option, default is comparison
@click.option('--report_name', default='comparison', help='Name for the report. The table name is a sensible name if comparing for migration verification purposes')
def compare_csv(original_file, new_file, save_location, report_name):
    '''Compare two CSV files and generate a ydata-profiling report.

    \f
    Everything after the form feed above is hidden from ``--help`` by click,
    so the command's help text stays the one-line summary.

    Args:
        original_file: Path to the CSV exported from the original table.
        new_file: Path to the CSV exported from the new table.
        save_location: Folder the HTML report is written to; created if it
            does not exist. A leading ``~`` is expanded.
        report_name: Prefix of the report file name; a UUID is appended.

    Raises:
        ValueError: If either input path does not end in ``.csv``.
    '''
    # Profile both files, then compare the new report against the original
    comparison_report = _profile_csv(original_file).compare(_profile_csv(new_file))

    # create the output folder if it does not exist
    output_folder = Path(save_location).expanduser()
    output_folder.mkdir(parents=True, exist_ok=True)

    # A random UUID in the file name keeps repeated runs from overwriting each other
    report_id = uuid.uuid4()
    output_file_path = output_folder / f'{report_name}-{report_id}.html'
    comparison_report.to_file(output_file_path)
    click.echo(f'Comparison report saved to {output_file_path}')

    # Open the report in the default web browser
    webbrowser.open(output_file_path.absolute().as_uri())
# Invoke the click command only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    compare_csv()