A command line tool to compare 2 different csv files

Below is a Python script that uses the ydata-profiling library to create a report comparing two CSV files. It is proving useful for verifying that tables migrated to a new platform match their originals.

Inspired by this tool, which uses sweetviz to create a summary report of a single CSV file.

Usage:

uv run compare_csv.py file_path01.csv file_path02.csv
# /// script
# requires-python = "==3.13"
# dependencies = [
#   "pandas",
#   "ydata_profiling",
#   "click",
#   "setuptools",
#   "standard-imghdr",
#   "legacy-cgi",
# ]
# ///

# Compare two CSV files
# a command line application to compare two CSV files 
# uses the ydata_profiling library to visualize the differences
# uses the click library to create the command line interface
# usage: uv run compare_csv.py file1.csv file2.csv


import pandas as pd
from ydata_profiling import ProfileReport
import click
import uuid
from pathlib import Path
import webbrowser

@click.command()
# Both inputs must be existing regular files; directories are rejected by click
# up front instead of failing later inside pandas.
@click.argument('original_file', type=click.Path(exists=True, dir_okay=False))
@click.argument('new_file', type=click.Path(exists=True, dir_okay=False))
# Save folder as an option. The default is the current user's Downloads folder,
# resolved at runtime so the script works on any machine/account.
@click.option('--save_location', default=str(Path.home() / 'Downloads'), show_default=True, help='Output folder for the report')
# The name for the report as an option, default is comparison
@click.option('--report_name', default='comparison', help='Name for the report. The table name is a sensible name if comparing for migration verification purposes')
def compare_csv(original_file, new_file, save_location, report_name):
    '''Compare two CSV files and generate a ydata-profiling report.

    Each file is profiled separately, the two profiles are compared, and the
    resulting HTML report is written to the save location under a unique
    file name and then opened in the default web browser.
    '''

    def process_csv(file_path):
        '''Read one CSV file and return its ydata-profiling ProfileReport.'''
        # Read in the CSV file as a dataframe
        df = pd.read_csv(file_path)
        # Lowercase the column names so the same column in both files lines up
        # in the comparison even when the two platforms differ in header case.
        df.columns = [col.lower() for col in df.columns]
        # The file name (without extension) becomes the report title
        return ProfileReport(df, title=Path(file_path).stem)

    # Validate both paths before doing any work so a bad second argument fails
    # immediately. The check is case-insensitive so 'DATA.CSV' is accepted.
    for csv_path in (original_file, new_file):
        if not str(csv_path).lower().endswith('.csv'):
            raise ValueError(f'The file path must refer to a CSV file. Path provided: {csv_path}')

    # Process the original and new CSV files
    original_report = process_csv(original_file)
    new_report = process_csv(new_file)

    # Compare the two reports
    comparison_report = original_report.compare(new_report)

    # Create the output folder if it does not exist
    output_folder = Path(save_location)
    output_folder.mkdir(parents=True, exist_ok=True)

    # A random UUID in the file name keeps repeated runs from overwriting
    # earlier reports that share the same report name.
    report_id = uuid.uuid4()
    output_file_path = output_folder / f"{report_name}-{report_id}.html"
    comparison_report.to_file(output_file_path)
    print(f"Comparison report saved to {output_file_path}")

    # Open the report in the default web browser
    webbrowser.open(output_file_path.absolute().as_uri())



# Run the click command only when this file is executed as a script
# (e.g. via `uv run`), not when it is imported as a module.
if __name__ == '__main__':
    compare_csv()

Send a Comment

Your email address will not be published.