From d21d2621e31229955c83ae282e68bfc8e7c714f9 Mon Sep 17 00:00:00 2001
From: Alan Orth
Date: Mon, 12 Jun 2023 10:38:05 +0300
Subject: [PATCH] csv_metadata_quality/app.py: read fields as strings

I suspect this undermines the PyArrow backend performance gains in
recent Pandas 2.0.0, but we are dealing with messy data sometimes
and we must rely on data being strings.
---
 csv_metadata_quality/app.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/csv_metadata_quality/app.py b/csv_metadata_quality/app.py
index 2adf450..df78571 100644
--- a/csv_metadata_quality/app.py
+++ b/csv_metadata_quality/app.py
@@ -73,7 +73,8 @@ def run(argv):
     # set the signal handler for SIGINT (^C)
     signal.signal(signal.SIGINT, signal_handler)

-    df = pd.read_csv(args.input_file, dtype_backend="pyarrow")
+    # Read all fields as strings so dates don't get converted from 1998 to 1998.0
+    df = pd.read_csv(args.input_file, dtype_backend="pyarrow", dtype="str")

     # Check if the user requested to skip any fields
     if args.exclude_fields: