1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2025-01-27 04:29:10 +01:00

csv_metadata_quality/app.py: read fields as strings

I suspect this undermines the performance gains of the PyArrow backend
introduced in Pandas 2.0.0, but we sometimes deal with messy data and
must be able to rely on every field being read as a string.
This commit is contained in:
Alan Orth 2023-06-12 10:38:05 +03:00
parent f3fb1ff7fb
commit d21d2621e3

View File

@@ -73,7 +73,8 @@ def run(argv):
# set the signal handler for SIGINT (^C)
signal.signal(signal.SIGINT, signal_handler)
-df = pd.read_csv(args.input_file, dtype_backend="pyarrow")
+# Read all fields as strings so dates don't get converted from 1998 to 1998.0
+df = pd.read_csv(args.input_file, dtype_backend="pyarrow", dtype="str")
# Check if the user requested to skip any fields
if args.exclude_fields: