mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2024-11-24 14:50:17 +01:00
csv_metadata_quality/app.py: read fields as strings
I suspect this undermines the performance gains of the PyArrow backend in Pandas 2.0.0, but we sometimes deal with messy data and must be able to rely on every field being read as a string.
This commit is contained in:
parent
f3fb1ff7fb
commit
d21d2621e3
@@ -73,7 +73,8 @@ def run(argv):
|
|||||||
# set the signal handler for SIGINT (^C)
|
# set the signal handler for SIGINT (^C)
|
||||||
signal.signal(signal.SIGINT, signal_handler)
|
signal.signal(signal.SIGINT, signal_handler)
|
||||||
|
|
||||||
df = pd.read_csv(args.input_file, dtype_backend="pyarrow")
|
# Read all fields as strings so dates don't get converted from 1998 to 1998.0
|
||||||
|
df = pd.read_csv(args.input_file, dtype_backend="pyarrow", dtype="str")
|
||||||
|
|
||||||
# Check if the user requested to skip any fields
|
# Check if the user requested to skip any fields
|
||||||
if args.exclude_fields:
|
if args.exclude_fields:
|
||||||
|
Loading…
Reference in New Issue
Block a user