1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2024-11-27 16:18:19 +01:00

Compare commits

...

3 Commits

Author SHA1 Message Date
1878002391 poetry.lock: run poetry update
All checks were successful
continuous-integration/drone/push Build is passing
2023-06-12 10:42:50 +03:00
d21d2621e3 csv_metadata_quality/app.py: read fields as strings
I suspect this undermines the performance gains of the PyArrow backend
in the recent pandas 2.0.0, but we sometimes deal with messy data
and we must be able to rely on every field being a string.
2023-06-12 10:42:50 +03:00
f3fb1ff7fb Don't crash when title is missing
We shouldn't crash the country/region checker/fixer when the title
field is missing, since we only use it to show status to the user.
2023-06-12 10:42:50 +03:00
5 changed files with 54 additions and 29 deletions

View File

@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
`dcterms.bibliographicCitation` fields `dcterms.bibliographicCitation` fields
- Fixed regex so we run the comma space fix on `dcterms.bibliographicCitation` - Fixed regex so we run the comma space fix on `dcterms.bibliographicCitation`
fields fields
- Don't crash the country/region checker/fixer when a title field is missing
### Changed ### Changed
- Don't run newline fix on description fields - Don't run newline fix on description fields

View File

@ -73,7 +73,8 @@ def run(argv):
# set the signal handler for SIGINT (^C) # set the signal handler for SIGINT (^C)
signal.signal(signal.SIGINT, signal_handler) signal.signal(signal.SIGINT, signal_handler)
df = pd.read_csv(args.input_file, dtype_backend="pyarrow") # Read all fields as strings so dates don't get converted from 1998 to 1998.0
df = pd.read_csv(args.input_file, dtype_backend="pyarrow", dtype="str")
# Check if the user requested to skip any fields # Check if the user requested to skip any fields
if args.exclude_fields: if args.exclude_fields:

View File

@ -563,8 +563,13 @@ def countries_match_regions(row, exclude):
un_region = cc.convert(names=country, to="UNRegion") un_region = cc.convert(names=country, to="UNRegion")
if un_region != "not found" and un_region not in regions: if un_region != "not found" and un_region not in regions:
print( try:
f"{Fore.YELLOW}Missing region ({country} → {un_region}): {Fore.RESET}{row[title_column_name]}" print(
) f"{Fore.YELLOW}Missing region ({country} → {un_region}): {Fore.RESET}{row[title_column_name]}"
)
except KeyError:
print(
f"{Fore.YELLOW}Missing region ({country} → {un_region}): {Fore.RESET}<title field not present>"
)
return return

View File

@ -370,9 +370,17 @@ def countries_match_regions(row, exclude):
# it doesn't already exist in regions. # it doesn't already exist in regions.
if un_region != "not found" and un_region not in regions: if un_region != "not found" and un_region not in regions:
if un_region not in missing_regions: if un_region not in missing_regions:
print( try:
f"{Fore.YELLOW}Adding missing region ({un_region}): {Fore.RESET}{row[title_column_name]}" print(
) f"{Fore.YELLOW}Adding missing region ({un_region}): {Fore.RESET}{row[title_column_name]}"
)
except KeyError:
# If there is no title column in the CSV we will print
# the fix without the title instead of crashing.
print(
f"{Fore.YELLOW}Adding missing region ({un_region}): {Fore.RESET}<title field not present>"
)
missing_regions.append(un_region) missing_regions.append(un_region)
if len(missing_regions) > 0: if len(missing_regions) > 0:

54
poetry.lock generated
View File

@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 1.5.0 and should not be changed by hand. # This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand.
[[package]] [[package]]
name = "agate" name = "agate"
@ -256,18 +256,28 @@ doc = ["gitpython", "numpydoc", "sphinx"]
[[package]] [[package]]
name = "cattrs" name = "cattrs"
version = "22.2.0" version = "23.1.2"
description = "Composable complex class support for attrs and dataclasses." description = "Composable complex class support for attrs and dataclasses."
optional = false optional = false
python-versions = ">=3.7" python-versions = ">=3.7"
files = [ files = [
{file = "cattrs-22.2.0-py3-none-any.whl", hash = "sha256:bc12b1f0d000b9f9bee83335887d532a1d3e99a833d1bf0882151c97d3e68c21"}, {file = "cattrs-23.1.2-py3-none-any.whl", hash = "sha256:b2bb14311ac17bed0d58785e5a60f022e5431aca3932e3fc5cc8ed8639de50a4"},
{file = "cattrs-22.2.0.tar.gz", hash = "sha256:f0eed5642399423cf656e7b66ce92cdc5b963ecafd041d1b24d136fdde7acf6d"}, {file = "cattrs-23.1.2.tar.gz", hash = "sha256:db1c821b8c537382b2c7c66678c3790091ca0275ac486c76f3c8f3920e83c657"},
] ]
[package.dependencies] [package.dependencies]
attrs = ">=20" attrs = ">=20"
exceptiongroup = {version = "*", markers = "python_version < \"3.11\""} exceptiongroup = {version = "*", markers = "python_version < \"3.11\""}
typing_extensions = {version = ">=4.1.0", markers = "python_version < \"3.11\""}
[package.extras]
bson = ["pymongo (>=4.2.0,<5.0.0)"]
cbor2 = ["cbor2 (>=5.4.6,<6.0.0)"]
msgpack = ["msgpack (>=1.0.2,<2.0.0)"]
orjson = ["orjson (>=3.5.2,<4.0.0)"]
pyyaml = ["PyYAML (>=6.0,<7.0)"]
tomlkit = ["tomlkit (>=0.11.4,<0.12.0)"]
ujson = ["ujson (>=5.4.0,<6.0.0)"]
[[package]] [[package]]
name = "certifi" name = "certifi"
@ -1121,18 +1131,18 @@ files = [
[[package]] [[package]]
name = "platformdirs" name = "platformdirs"
version = "3.5.1" version = "3.5.3"
description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"."
optional = false optional = false
python-versions = ">=3.7" python-versions = ">=3.7"
files = [ files = [
{file = "platformdirs-3.5.1-py3-none-any.whl", hash = "sha256:e2378146f1964972c03c085bb5662ae80b2b8c06226c54b2ff4aa9483e8a13a5"}, {file = "platformdirs-3.5.3-py3-none-any.whl", hash = "sha256:0ade98a4895e87dc51d47151f7d2ec290365a585151d97b4d8d6312ed6132fed"},
{file = "platformdirs-3.5.1.tar.gz", hash = "sha256:412dae91f52a6f84830f39a8078cecd0e866cb72294a5c66808e74d5e88d251f"}, {file = "platformdirs-3.5.3.tar.gz", hash = "sha256:e48fabd87db8f3a7df7150a4a5ea22c546ee8bc39bc2473244730d4b56d2cc4e"},
] ]
[package.extras] [package.extras]
docs = ["furo (>=2023.3.27)", "proselint (>=0.13)", "sphinx (>=6.2.1)", "sphinx-autodoc-typehints (>=1.23,!=1.23.4)"] docs = ["furo (>=2023.5.20)", "proselint (>=0.13)", "sphinx (>=7.0.1)", "sphinx-autodoc-typehints (>=1.23,!=1.23.4)"]
test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.3.1)", "pytest-cov (>=4)", "pytest-mock (>=3.10)"] test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.3.1)", "pytest-cov (>=4.1)", "pytest-mock (>=3.10)"]
[[package]] [[package]]
name = "pluggy" name = "pluggy"
@ -1289,13 +1299,13 @@ plugins = ["importlib-metadata"]
[[package]] [[package]]
name = "pytest" name = "pytest"
version = "7.3.1" version = "7.3.2"
description = "pytest: simple powerful testing with Python" description = "pytest: simple powerful testing with Python"
optional = false optional = false
python-versions = ">=3.7" python-versions = ">=3.7"
files = [ files = [
{file = "pytest-7.3.1-py3-none-any.whl", hash = "sha256:3799fa815351fea3a5e96ac7e503a96fa51cc9942c3753cda7651b93c1cfa362"}, {file = "pytest-7.3.2-py3-none-any.whl", hash = "sha256:cdcbd012c9312258922f8cd3f1b62a6580fdced17db6014896053d47cddf9295"},
{file = "pytest-7.3.1.tar.gz", hash = "sha256:434afafd78b1d78ed0addf160ad2b77a30d35d4bdf8af234fe621919d9ed15e3"}, {file = "pytest-7.3.2.tar.gz", hash = "sha256:ee990a3cc55ba808b80795a79944756f315c67c12b56abd3ac993a7b8c17030b"},
] ]
[package.dependencies] [package.dependencies]
@ -1307,7 +1317,7 @@ pluggy = ">=0.12,<2.0"
tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""}
[package.extras] [package.extras]
testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"] testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]
[[package]] [[package]]
name = "pytest-clarity" name = "pytest-clarity"
@ -1446,13 +1456,13 @@ yaml = ["pyyaml (>=5.4)"]
[[package]] [[package]]
name = "rich" name = "rich"
version = "13.3.5" version = "13.4.1"
description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal" description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal"
optional = false optional = false
python-versions = ">=3.7.0" python-versions = ">=3.7.0"
files = [ files = [
{file = "rich-13.3.5-py3-none-any.whl", hash = "sha256:69cdf53799e63f38b95b9bf9c875f8c90e78dd62b2f00c13a911c7a3b9fa4704"}, {file = "rich-13.4.1-py3-none-any.whl", hash = "sha256:d204aadb50b936bf6b1a695385429d192bc1fdaf3e8b907e8e26f4c4e4b5bf75"},
{file = "rich-13.3.5.tar.gz", hash = "sha256:2d11b9b8dd03868f09b4fffadc84a6a8cda574e40dc90821bd845720ebb8e89c"}, {file = "rich-13.4.1.tar.gz", hash = "sha256:76f6b65ea7e5c5d924ba80e322231d7cb5b5981aa60bfc1e694f1bc097fe6fe1"},
] ]
[package.dependencies] [package.dependencies]
@ -1605,13 +1615,13 @@ test = ["argcomplete (>=2.0)", "pre-commit", "pytest", "pytest-mock"]
[[package]] [[package]]
name = "typing-extensions" name = "typing-extensions"
version = "4.6.2" version = "4.6.3"
description = "Backported and Experimental Type Hints for Python 3.7+" description = "Backported and Experimental Type Hints for Python 3.7+"
optional = false optional = false
python-versions = ">=3.7" python-versions = ">=3.7"
files = [ files = [
{file = "typing_extensions-4.6.2-py3-none-any.whl", hash = "sha256:3a8b36f13dd5fdc5d1b16fe317f5668545de77fa0b8e02006381fd49d731ab98"}, {file = "typing_extensions-4.6.3-py3-none-any.whl", hash = "sha256:88a4153d8505aabbb4e13aacb7c486c2b4a33ca3b3f807914a9b4c844c471c26"},
{file = "typing_extensions-4.6.2.tar.gz", hash = "sha256:06006244c70ac8ee83fa8282cb188f697b8db25bc8b4df07be1873c43897060c"}, {file = "typing_extensions-4.6.3.tar.gz", hash = "sha256:d91d5919357fe7f681a9f2b5b4cb2a5f1ef0a1e9f59c4d8ff0d3491e05c0ffd5"},
] ]
[[package]] [[package]]
@ -1641,13 +1651,13 @@ six = "*"
[[package]] [[package]]
name = "urllib3" name = "urllib3"
version = "2.0.2" version = "2.0.3"
description = "HTTP library with thread-safe connection pooling, file post, and more." description = "HTTP library with thread-safe connection pooling, file post, and more."
optional = false optional = false
python-versions = ">=3.7" python-versions = ">=3.7"
files = [ files = [
{file = "urllib3-2.0.2-py3-none-any.whl", hash = "sha256:d055c2f9d38dc53c808f6fdc8eab7360b6fdbbde02340ed25cfbcd817c62469e"}, {file = "urllib3-2.0.3-py3-none-any.whl", hash = "sha256:48e7fafa40319d358848e1bc6809b208340fafe2096f1725d05d67443d0483d1"},
{file = "urllib3-2.0.2.tar.gz", hash = "sha256:61717a1095d7e155cdb737ac7bb2f4324a858a1e2e6466f6d03ff630ca68d3cc"}, {file = "urllib3-2.0.3.tar.gz", hash = "sha256:bee28b5e56addb8226c96f7f13ac28cb4c301dd5ea8a6ca179c0b9835e032825"},
] ]
[package.extras] [package.extras]