Mirror of https://github.com/ilri/csv-metadata-quality.git
Synced 2025-05-11 23:56:00 +02:00

Comparing 6 commits: e92ec5d371 ... 8eddb76aab
(8eddb76aab, a04dbc50db, 28335ed159, 773a0a2695, 39a4b1a487, 898bb412c3)
CHANGELOG.md

@@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## Unreleased

### Added

- Ability to check for, and fix, "mojibake" characters using [ftfy](https://github.com/LuminosoInsight/python-ftfy)

## [0.4.7] - 2021-03-17

### Changed

- Fixing invalid multi-value separators like `|` and `|||` is no longer class-
README.md

@@ -20,6 +20,7 @@ If you use the DSpace CSV metadata quality checker please cite:

- Perform [Unicode normalization](https://withblue.ink/2019/03/11/why-you-need-to-normalize-unicode-strings.html) on strings using `--unsafe-fixes`
- Remove unnecessary Unicode like [non-breaking spaces](https://en.wikipedia.org/wiki/Non-breaking_space), [replacement characters](https://en.wikipedia.org/wiki/Specials_(Unicode_block)#Replacement_character), etc.
- Check for "suspicious" characters that indicate encoding or copy/paste issues, for example "foreˆt" should be "forêt"
- Check for "mojibake" characters (and attempt to fix with `--unsafe-fixes`)
- Remove duplicate metadata values
- Check for duplicate items, using the title, type, and date issued as an indicator

@@ -75,6 +76,14 @@ This is considered "unsafe" because some systems give special importance to vert

Read more about [Unicode normalization](https://withblue.ink/2019/03/11/why-you-need-to-normalize-unicode-strings.html).

### Encoding Issues aka "Mojibake"

[Mojibake](https://en.wikipedia.org/wiki/Mojibake) is a phenomenon that occurs when text is decoded using an unintended character encoding. This usually presents itself in the form of strange, garbled characters in the text. Enabling "unsafe" fixes will attempt to correct these, for example:

- CIAT PublicaÃ§ao → CIAT Publicaçao
- CIAT PublicaciÃ³n → CIAT Publicación

Pay special attention to the output of the script as well as the resulting file to make sure no new issues have been introduced. The ideal way to solve these issues is to avoid them in the first place. See [this guide about opening CSVs in UTF-8 format in Excel](https://www.itg.ias.edu/content/how-import-csv-file-uses-utf-8-character-encoding-0).
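
Under the hood the fix delegates to ftfy's fix_text() (see the fix.py diff below). A minimal standalone sketch of the same repair, using the example string above:

    from ftfy import fix_text

    # "Ã§" is what a UTF-8 "ç" looks like after being mis-decoded as CP-1252
    print(fix_text("CIAT PublicaÃ§ao"))  # CIAT Publicaçao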

## AGROVOC Validation

You can enable validation of metadata values in certain fields against the AGROVOC REST API with the `--agrovoc-fields` option. For example, in addition to agricultural subjects, many countries and regions are also present in AGROVOC. Enable this validation by specifying a comma-separated list of fields:
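A hypothetical invocation (the field names and input/output paths are illustrative, not prescribed by the tool):

    csv-metadata-quality -i data/test.csv -o /tmp/test.csv --agrovoc-fields dc.subject,cg.coverage.country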
csv_metadata_quality/app.py

@@ -107,6 +107,13 @@ def run(argv):

        # Check: suspicious characters
        df[column].apply(check.suspicious_characters, field_name=column)

        # Check: mojibake
        df[column].apply(check.mojibake, field_name=column)

        # Fix: mojibake
        if args.unsafe_fixes:
            df[column] = df[column].apply(fix.mojibake, field_name=column)

        # Fix: invalid and unnecessary multi-value separators
        df[column] = df[column].apply(fix.separators, field_name=column)
        # Run whitespace fix again after fixing invalid separators
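The column name reaches each check and fix via pandas: Series.apply() forwards extra keyword arguments to the applied function, which is how field_name is threaded through above. A self-contained sketch of that mechanism:

    import pandas as pd

    def label(value, field_name):
        # Extra kwargs passed to Series.apply are forwarded to the function
        return f"{field_name}: {value}"

    s = pd.Series(["CIAT PublicaÃ§ao"])
    print(s.apply(label, field_name="dcterms.isPartOf").tolist())
    # ['dcterms.isPartOf: CIAT PublicaÃ§ao']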
csv_metadata_quality/check.py

@@ -11,6 +11,8 @@ from pycountry import languages

from stdnum import isbn as stdnum_isbn
from stdnum import issn as stdnum_issn

from csv_metadata_quality.util import is_mojibake


def issn(field):
    """Check if an ISSN is valid.

@@ -345,3 +347,22 @@ def duplicate_items(df):

            )
        else:
            items.append(item_title_type_date)


def mojibake(field, field_name):
    """Check for mojibake (text that was encoded in one encoding and decoded
    in another, perhaps multiple times). See util.py.

    Prints the string if it contains suspected mojibake.
    """

    # Skip fields with missing values
    if pd.isna(field):
        return

    if is_mojibake(field):
        print(
            f"{Fore.YELLOW}Possible encoding issue ({field_name}): {Fore.RESET}{field}"
        )

    return
csv_metadata_quality/fix.py

@@ -3,8 +3,9 @@ from unicodedata import normalize

import pandas as pd
from colorama import Fore
from ftfy import fix_text

-from csv_metadata_quality.util import is_nfc
+from csv_metadata_quality.util import is_mojibake, is_nfc


def whitespace(field, field_name):

@@ -253,3 +254,22 @@ def normalize_unicode(field, field_name):

        field = normalize("NFC", field)

    return field


def mojibake(field, field_name):
    """Attempts to fix mojibake (text that was encoded in one encoding and
    decoded in another, perhaps multiple times). See util.py.

    Return fixed string.
    """

    # Skip fields with missing values
    if pd.isna(field):
        return field

    if is_mojibake(field):
        print(f"{Fore.GREEN}Fixing encoding issue ({field_name}): {Fore.RESET}{field}")

        return fix_text(field)
    else:
        return field
csv_metadata_quality/util.py

@@ -1,3 +1,6 @@

from ftfy.badness import sequence_weirdness


def is_nfc(field):
    """Utility function to check whether a string is using normalized Unicode.

    Python's built-in unicodedata library has the is_normalized() function, but

@@ -12,3 +15,35 @@ def is_nfc(field):

    from unicodedata import normalize

    return field == normalize("NFC", field)


def is_mojibake(field):
    """Determines whether a string contains mojibake.

    We commonly deal with CSV files that were *encoded* in UTF-8, but decoded
    as something else like CP-1252 (Windows Latin). This manifests in the form
    of "mojibake", for example:

    - CIAT PublicaÃ§ao
    - CIAT PublicaciÃ³n

    This uses the excellent "fixes text for you" (ftfy) library to determine
    whether a string contains characters that have been encoded in one encoding
    and decoded in another.

    Inspired by this code snippet from Martijn Pieters on StackOverflow:
    https://stackoverflow.com/questions/29071995/identify-garbage-unicode-string-using-python

    Return boolean.
    """
    if not sequence_weirdness(field):
        # Nothing weird, should be okay
        return False
    try:
        field.encode("sloppy-windows-1252")
    except UnicodeEncodeError:
        # Not CP-1252 encodable, probably fine
        return False
    else:
        # Encodable as CP-1252, Mojibake alert level high
        return True
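A quick sanity check of the heuristic from a Python shell (this assumes ftfy 5.x, where sequence_weirdness still lives in ftfy.badness; it was replaced in ftfy 6):

    from csv_metadata_quality.util import is_mojibake

    print(is_mojibake("CIAT PublicaÃ§ao"))  # True: weird sequence that round-trips through CP-1252
    print(is_mojibake("CIAT Publicación"))  # False: nothing weird, passes the first check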
csv_metadata_quality/version.py

@@ -1 +1 @@

-VERSION = "0.4.7"
+VERSION = "0.4.8-dev"
data/test.csv

@@ -32,3 +32,4 @@ Unnecessary multi-value separator,2021-01-03,0378-5955||,,,,,,,

Invalid SPDX license identifier,2021-03-11,,,,,,,CC-BY,
Duplicate Title,2021-03-17,,,,,,,,Report
Duplicate Title,2021-03-17,,,,,,,,Report
Mojibake,2021-03-18,,,,CIAT PublicaÃ§ao,,,,Report
poetry.lock (18 lines changed, generated)
@@ -221,6 +221,17 @@ mccabe = ">=0.6.0,<0.7.0"

pycodestyle = ">=2.7.0,<2.8.0"
pyflakes = ">=2.3.0,<2.4.0"

[[package]]
name = "ftfy"
version = "5.9"
description = "Fixes some problems with Unicode text after the fact"
category = "main"
optional = false
python-versions = ">=3.5"

[package.dependencies]
wcwidth = "*"

[[package]]
name = "greenlet"
version = "1.0.0"

@@ -804,7 +815,7 @@ brotli = ["brotlipy (>=0.6.0)"]

name = "wcwidth"
version = "0.2.5"
description = "Measures the displayed width of unicode strings in a terminal"
-category = "dev"
+category = "main"
optional = false
python-versions = "*"

@@ -831,7 +842,7 @@ testing = ["pytest (>=4.6)", "pytest-checkdocs (>=1.2.3)", "pytest-flake8", "pyt

[metadata]
lock-version = "1.1"
python-versions = "^3.7.1"
-content-hash = "e60e882e091af667b968c00521fd378e1220c1836d394d90bbc783920e38bb62"
+content-hash = "75d841101b863c35e74aeccb5ae2bf24cf7733dd406c390ec118e43a6eaa49c6"

[metadata.files]
agate = [

@@ -907,6 +918,9 @@ flake8 = [

    {file = "flake8-3.9.0-py2.py3-none-any.whl", hash = "sha256:12d05ab02614b6aee8df7c36b97d1a3b2372761222b19b58621355e82acddcff"},
    {file = "flake8-3.9.0.tar.gz", hash = "sha256:78873e372b12b093da7b5e5ed302e8ad9e988b38b063b61ad937f26ca58fc5f0"},
]
ftfy = [
    {file = "ftfy-5.9.tar.gz", hash = "sha256:8c4fb2863c0b82eae2ab3cf353d9ade268dfbde863d322f78d6a9fd5cefb31e9"},
]
greenlet = [
    {file = "greenlet-1.0.0-cp27-cp27m-macosx_10_14_x86_64.whl", hash = "sha256:1d1d4473ecb1c1d31ce8fd8d91e4da1b1f64d425c1dc965edc4ed2a63cfa67b2"},
    {file = "greenlet-1.0.0-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:cfd06e0f0cc8db2a854137bd79154b61ecd940dce96fad0cba23fe31de0b793c"},
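For context, a lockfile change of this shape is what a single dependency addition typically produces, e.g. (command is an assumption, not recorded in the commits):

    poetry add ftfy@^5.9

That adds the ftfy package entry, promotes its wcwidth dependency from the "dev" to the "main" category, and recomputes the content-hash.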
pyproject.toml

@@ -1,6 +1,6 @@

[tool.poetry]
name = "csv-metadata-quality"
-version = "0.4.7"
+version = "0.4.8-dev"
description="A simple, but opinionated CSV quality checking and fixing pipeline for CSVs in the DSpace ecosystem."
authors = ["Alan Orth <alan.orth@gmail.com>"]
license="GPL-3.0-only"

@@ -21,6 +21,7 @@ pycountry = "^19.8.18"

langid = "^1.1.6"
colorama = "^0.4.4"
spdx-license-list = "^0.5.2"
ftfy = "^5.9"

[tool.poetry.dev-dependencies]
pytest = "^6.1.1"
requirements-dev.txt

@@ -18,6 +18,7 @@ dbfread==2.0.7

decorator==4.4.2; python_version >= "3.7" and python_full_version < "3.0.0" and python_version < "4.0" or python_version >= "3.7" and python_version < "4.0" and python_full_version >= "3.2.0"
et-xmlfile==1.0.1; python_version >= "3.6"
flake8==3.9.0; (python_version >= "2.7" and python_full_version < "3.0.0") or (python_full_version >= "3.5.0")
ftfy==5.9; python_version >= "3.5"
greenlet==1.0.0; python_version >= "3" and python_full_version < "3.0.0" or python_full_version >= "3.6.0" and python_version >= "3"
idna==2.10; python_version >= "2.7" and python_full_version < "3.0.0" or python_full_version >= "3.5.0"
importlib-metadata==3.7.3; python_version < "3.8" and python_version >= "3.6" and (python_version >= "3.6" and python_full_version < "3.0.0" and python_version < "3.8" or python_full_version >= "3.5.0" and python_version < "3.8" and python_version >= "3.6") and (python_version >= "3.6" and python_full_version < "3.0.0" and python_version < "3.8" or python_full_version >= "3.4.0" and python_version >= "3.6" and python_version < "3.8") and (python_version >= "3.6" and python_full_version < "3.0.0" or python_full_version >= "3.4.0" and python_version >= "3.6") and (python_version >= "3.6" and python_full_version < "3.0.0" and python_version < "3.8" or python_full_version >= "3.6.0" and python_version < "3.8" and python_version >= "3.6")
requirements.txt

@@ -1,6 +1,7 @@

certifi==2020.12.5; python_version >= "2.7" and python_full_version < "3.0.0" or python_full_version >= "3.5.0"
chardet==4.0.0; python_version >= "2.7" and python_full_version < "3.0.0" or python_full_version >= "3.5.0"
colorama==0.4.4; (python_version >= "2.7" and python_full_version < "3.0.0") or (python_full_version >= "3.5.0")
ftfy==5.9; python_version >= "3.5"
idna==2.10; python_version >= "2.7" and python_full_version < "3.0.0" or python_full_version >= "3.5.0"
langid==1.1.6
numpy==1.20.1; python_version >= "3.7" and python_full_version >= "3.7.1"

@@ -14,4 +15,5 @@ requests==2.25.1; (python_version >= "2.7" and python_full_version < "3.0.0") or

six==1.15.0; python_full_version >= "3.7.1"
spdx-license-list==0.5.2
urllib3==1.26.4; python_version >= "2.7" and python_full_version < "3.0.0" or python_full_version >= "3.5.0" and python_version < "4"
wcwidth==0.2.5; python_version >= "3.5"
xlrd==1.2.0; (python_version >= "2.7" and python_full_version < "3.0.0") or (python_full_version >= "3.4.0")
setup.py (2 lines changed)

@@ -14,7 +14,7 @@ install_requires = [

setuptools.setup(
    name="csv-metadata-quality",
-    version="0.4.7",
+    version="0.4.8-dev",
    author="Alan Orth",
    author_email="aorth@mjanja.ch",
    description="A simple, but opinionated CSV quality checking and fixing pipeline for CSVs in the DSpace ecosystem.",
tests/test_check.py

@@ -339,3 +339,29 @@ def test_check_duplicate_item(capsys):

        captured.out
        == f"{Fore.YELLOW}Possible duplicate (dc.title): {Fore.RESET}{item_title}\n"
    )


def test_check_no_mojibake():
    """Test string with no mojibake."""

    field = "CIAT Publicaçao"
    field_name = "dcterms.isPartOf"

    result = check.mojibake(field, field_name)

    assert result is None


def test_check_mojibake(capsys):
    """Test string with mojibake."""

    field = "CIAT PublicaÃ§ao"
    field_name = "dcterms.isPartOf"

    result = check.mojibake(field, field_name)

    captured = capsys.readouterr()
    assert (
        captured.out
        == f"{Fore.YELLOW}Possible encoding issue ({field_name}): {Fore.RESET}{field}\n"
    )
tests/test_fix.py

@@ -108,3 +108,12 @@ def test_fix_decomposed_unicode():

    field_name = "dc.contributor.author"

    assert fix.normalize_unicode(value, field_name) == "Ouédraogo, Mathieu"


def test_fix_mojibake():
    """Test string with mojibake."""

    field = "CIAT PublicaÃ§ao"
    field_name = "dcterms.isPartOf"

    assert fix.mojibake(field, field_name) == "CIAT Publicaçao"
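To run just the new tests, standard pytest keyword filtering works (nothing repo-specific):

    pytest -k mojibake tests/test_check.py tests/test_fix.py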