1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2025-05-09 14:46:00 +02:00

4 Commits

Author SHA1 Message Date
0b2d211455 Version 0.4.1 2020-01-15 12:19:42 +02:00
7f1df0b47c Support Python 3.6 and 3.7 again 2020-01-15 12:19:17 +02:00
365ecda324 Add utility function to check normalization
Python's built-in unicodedata library includes the is_normalized()
function starting with Python 3.8. This utility function allows us
to do the same thing with earlier Python versions.

See: https://docs.python.org/3/library/unicodedata.html
2020-01-15 12:17:52 +02:00
550ce7fb7e .travis.yml: Only test Python 3.8
The Unicode normalization feature requires Python 3.8 because the
unicodedata.is_normalized() function only appears there. If I find
another way to check if a string is normalized without normalizing
it first I will drop the requirements back down to Python 3.6.

See: https://docs.python.org/3/library/unicodedata.html
2020-01-15 11:57:21 +02:00
5 changed files with 25 additions and 4 deletions

View File

@ -4,6 +4,11 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [0.4.1] - 2020-01-15
### Changed
- Reduce minimum Python version to 3.6 by working around the `is_normalized()`
function, which is only available in Python >= 3.8
## [0.4.0] - 2020-01-15
### Added
- Unicode normalization (enable with `--unsafe-fixes`, see README.md)

View File

@ -212,7 +212,7 @@ def normalize_unicode(field, field_name):
Return normalized string.
"""
from unicodedata import is_normalized
from csv_metadata_quality.util import is_nfc
from unicodedata import normalize
# Skip fields with missing values
@ -220,7 +220,7 @@ def normalize_unicode(field, field_name):
return
# Check if the current string is using normalized Unicode (NFC)
if not is_normalized("NFC", field):
if not is_nfc(field):
print(f"Normalizing Unicode ({field_name}): {field}")
field = normalize("NFC", field)

View File

@ -0,0 +1,14 @@
def is_nfc(field):
    """Report whether a string is already in Unicode NFC form.

    unicodedata.is_normalized() was only introduced in Python 3.8, so this
    helper instead normalizes the string to NFC and compares the result with
    the input. That keeps the check usable on Python >= 3.6.

    See: https://docs.python.org/3/library/unicodedata.html

    Return True if the string equals its NFC normalization, otherwise False.
    """
    import unicodedata

    # Any difference from the NFC form means the input was not normalized.
    nfc_form = unicodedata.normalize("NFC", field)

    return field == nfc_form

View File

@ -1 +1 @@
VERSION = "0.4.0"
VERSION = "0.4.1"

View File

@ -14,7 +14,7 @@ install_requires = [
setuptools.setup(
name="csv-metadata-quality",
version="0.4.0",
version="0.4.1",
author="Alan Orth",
author_email="aorth@mjanja.ch",
description="A simple, but opinionated CSV quality checking and fixing pipeline for CSVs in the DSpace ecosystem.",
@ -23,6 +23,8 @@ setuptools.setup(
long_description_content_type="text/markdown",
url="https://github.com/alanorth/csv-metadata-quality",
classifiers=[
"Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
"Operating System :: OS Independent",