mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2025-05-10 15:16:01 +02:00
Compare commits
26 Commits
Author | SHA1 | Date | |
---|---|---|---|
7255bf4707
|
|||
3aaf18c290
|
|||
745306edd7
|
|||
e324e321a2
|
|||
232ff99898
|
|||
13d5221378
|
|||
3c7a9eb75b
|
|||
a99fbd8a51
|
|||
e801042340
|
|||
62ef2a4489
|
|||
9ce7dc6716
|
|||
5ff584a8d7
|
|||
4cf7bc182b
|
|||
7d3f5aae66
|
|||
c77c065e25
|
|||
8fb40d96b1
|
|||
5f2e3ff4bd
|
|||
d93c2aae13
|
|||
62fea95087
|
|||
8772bdec51
|
|||
6d4ecd75aa
|
|||
264ce1d1df
|
|||
f4e7fd73f5
|
|||
a00d3d7ea5
|
|||
f772a3be41
|
|||
d1b3e9e375
|
11
CHANGELOG.md
11
CHANGELOG.md
@ -4,7 +4,16 @@ All notable changes to this project will be documented in this file.
|
||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
||||
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||
|
||||
## [Unreleased]
|
||||
## [0.2.1] - 2019-08-11
|
||||
### Added
|
||||
- Check for uncommon filename extensions
|
||||
- Replacement of unnecessary Unicode characters like soft hyphens (U+00AD)
|
||||
|
||||
## [0.2.0] - 2019-08-09
|
||||
### Added
|
||||
- Handle Ctrl-C interrupt gracefully
|
||||
- Make output in suspicious character check more user friendly
|
||||
- Add pytest-clarity to dev packages for more user friendly pytest output
|
||||
|
||||
## [0.1.0] - 2019-08-01
|
||||
### Changed
|
||||
|
5
Pipfile
5
Pipfile
@ -7,6 +7,7 @@ verify_ssl = true
|
||||
pytest = "*"
|
||||
ipython = "*"
|
||||
flake8 = "*"
|
||||
pytest-clarity = "*"
|
||||
|
||||
[packages]
|
||||
pandas = "*"
|
||||
@ -15,6 +16,10 @@ xlrd = "*"
|
||||
requests = "*"
|
||||
requests-cache = "*"
|
||||
pycountry = "*"
|
||||
csv-metadata-quality = {editable = true,path = "."}
|
||||
|
||||
[requires]
|
||||
python_version = "3.7"
|
||||
|
||||
[pipenv]
|
||||
allow_prereleases = true
|
||||
|
25
Pipfile.lock
generated
25
Pipfile.lock
generated
@ -1,7 +1,7 @@
|
||||
{
|
||||
"_meta": {
|
||||
"hash": {
|
||||
"sha256": "1c4130ed98fb55545244ba2926f2b4246dc86af7545cb892a45311426f934cae"
|
||||
"sha256": "f8f0a9f208ec41f4d8183ecfc68356b40674b083b2f126c37468b3c9533ba5df"
|
||||
},
|
||||
"pipfile-spec": 6,
|
||||
"requires": {
|
||||
@ -30,6 +30,10 @@
|
||||
],
|
||||
"version": "==3.0.4"
|
||||
},
|
||||
"csv-metadata-quality": {
|
||||
"editable": true,
|
||||
"path": "."
|
||||
},
|
||||
"idna": {
|
||||
"hashes": [
|
||||
"sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407",
|
||||
@ -102,10 +106,10 @@
|
||||
},
|
||||
"pytz": {
|
||||
"hashes": [
|
||||
"sha256:303879e36b721603cc54604edcac9d20401bdbe31e1e4fdee5b9f98d5d31dfda",
|
||||
"sha256:d747dd3d23d77ef44c6a3526e274af6efeb0a6f1afd5a69ba4d5be4098c8e141"
|
||||
"sha256:26c0b32e437e54a18161324a2fca3c4b9846b74a8dccddd843113109e1116b32",
|
||||
"sha256:c894d57500a4cd2d5c71114aaab77dbab5eabd9022308ce5ac9bb93a60a6f0c7"
|
||||
],
|
||||
"version": "==2019.1"
|
||||
"version": "==2019.2"
|
||||
},
|
||||
"requests": {
|
||||
"hashes": [
|
||||
@ -327,6 +331,13 @@
|
||||
"index": "pypi",
|
||||
"version": "==5.0.1"
|
||||
},
|
||||
"pytest-clarity": {
|
||||
"hashes": [
|
||||
"sha256:3f40d5ae7cb21cc95e622fc4f50d9466f80ae0f91460225b8c95c07afbf93e20"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==0.2.0a1"
|
||||
},
|
||||
"six": {
|
||||
"hashes": [
|
||||
"sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c",
|
||||
@ -334,6 +345,12 @@
|
||||
],
|
||||
"version": "==1.12.0"
|
||||
},
|
||||
"termcolor": {
|
||||
"hashes": [
|
||||
"sha256:1d6d69ce66211143803fbc56652b41d73b4a400a2891d7bf7a1cdf4c02de613b"
|
||||
],
|
||||
"version": "==1.1.0"
|
||||
},
|
||||
"traitlets": {
|
||||
"hashes": [
|
||||
"sha256:9c4bd2d267b7153df9152698efb1050a5d84982d3384a37b2c1f7723ba3e7835",
|
||||
|
@ -23,7 +23,6 @@ $ git clone https://git.sr.ht/~alanorth/csv-metadata-quality
|
||||
$ cd csv-metadata-quality
|
||||
$ pipenv install
|
||||
$ pipenv shell
|
||||
$ pip install .
|
||||
```
|
||||
|
||||
Otherwise, if you don't have pipenv, you can use a vanilla Python virtual environment:
|
||||
@ -34,7 +33,6 @@ $ cd csv-metadata-quality
|
||||
$ python3 -m venv venv
|
||||
$ source venv/bin/activate
|
||||
$ pip install -r requirements.txt
|
||||
$ pip install .
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
@ -4,6 +4,8 @@ import csv_metadata_quality.check as check
|
||||
import csv_metadata_quality.fix as fix
|
||||
import pandas as pd
|
||||
import re
|
||||
import signal
|
||||
import sys
|
||||
|
||||
|
||||
def parse_args(argv):
|
||||
@ -18,9 +20,16 @@ def parse_args(argv):
|
||||
return args
|
||||
|
||||
|
||||
def signal_handler(signal, frame):
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def run(argv):
|
||||
args = parse_args(argv)
|
||||
|
||||
# set the signal handler for SIGINT (^C)
|
||||
signal.signal(signal.SIGINT, signal_handler)
|
||||
|
||||
# Read all fields as strings so dates don't get converted from 1998 to 1998.0
|
||||
df = pd.read_csv(args.input_file, dtype=str)
|
||||
|
||||
@ -39,7 +48,7 @@ def run(argv):
|
||||
df[column] = df[column].apply(check.separators)
|
||||
|
||||
# Check: suspicious characters
|
||||
df[column] = df[column].apply(check.suspicious_characters)
|
||||
df[column] = df[column].apply(check.suspicious_characters, field_name=column)
|
||||
|
||||
# Fix: invalid multi-value separator
|
||||
if args.unsafe_fixes:
|
||||
@ -77,5 +86,15 @@ def run(argv):
|
||||
if match is not None:
|
||||
df[column] = df[column].apply(check.date)
|
||||
|
||||
# Check: filename extension
|
||||
if column == 'filename':
|
||||
df[column] = df[column].apply(check.filename_extension)
|
||||
|
||||
# Write
|
||||
df.to_csv(args.output_file, index=False)
|
||||
|
||||
# Close the input and output files before exiting
|
||||
args.input_file.close()
|
||||
args.output_file.close()
|
||||
|
||||
sys.exit(0)
|
||||
|
@ -128,7 +128,7 @@ def date(field):
|
||||
return field
|
||||
|
||||
|
||||
def suspicious_characters(field):
|
||||
def suspicious_characters(field, field_name):
|
||||
"""Warn about suspicious characters.
|
||||
|
||||
Look for standalone characters that could indicate encoding or copy/paste
|
||||
@ -143,10 +143,21 @@ def suspicious_characters(field):
|
||||
suspicious_characters = ['\u00B4', '\u02C6', '\u007E', '\u0060']
|
||||
|
||||
for character in suspicious_characters:
|
||||
character_set = set(character)
|
||||
# Find the position of the suspicious character in the string
|
||||
suspicious_character_position = field.find(character)
|
||||
|
||||
if character_set.issubset(field):
|
||||
print(f'Suspicious character: {field}')
|
||||
# Python returns -1 if there is no match
|
||||
if suspicious_character_position != -1:
|
||||
# Create a temporary new string starting from the position of the
|
||||
# suspicious character
|
||||
field_subset = field[suspicious_character_position:]
|
||||
|
||||
# Print part of the metadata value starting from the suspicious
|
||||
# character and spanning enough of the rest to give a preview,
|
||||
# but not too much to cause the line to break in terminals with
|
||||
# a default of 80 characters width.
|
||||
suspicious_character_msg = f'Suspicious character ({field_name}): {field_subset}'
|
||||
print(f'{suspicious_character_msg:1.80}')
|
||||
|
||||
return field
|
||||
|
||||
@ -242,3 +253,48 @@ def agrovoc(field, field_name):
|
||||
print(f'Invalid AGROVOC ({field_name}): {value}')
|
||||
|
||||
return field
|
||||
|
||||
|
||||
def filename_extension(field):
|
||||
"""Check filename extension.
|
||||
|
||||
CSVs with a 'filename' column are likely meant as input for the SAFBuilder
|
||||
tool, which creates a Simple Archive Format bundle for importing metadata
|
||||
with accompanying PDFs or other files into DSpace.
|
||||
|
||||
This check warns if a filename has an uncommon extension (that is, other
|
||||
than .pdf, .xls(x), .doc(x), .ppt(x), case insensitive).
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
# Skip fields with missing values
|
||||
if pd.isna(field):
|
||||
return
|
||||
|
||||
# Try to split multi-value field on "||" separator
|
||||
values = field.split('||')
|
||||
|
||||
# List of common filename extensions
|
||||
common_filename_extensions = ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx']
|
||||
|
||||
# Iterate over all values
|
||||
for value in values:
|
||||
# Assume filename extension does not match
|
||||
filename_extension_match = False
|
||||
|
||||
for filename_extension in common_filename_extensions:
|
||||
# Check for extension at the end of the filename
|
||||
pattern = re.escape(filename_extension) + r'$'
|
||||
match = re.search(pattern, value, re.IGNORECASE)
|
||||
|
||||
if match is not None:
|
||||
# Register the match and stop checking for this filename
|
||||
filename_extension_match = True
|
||||
|
||||
break
|
||||
|
||||
if filename_extension_match is False:
|
||||
print(f'Filename with uncommon extension: {value}')
|
||||
|
||||
return field
|
||||
|
@ -68,14 +68,17 @@ def separators(field):
|
||||
|
||||
|
||||
def unnecessary_unicode(field):
|
||||
"""Remove unnecessary Unicode characters.
|
||||
"""Remove and replace unnecessary Unicode characters.
|
||||
|
||||
Removes unnecessary Unicode characters like:
|
||||
- Zero-width space (U+200B)
|
||||
- Replacement character (U+FFFD)
|
||||
- No-break space (U+00A0)
|
||||
|
||||
Return string with characters removed.
|
||||
Replaces unnecessary Unicode characters like:
|
||||
- Soft hyphen (U+00AD) → hyphen
|
||||
|
||||
Return string with characters removed or replaced.
|
||||
"""
|
||||
|
||||
# Skip fields with missing values
|
||||
@ -106,6 +109,14 @@ def unnecessary_unicode(field):
|
||||
print(f'Removing unnecessary Unicode (U+00A0): {field}')
|
||||
field = re.sub(pattern, '', field)
|
||||
|
||||
# Check for soft hyphens (U+00AD), sometimes preceded by a normal hyphen
|
||||
pattern = re.compile(r'\u002D*?\u00AD')
|
||||
match = re.findall(pattern, field)
|
||||
|
||||
if match:
|
||||
print(f'Replacing unnecessary Unicode (U+00AD): {field}')
|
||||
field = re.sub(pattern, '-', field)
|
||||
|
||||
return field
|
||||
|
||||
|
||||
|
@ -1 +1 @@
|
||||
VERSION = '0.1.0'
|
||||
VERSION = '0.2.1'
|
||||
|
@ -1,23 +1,25 @@
|
||||
dc.contributor.author,birthdate,dc.identifier.issn,dc.identifier.isbn,dc.language.iso,dc.subject,cg.coverage.country
|
||||
Leading space,2019-07-29,,,,,
|
||||
Trailing space ,2019-07-29,,,,,
|
||||
Excessive space,2019-07-29,,,,,
|
||||
Miscellaenous ||whitespace | issues ,2019-07-29,,,,,
|
||||
Duplicate||Duplicate,2019-07-29,,,,,
|
||||
Invalid ISSN,2019-07-29,2321-2302,,,,
|
||||
Invalid ISBN,2019-07-29,,978-0-306-40615-6,,,
|
||||
Multiple valid ISSNs,2019-07-29,0378-5955||0024-9319,,,,
|
||||
Multiple valid ISBNs,2019-07-29,,99921-58-10-7||978-0-306-40615-7,,,
|
||||
Invalid date,2019-07-260,,,,,
|
||||
Multiple dates,2019-07-26||2019-01-10,,,,,
|
||||
Invalid multi-value separator,2019-07-29,0378-5955|0024-9319,,,,
|
||||
Unnecessary Unicode,2019-07-29,,,,,
|
||||
Suspicious character||foreˆt,2019-07-29,,,,,
|
||||
Invalid ISO 639-2 language,2019-07-29,,,jp,,
|
||||
Invalid ISO 639-3 language,2019-07-29,,,chi,,
|
||||
Invalid language,2019-07-29,,,Span,,
|
||||
Invalid AGROVOC subject,2019-07-29,,,,FOREST,
|
||||
dc.contributor.author,birthdate,dc.identifier.issn,dc.identifier.isbn,dc.language.iso,dc.subject,cg.coverage.country,filename
|
||||
Leading space,2019-07-29,,,,,,
|
||||
Trailing space ,2019-07-29,,,,,,
|
||||
Excessive space,2019-07-29,,,,,,
|
||||
Miscellaenous ||whitespace | issues ,2019-07-29,,,,,,
|
||||
Duplicate||Duplicate,2019-07-29,,,,,,
|
||||
Invalid ISSN,2019-07-29,2321-2302,,,,,
|
||||
Invalid ISBN,2019-07-29,,978-0-306-40615-6,,,,
|
||||
Multiple valid ISSNs,2019-07-29,0378-5955||0024-9319,,,,,
|
||||
Multiple valid ISBNs,2019-07-29,,99921-58-10-7||978-0-306-40615-7,,,,
|
||||
Invalid date,2019-07-260,,,,,,
|
||||
Multiple dates,2019-07-26||2019-01-10,,,,,,
|
||||
Invalid multi-value separator,2019-07-29,0378-5955|0024-9319,,,,,
|
||||
Unnecessary Unicode,2019-07-29,,,,,,
|
||||
Suspicious character||foreˆt,2019-07-29,,,,,,
|
||||
Invalid ISO 639-2 language,2019-07-29,,,jp,,,
|
||||
Invalid ISO 639-3 language,2019-07-29,,,chi,,,
|
||||
Invalid language,2019-07-29,,,Span,,,
|
||||
Invalid AGROVOC subject,2019-07-29,,,,FOREST,,
|
||||
Newline (LF),2019-07-30,,,,"TANZA
|
||||
NIA",
|
||||
Missing date,,,,,,
|
||||
Invalid country,2019-08-01,,,,,KENYAA
|
||||
NIA",,
|
||||
Missing date,,,,,,,
|
||||
Invalid country,2019-08-01,,,,,KENYAA,
|
||||
Uncommon filename extension,2019-08-10,,,,,,file.pdf.lck
|
||||
Unneccesary unicode (U+002D + U+00AD),2019-08-10,,978-92-9043-823-6,,,,
|
||||
|
|
@ -23,8 +23,10 @@ pycodestyle==2.5.0
|
||||
pyflakes==2.1.1
|
||||
pygments==2.4.2
|
||||
pyparsing==2.4.2
|
||||
pytest-clarity==0.2.0a1
|
||||
pytest==5.0.1
|
||||
six==1.12.0
|
||||
termcolor==1.1.0
|
||||
traitlets==4.3.2
|
||||
wcwidth==0.1.7
|
||||
zipp==0.5.2
|
||||
|
@ -1,4 +1,5 @@
|
||||
-i https://pypi.org/simple
|
||||
-e .
|
||||
certifi==2019.6.16
|
||||
chardet==3.0.4
|
||||
idna==2.8
|
||||
@ -7,7 +8,7 @@ pandas==0.25.0
|
||||
pycountry==19.7.15
|
||||
python-dateutil==2.8.0
|
||||
python-stdnum==1.11
|
||||
pytz==2019.1
|
||||
pytz==2019.2
|
||||
requests-cache==0.5.0
|
||||
requests==2.22.0
|
||||
six==1.12.0
|
||||
|
2
setup.py
2
setup.py
@ -13,7 +13,7 @@ install_requires = [
|
||||
|
||||
setuptools.setup(
|
||||
name="csv-metadata-quality",
|
||||
version="0.1.0",
|
||||
version="0.2.1",
|
||||
author="Alan Orth",
|
||||
author_email="aorth@mjanja.ch",
|
||||
description="A simple, but opinionated CSV quality checking and fixing pipeline for CSVs in the DSpace ecosystem.",
|
||||
|
@ -112,10 +112,12 @@ def test_check_suspicious_characters(capsys):
|
||||
|
||||
value = 'foreˆt'
|
||||
|
||||
check.suspicious_characters(value)
|
||||
field_name = 'dc.contributor.author'
|
||||
|
||||
check.suspicious_characters(value, field_name)
|
||||
|
||||
captured = capsys.readouterr()
|
||||
assert captured.out == f'Suspicious character: {value}\n'
|
||||
assert captured.out == f'Suspicious character ({field_name}): ˆt\n'
|
||||
|
||||
|
||||
def test_check_valid_iso639_2_language():
|
||||
@ -192,3 +194,24 @@ def test_check_valid_agrovoc():
|
||||
result = check.agrovoc(value, field_name)
|
||||
|
||||
assert result == value
|
||||
|
||||
|
||||
def test_check_uncommon_filename_extension(capsys):
|
||||
'''Test uncommon filename extension.'''
|
||||
|
||||
value = 'file.pdf.lck'
|
||||
|
||||
check.filename_extension(value)
|
||||
|
||||
captured = capsys.readouterr()
|
||||
assert captured.out == f'Filename with uncommon extension: {value}\n'
|
||||
|
||||
|
||||
def test_check_common_filename_extension():
|
||||
'''Test common filename extension.'''
|
||||
|
||||
value = 'file.pdf'
|
||||
|
||||
result = check.filename_extension(value)
|
||||
|
||||
assert result == value
|
||||
|
Reference in New Issue
Block a user