Version 0.2.1

CHANGELOG.md: Move unreleased changes to 0.2.1
CHANGELOG.md: Add note about replacement of unnccesary Unicode
2025-05-10 15:16:01 +02:00 · 2019-08-11 10:39:39 +03:00 · 2019-08-11 10:39:18 +03:00 · 2019-08-11 00:09:35 +03:00 · 2019-08-11 00:08:44 +03:00 · 2019-08-11 00:07:21 +03:00
13 changed files with 184 additions and 41 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,7 +4,16 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

-## [Unreleased]
+## [0.2.1] - 2019-08-11
+### Added
+- Check for uncommon filename extensions 
+- Replacement of unnecessary Unicode characters like soft hyphens (U+00AD)
+
+## [0.2.0] - 2019-08-09
+### Added
+- Handle Ctrl-C interrupt gracefully
+- Make output in suspicious character check more user friendly
+- Add pytest-clarity to dev packages for more user friendly pytest output

 ## [0.1.0] - 2019-08-01
 ### Changed
--- a/Pipfile
+++ b/Pipfile
@@ -7,6 +7,7 @@ verify_ssl = true
 pytest = "*"
 ipython = "*"
 flake8 = "*"
+pytest-clarity = "*"

 [packages]
 pandas = "*"
@@ -15,6 +16,10 @@ xlrd = "*"
 requests = "*"
 requests-cache = "*"
 pycountry = "*"
+csv-metadata-quality = {editable = true,path = "."}

 [requires]
 python_version = "3.7"
+
+[pipenv]
+allow_prereleases = true
--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -1,7 +1,7 @@
 {
    "_meta": {
        "hash": {
-            "sha256": "1c4130ed98fb55545244ba2926f2b4246dc86af7545cb892a45311426f934cae"
+            "sha256": "f8f0a9f208ec41f4d8183ecfc68356b40674b083b2f126c37468b3c9533ba5df"
        },
        "pipfile-spec": 6,
        "requires": {
@@ -30,6 +30,10 @@
            ],
            "version": "==3.0.4"
        },
+        "csv-metadata-quality": {
+            "editable": true,
+            "path": "."
+        },
        "idna": {
            "hashes": [
                "sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407",
@@ -102,10 +106,10 @@
        },
        "pytz": {
            "hashes": [
-                "sha256:303879e36b721603cc54604edcac9d20401bdbe31e1e4fdee5b9f98d5d31dfda",
-                "sha256:d747dd3d23d77ef44c6a3526e274af6efeb0a6f1afd5a69ba4d5be4098c8e141"
+                "sha256:26c0b32e437e54a18161324a2fca3c4b9846b74a8dccddd843113109e1116b32",
+                "sha256:c894d57500a4cd2d5c71114aaab77dbab5eabd9022308ce5ac9bb93a60a6f0c7"
            ],
-            "version": "==2019.1"
+            "version": "==2019.2"
        },
        "requests": {
            "hashes": [
@@ -327,6 +331,13 @@
            "index": "pypi",
            "version": "==5.0.1"
        },
+        "pytest-clarity": {
+            "hashes": [
+                "sha256:3f40d5ae7cb21cc95e622fc4f50d9466f80ae0f91460225b8c95c07afbf93e20"
+            ],
+            "index": "pypi",
+            "version": "==0.2.0a1"
+        },
        "six": {
            "hashes": [
                "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c",
@@ -334,6 +345,12 @@
            ],
            "version": "==1.12.0"
        },
+        "termcolor": {
+            "hashes": [
+                "sha256:1d6d69ce66211143803fbc56652b41d73b4a400a2891d7bf7a1cdf4c02de613b"
+            ],
+            "version": "==1.1.0"
+        },
        "traitlets": {
            "hashes": [
                "sha256:9c4bd2d267b7153df9152698efb1050a5d84982d3384a37b2c1f7723ba3e7835",
--- a/README.md
+++ b/README.md
@@ -23,7 +23,6 @@ $ git clone https://git.sr.ht/~alanorth/csv-metadata-quality
 $ cd csv-metadata-quality
 $ pipenv install
 $ pipenv shell
-$ pip install .
 ```

 Otherwise, if you don't have pipenv, you can use a vanilla Python virtual environment:
@@ -34,7 +33,6 @@ $ cd csv-metadata-quality
 $ python3 -m venv venv
 $ source venv/bin/activate
 $ pip install -r requirements.txt
-$ pip install .
 ```

 ## Usage
--- a/csv_metadata_quality/app.py
+++ b/csv_metadata_quality/app.py
@@ -4,6 +4,8 @@ import csv_metadata_quality.check as check
 import csv_metadata_quality.fix as fix
 import pandas as pd
 import re
+import signal
+import sys


 def parse_args(argv):
@@ -18,9 +20,16 @@ def parse_args(argv):
    return args


+def signal_handler(signal, frame):
+    sys.exit(1)
+
+
 def run(argv):
    args = parse_args(argv)

+    # set the signal handler for SIGINT (^C)
+    signal.signal(signal.SIGINT, signal_handler)
+
    # Read all fields as strings so dates don't get converted from 1998 to 1998.0
    df = pd.read_csv(args.input_file, dtype=str)

@@ -39,7 +48,7 @@ def run(argv):
        df[column] = df[column].apply(check.separators)

        # Check: suspicious characters
-        df[column] = df[column].apply(check.suspicious_characters)
+        df[column] = df[column].apply(check.suspicious_characters, field_name=column)

        # Fix: invalid multi-value separator
        if args.unsafe_fixes:
@@ -77,5 +86,15 @@ def run(argv):
        if match is not None:
            df[column] = df[column].apply(check.date)

+        # Check: filename extension
+        if column == 'filename':
+            df[column] = df[column].apply(check.filename_extension)
+
    # Write
    df.to_csv(args.output_file, index=False)
+
+    # Close the input and output files before exiting
+    args.input_file.close()
+    args.output_file.close()
+
+    sys.exit(0)
--- a/csv_metadata_quality/check.py
+++ b/csv_metadata_quality/check.py
@@ -128,7 +128,7 @@ def date(field):
        return field


-def suspicious_characters(field):
+def suspicious_characters(field, field_name):
    """Warn about suspicious characters.

    Look for standalone characters that could indicate encoding or copy/paste
@@ -143,10 +143,21 @@ def suspicious_characters(field):
    suspicious_characters = ['\u00B4', '\u02C6', '\u007E', '\u0060']

    for character in suspicious_characters:
-        character_set = set(character)
+        # Find the position of the suspicious character in the string
+        suspicious_character_position = field.find(character)

-        if character_set.issubset(field):
-            print(f'Suspicious character: {field}')
+        # Python returns -1 if there is no match
+        if suspicious_character_position != -1:
+            # Create a temporary new string starting from the position of the
+            # suspicious character
+            field_subset = field[suspicious_character_position:]
+
+            # Print part of the metadata value starting from the suspicious
+            # character and spanning enough of the rest to give a preview,
+            # but not too much to cause the line to break in terminals with
+            # a default of 80 characters width.
+            suspicious_character_msg = f'Suspicious character ({field_name}): {field_subset}'
+            print(f'{suspicious_character_msg:1.80}')

    return field

@@ -242,3 +253,48 @@ def agrovoc(field, field_name):
                    print(f'Invalid AGROVOC ({field_name}): {value}')

    return field
+
+
+def filename_extension(field):
+    """Check filename extension.
+
+    CSVs with a 'filename' column are likely meant as input for the SAFBuilder
+    tool, which creates a Simple Archive Format bundle for importing metadata
+    with accompanying PDFs or other files into DSpace.
+
+    This check warns if a filename has an uncommon extension (that is, other
+    than .pdf, .xls(x), .doc(x), .ppt(x), case insensitive).
+    """
+
+    import re
+
+    # Skip fields with missing values
+    if pd.isna(field):
+        return
+
+    # Try to split multi-value field on "||" separator
+    values = field.split('||')
+
+    # List of common filename extensions
+    common_filename_extensions = ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx']
+
+    # Iterate over all values
+    for value in values:
+        # Assume filename extension does not match
+        filename_extension_match = False
+
+        for filename_extension in common_filename_extensions:
+            # Check for extension at the end of the filename
+            pattern = re.escape(filename_extension) + r'$'
+            match = re.search(pattern, value, re.IGNORECASE)
+
+            if match is not None:
+                # Register the match and stop checking for this filename
+                filename_extension_match = True
+
+                break
+
+        if filename_extension_match is False:
+            print(f'Filename with uncommon extension: {value}')
+
+    return field
--- a/csv_metadata_quality/fix.py
+++ b/csv_metadata_quality/fix.py
@@ -68,14 +68,17 @@ def separators(field):


 def unnecessary_unicode(field):
-    """Remove unnecessary Unicode characters.
+    """Remove and replace unnecessary Unicode characters.

    Removes unnecessary Unicode characters like:
        - Zero-width space (U+200B)
        - Replacement character (U+FFFD)
        - No-break space (U+00A0)

-    Return string with characters removed.
+    Replaces unnecessary Unicode characters like:
+        - Soft hyphen (U+00AD) → hyphen
+
+    Return string with characters removed or replaced.
    """

    # Skip fields with missing values
@@ -106,6 +109,14 @@ def unnecessary_unicode(field):
        print(f'Removing unnecessary Unicode (U+00A0): {field}')
        field = re.sub(pattern, '', field)

+    # Check for soft hyphens (U+00AD), sometimes preceded by a normal hyphen
+    pattern = re.compile(r'\u002D*?\u00AD')
+    match = re.findall(pattern, field)
+
+    if match:
+        print(f'Replacing unnecessary Unicode (U+00AD): {field}')
+        field = re.sub(pattern, '-', field)
+
    return field


--- a/csv_metadata_quality/version.py
+++ b/csv_metadata_quality/version.py
@@ -1 +1 @@
-VERSION = '0.1.0'
+VERSION = '0.2.1'
--- a/data/test.csv
+++ b/data/test.csv
@@ -1,23 +1,25 @@
-dc.contributor.author,birthdate,dc.identifier.issn,dc.identifier.isbn,dc.language.iso,dc.subject,cg.coverage.country
- Leading space,2019-07-29,,,,,
-Trailing space ,2019-07-29,,,,,
-Excessive  space,2019-07-29,,,,,
-Miscellaenous ||whitespace | issues ,2019-07-29,,,,,
-Duplicate||Duplicate,2019-07-29,,,,,
-Invalid ISSN,2019-07-29,2321-2302,,,,
-Invalid ISBN,2019-07-29,,978-0-306-40615-6,,,
-Multiple valid ISSNs,2019-07-29,0378-5955||0024-9319,,,,
-Multiple valid ISBNs,2019-07-29,,99921-58-10-7||978-0-306-40615-7,,,
-Invalid date,2019-07-260,,,,,
-Multiple dates,2019-07-26||2019-01-10,,,,,
-Invalid multi-value separator,2019-07-29,0378-5955|0024-9319,,,,
-Unnecessary Unicode,2019-07-29,,,,,
-Suspicious character||foreˆt,2019-07-29,,,,,
-Invalid ISO 639-2 language,2019-07-29,,,jp,,
-Invalid ISO 639-3 language,2019-07-29,,,chi,,
-Invalid language,2019-07-29,,,Span,,
-Invalid AGROVOC subject,2019-07-29,,,,FOREST,
+dc.contributor.author,birthdate,dc.identifier.issn,dc.identifier.isbn,dc.language.iso,dc.subject,cg.coverage.country,filename
+ Leading space,2019-07-29,,,,,,
+Trailing space ,2019-07-29,,,,,,
+Excessive  space,2019-07-29,,,,,,
+Miscellaenous ||whitespace | issues ,2019-07-29,,,,,,
+Duplicate||Duplicate,2019-07-29,,,,,,
+Invalid ISSN,2019-07-29,2321-2302,,,,,
+Invalid ISBN,2019-07-29,,978-0-306-40615-6,,,,
+Multiple valid ISSNs,2019-07-29,0378-5955||0024-9319,,,,,
+Multiple valid ISBNs,2019-07-29,,99921-58-10-7||978-0-306-40615-7,,,,
+Invalid date,2019-07-260,,,,,,
+Multiple dates,2019-07-26||2019-01-10,,,,,,
+Invalid multi-value separator,2019-07-29,0378-5955|0024-9319,,,,,
+Unnecessary Unicode,2019-07-29,,,,,,
+Suspicious character||foreˆt,2019-07-29,,,,,,
+Invalid ISO 639-2 language,2019-07-29,,,jp,,,
+Invalid ISO 639-3 language,2019-07-29,,,chi,,,
+Invalid language,2019-07-29,,,Span,,,
+Invalid AGROVOC subject,2019-07-29,,,,FOREST,,
 Newline (LF),2019-07-30,,,,"TANZA
-NIA",
-Missing date,,,,,,
-Invalid country,2019-08-01,,,,,KENYAA
+NIA",,
+Missing date,,,,,,,
+Invalid country,2019-08-01,,,,,KENYAA,
+Uncommon filename extension,2019-08-10,,,,,,file.pdf.lck
+Unneccesary unicode (U+002D + U+00AD),2019-08-10,,978-92-9043-823-6,,,,
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -23,8 +23,10 @@ pycodestyle==2.5.0
 pyflakes==2.1.1
 pygments==2.4.2
 pyparsing==2.4.2
+pytest-clarity==0.2.0a1
 pytest==5.0.1
 six==1.12.0
+termcolor==1.1.0
 traitlets==4.3.2
 wcwidth==0.1.7
 zipp==0.5.2
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,5 @@
 -i https://pypi.org/simple
+-e .
 certifi==2019.6.16
 chardet==3.0.4
 idna==2.8
@@ -7,7 +8,7 @@ pandas==0.25.0
 pycountry==19.7.15
 python-dateutil==2.8.0
 python-stdnum==1.11
-pytz==2019.1
+pytz==2019.2
 requests-cache==0.5.0
 requests==2.22.0
 six==1.12.0
--- a/setup.py
+++ b/setup.py
@@ -13,7 +13,7 @@ install_requires = [

 setuptools.setup(
    name="csv-metadata-quality",
-    version="0.1.0",
+    version="0.2.1",
    author="Alan Orth",
    author_email="aorth@mjanja.ch",
    description="A simple, but opinionated CSV quality checking and fixing pipeline for CSVs in the DSpace ecosystem.",
--- a/tests/test_check.py
+++ b/tests/test_check.py
@@ -112,10 +112,12 @@ def test_check_suspicious_characters(capsys):

    value = 'foreˆt'

-    check.suspicious_characters(value)
+    field_name = 'dc.contributor.author'
+
+    check.suspicious_characters(value, field_name)

    captured = capsys.readouterr()
-    assert captured.out == f'Suspicious character: {value}\n'
+    assert captured.out == f'Suspicious character ({field_name}): ˆt\n'


 def test_check_valid_iso639_2_language():
@@ -192,3 +194,24 @@ def test_check_valid_agrovoc():
    result = check.agrovoc(value, field_name)

    assert result == value
+
+
+def test_check_uncommon_filename_extension(capsys):
+    '''Test uncommon filename extension.'''
+
+    value = 'file.pdf.lck'
+
+    check.filename_extension(value)
+
+    captured = capsys.readouterr()
+    assert captured.out == f'Filename with uncommon extension: {value}\n'
+
+
+def test_check_common_filename_extension():
+    '''Test common filename extension.'''
+
+    value = 'file.pdf'
+
+    result = check.filename_extension(value)
+
+    assert result == value
Author	SHA1	Message	Date
Alan Orth	7255bf4707	Version 0.2.1	2019-08-11 10:39:39 +03:00
Alan Orth	3aaf18c290	CHANGELOG.md: Move unreleased changes to 0.2.1	2019-08-11 10:39:18 +03:00
Alan Orth	745306edd7	CHANGELOG.md: Add note about replacement of unnccesary Unicode	2019-08-11 00:09:35 +03:00
Alan Orth	e324e321a2	data/test.csv: Add test for replacement of unneccessary Unicode	2019-08-11 00:08:44 +03:00
Alan Orth	232ff99898	csv_metadata_quality/fix.py: Add more unneccessary Unicode fixes Add a check for soft hyphens (U+00AD). In one sample CSV I have a normal hyphen followed by a soft hyphen in an ISBN. This causes the ISBN validation to fail.	2019-08-11 00:07:21 +03:00
Alan Orth	13d5221378	csv_metadata_quality/check.py: Fix test for False	2019-08-10 23:52:53 +03:00
Alan Orth	3c7a9eb75b	CHANGELOG.md: Add check for uncommon filename extensions	2019-08-10 23:47:46 +03:00
Alan Orth	a99fbd8a51	data/test.csv: Add test case for uncommon filename extension	2019-08-10 23:46:56 +03:00
Alan Orth	e801042340	tests/test_check.py: Fix unused result We don't need to capture the function's return value here because pytest will capture stdout from the function.	2019-08-10 23:45:41 +03:00
Alan Orth	62ef2a4489	tests/test_check.py: Add tests for file extensions	2019-08-10 23:44:13 +03:00
Alan Orth	9ce7dc6716	Add check for uncommon filenames Generally we want people to upload documents in accessible formats like PDF, Word, Excel, and PowerPoint. This check warns if a file is using an uncommon extension.	2019-08-10 23:41:16 +03:00
Alan Orth	5ff584a8d7	Version 0.2.0	2019-08-09 01:39:51 +03:00
Alan Orth	4cf7bc182b	Update requirements-dev.txt Generated with: $ pipenv lock -r -d > requirements-dev.txt	2019-08-09 01:34:54 +03:00
Alan Orth	7d3f5aae66	CHANGELOG.md: Add pytest-clarity	2019-08-09 01:33:34 +03:00
Alan Orth	c77c065e25	Update Pipfile.lock	2019-08-09 01:32:53 +03:00
Alan Orth	8fb40d96b1	Pipfile: Add pytest-clarity to dev packages This helps you understand the cryptic assertion error output from pytest. For some reason pytest-clarity is a pre-release package so we need to install it in pipenv with --pre.	2019-08-09 01:30:37 +03:00
Alan Orth	5f2e3ff4bd	CHANGELOG.md: Add improved suspicious character check	2019-08-09 01:28:07 +03:00
Alan Orth	d93c2aae13	tests/test_check.py: Update suspicious character check The suspicious character check was updated to include the name of the field where the metadata value with the suspicious character exists.	2019-08-09 01:26:38 +03:00
Alan Orth	62fea95087	Improve suspicious character detection Now it will print just the part of the metadata value that contains the suspicious character (up to 80 characters, so we don't make the line break on terminals that use 80 character width by default). Also, print the name of the field in which the metadata value is so that it is easier for the user to locate.	2019-08-09 01:25:40 +03:00
Alan Orth	8772bdec51	csv_metadata_quality/app.py: Explicitly exit with success	2019-08-04 09:10:37 +03:00
Alan Orth	6d4ecd75aa	csv_metadata_quality/app.py: Close files before exit	2019-08-04 09:10:19 +03:00
Alan Orth	264ce1d1df	CHANGELOG.md: Add new item for Ctrl-C handling	2019-08-03 22:18:44 +03:00
Alan Orth	f4e7fd73f5	csv_metadata_quality/app.py: Handle Ctrl-C Instead of printing an ugly two-page stack trace.	2019-08-03 21:11:57 +03:00
Alan Orth	a00d3d7ea5	README.md: Simplify installation instructions Pipenv has captured the local dependency with `-e .` so now it gets installed by the Pipfile or requirements.txt.	2019-08-02 11:02:50 +03:00
Alan Orth	f772a3be41	Update python requirements Generated using pipenv: $ pipenv lock -r > requirements.txt	2019-08-02 11:02:25 +03:00
Alan Orth	d1b3e9e375	pipenv install -e .	2019-08-02 10:58:21 +03:00