diff --git a/README.md b/README.md index 262fd90..c57b865 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,7 @@ Requires Python 3.6 or greater. CSV and Excel support comes from the [Pandas](ht - Read/write CSV files - Read Excel files - Validate dates, ISSNs, ISBNs, and multi-value separators ("||") +- Validate languages against ISO 639-2 and ISO 639-3 - Fix leading, trailing, and excessive whitespace - Fix invalid multi-value separators (`|`) using `--unsafe-fixes` - Remove unnecessary Unicode like [non-breaking spaces](https://en.wikipedia.org/wiki/Non-breaking_space), [replacement characters](https://en.wikipedia.org/wiki/Specials_(Unicode_block)#Replacement_character), etc diff --git a/csv_metadata_quality/app.py b/csv_metadata_quality/app.py index c80940a..49e6640 100644 --- a/csv_metadata_quality/app.py +++ b/csv_metadata_quality/app.py @@ -43,6 +43,11 @@ def main(argv): # Fix: duplicate metadata values df[column] = df[column].apply(fix.duplicates) + # Check: invalid language + match = re.match(r'^.*?language.*$', column) + if match is not None: + df[column] = df[column].apply(check.language) + # Check: invalid ISSN match = re.match(r'^.*?issn.*$', column) if match is not None: diff --git a/csv_metadata_quality/check.py b/csv_metadata_quality/check.py index a70e6f9..7502a53 100755 --- a/csv_metadata_quality/check.py +++ b/csv_metadata_quality/check.py @@ -149,3 +149,43 @@ def suspicious_characters(field): print(f'Suspicious character: {field}') return field + + +def language(field): + """Check if a language is valid ISO 639-2 or ISO 639-3. + + Prints the value if it is invalid. + """ + + from iso639 import languages + + # Skip fields with missing values + if pd.isna(field): + return + + # need to handle "Other" values here... + + # Try to split multi-value field on "||" separator + for value in field.split('||'): + + # After splitting, check if language value is 2 or 3 characters so we + # can check it against ISO 639-2 or ISO 639-3 accordingly. 
In iso-639 + # library ISO 639-2 is "part1" and ISO 639-3 is "part3". + if len(value) == 2: + try: + languages.get(part1=value) + except KeyError: + print(f'Invalid ISO 639-2 language: {value}') + + pass + elif len(value) == 3: + try: + languages.get(part3=value) + except KeyError: + print(f'Invalid ISO 639-3 language: {value}') + + pass + else: + print(f'Invalid language: {value}') + + return field diff --git a/data/test.csv b/data/test.csv index d0d2485..27adcce 100644 --- a/data/test.csv +++ b/data/test.csv @@ -1,15 +1,18 @@ -dc.contributor.author,birthdate,dc.identifier.issn,dc.identifier.isbn - Leading space,2019-07-29,, -Trailing space ,2019-07-29,, -Excessive space,2019-07-29,, -Miscellaenous ||whitespace | issues ,2019-07-29,, -Duplicate||Duplicate,2019-07-29,, -Invalid ISSN,2019-07-29,2321-2302, -Invalid ISBN,2019-07-29,,978-0-306-40615-6 -Multiple valid ISSNs,2019-07-29,0378-5955||0024-9319, -Multiple valid ISBNs,2019-07-29,,99921-58-10-7||978-0-306-40615-7 -Invalid date,2019-07-260,, -Multiple dates,2019-07-26||2019-01-10,, -Invalid multi-value separator,,0378-5955|0024-9319, -Unnecessary Unicode​,2019-07-29,, -Suspicious character||foreˆt,2019-07-29,, +dc.contributor.author,birthdate,dc.identifier.issn,dc.identifier.isbn,dc.language.iso + Leading space,2019-07-29,,, +Trailing space ,2019-07-29,,, +Excessive space,2019-07-29,,, +Miscellaenous ||whitespace | issues ,2019-07-29,,, +Duplicate||Duplicate,2019-07-29,,, +Invalid ISSN,2019-07-29,2321-2302,, +Invalid ISBN,2019-07-29,,978-0-306-40615-6, +Multiple valid ISSNs,2019-07-29,0378-5955||0024-9319,, +Multiple valid ISBNs,2019-07-29,,99921-58-10-7||978-0-306-40615-7, +Invalid date,2019-07-260,,, +Multiple dates,2019-07-26||2019-01-10,,, +Invalid multi-value separator,,0378-5955|0024-9319,, +Unnecessary Unicode​,2019-07-29,,, +Suspicious character||foreˆt,2019-07-29,,, +Invalid ISO 639-2 language,2019-07-29,,,jp +Invalid ISO 639-3 language,2019-07-29,,,chi +Invalid language,2019-07-29,,,Span diff --git 
a/tests/test_check.py b/tests/test_check.py index dd45075..527e50b 100644 --- a/tests/test_check.py +++ b/tests/test_check.py @@ -116,3 +116,56 @@ def test_check_suspicious_characters(capsys): captured = capsys.readouterr() assert captured.out == f'Suspicious character: {value}\n' + + +def test_check_valid_iso639_2_language(): + '''Test valid ISO 639-2 language.''' + + value = 'ja' + + result = check.language(value) + + assert result == value + + +def test_check_valid_iso639_3_language(): + '''Test valid ISO 639-3 language.''' + + value = 'eng' + + result = check.language(value) + + assert result == value + + +def test_check_invalid_iso639_2_language(capsys): + '''Test invalid ISO 639-2 language.''' + + value = 'jp' + + check.language(value) + + captured = capsys.readouterr() + assert captured.out == f'Invalid ISO 639-2 language: {value}\n' + + +def test_check_invalid_iso639_3_language(capsys): + '''Test invalid ISO 639-3 language.''' + + value = 'chi' + + check.language(value) + + captured = capsys.readouterr() + assert captured.out == f'Invalid ISO 639-3 language: {value}\n' + + +def test_check_invalid_language(capsys): + '''Test invalid language.''' + + value = 'Span' + + check.language(value) + + captured = capsys.readouterr() + assert captured.out == f'Invalid language: {value}\n'