1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2025-05-08 14:16:00 +02:00

Add support for validating languages

Validates each language value against ISO 639-1 (two-letter codes) or
ISO 639-3 (three-letter codes), depending on the length of the value.
Values of any other length are reported as invalid.

Does not currently have any support for generic values like "Other".
This commit is contained in:
2019-07-29 18:59:42 +03:00
parent 1978fa7b48
commit a36454a3ac
5 changed files with 117 additions and 15 deletions

View File

@ -43,6 +43,11 @@ def main(argv):
# Fix: duplicate metadata values
df[column] = df[column].apply(fix.duplicates)
# Check: invalid language
match = re.match(r'^.*?language.*$', column)
if match is not None:
df[column] = df[column].apply(check.language)
# Check: invalid ISSN
match = re.match(r'^.*?issn.*$', column)
if match is not None:

View File

@ -149,3 +149,43 @@ def suspicious_characters(field):
print(f'Suspicious character: {field}')
return field
def language(field):
    """Check if a language is valid ISO 639-1 or ISO 639-3.

    The field may hold several values separated by "||"; each value is
    checked on its own. Two-character values are looked up as ISO 639-1
    codes and three-character values as ISO 639-3 codes. Values of any
    other length are reported as invalid without a lookup.

    Prints each value that is invalid.

    Returns the field unchanged, or None when the field is a missing value.
    """

    # Skip fields with missing values. This is done before importing the
    # language table because there is nothing to validate in that case.
    if pd.isna(field):
        return None

    from iso639 import languages

    # need to handle "Other" values here...

    # Try to split multi-value field on "||" separator
    for value in field.split('||'):
        # After splitting, check if language value is 2 or 3 characters so we
        # can check it against ISO 639-1 or ISO 639-3 accordingly. In iso-639
        # library ISO 639-1 is "part1" and ISO 639-3 is "part3".
        if len(value) == 2:
            try:
                # The lookup result is not needed; only a KeyError matters.
                languages.get(part1=value)
            except KeyError:
                print(f'Invalid ISO 639-1 language: {value}')
        elif len(value) == 3:
            try:
                languages.get(part3=value)
            except KeyError:
                print(f'Invalid ISO 639-3 language: {value}')
        else:
            print(f'Invalid language: {value}')

    return field