mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2025-05-08 14:16:00 +02:00
Add support for validating languages
Validates each language value against ISO 639-1 (two-letter codes) or ISO 639-3 (three-letter codes), depending on the length of the value. Values of any other length are reported as invalid. There is currently no support for generic values like "Other".
This commit is contained in:
@ -43,6 +43,11 @@ def main(argv):
|
||||
# Fix: duplicate metadata values
|
||||
df[column] = df[column].apply(fix.duplicates)
|
||||
|
||||
# Check: invalid language
|
||||
match = re.match(r'^.*?language.*$', column)
|
||||
if match is not None:
|
||||
df[column] = df[column].apply(check.language)
|
||||
|
||||
# Check: invalid ISSN
|
||||
match = re.match(r'^.*?issn.*$', column)
|
||||
if match is not None:
|
||||
|
@ -149,3 +149,43 @@ def suspicious_characters(field):
|
||||
print(f'Suspicious character: {field}')
|
||||
|
||||
return field
|
||||
|
||||
|
||||
def language(field):
    """Check if a language is valid ISO 639-1 or ISO 639-3.

    A field may hold several values separated by "||". Each value is looked
    up according to its length: two-character values as ISO 639-1 codes and
    three-character values as ISO 639-3 codes. Values of any other length
    are reported as invalid.

    Prints the value if it is invalid. Always returns the field unchanged,
    so applying this check to a column does not alter its data.
    """

    # Skip fields with missing values, handing them back untouched rather
    # than replacing them with None.
    if pd.isna(field):
        return field

    # Imported after the missing-value guard because the lookup table is
    # only needed once there is an actual value to check.
    from iso639 import languages

    # need to handle "Other" values here...

    # Try to split multi-value field on "||" separator
    for value in field.split('||'):
        # After splitting, check if language value is 2 or 3 characters so
        # we can check it against ISO 639-1 or ISO 639-3 accordingly. In the
        # iso-639 library ISO 639-1 is "part1" and ISO 639-3 is "part3".
        if len(value) == 2:
            try:
                languages.get(part1=value)
            except KeyError:
                print(f'Invalid ISO 639-1 language: {value}')
        elif len(value) == 3:
            try:
                languages.get(part3=value)
            except KeyError:
                print(f'Invalid ISO 639-3 language: {value}')
        else:
            print(f'Invalid language: {value}')

    return field
|
||||
|
Reference in New Issue
Block a user