Add support for validating languages

Validates each language value against ISO 639-1 (two-letter codes) or ISO 639-3 (three-letter codes) depending on its length; a value of any other length is reported as invalid. Does not currently have any support for generic values like "Other".
2025-07-03 21:13:28 +02:00 · 2019-07-29 18:59:42 +03:00
parent 1978fa7b48
commit a36454a3ac
5 changed files with 117 additions and 15 deletions
--- a/csv_metadata_quality/app.py
+++ b/csv_metadata_quality/app.py
@ -43,6 +43,11 @@ def main(argv):
        # Fix: duplicate metadata values
        df[column] = df[column].apply(fix.duplicates)

+        # Check: invalid language
+        match = re.match(r'^.*?language.*$', column)
+        if match is not None:
+            df[column] = df[column].apply(check.language)
+
        # Check: invalid ISSN
        match = re.match(r'^.*?issn.*$', column)
        if match is not None:
--- a/csv_metadata_quality/check.py
+++ b/csv_metadata_quality/check.py
@ -149,3 +149,43 @@ def suspicious_characters(field):
            print(f'Suspicious character: {field}')

    return field
+
+
+def language(field):
+    """Check if a language is valid ISO 639-1 or ISO 639-3.
+
+    Splits multi-value fields on "||" and looks each value up by length:
+    two characters as ISO 639-1 (Alpha-2), three as ISO 639-3. Prints the
+    value if it is invalid. Generic values like "Other" are not handled
+    yet and are reported as invalid.
+
+    Returns the field unchanged, or None when the value is missing.
+    """
+
+    from iso639 import languages
+
+    # Skip fields with missing values
+    if pd.isna(field):
+        return
+
+    # Try to split multi-value field on "||" separator
+    for value in field.split('||'):
+
+        # After splitting, check if language value is 2 or 3 characters so we
+        # can check it against ISO 639-1 or ISO 639-3 accordingly. In iso-639
+        # library ISO 639-1 is "part1" and ISO 639-3 is "part3"; a failed
+        # lookup is caught here as KeyError.
+        if len(value) == 2:
+            try:
+                languages.get(part1=value)
+            except KeyError:
+                print(f'Invalid ISO 639-1 language: {value}')
+        elif len(value) == 3:
+            try:
+                languages.get(part3=value)
+            except KeyError:
+                print(f'Invalid ISO 639-3 language: {value}')
+        else:
+            print(f'Invalid language: {value}')
+
+    return field