mirror of
				https://github.com/ilri/csv-metadata-quality.git
				synced 2025-10-31 04:41:17 +01:00 
			
		
		
		
	Compare commits
	
		
			5 Commits
		
	
	
		
			ed084da08c
			...
			14010896a5
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| 14010896a5 | |||
| ab3af2ec62 | |||
| 1aa2084230 | |||
| 330a7b7b9c | |||
| 9a5e3fd6ef | 
| @@ -8,6 +8,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 | |||||||
| ### Changed | ### Changed | ||||||
| - Fixing invalid multi-value separators like `|` and `|||` is no longer class- | - Fixing invalid multi-value separators like `|` and `|||` is no longer class- | ||||||
| ified as "unsafe" as I have yet to see a case where this was intentional | ified as "unsafe" as I have yet to see a case where this was intentional | ||||||
|  | - Not user-visible, but checks now only print a warning to the screen instead | ||||||
|  | of returning a value and re-writing the DataFrame, which should be faster and | ||||||
|  | use less memory | ||||||
|  |  | ||||||
| ### Added | ### Added | ||||||
| - Configurable directory for AGROVOC requests cache (to allow running the web | - Configurable directory for AGROVOC requests cache (to allow running the web | ||||||
|   | |||||||
| @@ -116,6 +116,10 @@ This currently uses the [Python langid](https://github.com/saffsd/langid.py) lib | |||||||
|   - Warn if item is Open Access, but missing a license |   - Warn if item is Open Access, but missing a license | ||||||
|   - Warn if item has an ISSN but no journal title |   - Warn if item has an ISSN but no journal title | ||||||
|   - Update journal titles from ISSN |   - Update journal titles from ISSN | ||||||
|  | - Check for duplicates | ||||||
|  |   - If I check titles only, then I might miss if one is a Report and another is a Presentation | ||||||
|  |   - I could just check each item against each other item, but that sounds slow... | ||||||
|  |   - Perhaps I could check for the number of unique values in a few columns, like title and doi, and see if it is the same as the total number of items | ||||||
|  |  | ||||||
| ## License | ## License | ||||||
| This work is licensed under the [GPLv3](https://www.gnu.org/licenses/gpl-3.0.en.html). | This work is licensed under the [GPLv3](https://www.gnu.org/licenses/gpl-3.0.en.html). | ||||||
|   | |||||||
| @@ -105,7 +105,7 @@ def run(argv): | |||||||
|         df[column] = df[column].apply(fix.unnecessary_unicode) |         df[column] = df[column].apply(fix.unnecessary_unicode) | ||||||
|  |  | ||||||
|         # Check: suspicious characters |         # Check: suspicious characters | ||||||
|         df[column] = df[column].apply(check.suspicious_characters, field_name=column) |         df[column].apply(check.suspicious_characters, field_name=column) | ||||||
|  |  | ||||||
|         # Fix: invalid and unnecessary multi-value separators |         # Fix: invalid and unnecessary multi-value separators | ||||||
|         df[column] = df[column].apply(fix.separators, field_name=column) |         df[column] = df[column].apply(fix.separators, field_name=column) | ||||||
| @@ -120,36 +120,36 @@ def run(argv): | |||||||
|             # Identify fields the user wants to validate against AGROVOC |             # Identify fields the user wants to validate against AGROVOC | ||||||
|             for field in args.agrovoc_fields.split(","): |             for field in args.agrovoc_fields.split(","): | ||||||
|                 if column == field: |                 if column == field: | ||||||
|                     df[column] = df[column].apply(check.agrovoc, field_name=column) |                     df[column].apply(check.agrovoc, field_name=column) | ||||||
|  |  | ||||||
|         # Check: invalid language |         # Check: invalid language | ||||||
|         match = re.match(r"^.*?language.*$", column) |         match = re.match(r"^.*?language.*$", column) | ||||||
|         if match is not None: |         if match is not None: | ||||||
|             df[column] = df[column].apply(check.language) |             df[column].apply(check.language) | ||||||
|  |  | ||||||
|         # Check: invalid ISSN |         # Check: invalid ISSN | ||||||
|         match = re.match(r"^.*?issn.*$", column) |         match = re.match(r"^.*?issn.*$", column) | ||||||
|         if match is not None: |         if match is not None: | ||||||
|             df[column] = df[column].apply(check.issn) |             df[column].apply(check.issn) | ||||||
|  |  | ||||||
|         # Check: invalid ISBN |         # Check: invalid ISBN | ||||||
|         match = re.match(r"^.*?isbn.*$", column) |         match = re.match(r"^.*?isbn.*$", column) | ||||||
|         if match is not None: |         if match is not None: | ||||||
|             df[column] = df[column].apply(check.isbn) |             df[column].apply(check.isbn) | ||||||
|  |  | ||||||
|         # Check: invalid date |         # Check: invalid date | ||||||
|         match = re.match(r"^.*?(date|dcterms\.issued).*$", column) |         match = re.match(r"^.*?(date|dcterms\.issued).*$", column) | ||||||
|         if match is not None: |         if match is not None: | ||||||
|             df[column] = df[column].apply(check.date, field_name=column) |             df[column].apply(check.date, field_name=column) | ||||||
|  |  | ||||||
|         # Check: filename extension |         # Check: filename extension | ||||||
|         if column == "filename": |         if column == "filename": | ||||||
|             df[column] = df[column].apply(check.filename_extension) |             df[column].apply(check.filename_extension) | ||||||
|  |  | ||||||
|         # Check: SPDX license identifier |         # Check: SPDX license identifier | ||||||
|         match = re.match(r"dcterms\.license.*$", column) |         match = re.match(r"dcterms\.license.*$", column) | ||||||
|         if match is not None: |         if match is not None: | ||||||
|             df[column] = df[column].apply(check.spdx_license_identifier) |             df[column].apply(check.spdx_license_identifier) | ||||||
|  |  | ||||||
|     ## |     ## | ||||||
|     # Perform some checks on rows so we can consider items as a whole rather |     # Perform some checks on rows so we can consider items as a whole rather | ||||||
|   | |||||||
| @@ -32,7 +32,7 @@ def issn(field): | |||||||
|         if not stdnum_issn.is_valid(value): |         if not stdnum_issn.is_valid(value): | ||||||
|             print(f"{Fore.RED}Invalid ISSN: {Fore.RESET}{value}") |             print(f"{Fore.RED}Invalid ISSN: {Fore.RESET}{value}") | ||||||
|  |  | ||||||
|     return field |     return | ||||||
|  |  | ||||||
|  |  | ||||||
| def isbn(field): | def isbn(field): | ||||||
| @@ -55,7 +55,7 @@ def isbn(field): | |||||||
|         if not stdnum_isbn.is_valid(value): |         if not stdnum_isbn.is_valid(value): | ||||||
|             print(f"{Fore.RED}Invalid ISBN: {Fore.RESET}{value}") |             print(f"{Fore.RED}Invalid ISBN: {Fore.RESET}{value}") | ||||||
|  |  | ||||||
|     return field |     return | ||||||
|  |  | ||||||
|  |  | ||||||
| def date(field, field_name): | def date(field, field_name): | ||||||
| @@ -83,13 +83,13 @@ def date(field, field_name): | |||||||
|             f"{Fore.RED}Multiple dates not allowed ({field_name}): {Fore.RESET}{field}" |             f"{Fore.RED}Multiple dates not allowed ({field_name}): {Fore.RESET}{field}" | ||||||
|         ) |         ) | ||||||
|  |  | ||||||
|         return field |         return | ||||||
|  |  | ||||||
|     try: |     try: | ||||||
|         # Check if date is valid YYYY format |         # Check if date is valid YYYY format | ||||||
|         datetime.strptime(field, "%Y") |         datetime.strptime(field, "%Y") | ||||||
|  |  | ||||||
|         return field |         return | ||||||
|     except ValueError: |     except ValueError: | ||||||
|         pass |         pass | ||||||
|  |  | ||||||
| @@ -97,7 +97,7 @@ def date(field, field_name): | |||||||
|         # Check if date is valid YYYY-MM format |         # Check if date is valid YYYY-MM format | ||||||
|         datetime.strptime(field, "%Y-%m") |         datetime.strptime(field, "%Y-%m") | ||||||
|  |  | ||||||
|         return field |         return | ||||||
|     except ValueError: |     except ValueError: | ||||||
|         pass |         pass | ||||||
|  |  | ||||||
| @@ -105,7 +105,7 @@ def date(field, field_name): | |||||||
|         # Check if date is valid YYYY-MM-DD format |         # Check if date is valid YYYY-MM-DD format | ||||||
|         datetime.strptime(field, "%Y-%m-%d") |         datetime.strptime(field, "%Y-%m-%d") | ||||||
|  |  | ||||||
|         return field |         return | ||||||
|     except ValueError: |     except ValueError: | ||||||
|         pass |         pass | ||||||
|  |  | ||||||
| @@ -113,11 +113,11 @@ def date(field, field_name): | |||||||
|         # Check if date is valid YYYY-MM-DDTHH:MM:SSZ format |         # Check if date is valid YYYY-MM-DDTHH:MM:SSZ format | ||||||
|         datetime.strptime(field, "%Y-%m-%dT%H:%M:%SZ") |         datetime.strptime(field, "%Y-%m-%dT%H:%M:%SZ") | ||||||
|  |  | ||||||
|         return field |         return | ||||||
|     except ValueError: |     except ValueError: | ||||||
|         print(f"{Fore.RED}Invalid date ({field_name}): {Fore.RESET}{field}") |         print(f"{Fore.RED}Invalid date ({field_name}): {Fore.RESET}{field}") | ||||||
|  |  | ||||||
|         return field |         return | ||||||
|  |  | ||||||
|  |  | ||||||
| def suspicious_characters(field, field_name): | def suspicious_characters(field, field_name): | ||||||
| @@ -151,7 +151,7 @@ def suspicious_characters(field, field_name): | |||||||
|             suspicious_character_msg = f"{Fore.YELLOW}Suspicious character ({field_name}): {Fore.RESET}{field_subset}" |             suspicious_character_msg = f"{Fore.YELLOW}Suspicious character ({field_name}): {Fore.RESET}{field_subset}" | ||||||
|             print(f"{suspicious_character_msg:1.80}") |             print(f"{suspicious_character_msg:1.80}") | ||||||
|  |  | ||||||
|     return field |     return | ||||||
|  |  | ||||||
|  |  | ||||||
| def language(field): | def language(field): | ||||||
| @@ -184,7 +184,7 @@ def language(field): | |||||||
|         else: |         else: | ||||||
|             print(f"{Fore.RED}Invalid language: {Fore.RESET}{value}") |             print(f"{Fore.RED}Invalid language: {Fore.RESET}{value}") | ||||||
|  |  | ||||||
|     return field |     return | ||||||
|  |  | ||||||
|  |  | ||||||
| def agrovoc(field, field_name): | def agrovoc(field, field_name): | ||||||
| @@ -211,7 +211,9 @@ def agrovoc(field, field_name): | |||||||
|     # running in an environment where we can't write to the current working di- |     # running in an environment where we can't write to the current working di- | ||||||
|     # rectory (for example from csv-metadata-quality-web). |     # rectory (for example from csv-metadata-quality-web). | ||||||
|     REQUESTS_CACHE_DIR = os.environ.get("REQUESTS_CACHE_DIR", ".") |     REQUESTS_CACHE_DIR = os.environ.get("REQUESTS_CACHE_DIR", ".") | ||||||
|     requests_cache.install_cache(f"{REQUESTS_CACHE_DIR}/agrovoc-response-cache", expire_after=expire_after) |     requests_cache.install_cache( | ||||||
|  |         f"{REQUESTS_CACHE_DIR}/agrovoc-response-cache", expire_after=expire_after | ||||||
|  |     ) | ||||||
|  |  | ||||||
|     # prune old cache entries |     # prune old cache entries | ||||||
|     requests_cache.core.remove_expired_responses() |     requests_cache.core.remove_expired_responses() | ||||||
| @@ -230,7 +232,7 @@ def agrovoc(field, field_name): | |||||||
|             if len(data["results"]) == 0: |             if len(data["results"]) == 0: | ||||||
|                 print(f"{Fore.RED}Invalid AGROVOC ({field_name}): {Fore.RESET}{value}") |                 print(f"{Fore.RED}Invalid AGROVOC ({field_name}): {Fore.RESET}{value}") | ||||||
|  |  | ||||||
|     return field |     return | ||||||
|  |  | ||||||
|  |  | ||||||
| def filename_extension(field): | def filename_extension(field): | ||||||
| @@ -281,7 +283,7 @@ def filename_extension(field): | |||||||
|         if filename_extension_match is False: |         if filename_extension_match is False: | ||||||
|             print(f"{Fore.YELLOW}Filename with uncommon extension: {Fore.RESET}{value}") |             print(f"{Fore.YELLOW}Filename with uncommon extension: {Fore.RESET}{value}") | ||||||
|  |  | ||||||
|     return field |     return | ||||||
|  |  | ||||||
|  |  | ||||||
| def spdx_license_identifier(field): | def spdx_license_identifier(field): | ||||||
| @@ -301,4 +303,4 @@ def spdx_license_identifier(field): | |||||||
|  |  | ||||||
|             pass |             pass | ||||||
|  |  | ||||||
|     return field |     return | ||||||
|   | |||||||
| @@ -1,5 +1,9 @@ | |||||||
|  | import re | ||||||
|  |  | ||||||
|  | import langid | ||||||
| import pandas as pd | import pandas as pd | ||||||
| from colorama import Fore | from colorama import Fore | ||||||
|  | from pycountry import languages | ||||||
|  |  | ||||||
|  |  | ||||||
| def correct_language(row): | def correct_language(row): | ||||||
| @@ -11,11 +15,6 @@ def correct_language(row): | |||||||
|     language and returns the value in the language field if it does match. |     language and returns the value in the language field if it does match. | ||||||
|     """ |     """ | ||||||
|  |  | ||||||
|     import re |  | ||||||
|  |  | ||||||
|     import langid |  | ||||||
|     from pycountry import languages |  | ||||||
|  |  | ||||||
|     # Initialize some variables at global scope so that we can set them in the |     # Initialize some variables at global scope so that we can set them in the | ||||||
|     # loop scope below and still be able to access them afterwards. |     # loop scope below and still be able to access them afterwards. | ||||||
|     language = "" |     language = "" | ||||||
| @@ -94,4 +93,4 @@ def correct_language(row): | |||||||
|         ) |         ) | ||||||
|  |  | ||||||
|     else: |     else: | ||||||
|         return language |         return | ||||||
|   | |||||||
| @@ -23,7 +23,7 @@ def test_check_valid_issn(): | |||||||
|  |  | ||||||
|     result = check.issn(value) |     result = check.issn(value) | ||||||
|  |  | ||||||
|     assert result == value |     assert result == None | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_check_invalid_isbn(capsys): | def test_check_invalid_isbn(capsys): | ||||||
| @@ -44,7 +44,7 @@ def test_check_valid_isbn(): | |||||||
|  |  | ||||||
|     result = check.isbn(value) |     result = check.isbn(value) | ||||||
|  |  | ||||||
|     assert result == value |     assert result == None | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_check_missing_date(capsys): | def test_check_missing_date(capsys): | ||||||
| @@ -100,7 +100,7 @@ def test_check_valid_date(): | |||||||
|  |  | ||||||
|     result = check.date(value, field_name) |     result = check.date(value, field_name) | ||||||
|  |  | ||||||
|     assert result == value |     assert result == None | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_check_suspicious_characters(capsys): | def test_check_suspicious_characters(capsys): | ||||||
| @@ -126,7 +126,7 @@ def test_check_valid_iso639_1_language(): | |||||||
|  |  | ||||||
|     result = check.language(value) |     result = check.language(value) | ||||||
|  |  | ||||||
|     assert result == value |     assert result == None | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_check_valid_iso639_3_language(): | def test_check_valid_iso639_3_language(): | ||||||
| @@ -136,7 +136,7 @@ def test_check_valid_iso639_3_language(): | |||||||
|  |  | ||||||
|     result = check.language(value) |     result = check.language(value) | ||||||
|  |  | ||||||
|     assert result == value |     assert result == None | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_check_invalid_iso639_1_language(capsys): | def test_check_invalid_iso639_1_language(capsys): | ||||||
| @@ -199,7 +199,7 @@ def test_check_valid_agrovoc(): | |||||||
|  |  | ||||||
|     result = check.agrovoc(value, field_name) |     result = check.agrovoc(value, field_name) | ||||||
|  |  | ||||||
|     assert result == value |     assert result == None | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_check_uncommon_filename_extension(capsys): | def test_check_uncommon_filename_extension(capsys): | ||||||
| @@ -223,7 +223,7 @@ def test_check_common_filename_extension(): | |||||||
|  |  | ||||||
|     result = check.filename_extension(value) |     result = check.filename_extension(value) | ||||||
|  |  | ||||||
|     assert result == value |     assert result == None | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_check_incorrect_iso_639_1_language(capsys): | def test_check_incorrect_iso_639_1_language(capsys): | ||||||
| @@ -276,7 +276,7 @@ def test_check_correct_iso_639_1_language(): | |||||||
|  |  | ||||||
|     result = experimental.correct_language(series) |     result = experimental.correct_language(series) | ||||||
|  |  | ||||||
|     assert result == language |     assert result == None | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_check_correct_iso_639_3_language(): | def test_check_correct_iso_639_3_language(): | ||||||
| @@ -291,7 +291,7 @@ def test_check_correct_iso_639_3_language(): | |||||||
|  |  | ||||||
|     result = experimental.correct_language(series) |     result = experimental.correct_language(series) | ||||||
|  |  | ||||||
|     assert result == language |     assert result == None | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_check_valid_spdx_license_identifier(): | def test_check_valid_spdx_license_identifier(): | ||||||
| @@ -301,7 +301,7 @@ def test_check_valid_spdx_license_identifier(): | |||||||
|  |  | ||||||
|     result = check.spdx_license_identifier(license) |     result = check.spdx_license_identifier(license) | ||||||
|  |  | ||||||
|     assert result == license |     assert result == None | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_check_invalid_spdx_license_identifier(capsys): | def test_check_invalid_spdx_license_identifier(capsys): | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user