mirror of
				https://github.com/ilri/csv-metadata-quality.git
				synced 2025-10-30 20:31:14 +01:00 
			
		
		
		
	Compare commits
	
		
			5 Commits
		
	
	
		
			ed084da08c
			...
			14010896a5
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| 14010896a5 | |||
| ab3af2ec62 | |||
| 1aa2084230 | |||
| 330a7b7b9c | |||
| 9a5e3fd6ef | 
| @@ -8,6 +8,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 | ||||
| ### Changed | ||||
| - Fixing invalid multi-value separators like `|` and `|||` is no longer | ||||
| classified as "unsafe" as I have yet to see a case where this was intentional | ||||
| - Not user visible, but now checks only print a warning to the screen instead | ||||
| of returning a value and re-writing the DataFrame, which should be faster and | ||||
| use less memory | ||||
|  | ||||
| ### Added | ||||
| - Configurable directory for AGROVOC requests cache (to allow running the web | ||||
|   | ||||
							
								
								
									
										12
									
								
								README.md
									
									
									
									
									
								
							
							
						
						
									
										12
									
								
								README.md
									
									
									
									
									
								
							| @@ -112,10 +112,14 @@ This currently uses the [Python langid](https://github.com/saffsd/langid.py) lib | ||||
| - Add configurable field validation, such as specifying a field name and a validation file? | ||||
|   - Perhaps like --validate=field.name,filename | ||||
| - Add some row-based item sanity checks and fixes: | ||||
|     - Warn if item is Open Access, but missing a filename or URL | ||||
|     - Warn if item is Open Access, but missing a license | ||||
|     - Warn if item has an ISSN but no journal title | ||||
|     - Update journal titles from ISSN | ||||
|   - Warn if item is Open Access, but missing a filename or URL | ||||
|   - Warn if item is Open Access, but missing a license | ||||
|   - Warn if item has an ISSN but no journal title | ||||
|   - Update journal titles from ISSN | ||||
| - Check for duplicates | ||||
|   - If I check titles only, then I might miss if one is a Report and another is a Presentation | ||||
|   - I could just check each item against each other item, but that sounds slow... | ||||
|   - Perhaps I could check the number of unique values in a few columns, like title and doi, and see if it is the same as the total number of items | ||||
|  | ||||
| ## License | ||||
| This work is licensed under the [GPLv3](https://www.gnu.org/licenses/gpl-3.0.en.html). | ||||
|   | ||||
| @@ -105,7 +105,7 @@ def run(argv): | ||||
|         df[column] = df[column].apply(fix.unnecessary_unicode) | ||||
|  | ||||
|         # Check: suspicious characters | ||||
|         df[column] = df[column].apply(check.suspicious_characters, field_name=column) | ||||
|         df[column].apply(check.suspicious_characters, field_name=column) | ||||
|  | ||||
|         # Fix: invalid and unnecessary multi-value separators | ||||
|         df[column] = df[column].apply(fix.separators, field_name=column) | ||||
| @@ -120,36 +120,36 @@ def run(argv): | ||||
|             # Identify fields the user wants to validate against AGROVOC | ||||
|             for field in args.agrovoc_fields.split(","): | ||||
|                 if column == field: | ||||
|                     df[column] = df[column].apply(check.agrovoc, field_name=column) | ||||
|                     df[column].apply(check.agrovoc, field_name=column) | ||||
|  | ||||
|         # Check: invalid language | ||||
|         match = re.match(r"^.*?language.*$", column) | ||||
|         if match is not None: | ||||
|             df[column] = df[column].apply(check.language) | ||||
|             df[column].apply(check.language) | ||||
|  | ||||
|         # Check: invalid ISSN | ||||
|         match = re.match(r"^.*?issn.*$", column) | ||||
|         if match is not None: | ||||
|             df[column] = df[column].apply(check.issn) | ||||
|             df[column].apply(check.issn) | ||||
|  | ||||
|         # Check: invalid ISBN | ||||
|         match = re.match(r"^.*?isbn.*$", column) | ||||
|         if match is not None: | ||||
|             df[column] = df[column].apply(check.isbn) | ||||
|             df[column].apply(check.isbn) | ||||
|  | ||||
|         # Check: invalid date | ||||
|         match = re.match(r"^.*?(date|dcterms\.issued).*$", column) | ||||
|         if match is not None: | ||||
|             df[column] = df[column].apply(check.date, field_name=column) | ||||
|             df[column].apply(check.date, field_name=column) | ||||
|  | ||||
|         # Check: filename extension | ||||
|         if column == "filename": | ||||
|             df[column] = df[column].apply(check.filename_extension) | ||||
|             df[column].apply(check.filename_extension) | ||||
|  | ||||
|         # Check: SPDX license identifier | ||||
|         match = re.match(r"dcterms\.license.*$", column) | ||||
|         if match is not None: | ||||
|             df[column] = df[column].apply(check.spdx_license_identifier) | ||||
|             df[column].apply(check.spdx_license_identifier) | ||||
|  | ||||
|     ## | ||||
|     # Perform some checks on rows so we can consider items as a whole rather | ||||
|   | ||||
| @@ -32,7 +32,7 @@ def issn(field): | ||||
|         if not stdnum_issn.is_valid(value): | ||||
|             print(f"{Fore.RED}Invalid ISSN: {Fore.RESET}{value}") | ||||
|  | ||||
|     return field | ||||
|     return | ||||
|  | ||||
|  | ||||
| def isbn(field): | ||||
| @@ -55,7 +55,7 @@ def isbn(field): | ||||
|         if not stdnum_isbn.is_valid(value): | ||||
|             print(f"{Fore.RED}Invalid ISBN: {Fore.RESET}{value}") | ||||
|  | ||||
|     return field | ||||
|     return | ||||
|  | ||||
|  | ||||
| def date(field, field_name): | ||||
| @@ -83,13 +83,13 @@ def date(field, field_name): | ||||
|             f"{Fore.RED}Multiple dates not allowed ({field_name}): {Fore.RESET}{field}" | ||||
|         ) | ||||
|  | ||||
|         return field | ||||
|         return | ||||
|  | ||||
|     try: | ||||
|         # Check if date is valid YYYY format | ||||
|         datetime.strptime(field, "%Y") | ||||
|  | ||||
|         return field | ||||
|         return | ||||
|     except ValueError: | ||||
|         pass | ||||
|  | ||||
| @@ -97,7 +97,7 @@ def date(field, field_name): | ||||
|         # Check if date is valid YYYY-MM format | ||||
|         datetime.strptime(field, "%Y-%m") | ||||
|  | ||||
|         return field | ||||
|         return | ||||
|     except ValueError: | ||||
|         pass | ||||
|  | ||||
| @@ -105,7 +105,7 @@ def date(field, field_name): | ||||
|         # Check if date is valid YYYY-MM-DD format | ||||
|         datetime.strptime(field, "%Y-%m-%d") | ||||
|  | ||||
|         return field | ||||
|         return | ||||
|     except ValueError: | ||||
|         pass | ||||
|  | ||||
| @@ -113,11 +113,11 @@ def date(field, field_name): | ||||
|         # Check if date is valid YYYY-MM-DDTHH:MM:SSZ format | ||||
|         datetime.strptime(field, "%Y-%m-%dT%H:%M:%SZ") | ||||
|  | ||||
|         return field | ||||
|         return | ||||
|     except ValueError: | ||||
|         print(f"{Fore.RED}Invalid date ({field_name}): {Fore.RESET}{field}") | ||||
|  | ||||
|         return field | ||||
|         return | ||||
|  | ||||
|  | ||||
| def suspicious_characters(field, field_name): | ||||
| @@ -151,7 +151,7 @@ def suspicious_characters(field, field_name): | ||||
|             suspicious_character_msg = f"{Fore.YELLOW}Suspicious character ({field_name}): {Fore.RESET}{field_subset}" | ||||
|             print(f"{suspicious_character_msg:1.80}") | ||||
|  | ||||
|     return field | ||||
|     return | ||||
|  | ||||
|  | ||||
| def language(field): | ||||
| @@ -184,7 +184,7 @@ def language(field): | ||||
|         else: | ||||
|             print(f"{Fore.RED}Invalid language: {Fore.RESET}{value}") | ||||
|  | ||||
|     return field | ||||
|     return | ||||
|  | ||||
|  | ||||
| def agrovoc(field, field_name): | ||||
| @@ -211,7 +211,9 @@ def agrovoc(field, field_name): | ||||
|     # running in an environment where we can't write to the current working di- | ||||
|     # rectory (for example from csv-metadata-quality-web). | ||||
|     REQUESTS_CACHE_DIR = os.environ.get("REQUESTS_CACHE_DIR", ".") | ||||
|     requests_cache.install_cache(f"{REQUESTS_CACHE_DIR}/agrovoc-response-cache", expire_after=expire_after) | ||||
|     requests_cache.install_cache( | ||||
|         f"{REQUESTS_CACHE_DIR}/agrovoc-response-cache", expire_after=expire_after | ||||
|     ) | ||||
|  | ||||
|     # prune old cache entries | ||||
|     requests_cache.core.remove_expired_responses() | ||||
| @@ -230,7 +232,7 @@ def agrovoc(field, field_name): | ||||
|             if len(data["results"]) == 0: | ||||
|                 print(f"{Fore.RED}Invalid AGROVOC ({field_name}): {Fore.RESET}{value}") | ||||
|  | ||||
|     return field | ||||
|     return | ||||
|  | ||||
|  | ||||
| def filename_extension(field): | ||||
| @@ -281,7 +283,7 @@ def filename_extension(field): | ||||
|         if filename_extension_match is False: | ||||
|             print(f"{Fore.YELLOW}Filename with uncommon extension: {Fore.RESET}{value}") | ||||
|  | ||||
|     return field | ||||
|     return | ||||
|  | ||||
|  | ||||
| def spdx_license_identifier(field): | ||||
| @@ -301,4 +303,4 @@ def spdx_license_identifier(field): | ||||
|  | ||||
|             pass | ||||
|  | ||||
|     return field | ||||
|     return | ||||
|   | ||||
| @@ -1,5 +1,9 @@ | ||||
| import re | ||||
|  | ||||
| import langid | ||||
| import pandas as pd | ||||
| from colorama import Fore | ||||
| from pycountry import languages | ||||
|  | ||||
|  | ||||
| def correct_language(row): | ||||
| @@ -11,11 +15,6 @@ def correct_language(row): | ||||
|     language and returns the value in the language field if it does match. | ||||
|     """ | ||||
|  | ||||
|     import re | ||||
|  | ||||
|     import langid | ||||
|     from pycountry import languages | ||||
|  | ||||
|     # Initialize some variables at global scope so that we can set them in the | ||||
|     # loop scope below and still be able to access them afterwards. | ||||
|     language = "" | ||||
| @@ -94,4 +93,4 @@ def correct_language(row): | ||||
|         ) | ||||
|  | ||||
|     else: | ||||
|         return language | ||||
|         return | ||||
|   | ||||
| @@ -23,7 +23,7 @@ def test_check_valid_issn(): | ||||
|  | ||||
|     result = check.issn(value) | ||||
|  | ||||
|     assert result == value | ||||
|     assert result == None | ||||
|  | ||||
|  | ||||
| def test_check_invalid_isbn(capsys): | ||||
| @@ -44,7 +44,7 @@ def test_check_valid_isbn(): | ||||
|  | ||||
|     result = check.isbn(value) | ||||
|  | ||||
|     assert result == value | ||||
|     assert result == None | ||||
|  | ||||
|  | ||||
| def test_check_missing_date(capsys): | ||||
| @@ -100,7 +100,7 @@ def test_check_valid_date(): | ||||
|  | ||||
|     result = check.date(value, field_name) | ||||
|  | ||||
|     assert result == value | ||||
|     assert result == None | ||||
|  | ||||
|  | ||||
| def test_check_suspicious_characters(capsys): | ||||
| @@ -126,7 +126,7 @@ def test_check_valid_iso639_1_language(): | ||||
|  | ||||
|     result = check.language(value) | ||||
|  | ||||
|     assert result == value | ||||
|     assert result == None | ||||
|  | ||||
|  | ||||
| def test_check_valid_iso639_3_language(): | ||||
| @@ -136,7 +136,7 @@ def test_check_valid_iso639_3_language(): | ||||
|  | ||||
|     result = check.language(value) | ||||
|  | ||||
|     assert result == value | ||||
|     assert result == None | ||||
|  | ||||
|  | ||||
| def test_check_invalid_iso639_1_language(capsys): | ||||
| @@ -199,7 +199,7 @@ def test_check_valid_agrovoc(): | ||||
|  | ||||
|     result = check.agrovoc(value, field_name) | ||||
|  | ||||
|     assert result == value | ||||
|     assert result == None | ||||
|  | ||||
|  | ||||
| def test_check_uncommon_filename_extension(capsys): | ||||
| @@ -223,7 +223,7 @@ def test_check_common_filename_extension(): | ||||
|  | ||||
|     result = check.filename_extension(value) | ||||
|  | ||||
|     assert result == value | ||||
|     assert result == None | ||||
|  | ||||
|  | ||||
| def test_check_incorrect_iso_639_1_language(capsys): | ||||
| @@ -276,7 +276,7 @@ def test_check_correct_iso_639_1_language(): | ||||
|  | ||||
|     result = experimental.correct_language(series) | ||||
|  | ||||
|     assert result == language | ||||
|     assert result == None | ||||
|  | ||||
|  | ||||
| def test_check_correct_iso_639_3_language(): | ||||
| @@ -291,7 +291,7 @@ def test_check_correct_iso_639_3_language(): | ||||
|  | ||||
|     result = experimental.correct_language(series) | ||||
|  | ||||
|     assert result == language | ||||
|     assert result == None | ||||
|  | ||||
|  | ||||
| def test_check_valid_spdx_license_identifier(): | ||||
| @@ -301,7 +301,7 @@ def test_check_valid_spdx_license_identifier(): | ||||
|  | ||||
|     result = check.spdx_license_identifier(license) | ||||
|  | ||||
|     assert result == license | ||||
|     assert result == None | ||||
|  | ||||
|  | ||||
| def test_check_invalid_spdx_license_identifier(capsys): | ||||
|   | ||||
		Reference in New Issue
	
	Block a user