mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2025-07-04 05:23:25 +02:00
Compare commits
5 Commits
ed084da08c
...
14010896a5
Author | SHA1 | Date | |
---|---|---|---|
14010896a5
|
|||
ab3af2ec62
|
|||
1aa2084230
|
|||
330a7b7b9c
|
|||
9a5e3fd6ef
|
@ -8,6 +8,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|||||||
### Changed
|
### Changed
|
||||||
- Fixing invalid multi-value separators like `|` and `|||` is no longer
|
- Fixing invalid multi-value separators like `|` and `|||` is no longer
|
||||||
classified as "unsafe" as I have yet to see a case where this was intentional
|
classified as "unsafe" as I have yet to see a case where this was intentional
|
||||||
|
- Not user visible, but now checks only print a warning to the screen instead
|
||||||
|
of returning a value and re-writing the DataFrame, which should be faster and
|
||||||
|
use less memory
|
||||||
|
|
||||||
### Added
|
### Added
|
||||||
- Configurable directory for AGROVOC requests cache (to allow running the web
|
- Configurable directory for AGROVOC requests cache (to allow running the web
|
||||||
|
@ -116,6 +116,10 @@ This currently uses the [Python langid](https://github.com/saffsd/langid.py) lib
|
|||||||
- Warn if item is Open Access, but missing a license
|
- Warn if item is Open Access, but missing a license
|
||||||
- Warn if item has an ISSN but no journal title
|
- Warn if item has an ISSN but no journal title
|
||||||
- Update journal titles from ISSN
|
- Update journal titles from ISSN
|
||||||
|
- Check for duplicates
|
||||||
|
- If I check titles only, then I might miss the case where one is a Report and another is a Presentation
|
||||||
|
- I could just check each item against each other item, but that sounds slow...
|
||||||
|
- Perhaps I could check for the number of unique values in a few columns, like title and doi, and see if it is the same as the total number of items
|
||||||
|
|
||||||
## License
|
## License
|
||||||
This work is licensed under the [GPLv3](https://www.gnu.org/licenses/gpl-3.0.en.html).
|
This work is licensed under the [GPLv3](https://www.gnu.org/licenses/gpl-3.0.en.html).
|
||||||
|
@ -105,7 +105,7 @@ def run(argv):
|
|||||||
df[column] = df[column].apply(fix.unnecessary_unicode)
|
df[column] = df[column].apply(fix.unnecessary_unicode)
|
||||||
|
|
||||||
# Check: suspicious characters
|
# Check: suspicious characters
|
||||||
df[column] = df[column].apply(check.suspicious_characters, field_name=column)
|
df[column].apply(check.suspicious_characters, field_name=column)
|
||||||
|
|
||||||
# Fix: invalid and unnecessary multi-value separators
|
# Fix: invalid and unnecessary multi-value separators
|
||||||
df[column] = df[column].apply(fix.separators, field_name=column)
|
df[column] = df[column].apply(fix.separators, field_name=column)
|
||||||
@ -120,36 +120,36 @@ def run(argv):
|
|||||||
# Identify fields the user wants to validate against AGROVOC
|
# Identify fields the user wants to validate against AGROVOC
|
||||||
for field in args.agrovoc_fields.split(","):
|
for field in args.agrovoc_fields.split(","):
|
||||||
if column == field:
|
if column == field:
|
||||||
df[column] = df[column].apply(check.agrovoc, field_name=column)
|
df[column].apply(check.agrovoc, field_name=column)
|
||||||
|
|
||||||
# Check: invalid language
|
# Check: invalid language
|
||||||
match = re.match(r"^.*?language.*$", column)
|
match = re.match(r"^.*?language.*$", column)
|
||||||
if match is not None:
|
if match is not None:
|
||||||
df[column] = df[column].apply(check.language)
|
df[column].apply(check.language)
|
||||||
|
|
||||||
# Check: invalid ISSN
|
# Check: invalid ISSN
|
||||||
match = re.match(r"^.*?issn.*$", column)
|
match = re.match(r"^.*?issn.*$", column)
|
||||||
if match is not None:
|
if match is not None:
|
||||||
df[column] = df[column].apply(check.issn)
|
df[column].apply(check.issn)
|
||||||
|
|
||||||
# Check: invalid ISBN
|
# Check: invalid ISBN
|
||||||
match = re.match(r"^.*?isbn.*$", column)
|
match = re.match(r"^.*?isbn.*$", column)
|
||||||
if match is not None:
|
if match is not None:
|
||||||
df[column] = df[column].apply(check.isbn)
|
df[column].apply(check.isbn)
|
||||||
|
|
||||||
# Check: invalid date
|
# Check: invalid date
|
||||||
match = re.match(r"^.*?(date|dcterms\.issued).*$", column)
|
match = re.match(r"^.*?(date|dcterms\.issued).*$", column)
|
||||||
if match is not None:
|
if match is not None:
|
||||||
df[column] = df[column].apply(check.date, field_name=column)
|
df[column].apply(check.date, field_name=column)
|
||||||
|
|
||||||
# Check: filename extension
|
# Check: filename extension
|
||||||
if column == "filename":
|
if column == "filename":
|
||||||
df[column] = df[column].apply(check.filename_extension)
|
df[column].apply(check.filename_extension)
|
||||||
|
|
||||||
# Check: SPDX license identifier
|
# Check: SPDX license identifier
|
||||||
match = re.match(r"dcterms\.license.*$", column)
|
match = re.match(r"dcterms\.license.*$", column)
|
||||||
if match is not None:
|
if match is not None:
|
||||||
df[column] = df[column].apply(check.spdx_license_identifier)
|
df[column].apply(check.spdx_license_identifier)
|
||||||
|
|
||||||
##
|
##
|
||||||
# Perform some checks on rows so we can consider items as a whole rather
|
# Perform some checks on rows so we can consider items as a whole rather
|
||||||
|
@ -32,7 +32,7 @@ def issn(field):
|
|||||||
if not stdnum_issn.is_valid(value):
|
if not stdnum_issn.is_valid(value):
|
||||||
print(f"{Fore.RED}Invalid ISSN: {Fore.RESET}{value}")
|
print(f"{Fore.RED}Invalid ISSN: {Fore.RESET}{value}")
|
||||||
|
|
||||||
return field
|
return
|
||||||
|
|
||||||
|
|
||||||
def isbn(field):
|
def isbn(field):
|
||||||
@ -55,7 +55,7 @@ def isbn(field):
|
|||||||
if not stdnum_isbn.is_valid(value):
|
if not stdnum_isbn.is_valid(value):
|
||||||
print(f"{Fore.RED}Invalid ISBN: {Fore.RESET}{value}")
|
print(f"{Fore.RED}Invalid ISBN: {Fore.RESET}{value}")
|
||||||
|
|
||||||
return field
|
return
|
||||||
|
|
||||||
|
|
||||||
def date(field, field_name):
|
def date(field, field_name):
|
||||||
@ -83,13 +83,13 @@ def date(field, field_name):
|
|||||||
f"{Fore.RED}Multiple dates not allowed ({field_name}): {Fore.RESET}{field}"
|
f"{Fore.RED}Multiple dates not allowed ({field_name}): {Fore.RESET}{field}"
|
||||||
)
|
)
|
||||||
|
|
||||||
return field
|
return
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Check if date is valid YYYY format
|
# Check if date is valid YYYY format
|
||||||
datetime.strptime(field, "%Y")
|
datetime.strptime(field, "%Y")
|
||||||
|
|
||||||
return field
|
return
|
||||||
except ValueError:
|
except ValueError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@ -97,7 +97,7 @@ def date(field, field_name):
|
|||||||
# Check if date is valid YYYY-MM format
|
# Check if date is valid YYYY-MM format
|
||||||
datetime.strptime(field, "%Y-%m")
|
datetime.strptime(field, "%Y-%m")
|
||||||
|
|
||||||
return field
|
return
|
||||||
except ValueError:
|
except ValueError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@ -105,7 +105,7 @@ def date(field, field_name):
|
|||||||
# Check if date is valid YYYY-MM-DD format
|
# Check if date is valid YYYY-MM-DD format
|
||||||
datetime.strptime(field, "%Y-%m-%d")
|
datetime.strptime(field, "%Y-%m-%d")
|
||||||
|
|
||||||
return field
|
return
|
||||||
except ValueError:
|
except ValueError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@ -113,11 +113,11 @@ def date(field, field_name):
|
|||||||
# Check if date is valid YYYY-MM-DDTHH:MM:SSZ format
|
# Check if date is valid YYYY-MM-DDTHH:MM:SSZ format
|
||||||
datetime.strptime(field, "%Y-%m-%dT%H:%M:%SZ")
|
datetime.strptime(field, "%Y-%m-%dT%H:%M:%SZ")
|
||||||
|
|
||||||
return field
|
return
|
||||||
except ValueError:
|
except ValueError:
|
||||||
print(f"{Fore.RED}Invalid date ({field_name}): {Fore.RESET}{field}")
|
print(f"{Fore.RED}Invalid date ({field_name}): {Fore.RESET}{field}")
|
||||||
|
|
||||||
return field
|
return
|
||||||
|
|
||||||
|
|
||||||
def suspicious_characters(field, field_name):
|
def suspicious_characters(field, field_name):
|
||||||
@ -151,7 +151,7 @@ def suspicious_characters(field, field_name):
|
|||||||
suspicious_character_msg = f"{Fore.YELLOW}Suspicious character ({field_name}): {Fore.RESET}{field_subset}"
|
suspicious_character_msg = f"{Fore.YELLOW}Suspicious character ({field_name}): {Fore.RESET}{field_subset}"
|
||||||
print(f"{suspicious_character_msg:1.80}")
|
print(f"{suspicious_character_msg:1.80}")
|
||||||
|
|
||||||
return field
|
return
|
||||||
|
|
||||||
|
|
||||||
def language(field):
|
def language(field):
|
||||||
@ -184,7 +184,7 @@ def language(field):
|
|||||||
else:
|
else:
|
||||||
print(f"{Fore.RED}Invalid language: {Fore.RESET}{value}")
|
print(f"{Fore.RED}Invalid language: {Fore.RESET}{value}")
|
||||||
|
|
||||||
return field
|
return
|
||||||
|
|
||||||
|
|
||||||
def agrovoc(field, field_name):
|
def agrovoc(field, field_name):
|
||||||
@ -211,7 +211,9 @@ def agrovoc(field, field_name):
|
|||||||
# running in an environment where we can't write to the current working di-
|
# running in an environment where we can't write to the current working di-
|
||||||
# rectory (for example from csv-metadata-quality-web).
|
# rectory (for example from csv-metadata-quality-web).
|
||||||
REQUESTS_CACHE_DIR = os.environ.get("REQUESTS_CACHE_DIR", ".")
|
REQUESTS_CACHE_DIR = os.environ.get("REQUESTS_CACHE_DIR", ".")
|
||||||
requests_cache.install_cache(f"{REQUESTS_CACHE_DIR}/agrovoc-response-cache", expire_after=expire_after)
|
requests_cache.install_cache(
|
||||||
|
f"{REQUESTS_CACHE_DIR}/agrovoc-response-cache", expire_after=expire_after
|
||||||
|
)
|
||||||
|
|
||||||
# prune old cache entries
|
# prune old cache entries
|
||||||
requests_cache.core.remove_expired_responses()
|
requests_cache.core.remove_expired_responses()
|
||||||
@ -230,7 +232,7 @@ def agrovoc(field, field_name):
|
|||||||
if len(data["results"]) == 0:
|
if len(data["results"]) == 0:
|
||||||
print(f"{Fore.RED}Invalid AGROVOC ({field_name}): {Fore.RESET}{value}")
|
print(f"{Fore.RED}Invalid AGROVOC ({field_name}): {Fore.RESET}{value}")
|
||||||
|
|
||||||
return field
|
return
|
||||||
|
|
||||||
|
|
||||||
def filename_extension(field):
|
def filename_extension(field):
|
||||||
@ -281,7 +283,7 @@ def filename_extension(field):
|
|||||||
if filename_extension_match is False:
|
if filename_extension_match is False:
|
||||||
print(f"{Fore.YELLOW}Filename with uncommon extension: {Fore.RESET}{value}")
|
print(f"{Fore.YELLOW}Filename with uncommon extension: {Fore.RESET}{value}")
|
||||||
|
|
||||||
return field
|
return
|
||||||
|
|
||||||
|
|
||||||
def spdx_license_identifier(field):
|
def spdx_license_identifier(field):
|
||||||
@ -301,4 +303,4 @@ def spdx_license_identifier(field):
|
|||||||
|
|
||||||
pass
|
pass
|
||||||
|
|
||||||
return field
|
return
|
||||||
|
@ -1,5 +1,9 @@
|
|||||||
|
import re
|
||||||
|
|
||||||
|
import langid
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from colorama import Fore
|
from colorama import Fore
|
||||||
|
from pycountry import languages
|
||||||
|
|
||||||
|
|
||||||
def correct_language(row):
|
def correct_language(row):
|
||||||
@ -11,11 +15,6 @@ def correct_language(row):
|
|||||||
language and returns the value in the language field if it does match.
|
language and returns the value in the language field if it does match.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import re
|
|
||||||
|
|
||||||
import langid
|
|
||||||
from pycountry import languages
|
|
||||||
|
|
||||||
# Initialize some variables at global scope so that we can set them in the
|
# Initialize some variables at global scope so that we can set them in the
|
||||||
# loop scope below and still be able to access them afterwards.
|
# loop scope below and still be able to access them afterwards.
|
||||||
language = ""
|
language = ""
|
||||||
@ -94,4 +93,4 @@ def correct_language(row):
|
|||||||
)
|
)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
return language
|
return
|
||||||
|
@ -23,7 +23,7 @@ def test_check_valid_issn():
|
|||||||
|
|
||||||
result = check.issn(value)
|
result = check.issn(value)
|
||||||
|
|
||||||
assert result == value
|
assert result == None
|
||||||
|
|
||||||
|
|
||||||
def test_check_invalid_isbn(capsys):
|
def test_check_invalid_isbn(capsys):
|
||||||
@ -44,7 +44,7 @@ def test_check_valid_isbn():
|
|||||||
|
|
||||||
result = check.isbn(value)
|
result = check.isbn(value)
|
||||||
|
|
||||||
assert result == value
|
assert result == None
|
||||||
|
|
||||||
|
|
||||||
def test_check_missing_date(capsys):
|
def test_check_missing_date(capsys):
|
||||||
@ -100,7 +100,7 @@ def test_check_valid_date():
|
|||||||
|
|
||||||
result = check.date(value, field_name)
|
result = check.date(value, field_name)
|
||||||
|
|
||||||
assert result == value
|
assert result == None
|
||||||
|
|
||||||
|
|
||||||
def test_check_suspicious_characters(capsys):
|
def test_check_suspicious_characters(capsys):
|
||||||
@ -126,7 +126,7 @@ def test_check_valid_iso639_1_language():
|
|||||||
|
|
||||||
result = check.language(value)
|
result = check.language(value)
|
||||||
|
|
||||||
assert result == value
|
assert result == None
|
||||||
|
|
||||||
|
|
||||||
def test_check_valid_iso639_3_language():
|
def test_check_valid_iso639_3_language():
|
||||||
@ -136,7 +136,7 @@ def test_check_valid_iso639_3_language():
|
|||||||
|
|
||||||
result = check.language(value)
|
result = check.language(value)
|
||||||
|
|
||||||
assert result == value
|
assert result == None
|
||||||
|
|
||||||
|
|
||||||
def test_check_invalid_iso639_1_language(capsys):
|
def test_check_invalid_iso639_1_language(capsys):
|
||||||
@ -199,7 +199,7 @@ def test_check_valid_agrovoc():
|
|||||||
|
|
||||||
result = check.agrovoc(value, field_name)
|
result = check.agrovoc(value, field_name)
|
||||||
|
|
||||||
assert result == value
|
assert result == None
|
||||||
|
|
||||||
|
|
||||||
def test_check_uncommon_filename_extension(capsys):
|
def test_check_uncommon_filename_extension(capsys):
|
||||||
@ -223,7 +223,7 @@ def test_check_common_filename_extension():
|
|||||||
|
|
||||||
result = check.filename_extension(value)
|
result = check.filename_extension(value)
|
||||||
|
|
||||||
assert result == value
|
assert result == None
|
||||||
|
|
||||||
|
|
||||||
def test_check_incorrect_iso_639_1_language(capsys):
|
def test_check_incorrect_iso_639_1_language(capsys):
|
||||||
@ -276,7 +276,7 @@ def test_check_correct_iso_639_1_language():
|
|||||||
|
|
||||||
result = experimental.correct_language(series)
|
result = experimental.correct_language(series)
|
||||||
|
|
||||||
assert result == language
|
assert result == None
|
||||||
|
|
||||||
|
|
||||||
def test_check_correct_iso_639_3_language():
|
def test_check_correct_iso_639_3_language():
|
||||||
@ -291,7 +291,7 @@ def test_check_correct_iso_639_3_language():
|
|||||||
|
|
||||||
result = experimental.correct_language(series)
|
result = experimental.correct_language(series)
|
||||||
|
|
||||||
assert result == language
|
assert result == None
|
||||||
|
|
||||||
|
|
||||||
def test_check_valid_spdx_license_identifier():
|
def test_check_valid_spdx_license_identifier():
|
||||||
@ -301,7 +301,7 @@ def test_check_valid_spdx_license_identifier():
|
|||||||
|
|
||||||
result = check.spdx_license_identifier(license)
|
result = check.spdx_license_identifier(license)
|
||||||
|
|
||||||
assert result == license
|
assert result == None
|
||||||
|
|
||||||
|
|
||||||
def test_check_invalid_spdx_license_identifier(capsys):
|
def test_check_invalid_spdx_license_identifier(capsys):
|
||||||
|
Reference in New Issue
Block a user