1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2025-01-24 19:23:22 +01:00

Add check for title in citation

This checks if the item title exists in the citation. If it is not
present it could just be missing, or could have minor differences
in the whitespace, accents, etc.
This commit is contained in:
Alan Orth 2021-12-05 15:52:42 +02:00
parent 999cc65097
commit 3b40a68279
Signed by: alanorth
GPG Key ID: 0FB860CC9C45B1B9
2 changed files with 37 additions and 0 deletions

View File

@ -194,6 +194,9 @@ def run(argv):
# Check: citation DOI
check.citation_doi(df_transposed[column])
# Check: title in citation
check.title_in_citation(df_transposed[column])
if args.experimental_checks:
experimental.correct_language(df_transposed[column])

View File

@ -410,3 +410,37 @@ def citation_doi(row):
)
return
def title_in_citation(row):
    """Warn when an item's title does not appear verbatim in its citation.

    A title that is absent from the citation may be missing entirely, or may
    only differ in formatting (whitespace, accents, etc), so this is reported
    as a warning rather than treated as an error.

    The title is taken from the column whose label matches
    ``^(dc|dcterms)\\.title.*$`` and the citation from the column whose label
    contains ``citation`` or ``Citation``. If several columns match, the last
    one with a non-missing value wins.

    Args:
        row: a pandas Series for one item, indexed by column label.

    Returns:
        None. A warning is printed if the title is not a substring of the
        citation. Nothing is printed when the row has no usable title or no
        usable (non-missing, non-empty) citation.
    """
    # Start with "not found" so that rows lacking a title or citation column
    # (or holding NaN there) are skipped instead of raising UnboundLocalError.
    title = None
    citation = None

    # Iterate over the labels of the current row's values to find the title
    # and the citation.
    for label in row.axes[0]:
        value = row[label]

        # Skip fields with missing values
        if pd.isna(value):
            continue

        # NOTE(review): this pattern also matches qualified title fields such
        # as an alternative title -- confirm whether that is intended.
        if re.match(r"^(dc|dcterms)\.title.*$", label) is not None:
            title = value

        # Deliberately not `elif`: a single label may match both patterns.
        if re.match(r"^.*?[cC]itation.*$", label) is not None:
            citation = value

    # Nothing to compare unless we have both a title and a non-empty citation.
    if title is None or citation is None or citation == "":
        return

    if title not in citation:
        print(
            f"{Fore.YELLOW}Title is not present in citation: {Fore.RESET}{title}"
        )

    return