mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2024-12-22 12:12:18 +01:00
Add check for title in citation
This checks if the item title exists in the citation. If it is not present it could just be missing, or could have minor differences in the whitespace, accents, etc.
This commit is contained in:
parent
999cc65097
commit
3b40a68279
@ -194,6 +194,9 @@ def run(argv):
|
|||||||
# Check: citation DOI
|
# Check: citation DOI
|
||||||
check.citation_doi(df_transposed[column])
|
check.citation_doi(df_transposed[column])
|
||||||
|
|
||||||
|
# Check: title in citation
|
||||||
|
check.title_in_citation(df_transposed[column])
|
||||||
|
|
||||||
if args.experimental_checks:
|
if args.experimental_checks:
|
||||||
experimental.correct_language(df_transposed[column])
|
experimental.correct_language(df_transposed[column])
|
||||||
|
|
||||||
|
@ -410,3 +410,37 @@ def citation_doi(row):
|
|||||||
)
|
)
|
||||||
|
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
def title_in_citation(row):
    """Warn when an item's title does not appear in its citation.

    The title may be missing from the citation entirely, or it may be present
    in a slightly different form (whitespace, accents, etc). The comparison is
    an exact substring test, so either situation triggers the warning.

    The title is taken from the column whose label matches
    ``^(dc|dcterms)\\.title.*$`` and the citation from the column whose label
    contains ``citation`` or ``Citation``. If several labels match, the last
    one with a non-missing value wins.

    Nothing is printed, and no error is raised, when the row has no usable
    title or no usable citation (column absent, value missing, or citation
    empty).

    Args:
        row: a pandas Series holding one item's metadata, indexed by column
            label.

    Returns:
        None. A warning is printed if the title is not found in the citation.
    """
    # Initialize at function scope so the values are always bound after the
    # loop, even when the row has no title or citation column or when those
    # fields were skipped because they are missing.
    title = None
    citation = None

    # Iterate over the labels of the current row's values to find the title
    # and citation fields.
    for label in row.axes[0]:
        value = row[label]

        # Skip fields with missing values
        if pd.isna(value):
            continue

        # Is this the title column?
        if re.match(r"^(dc|dcterms)\.title.*$", label) is not None:
            title = value

        # Is this the citation column?
        if re.match(r"^.*?[cC]itation.*$", label) is not None:
            citation = value

    # Without both a title and a non-empty citation there is nothing to check.
    if title is None or citation is None or citation == "":
        return

    if title not in citation:
        print(f"{Fore.YELLOW}Title is not present in citation: {Fore.RESET}{title}")
|
||||||
|
Loading…
Reference in New Issue
Block a user