mirror of
https://github.com/ilri/csv-metadata-quality.git
synced 2024-11-18 03:57:03 +01:00
Alan Orth
898bb412c3
This detects whether text has likely been encoded in one encoding and decoded in another, perhaps multiple times. This often results in display of "mojibake" characters. For example, a file encoded in UTF-8 is opened as CP-1252 (Windows Latin codepage) in Microsoft Excel, and saved again as UTF-8. You will see strings like this in the resulting file: - CIAT Publicaçao - CIAT Publicación The correct version of these in UTF-8 would be: - CIAT Publicaçao - CIAT Publicación I use a code snippet from Martijn Pieters on StackOverflow to de- tect whether a string is "weird" as determined by the excellent "fixes text for you" (ftfy) Python library, then check if a weird string encodes as CP-1252 or not. If so, I can try to fix it. See: https://stackoverflow.com/questions/29071995/identify-garbage-unicode-string-using-python
199 lines
6.6 KiB
Python
199 lines
6.6 KiB
Python
import argparse
|
|
import re
|
|
import signal
|
|
import sys
|
|
|
|
import pandas as pd
|
|
from colorama import Fore
|
|
|
|
import csv_metadata_quality.check as check
|
|
import csv_metadata_quality.experimental as experimental
|
|
import csv_metadata_quality.fix as fix
|
|
from csv_metadata_quality.version import VERSION
|
|
|
|
|
|
def parse_args(argv):
|
|
parser = argparse.ArgumentParser(description="Metadata quality checker and fixer.")
|
|
parser.add_argument(
|
|
"--agrovoc-fields",
|
|
"-a",
|
|
help="Comma-separated list of fields to validate against AGROVOC, for example: dcterms.subject,cg.coverage.country",
|
|
)
|
|
parser.add_argument(
|
|
"--experimental-checks",
|
|
"-e",
|
|
help="Enable experimental checks like language detection",
|
|
action="store_true",
|
|
)
|
|
parser.add_argument(
|
|
"--input-file",
|
|
"-i",
|
|
help="Path to input file. Can be UTF-8 CSV or Excel XLSX.",
|
|
required=True,
|
|
type=argparse.FileType("r", encoding="UTF-8"),
|
|
)
|
|
parser.add_argument(
|
|
"--output-file",
|
|
"-o",
|
|
help="Path to output file (always CSV).",
|
|
required=True,
|
|
type=argparse.FileType("w", encoding="UTF-8"),
|
|
)
|
|
parser.add_argument(
|
|
"--unsafe-fixes", "-u", help="Perform unsafe fixes.", action="store_true"
|
|
)
|
|
parser.add_argument(
|
|
"--version", "-V", action="version", version=f"CSV Metadata Quality v{VERSION}"
|
|
)
|
|
parser.add_argument(
|
|
"--exclude-fields",
|
|
"-x",
|
|
help="Comma-separated list of fields to skip, for example: dc.contributor.author,dcterms.bibliographicCitation",
|
|
)
|
|
args = parser.parse_args()
|
|
|
|
return args
|
|
|
|
|
|
def signal_handler(signal, frame):
|
|
sys.exit(1)
|
|
|
|
|
|
def run(argv):
|
|
args = parse_args(argv)
|
|
|
|
# set the signal handler for SIGINT (^C)
|
|
signal.signal(signal.SIGINT, signal_handler)
|
|
|
|
# Read all fields as strings so dates don't get converted from 1998 to 1998.0
|
|
df = pd.read_csv(args.input_file, dtype=str)
|
|
|
|
for column in df.columns:
|
|
# Check if the user requested to skip any fields
|
|
if args.exclude_fields:
|
|
skip = False
|
|
# Split the list of excludes on ',' so we can test exact matches
|
|
# rather than fuzzy matches with regexes or "if word in string"
|
|
for exclude in args.exclude_fields.split(","):
|
|
if column == exclude and skip is False:
|
|
skip = True
|
|
if skip:
|
|
print(f"{Fore.YELLOW}Skipping {Fore.RESET}{column}")
|
|
|
|
continue
|
|
|
|
# Fix: whitespace
|
|
df[column] = df[column].apply(fix.whitespace, field_name=column)
|
|
|
|
# Fix: newlines
|
|
if args.unsafe_fixes:
|
|
df[column] = df[column].apply(fix.newlines)
|
|
|
|
# Fix: missing space after comma. Only run on author and citation
|
|
# fields for now, as this problem is mostly an issue in names.
|
|
if args.unsafe_fixes:
|
|
match = re.match(r"^.*?(author|citation).*$", column)
|
|
if match is not None:
|
|
df[column] = df[column].apply(fix.comma_space, field_name=column)
|
|
|
|
# Fix: perform Unicode normalization (NFC) to convert decomposed
|
|
# characters into their canonical forms.
|
|
if args.unsafe_fixes:
|
|
df[column] = df[column].apply(fix.normalize_unicode, field_name=column)
|
|
|
|
# Fix: unnecessary Unicode
|
|
df[column] = df[column].apply(fix.unnecessary_unicode)
|
|
|
|
# Check: suspicious characters
|
|
df[column].apply(check.suspicious_characters, field_name=column)
|
|
|
|
# Check: mojibake
|
|
df[column].apply(check.mojibake, field_name=column)
|
|
|
|
# Fix: mojibake
|
|
if args.unsafe_fixes:
|
|
df[column] = df[column].apply(fix.mojibake, field_name=column)
|
|
|
|
# Fix: invalid and unnecessary multi-value separators
|
|
df[column] = df[column].apply(fix.separators, field_name=column)
|
|
# Run whitespace fix again after fixing invalid separators
|
|
df[column] = df[column].apply(fix.whitespace, field_name=column)
|
|
|
|
# Fix: duplicate metadata values
|
|
df[column] = df[column].apply(fix.duplicates, field_name=column)
|
|
|
|
# Check: invalid AGROVOC subject
|
|
if args.agrovoc_fields:
|
|
# Identify fields the user wants to validate against AGROVOC
|
|
for field in args.agrovoc_fields.split(","):
|
|
if column == field:
|
|
df[column].apply(check.agrovoc, field_name=column)
|
|
|
|
# Check: invalid language
|
|
match = re.match(r"^.*?language.*$", column)
|
|
if match is not None:
|
|
df[column].apply(check.language)
|
|
|
|
# Check: invalid ISSN
|
|
match = re.match(r"^.*?issn.*$", column)
|
|
if match is not None:
|
|
df[column].apply(check.issn)
|
|
|
|
# Check: invalid ISBN
|
|
match = re.match(r"^.*?isbn.*$", column)
|
|
if match is not None:
|
|
df[column].apply(check.isbn)
|
|
|
|
# Check: invalid date
|
|
match = re.match(r"^.*?(date|dcterms\.issued).*$", column)
|
|
if match is not None:
|
|
df[column].apply(check.date, field_name=column)
|
|
|
|
# Check: filename extension
|
|
if column == "filename":
|
|
df[column].apply(check.filename_extension)
|
|
|
|
# Check: SPDX license identifier
|
|
match = re.match(r"dcterms\.license.*$", column)
|
|
if match is not None:
|
|
df[column].apply(check.spdx_license_identifier)
|
|
|
|
### End individual column checks ###
|
|
|
|
# Check: duplicate items
|
|
# We extract just the title, type, and date issued columns to analyze
|
|
duplicates_df = df.filter(
|
|
regex=r"dcterms\.title|dc\.title|dcterms\.type|dc\.type|dcterms\.issued|dc\.date\.issued"
|
|
)
|
|
check.duplicate_items(duplicates_df)
|
|
|
|
# Delete the temporary duplicates DataFrame
|
|
del duplicates_df
|
|
|
|
##
|
|
# Perform some checks on rows so we can consider items as a whole rather
|
|
# than simple on a field-by-field basis. This allows us to check whether
|
|
# the language used in the title and abstract matches the language indi-
|
|
# cated in the language field, for example.
|
|
#
|
|
# This is slower and apparently frowned upon in the Pandas community be-
|
|
# cause it requires iterating over rows rather than using apply over a
|
|
# column. For now it will have to do.
|
|
##
|
|
|
|
if args.experimental_checks:
|
|
# Transpose the DataFrame so we can consider each row as a column
|
|
df_transposed = df.T
|
|
|
|
for column in df_transposed.columns:
|
|
experimental.correct_language(df_transposed[column])
|
|
|
|
# Write
|
|
df.to_csv(args.output_file, index=False)
|
|
|
|
# Close the input and output files before exiting
|
|
args.input_file.close()
|
|
args.output_file.close()
|
|
|
|
sys.exit(0)
|