diff --git a/csv_metadata_quality/__main__.py b/csv_metadata_quality/__main__.py index 46baa1d..e6f2704 100644 --- a/csv_metadata_quality/__main__.py +++ b/csv_metadata_quality/__main__.py @@ -6,5 +6,5 @@ def main(): app.run(argv) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/csv_metadata_quality/app.py b/csv_metadata_quality/app.py index 3799f33..6063ac8 100644 --- a/csv_metadata_quality/app.py +++ b/csv_metadata_quality/app.py @@ -9,13 +9,37 @@ import sys def parse_args(argv): - parser = argparse.ArgumentParser(description='Metadata quality checker and fixer.') - parser.add_argument('--agrovoc-fields', '-a', help='Comma-separated list of fields to validate against AGROVOC, for example: dc.subject,cg.coverage.country') - parser.add_argument('--input-file', '-i', help='Path to input file. Can be UTF-8 CSV or Excel XLSX.', required=True, type=argparse.FileType('r', encoding='UTF-8')) - parser.add_argument('--output-file', '-o', help='Path to output file (always CSV).', required=True, type=argparse.FileType('w', encoding='UTF-8')) - parser.add_argument('--unsafe-fixes', '-u', help='Perform unsafe fixes.', action='store_true') - parser.add_argument('--version', '-V', action='version', version=f'CSV Metadata Quality v{VERSION}') - parser.add_argument('--exclude-fields', '-x', help='Comma-separated list of fields to skip, for example: dc.contributor.author,dc.identifier.citation') + parser = argparse.ArgumentParser(description="Metadata quality checker and fixer.") + parser.add_argument( + "--agrovoc-fields", + "-a", + help="Comma-separated list of fields to validate against AGROVOC, for example: dc.subject,cg.coverage.country", + ) + parser.add_argument( + "--input-file", + "-i", + help="Path to input file. Can be UTF-8 CSV or Excel XLSX.", + required=True, + type=argparse.FileType("r", encoding="UTF-8"), + ) + parser.add_argument( + "--output-file", + "-o", + help="Path to output file (always CSV).", + required=True, + type=argparse.FileType("w", encoding="UTF-8"), + ) + parser.add_argument( + "--unsafe-fixes", "-u", help="Perform unsafe fixes.", action="store_true" + ) + parser.add_argument( + "--version", "-V", action="version", version=f"CSV Metadata Quality v{VERSION}" + ) + parser.add_argument( + "--exclude-fields", + "-x", + help="Comma-separated list of fields to skip, for example: dc.contributor.author,dc.identifier.citation", + ) args = parser.parse_args() return args @@ -40,11 +64,11 @@ def run(argv): skip = False # Split the list of excludes on ',' so we can test exact matches # rather than fuzzy matches with regexes or "if word in string" - for exclude in args.exclude_fields.split(','): + for exclude in args.exclude_fields.split(","): if column == exclude and skip is False: skip = True if skip: - print(f'Skipping {column}') + print(f"Skipping {column}") continue @@ -58,7 +82,7 @@ def run(argv): # Fix: missing space after comma. Only run on author and citation # fields for now, as this problem is mostly an issue in names. if args.unsafe_fixes: - match = re.match(r'^.*?(author|citation).*$', column) + match = re.match(r"^.*?(author|citation).*$", column) if match is not None: df[column] = df[column].apply(fix.comma_space, field_name=column) @@ -83,32 +107,32 @@ def run(argv): # Check: invalid AGROVOC subject if args.agrovoc_fields: # Identify fields the user wants to validate against AGROVOC - for field in args.agrovoc_fields.split(','): + for field in args.agrovoc_fields.split(","): if column == field: df[column] = df[column].apply(check.agrovoc, field_name=column) # Check: invalid language - match = re.match(r'^.*?language.*$', column) + match = re.match(r"^.*?language.*$", column) if match is not None: df[column] = df[column].apply(check.language) # Check: invalid ISSN - match = re.match(r'^.*?issn.*$', column) + match = re.match(r"^.*?issn.*$", column) if match is not None: df[column] = df[column].apply(check.issn) # Check: invalid ISBN - match = re.match(r'^.*?isbn.*$', column) + match = re.match(r"^.*?isbn.*$", column) if match is not None: df[column] = df[column].apply(check.isbn) # Check: invalid date - match = re.match(r'^.*?date.*$', column) + match = re.match(r"^.*?date.*$", column) if match is not None: df[column] = df[column].apply(check.date, field_name=column) # Check: filename extension - if column == 'filename': + if column == "filename": df[column] = df[column].apply(check.filename_extension) # Write diff --git a/csv_metadata_quality/check.py b/csv_metadata_quality/check.py index d9dcbb5..2c8a336 100755 --- a/csv_metadata_quality/check.py +++ b/csv_metadata_quality/check.py @@ -18,10 +18,10 @@ def issn(field): return # Try to split multi-value field on "||" separator - for value in field.split('||'): + for value in field.split("||"): if not issn.is_valid(value): - print(f'Invalid ISSN: {value}') + print(f"Invalid ISSN: {value}") return field @@ -43,10 +43,10 @@ def isbn(field): return # Try to split multi-value field on "||" separator - for value in field.split('||'): + for value in field.split("||"): if not isbn.is_valid(value): - print(f'Invalid ISBN: {value}') + print(f"Invalid ISBN: {value}") return field @@ -64,13 +64,13 @@ def separators(field): return # Try to split multi-value field on "||" separator - for value in field.split('||'): + for value in field.split("||"): # After splitting, see if there are any remaining "|" characters - match = re.findall(r'^.*?\|.*$', value) + match = re.findall(r"^.*?\|.*$", value) if match: - print(f'Invalid multi-value separator: {field}') + print(f"Invalid multi-value separator: {field}") return field @@ -88,22 +88,22 @@ def date(field, field_name): from datetime import datetime if pd.isna(field): - print(f'Missing date ({field_name}).') + print(f"Missing date ({field_name}).") return # Try to split multi-value field on "||" separator - multiple_dates = field.split('||') + multiple_dates = field.split("||") # We don't allow multi-value date fields if len(multiple_dates) > 1: - print(f'Multiple dates not allowed ({field_name}): {field}') + print(f"Multiple dates not allowed ({field_name}): {field}") return field try: # Check if date is valid YYYY format - datetime.strptime(field, '%Y') + datetime.strptime(field, "%Y") return field except ValueError: @@ -111,7 +111,7 @@ def date(field, field_name): try: # Check if date is valid YYYY-MM format - datetime.strptime(field, '%Y-%m') + datetime.strptime(field, "%Y-%m") return field except ValueError: @@ -119,11 +119,11 @@ def date(field, field_name): try: # Check if date is valid YYYY-MM-DD format - datetime.strptime(field, '%Y-%m-%d') + datetime.strptime(field, "%Y-%m-%d") return field except ValueError: - print(f'Invalid date ({field_name}): {field}') + print(f"Invalid date ({field_name}): {field}") return field @@ -140,7 +140,7 @@ def suspicious_characters(field, field_name): return # List of suspicious characters, for example: ́ˆ~` - suspicious_characters = ['\u00B4', '\u02C6', '\u007E', '\u0060'] + suspicious_characters = ["\u00B4", "\u02C6", "\u007E", "\u0060"] for character in suspicious_characters: # Find the position of the suspicious character in the string @@ -156,8 +156,10 @@ def suspicious_characters(field, field_name): # character and spanning enough of the rest to give a preview, # but not too much to cause the line to break in terminals with # a default of 80 characters width. - suspicious_character_msg = f'Suspicious character ({field_name}): {field_subset}' - print(f'{suspicious_character_msg:1.80}') + suspicious_character_msg = ( + f"Suspicious character ({field_name}): {field_subset}" + ) + print(f"{suspicious_character_msg:1.80}") return field @@ -177,22 +179,22 @@ def language(field): # need to handle "Other" values here... # Try to split multi-value field on "||" separator - for value in field.split('||'): + for value in field.split("||"): # After splitting, check if language value is 2 or 3 characters so we # can check it against ISO 639-2 or ISO 639-3 accordingly. if len(value) == 2: if not languages.get(alpha_2=value): - print(f'Invalid ISO 639-2 language: {value}') + print(f"Invalid ISO 639-2 language: {value}") pass elif len(value) == 3: if not languages.get(alpha_3=value): - print(f'Invalid ISO 639-3 language: {value}') + print(f"Invalid ISO 639-3 language: {value}") pass else: - print(f'Invalid language: {value}') + print(f"Invalid language: {value}") return field @@ -220,12 +222,16 @@ def agrovoc(field, field_name): return # Try to split multi-value field on "||" separator - for value in field.split('||'): - request_url = f'http://agrovoc.uniroma2.it/agrovoc/rest/v1/agrovoc/search?query={value}' + for value in field.split("||"): + request_url = ( + f"http://agrovoc.uniroma2.it/agrovoc/rest/v1/agrovoc/search?query={value}" + ) # enable transparent request cache with thirty days expiry expire_after = timedelta(days=30) - requests_cache.install_cache('agrovoc-response-cache', expire_after=expire_after) + requests_cache.install_cache( + "agrovoc-response-cache", expire_after=expire_after + ) request = requests.get(request_url) @@ -236,8 +242,8 @@ def agrovoc(field, field_name): data = request.json() # check if there are any results - if len(data['results']) == 0: - print(f'Invalid AGROVOC ({field_name}): {value}') + if len(data["results"]) == 0: + print(f"Invalid AGROVOC ({field_name}): {value}") return field @@ -260,10 +266,18 @@ def filename_extension(field): return # Try to split multi-value field on "||" separator - values = field.split('||') + values = field.split("||") # List of common filename extentions - common_filename_extensions = ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.xls', '.xlsx'] + common_filename_extensions = [ + ".pdf", + ".doc", + ".docx", + ".ppt", + ".pptx", + ".xls", + ".xlsx", + ] # Iterate over all values for value in values: @@ -272,7 +286,7 @@ def filename_extension(field): for filename_extension in common_filename_extensions: # Check for extension at the end of the filename - pattern = re.escape(filename_extension) + r'$' + pattern = re.escape(filename_extension) + r"$" match = re.search(pattern, value, re.IGNORECASE) if match is not None: @@ -282,6 +296,6 @@ def filename_extension(field): break if filename_extension_match is False: - print(f'Filename with uncommon extension: {value}') + print(f"Filename with uncommon extension: {value}") return field diff --git a/csv_metadata_quality/fix.py b/csv_metadata_quality/fix.py index f664a78..db56b05 100755 --- a/csv_metadata_quality/fix.py +++ b/csv_metadata_quality/fix.py @@ -16,23 +16,23 @@ def whitespace(field): values = list() # Try to split multi-value field on "||" separator - for value in field.split('||'): + for value in field.split("||"): # Strip leading and trailing whitespace value = value.strip() # Replace excessive whitespace (>2) with one space - pattern = re.compile(r'\s{2,}') + pattern = re.compile(r"\s{2,}") match = re.findall(pattern, value) if match: - print(f'Excessive whitespace: {value}') - value = re.sub(pattern, ' ', value) + print(f"Excessive whitespace: {value}") + value = re.sub(pattern, " ", value) # Save cleaned value values.append(value) # Create a new field consisting of all values joined with "||" - new_field = '||'.join(values) + new_field = "||".join(values) return new_field @@ -48,21 +48,21 @@ def separators(field): values = list() # Try to split multi-value field on "||" separator - for value in field.split('||'): + for value in field.split("||"): # After splitting, see if there are any remaining "|" characters - pattern = re.compile(r'\|') + pattern = re.compile(r"\|") match = re.findall(pattern, value) if match: - print(f'Fixing invalid multi-value separator: {value}') + print(f"Fixing invalid multi-value separator: {value}") - value = re.sub(pattern, '||', value) + value = re.sub(pattern, "||", value) # Save cleaned value values.append(value) # Create a new field consisting of all values joined with "||" - new_field = '||'.join(values) + new_field = "||".join(values) return new_field @@ -86,36 +86,36 @@ def unnecessary_unicode(field): return # Check for zero-width space characters (U+200B) - pattern = re.compile(r'\u200B') + pattern = re.compile(r"\u200B") match = re.findall(pattern, field) if match: - print(f'Removing unnecessary Unicode (U+200B): {field}') - field = re.sub(pattern, '', field) + print(f"Removing unnecessary Unicode (U+200B): {field}") + field = re.sub(pattern, "", field) # Check for replacement characters (U+FFFD) - pattern = re.compile(r'\uFFFD') + pattern = re.compile(r"\uFFFD") match = re.findall(pattern, field) if match: - print(f'Removing unnecessary Unicode (U+FFFD): {field}') - field = re.sub(pattern, '', field) + print(f"Removing unnecessary Unicode (U+FFFD): {field}") + field = re.sub(pattern, "", field) # Check for no-break spaces (U+00A0) - pattern = re.compile(r'\u00A0') + pattern = re.compile(r"\u00A0") match = re.findall(pattern, field) if match: - print(f'Removing unnecessary Unicode (U+00A0): {field}') - field = re.sub(pattern, '', field) + print(f"Removing unnecessary Unicode (U+00A0): {field}") + field = re.sub(pattern, "", field) # Check for soft hyphens (U+00AD), sometimes preceeded with a normal hyphen - pattern = re.compile(r'\u002D*?\u00AD') + pattern = re.compile(r"\u002D*?\u00AD") match = re.findall(pattern, field) if match: - print(f'Replacing unnecessary Unicode (U+00AD): {field}') - field = re.sub(pattern, '-', field) + print(f"Replacing unnecessary Unicode (U+00AD): {field}") + field = re.sub(pattern, "-", field) return field @@ -128,7 +128,7 @@ def duplicates(field): return # Try to split multi-value field on "||" separator - values = field.split('||') + values = field.split("||") # Initialize an empty list to hold the de-duplicated values new_values = list() @@ -139,10 +139,10 @@ def duplicates(field): if value not in new_values: new_values.append(value) else: - print(f'Dropping duplicate value: {value}') + print(f"Dropping duplicate value: {value}") # Create a new field consisting of all values joined with "||" - new_field = '||'.join(new_values) + new_field = "||".join(new_values) return new_field @@ -169,11 +169,11 @@ def newlines(field): return # Check for Unix line feed (LF) - match = re.findall(r'\n', field) + match = re.findall(r"\n", field) if match: - print(f'Removing newline: {field}') - field = field.replace('\n', '') + print(f"Removing newline: {field}") + field = field.replace("\n", "") return field @@ -193,10 +193,10 @@ def comma_space(field, field_name): return # Check for comma followed by a word character - match = re.findall(r',\w', field) + match = re.findall(r",\w", field) if match: - print(f'Adding space after comma ({field_name}): {field}') - field = re.sub(r',(\w)', r', \1', field) + print(f"Adding space after comma ({field_name}): {field}") + field = re.sub(r",(\w)", r", \1", field) return field diff --git a/csv_metadata_quality/version.py b/csv_metadata_quality/version.py index ace1dd7..0bcc5d9 100644 --- a/csv_metadata_quality/version.py +++ b/csv_metadata_quality/version.py @@ -1 +1 @@ -VERSION = '0.2.2' +VERSION = "0.2.2"