2019-08-01 23:09:08 +02:00
from csv_metadata_quality . version import VERSION
2019-07-28 19:31:57 +02:00
import argparse
2019-07-26 22:14:10 +02:00
import csv_metadata_quality . check as check
2019-07-26 21:11:10 +02:00
import csv_metadata_quality . fix as fix
import pandas as pd
2019-07-28 15:11:36 +02:00
import re
2019-08-03 20:11:57 +02:00
import signal
import sys
2019-07-26 21:11:10 +02:00
2019-07-28 19:31:57 +02:00
def parse_args(argv):
    """Define and parse the command-line options for the quality checker.

    NOTE(review): *argv* is accepted but not forwarded to
    parser.parse_args(), so options are always read from sys.argv;
    preserved as-is to avoid changing caller-visible behavior.
    """
    parser = argparse.ArgumentParser(
        description='Metadata quality checker and fixer.'
    )
    parser.add_argument(
        '--agrovoc-fields', '-a',
        help='Comma-separated list of fields to validate against AGROVOC, for example: dc.subject,cg.coverage.country',
    )
    parser.add_argument(
        '--input-file', '-i',
        help='Path to input file. Can be UTF-8 CSV or Excel XLSX.',
        required=True,
        type=argparse.FileType('r', encoding='UTF-8'),
    )
    parser.add_argument(
        '--output-file', '-o',
        help='Path to output file (always CSV).',
        required=True,
        type=argparse.FileType('w', encoding='UTF-8'),
    )
    parser.add_argument(
        '--unsafe-fixes', '-u',
        help='Perform unsafe fixes.',
        action='store_true',
    )
    parser.add_argument(
        '--version', '-V',
        action='version',
        version=f'CSV Metadata Quality v{VERSION}',
    )
    parser.add_argument(
        '--exclude-fields', '-x',
        help='Comma-separated list of fields to skip, for example: dc.contributor.author,dc.identifier.citation',
    )
    return parser.parse_args()
2019-08-03 20:11:57 +02:00
def signal_handler(signum, frame):
    """Exit with status 1 when SIGINT (^C) is received.

    The two parameters follow the (signum, frame) signature required by
    signal.signal(); the first was renamed from ``signal`` so it no
    longer shadows the imported signal module. Both are unused — the
    runtime passes them positionally, so the rename is safe for callers.
    """
    sys.exit(1)
2019-07-31 16:34:36 +02:00
def run(argv):
    """Apply all fixes and checks to every column of the input file, then
    write the result as CSV and exit.

    Parameters
    ----------
    argv : list
        Command-line arguments, forwarded to parse_args().

    Side effects: reads args.input_file, writes args.output_file, prints
    check/skip messages, and terminates the process with sys.exit(0).
    """
    args = parse_args(argv)

    # Set the signal handler for SIGINT (^C) so long-running checks
    # (e.g. AGROVOC network lookups) can be interrupted cleanly.
    signal.signal(signal.SIGINT, signal_handler)

    # Read all fields as strings so dates don't get converted from 1998 to 1998.0
    # NOTE(review): only CSV is parsed here even though the --input-file help
    # text mentions Excel XLSX — confirm whether XLSX support is intended.
    df = pd.read_csv(args.input_file, dtype=str)

    # Build the set of fields the user asked to skip once, up front, so the
    # per-column test is an O(1) exact-match lookup instead of a flag loop.
    exclude_fields = set(args.exclude_fields.split(',')) if args.exclude_fields else set()

    # Fields to validate against AGROVOC, if any were requested.
    agrovoc_fields = args.agrovoc_fields.split(',') if args.agrovoc_fields else []

    for column in df.columns.values.tolist():
        # Check if the user requested to skip this field (exact match,
        # not fuzzy matching with regexes or "if word in string").
        if column in exclude_fields:
            print(f'Skipping {column}')
            continue

        # Fix: whitespace
        df[column] = df[column].apply(fix.whitespace)

        # Fix: newlines (unsafe — only with --unsafe-fixes)
        if args.unsafe_fixes:
            df[column] = df[column].apply(fix.newlines)

        # Fix: missing space after comma. Only run on author and citation
        # fields for now, as this problem is mostly an issue in names.
        if args.unsafe_fixes and re.match(r'^.*?(author|citation).*$', column):
            df[column] = df[column].apply(fix.comma_space, field_name=column)

        # Fix: unnecessary Unicode
        df[column] = df[column].apply(fix.unnecessary_unicode)

        # Check: invalid multi-value separator
        df[column] = df[column].apply(check.separators)

        # Check: suspicious characters
        df[column] = df[column].apply(check.suspicious_characters, field_name=column)

        # Fix: invalid multi-value separator (unsafe — only with --unsafe-fixes)
        if args.unsafe_fixes:
            df[column] = df[column].apply(fix.separators)
            # Run whitespace fix again after fixing invalid separators
            df[column] = df[column].apply(fix.whitespace)

        # Fix: duplicate metadata values
        df[column] = df[column].apply(fix.duplicates)

        # Check: invalid AGROVOC subject, only on fields the user listed
        if column in agrovoc_fields:
            df[column] = df[column].apply(check.agrovoc, field_name=column)

        # Check: invalid language
        if re.match(r'^.*?language.*$', column):
            df[column] = df[column].apply(check.language)

        # Check: invalid ISSN
        if re.match(r'^.*?issn.*$', column):
            df[column] = df[column].apply(check.issn)

        # Check: invalid ISBN
        if re.match(r'^.*?isbn.*$', column):
            df[column] = df[column].apply(check.isbn)

        # Check: invalid date
        if re.match(r'^.*?date.*$', column):
            df[column] = df[column].apply(check.date, field_name=column)

        # Check: filename extension
        if column == 'filename':
            df[column] = df[column].apply(check.filename_extension)

    # Write the cleaned frame (output is always CSV regardless of input).
    df.to_csv(args.output_file, index=False)

    # Close the input and output files before exiting
    args.input_file.close()
    args.output_file.close()

    sys.exit(0)