2019-07-28 19:31:57 +02:00
import argparse
2019-07-26 22:14:10 +02:00
import csv_metadata_quality . check as check
2019-07-26 21:11:10 +02:00
import csv_metadata_quality . fix as fix
import pandas as pd
2019-07-28 15:11:36 +02:00
import re
2019-07-26 21:11:10 +02:00
2019-07-28 19:31:57 +02:00
def parse_args(argv):
    """Build the command-line parser and return the parsed arguments.

    Args:
        argv: Argument vector handed over by the caller.

    Returns:
        argparse.Namespace with three attributes:
            input_file: text handle opened for reading as UTF-8.
            output_file: text handle opened for writing as UTF-8.
            unsafe_fixes: True when --unsafe-fixes / -u was given.

    Both files are opened by argparse.FileType while parsing, so the
    returned handles are already open; closing them is up to the caller.
    """
    parser = argparse.ArgumentParser(
        description='Metadata quality checker and fixer.'
    )
    # Option strings must start with '-'; argparse rejects anything else
    # when the option is registered.
    parser.add_argument(
        '--input-file',
        '-i',
        help='Path to input file. Can be UTF-8 CSV or Excel XLSX.',
        required=True,
        type=argparse.FileType('r', encoding='UTF-8'),
    )
    parser.add_argument(
        '--output-file',
        '-o',
        help='Path to output file (always CSV).',
        required=True,
        type=argparse.FileType('w', encoding='UTF-8'),
    )
    parser.add_argument(
        '--unsafe-fixes',
        '-u',
        help='Perform unsafe fixes.',
        action='store_true',
    )

    # NOTE(review): argv is accepted but not forwarded, so the parser reads
    # sys.argv[1:]. Left as-is because forwarding argv would break a caller
    # that passes the full sys.argv (program name included) -- confirm the
    # calling convention before changing this.
    return parser.parse_args()
def main(argv):
    """Run every fix and check over each column of the input CSV.

    The input is read from the handle produced by parse_args, each column
    is passed through the fix.* and check.* functions in a fixed order, and
    the resulting frame is written as CSV (without the index) to the output
    handle.

    Args:
        argv: Argument vector, forwarded to parse_args.
    """
    import sys  # local import: only needed to recognise the standard streams

    args = parse_args(argv)

    # Checks that only apply to columns whose name contains a keyword.
    # Matching is case-sensitive. Compiled once instead of once per column.
    name_checks = (
        (re.compile(r'^.*?issn.*$'), check.issn),  # Check: invalid ISSN
        (re.compile(r'^.*?isbn.*$'), check.isbn),  # Check: invalid ISBN
        (re.compile(r'^.*?date.*$'), check.date),  # Check: invalid date
    )

    try:
        # Read all fields as strings so dates don't get converted from 1998 to 1998.0
        df = pd.read_csv(args.input_file, dtype=str)

        for column in list(df.columns):
            # Fix: whitespace
            df[column] = df[column].apply(fix.whitespace)

            # Fix: unnecessary Unicode
            df[column] = df[column].apply(fix.unnecessary_unicode)

            # Check: invalid multi-value separator
            # NOTE(review): the result of every check.* call is stored back
            # into the column, so the checks are presumably expected to
            # return the value they were given -- verify in check.py.
            df[column] = df[column].apply(check.separators)

            # Check: suspicious characters
            df[column] = df[column].apply(check.suspicious_characters)

            # Fix: invalid multi-value separator
            if args.unsafe_fixes:
                df[column] = df[column].apply(fix.separators)
                # Run whitespace fix again after fixing invalid separators
                df[column] = df[column].apply(fix.whitespace)

            # A column name may contain several keywords; every matching
            # check runs, in the order ISSN, ISBN, date.
            for pattern, checker in name_checks:
                if pattern.match(column) is not None:
                    df[column] = df[column].apply(checker)

        # Write
        df.to_csv(args.output_file, index=False)
    finally:
        # argparse.FileType opened both handles; close them so the output
        # is flushed deterministically. "-" maps to the standard streams,
        # which must stay open for the rest of the process.
        for handle in (args.input_file, args.output_file):
            if handle is not sys.stdin and handle is not sys.stdout:
                handle.close()