mirror of
synced 2024-11-16 11:07:03 +01:00
Alan Orth
This detects whether text has likely been encoded in one encoding and decoded in another, perhaps multiple times. This often results in display of "mojibake" characters. For example, a file encoded in UTF-8 is opened as CP-1252 (Windows Latin codepage) in Microsoft Excel, and saved again as UTF-8. You will see strings like this in the resulting file: - CIAT PublicaÃ§ao - CIAT PublicaciÃ³n The correct version of these in UTF-8 would be: - CIAT Publicaçao - CIAT Publicación I use a code snippet from Martijn Pieters on StackOverflow to detect whether a string is "weird" as determined by the excellent "fixes text for you" (ftfy) Python library, then check if a weird string encodes as CP-1252 or not. If so, I can try to fix it. See: https://stackoverflow.com/questions/29071995/identify-garbage-unicode-string-using-python
38 lines
988 B
38 lines
988 B
# Package metadata consumed by Poetry.
[tool.poetry]
name = "csv-metadata-quality"
version = "0.4.7"
description = "A simple, but opinionated CSV quality checking and fixing pipeline for CSVs in the DSpace ecosystem."
authors = ["Alan Orth <alan.orth@gmail.com>"]
repository = "https://github.com/ilri/csv-metadata-quality"
homepage = "https://github.com/ilri/csv-metadata-quality"

# Console command -> "module:function" entry point.
[tool.poetry.scripts]
csv-metadata-quality = "csv_metadata_quality.__main__:main"

# Runtime dependencies.
# NOTE(review): the runtime/dev split below is inferred from what each
# package is for — confirm against the upstream file.
[tool.poetry.dependencies]
python = "^3.7.1"
pandas = "^1.0.4"
python-stdnum = "^1.13"
xlrd = "^1.2.0"
requests = "^2.23.0"
requests-cache = "^0.5.2"
pycountry = "^19.8.18"
langid = "^1.1.6"
colorama = "^0.4.4"
spdx-license-list = "^0.5.2"
ftfy = "^5.9"

# Development-only tooling: tests, linting, formatting, interactive shell.
[tool.poetry.dev-dependencies]
pytest = "^6.1.1"
ipython = { version = "^7.18.1", python = "^3.7" }
flake8 = "^3.8.4"
pytest-clarity = "^0.3.0-alpha.0"
# Exact pin (no caret); presumably to keep formatter output stable — confirm.
black = "20.8b1"
isort = "^5.5.4"
csvkit = "^1.0.5"

# PEP 517/518 build backend declaration; must live in its own top-level table.
[build-system]
requires = ["poetry>=0.12"]
build-backend = "poetry.masonry.api"