1
0
mirror of https://github.com/ilri/csv-metadata-quality.git synced 2025-02-18 07:46:21 +01:00

Compare commits

...

2 Commits

Author SHA1 Message Date
28f9026286
README.md: Minor edit
All checks were successful
continuous-integration/drone/push Build is passing
2021-03-19 16:26:31 +02:00
cfe09f7126
Add SPDX short license identifier to all Python files
See: https://spdx.github.io/spdx-spec/appendix-V-using-SPDX-short-identifiers-in-source-files/
2021-03-19 16:04:40 +02:00
10 changed files with 19 additions and 1 deletions

View File

@ -63,7 +63,7 @@ While it is *theoretically* possible for a single `|` character to be used legit
This will also remove unnecessary trailing multi-value separators, for example `Kenya||Tanzania||`.
## Unsafe Fixes
You can enable several "unsafe" fixes with the `--unsafe-fixes` option. Currently this will remove newlines and perform Unicode normalization.
You can enable several "unsafe" fixes with the `--unsafe-fixes` option. Currently this will remove newlines, perform Unicode normalization, and attempt to fix "mojibake" characters.
### Newlines
This is considered "unsafe" because some systems give special importance to vertical space and render it properly. DSpace does not support rendering newlines in its XMLUI and has, at times, suffered from parsing errors that caused the import process to fail when an input file contained newlines. The `--unsafe-fixes` option strips Unix line feeds (U+000A).

View File

@ -1,3 +1,5 @@
# SPDX-License-Identifier: GPL-3.0-only
from sys import argv
from csv_metadata_quality import app

View File

@ -1,3 +1,5 @@
# SPDX-License-Identifier: GPL-3.0-only
import argparse
import re
import signal

View File

@ -1,3 +1,5 @@
# SPDX-License-Identifier: GPL-3.0-only
import os
import re
from datetime import datetime, timedelta

View File

@ -1,3 +1,5 @@
# SPDX-License-Identifier: GPL-3.0-only
import re
import langid

View File

@ -1,3 +1,5 @@
# SPDX-License-Identifier: GPL-3.0-only
import re
from unicodedata import normalize

View File

@ -1,3 +1,5 @@
# SPDX-License-Identifier: GPL-3.0-only
from ftfy.badness import sequence_weirdness

View File

@ -1 +1,3 @@
# SPDX-License-Identifier: GPL-3.0-only
VERSION = "0.4.8-dev"

View File

@ -1,3 +1,5 @@
# SPDX-License-Identifier: GPL-3.0-only
import pandas as pd
from colorama import Fore

View File

@ -1,3 +1,5 @@
# SPDX-License-Identifier: GPL-3.0-only
import csv_metadata_quality.fix as fix