mirror of
				https://github.com/ilri/csv-metadata-quality.git
				synced 2025-10-30 20:31:14 +01:00 
			
		
		
		
	Compare commits
	
		
			2 Commits
		
	
	
		
			3656e9f976
			...
			ed084da08c
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| ed084da08c | |||
| 10612cf891 | |||
| @@ -13,6 +13,10 @@ ified as "unsafe" as I have yet to see a case where this was intentional | ||||
| - Configurable directory for AGROVOC requests cache (to allow running the web | ||||
| version from Google App Engine where we can only write to /tmp) | ||||
|  | ||||
| ### Removed | ||||
| - Checks for invalid and unnecessary multi-value separators because now I fix | ||||
| them whenever I see them, so there is no need to have checks for them | ||||
|  | ||||
| ## [0.4.6] - 2021-03-11 | ||||
| ### Added | ||||
| - Validation of dcterms.license field against SPDX license identifiers  | ||||
|   | ||||
| @@ -104,9 +104,6 @@ def run(argv): | ||||
|         # Fix: unnecessary Unicode | ||||
|         df[column] = df[column].apply(fix.unnecessary_unicode) | ||||
|  | ||||
|         # Check: invalid and unnecessary multi-value separators | ||||
|         df[column] = df[column].apply(check.separators, field_name=column) | ||||
|  | ||||
|         # Check: suspicious characters | ||||
|         df[column] = df[column].apply(check.suspicious_characters, field_name=column) | ||||
|  | ||||
|   | ||||
| @@ -58,42 +58,6 @@ def isbn(field): | ||||
|     return field | ||||
|  | ||||
|  | ||||
| def separators(field, field_name): | ||||
|     """Check for invalid and unnecessary multi-value separators, for example: | ||||
|  | ||||
|         value|value | ||||
|         value|||value | ||||
|         value||value|| | ||||
|  | ||||
|     Prints the field with the invalid multi-value separator. | ||||
|     """ | ||||
|  | ||||
|     # Skip fields with missing values | ||||
|     if pd.isna(field): | ||||
|         return | ||||
|  | ||||
|     # Try to split multi-value field on "||" separator | ||||
|     for value in field.split("||"): | ||||
|         # Check if the current value is blank | ||||
|         if value == "": | ||||
|             print( | ||||
|                 f"{Fore.RED}Unnecessary multi-value separator ({field_name}): {Fore.RESET}{field}" | ||||
|             ) | ||||
|  | ||||
|             continue | ||||
|  | ||||
|         # After splitting, see if there are any remaining "|" characters | ||||
|         match = re.findall(r"^.*?\|.*$", value) | ||||
|  | ||||
|         # Check if there was a match | ||||
|         if match: | ||||
|             print( | ||||
|                 f"{Fore.RED}Invalid multi-value separator ({field_name}): {Fore.RESET}{field}" | ||||
|             ) | ||||
|  | ||||
|     return field | ||||
|  | ||||
|  | ||||
| def date(field, field_name): | ||||
|     """Check if a date is valid. | ||||
|  | ||||
|   | ||||
| @@ -47,50 +47,6 @@ def test_check_valid_isbn(): | ||||
|     assert result == value | ||||
|  | ||||
|  | ||||
| def test_check_invalid_separators(capsys): | ||||
|     """Test checking invalid multi-value separators.""" | ||||
|  | ||||
|     value = "Alan|Orth" | ||||
|  | ||||
|     field_name = "dc.contributor.author" | ||||
|  | ||||
|     check.separators(value, field_name) | ||||
|  | ||||
|     captured = capsys.readouterr() | ||||
|     assert ( | ||||
|         captured.out | ||||
|         == f"{Fore.RED}Invalid multi-value separator ({field_name}): {Fore.RESET}{value}\n" | ||||
|     ) | ||||
|  | ||||
|  | ||||
| def test_check_unnecessary_separators(capsys): | ||||
|     """Test checking unnecessary multi-value separators.""" | ||||
|  | ||||
|     field = "Alan||Orth||" | ||||
|  | ||||
|     field_name = "dc.contributor.author" | ||||
|  | ||||
|     check.separators(field, field_name) | ||||
|  | ||||
|     captured = capsys.readouterr() | ||||
|     assert ( | ||||
|         captured.out | ||||
|         == f"{Fore.RED}Unnecessary multi-value separator ({field_name}): {Fore.RESET}{field}\n" | ||||
|     ) | ||||
|  | ||||
|  | ||||
| def test_check_valid_separators(): | ||||
|     """Test checking valid multi-value separators.""" | ||||
|  | ||||
|     value = "Alan||Orth" | ||||
|  | ||||
|     field_name = "dc.contributor.author" | ||||
|  | ||||
|     result = check.separators(value, field_name) | ||||
|  | ||||
|     assert result == value | ||||
|  | ||||
|  | ||||
| def test_check_missing_date(capsys): | ||||
|     """Test checking missing date.""" | ||||
|  | ||||
|   | ||||
		Reference in New Issue
	
	Block a user