mirror of
				https://github.com/ilri/csv-metadata-quality.git
				synced 2025-10-30 20:31:14 +01:00 
			
		
		
		
	Compare commits
	
		
			2 Commits
		
	
	
		
			3656e9f976
			...
			ed084da08c
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| ed084da08c | |||
| 10612cf891 | 
| @@ -13,6 +13,10 @@ ified as "unsafe" as I have yet to see a case where this was intentional | |||||||
| - Configurable directory for AGROVOC requests cache (to allow running the web | - Configurable directory for AGROVOC requests cache (to allow running the web | ||||||
| version from Google App Engine where we can only write to /tmp) | version from Google App Engine where we can only write to /tmp) | ||||||
|  |  | ||||||
|  | ### Removed | ||||||
|  | - Checks for invalid and unnecessary multi-value separators because now I fix | ||||||
|  | them whenever I see them, so there is no need to have checks for them | ||||||
|  |  | ||||||
| ## [0.4.6] - 2021-03-11 | ## [0.4.6] - 2021-03-11 | ||||||
| ### Added | ### Added | ||||||
| - Validation of dcterms.license field against SPDX license identifiers  | - Validation of dcterms.license field against SPDX license identifiers  | ||||||
|   | |||||||
| @@ -104,9 +104,6 @@ def run(argv): | |||||||
|         # Fix: unnecessary Unicode |         # Fix: unnecessary Unicode | ||||||
|         df[column] = df[column].apply(fix.unnecessary_unicode) |         df[column] = df[column].apply(fix.unnecessary_unicode) | ||||||
|  |  | ||||||
|         # Check: invalid and unnecessary multi-value separators |  | ||||||
|         df[column] = df[column].apply(check.separators, field_name=column) |  | ||||||
|  |  | ||||||
|         # Check: suspicious characters |         # Check: suspicious characters | ||||||
|         df[column] = df[column].apply(check.suspicious_characters, field_name=column) |         df[column] = df[column].apply(check.suspicious_characters, field_name=column) | ||||||
|  |  | ||||||
|   | |||||||
| @@ -58,42 +58,6 @@ def isbn(field): | |||||||
|     return field |     return field | ||||||
|  |  | ||||||
|  |  | ||||||
| def separators(field, field_name): |  | ||||||
|     """Check for invalid and unnecessary multi-value separators, for example: |  | ||||||
|  |  | ||||||
|         value|value |  | ||||||
|         value|||value |  | ||||||
|         value||value|| |  | ||||||
|  |  | ||||||
|     Prints the field with the invalid multi-value separator. |  | ||||||
|     """ |  | ||||||
|  |  | ||||||
|     # Skip fields with missing values |  | ||||||
|     if pd.isna(field): |  | ||||||
|         return |  | ||||||
|  |  | ||||||
|     # Try to split multi-value field on "||" separator |  | ||||||
|     for value in field.split("||"): |  | ||||||
|         # Check if the current value is blank |  | ||||||
|         if value == "": |  | ||||||
|             print( |  | ||||||
|                 f"{Fore.RED}Unnecessary multi-value separator ({field_name}): {Fore.RESET}{field}" |  | ||||||
|             ) |  | ||||||
|  |  | ||||||
|             continue |  | ||||||
|  |  | ||||||
|         # After splitting, see if there are any remaining "|" characters |  | ||||||
|         match = re.findall(r"^.*?\|.*$", value) |  | ||||||
|  |  | ||||||
|         # Check if there was a match |  | ||||||
|         if match: |  | ||||||
|             print( |  | ||||||
|                 f"{Fore.RED}Invalid multi-value separator ({field_name}): {Fore.RESET}{field}" |  | ||||||
|             ) |  | ||||||
|  |  | ||||||
|     return field |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def date(field, field_name): | def date(field, field_name): | ||||||
|     """Check if a date is valid. |     """Check if a date is valid. | ||||||
|  |  | ||||||
|   | |||||||
| @@ -47,50 +47,6 @@ def test_check_valid_isbn(): | |||||||
|     assert result == value |     assert result == value | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_check_invalid_separators(capsys): |  | ||||||
|     """Test checking invalid multi-value separators.""" |  | ||||||
|  |  | ||||||
|     value = "Alan|Orth" |  | ||||||
|  |  | ||||||
|     field_name = "dc.contributor.author" |  | ||||||
|  |  | ||||||
|     check.separators(value, field_name) |  | ||||||
|  |  | ||||||
|     captured = capsys.readouterr() |  | ||||||
|     assert ( |  | ||||||
|         captured.out |  | ||||||
|         == f"{Fore.RED}Invalid multi-value separator ({field_name}): {Fore.RESET}{value}\n" |  | ||||||
|     ) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_check_unnecessary_separators(capsys): |  | ||||||
|     """Test checking unnecessary multi-value separators.""" |  | ||||||
|  |  | ||||||
|     field = "Alan||Orth||" |  | ||||||
|  |  | ||||||
|     field_name = "dc.contributor.author" |  | ||||||
|  |  | ||||||
|     check.separators(field, field_name) |  | ||||||
|  |  | ||||||
|     captured = capsys.readouterr() |  | ||||||
|     assert ( |  | ||||||
|         captured.out |  | ||||||
|         == f"{Fore.RED}Unnecessary multi-value separator ({field_name}): {Fore.RESET}{field}\n" |  | ||||||
|     ) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_check_valid_separators(): |  | ||||||
|     """Test checking valid multi-value separators.""" |  | ||||||
|  |  | ||||||
|     value = "Alan||Orth" |  | ||||||
|  |  | ||||||
|     field_name = "dc.contributor.author" |  | ||||||
|  |  | ||||||
|     result = check.separators(value, field_name) |  | ||||||
|  |  | ||||||
|     assert result == value |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_check_missing_date(capsys): | def test_check_missing_date(capsys): | ||||||
|     """Test checking missing date.""" |     """Test checking missing date.""" | ||||||
|  |  | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user