mirror of
				https://github.com/ilri/csv-metadata-quality.git
				synced 2025-10-31 12:51:14 +01:00 
			
		
		
		
	Compare commits
	
		
			7 Commits
		
	
	
		
			d5afbad788
			...
			8bc4cd419c
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| 8bc4cd419c | |||
| bde38e9ed4 | |||
| 8db1e36a6d | |||
| fbb625be5c | |||
| 084b970798 | |||
| 171b35b015 | |||
| 545bb8cd0c | 
| @@ -12,7 +12,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 | |||||||
| - Use SPDX license data from SPDX themselves instead of spdx-license-list | - Use SPDX license data from SPDX themselves instead of spdx-license-list | ||||||
| because it is deprecated and outdated | because it is deprecated and outdated | ||||||
| - Require Python 3.9+ | - Require Python 3.9+ | ||||||
| - Don't run `fix.separators()` on title fields | - Don't run `fix.separators()` on title or abstract fields | ||||||
|  | - Don't run whitespace or newline fixes on abstract fields | ||||||
|  | - Ignore some common non-SPDX licenses | ||||||
|  | - Ignore `__description` suffix in filenames meant for SAFBuilder when checking | ||||||
|  | for uncommon file extensions | ||||||
|  |  | ||||||
| ### Updated | ### Updated | ||||||
| - Python dependencies | - Python dependencies | ||||||
|   | |||||||
| @@ -90,11 +90,13 @@ def run(argv): | |||||||
|  |  | ||||||
|             continue |             continue | ||||||
|  |  | ||||||
|  |         if args.unsafe_fixes: | ||||||
|  |             match = re.match(r"^.*?abstract.*$", column) | ||||||
|  |             if match is None: | ||||||
|                 # Fix: whitespace |                 # Fix: whitespace | ||||||
|                 df[column] = df[column].apply(fix.whitespace, field_name=column) |                 df[column] = df[column].apply(fix.whitespace, field_name=column) | ||||||
|  |  | ||||||
|                 # Fix: newlines |                 # Fix: newlines | ||||||
|         if args.unsafe_fixes: |  | ||||||
|                 df[column] = df[column].apply(fix.newlines, field_name=column) |                 df[column] = df[column].apply(fix.newlines, field_name=column) | ||||||
|  |  | ||||||
|         # Fix: missing space after comma. Only run on author and citation |         # Fix: missing space after comma. Only run on author and citation | ||||||
| @@ -122,9 +124,9 @@ def run(argv): | |||||||
|         df[column] = df[column].apply(fix.unnecessary_unicode) |         df[column] = df[column].apply(fix.unnecessary_unicode) | ||||||
|  |  | ||||||
|         # Fix: invalid and unnecessary multi-value separators. Skip the title |         # Fix: invalid and unnecessary multi-value separators. Skip the title | ||||||
|         # field because sometimes "|" is used to indicate something like a |         # and abstract fields because "|" is used to indicate something like | ||||||
|         # subtitle. |         # a subtitle. | ||||||
|         match = re.match(r"^.*?title.*$", column) |         match = re.match(r"^.*?(abstract|title).*$", column) | ||||||
|         if match is None: |         if match is None: | ||||||
|             df[column] = df[column].apply(fix.separators, field_name=column) |             df[column] = df[column].apply(fix.separators, field_name=column) | ||||||
|             # Run whitespace fix again after fixing invalid separators |             # Run whitespace fix again after fixing invalid separators | ||||||
|   | |||||||
| @@ -286,6 +286,11 @@ def filename_extension(field): | |||||||
|  |  | ||||||
|     # Iterate over all values |     # Iterate over all values | ||||||
|     for value in values: |     for value in values: | ||||||
|  |         # Strip filename descriptions that are meant for SAFBuilder, for | ||||||
|  |         # example: Annual_Report_2020.pdf__description:Report | ||||||
|  |         if "__description" in value: | ||||||
|  |             value = value.split("__")[0] | ||||||
|  |  | ||||||
|         # Assume filename extension does not match |         # Assume filename extension does not match | ||||||
|         filename_extension_match = False |         filename_extension_match = False | ||||||
|  |  | ||||||
| @@ -312,8 +317,19 @@ def spdx_license_identifier(field): | |||||||
|     Prints the value if it is invalid. |     Prints the value if it is invalid. | ||||||
|     """ |     """ | ||||||
|  |  | ||||||
|  |     # List of common non-SPDX licenses to ignore | ||||||
|  |     # See: https://ilri.github.io/cgspace-submission-guidelines/dcterms-license/dcterms-license.txt | ||||||
|  |     ignore_licenses = { | ||||||
|  |         "All rights reserved; no re-use allowed", | ||||||
|  |         "All rights reserved; self-archive copy only", | ||||||
|  |         "Copyrighted; Non-commercial educational use only", | ||||||
|  |         "Copyrighted; Non-commercial use only", | ||||||
|  |         "Copyrighted; all rights reserved", | ||||||
|  |         "Other", | ||||||
|  |     } | ||||||
|  |  | ||||||
|     # Skip fields with missing values |     # Skip fields with missing values | ||||||
|     if pd.isna(field): |     if pd.isna(field) or field in ignore_licenses: | ||||||
|         return |         return | ||||||
|  |  | ||||||
|     spdx_licenses = load_spdx_licenses() |     spdx_licenses = load_spdx_licenses() | ||||||
|   | |||||||
							
								
								
									
										17
									
								
								data/abstract-check.csv
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										17
									
								
								data/abstract-check.csv
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,17 @@ | |||||||
|  | id,dc.title,dcterms.abstract | ||||||
|  | 1,Normal item,This is an abstract | ||||||
|  | 2,Leading whitespace, This is an abstract | ||||||
|  | 3,Trailing whitespace,This is an abstract  | ||||||
|  | 4,Consecutive whitespace,This is  an abstract | ||||||
|  | 5,Newline,"This | ||||||
|  | is an abstract" | ||||||
|  | 6,Newline with leading whitespace," This | ||||||
|  |  is an abstract" | ||||||
|  | 7,Newline with trailing whitespace,"This  | ||||||
|  | is an abstract " | ||||||
|  | 8,Newline with consecutive whitespace,"This | ||||||
|  | is an  abstract" | ||||||
|  | 9,Multiple newlines,"This | ||||||
|  | is | ||||||
|  | an | ||||||
|  | abstract" | ||||||
| 
 | 
		Reference in New Issue
	
	Block a user