src/main/java: Tune FixJpgJpgThumbnails a bit

Make sure we don't modify thumbnails if the item is an Infographic
because the JPG in the ORIGINAL bundle might actually be the "real"
file, in which case the THUMBNAIL bundle would have a legitimate
".jpg.jpg" file.

Also, limit the criteria for replacement to original bitstreams
that are less than 100KiB. In my tests I found that we had 4,022
items with ".jpg.jpg" thumbnails, and the average file size of the
originals in those items was 98KiB. Without considering the large
infographics, which are several megabytes apiece, the average of
the remaining 3,765 originals was ~20KiB so 100KiB should be very
safe.
This commit is contained in:
Alan Orth 2020-08-07 09:50:03 +03:00
parent fdc910f93b
commit 26d3cbd778
Signed by: alanorth
GPG Key ID: 0FB860CC9C45B1B9

View File

@ -13,8 +13,8 @@ import java.sql.SQLException;
/**
* @author Andrea Schweer schweer@waikato.ac.nz for the LCoNZ Institutional Research Repositories
* @author Alan Orth for the International Livestock Research Institute
* @version 5.1-SNAPSHOT
* @since 5.1-SNAPSHOT
* @version 5.3
* @since 5.1
*/
public class FixJpgJpgThumbnails {
@ -73,6 +73,16 @@ public class FixJpgJpgThumbnails {
}
private static void processItem(Item item) throws SQLException, AuthorizeException, IOException {
// Some bitstreams like Infographics are large JPGs and put in the ORIGINAL bundle on purpose so we shouldn't
// swap them.
Metadatum[] itemTypes = item.getMetadataByMetadataString("dc.type");
Boolean itemHasInfographic = false;
for (Metadatum itemType: itemTypes) {
if (itemType.value.equals("Infographic")) {
itemHasInfographic = true;
}
}
Bundle[] thumbnailBundles = item.getBundles("THUMBNAIL");
for (Bundle thumbnailBundle : thumbnailBundles) {
Bitstream[] thumbnailBundleBitstreams = thumbnailBundle.getBitstreams();
@ -84,11 +94,25 @@ public class FixJpgJpgThumbnails {
for (Bundle originalBundle : originalBundles) {
Bitstream[] originalBundleBitstreams = originalBundle.getBitstreams();
for(Bitstream originalBitstream : originalBundleBitstreams) {
for (Bitstream originalBitstream : originalBundleBitstreams) {
String originalName = originalBitstream.getName();
//check if the original file name is the same as the thumbnail name minus the extra ".jpg"
if (originalName.equalsIgnoreCase(StringUtils.removeEndIgnoreCase(thumbnailName, ".jpg")) && ("Generated Thumbnail".equals(thumbnailBitstream.getDescription()) || "IM Thumbnail".equals(thumbnailBitstream.getDescription()))) {
Long originalBitstreamBytes = originalBitstream.getSize();
/*
- check if the original file name is the same as the thumbnail name minus the extra ".jpg"
- check if the thumbnail description indicates it was automatically generated
- check if the item has dc.type Infographic (JPG could be the "real" item!)
- check if the original bitstream is less than ~100KiB
- Note: in my tests there were 4022 items with ".jpg.jpg" thumbnails totaling 394549249
bytes for an average of about 98KiB so ~100KiB seems like a good cut off
*/
if (
originalName.equalsIgnoreCase(StringUtils.removeEndIgnoreCase(thumbnailName, ".jpg"))
&& ("Generated Thumbnail".equals(thumbnailBitstream.getDescription()) || "IM Thumbnail".equals(thumbnailBitstream.getDescription()))
&& !itemHasInfographic
&& originalBitstreamBytes < 100000
) {
System.out.println(item.getHandle() + ": replacing " + thumbnailName + " with " + originalName);
//add the original bitstream to the THUMBNAIL bundle