From e2a18c248c02a82fdd990f21c7685638c869905d Mon Sep 17 00:00:00 2001 From: Donald Gray Date: Fri, 18 Jul 2025 16:21:17 +0100 Subject: [PATCH 1/2] Allow use_cropbox to be controlled --- README.md | 1 + src/app/engine/rasterizers.py | 3 +++ src/app/settings.py | 1 + 3 files changed, 5 insertions(+) diff --git a/README.md b/README.md index f635ad9..874284c 100644 --- a/README.md +++ b/README.md @@ -80,6 +80,7 @@ The following list of environment variables are supported: | `PDF_RASTERIZER_FALLBACK_DPI` | `200` | Engine | The DPI to use for images that exceed pdftoppm memory size and produce a 1x1 pixel (see https://github.com/Belval/pdf2image/issues/34) | | `PDF_RASTERIZER_FORMAT` | `jpg` | Engine | The format to generate rasterized images in. Supported values are `ppm`, `jpeg` / `jpg`, `png` and `tiff` | | `PDF_RASTERIZER_MAX_LENGTH` | `0` | Engine | Optional, the maximum size of pixels on longest edge that will be saved. If rasterized image exceeds this it will be resized, maintaining aspect ratio. | +| `PDF_RASTERIZER_USE_CROPBOX` | `False` | Engine | If `True`, the PDF CropBox is used instead of the MediaBox. The MediaBox is the largest page box in a PDF; the other page boxes can equal the size of the MediaBox but cannot be larger. The CropBox defines the region to which the page contents are to be clipped. | | `DLCS_API_ROOT` | `https://api.dlcs.digirati.io` | Engine | The root URI of the API of the target DLCS deployment, without the trailing slash. | | `DLCS_S3_BUCKET_NAME` | `dlcs-composite-images` | Engine | The S3 bucket that the Composite Handler will push rasterized images to, for consumption by the wider DLCS. Both the Composite Handler and the DLCS must have access to this bucket. | | `DLCS_S3_OBJECT_KEY_PREFIX` | `composites` | Engine | The S3 key prefix to use when pushing images to the `DLCS_S3_BUCKET_NAME` - in other words, the folder within the S3 bucket into which images are stored. 
| diff --git a/src/app/engine/rasterizers.py b/src/app/engine/rasterizers.py index f056a81..0c2aac9 100644 --- a/src/app/engine/rasterizers.py +++ b/src/app/engine/rasterizers.py @@ -24,6 +24,7 @@ def __init__(self): self._fmt = settings.PDF_RASTERIZER["format"] self._thread_count = settings.PDF_RASTERIZER["thread_count"] self._max_length = settings.PDF_RASTERIZER["max_length"] + self._use_cropbox = settings.PDF_RASTERIZER["use_cropbox"] def rasterize_pdf(self, subfolder_path): # Typically, pdf2image will write generated images to a temporary path, after @@ -51,6 +52,7 @@ def __rasterize( thread_count=self._thread_count, output_file=output_file, output_folder=subfolder_path, + use_cropbox=self._use_cropbox, ) def __validate_rasterized_images(self, images, pdf_source, subfolder_path): @@ -90,6 +92,7 @@ def __ensure_image_size(self, idx, im: Image): logger.info( f"resizing image index {idx} from {w},{h} to {scale_w},{scale_h}" ) + with im.resize((scale_w, scale_h), resample=Image.LANCZOS) as resized: resized.save(filename) return ResizeResult.RESIZED diff --git a/src/app/settings.py b/src/app/settings.py index 58f1c28..1ae1cb8 100644 --- a/src/app/settings.py +++ b/src/app/settings.py @@ -177,6 +177,7 @@ "dpi": env("PDF_RASTERIZER_DPI", cast=int, default=500), "fallback_dpi": env("PDF_RASTERIZER_FALLBACK_DPI", cast=int, default=200), "max_length": env("PDF_RASTERIZER_MAX_LENGTH", cast=int, default=0), + "use_cropbox": env("PDF_RASTERIZER_USE_CROPBOX", cast=bool, default=False), } ORIGIN_CONFIG = {"chunk_size": env("ORIGIN_CHUNK_SIZE", cast=int, default=8192)} From 4ddebfc45863f212f7a91443699b50c32afd0cf1 Mon Sep 17 00:00:00 2001 From: Donald Gray Date: Fri, 18 Jul 2025 17:20:12 +0100 Subject: [PATCH 2/2] Update setuptools in container Without this hitting No module named 'pkg_resources' error --- Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 3a273ec..31b1b02 100644 --- a/Dockerfile +++ b/Dockerfile @@ 
-13,7 +13,8 @@ RUN apt-get update && apt-get --yes install apt-utils && apt-get --yes upgrade \ && apt-get --yes install poppler-data poppler-utils \ && apt-get --yes autoremove && apt-get --yes autoclean && apt-get --yes clean \ && useradd --create-home --home-dir /srv/dlcs --shell /bin/bash --uid 1000 dlcs \ - && python -m pip install --upgrade pip + && python -m pip install --upgrade pip \ + && python -m pip install --upgrade setuptools # Copy nginx config and create appropriate folders COPY --chown=dlcs:dlcs ./nginx.conf /etc/nginx/nginx.conf