Source code for pdfebc_core.compress

# -*- coding: utf-8 -*-
"""This module contains the PDF manipulation functions of the pdfebc program. Ghostscript is used to
compress PDF files.

.. module:: compress
    :platform: Unix
    :synopsis: PDF maniupulation functions using Ghostscript.

.. moduleauthor:: Simon Larsén <slarse@kth.se>
"""
import os
import sys
import subprocess
import daiquiri
from .misc_utils import if_callable_call_with_formatted_string

BYTES_PER_MEGABYTE = 1024**2
FILE_SIZE_LOWER_LIMIT = BYTES_PER_MEGABYTE
PDF_EXTENSION = ".pdf"

COMPRESSING_MULTIPLE = """Source directory: '{}'
Output directory: '{}'
Found '{}' PDF files. Starting compression ..."""
ALL_FILES_DONE = """All files done!
Results saved to '{}'"""
COMPRESSING = "Compressing '{}' ..."
FILE_DONE = "File done! Result saved to '{}'"
NOT_COMPRESSING = """Not compressing '{}'
Reason: Actual file size is {} bytes,
lower limit for compression is {} bytes"""
GS_NOT_INSTALLED = """Ghostscript not installed or not aliased to '{}'.
Exiting ..."""

LOGGER = daiquiri.getLogger(__name__)

def _get_pdf_filenames_at(source_directory):
    """Find all PDF files in the specified directory.

    Args:
        source_directory (str): The source directory.

    Returns:
        list(str): Filepaths to all PDF files in the specified directory.

    Raises:
        ValueError
    """
    if not os.path.isdir(source_directory):
        raise ValueError("%s is not a directory!" % source_directory)
    return [os.path.join(source_directory, filename)
            for filename in os.listdir(source_directory)
            if filename.endswith(PDF_EXTENSION)]

[docs]def compress_pdf(filepath, output_path, ghostscript_binary): """Compress a single PDF file. Args: filepath (str): Path to the PDF file. output_path (str): Output path. ghostscript_binary (str): Name/alias of the Ghostscript binary. Raises: ValueError FileNotFoundError """ if not filepath.endswith(PDF_EXTENSION): raise ValueError("Filename must end with .pdf!\n%s does not." % filepath) try: file_size = os.stat(filepath).st_size if file_size < FILE_SIZE_LOWER_LIMIT: LOGGER.info(NOT_COMPRESSING.format(filepath, file_size, FILE_SIZE_LOWER_LIMIT)) process = subprocess.Popen(['cp', filepath, output_path]) else: LOGGER.info(COMPRESSING.format(filepath)) process = subprocess.Popen( [ghostscript_binary, "-sDEVICE=pdfwrite", "-dCompatabilityLevel=1.4", "-dPDFSETTINGS=/ebook", "-dNOPAUSE", "-dQUIET", "-dBATCH", "-sOutputFile=%s" % output_path, filepath] ) except FileNotFoundError: msg = GS_NOT_INSTALLED.format(ghostscript_binary) raise FileNotFoundError(msg) process.communicate() LOGGER.info(FILE_DONE.format(output_path))
[docs]def compress_multiple_pdfs(source_directory, output_directory, ghostscript_binary): """Compress all PDF files in the current directory and place the output in the given output directory. This is a generator function that first yields the amount of files to be compressed, and then yields the output path of each file. Args: source_directory (str): Filepath to the source directory. output_directory (str): Filepath to the output directory. ghostscript_binary (str): Name of the Ghostscript binary. Returns: list(str): paths to outputs. """ source_paths = _get_pdf_filenames_at(source_directory) yield len(source_paths) for source_path in source_paths: output = os.path.join(output_directory, os.path.basename(source_path)) compress_pdf(source_path, output, ghostscript_binary) yield output