Source code for pdfebc_core.compress

# -*- coding: utf-8 -*-
"""This module contains the PDF manipulation functions of the pdfebc program. Ghostscript is used to
compress PDF files.

.. module:: compress
    :platform: Unix
    :synopsis: PDF manipulation functions using Ghostscript.

.. moduleauthor:: Simon Larsén <slarse@kth.se>
"""
import os
import shutil
import subprocess
import sys

import daiquiri

from .misc_utils import if_callable_call_with_formatted_string

# Size thresholds. Files smaller than FILE_SIZE_LOWER_LIMIT bytes are not
# worth compressing.
BYTES_PER_MEGABYTE = 1024 * 1024
FILE_SIZE_LOWER_LIMIT = BYTES_PER_MEGABYTE
PDF_EXTENSION = ".pdf"

# User-facing message templates; each '{}' is filled in with str.format().
COMPRESSING_MULTIPLE = (
    "Source directory: '{}'\n"
    "Output directory: '{}'\n"
    "Found '{}' PDF files. Starting compression ..."
)
ALL_FILES_DONE = (
    "All files done!\n"
    "Results saved to '{}'"
)
COMPRESSING = "Compressing '{}' ..."
FILE_DONE = "File done! Result saved to '{}'"
NOT_COMPRESSING = (
    "Not compressing '{}'\n"
    "Reason: Actual file size is {} bytes,\n"
    "lower limit for compression is {} bytes"
)
GS_NOT_INSTALLED = (
    "Ghostscript not installed or not aliased to '{}'.\n"
    "Exiting ..."
)

# Module-level logger, obtained from daiquiri and named after this module.
LOGGER = daiquiri.getLogger(__name__)

def _get_pdf_filenames_at(source_directory):
    """Find all PDF files in the specified directory.

    Only regular files (or symlinks to them) whose name ends with
    ``PDF_EXTENSION`` are returned; subdirectories are never descended into.
    The result is sorted by filename so that the order is deterministic.

    Args:
        source_directory (str): The source directory.

    Returns:
        list(str): Filepaths to all PDF files in the specified directory.

    Raises:
        ValueError: If ``source_directory`` is not a directory.
    """
    if not os.path.isdir(source_directory):
        raise ValueError("%s is not a directory!" % source_directory)
    # os.listdir() returns entries in arbitrary order, so sort them.
    candidates = (os.path.join(source_directory, filename)
                  for filename in sorted(os.listdir(source_directory))
                  if filename.endswith(PDF_EXTENSION))
    # A directory may also be named "something.pdf"; skip anything that is
    # not an actual file so callers never try to compress a directory.
    return [path for path in candidates if os.path.isfile(path)]

def compress_pdf(filepath, output_path, ghostscript_binary):
    """Compress a single PDF file with Ghostscript.

    Files smaller than ``FILE_SIZE_LOWER_LIMIT`` bytes are not compressed;
    they are copied to ``output_path`` unchanged instead. Failures of the
    copy or of the Ghostscript run are logged as errors rather than raised.

    Args:
        filepath (str): Path to the PDF file.
        output_path (str): Output path.
        ghostscript_binary (str): Name/alias of the Ghostscript binary.

    Raises:
        ValueError: If ``filepath`` does not end with ``.pdf``.
        FileNotFoundError: If ``filepath`` does not exist, or if the
            Ghostscript binary cannot be found.
    """
    if not filepath.endswith(PDF_EXTENSION):
        raise ValueError("Filename must end with .pdf!\n%s does not." % filepath)

    # Deliberately outside the try block below: a missing input file must not
    # be reported as "Ghostscript not installed".
    file_size = os.stat(filepath).st_size

    if file_size < FILE_SIZE_LOWER_LIMIT:
        LOGGER.info(NOT_COMPRESSING.format(filepath, file_size, FILE_SIZE_LOWER_LIMIT))
        try:
            shutil.copy(filepath, output_path)
        except shutil.SameFileError:
            # Source and destination are the same file; nothing to copy.
            pass
        except OSError as err:
            LOGGER.error("Failed to copy '{}' to '{}': {}".format(
                filepath, output_path, err))
            return
        LOGGER.info(FILE_DONE.format(output_path))
        return

    LOGGER.info(COMPRESSING.format(filepath))
    try:
        process = subprocess.Popen(
            [ghostscript_binary, "-sDEVICE=pdfwrite",
             # The switch is spelled "Compatibility"; Ghostscript silently
             # ignores unknown -d definitions, so a misspelling has no effect.
             "-dCompatibilityLevel=1.4", "-dPDFSETTINGS=/ebook",
             "-dNOPAUSE", "-dQUIET", "-dBATCH",
             "-sOutputFile=%s" % output_path, filepath]
            )
    except FileNotFoundError as err:
        # Only the Popen call is guarded, so this really means that the
        # Ghostscript executable could not be found.
        msg = GS_NOT_INSTALLED.format(ghostscript_binary)
        raise FileNotFoundError(msg) from err
    process.communicate()
    if process.returncode != 0:
        # Do not claim success when Ghostscript reported a failure.
        LOGGER.error("Ghostscript exited with status {} while compressing '{}'".format(
            process.returncode, filepath))
        return
    LOGGER.info(FILE_DONE.format(output_path))

def compress_multiple_pdfs(source_directory, output_directory, ghostscript_binary):
    """Compress every PDF file found in the source directory, writing each
    result to the output directory under the same file name.

    This is a generator function. The first value it yields is the number of
    PDF files that were found; after that it compresses the files one at a
    time and yields the output path of each file once it has been processed.

    Args:
        source_directory (str): Filepath to the source directory.
        output_directory (str): Filepath to the output directory.
        ghostscript_binary (str): Name of the Ghostscript binary.

    Yields:
        int: The number of PDF files found (first value only).
        str: The output path of each processed file (all following values).
    """
    pdf_paths = _get_pdf_filenames_at(source_directory)
    yield len(pdf_paths)
    # Pair every input file with its destination before starting the work.
    jobs = [
        (pdf_path, os.path.join(output_directory, os.path.basename(pdf_path)))
        for pdf_path in pdf_paths
    ]
    for pdf_path, destination in jobs:
        compress_pdf(pdf_path, destination, ghostscript_binary)
        yield destination