Source code for google_drive_ocr.utils

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Utility Functions
"""

import os
import logging
from collections.abc import Iterable
from typing import Generator, Iterator, List, Set, Tuple

from pdf2image import convert_from_path
from pdf2image.generators import threadsafe

###############################################################################

LOGGER = logging.getLogger(__name__)

###############################################################################


def get_files(topdir: str, extn: str) -> Generator[str, None, None]:
    """
    Search :code:`topdir` recursively for all files with extension
    :code:`extn`

    The extension is checked with :code:`str.endswith()`, instead of the
    supposedly better :code:`os.path.splitext()`, in order to facilitate the
    search with multiple dots in the :code:`extn`

    i.e. :code:`>>> get_files(topdir, ".xyz.txt")` wouldn't have worked as
    expected if :code:`splitext()` was used.

    The comparison is case-insensitive: both the file name and
    :code:`extn` are lower-cased before matching.

    Parameters
    ----------
    topdir : str
        Path of the directory to search files in
    extn : str
        Extension to look for

    Returns
    -------
    Generator[str, None, None]
        Matching file paths, each one built by joining the directory
        reported by :code:`os.walk()` with the file name
    """
    # Lower-case the extension once, instead of once per visited file.
    suffix = extn.lower()
    return (
        os.path.join(dirpath, name)
        # Sub-directory names are not needed; os.walk() descends on its own.
        for dirpath, _dirnames, files in os.walk(topdir)
        for name in files
        if name.lower().endswith(suffix)
    )
############################################################################### # PDF Utils
def list_to_range(list_of_int: List[int]) -> List[Tuple[int, int]]:
    """Convert a list of integers into a list of ranges

    A range is tuple (start, end), both ends inclusive. The input is
    de-duplicated and sorted first, so the order and repetition of the
    given values do not matter. Values that are not whole numbers
    (e.g. :code:`2.5`) are skipped; whole-valued floats (e.g. :code:`2.0`)
    are converted to :code:`int`.

    Parameters
    ----------
    list_of_int : List[int]
        List of integers

    Returns
    -------
    List[Tuple[int, int]]
        List of ranges, in ascending order. Empty if there is no usable
        value in :code:`list_of_int`.
    """
    ranges: List[Tuple[int, int]] = []
    start = last = None
    for current in sorted(set(list_of_int)):
        # Skip anything that is not a whole number.
        if current != int(current):
            continue
        current = int(current)
        if last is None:
            # First usable value opens the first range.
            start = current
        elif current != last + 1:
            # Gap found: close the running range and open a new one.
            ranges.append((start, last))
            start = current
        last = current
    # Close the final range, but only if at least one value was accepted;
    # otherwise a meaningless (None, None) entry would be emitted.
    if last is not None:
        ranges.append((start, last))
    return ranges
# Static Name Generator @threadsafe def static_generator(prefix): while True: yield prefix
def extract_pages(
    pdf_path: str, pages: List[int] = None
) -> Set[str]:
    """Extract pages from a PDF file as image files

    Pages are saved in the same directory as the PDF file, with the suffix
    :code:`.page-[number].jpg`

    The PDF path is resolved with :code:`os.path.realpath()` first, so the
    images are written next to the resolved file.

    Parameters
    ----------
    pdf_path : str
        Path to the PDF file
    pages : List[int], optional
        Page numbers to extract. Any iterable of page numbers is accepted
        (it is consumed once); consecutive numbers are grouped into ranges
        and each range is converted in a single call.
        If None, all pages will be extracted.
        The default is None.

    Returns
    -------
    Set[str]
        Set of paths to extracted pages
    """
    pdf_path = os.path.realpath(pdf_path)
    output_path = os.path.dirname(pdf_path)
    output_name, _ = os.path.splitext(os.path.basename(pdf_path))

    if isinstance(pages, Iterable):
        # Materialise once: generators and other one-shot iterables have
        # no len(), and must not be exhausted before being converted into
        # ranges.
        pages = list(pages)
        LOGGER.info(f"Extracting {len(pages)} pages from '{pdf_path}' ..")
        ranges = list_to_range(pages)
    else:
        LOGGER.info(f"Extracting all pages from '{pdf_path}' ..")
        # A single open-ended range makes pdf2image convert every page.
        ranges = [(None, None)]

    paths = set()
    for _start, _end in ranges:
        _paths = convert_from_path(
            pdf_path=pdf_path,
            output_folder=output_path,
            first_page=_start,
            last_page=_end,
            fmt="jpeg",
            jpegopt={"quality": 100, "progressive": True, "optimize": True},
            # Fixed base name for every page instead of a generated one.
            output_file=static_generator(f"{output_name}.page"),
            # Return file paths rather than loaded image objects.
            paths_only=True,
        )
        paths.update(_paths)
        if _start is not None and _end is not None:
            LOGGER.info(f"Extracted {len(_paths)} pages: {_start} to {_end}.")
        else:
            LOGGER.info(f"Extracted {len(_paths)} pages.")
    return paths
###############################################################################