# Source code for label_processing.text_recognition

# Import standard-library and third-party libraries
from __future__ import annotations
import os
import copy
import cv2
import shutil
import math
import pytesseract as py
import numpy as np
from typing import  Union, Tuple, Optional
from deskew import determine_skew
from enum import Enum
from pathlib import Path
import warnings

# Import the project-local helpers from the 'label_processing' package
from label_processing import utils

# Suppress warning messages during execution.
# The 'ignore' filter is installed without a category or module restriction,
# so it applies to every warning raised after this module is imported.
warnings.filterwarnings('ignore')

# Constants
# CONFIG and LANGUAGES are the defaults of Tesseract.__init__ and end up as the
# `config` and language arguments of pytesseract's image_to_string.
# NOTE(review): per the Tesseract CLI, `--psm 6` should mean "single uniform
# block of text" and `--oem 3` the default engine mode -- confirm against the
# installed Tesseract version.
CONFIG = r'--psm 6 --oem 3' # extra command-line flags for the OCR call
LANGUAGES = 'eng+deu+fra+ita+spa+por' # '+'-joined language codes used for OCR
# Bounds handed to deskew.determine_skew as min_angle / max_angle when the
# skew of a label image is estimated.
MIN_SKEW_ANGLE = -10
MAX_SKEW_ANGLE = 10


[docs]
def find_tesseract() -> None:
    """
    Locate the tesseract executable and register it with pytesseract.

    The executable is looked up with ``shutil.which``. When it is found, its
    path is stored in ``pytesseract.pytesseract.tesseract_cmd`` so that later
    OCR calls use it.

    Raises:
        FileNotFoundError: If no ``tesseract`` executable can be found.
    """
    tesseract_path = shutil.which("tesseract")
    if not tesseract_path:
        # The two literals are concatenated implicitly, so the separating
        # space has to be part of the first one.
        raise FileNotFoundError("Could not find tesseract on your machine! "
                                "Please read the README for instructions!")
    py.pytesseract.tesseract_cmd = tesseract_path



#---------------------Image Preprocessing---------------------#



[docs]
class ImageProcessor():
    """
    A class for image preprocessing and other image actions.

    The transformation methods never modify the instance they are called on.
    They return an ``ImageProcessor`` holding the result (normally a shallow
    copy of the instance with the new pixel data), so calls can be chained.
    """

    # Fallback (blocksize, C) pairs for adaptive thresholding, used for every
    # parameter the instance leaves at None.
    _GAUSSIAN_DEFAULTS = (73, 16)
    _MEAN_DEFAULTS = (35, 17)

    def __init__(self, image: np.ndarray, path: str | Path,
                 blocksize: Optional[int] = None, c_value: Optional[int] = None):
        """
        Initialize an instance of Image class.

        Args:
            image (np.ndarray): The image data as a NumPy array.
            path (str | Path): The path to the image file.
            blocksize (int, optional): The blocksize for thresholding. Defaults to None.
            c_value (int, optional): The c_value for thresholding. Defaults to None.

        Raises:
            ValueError: If blocksize is given but is not an odd number >= 3.
        """
        self.image = image
        self.path = Path(path)
        self.filename = self.path.name
        self.blocksize = blocksize
        self.c_value = c_value

    @property
    def blocksize(self) -> Optional[int]:
        """Neighbourhood size for adaptive thresholding, or None for the default."""
        return self._blocksize

    @blocksize.setter
    def blocksize(self, value: Optional[int]) -> None:
        if value is not None and (value <= 1 or value % 2 == 0):
            raise ValueError(
                "Value for blocksize has to be at least 3 and needs to be odd"
            )
        self._blocksize = value

    @property
    def c_value(self) -> Optional[int]:
        """Constant C for adaptive thresholding, or None for the default."""
        return self._c_value

    @c_value.setter
    def c_value(self, value: Optional[int]) -> None:
        self._c_value = value

    @property
    def image(self) -> np.ndarray:
        """The pixel data wrapped by this instance."""
        return self._image

    @image.setter
    def image(self, image: np.ndarray) -> None:
        self._image = image

    @property
    def path(self) -> str | Path:
        """Location of the image file (a ``Path`` when set by ``__init__``)."""
        return self._path

    @path.setter
    def path(self, path: str | Path) -> None:
        self._path = path

    def copy_this(self) -> ImageProcessor:
        """
        Creates a copy of the current Image instance.

        The copy is shallow: it shares the pixel array with the original until
        a new array is assigned to its ``image`` attribute.

        Returns:
            ImageProcessor: A copy of the current Image instance.
        """
        return copy.copy(self)

    def _with_image(self, image: np.ndarray) -> ImageProcessor:
        """Return a shallow copy of this instance that carries *image*."""
        image_instance = self.copy_this()
        image_instance.image = image
        return image_instance

    @staticmethod
    def _as_grayscale(image: np.ndarray) -> np.ndarray:
        """
        Return a single-channel version of *image*.

        A two-dimensional array is already single-channel and is returned
        unchanged, because ``cv2.cvtColor`` would reject it. Anything else is
        converted from BGR, the channel order produced by ``cv2.imread``.
        """
        if isinstance(image, np.ndarray) and image.ndim == 2:
            return image
        return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    @staticmethod
    def read_image(path: str|Path) -> ImageProcessor:
        """
        Read an image from the specified path and return an instance of the Image class.

        NOTE(review): per the OpenCV documentation ``cv2.imread`` does not
        raise for a missing or unreadable file but returns ``None``; that
        value is stored as the image unchanged. Confirm callers check for it.

        Args:
            path (str | Path): The path to a JPG file.

        Returns:
            ImageProcessor: An instance wrapping the loaded image.
        """
        return ImageProcessor(cv2.imread(str(path)), path)

    def get_grayscale(self) -> ImageProcessor:
        """
        Convert the image to grayscale.

        The image is interpreted as BGR (the order ``cv2.imread`` returns),
        which is the same interpretation ``get_skew_angle`` uses. An image
        that is already single-channel is passed through unchanged.

        Returns:
            ImageProcessor: An instance representing the grayscale image.
        """
        return self._with_image(self._as_grayscale(self.image))

    def blur(self, ksize: tuple[int, int] = (5,5)) -> ImageProcessor:
        """
        Apply Gaussian blur to the image.

        Args:
            ksize (Tuple[int, int], optional): The kernel size for blurring. Defaults to (5, 5).

        Returns:
            ImageProcessor: An instance representing the blurred image.
        """
        return self._with_image(cv2.GaussianBlur(self.image, ksize, 0))

    def remove_noise(self) -> ImageProcessor:
        """
        Remove noise from the image using median blur (aperture size 5).

        Returns:
            ImageProcessor: An instance representing the noise-reduced image.
        """
        return self._with_image(cv2.medianBlur(self.image, 5))

    def _adaptive_params(self, defaults: Tuple[int, int]) -> Tuple[int, int]:
        """
        Return the (blocksize, C) pair to use for adaptive thresholding.

        Only ``None`` counts as "not set", so an explicit ``c_value`` of 0
        (a valid constant for OpenCV) is honoured instead of being replaced
        by the default.
        """
        default_blocksize, default_c = defaults
        blocksize = self.blocksize if self.blocksize is not None else default_blocksize
        c_value = self.c_value if self.c_value is not None else default_c
        return blocksize, c_value

    def thresholding(self, thresh_mode: Threshmode) -> ImageProcessor:
        """
        Perform thresholding on the image.

        Args:
            thresh_mode (Threshmode): The thresholding mode to use (OTSU, ADAPTIVE_MEAN, or ADAPTIVE_GAUSSIAN).

        Returns:
            ImageProcessor: An instance representing the thresholded image.

        Raises:
            ValueError: If thresh_mode is not one of the supported modes.
        """
        if thresh_mode == Threshmode.OTSU:
            image = cv2.threshold(self.image, 0, 255,
                                  cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
        elif thresh_mode == Threshmode.ADAPTIVE_GAUSSIAN:
            blocksize, c_value = self._adaptive_params(self._GAUSSIAN_DEFAULTS)
            image = cv2.adaptiveThreshold(self.image, 255,
                                          cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                          cv2.THRESH_BINARY,
                                          blocksize, c_value)
        elif thresh_mode == Threshmode.ADAPTIVE_MEAN:
            blocksize, c_value = self._adaptive_params(self._MEAN_DEFAULTS)
            image = cv2.adaptiveThreshold(self.image, 255,
                                          cv2.ADAPTIVE_THRESH_MEAN_C,
                                          cv2.THRESH_BINARY,
                                          blocksize, c_value)
        else:
            # Without this branch an unknown mode would only surface as an
            # UnboundLocalError on `image` further down.
            raise ValueError(f"Unsupported thresholding mode: {thresh_mode!r}")
        return self._with_image(image)

    def dilate(self) -> ImageProcessor:
        """
        Dilate the image using a 5x5 kernel.

        Returns:
            ImageProcessor: An instance representing the dilated image.
        """
        kernel = np.ones((5,5), np.uint8)
        return self._with_image(cv2.dilate(self.image, kernel, iterations=1))

    def erode(self) -> ImageProcessor:
        """
        Erode the image using a 5x5 kernel.

        Returns:
            ImageProcessor: An instance representing the eroded image.
        """
        kernel = np.ones((5,5), np.uint8)
        return self._with_image(cv2.erode(self.image, kernel, iterations=1))

    @staticmethod
    def _rotate(
        image: np.ndarray, angle: float | np.floating, background: Union[int, Tuple[int, int, int]]
    ) -> np.ndarray:
        """
        Performs a rotation of an image given an angle.

        The output canvas is enlarged to the bounding box of the rotated
        image, so no part of the image is cut off; uncovered areas are filled
        with the background colour.

        Args:
            image (np.ndarray): Image loaded in with OpenCV.
            angle (float): Angle in degrees with which the picture should be rotated.
            background (Union[int, Tuple[int, int, int]]): Colour value(s) for the background.

        Returns:
            np.ndarray: Rotated image.
        """
        # NumPy shape is (rows, columns), i.e. (height, width).
        old_height, old_width = image.shape[:2]
        angle_radian = math.radians(angle)
        # Bounding box of the rotated rectangle.
        new_width = (abs(np.sin(angle_radian) * old_height)
                     + abs(np.cos(angle_radian) * old_width))
        new_height = (abs(np.sin(angle_radian) * old_width)
                      + abs(np.cos(angle_radian) * old_height))

        # OpenCV points are (x, y), hence the reversed shape.
        image_center = tuple(np.array(image.shape[1::-1]) / 2)
        rot_mat = cv2.getRotationMatrix2D(image_center, angle, 1.0)
        # Shift the result so it is centred on the enlarged canvas.
        rot_mat[0, 2] += (new_width - old_width) / 2
        rot_mat[1, 2] += (new_height - old_height) / 2
        return cv2.warpAffine(image, rot_mat,
                              (int(round(new_width)), int(round(new_height))),
                              borderValue=background)

    def get_skew_angle(self) -> Optional[np.float64]:
        """
        Calculate and return the skew angle of the image.

        The search is limited to the range MIN_SKEW_ANGLE..MAX_SKEW_ANGLE.
        Both colour (BGR) and already-grayscale images are accepted.

        Returns:
            Optional[np.float64]: The skew angle in degrees or None if it couldn't be determined.
        """
        grayscale = self._as_grayscale(self.image)
        return determine_skew(grayscale, max_angle=MAX_SKEW_ANGLE,
                              min_angle=MIN_SKEW_ANGLE)

    def deskew(self, angle: Optional[np.float64]) -> ImageProcessor:
        """
        Rotate the image to deskew it.

        Args:
            angle (Optional[np.float64]): The skew angle to use for deskewing.

        Returns:
            ImageProcessor: An instance representing the deskewed image, or
            this very instance when angle is None.
        """
        if angle is None:
            # Nothing to rotate by: report it and hand back the image as is.
            print(f"Warning: Skew angle for file {self.filename} could not be determined. Skipping deskewing.")
            return self

        # Areas uncovered by the rotation are filled with white.
        return self._with_image(self._rotate(self.image, angle, (255, 255, 255)))

    def preprocessing(self, thresh_mode: Threshmode) -> ImageProcessor:
        """
        Perform a series of preprocessing steps on the image.

        The steps are grayscale conversion, Gaussian blur, thresholding and a
        single deskew rotation, in that order.

        Args:
            thresh_mode (Threshmode): The thresholding mode to use (OTSU, ADAPTIVE_MEAN, or ADAPTIVE_GAUSSIAN).

        Returns:
            ImageProcessor: An instance representing the preprocessed image,
            or this very instance (unprocessed) when no skew angle could be
            determined.
        """
        # Skew angle has to be calculated before processing
        angle = self.get_skew_angle()

        if angle is None:
            print("Warning: Skew angle could not be determined. Skipping preprocessing.")
            return self

        # Deskew exactly once: rotating a second time by the same angle would
        # tilt the image by the skew angle in the opposite direction.
        return (self.get_grayscale()
                    .blur()
                    .thresholding(thresh_mode=thresh_mode)
                    .deskew(angle))

    #---------------------Read QR-Code---------------------#

    def read_qr_code(self) -> Optional[str]:
        """
        Tries to identify if a picture has a QR-code and then reads and returns it.

        Errors raised by the detector are reported on stdout and turned into
        a None result.

        Returns:
            Optional[str]: Decoded QR-code text as a str or None if there is no QR-code found.
        """
        try:
            detect = cv2.QRCodeDetector()
            value = detect.detectAndDecode(self.image)[0]
            return value if value else None
        except Exception as e:
            print(f"An error occurred while detecting and decoding QR code: {e}")
            return None

    def save_image(self, dir_path: str | Path, appendix: Optional[str] = None) -> None:
        """
        Save the image to a specified directory with an optional appendix.

        Saving is best effort: failures are reported on stdout and never
        raised to the caller.

        Args:
            dir_path (str | Path): The directory path where the image will be saved.
            appendix (str, optional): An optional string to append to the image filename. Defaults to None.
        """
        try:
            if appendix:
                filename = utils.generate_filename(self.filename, appendix, extension="jpg")
            else:
                filename = self.filename
            filename_processed = os.path.join(dir_path, filename)
            # cv2.imwrite signals most failures (e.g. a missing directory) by
            # returning False instead of raising, so check the result.
            if not cv2.imwrite(filename_processed, self.image):
                print(f"An error occurred while saving the image: could not write {filename_processed}")
        except Exception as e:
            print(f"An error occurred while saving the image: {e}")


    

[docs]
class Threshmode(Enum):
    """
    Different possibilities for thresholding.

    Members:
        OTSU (1): Otsu thresholding.
        ADAPTIVE_MEAN (2): adaptive thresholding with the mean method.
        ADAPTIVE_GAUSSIAN (3): adaptive thresholding with the Gaussian method.
    """
    OTSU = 1
    ADAPTIVE_MEAN = 2
    ADAPTIVE_GAUSSIAN = 3

    @classmethod
    def eval(cls, threshmode: int) -> Optional[Threshmode]:
        """
        Translate a numeric mode into the matching member.

        Args:
            threshmode (int): Numeric value of the wanted mode (1, 2 or 3).
                An existing member is accepted as well and returned as is.

        Returns:
            Optional[Threshmode]: The matching member, or None if the value
            does not correspond to any member.
        """
        try:
            # Enum value lookup keeps this in sync with the member list.
            return cls(threshmode)
        except ValueError:
            return None


    
    
#---------------------OCR Tesseract---------------------#
        


[docs]
class Tesseract:
    """
    Runs pytesseract OCR on the image held by an ``ImageProcessor``.
    """
    def __init__(self, languages=LANGUAGES, config=CONFIG, image: Optional[ImageProcessor] = None):
        """
        Initialize the Tesseract OCR processor.

        Args:
            languages (str, optional): OCR available languages. Defaults to LANGUAGES.
            config (str, optional): Additional custom configuration flags not available via the pytesseract function. Defaults to CONFIG.
            image (ImageProcessor, optional): An instance of the Image class representing the image to process. Defaults to None.
        """
        self.config = config
        self.languages = languages
        self.image = image

    @property
    def image(self) -> Optional[ImageProcessor]:
        """The image to run OCR on, or None while none has been assigned."""
        return self._image

    @image.setter
    def image(self, img: Optional[ImageProcessor]) -> None:
        self._image = img

    @staticmethod
    def _process_string(result_raw: str) -> str:
        """
        Processes the OCR output by replacing every '\\n' with a space.

        Args:
            result_raw (str): Raw string from pytesseract output.

        Returns:
            str: Processed string.
        """
        return result_raw.replace('\n', ' ')

    def image_to_string(self) -> dict[str, str]:
        """
        Apply OCR with the configured languages and flags to the current image.

        Returns:
            dict[str, str]: A dictionary containing the image ID (filename) and the OCR-processed text.

        Raises:
            ValueError: If no image has been assigned to this instance.
        """
        if self.image is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError("No image has been set for OCR; assign "
                             "an ImageProcessor to `image` first.")
        transcript = py.image_to_string(self.image.image, lang=self.languages,
                                        config=self.config)
        return {"ID": self.image.filename,
                "text": self._process_string(transcript)}