Source code for label_processing.ocr_vision

# Import standard-library and third-party libraries
from __future__ import annotations
import io
import os
import warnings
from google.cloud import vision

# Import the utilities module from the 'label_processing' package
import label_processing.utils

# Suppress warning messages during execution.
# NOTE: no category or module is given, so this installs an 'ignore' filter
# for every warning in the whole process as soon as this module is imported,
# not only for warnings raised by the code in this file.
warnings.filterwarnings('ignore')


[docs]
class VisionApi:
    """
    Class for interacting with the Google Cloud Vision API for OCR tasks on images.

    An instance holds the raw bytes of one image together with the client used
    to send it to the API. Use :meth:`read_image` to build an instance from a
    file on disk and :meth:`vision_ocr` to run the text detection.
    """

    def __init__(self, path: str, image: bytes, credentials: str, encoding: str) -> None:
        """
        Initialize the VisionApi instance.

        Args:
            path (str): Path to the image file.
            image (bytes): Image content in bytes.
            credentials (str): Path to the credentials JSON file.
            encoding (str): Encoding for the result ('ascii' or 'utf8').
        """
        self.image = image
        self.path = path
        self.encoding = encoding
        self.credentials = credentials
        self.client = self._initialize_client(credentials)

    @staticmethod
    def _initialize_client(credentials: str) -> "vision.ImageAnnotatorClient":
        """
        Initialize the Google Vision API client.

        Args:
            credentials (str): Path to the credentials JSON file.

        Returns:
            vision.ImageAnnotatorClient: Initialized Google Vision API client.
        """
        # NOTE: this assignment changes the environment of the whole process,
        # so the most recently created instance decides which credentials
        # file the variable points at.
        os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credentials
        return vision.ImageAnnotatorClient()

    @classmethod
    def read_image(cls, path: str, credentials: str, encoding: str = 'utf8') -> "VisionApi":
        """
        Read an image file and return an instance of the VisionApi class.

        This is an alternate constructor: it is bound to the class it is
        called on, so calling it on a subclass returns an instance of that
        subclass.

        Args:
            path (str): Path to the image file.
            credentials (str): Path to the credentials JSON file.
            encoding (str, optional): Encoding for the result ('ascii' or 'utf8'). Defaults to 'utf8'.

        Returns:
            VisionApi: Instance of the VisionApi class.
        """
        with open(path, 'rb') as image_file:
            image = image_file.read()
        return cls(path, image, credentials, encoding)

    def process_string(self, result_raw: str) -> str:
        """
        Process the Google Vision OCR output, replacing newlines with spaces and encoding as specified.

        Args:
            result_raw (str): Raw output string directly from Google Vision.

        Returns:
            str: Processed string. When the instance encoding is 'ascii', every
            non-ASCII character is dropped; for any other encoding value the
            text is only flattened to a single line.
        """
        processed = result_raw.replace('\n', ' ')
        if self.encoding == "ascii":
            # Round-trip through ASCII with errors='ignore' to strip
            # characters that cannot be represented.
            processed = processed.encode("ascii", "ignore").decode()
        return processed

    def vision_ocr(self) -> dict[str, object]:
        """
        Perform the actual API call, handle errors, and return the processed transcription.

        Raises:
            Exception: If the API response carries an error message.

        Returns:
            dict[str, object]: Dictionary with the keys 'ID' (base name of the
            image path), 'text' (the processed transcript, or a single space
            when nothing was detected) and 'bounding_boxes' (one single-key
            dict per annotation, mapping its description to a list of
            ``{"word": "(x,y)"}`` vertex entries). When
            ``label_processing.utils.check_text`` is truthy for the text, the
            entry is passed through ``label_processing.utils.replace_nuri``
            and that function's return value is returned instead.
        """
        vision_image = vision.Image(content=self.image)
        response = self.client.text_detection(image=vision_image)

        # Fail fast: there is no point in processing annotations of a
        # response that reports an error.
        if response.error.message:
            raise Exception(
                f'{response.error.message}\nFor more info on error messages, '
                'check:  https://cloud.google.com/apis/design/errors'
            )

        # Materialize once so emptiness checks and indexing behave like a list.
        annotations = list(response.text_annotations)

        bounding_boxes = [
            {
                annotation.description: [
                    {"word": f"({vertex.x},{vertex.y})"}
                    for vertex in annotation.bounding_poly.vertices
                ]
            }
            for annotation in annotations
        ]

        if annotations:
            # Only the first annotation is used as the transcript
            # (presumably the full detected text; the remaining entries look
            # like the individual words -- confirm against the API docs).
            transcript = self.process_string(str(annotations[0].description))
        else:
            transcript = " "

        filename = os.path.basename(self.path)
        entry = {'ID': filename, 'text': transcript, 'bounding_boxes': bounding_boxes}
        if label_processing.utils.check_text(entry["text"]):
            entry = label_processing.utils.replace_nuri(entry)
        return entry