# Source code for label_processing.utils

# Import standard-library and third-party libraries
import os
import re
import json
import pandas as pd
from typing import Optional
import numpy as np
import cv2

# Constant
PATTERN = r"(/u/|http|coll|mfn|URI)"

#---------------------Check dir TIF---------------------#

def check_dir(directory: str) -> None:
    """
    Validate that ``directory`` exists and holds at least one ``.tif`` file.

    Only the entry names returned by ``os.listdir`` are inspected (no
    recursion into sub-directories), and the suffix comparison is
    case-sensitive.

    Args:
        directory (str): path to the directory to validate

    Raises:
        FileNotFoundError: if ``directory`` is not an existing directory, or
            if none of its entries has a name ending in ``.tif``
    """
    if not os.path.isdir(directory):
        raise FileNotFoundError(f"The directory '{directory}' does not exist.")

    for entry in os.listdir(directory):
        if entry.endswith('.tif'):
            # A single matching entry is enough to accept the directory.
            return

    raise FileNotFoundError("The directory given does not contain any tif files. You might have chosen the wrong directory?")
#---------------------New Filename Preprocessed Images---------------------#
def generate_filename(original_path: str, appendix: str, extension: Optional[str] = None) -> str:
    """
    Build a new file or directory name by appending ``appendix`` to the last
    component of ``original_path``.

    For a file path the existing extension is dropped before the appendix is
    added. For a directory path (one that ends with the OS path separator)
    the name of that directory is used as the stem.

    Args:
        original_path (str): original path to a file or directory
        appendix (str): text to append; leading/trailing underscores are
            stripped because a single ``_`` separator is always inserted
        extension (Optional[str]): file extension to add, with or without the
            leading dot; ``None`` or ``""`` adds no extension (directories)

    Returns:
        str: the new name (last path component only, not a full path)
    """
    appendix = appendix.strip("_")

    if original_path.endswith(os.path.sep):
        # With a trailing separator basename() is empty, so the directory
        # name has to be taken from dirname() of the *original* path.
        stem = os.path.basename(os.path.dirname(original_path))
    else:
        # Remove the extension if the file name has one.
        stem, _ = os.path.splitext(os.path.basename(original_path))

    new_filename = f"{stem}_{appendix}"

    if extension:
        # Accept both "jpg" and ".jpg".
        dot = "" if extension.startswith(".") else "."
        new_filename = f"{new_filename}{dot}{extension}"

    return new_filename
#---------------------Save JSON---------------------#
def save_json(data: list[dict], filename: str, path: str) -> None:
    """
    Write ``data`` as an indented, human-readable JSON file.

    The file is encoded as UTF-8 and non-ASCII characters are written
    verbatim rather than as ``\\uXXXX`` escapes. An existing file at the
    target location is overwritten.

    Args:
        data (list[dict]): output of the OCR
        filename (str): name for the json file
        path (str): directory in which the json file is written
    """
    destination = os.path.join(path, filename)
    with open(destination, "w", encoding='utf8') as handle:
        json.dump(
            data,
            handle,
            ensure_ascii=False,
            indent=4,
            separators=(',', ': '),
        )
#---------------------Check and Correct NURIs---------------------#
def check_nuri_format(transcript: str) -> bool:
    """
    Tell whether an OCR transcription contains a NURI-like fragment.

    The text is searched (anywhere, not only at the start) for the
    module-level regular expression ``PATTERN``.

    Args:
        transcript (str): text field from OCR output

    Returns:
        bool: True if ``PATTERN`` matches somewhere in ``transcript``,
        False otherwise
    """
    return re.search(PATTERN, transcript) is not None
# "_u_" followed by the alphanumeric NURI identifier inside a file ID.
# A hex identifier followed by ".jpg" is a special case of this pattern,
# so no separate expression is needed for it.
_NURI_IN_ID = re.compile(r"_u_([A-Za-z0-9]+)")
_NURI_URL_PREFIX = "http://coll.mfn-berlin.de/u/"


def replace_nuri(transcript: dict[str, str]) -> dict[str, str]:
    """
    Correct NURI format in OCR transcription JSON output.

    If the entry's "ID" contains ``_u_<identifier>``, the "text" field is
    overwritten with the canonical URL built from that identifier (the first
    occurrence is used). Entries that lack an "ID" or "text" key, or whose
    "ID" holds no such marker, are left untouched.

    Note that ``transcript`` is modified in place; the same object is
    returned for convenience.

    Args:
        transcript (dict[str, str]): JSON transcript with "ID" and "text"
            fields.

    Returns:
        dict[str, str]: the same transcript, with "text" replaced by the
        NURI URL when an identifier was found in "ID".
    """
    if "ID" not in transcript or "text" not in transcript:
        return transcript

    found = _NURI_IN_ID.search(transcript["ID"])
    if found:
        transcript["text"] = f"{_NURI_URL_PREFIX}{found.group(1)}"
    return transcript
#---------------------Load CSV and JPG Files---------------------#
def load_dataframe(filepath_csv: str) -> pd.DataFrame:
    """
    Read a CSV file into a Pandas DataFrame.

    The file is parsed with ``pd.read_csv`` using its default settings.

    Args:
        filepath_csv (str): path to the CSV file

    Returns:
        pd.DataFrame: the parsed content of the CSV file
    """
    return pd.read_csv(filepath_csv)
def load_jpg(filepath: str) -> np.ndarray:
    """
    Read an image file from disk with OpenCV.

    The path is handed to ``cv2.imread`` with its default flags and the
    result is returned unchanged.

    NOTE(review): OpenCV documents that ``imread`` returns ``None`` instead
    of raising when the file is missing or cannot be decoded; this function
    does not check for that, so callers may need to - confirm.

    Args:
        filepath (str): path to the jpg file

    Returns:
        np.ndarray: OpenCV image object
    """
    return cv2.imread(filepath)
def load_json(file: str) -> dict:
    """
    Load JSON data from a file and deserialize it.

    The file is always decoded as UTF-8, matching how ``save_json`` in this
    module writes its output (UTF-8 with unescaped non-ASCII characters).
    Relying on the platform default encoding would fail or garble such text
    on systems whose locale is not UTF-8.

    Args:
        file (str): path of the file containing JSON data.

    Returns:
        dict: the deserialized JSON data. The value is whatever the top-level
        JSON element decodes to, so a file whose top level is an array yields
        a list.
    """
    with open(file, 'r', encoding='utf8') as f:
        return json.load(f)
def read_vocabulary(file: str) -> dict:
    """
    Load a vocabulary CSV and return it as a key/value mapping.

    The file is parsed with ``pd.read_csv`` using its default settings, and
    each resulting row is passed to ``dict`` as one (key, value) pair; the
    table therefore has to consist of exactly two columns.

    Args:
        file (str): path of the CSV file containing vocabulary data.

    Returns:
        dict: mapping from the first column's entries to the second column's
        entries.
    """
    vocabulary_table = pd.read_csv(file)
    row_pairs = vocabulary_table.values
    return dict(row_pairs)