Source code for label_postprocessing.vocabulary

# Import standard-library modules
from collections import Counter
import json
import string
from typing import Dict

# Import third-party libraries
from nltk import word_tokenize
import pandas as pd


[docs]
def contains_only_letters(token: str) -> bool:
    """
    Tell whether every character of a token is alphabetic.

    Args:
        token (str): Token from word_tokenize.

    Returns:
        bool: True when the token is non-empty and made up solely of
            letters, False otherwise.
    """
    # str.isalpha() is False for the empty string and for any token that
    # contains a digit, whitespace or punctuation character.
    only_letters = token.isalpha()
    return only_letters



[docs]
def is_punctuation(token: str) -> bool:
    """
    Check if a token consists solely of punctuation characters.

    A token counts as punctuation when it is non-empty and every one of its
    characters is in ``string.punctuation`` (ASCII punctuation). This covers
    single marks such as ``"!"`` as well as multi-character tokens such as
    ``"..."`` or ``"--"``.

    Args:
        token (str): The token to check for punctuation.

    Returns:
        bool: True if the token is made up only of punctuation characters,
            False otherwise (including for the empty string).
    """
    # `token in string.punctuation` would be a substring test: it reports the
    # empty string as punctuation and accepts a multi-character token only if
    # it happens to be a contiguous slice of string.punctuation. Checking each
    # character avoids both problems.
    return bool(token) and all(char in string.punctuation for char in token)



[docs]
def extract_vocabulary(ocr_output: str) -> None:
    """
    Extract unique words from the transcripts that consist only of letters and are at least 3 characters long.
    Saves the extracted vocabulary to a CSV file.

    The OCR output is read as JSON and iterated; each entry's ``"text"``
    value is tokenized with ``word_tokenize`` and lower-cased. The resulting
    word counts are written to ``vocabulary.csv`` in the current working
    directory with the columns ``Type`` and ``Count``, sorted by descending
    count.

    Any error (unreadable file, invalid JSON, unexpected entry shape, ...) is
    caught and reported on stdout instead of being raised.

    Args:
        ocr_output (str): Path to the OCR output file.
    """
    try:
        vocabulary: Counter = Counter()

        # Decode explicitly as UTF-8 so non-ASCII transcripts are read the
        # same way regardless of the platform's default locale encoding.
        with open(ocr_output, 'r', encoding='utf-8') as f:
            labels = json.load(f)

        for label in labels:
            # A missing or null "text" is treated as an empty transcript so a
            # single incomplete entry does not abort the whole extraction.
            text = label.get("text") or ""
            for token in word_tokenize(text):
                token = token.lower()
                if not is_punctuation(token) and len(token) >= 3 and contains_only_letters(token):
                    vocabulary[token] += 1

        df = pd.DataFrame(vocabulary.items(), columns=['Type', 'Count']).sort_values(by=['Count'], ascending=False)
        df.to_csv("vocabulary.csv", index=False)
        print("Vocabulary saved successfully to vocabulary.csv")
    except Exception as e:
        # Deliberate best-effort boundary: report the failure instead of raising.
        print(f"Error extracting vocabulary: {e}")