Source code for label_postprocessing.vocabulary

# Import standard-library and third-party libraries
import json
from nltk import word_tokenize
import pandas as pd
import string
from typing import Dict

def contains_only_letters(token: str) -> bool:
    """
    Report whether every character of a token is a letter.

    Args:
        token (str): Token to inspect, e.g. one produced by word_tokenize.

    Returns:
        bool: The result of ``token.isalpha()``: True when the token is
            non-empty and made up solely of alphabetic characters,
            False otherwise (including for the empty string).
    """
    only_letters = token.isalpha()
    return only_letters
def is_punctuation(token: str) -> bool:
    """
    Check if a token consists solely of ASCII punctuation characters.

    A plain ``token in string.punctuation`` is a substring test, which
    reports the empty string as punctuation and treats multi-character
    tokens inconsistently (``"()"`` matches, ``"..."`` does not). This
    function instead checks each character individually.

    Args:
        token (str): The token to check for punctuation.

    Returns:
        bool: True if the token is non-empty and every character is in
            ``string.punctuation``, False otherwise. Non-ASCII punctuation
            (e.g. typographic quotes) is not in ``string.punctuation`` and
            therefore yields False.
    """
    # An empty token is not a punctuation mark; all() alone would say True.
    if not token:
        return False
    # Each ``char`` is a single character, so ``in`` is exact membership here.
    return all(char in string.punctuation for char in token)
def extract_vocabulary(ocr_output: str, output_path: str = "vocabulary.csv") -> None:
    """
    Extract unique words from the transcripts that consist only of letters
    and are at least 3 characters long, and save them with their counts
    to a CSV file.

    The input file is parsed as JSON and iterated; each entry's ``"text"``
    value is tokenized with ``word_tokenize``, tokens are lower-cased, and
    qualifying tokens are counted. The result is written with the columns
    ``Type`` and ``Count``, sorted by ``Count`` in descending order.

    Args:
        ocr_output (str): Path to the OCR output file (JSON).
        output_path (str): Path of the CSV file to write. Defaults to
            "vocabulary.csv" in the current working directory.

    Returns:
        None. Any exception raised while reading, tokenizing or writing is
        caught and reported with a printed message rather than propagated.
    """
    try:
        vocabulary: Dict[str, int] = {}
        # Decode explicitly as UTF-8 so the platform's default encoding
        # cannot garble non-ASCII characters in the transcripts.
        with open(ocr_output, 'r', encoding='utf-8') as f:
            labels = json.load(f)
        for label in labels:
            # A missing or null "text" entry is treated as an empty transcript.
            text = label.get("text") or ""
            for token in word_tokenize(text):
                token = token.lower()
                if (
                    not is_punctuation(token)
                    and len(token) >= 3
                    and contains_only_letters(token)
                ):
                    vocabulary[token] = vocabulary.get(token, 0) + 1
        # Materialise the items view: older pandas versions reject dict views.
        df = pd.DataFrame(
            list(vocabulary.items()), columns=['Type', 'Count']
        ).sort_values(by=['Count'], ascending=False)
        df.to_csv(output_path, index=False)
        print(f"Vocabulary saved successfully to {output_path}")
    except Exception as e:
        # Deliberate best-effort: report the failure instead of raising.
        print(f"Error extracting vocabulary: {e}")