Source code for label_postprocessing.vocabulary
# Import standard-library and third-party libraries
import json
from nltk import word_tokenize
import pandas as pd
import string
from typing import Dict
[docs]
def contains_only_letters(token: str) -> bool:
    """
    Tell whether every character of a token is alphabetic.

    Args:
        token (str): Token from word_tokenize.

    Returns:
        bool: True if the token is non-empty and made up solely of
            letters, False otherwise (including for the empty string).
    """
    # str.isalpha() is False for "" and for any digit, space or symbol.
    letters_only = token.isalpha()
    return letters_only
[docs]
def is_punctuation(token: str) -> bool:
    """
    Check if a token is made up entirely of punctuation characters.

    The test is done per character against ``string.punctuation`` (ASCII
    punctuation). A plain ``token in string.punctuation`` would be a
    substring test: it reports the empty string as punctuation and accepts
    multi-character tokens only when they happen to appear contiguously in
    that constant (so "()" would match but "..." would not).

    Args:
        token (str): The token to check for punctuation.

    Returns:
        bool: True if the token is non-empty and every character is an
            ASCII punctuation mark, False otherwise.
    """
    # bool(token) rules out "", for which all() over no characters is True.
    return bool(token) and all(char in string.punctuation for char in token)
[docs]
def extract_vocabulary(ocr_output: str, output_path: str = "vocabulary.csv") -> None:
    """
    Extract unique words from the transcripts and save their counts to a CSV file.

    A token is counted when, after lowercasing, it consists only of letters
    and is at least 3 characters long. The resulting table has the columns
    'Type' and 'Count' and is sorted by 'Count' in descending order.

    Any error raised along the way is caught and reported on standard output;
    it is not re-raised, and no CSV is written in that case.

    Args:
        ocr_output (str): Path to the OCR output file. It is parsed as JSON
            and iterated as a collection of label objects whose "text" entry
            holds the transcript.
        output_path (str): Path of the CSV file to write. Defaults to
            "vocabulary.csv" in the current working directory.
    """
    try:
        vocabulary: Dict[str, int] = {}
        # Read as UTF-8 explicitly so non-ASCII transcripts do not depend on
        # the platform's default encoding.
        with open(ocr_output, 'r', encoding='utf-8') as f:
            labels = json.load(f)
        for label in labels:
            # A missing or null "text" entry is treated as an empty transcript
            # instead of making the tokenizer fail and aborting the whole run.
            tokens = word_tokenize(label.get("text") or "")
            for token in tokens:
                token = token.lower()
                if not is_punctuation(token) and len(token) >= 3 and contains_only_letters(token):
                    vocabulary[token] = vocabulary.get(token, 0) + 1
        df = pd.DataFrame(list(vocabulary.items()), columns=['Type', 'Count']).sort_values(by=['Count'], ascending=False)
        df.to_csv(output_path, index=False)
        print(f"Vocabulary saved successfully to {output_path}")
    except Exception as e:
        # Best-effort boundary: report the failure without propagating it.
        print(f"Error extracting vocabulary: {e}")