class DisfluencyTable(AirTableS3Integration):
    """AirTable/S3 integration for records carrying a disfluency verdict."""

    def __init__(self, airtable_url: str, filter_formula: str, headers: Dict[str, str]):
        """Builds a `DisfluencyTable` by forwarding all arguments to the parent class.

        Args:
            airtable_url (str): URL endpoint of the AirTable table.
            filter_formula (str): Extra filter formula parameter for the GET URL.
            headers (Dict[str, str]): API headers holding the authorization.
        """
        super().__init__(airtable_url, filter_formula, headers)

    def _apply_annotation_changes_s3(self, record: Dict[str, Any]):
        """Re-evaluates a record's disfluency and mirrors the outcome in S3.

        The freshly computed verdict is stored back into the record under the
        `"Disfluency"` field. The audio file is then either deleted from the raw
        directory, or moved to the verdict's directory with the transcript
        written next to it.

        Args:
            record (Dict[str, Any]): An AirTable record/row.
        """

        def _tokenize(text):
            # hyphens become word breaks; all other punctuation is dropped
            strip_punctuation = str.maketrans("", "", string.punctuation)
            cleaned = text.replace("-", " ").translate(strip_punctuation)
            return cleaned.lower().strip().split()

        fields = record["fields"]
        job_name = fields["Job Name"]
        language = fields["Language"]
        ground_truth = fields["Ground Truth"]
        transcript = fields["Transcript"]
        audio_filename = fields["Audio"][0]["filename"]
        delete = fields.get("Delete?", False)

        # recalculate disfluency; homophones are looked up by the part of the
        # language code preceding the first hyphen
        base_language = language.split("-")[0]
        hypothesis_tokens = _tokenize(transcript)
        reference_tokens = _tokenize(ground_truth)
        homophones = HOMOPHONES.get(base_language)
        disfluency = detect_mispronunciation(
            reference_tokens, hypothesis_tokens, homophones
        )
        fields["Disfluency"] = disfluency

        source_path = f"mispronunciations/raw/{language}"
        save_path = f"mispronunciations/{disfluency.lower()}/{language}"

        # manually marked to delete, or the recomputed verdict is "DELETE"
        if delete or disfluency == "DELETE":
            delete_file(self.bucket, audio_filename, source_path)
            return

        move_file(self.bucket, audio_filename, source_path, save_path)
        # the transcript is written as-is (not the tokenized form)
        write_file(self.bucket, transcript, save_path, f"{job_name}.txt")

    def _finalize_records(self, records: List[Dict[str, Any]]) -> str:
        """Serializes the update payload that finalizes disfluency records.

        Every entry carries the record's id, its current `"Disfluency"` field
        value, and the `"AWS"` column set to `True`.

        Args:
            records (List[Dict[str, Any]]): AirTable records.

        Returns:
            str: JSON-encoded payload of finalized records.
        """
        finalized = []
        for record in records:
            updated_fields = {
                "Disfluency": record["fields"]["Disfluency"],
                "AWS": True,
            }
            finalized.append({"id": record["id"], "fields": updated_fields})
        return json.dumps({"records": finalized})