Skip to content

Disfluency Table

airtable_apply_annotations.disfluency_table

DisfluencyTable

Bases: AirTableS3Integration

Source code in src/airtable_apply_annotations/disfluency_table.py
class DisfluencyTable(AirTableS3Integration):
    """AirTable/S3 integration for the disfluency annotation table."""

    def __init__(self, airtable_url: str, filter_formula: str, headers: Dict[str, str]):
        """Constructor for the `DisfluencyTable` class.

        Simply forwards all arguments to the parent class constructor.

        Args:
            airtable_url (str): URL endpoint to AirTable table.
            filter_formula (str): Additional GET URL filter formula parameter.
            headers (Dict[str, str]): API Header containing authorization.
        """
        super().__init__(airtable_url, filter_formula, headers)

    def _apply_annotation_changes_s3(self, record: Dict[str, Any]):
        """Recomputes a record's disfluency verdict and mirrors it onto S3.

        The verdict is recalculated from the record's transcript and ground truth,
        stored back into `record["fields"]["Disfluency"]`, and then the audio file
        is either deleted from the raw folder or moved to a verdict-specific folder
        (with the transcript written alongside it).

        Args:
            record (Dict[str, Any]): An AirTable record/row.
        """

        def _tokenize(text):
            # Hyphens become spaces before punctuation is stripped, so that
            # hyphenated words split into separate tokens instead of fusing.
            spaced = text.replace("-", " ")
            unpunctuated = spaced.translate(str.maketrans("", "", string.punctuation))
            return unpunctuated.lower().strip().split()

        row = record["fields"]

        job_name = row["Job Name"]
        language = row["Language"]
        ground_truth = row["Ground Truth"]
        transcript = row["Transcript"]
        audio_filename = row["Audio"][0]["filename"]
        # the "Delete?" column may be absent from the record; treat that as False
        marked_for_deletion = row.get("Delete?", False)

        # recalculate disfluency; homophones are looked up by the part of the
        # language string before the first "-"
        language_code = language.split("-")[0]
        transcript_tokens = _tokenize(transcript)
        ground_truth_tokens = _tokenize(ground_truth)

        if language_code in HOMOPHONES:
            homophones = HOMOPHONES[language_code]
        else:
            homophones = None

        disfluency = detect_mispronunciation(
            ground_truth_tokens, transcript_tokens, homophones
        )
        # stored on the record so `_finalize_records` can report it back
        row["Disfluency"] = disfluency

        source_path = f"mispronunciations/raw/{language}"
        save_path = f"mispronunciations/{disfluency.lower()}/{language}"

        # if manually marked to delete or if no disfluency is detected
        if marked_for_deletion or disfluency == "DELETE":
            delete_file(self.bucket, audio_filename, source_path)
            return

        move_file(self.bucket, audio_filename, source_path, save_path)
        write_file(self.bucket, transcript, save_path, f"{job_name}.txt")

    def _finalize_records(self, records: List[Dict[str, Any]]) -> str:
        """Builds the JSON update payload for processed disfluency records.

        Each record is reduced to its id plus two fields: the "Disfluency" value
        currently stored on the record and "AWS" set to `True`.

        Args:
            records (List[Dict[str, Any]]): AirTable records.

        Returns:
            str: Finalized record payload.
        """
        finalized = []
        for record in records:
            updated_fields = {
                "Disfluency": record["fields"]["Disfluency"],
                "AWS": True,
            }
            finalized.append({"id": record["id"], "fields": updated_fields})

        return json.dumps({"records": finalized})

__init__(airtable_url, filter_formula, headers)

Constructor for the DisfluencyTable class.

Parameters:

Name Type Description Default
airtable_url str

URL endpoint to AirTable table.

required
filter_formula str

Additional GET URL filter formula parameter.

required
headers Dict[str, str]

API Header containing authorization.

required
Source code in src/airtable_apply_annotations/disfluency_table.py
def __init__(self, airtable_url: str, filter_formula: str, headers: Dict[str, str]):
    """Constructor for the `DisfluencyTable` class.

    Forwards every argument unchanged to the parent class constructor.

    Args:
        airtable_url (str): URL endpoint to AirTable table.
        filter_formula (str): Additional GET URL filter formula parameter.
        headers (Dict[str, str]): API Header containing authorization.
    """
    super().__init__(
        airtable_url,
        filter_formula,
        headers,
    )