Bases: TextGenerationDataset
ML Commons People's Speech audio dataset
Unfortunately, due to the specialized nature of audio model preprocessing, some model-specific code must be defined here. This dataset has been tested with the `WhisperForConditionalGeneration` and `Qwen2AudioForConditionalGeneration` model classes.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `dataset_args` | `DataArgs` | configuration settings for dataset loading | required |
| `split` | `str` | split from dataset to load, for instance `test` or `train[:5%]` | required |
| `processor` | `Processor` | processor or tokenizer to use on dataset | required |
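For context, a hedged sketch of how this registered dataset is typically consumed: passing `dataset="peoples_speech"` to llmcompressor's `oneshot` entrypoint resolves to this class through the `TextGenerationDataset` registry. The model choice, split slice, and recipe file below are illustrative assumptions, not part of this module.

```python
# Minimal usage sketch (assumed workflow): calibrate a Whisper model on this
# dataset via the registered name. Argument values are illustrative; consult
# the llmcompressor docs for the exact oneshot signature.
from transformers import WhisperForConditionalGeneration, WhisperProcessor

from llmcompressor import oneshot

model_id = "openai/whisper-tiny"  # hypothetical model choice
model = WhisperForConditionalGeneration.from_pretrained(model_id)
processor = WhisperProcessor.from_pretrained(model_id)

oneshot(
    model=model,
    processor=processor,
    dataset="peoples_speech",  # resolved via @TextGenerationDataset.register
    splits={"calibration": "test[:64]"},  # assumed calibration slice
    recipe="recipe.yaml",  # assumed quantization recipe file
)
```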
Source code in llmcompressor/transformers/finetune/data/peoples_speech.py
```python
@TextGenerationDataset.register(name="peoples_speech")
class PeoplesSpeech(TextGenerationDataset):
    """
    ML Commons People's Speech audio dataset

    Unfortunately, due to the specialized nature of audio model preprocessing, some
    model-specific code must be defined here. This dataset has been tested with the
    WhisperForConditionalGeneration and Qwen2AudioForConditionalGeneration model
    classes

    :param dataset_args: configuration settings for dataset loading
    :param split: split from dataset to load, for instance `test` or `train[:5%]`
    :param processor: processor or tokenizer to use on dataset
    """

    def __init__(self, dataset_args: "DataArgs", split: str, processor: Processor):
        dataset_args = deepcopy(dataset_args)
        dataset_args.dataset = "MLCommons/peoples_speech"
        dataset_args.dataset_config_name = "test"

        if not dataset_args.overwrite_cache:
            logger.warning(
                "Because audio processors are more complex, dataset mapping "
                "functions vary with model architecture and their results cannot "
                "be cached. Setting overwrite_cache=True"
            )
            dataset_args.overwrite_cache = True

        self.processor_type = processor.__class__.__name__

        super().__init__(dataset_args=dataset_args, split=split, processor=processor)

    def dataset_template(self, example):
        audio = example["audio"]["array"]
        sampling_rate = example["audio"]["sampling_rate"]

        if self.processor_type == "Qwen2AudioProcessor":
            messages = [
                {"role": "user", "content": [{"audio": None}]},
                {"role": "user", "content": [{"text": "What did the person say?"}]},
            ]
            text = self.processor.apply_chat_template(messages)
            return {"audios": [audio], "sampling_rate": sampling_rate, "text": text}

        else:
            # chat template decoder ids are appended later by self.processor.__call__
            text = " " + example["text"].capitalize()
            return {"audio": audio, "sampling_rate": sampling_rate, "text": text}

    def filter_tokenizer_args(self, dataset: DatasetType) -> DatasetType:
        if self.processor_type == "WhisperProcessor":
            tokenizer_args = ["audio", "sampling_rate", "text"]
            column_names = get_columns(dataset)

            return dataset.remove_columns(
                list(set(column_names) - set(tokenizer_args))
            )

        else:
            return super().filter_tokenizer_args(dataset)

    def tokenize(self, data: LazyRow) -> Dict[str, Any]:
        if self.processor_type == "WhisperProcessor":
            inputs = self.processor(
                audio=data["audio"],
                sampling_rate=data["sampling_rate"],
                text=data["text"],
                add_special_tokens=True,
                return_tensors="pt",
            )

            # TODO: inputs["input_features"] is a float dtype, which may conflict
            # with the dtype of the model. Add logic to the data pipeline to move
            # inputs to the matching model device and dtype
            inputs["decoder_input_ids"] = inputs["labels"]
            del inputs["labels"]

            return inputs

        else:
            return super().tokenize(data)
```
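To make the Whisper branch concrete, here is a hedged, standalone sketch of the same preprocessing performed outside this class, assuming network access to the model and dataset; the model id and slice sizes are illustrative.

```python
# Illustrative sketch (assumed model/dataset availability): reproduce the
# Whisper-branch preprocessing that dataset_template and tokenize perform.
from datasets import load_dataset
from transformers import WhisperProcessor

processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")

# The class pins dataset_config_name to "test"; slice size here is arbitrary
ds = load_dataset("MLCommons/peoples_speech", "test", split="test[:1]")
example = ds[0]

# dataset_template equivalent for the Whisper branch: leading space, capitalized
text = " " + example["text"].capitalize()
inputs = processor(
    audio=example["audio"]["array"],
    sampling_rate=example["audio"]["sampling_rate"],
    text=text,
    add_special_tokens=True,
    return_tensors="pt",
)

# tokenize() renames the tokenized transcript so it drives the decoder during
# a calibration forward pass instead of acting as a training loss target
inputs["decoder_input_ids"] = inputs.pop("labels")

print(inputs["input_features"].shape)       # e.g. (1, 80, 3000) log-mel features
print(inputs["decoder_input_ids"][0, :8])   # leading special/prompt token ids
```

The `labels` to `decoder_input_ids` rename reflects that calibration only needs a forward pass through the decoder, not a loss computation, so the transcript token ids are supplied as decoder inputs directly.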