
feat!: [WFE] implement word feature detection using Whisper

BREAKING CHANGE: passing no words to WFE is no longer an error; it now raises a notice

WordFeatureExtractor is not fast; even the import is slow. However, it processes
files and returns Features corresponding to matched words.
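
A minimal usage sketch of the flow this commit enables. The setup()/run()/teardown() method names, the Source constructor arguments, the clip path and the target words are illustrative assumptions inferred from the surrounding FeatureExtractor code in this diff, not confirmed by the commit:

    from pipeline.feature_extractors import WordFeatureExtractor
    from pipeline.utils import Source

    extractor = WordFeatureExtractor()
    extractor.input_files = [Source(path="match_footage.wav")]  # assumed constructor

    # Passing an empty list is no longer an error: it only logs a warning,
    # and extraction exits early without producing features.
    extractor.setup(words=["goal", "penalty", "offside"])

    extractor.run()  # slow: loads a Whisper model and transcribes every input file
    for feature in extractor.features:
        # each Feature spans the Whisper segment in which a target word appeared
        print(feature.interval.start, feature.interval.end, feature.score)

    extractor.teardown()
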
main
Rob Hallam, 2 months ago
commit 0b066c7dae
1 changed file with 42 additions and 6 deletions
pipeline/feature_extractors.py (+42, -6)

@@ -11,6 +11,9 @@ from pipeline.utils import SourceMedia, Source, Feature, Interval
 import soundfile
 import pyloudnorm
 
+# for word detection
+from faster_whisper import WhisperModel, BatchedInferencePipeline
+
 logger = logging.getLogger(__name__)
 
 class FeatureExtractor(ABC):
@@ -422,10 +425,10 @@ class WordFeatureExtractor(FeatureExtractor):
         if not self.input_files:
             raise ValueError("No input files provided")
 
-        # Validate words
-        if not words:
-            raise ValueError("No words provided")
-        words = words
+        # Validate words - raise a notice if none provided
+        if len(words) == 0:
+            logger.warning("No words provided for detection")
+        self.words = words
         # TODO: consider stripping punctuation since Whisper produces words+punctuation
         # and we might want to strip the punctuation there too

@@ -433,7 +436,40 @@ class WordFeatureExtractor(FeatureExtractor):
"""Extract features corresponding to supplied target words (defined in setup) for each input file

Use Whisper to detect words in the audio, then match these to target words and create features

Note: if no words are supplied we can exit early
"""
if len(self.words) == 0: return

if self.DEFAULT_PIPELINE_TYPE == "batched":
batched = True
else:
batched = False

# no early exit
# TODO: consider maybe loglevel notice of estimated time! consider also: max execution time config?
# TODO: config options for model size, device, compute type
model = self._whispermodel() # NB uses defaults, TODO: add config options

# NOTE: batched not available on pypi yet at time of writing
if batched:
batched_model = self._batched_inference_pipeline(model)

for file in self.input_files:
# transcribe the audio file
if batched:
segments, _ = self._transcribe(batched_model, file.path, batch_size=self.DEFAULT_BATCH_SIZE)
else:
segments, _ = self._transcribe(model, file.path, beam_size=self.DEFAULT_BEAM_SIZE)

# process the segments
# segment has: start, end, text
for segment in segments:
# check if any of the words are in the segment
for word in segment.text.split():
if word in self.words:
self.features.append(Feature(interval=Interval(start=segment.start, end=segment.end),
source=file, feature_extractor="word",
score=1.0))


def teardown(self):
"""Clean up after Whisper"""
