diff --git a/pipeline/feature_extractors.py b/pipeline/feature_extractors.py
index dad9f15..0aed58e 100644
--- a/pipeline/feature_extractors.py
+++ b/pipeline/feature_extractors.py
@@ -11,6 +11,9 @@
 from pipeline.utils import SourceMedia, Source, Feature, Interval
 import soundfile
 import pyloudnorm
+# for word detection
+from faster_whisper import WhisperModel, BatchedInferencePipeline
+
 logger = logging.getLogger(__name__)
 
 class FeatureExtractor(ABC):
@@ -422,10 +425,10 @@ class WordFeatureExtractor(FeatureExtractor):
         if not self.input_files:
             raise ValueError("No input files provided")
 
-        # Validate words
-        if not words:
-            raise ValueError("No words provided")
-        words = words
+        # Validate words - log a warning if none provided (no longer fatal;
+        # extract_features() exits early instead)
+        if not words:
+            logger.warning("No words provided for detection")
+        self.words = words
 
         # TODO: consider stripping punctuation since Whisper produces words+punctuation
         # and we might want to strip the punctuation there too
@@ -433,7 +436,41 @@ class WordFeatureExtractor(FeatureExtractor):
         """Extract features corresponding to supplied target words (defined in setup) for each input file
 
         Use Whisper to detect words in the audio, then match these to target words and create features
+
+        Note: if no words are supplied we can exit early
         """
+        # Early exit: nothing to search for
+        if not self.words:
+            return
+
+        batched = self.DEFAULT_PIPELINE_TYPE == "batched"
+
+        # TODO: consider maybe loglevel notice of estimated time! consider also: max execution time config?
+        # TODO: config options for model size, device, compute type
+        model = self._whispermodel()  # NB uses defaults, TODO: add config options
+
+        # NOTE: batched not available on pypi yet at time of writing
+        if batched:
+            batched_model = self._batched_inference_pipeline(model)
+
+        # Build the lookup once: set membership is O(1) per word vs O(n) on a list
+        target_words = set(self.words)
+
+        for file in self.input_files:
+            # transcribe the audio file
+            if batched:
+                segments, _ = self._transcribe(batched_model, file.path, batch_size=self.DEFAULT_BATCH_SIZE)
+            else:
+                segments, _ = self._transcribe(model, file.path, beam_size=self.DEFAULT_BEAM_SIZE)
+
+            # process the segments
+            # segment has: start, end, text
+            for segment in segments:
+                # check if any of the words are in the segment
+                for word in segment.text.split():
+                    if word in target_words:
+                        self.features.append(Feature(interval=Interval(start=segment.start, end=segment.end),
+                                                     source=file, feature_extractor="word",
+                                                     score=1.0))
+
-    def teardown(self):
-        """Clean up after Whisper"""