Browse Source

feat!: [WFE] implement word feature detection using Whisper

BREAKING CHANGE: supplying no words to WFE is no longer an error; it now logs a warning instead

WordFeatureExtractor is not fast — even the import is slow. However, it processes
files and returns Features corresponding to matched words.
main
Rob Hallam 2 months ago
parent
commit
0b066c7dae
1 changed files with 42 additions and 6 deletions
  1. +42
    -6
      pipeline/feature_extractors.py

+ 42
- 6
pipeline/feature_extractors.py View File

@@ -11,6 +11,9 @@ from pipeline.utils import SourceMedia, Source, Feature, Interval
import soundfile import soundfile
import pyloudnorm import pyloudnorm


# for word detection
from faster_whisper import WhisperModel, BatchedInferencePipeline

logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)


class FeatureExtractor(ABC): class FeatureExtractor(ABC):
@@ -422,10 +425,10 @@ class WordFeatureExtractor(FeatureExtractor):
if not self.input_files: if not self.input_files:
raise ValueError("No input files provided") raise ValueError("No input files provided")


# Validate words
if not words:
raise ValueError("No words provided")
words = words
# Validate words - log a warning if none provided
if len(words) == 0:
logger.warning("No words provided for detection")
self.words = words
# TODO: consider stripping punctuation since Whisper produces words+punctuation # TODO: consider stripping punctuation since Whisper produces words+punctuation
# and we might want to strip the punctuation there too # and we might want to strip the punctuation there too


@@ -433,7 +436,40 @@ class WordFeatureExtractor(FeatureExtractor):
"""Extract features corresponding to supplied target words (defined in setup) for each input file """Extract features corresponding to supplied target words (defined in setup) for each input file


Use Whisper to detect words in the audio, then match these to target words and create features Use Whisper to detect words in the audio, then match these to target words and create features

Note: if no words are supplied we can exit early
""" """
if len(self.words) == 0: return

if self.DEFAULT_PIPELINE_TYPE == "batched":
batched = True
else:
batched = False

# past the early-exit guard: words were supplied, so load the (slow) model
# TODO: consider maybe loglevel notice of estimated time! consider also: max execution time config?
# TODO: config options for model size, device, compute type
model = self._whispermodel() # NB uses defaults, TODO: add config options

# NOTE: batched not available on pypi yet at time of writing
if batched:
batched_model = self._batched_inference_pipeline(model)

for file in self.input_files:
# transcribe the audio file
if batched:
segments, _ = self._transcribe(batched_model, file.path, batch_size=self.DEFAULT_BATCH_SIZE)
else:
segments, _ = self._transcribe(model, file.path, beam_size=self.DEFAULT_BEAM_SIZE)

# process the segments
# segment has: start, end, text
for segment in segments:
# check if any of the words are in the segment
for word in segment.text.split():
if word in self.words:
self.features.append(Feature(interval=Interval(start=segment.start, end=segment.end),
source=file, feature_extractor="word",
score=1.0))



def teardown(self):
"""Clean up after Whisper"""

Loading…
Cancel
Save