Переглянути джерело

feat!: [WFE] implement word feature detection using Whisper

BREAKING CHANGE: passing no words to WFE is no longer an error; a warning is logged instead

WordFeatureExtractor is not fast — even the import is slow. However, it processes
files and returns Features corresponding to matched words.
main
Rob Hallam 1 місяць тому
джерело
коміт
0b066c7dae
1 змінених файлів з 42 додано та 6 видалено
  1. +42
    -6
      pipeline/feature_extractors.py

+ 42
- 6
pipeline/feature_extractors.py Переглянути файл

@@ -11,6 +11,9 @@ from pipeline.utils import SourceMedia, Source, Feature, Interval
import soundfile
import pyloudnorm

# for word detection
from faster_whisper import WhisperModel, BatchedInferencePipeline

logger = logging.getLogger(__name__)

class FeatureExtractor(ABC):
@@ -422,10 +425,10 @@ class WordFeatureExtractor(FeatureExtractor):
if not self.input_files:
raise ValueError("No input files provided")

# Validate words
if not words:
raise ValueError("No words provided")
words = words
# Validate words - raise a notice if none provided
if len(words) == 0:
logger.warning("No words provided for detection")
self.words = words
# TODO: consider stripping punctuation since Whisper produces words+punctuation
# and we might want to strip the punctuation there too

@@ -433,7 +436,40 @@ class WordFeatureExtractor(FeatureExtractor):
"""Extract features corresponding to supplied target words (defined in setup) for each input file

Use Whisper to detect words in the audio, then match these to target words and create features

Note: if no words are supplied we can exit early
"""
if len(self.words) == 0: return

if self.DEFAULT_PIPELINE_TYPE == "batched":
batched = True
else:
batched = False

# no early exit
# TODO: consider maybe loglevel notice of estimated time! consider also: max execution time config?
# TODO: config options for model size, device, compute type
model = self._whispermodel() # NB uses defaults, TODO: add config options

# NOTE: batched not available on pypi yet at time of writing
if batched:
batched_model = self._batched_inference_pipeline(model)

for file in self.input_files:
# transcribe the audio file
if batched:
segments, _ = self._transcribe(batched_model, file.path, batch_size=self.DEFAULT_BATCH_SIZE)
else:
segments, _ = self._transcribe(model, file.path, beam_size=self.DEFAULT_BEAM_SIZE)

# process the segments
# segment has: start, end, text
for segment in segments:
# check if any of the words are in the segment
for word in segment.text.split():
if word in self.words:
self.features.append(Feature(interval=Interval(start=segment.start, end=segment.end),
source=file, feature_extractor="word",
score=1.0))


def teardown(self):
    """Clean up after Whisper"""
    # NOTE(review): the body appears empty in this diff view — presumably this
    # should release the Whisper model / batched inference pipeline to free
    # memory; confirm against the full file.

Завантаження…
Відмінити
Зберегти