
feat!: [WFE] implement word feature detection using Whisper

BREAKING CHANGE: passing no words to WFE is no longer an error; it now raises a notice

WordFeatureExtractor is not fast; even the import is slow. However, it processes
files and returns Features corresponding to matched words.
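
A minimal usage sketch of the flow this commit enables. The setup()/run()/teardown() method names, the Source constructor arguments, the clip path and the target words are illustrative assumptions inferred from the surrounding FeatureExtractor code in this diff, not confirmed by the commit:

    from pipeline.feature_extractors import WordFeatureExtractor
    from pipeline.utils import Source

    extractor = WordFeatureExtractor()
    extractor.input_files = [Source(path="match_footage.wav")]  # assumed constructor

    # Passing an empty list is no longer an error: it only logs a warning,
    # and extraction exits early without producing features.
    extractor.setup(words=["goal", "penalty", "offside"])

    extractor.run()  # slow: loads a Whisper model and transcribes every input file
    for feature in extractor.features:
        # each Feature spans the Whisper segment in which a target word appeared
        print(feature.interval.start, feature.interval.end, feature.score)

    extractor.teardown()
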
main
Rob Hallam, 2 months ago
commit 0b066c7dae
1 changed file with 42 additions and 6 deletions
pipeline/feature_extractors.py (+42, -6)

@@ -11,6 +11,9 @@ from pipeline.utils import SourceMedia, Source, Feature, Interval
 import soundfile
 import pyloudnorm
 
+# for word detection
+from faster_whisper import WhisperModel, BatchedInferencePipeline
+
 logger = logging.getLogger(__name__)
 
 class FeatureExtractor(ABC):
@@ -422,10 +425,10 @@ class WordFeatureExtractor(FeatureExtractor):
         if not self.input_files:
             raise ValueError("No input files provided")
 
-        # Validate words
-        if not words:
-            raise ValueError("No words provided")
-        words = words
+        # Validate words - raise a notice if none provided
+        if len(words) == 0:
+            logger.warning("No words provided for detection")
+        self.words = words
         # TODO: consider stripping punctuation since Whisper produces words+punctuation
         # and we might want to strip the punctuation there too

@@ -433,7 +436,40 @@ class WordFeatureExtractor(FeatureExtractor):
"""Extract features corresponding to supplied target words (defined in setup) for each input file

Use Whisper to detect words in the audio, then match these to target words and create features

Note: if no words are supplied we can exit early
"""
if len(self.words) == 0: return

if self.DEFAULT_PIPELINE_TYPE == "batched":
batched = True
else:
batched = False

# no early exit
# TODO: consider maybe loglevel notice of estimated time! consider also: max execution time config?
# TODO: config options for model size, device, compute type
model = self._whispermodel() # NB uses defaults, TODO: add config options

# NOTE: batched not available on pypi yet at time of writing
if batched:
batched_model = self._batched_inference_pipeline(model)

for file in self.input_files:
# transcribe the audio file
if batched:
segments, _ = self._transcribe(batched_model, file.path, batch_size=self.DEFAULT_BATCH_SIZE)
else:
segments, _ = self._transcribe(model, file.path, beam_size=self.DEFAULT_BEAM_SIZE)

# process the segments
# segment has: start, end, text
for segment in segments:
# check if any of the words are in the segment
for word in segment.text.split():
if word in self.words:
self.features.append(Feature(interval=Interval(start=segment.start, end=segment.end),
source=file, feature_extractor="word",
score=1.0))


def teardown(self):
"""Clean up after Whisper"""
