@@ -11,6 +11,9 @@ from pipeline.utils import SourceMedia, Source, Feature, Interval
 import soundfile
 import pyloudnorm
+
+# for word detection
+from faster_whisper import WhisperModel, BatchedInferencePipeline

 logger = logging.getLogger(__name__)

 class FeatureExtractor(ABC):
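For reviewers unfamiliar with faster-whisper: a minimal sketch of how the two imported classes are typically used. The model size, device, and compute type below are placeholder choices, not values set anywhere in this diff.

```python
from faster_whisper import WhisperModel, BatchedInferencePipeline

# Placeholder settings - this diff leaves model configuration to defaults.
model = WhisperModel("base", device="cpu", compute_type="int8")

# Sequential transcription: returns a lazy generator of segments plus metadata.
segments, info = model.transcribe("input.wav", beam_size=5)

# Batched transcription wraps the same model for higher throughput.
batched_model = BatchedInferencePipeline(model=model)
segments, info = batched_model.transcribe("input.wav", batch_size=16)

for segment in segments:
    print(f"[{segment.start:.2f} - {segment.end:.2f}] {segment.text}")
```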
@@ -422,10 +425,10 @@ class WordFeatureExtractor(FeatureExtractor):
         if not self.input_files:
             raise ValueError("No input files provided")

-        # Validate words
-        if not words:
-            raise ValueError("No words provided")
+        # Validate words - log a warning if none are provided
+        if len(words) == 0:
+            logger.warning("No words provided for detection")
         self.words = words

         # TODO: consider stripping punctuation since Whisper produces words+punctuation
         # and we might want to strip the punctuation there too
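On the punctuation TODO above: Whisper emits tokens like "Hello," with punctuation attached, so exact string matching against the target words will silently miss them. A minimal sketch of the kind of normalisation that could be applied on both sides; the `normalise` helper is hypothetical and not part of this diff.

```python
import string

def normalise(word: str) -> str:
    # Hypothetical helper: strip edge punctuation and lowercase so that a
    # Whisper token like "Hello," matches a target word "hello".
    return word.strip(string.punctuation).lower()

targets = {normalise(w) for w in ["hello", "world"]}
assert normalise("Hello,") in targets
assert normalise("world.") in targets
```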
@@ -433,7 +436,40 @@ class WordFeatureExtractor(FeatureExtractor):
         """Extract features corresponding to supplied target words (defined in setup) for each input file

         Use Whisper to detect words in the audio, then match these to target words and create features
+
+        Note: if no words are supplied we can exit early
         """
-        # no early exit
+        if len(self.words) == 0:
+            return
+
+        if self.DEFAULT_PIPELINE_TYPE == "batched":
+            batched = True
+        else:
+            batched = False
+
+        # TODO: consider a log-level notice of estimated time; also consider a max-execution-time config option
+        # TODO: config options for model size, device, compute type
+        model = self._whispermodel()  # NB uses defaults, TODO: add config options
+
+        # NOTE: batched inference not available on PyPI yet at the time of writing
+        if batched:
+            batched_model = self._batched_inference_pipeline(model)
+
+        for file in self.input_files:
+            # transcribe the audio file
+            if batched:
+                segments, _ = self._transcribe(batched_model, file.path, batch_size=self.DEFAULT_BATCH_SIZE)
+            else:
+                segments, _ = self._transcribe(model, file.path, beam_size=self.DEFAULT_BEAM_SIZE)
+
+            # process the segments; each segment has start, end, text
+            for segment in segments:
+                # check whether any of the target words appear in this segment
+                for word in segment.text.split():
+                    if word in self.words:
+                        self.features.append(Feature(interval=Interval(start=segment.start, end=segment.end),
+                                                     source=file, feature_extractor="word",
+                                                     score=1.0))

     def teardown(self):
         """Clean up after Whisper"""