from abc import ABC
import json
import logging
import os
import random
import subprocess
from ast import literal_eval

from pipeline.utils import SourceMedia, Source, Feature, Interval

# for loudness detection
import soundfile
import pyloudnorm

# for word detection
from faster_whisper import WhisperModel, BatchedInferencePipeline

logger = logging.getLogger(__name__)


class FeatureExtractor(ABC):
    """Feature extractor interface."""
    _CONFIG_DEFAULT_NUM_FEATURES = 5  # default number of features to keep

    def _run_get_output(self, cmd: list, cwd: str = ".") -> str:
        """Run a command and return the output as a string.

        Defined to be mocked out in tests via unittest.mock.patch.
        """
        return subprocess.run(cmd, stdout=subprocess.PIPE, cwd=cwd).stdout.decode("utf-8")
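
    # Illustrative sketch (assumed test pattern, not part of this module):
    # patching _run_get_output on a subclass such as LaughterFeatureExtractor
    # below, so tests parse canned output instead of spawning a subprocess:
    #
    #   from unittest.mock import patch
    #   with patch.object(LaughterFeatureExtractor, "_run_get_output",
    #                     return_value="instance: (1.0, 2.5)\n"):
    #       extractor.run()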

    def _keep_num(self, features, num=_CONFIG_DEFAULT_NUM_FEATURES, trim_overlap=False) -> list:
        """Keep the top n features (default: 5).

        Approach:
        - for each of the top n features:
          + expand the feature to the minimum duration
            (move start back by 0.5*min_duration, end forward by 0.5*min_duration)
          + drop any features that now fall within that feature's range (optional)
        - return the kept features

        Each feature is a Feature object, with an Interval object.
        """
        keep_features = []
        # ensure features are sorted by score
        features = sorted(features, key=lambda x: x.score, reverse=True)

        while len(keep_features) < num and len(features) > 0:
            current_feature = features.pop(0)
            # expand the feature to min_duration -- try to keep it centred on the current start
            if self._min_duration > current_feature.interval.duration:
                current_feature.interval.move_start(-0.5 * self._min_duration, relative=True)
            if current_feature.interval.duration < self._min_duration:
                current_feature.interval.update_duration(self._min_duration)

            keep_features.append(current_feature)
            # optionally drop any remaining features that overlap the one just kept
            if trim_overlap:
                features = [f for f in features
                            if not f.interval.overlaps(current_feature.interval)]

        return keep_features
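
    # Illustrative sketch (hypothetical values, assuming the Interval API used above):
    #
    #   features = [Feature(interval=Interval(start=60.0, duration=1.0), source=src,
    #                       feature_extractor="loudness", score=0.9),
    #               Feature(interval=Interval(start=60.8, duration=1.0), source=src,
    #                       feature_extractor="loudness", score=0.7)]
    #   top = extractor._keep_num(features, num=2, trim_overlap=True)
    #   # with _min_duration=5.0 the first interval is widened to ~57.5-62.5s and
    #   # the second, now-overlapping feature is dropped, so len(top) == 1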

    def setup(self):
        """Setup the feature extractor -- validate input files & config"""

    def run(self):
        """Run the feature extractor -- extract features"""

    def teardown(self):
        """Teardown the feature extractor -- clean up any temporary files created during setup"""


class LaughterFeatureExtractor(FeatureExtractor):
    """Feature extractor for laughter detection.

    This class is responsible for extracting features corresponding to laughter
    in media files. Uses jrgillick's laughter-detection library.

    Here:
        setup() (not needed for laughter-detection, as it can work with AV files directly)
        run() is used to extract features from the audio using jrgillick's laughter-detection
        teardown() (not needed)

    @see: https://github.com/jrgillick/laughter-detection for the laughter-detection library
    """
    _PREPEND_TIME = 7.0  # seconds before the laugh to capture whatever was funny
    _APPEND_TIME = 3.0   # seconds after the laugh to capture the reaction
    _CONFIG_LAUGH_DETECTOR_DIR = "/home/robert/mounts/980data/code/laughter-detection/"

    def __init__(self, input_files=None, config=None):
        """It is expected that input_files is a SourceMedia object"""
        self.input_files = input_files
        self.config = config
        self.features = []

    def _laughdetect(self, audio_file, laugh_detector_dir=_CONFIG_LAUGH_DETECTOR_DIR) -> list:
        """Run laughter detection on the audio file.

        Returns a list of 2-tuples, each representing a laugh instance in the
        audio file, in the format (start, end) in seconds.
        """
        laugh_detector_script = "segment_laughter.py"
        # fake output for testing
        # laugh_detector_path = "tests/fake_segment_laughter.py"
        laugh_detector_cmd = ["python", f"{laugh_detector_dir}{laugh_detector_script}",
                              f"--input_audio_file={audio_file}"]

        # run command, capture output, ignore exit status
        # use self._run_get_output to allow mocking in tests
        laugh_output = self._run_get_output(laugh_detector_cmd, laugh_detector_dir)
        # ↑ have to include cwd to keep laughter-detection's imports happy
        # also, it isn't happy if no output dir is specified, but we get laughs, so it's grand

        # laughs are lines in stdout that start with "instance: ", followed by a
        # 2-tuple of floats -- so skip the first 10 characters ("instance: ")
        # and evaluate the rest of the line
        return [literal_eval(instance[10:])
                for instance in laugh_output.splitlines()
                if instance.startswith("instance: ")]
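
    # Example of the parsing above (hypothetical stdout from segment_laughter.py):
    #
    #   laugh_output = "setting up model...\ninstance: (12.34, 15.6)\ninstance: (99.0, 101.5)\n"
    #   # -> [(12.34, 15.6), (99.0, 101.5)]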

    def _adjust_features(self) -> None:
        """Adjust features according to config.

        Generically, this ensures features conform to config -- min/max feature
        length, etc.

        In the context of LaughterFeatureExtractor, there is some secret sauce:
        things that cause a laugh generally /precede/ the laugh, so we want more
        time before the detected start than after the detected end. For example,
        for a minimum feature length of 15s, we might prepend 10 seconds and
        append 5 seconds, or prepend 12s and append 3s. We may wish to do this
        pre/post adjustment for all laughter features found, regardless of length.
        """
        for feature in self.features:
            # do the pre & post adjustment
            feature.interval.move_start(-self._PREPEND_TIME, relative=True)
            feature.interval.move_end(self._APPEND_TIME, relative=True)
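
    # Worked example (hypothetical laugh): a laugh detected at 20.0-22.0s
    # becomes a feature spanning 13.0-25.0s:
    #
    #   20.0 - _PREPEND_TIME (7.0) = 13.0  # new start
    #   22.0 + _APPEND_TIME (3.0) = 25.0   # new end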

    def setup(self):
        """Setup the laughter feature extractor -- not needed.

        jrgillick's laughter-detection library can work with AV files directly!
        """
        if not self.input_files or len(self.input_files) == 0:
            raise ValueError("No input files provided!")

    def run(self):
        """Extract laughter features for each input file.

        Heavy lifting is performed in _laughdetect(). Tuples from _laughdetect()
        are used to create Feature objects, which are appended to self.features
        by convention.

        @see: utils.py:Feature, Interval
        """
        if self.input_files:
            for file in self.input_files:
                laughs = self._laughdetect(file.path)
                for laugh in laughs:
                    start, end = laugh
                    self.features.append(Feature(interval=Interval(start=start, end=end),
                                                 source=file, feature_extractor="laughter"))
            # adjust features
            self._adjust_features()

    def teardown(self):
        """No cleanup needed!"""


class RandomFeatureExtractor(FeatureExtractor):
    """Feature extractor for random feature generation.

    This class is responsible for generating random features for testing purposes.

    Here:
        setup() is not needed
        run() is used to generate random features
        teardown() is not needed
    """
    NUM_FEATURES = 30
    MAX_DURATION = 15.0
    MIN_DURATION = 5.0

    def __init__(self, input_files=None, config=None):
        """It is expected that input_files is a SourceMedia object"""
        self.input_files = input_files
        self.config = config
        self.features = []

    def setup(self):
        """Setup the random feature extractor -- validate input files & config"""
        logger.debug("RandomFeatureExtractor setup")
        # Validate input files
        if not self.input_files:
            raise ValueError("No input files provided")

    def run(self):
        """Generate random features for each input file"""
        # check self.input_files is of type SourceMedia
        if not self.input_files or not isinstance(self.input_files, SourceMedia):
            raise ValueError("No input files provided")

        for file in self.input_files:
            for _ in range(self.NUM_FEATURES):
                # determine duration between MIN and MAX, round to 3 decimal places
                duration = round(random.uniform(self.MIN_DURATION, self.MAX_DURATION), 3)
                # pick a start time so the whole feature fits within the file
                # (parenthesised so start cannot go negative)
                start = random.random() * (file.duration() - duration)
                self.features.append(Feature(interval=Interval(start=start, duration=duration),
                                             source=file, feature_extractor="random"))

    def teardown(self):
        pass


class LoudAudioFeatureExtractor(FeatureExtractor):
    """Feature extractor for loud audio detection.

    This class is responsible for extracting features corresponding to loud audio
    in media files.

    Here:
        setup() is used to validate input files & config, and to extract audio
        run() uses pyloudnorm to detect loud audio
        teardown() is used to clean up temporary files created during setup (if specified by config)
    """
    _CONFIG_DEFAULT_NUM_FEATURES = 15    # keep the top 15 loudnesses
    _CONFIG_DEFAULT_MIN_DURATION = 5.00  # seconds

    def __init__(self, input_files=None, config=None,
                 num_features=_CONFIG_DEFAULT_NUM_FEATURES,
                 min_duration=_CONFIG_DEFAULT_MIN_DURATION):
        if not input_files:
            raise ValueError("No input files provided!")
        self.input_files = input_files
        self.config = config
        self.features = []
        self._num_features = num_features
        self._min_duration = min_duration

    def _audio_file_from_path(self, path: str) -> str:
        """Return the audio file path given a video file path.

        Example:
        - in  = "/path/to/video.mp4"
        - out = "/tmp/video.mp4.wav"
        """
        OUTPUT_DIR = "/tmp"
        return f"{OUTPUT_DIR}/{os.path.basename(path)}.wav"

    def _get_loudnesses(self, data, meter, rate, window_size, stride_size):
        """Extract loudnesses from the audio data using pyloudnorm.

        Returns a list of 2-tuples, each representing a timecode and loudness value.
        """
        loudnesses = []
        for w in range(0, len(data) - window_size, stride_size):
            window = data[w:w + window_size, 0:2]  # extract window
            loudnesses.append((w / rate, meter.integrated_loudness(window)))
        return loudnesses
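
    # Worked example (hypothetical 48kHz audio): window_size = stride_size =
    # int(48000 * 0.5) = 24000 samples, so each iteration measures one
    # non-overlapping 500ms window and records its start time in seconds:
    #
    #   loudnesses = self._get_loudnesses(data, meter, 48000, 24000, 24000)
    #   # -> [(0.0, -23.1), (0.5, -18.7), (1.0, -31.2), ...]  (time, LUFS)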

    def _loudnorm(self, audio_file):
        """Run pyloudnorm on the audio file"""
        data, rate = soundfile.read(audio_file)  # load audio (with shape (samples, channels))
        meter = pyloudnorm.Meter(rate=rate, block_size=0.3)  # create BS.1770 meter
        loudness_features = []
        window_size = int(rate * 0.5)  # 500ms
        stride_size = int(rate * 0.5)  # 500ms -- no overlap

        loudnesses = self._get_loudnesses(data, meter, rate, window_size, stride_size)

        # sort by loudness (descending), dropping -inf (i.e. silent) windows
        for timecode, loudval in sorted([l for l in loudnesses if float(l[1]) != float("-inf")],
                                        key=lambda x: x[1], reverse=True):
            loudness_features.append((timecode, round(loudval, 3)))  # round to 3 DP

        return loudness_features

    def setup(self):
        """Extract audio from video files to be processed by pyloudnorm.

        TODO: config -- hardcoded for now
        """
        # pyloudnorm expects WAV files
        for file in self.input_files:
            audio_file = self._audio_file_from_path(file.path)
            # ffmpeg -i input.mp4 -vn -acodec pcm_s16le output.wav
            subprocess.run(["ffmpeg", "-y", "-i", file.path, "-vn", "-acodec", "pcm_s16le", audio_file],
                           stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    def run(self):
        """Use pyloudnorm to detect loud audio"""
        for file in self.input_files:
            audio_file = self._audio_file_from_path(file.path)
            loudnesses = self._loudnorm(audio_file)
            features = []
            for time, loudness in loudnesses:
                features.append(Feature(interval=Interval(start=time, duration=0.500),
                                        source=file, feature_extractor="loudness",
                                        score=loudness))

            # prune the per-file features list to keep at most self._num_features
            if len(features) > self._num_features:
                self.features.extend(self._keep_num(features, self._num_features))
            else:
                self.features.extend(features)


class VideoActivityFeatureExtractor(FeatureExtractor):
    """Feature extractor for video activity detection.

    This class is responsible for extracting features corresponding to high
    activity in video files. Uses ffmpeg's scdet filter with a threshold of zero.

    Here:
        setup() is used to validate input files & config
        run() is used to extract features from the video using ffmpeg's scdet filter
        teardown() is used to clean up any temporary files created during setup according to the config

    #TODO: minimum duration -- consider whether to do here, or expand duration post-consolidation
    """
    _CONFIG_DEFAULT_NUM_FEATURES = 15    # keep the top 15 activity moments
    _CONFIG_DEFAULT_MIN_DURATION = 5.00  # seconds

    def __init__(self, input_files=None, config=None,
                 num_features=_CONFIG_DEFAULT_NUM_FEATURES,
                 min_duration=_CONFIG_DEFAULT_MIN_DURATION):
        if not input_files:
            raise ValueError("No input files provided!")
        self.input_files = input_files
        self.config = config
        self.features = []
        self._num_features = num_features
        self._min_duration = min_duration

    def _scdet(self, video_file):
        """Run the scdet filter on the video file"""
        ffmpeg_cmd = ["ffmpeg", "-i", video_file, "-vf", "scdet=threshold=0", "-f", "null", "-"]
        # output is of the form:
        # [scdet @ 0x7f0798003d00] lavfi.scd.score: 0.031, lavfi.scd.time: 23.65
        # [scdet @ 0x7f0798003d00] lavfi.scd.score: 0.006, lavfi.scd.time: 23.70
        # capture output, extract time & score
        scdet_output = subprocess.run(ffmpeg_cmd, stdout=subprocess.PIPE,
                                      stderr=subprocess.PIPE).stderr.decode("utf-8")
        # extract (time, score) tuples
        scores = []
        for line in scdet_output.splitlines():
            if "lavfi.scd.score" in line:
                scores.append((float(line.split(",")[1].split(":")[1]),
                               float(line.split(",")[0].split(":")[1])))
        return scores
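
    # Example of the parsing above (hypothetical scdet output line):
    #
    #   line = "[scdet @ 0x7f0798003d00] lavfi.scd.score: 0.031, lavfi.scd.time: 23.65"
    #   float(line.split(",")[1].split(":")[1])  # -> 23.65 (time)
    #   float(line.split(",")[0].split(":")[1])  # -> 0.031 (score)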

    def _nonoverlap_mean(self, scores, window_size=0.500) -> list:
        """Take the mean of non-overlapping windows of scores.

        Input:  list of tuples in the format (time, score)
        Output: list of tuples in the format (time, mean_score) (reduced set)
        """
        means = []
        current_window = []
        current_window_start = 0.0

        for time, score in scores:
            if time - current_window_start > window_size:
                # calculate mean of current window (guard against an empty window)
                if current_window:
                    mean_score = sum(s for _, s in current_window) / len(current_window)
                    means.append((current_window_start, round(mean_score, 3)))
                # reset window
                current_window = []
                current_window_start = time
            current_window.append((time, score))

        return means
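
    # Worked example (hypothetical scores, window_size=0.500):
    #
    #   scores = [(0.00, 0.10), (0.25, 0.20), (0.50, 0.30), (0.75, 0.40)]
    #   self._nonoverlap_mean(scores)
    #   # -> [(0.0, 0.2)] -- the first window closes when the 0.75s score
    #   # arrives, yielding mean(0.10, 0.20, 0.30) = 0.2; the trailing window
    #   # is dropped unless a later score closes it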

    def _drop_lowest(self, scores, percent=33):
        """Keep the top n% of scores, dropping the lowest-scoring remainder"""
        scores = sorted(scores, key=lambda x: x[1], reverse=True)
        return scores[:int(len(scores) * (percent / 100))]

    def setup(self):
        pass

    def run(self):
        for file in self.input_files:
            scores = self._scdet(file.path)
            means = sorted(self._nonoverlap_mean(scores), key=lambda x: x[1], reverse=True)
            features = []
            for time, score in self._drop_lowest(means, 66):
                features.append(Feature(interval=Interval(start=time, duration=0.500),
                                        source=file, feature_extractor="videoactivity",
                                        score=score))

            # prune the per-file features list to keep at most self._num_features
            # (note: compare against the local list, not self.features)
            if len(features) > self._num_features:
                self.features.extend(self._keep_num(features, self._num_features))
            else:
                self.features.extend(features)

    def teardown(self):
        pass


class JSONFeatureExtractor(FeatureExtractor):
    """(Re-)create features from a JSON file.

    The JSON file can have one of two formats:
    - the format produced by the pipeline (@see: video_producers.py:JSONProducer)
    - a simplified format which is easier for manual creation
    """

    def __init__(self, input_files=None, config=None):
        if not input_files:
            raise ValueError("No input files provided!")
        self.input_files = input_files
        self.config = config
        self.features = []

    def setup(self):
        pass

    def _interval_from_dict(self, d):
        return Interval(start=d["start"], duration=d["duration"])

    def _source_from_dict(self, d):
        return Source(d["source"], d["path"], d["provider"])

    def _read_json_from_file(self, file):
        """Read a JSON file and return the contents.

        Method exists to allow for mocking in tests.
        """
        with open(file, "r") as f:
            return json.load(f)

    def run(self):
        # only pipeline JSON format for now
        # TODO: add support for simplified format
        for file in self.input_files:
            features_from_json = self._read_json_from_file(file.path)
            for feature in features_from_json:
                self.features.append(Feature(interval=self._interval_from_dict(feature["interval"]),
                                             source=self._source_from_dict(feature["source"]),
                                             feature_extractor=feature["feature_extractor"],
                                             score=feature["score"]))

    def teardown(self):
        pass


class WordFeatureExtractor(FeatureExtractor):
    """Feature extractor for specific word detection (uses Whisper)"""
    # set defaults for whisper settings
    DEFAULT_MODEL_SIZE = "medium"
    DEFAULT_DEVICE = "cpu"
    DEFAULT_COMPUTE_TYPE = "int8"
    DEFAULT_BEAM_SIZE = 5
    DEFAULT_BATCH_SIZE = 16
    DEFAULT_PIPELINE_TYPE = "batched"  # or "stream"

    def _transcribe(self, model, file, **kwargs):
        """Defined here to allow for mocking in tests"""
        return model.transcribe(file, **kwargs)

    def _whispermodel(self, model_size=DEFAULT_MODEL_SIZE,
                      device=DEFAULT_DEVICE, compute_type=DEFAULT_COMPUTE_TYPE):
        """Defined here to allow for mocking out in tests"""
        return WhisperModel(model_size, device=device, compute_type=compute_type)

    def _batched_inference_pipeline(self, model):
        """Defined here to allow for mocking out in tests"""
        return BatchedInferencePipeline(model=model)

    def __init__(self, input_files=None, config=None):
        if not input_files:
            raise ValueError("No input files provided!")
        self.input_files = input_files
        self.config = config
        self.features = []
        self.words = []  # target words; set in setup()

    def setup(self, words=None):
        """Setup the word feature extractor -- validate input files & config.

        Whisper expects a list of words to search for in the audio.
        """
        logger.debug("WordFeatureExtractor setup")
        # Validate words -- warn if none provided
        words = words or []
        if len(words) == 0:
            logger.warning("No words provided for detection")
        self.words = words
        # TODO: consider stripping punctuation, since Whisper produces words+punctuation
        # and we might want to strip the punctuation there too

    def run(self):
        """Extract features corresponding to supplied target words (defined in setup)
        for each input file.

        Use Whisper to detect words in the audio, then match these to target words
        and create features.

        Note: if no words were supplied we can exit early.
        """
        if len(self.words) == 0:
            return

        batched = self.DEFAULT_PIPELINE_TYPE == "batched"

        # TODO: consider maybe loglevel notice of estimated time! consider also: max execution time config?
        # TODO: config options for model size, device, compute type
        model = self._whispermodel()  # NB uses defaults, TODO: add config options
        # NOTE: batched inference was not available on PyPI at the time of writing
        if batched:
            batched_model = self._batched_inference_pipeline(model)

        for file in self.input_files:
            # transcribe the audio file
            if batched:
                segments, _ = self._transcribe(batched_model, file.path,
                                               batch_size=self.DEFAULT_BATCH_SIZE)
            else:
                segments, _ = self._transcribe(model, file.path,
                                               beam_size=self.DEFAULT_BEAM_SIZE)

            # process the segments (each segment has: start, end, text)
            for segment in segments:
                # check if any of the target words appear in the segment
                for word in segment.text.split():
                    if word in self.words:
                        self.features.append(Feature(interval=Interval(start=segment.start,
                                                                       end=segment.end),
                                                     source=file, feature_extractor="word",
                                                     score=1.0))
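
    # Usage sketch (hypothetical words and `sources`; assumes a SourceMedia
    # object as elsewhere in this module):
    #
    #   extractor = WordFeatureExtractor(input_files=sources)
    #   extractor.setup(words=["hello", "goodbye"])
    #   extractor.run()   # one Feature per matching segment, score=1.0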