Nevar pievienot vairāk kā 25 tēmas. Tēmai ir jāsākas ar burtu vai ciparu, tā var saturēt domu zīmes ('-') un var būt līdz 35 simboliem gara.

502 rindas
19 KiB

  1. from abc import ABC
  2. import json
  3. import logging
  4. import os
  5. import random
  6. import subprocess
  7. from ast import literal_eval
  8. from pipeline.utils import SourceMedia, Source, Feature, Interval
  9. # for loudness detection
  10. import soundfile
  11. import pyloudnorm
  12. # for word detection
  13. from faster_whisper import WhisperModel, BatchedInferencePipeline
  14. logger = logging.getLogger(__name__)
  class FeatureExtractor(ABC):
      """Common interface shared by every feature extractor.

      Extractors are driven through three lifecycle hooks -- setup(), run()
      and teardown() -- each of which does nothing in this base class.
      """
      # TODO: #API -- decide if .features will be a member variable

      def _run_get_output(self, cmd: list, cwd: str = ".") -> str:
          """Execute *cmd* inside *cwd* and return its stdout decoded as UTF-8.

          The exit status is not inspected. This lives in its own method so
          that tests can swap it out via unittest.mock.patch.
          """
          completed = subprocess.run(cmd, stdout=subprocess.PIPE, cwd=cwd)
          raw_stdout = completed.stdout
          return raw_stdout.decode("utf-8")

      def setup(self):
          """Prepare the extractor -- validate input files & config."""

      def run(self):
          """Do the actual work -- extract features."""

      def teardown(self):
          """Clean up -- remove any temporary files created during setup."""
  class LaughterFeatureExtractor(FeatureExtractor):
      """Feature extractor for laughter detection.

      This class is responsible for extracting features corresponding to laughter in media files.

      Here:

      setup() is used to validate input files & config, which may involve processing video files to extract audio
      run() is used to extract features from the audio using jrgillick's laughter-detection
      teardown() is used to clean up any temporary files created during setup according to the config

      See: https://github.com/jrgillick/laughter-detection for the laughter-detection library
      """

      _PREPEND_TIME = 7.0  # seconds before the laugh
      _APPEND_TIME = 3.0  # seconds after the laugh

      # Where the laughter-detection checkout lives and which script to run.
      # Held as class attributes (rather than locals) so that a subclass or a
      # test can point the extractor at a different install.
      # The directory keeps its trailing slash: it is concatenated directly
      # with the script name and is also used as the working directory.
      _LAUGH_DETECTOR_DIR = "/home/robert/mounts/980data/code/laughter-detection/"
      _LAUGH_DETECTOR_SCRIPT = "segment_laughter.py"

      def __init__(self, input_files=None, config=None):
          """It is expected that input_files is a SourceMedia object"""
          self.input_files = input_files
          self.config = config
          self.features = []

      def _laughdetect(self, audio_file) -> list:
          """Run laughter detection on the audio file

          Returns a list of 2-tuples, each representing a laugh instance in the audio file
          """
          laugh_detector_dir = self._LAUGH_DETECTOR_DIR
          # fake output for testing
          # laugh_detector_path = "tests/fake_segment_laughter.py"
          laugh_detector_cmd = ["python", f"{laugh_detector_dir}{self._LAUGH_DETECTOR_SCRIPT}",
                                f"--input_audio_file={audio_file}"]

          # run command, capture output, ignore exit status
          # use self._run_get_output to allow mocking in tests
          laugh_output = self._run_get_output(laugh_detector_cmd, laugh_detector_dir)
          # ↑ have to include cwd to keep laughter-detection imports happy
          # also, it isn't happy if no output dir is specified but we get laughs so it's grand

          # laughs are lines in stdout that start with "instance:", followed by a space and a 2-tuple of floats
          # so skip past the marker and evaluate the rest of the line
          marker = "instance: "
          return [literal_eval(line[len(marker):])
                  for line in laugh_output.splitlines()
                  if line.startswith(marker)]

      def _adjust_features(self):
          """Adjust features according to config

          Generically, this ensures features conform to config - min/max feature length, etc.

          In the context of LaughterFeatureExtractor, there is some secret sauce: things that
          cause a laugh generally /precede/ the laugh, so we want more time before the detected start
          than at the end. For example, for a minimum feature length of 15s, we might prepend 10 seconds,
          and append 5 seconds (for example), or 12s and 3s. We may wish to do this pre/post adjustment
          for all laughter features found, regardless of length.

          TODO: figure out how we're going to handle length adjustments
          TODO: config for length adjustments per design doc
          TODO: play with numbers more to see what works best
          """
          for feature in self.features:
              # do the pre & post adjustment
              feature.interval.move_start(-self._PREPEND_TIME, relative=True)
              feature.interval.move_end(self._APPEND_TIME, relative=True)

      def setup(self):
          """Setup the laughter feature extractor -- validate input files & config

          jrgillick's laughter-detection library can work with AV files directly

          Raises ValueError if no input files were provided.

          TODO: validate input files
          TODO: handle config
          """
          logger.debug("LaughterFeatureExtractor setup")

          # Validate input files
          if not self.input_files:
              raise ValueError("No input files provided")

          # TODO: convert video to audio if needed

      def run(self):
          """Extract laughter features for each input file

          One Feature is appended to self.features per detected laugh; all
          collected features are then widened by _adjust_features().
          """
          if self.input_files:
              for file in self.input_files:
                  # adjust this call for better test mocking
                  laughs = self._laughdetect(file.path)
                  for start, end in laughs:
                      self.features.append(Feature(interval=Interval(start=start, end=end),
                                                   source=file, feature_extractor="laughter"))

          # TODO: implement options eg minimum feature length

          # adjust features
          self._adjust_features()

      def teardown(self):
          """Nothing to clean up: setup() creates no temporary files."""
          pass
  class RandomFeatureExtractor(FeatureExtractor):
      """Feature extractor for random feature generation.

      This class is responsible for generating random features for testing purposes.

      Here:

      setup() is used to validate input files & config
      run() is used to generate random features
      teardown() is used to clean up any temporary files created during setup according to the config
      """

      NUM_FEATURES = 5  # features generated per input file
      MAX_DURATION = 20.0  # upper bound on the length of a generated feature

      def __init__(self, input_files=None, config=None):
          """It is expected that input_files is a SourceMedia object"""
          self.input_files = input_files
          self.config = config
          self.features = []

      def setup(self):
          """Setup the random feature extractor -- validate input files & config

          Raises ValueError if no input files were provided.
          """
          logger.debug("RandomFeatureExtractor setup")

          # Validate input files
          if not self.input_files:
              raise ValueError("No input files provided")

      def run(self):
          """Generate random features for each input file

          NUM_FEATURES features are appended to self.features per file. Each
          feature is at most MAX_DURATION long and lies entirely within the
          file: its start is never negative and start + duration never
          exceeds file.duration().

          Raises ValueError if input_files is empty or is not a SourceMedia.
          """
          # check self.input_files is of type SourceMedia
          if not self.input_files or not isinstance(self.input_files, SourceMedia):
              raise ValueError("No input files provided")

          for file in self.input_files:
              # assumes duration() returns the media length as a number in the
              # same unit as MAX_DURATION -- TODO confirm against Source
              file_duration = file.duration()
              for _ in range(self.NUM_FEATURES):
                  # never ask for a feature longer than the media itself
                  duration = min(random.random() * self.MAX_DURATION, file_duration)
                  # pick the start from [0, file_duration - duration] so the
                  # feature cannot begin before 0 or run off the end
                  start = random.random() * (file_duration - duration)
                  self.features.append(Feature(interval=Interval(start=start, duration=duration),
                                               source=file, feature_extractor="random"))

      def teardown(self):
          """Nothing to clean up: setup() creates no temporary files."""
          pass
  class LoudAudioFeatureExtractor(FeatureExtractor):
      """Feature extractor for loud audio detection.

      This class is responsible for extracting features corresponding to loud audio in media files.

      Here:

      setup() extracts a WAV audio track from every input file using ffmpeg
      run() uses pyloudnorm to measure loudness and keeps the loudest windows
      teardown() does nothing; the WAV files written by setup() are left in place
      """

      _CONFIG_DEFAULT_NUM_FEATURES = 5  # keep the top 5 loudnesses

      def __init__(self, input_files=None, config=None, num_features=_CONFIG_DEFAULT_NUM_FEATURES):
          """Store inputs; raises ValueError if no input files are provided."""
          if not input_files:
              raise ValueError("No input files provided!")
          self.input_files = input_files
          self.config = config
          self.features = []
          self._num_features = num_features

      def _audio_file_from_path(self, path: str) -> str:
          """Return the audio file path given a video file path

          Example:
          - in = "/path/to/video.mp4"
          - out = "/tmp/video.mp4.wav"

          NOTE: only the basename is used, so two inputs with the same file
          name in different directories map to the same WAV path.
          """
          OUTPUT_DIR = "/tmp"
          return f"{OUTPUT_DIR}/{os.path.basename(path)}.wav"

      def _get_loudnesses(self, data, meter, rate, window_size, stride_size):
          """Extract loudnesses from the audio data using pyloudnorm

          data is indexed as (samples, channels); at most the first two
          channels of each window are measured. window_size and stride_size
          are in samples.

          return a list of 2-tuples, each representing a timecode and loudness value;
          the timecode is the window's first sample index divided by rate
          """
          loudnesses = []

          # "+ 1" so that the final complete window is included when the data
          # length is an exact fit; data shorter than one window yields nothing
          for w in range(0, len(data) - window_size + 1, stride_size):
              window = data[w:w+window_size, 0:2]  # extract window
              loudnesses.append((w / rate, meter.integrated_loudness(window)))

          return loudnesses

      def _loudnorm(self, audio_file):
          """Run pyloudnorm on the audio file

          Returns a list of (timecode, loudness) tuples sorted loudest first,
          with loudness rounded to 3 decimal places. Windows whose loudness
          is -inf are left out.
          """
          # always_2d=True keeps the (samples, channels) shape for mono files
          # too, which the window slicing in _get_loudnesses relies on
          data, rate = soundfile.read(audio_file, always_2d=True)
          meter = pyloudnorm.Meter(rate=rate, block_size=0.3)  # create BS.1770 meter

          window_size = int(rate * 0.5)  # 500ms
          stride_size = int(rate * 0.5)  # 500ms -- no overlap

          loudnesses = self._get_loudnesses(data, meter, rate, window_size, stride_size)

          measurable = [(timecode, loudval) for timecode, loudval in loudnesses
                        if float(loudval) != float("-inf")]
          measurable.sort(key=lambda x: x[1], reverse=True)
          return [(timecode, round(loudval, 3)) for timecode, loudval in measurable]  # round to 3 DP

      def _keep_num(self, loudnesses, num=_CONFIG_DEFAULT_NUM_FEATURES) -> list:
          """Keep the top n loudnesses (default: 5)"""
          return sorted(loudnesses, key=lambda x: x[1], reverse=True)[:num]

      def setup(self):
          """extract audio from video files to be processed by pyloudnorm

          A failed ffmpeg invocation is logged as a warning rather than raised.

          TODO: config -- hardcoded for now
          """
          # pyloudnorm expects WAV files
          for file in self.input_files:
              audio_file = self._audio_file_from_path(file.path)
              # ffmpeg -i input.mp4 -vn -acodec pcm_s16le output.wav
              result = subprocess.run(["ffmpeg", "-y", "-i", file.path, "-vn", "-acodec", "pcm_s16le", audio_file],
                                      stdout=subprocess.PIPE, stderr=subprocess.PIPE)
              if result.returncode != 0:
                  # surface the failure here; otherwise it only shows up later
                  # as a confusing read error on the missing WAV file in run()
                  logger.warning("ffmpeg could not extract audio from %s (exit status %s)",
                                 file.path, result.returncode)

      def run(self):
          """Use pyloudnorm to detect loud audio

          For each input file the loudest self._num_features windows become
          500ms features scored with their loudness value.
          """
          for file in self.input_files:
              audio_file = self._audio_file_from_path(file.path)
              loudnesses = self._loudnorm(audio_file)
              top_loudnesses = self._keep_num(loudnesses, self._num_features)
              for time, loudness in top_loudnesses:
                  self.features.append(Feature(interval=Interval(start=time, duration=0.500),
                                               source=file, feature_extractor="loudness",
                                               score=loudness))

      def teardown(self):
          """No-op: the WAV files written by setup() are not removed."""
          pass
  class VideoActivityFeatureExtractor(FeatureExtractor):
      """Feature extractor for video activity detection.

      This class is responsible for extracting features corresponding to high activity in video files.

      Uses ffmpeg's scdet filter with threshold of zero.

      Here:

      setup() does nothing
      run() is used to extract features from the video using ffmpeg's scdet filter
      teardown() does nothing

      #TODO: minimum duration -- consider whether to do here, or expand duration post-consolidation
      """

      def __init__(self, input_files=None, config=None):
          """Store inputs; raises ValueError if no input files are provided."""
          if not input_files:
              raise ValueError("No input files provided!")
          self.input_files = input_files
          self.config = config
          self.features = []

      def _scdet(self, video_file):
          """Run scdet filter on the video file

          Returns a list of (time, score) tuples parsed from ffmpeg's stderr.
          """
          ffmpeg_cmd = ["ffmpeg", "-i", video_file, "-vf", "scdet=threshold=0", "-f", "null", "-"]
          # output is of the form:
          # [scdet @ 0x7f0798003d00] lavfi.scd.score: 0.031, lavfi.scd.time: 23.65
          # [scdet @ 0x7f0798003d00] lavfi.scd.score: 0.006, lavfi.scd.time: 23.70
          # capture output, extract time & score
          scdet_output = subprocess.run(ffmpeg_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE).stderr.decode("utf-8")
          # extract time & score
          scores = []
          for line in scdet_output.splitlines():
              if "lavfi.scd.score" in line:
                  # first comma-separated field carries the score, second the time
                  score_field, time_field = line.split(",")[0], line.split(",")[1]
                  scores.append((float(time_field.split(":")[1]),
                                 float(score_field.split(":")[1])))
          return scores

      def _nonoverlap_mean(self, scores, window_size=0.500) -> list:
          """Take the mean of non-overlapping windows of scores

          Input: list of tuples in the format (time, score)
          Output: list of tuples in the format (time, mean_score) (reduced set)

          A window starts at 0.0 and is closed as soon as a score arrives
          more than window_size after the window's start; that score opens
          the next window. The last (possibly partial) window is emitted too,
          and an empty input yields an empty list.
          """
          means = []
          window_scores = []
          current_window_start = 0.0

          def flush():
              # an empty window (e.g. the first timestamp is already beyond the
              # initial window) has no mean -- skip it instead of dividing by 0
              if window_scores:
                  mean_score = sum(window_scores) / len(window_scores)
                  means.append((current_window_start, round(mean_score, 3)))

          for time, score in scores:
              if time - current_window_start > window_size:
                  flush()
                  # reset window
                  window_scores = []
                  current_window_start = time
              window_scores.append(score)

          # emit whatever is left in the final window
          flush()
          return means

      def _drop_lowest(self, scores, percent=33):
          """Keep the highest-scoring `percent`% of scores, dropping the rest

          NOTE(review): despite the name, `percent` is the share that is
          KEPT, not the share dropped -- e.g. percent=66 keeps the top 66%.
          Behaviour is left as is because run() relies on it.
          """
          scores = sorted(scores, key=lambda x: x[1], reverse=True)
          return scores[:int(len(scores) * (percent / 100))]

      def setup(self):
          """Nothing to prepare."""
          pass

      def run(self):
          """Turn the most active 500ms windows of each input file into features

          Per file: per-frame scdet scores are averaged over non-overlapping
          windows, the top 66% of windows by mean score are kept, and each
          becomes a Feature scored with that mean.
          """
          for file in self.input_files:
              scores = self._scdet(file.path)
              means = sorted(self._nonoverlap_mean(scores), key=lambda x: x[1], reverse=True)
              for time, score in self._drop_lowest(means, 66):
                  self.features.append(Feature(interval=Interval(start=time, duration=0.500),
                                               source=file, feature_extractor="videoactivity",
                                               score=score))

      def teardown(self):
          """Nothing to clean up."""
          pass
  class JSONFeatureExtractor(FeatureExtractor):
      """(Re-)create features from a JSON file

      The JSON file can have one of two formats:
      - the format produced by the pipeline (@see: video_producers.py:JSONProducer)
      - a simplified format which is easier for manual creation

      Only the pipeline format is handled by run() at present.
      """

      def __init__(self, input_files=None, config=None):
          """Store inputs; raises ValueError if no input files are provided."""
          if not input_files:
              raise ValueError("No input files provided!")
          self.input_files = input_files
          self.config = config
          self.features = []

      def setup(self):
          """Nothing to prepare."""
          pass

      def _interval_from_dict(self, d):
          """Build an Interval from a dict with "start" and "duration" keys."""
          return Interval(start=d["start"], duration=d["duration"])

      def _source_from_dict(self, d):
          """Build a Source from a dict with "source", "path" and "provider" keys."""
          return Source(d["source"], d["path"], d["provider"])

      def _read_json_from_file(self, file):
          """Read a JSON file and return the contents

          Method exists to allow for mocking in tests
          """
          # JSON text is UTF-8; be explicit rather than depending on the
          # platform's default encoding
          with open(file, "r", encoding="utf-8") as f:
              return json.load(f)

      def run(self):
          """Append one Feature to self.features per entry of each input JSON file

          Every entry must provide the keys "interval", "source",
          "feature_extractor" and "score"; a missing key raises KeyError.
          """
          # only pipeline JSON format for now
          # TODO: add support for simplified format
          for file in self.input_files:
              features_from_json = self._read_json_from_file(file.path)

              for feature in features_from_json:
                  self.features.append(Feature(interval=self._interval_from_dict(feature["interval"]),
                                               source=self._source_from_dict(feature["source"]),
                                               feature_extractor=feature["feature_extractor"],
                                               score=feature["score"]))

      def teardown(self):
          """Nothing to clean up."""
          pass
  class WordFeatureExtractor(FeatureExtractor):
      """Feature extractor for specific word detection (uses Whisper)"""

      # set defaults for whisper settings
      DEFAULT_MODEL_SIZE = "medium"
      DEFAULT_DEVICE = "cpu"
      DEFAULT_COMPUTE_TYPE = "int8"
      DEFAULT_BEAM_SIZE = 5
      DEFAULT_BATCH_SIZE = 16
      DEFAULT_PIPELINE_TYPE = "batched"  # or "stream"

      # Class-level fallback, kept for instances created without __init__.
      # This class only ever rebinds self.words (in __init__ and setup()) and
      # never mutates the list in place.
      words = []

      def __init__(self, input_files=None, config=None):
          """Store inputs; raises ValueError if no input files are provided."""
          if not input_files:
              raise ValueError("No input files provided!")
          self.input_files = input_files
          self.config = config
          self.features = []
          # per-instance list so instances never share the class-level one
          self.words = []

      def _transcribe(self, model, file, **kwargs):
          """Defined here to allow for mocking in tests"""
          return model.transcribe(file, **kwargs)

      def _whispermodel(self, model_size=DEFAULT_MODEL_SIZE,
                        device=DEFAULT_DEVICE, compute_type=DEFAULT_COMPUTE_TYPE):
          """Defined here to allow for mocking out in tests"""
          return WhisperModel(model_size, device=device, compute_type=compute_type)

      def _batched_inference_pipeline(self, model):
          """Defined here to allow for mocking out in tests"""
          return BatchedInferencePipeline(model=model)

      def setup(self, words=None):
          """Setup the word feature extractor -- validate input files & config

          Whisper expects a list of words to search for in the audio

          words: the target words; omitting it (or passing None) is the same
          as passing an empty list. A warning is logged when it is empty.
          """
          logger.debug("WordFeatureExtractor setup")

          # None instead of a literal [] default: a default list object would
          # be created once and shared by every call
          if words is None:
              words = []

          # Validate words - raise a notice if none provided
          if not words:
              logger.warning("No words provided for detection")
          self.words = words
          # TODO: consider stripping punctuation since Whisper produces words+punctuation
          # and we might want to strip the punctuation there too

      def run(self):
          """Extract features corresponding to supplied target words (defined in setup) for each input file

          Use Whisper to detect words in the audio, then match these to target words and create features

          A feature spanning the whole segment is appended for every
          whitespace-separated token of the segment text that exactly equals
          a target word.

          Note: if no words are supplied we can exit early
          """
          if not self.words:
              return

          batched = self.DEFAULT_PIPELINE_TYPE == "batched"

          # TODO: consider maybe loglevel notice of estimated time! consider also: max execution time config?
          # TODO: config options for model size, device, compute type
          model = self._whispermodel()  # NB uses defaults, TODO: add config options

          # NOTE: batched not available on pypi yet at time of writing
          if batched:
              transcriber = self._batched_inference_pipeline(model)
              transcribe_kwargs = {"batch_size": self.DEFAULT_BATCH_SIZE}
          else:
              transcriber = model
              transcribe_kwargs = {"beam_size": self.DEFAULT_BEAM_SIZE}

          for file in self.input_files:
              # transcribe the audio file
              segments, _ = self._transcribe(transcriber, file.path, **transcribe_kwargs)

              # process the segments
              # segment has: start, end, text
              for segment in segments:
                  # check if any of the words are in the segment
                  for word in segment.text.split():
                      if word in self.words:
                          self.features.append(Feature(interval=Interval(start=segment.start, end=segment.end),
                                                       source=file, feature_extractor="word",
                                                       score=1.0))