Não pode escolher mais do que 25 tópicos Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.

309 linhas
12 KiB

  1. from abc import ABC
  2. import logging
  3. import os
  4. import random
  5. import subprocess
  6. from ast import literal_eval
  7. from pipeline.utils import SourceMedia, Feature, Interval
  8. # for loudness detection
  9. import soundfile
  10. import pyloudnorm
  11. logger = logging.getLogger(__name__)
  12. class FeatureExtractor(ABC):
  13. """Feature extractor interface."""
  14. # TODO: #API -- decide if .features will be a member variable
  15. def setup(self):
  16. pass
  17. def run(self):
  18. pass
  19. def teardown(self):
  20. pass
  21. class LaughterFeatureExtractor(FeatureExtractor):
  22. """Feature extractor for laughter detection.
  23. This class is responsible for extracting features corresponding to laughter in media files.
  24. Here:
  25. setup() is used to validate input files & config, which may involve processing video files to extract audio
  26. run() is used to extract features from the audio using jrgillick's laughter-detection
  27. teardown() is used to clean up any temporary files created during setup according to the config
  28. See: https://github.com/jrgillick/laughter-detection for the laughter-detection library
  29. """
  30. def __init__(self, input_files=None, config=None):
  31. """It is expected that input_files is a SourceMedia object"""
  32. self.input_files = input_files
  33. self.config = config
  34. self.features = []
  35. def _laughdetect(self, audio_file):
  36. """Run laughter detection on the audio file"""
  37. laugh_detector_dir = "/home/robert/mounts/980data/code/laughter-detection/"
  38. laugh_detector_script = "segment_laughter.py"
  39. # fake output for testing
  40. # laugh_detector_path = "tests/fake_segment_laughter.py"
  41. laugh_detector_cmd = ["python", f"{laugh_detector_dir}{laugh_detector_script}",
  42. f"--input_audio_file={audio_file}"]
  43. # run command, capture output, ignore exit status
  44. laugh_output = subprocess.run(laugh_detector_cmd,
  45. stdout=subprocess.PIPE,
  46. cwd=laugh_detector_dir).stdout.decode("utf-8")
  47. # ↑ have to include cwd to keep laughter-detection imports happy
  48. # also, it isn't happy if no output dir is specified but we get laughs so it's grand
  49. # laughs are lines in stdout that start with "instance:", followed by a space and a 2-tuple of floats
  50. # so jump to the 10th character and evaluate the rest of the line
  51. return [literal_eval(instance[10:])
  52. for instance in laugh_output.splitlines()
  53. if instance.startswith("instance: ")]
  54. def _adjust_features(self):
  55. """Adjust features according to config
  56. Generically, this ensures features conform to config - min/max feature length, etc.
  57. In the context of LaughterFeatureExtractor, there is some secret sauce: things that
  58. cause a laugh generally /precede/ the laugh, so we want more team before the detected start
  59. than at the end. For example, for a minimum feature length of 15s, we might prepend 10 seconds,
  60. and append 5 seconds (for example), or 12s and 3s. We may wish to do this pre/post adjustment
  61. for all laughter features found, regardless of length.
  62. TODO: figure out how we're going to handle length adjustments
  63. TODO: config for length adjustments per design doc
  64. TODO: play with numbers more to see what works best
  65. """
  66. PREPEND = 7.0
  67. APPEND = 3.0
  68. for feature in self.features:
  69. # do the pre & post adjustment
  70. feature.interval.move_start(-PREPEND, relative=True)
  71. feature.interval.move_end(APPEND, relative=True)
  72. def setup(self):
  73. """Setup the laughter feature extractor -- validate input files & config
  74. jrgillick's laughter-detection library can work with AV files directly
  75. TODO: validate input files
  76. TODO: handle config
  77. """
  78. logger.debug("LaughterFeatureExtractor setup")
  79. # Validate input files
  80. if not self.input_files:
  81. raise ValueError("No input files provided")
  82. # TODO: convert video to audio if needed
  83. def run(self):
  84. """Extract laughter features for each input file"""
  85. if self.input_files:
  86. for file in self.input_files:
  87. laughs = self._laughdetect(file.path)
  88. for laugh in laughs:
  89. start, end = laugh
  90. self.features.append(Feature(interval=Interval(start=start, end=end),
  91. source="laughter", path=file.path))
  92. # TODO: implement options eg minimum feature length
  93. # adjust features
  94. self._adjust_features()
  95. def teardown(self):
  96. pass
  97. class RandomFeatureExtractor(FeatureExtractor):
  98. """Feature extractor for random feature generation.
  99. This class is responsible for generating random features for testing purposes.
  100. Here:
  101. setup() is used to validate input files & config
  102. run() is used to generate random features
  103. teardown() is used to clean up any temporary files created during setup according to the config
  104. """
  105. NUM_FEATURES = 5
  106. MAX_DURATION = 20.0
  107. def __init__(self, input_files=None, config=None):
  108. """It is expected that input_files is a SourceMedia object"""
  109. self.input_files = input_files
  110. self.config = config
  111. self.features = []
  112. def setup(self):
  113. """Setup the random feature extractor -- validate input files & config"""
  114. logger.debug("RandomFeatureExtractor setup")
  115. # Validate input files
  116. if not self.input_files:
  117. raise ValueError("No input files provided")
  118. def run(self):
  119. """Generate random features for each input file"""
  120. # check self.input_files is of type SourceMedia
  121. if not self.input_files or not isinstance(self.input_files, SourceMedia):
  122. raise ValueError("No input files provided")
  123. for file in self.input_files:
  124. for _ in range(self.NUM_FEATURES):
  125. # round to 3 decimal places
  126. duration = random.random() * self.MAX_DURATION
  127. start = random.random() * file.duration() - duration
  128. self.features.append(Feature(interval=Interval(start=start, duration=duration),
  129. source="random", path=file.path))
  130. class LoudAudioFeatureExtractor(FeatureExtractor):
  131. """Feature extractor for loud audio detection.
  132. This class is responsible for extracting features corresponding to loud audio in media files.
  133. Here:
  134. setup() is used to validate input files & config, and extracting audio
  135. run() uses pyloudnorm to detect loud audio
  136. teardown() is used to clean up temporary files created during setup (if specified by config)
  137. """
  138. def __init__(self, input_files=None, config=None):
  139. if not input_files:
  140. raise ValueError("No input files provided!")
  141. self.input_files = input_files
  142. self.config = config
  143. self.features = []
  144. def _audio_file_from_path(self, path: str) -> str:
  145. """Return the audio file path given a video file path
  146. Example:
  147. - in = "/path/to/video.mp4"
  148. - out = "/tmp/video.mp4.wav"
  149. """
  150. OUTPUT_DIR = "/tmp"
  151. return f"{OUTPUT_DIR}/{os.path.basename(path)}.wav"
  152. def _loudnorm(self, audio_file):
  153. """Run pyloudnorm on the audio file"""
  154. data, rate = soundfile.read(audio_file) # load audio (with shape (samples, channels))
  155. meter = pyloudnorm.Meter(rate=rate,block_size=0.3) # create BS.1770 meter
  156. loudnesses = []
  157. loudness_features = []
  158. window_size = int(rate * 0.5) # 500ms
  159. stride_size = int(rate * 0.5) # 500ms -- no overlap
  160. # for w in range(data.shape[0]//100):
  161. # loudnesses.append(meter.integrated_loudness(data[w:w+int(0.3*rate),0:2]))
  162. for w in range(0, len(data)-window_size, stride_size):
  163. window = data[w:w+window_size, 0:2] # extract window
  164. loudnesses.append( (w/rate, meter.integrated_loudness(window)) )
  165. for timecode, loudval in sorted([l for l in loudnesses if float(l[1]) != float("-inf")], key=lambda x: x[1], reverse=True):
  166. # print(f"Timecode: {timecode}, Loudness: {loudval}")
  167. loudness_features.append((timecode, round(loudval, 3))) # round to 3 DP
  168. return loudness_features
  169. def setup(self):
  170. """extract audio from video files to be processed by pyloudnorm
  171. TODO: config -- hardcoded for now
  172. """
  173. # pyloudnorm expects WAV files
  174. for file in self.input_files:
  175. audio_file = self._audio_file_from_path(file.path)
  176. # ffmpeg -i input.mp4 -vn -acodec pcm_s16le output.wav
  177. subprocess.run(["ffmpeg", "-y", "-i", file.path, "-vn", "-acodec", "pcm_s16le", audio_file],
  178. stdout=subprocess.PIPE, stderr=subprocess.PIPE)
  179. def run(self):
  180. """Use pyloudnorm to detect loud audio"""
  181. for file in self.input_files:
  182. audio_file = self._audio_file_from_path(file.path)
  183. loudnesses = self._loudnorm(audio_file)
  184. for time, loudness in loudnesses:
  185. self.features.append(Feature(interval=Interval(start=time, duration=0.500),
  186. source=file, feature_extractor="loudness",
  187. score=loudness))
  188. class VideoActivityFeatureExtractor(FeatureExtractor):
  189. """Feature extractor for video activity detection.
  190. This class is responsible for extracting features corresponding to high activity in video files.
  191. Uses ffmpeg's scdet filter with threshold of zero.
  192. Here:
  193. setup() is used to validate input files & config
  194. run() is used to extract features from the video using OpenCV
  195. teardown() is used to clean up any temporary files created during setup according to the config
  196. #TODO: minimum duration -- consider whether to do here, or expand duration post-consolidation
  197. """
  198. def __init__(self, input_files=None, config=None):
  199. if not input_files:
  200. raise ValueError("No input files provided!")
  201. self.input_files = input_files
  202. self.config = config
  203. self.features = []
  204. def _scdet(self, video_file):
  205. """Run scdet filter on the video file"""
  206. ffmpeg_cmd = ["ffmpeg", "-i", video_file, "-vf", "scdet=threshold=0", "-f", "null", "-"]
  207. # output is of the form:
  208. # [scdet @ 0x7f0798003d00] lavfi.scd.score: 0.031, lavfi.scd.time: 23.65
  209. # [scdet @ 0x7f0798003d00] lavfi.scd.score: 0.006, lavfi.scd.time: 23.70
  210. # capture output, extract time & score
  211. scdet_output = subprocess.run(ffmpeg_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE).stderr.decode("utf-8")
  212. # extract time & score
  213. scores = []
  214. for line in scdet_output.splitlines():
  215. if "lavfi.scd.score" in line:
  216. scores.append( (float(line.split(",")[1].split(":")[1]),
  217. float(line.split(",")[0].split(":")[1]))
  218. )
  219. return scores
  220. def _drop_lowest(self, scores, percent=33):
  221. """Drop the lowest n% scores from the list"""
  222. scores = sorted(scores, key=lambda x: x[1], reverse=True)
  223. return scores[:int(len(scores) * (percent / 100))]
  224. def setup(self):
  225. pass
  226. def run(self):
  227. for file in self.input_files:
  228. scores = self._scdet(file.path)
  229. means = sorted(self._nonoverlap_mean(scores), key=lambda x: x[1], reverse=True)
  230. for time, score in self._drop_lowest(means, 66):
  231. self.features.append(Feature(interval=Interval(start=time, duration=0.500),
  232. source=file, feature_extractor="videoactivity",
  233. score=score))
  234. def teardown(self):
  235. pass
  236. def teardown(self):
  237. pass