Du kannst nicht mehr als 25 Themen auswählen Themen müssen entweder mit einem Buchstaben oder einer Ziffer beginnen. Sie können Bindestriche („-“) enthalten und bis zu 35 Zeichen lang sein.

feature_extractors.py 15 KiB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392
  1. from abc import ABC
  2. import json
  3. import logging
  4. import os
  5. import random
  6. import subprocess
  7. from ast import literal_eval
  8. from pipeline.utils import SourceMedia, Source, Feature, Interval
  9. # for loudness detection
  10. import soundfile
  11. import pyloudnorm
  12. logger = logging.getLogger(__name__)
  13. class FeatureExtractor(ABC):
  14. """Feature extractor interface."""
  15. # TODO: #API -- decide if .features will be a member variable
  16. def _run_get_output(self, cmd: list, cwd:str=".") -> str:
  17. """Run a command and return the output as a string
  18. Defined to be mocked out in tests via unittest.mock.patch
  19. """
  20. return subprocess.run(cmd, stdout=subprocess.PIPE, cwd=cwd).stdout.decode("utf-8")
  21. def setup(self):
  22. """Setup the feature extractor -- validate input files & config"""
  23. def run(self):
  24. """Run the feature extractor -- extract features"""
  25. def teardown(self):
  26. """Teardown the feature extractor -- clean up any temporary files created during setup"""
  27. class LaughterFeatureExtractor(FeatureExtractor):
  28. """Feature extractor for laughter detection.
  29. This class is responsible for extracting features corresponding to laughter in media files.
  30. Here:
  31. setup() is used to validate input files & config, which may involve processing video files to extract audio
  32. run() is used to extract features from the audio using jrgillick's laughter-detection
  33. teardown() is used to clean up any temporary files created during setup according to the config
  34. See: https://github.com/jrgillick/laughter-detection for the laughter-detection library
  35. """
  36. def __init__(self, input_files=None, config=None):
  37. """It is expected that input_files is a SourceMedia object"""
  38. self.input_files = input_files
  39. self.config = config
  40. self.features = []
  41. def _laughdetect(self, audio_file) -> list:
  42. """Run laughter detection on the audio file
  43. Returns a list of 2-tuples, each representing a laugh instance in the audio file
  44. """
  45. laugh_detector_dir = "/home/robert/mounts/980data/code/laughter-detection/"
  46. laugh_detector_script = "segment_laughter.py"
  47. # fake output for testing
  48. # laugh_detector_path = "tests/fake_segment_laughter.py"
  49. laugh_detector_cmd = ["python", f"{laugh_detector_dir}{laugh_detector_script}",
  50. f"--input_audio_file={audio_file}"]
  51. # run command, capture output, ignore exit status
  52. # use self._run_get_output to allow mocking in tests
  53. laugh_output = self._run_get_output(laugh_detector_cmd, laugh_detector_dir)
  54. # ↑ have to include cwd to keep laughter-detection imports happy
  55. # also, it isn't happy if no output dir is specified but we get laughs so it's grand
  56. # laughs are lines in stdout that start with "instance:", followed by a space and a 2-tuple of floats
  57. # so jump to the 10th character and evaluate the rest of the line
  58. return [literal_eval(instance[10:])
  59. for instance in laugh_output.splitlines()
  60. if instance.startswith("instance: ")]
  61. def _adjust_features(self):
  62. """Adjust features according to config
  63. Generically, this ensures features conform to config - min/max feature length, etc.
  64. In the context of LaughterFeatureExtractor, there is some secret sauce: things that
  65. cause a laugh generally /precede/ the laugh, so we want more team before the detected start
  66. than at the end. For example, for a minimum feature length of 15s, we might prepend 10 seconds,
  67. and append 5 seconds (for example), or 12s and 3s. We may wish to do this pre/post adjustment
  68. for all laughter features found, regardless of length.
  69. TODO: figure out how we're going to handle length adjustments
  70. TODO: config for length adjustments per design doc
  71. TODO: play with numbers more to see what works best
  72. """
  73. PREPEND = 7.0
  74. APPEND = 3.0
  75. for feature in self.features:
  76. # do the pre & post adjustment
  77. feature.interval.move_start(-PREPEND, relative=True)
  78. feature.interval.move_end(APPEND, relative=True)
  79. def setup(self):
  80. """Setup the laughter feature extractor -- validate input files & config
  81. jrgillick's laughter-detection library can work with AV files directly
  82. TODO: validate input files
  83. TODO: handle config
  84. """
  85. logger.debug("LaughterFeatureExtractor setup")
  86. # Validate input files
  87. if not self.input_files:
  88. raise ValueError("No input files provided")
  89. # TODO: convert video to audio if needed
  90. def run(self):
  91. """Extract laughter features for each input file"""
  92. if self.input_files:
  93. for file in self.input_files:
  94. # adjust this call for better test mocking
  95. laughs = self._laughdetect(file.path)
  96. for laugh in laughs:
  97. start, end = laugh
  98. self.features.append(Feature(interval=Interval(start=start, end=end),
  99. source=file, feature_extractor="laughter"))
  100. # TODO: implement options eg minimum feature length
  101. # adjust features
  102. self._adjust_features()
  103. def teardown(self):
  104. pass
  105. class RandomFeatureExtractor(FeatureExtractor):
  106. """Feature extractor for random feature generation.
  107. This class is responsible for generating random features for testing purposes.
  108. Here:
  109. setup() is used to validate input files & config
  110. run() is used to generate random features
  111. teardown() is used to clean up any temporary files created during setup according to the config
  112. """
  113. NUM_FEATURES = 5
  114. MAX_DURATION = 20.0
  115. def __init__(self, input_files=None, config=None):
  116. """It is expected that input_files is a SourceMedia object"""
  117. self.input_files = input_files
  118. self.config = config
  119. self.features = []
  120. def setup(self):
  121. """Setup the random feature extractor -- validate input files & config"""
  122. logger.debug("RandomFeatureExtractor setup")
  123. # Validate input files
  124. if not self.input_files:
  125. raise ValueError("No input files provided")
  126. def run(self):
  127. """Generate random features for each input file"""
  128. # check self.input_files is of type SourceMedia
  129. if not self.input_files or not isinstance(self.input_files, SourceMedia):
  130. raise ValueError("No input files provided")
  131. for file in self.input_files:
  132. for _ in range(self.NUM_FEATURES):
  133. # round to 3 decimal places
  134. duration = random.random() * self.MAX_DURATION
  135. start = random.random() * file.duration() - duration
  136. self.features.append(Feature(interval=Interval(start=start, duration=duration),
  137. source=file, feature_extractor="random"))
  138. def teardown(self):
  139. pass
  140. class LoudAudioFeatureExtractor(FeatureExtractor):
  141. """Feature extractor for loud audio detection.
  142. This class is responsible for extracting features corresponding to loud audio in media files.
  143. Here:
  144. setup() is used to validate input files & config, and extracting audio
  145. run() uses pyloudnorm to detect loud audio
  146. teardown() is used to clean up temporary files created during setup (if specified by config)
  147. """
  148. def __init__(self, input_files=None, config=None):
  149. if not input_files:
  150. raise ValueError("No input files provided!")
  151. self.input_files = input_files
  152. self.config = config
  153. self.features = []
  154. def _audio_file_from_path(self, path: str) -> str:
  155. """Return the audio file path given a video file path
  156. Example:
  157. - in = "/path/to/video.mp4"
  158. - out = "/tmp/video.mp4.wav"
  159. """
  160. OUTPUT_DIR = "/tmp"
  161. return f"{OUTPUT_DIR}/{os.path.basename(path)}.wav"
  162. def _loudnorm(self, audio_file):
  163. """Run pyloudnorm on the audio file"""
  164. data, rate = soundfile.read(audio_file) # load audio (with shape (samples, channels))
  165. meter = pyloudnorm.Meter(rate=rate,block_size=0.3) # create BS.1770 meter
  166. loudnesses = []
  167. loudness_features = []
  168. window_size = int(rate * 0.5) # 500ms
  169. stride_size = int(rate * 0.5) # 500ms -- no overlap
  170. # for w in range(data.shape[0]//100):
  171. # loudnesses.append(meter.integrated_loudness(data[w:w+int(0.3*rate),0:2]))
  172. for w in range(0, len(data)-window_size, stride_size):
  173. window = data[w:w+window_size, 0:2] # extract window
  174. loudnesses.append( (w/rate, meter.integrated_loudness(window)) )
  175. for timecode, loudval in sorted([l for l in loudnesses if float(l[1]) != float("-inf")], key=lambda x: x[1], reverse=True):
  176. # print(f"Timecode: {timecode}, Loudness: {loudval}")
  177. loudness_features.append((timecode, round(loudval, 3))) # round to 3 DP
  178. return loudness_features
  179. def setup(self):
  180. """extract audio from video files to be processed by pyloudnorm
  181. TODO: config -- hardcoded for now
  182. """
  183. # pyloudnorm expects WAV files
  184. for file in self.input_files:
  185. audio_file = self._audio_file_from_path(file.path)
  186. # ffmpeg -i input.mp4 -vn -acodec pcm_s16le output.wav
  187. subprocess.run(["ffmpeg", "-y", "-i", file.path, "-vn", "-acodec", "pcm_s16le", audio_file],
  188. stdout=subprocess.PIPE, stderr=subprocess.PIPE)
  189. def run(self):
  190. """Use pyloudnorm to detect loud audio"""
  191. for file in self.input_files:
  192. audio_file = self._audio_file_from_path(file.path)
  193. loudnesses = self._loudnorm(audio_file)
  194. for time, loudness in loudnesses:
  195. self.features.append(Feature(interval=Interval(start=time, duration=0.500),
  196. source=file, feature_extractor="loudness",
  197. score=loudness))
  198. class VideoActivityFeatureExtractor(FeatureExtractor):
  199. """Feature extractor for video activity detection.
  200. This class is responsible for extracting features corresponding to high activity in video files.
  201. Uses ffmpeg's scdet filter with threshold of zero.
  202. Here:
  203. setup() is used to validate input files & config
  204. run() is used to extract features from the video using OpenCV
  205. teardown() is used to clean up any temporary files created during setup according to the config
  206. #TODO: minimum duration -- consider whether to do here, or expand duration post-consolidation
  207. """
  208. def __init__(self, input_files=None, config=None):
  209. if not input_files:
  210. raise ValueError("No input files provided!")
  211. self.input_files = input_files
  212. self.config = config
  213. self.features = []
  214. def _scdet(self, video_file):
  215. """Run scdet filter on the video file"""
  216. ffmpeg_cmd = ["ffmpeg", "-i", video_file, "-vf", "scdet=threshold=0", "-f", "null", "-"]
  217. # output is of the form:
  218. # [scdet @ 0x7f0798003d00] lavfi.scd.score: 0.031, lavfi.scd.time: 23.65
  219. # [scdet @ 0x7f0798003d00] lavfi.scd.score: 0.006, lavfi.scd.time: 23.70
  220. # capture output, extract time & score
  221. scdet_output = subprocess.run(ffmpeg_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE).stderr.decode("utf-8")
  222. # extract time & score
  223. scores = []
  224. for line in scdet_output.splitlines():
  225. if "lavfi.scd.score" in line:
  226. scores.append( (float(line.split(",")[1].split(":")[1]),
  227. float(line.split(",")[0].split(":")[1]))
  228. )
  229. return scores
  230. def _nonoverlap_mean(self, scores, window_size=0.500) -> list:
  231. """Take the mean of non-overlapping windows of scores
  232. Input: list of tuples in the format (time, score)
  233. Output: list of tuples in the format (time, mean_score) (reduced set)
  234. """
  235. means = []
  236. current_window = []
  237. current_window_start = 0.0
  238. for time, score in scores:
  239. if time - current_window_start > window_size:
  240. # calculate mean of current window
  241. mean_score = sum([s for _, s in current_window]) / len(current_window)
  242. means.append((current_window_start, round(mean_score, 3)))
  243. # reset window
  244. current_window = []
  245. current_window_start = time
  246. current_window.append((time, score))
  247. return means
  248. def _drop_lowest(self, scores, percent=33):
  249. """Drop the lowest n% scores from the list"""
  250. scores = sorted(scores, key=lambda x: x[1], reverse=True)
  251. return scores[:int(len(scores) * (percent / 100))]
  252. def setup(self):
  253. pass
  254. def run(self):
  255. for file in self.input_files:
  256. scores = self._scdet(file.path)
  257. means = sorted(self._nonoverlap_mean(scores), key=lambda x: x[1], reverse=True)
  258. for time, score in self._drop_lowest(means, 66):
  259. self.features.append(Feature(interval=Interval(start=time, duration=0.500),
  260. source=file, feature_extractor="videoactivity",
  261. score=score))
  262. def teardown(self):
  263. pass
  264. class JSONFeatureExtractor(FeatureExtractor):
  265. """(Re-)create features from a JSON file
  266. The JSON file can have one of two formats:
  267. - the format produced by the pipleline (@see: video_producers.py:JSONProducer)
  268. - a simplified format which is easier for manual creation
  269. """
  270. def __init__(self, input_files=None, config=None):
  271. if not input_files:
  272. raise ValueError("No input files provided!")
  273. self.input_files = input_files
  274. self.config = config
  275. self.features = []
  276. def setup(self):
  277. pass
  278. def _interval_from_dict(self, d):
  279. return Interval(start=d["start"], duration=d["duration"])
  280. def _source_from_dict(self, d):
  281. return Source(d["source"], d["path"], d["provider"])
  282. def _read_json_from_file(self, file):
  283. """Read a JSON file and return the contents
  284. Method exists to allow for mocking in tests
  285. """
  286. with open(file, "r") as f:
  287. return json.load(f)
  288. def run(self):
  289. # only pipeline JSON format for now
  290. # TODO: add support for simplified format
  291. for file in self.input_files:
  292. features_from_json = self._read_json_from_file(file.path)
  293. for feature in features_from_json:
  294. self.features.append(Feature(interval=self._interval_from_dict(feature["interval"]),
  295. source=self._source_from_dict(feature["source"]),
  296. feature_extractor=feature["feature_extractor"],
  297. score=feature["score"]))
  298. def teardown(self):
  299. pass