import subprocess class SourceMedia(): """Source media used by eg feature extractors. This is a list of Source objects. JSON type schema: [{ "source": "/path/to/video.mp4", "path": "/path/to/video.mp4", "provider": "FileInputJSON" }, { "source": "http://example.com/video.mp4", "path": "/path/to/downloaded_video.mp4", "provider": "InputYAML" }] It should be possible to combine/merge/aggregate multiple SourceMedia into one TODO: consider if we actually want that or if we just loop over a list of >0 SourceMedia Iterating over a SourceMedia object should return a list of Source objects. """ def __init__(self, sources=[]): self.sources = sources def __iter__(self): return iter(self.sources) class Source(): """A Source is a single media file (eg), used to populate SourceMedia objects. JSON type schema: { "source": "/path/to/video.mp4", "path": "/path/to/video.mp4", "provider": "FileInputJSON" } Instance variables: source -- the source of the media file (eg, a URL or a local path) path -- the path to the media file provider -- the provider of the media file (eg, "FileInputJSON") Accessing the object should return the path to the media file. Methods: duration() -- return the duration of the media file (uses ffprobe, result is cached) Notes: - source and path may be the same, for example in the case of a local file """ _duration = None def __init__(self, source, path, provider): if not source: raise ValueError("Source must be provided") # TODO: #API -- decide if this is necessary self.source = source if not path: # we need a file to work on for the rest of the pipeline raise ValueError("Path must be provided") self.path = path if not provider: raise ValueError("Provider must be provided") # TODO: #API -- decide if this is necessary self.provider = provider def __str__(self): """See: 'accessing the object should return the path to the media file'""" return self.path def __repr__(self): return f"Source({self.source}, {self.path}, {self.provider})" def duration(self): """Return the duration of the media file at self.path (result is cached)""" return self._duration or self._get_duration(self.path) def _get_duration(self, file): """Use ffprobe to get the duration of the media file at self.path and cache result (_duration) usage: ffprobe -v error -show_entries format=duration -of default=noprint_wrappers=1:nokey=1 """ # test if file exists try: with open(file) as _: pass except FileNotFoundError: raise FileNotFoundError(f"File not found: {file}") # cache the result self._duration = 0.0 or float(subprocess.check_output(["ffprobe", "-v", "error", "-show_entries", "format=duration", "-of", "default=noprint_wrappers=1:nokey=1", file])) return self._duration def to_json(self): """Return a dict representation of the source for JSON encoding @see video_producers.py:PipelineJSONEncoder """ return dict(source=self.source, path=self.path, provider=self.provider) class Interval(): """An interval of time in a media file This can be defined by a start and end time, a start time and a duration, or an end time and a duration. Instance variables: start -- the start time of the interval end -- the end time of the interval duration -- the duration of the interval (end - start) Notes: Sorts by start time, then end time """ # TODO: decide if ABC or will be used directly # TODO: have default duration for intervals set by config # TODO: consider if we want to permit adjusting intervals (eg, start time, end time, duration) [probably yes] # NOTE: if we have more ways of defining, we could consider multipledispatch? # TODO: consider if we want to keep a reference to the media file (source) in the interval DEFAULT_DURATION = 5 # seconds DEFAUT_PRECISION = 3 # decimal places def __init__(self, start=None, end=None, duration=None): if start is None and end is None and duration is None: raise ValueError("Two of start, end, or duration must be provided") if start is not None and end is not None and duration is not None: raise ValueError("Only two of start, end, or duration may be provided") # start and end if start is not None and end is not None: # some trivial validation if start > end: raise ValueError("Start time must be before end time") self.start = start self.end = end self.duration = end - start # start and duration elif start is not None and duration is not None: if duration < 0: raise ValueError("Duration must be positive") self.start = start self.duration = duration self.end = start + duration # end and duration elif end is not None and duration is not None: if duration < 0: raise ValueError("Duration must be positive") self.end = end self.duration = duration self.start = end - duration # set precision self.start = round(self.start, self.DEFAUT_PRECISION) self.end = round(self.end, self.DEFAUT_PRECISION) self.duration = round(self.duration, self.DEFAUT_PRECISION) @classmethod def from_start(cls, start=None): """Create an interval from a start time using the default duration""" return cls(start=start, duration=cls.DEFAULT_DURATION) @classmethod def from_end(cls, end=None): """Create an interval from an end time using the default duration""" return cls(end=end, duration=cls.DEFAULT_DURATION) def __repr__(self): return f"Interval({self.start}, {self.end}, {self.duration})" def __lt__(self, other): if self.start == other.start: return self.end < other.end return self.start < other.start def to_json(self): """Return a dict representation of the interval for JSON encoding @see video_producers.py:PipelineJSONEncoder """ return dict(start=self.start, end=self.end, duration=self.duration) # -------------------------------------------------------------- # TODO: handle bad cases, eg negative duration, start > end, etc # -------------------------------------------------------------- def move_start(self, new_start: float | int, relative: bool = False): """Update start time of Interval, keeping end time constant (& so modify duration)""" if relative: self.start += new_start else: self.start = new_start self.duration = round((self.end - self.start), self.DEFAUT_PRECISION) def move_end(self, new_end: float | int, relative: bool = False): """Update end time of Interval, keeping start time constant (& so modify duration)""" if relative: self.end += new_end else: self.end = new_end self.duration = round((self.end - self.start), self.DEFAUT_PRECISION) def update_duration(self, new_duration: float | int, relative: bool = False): """Update duration of Interval, keeping start time constant (& so modify end time)""" if relative: self.duration += new_duration else: self.duration = new_duration self.end = self.start + self.duration class Feature(): """A feature extracted from a media file ("has a" Interval) This extends intervals by adding other fields, such as the feature source and 'score' Instance variables: interval -- Interval: time of feature in the media file source -- the source of the feature (ie feature extractor) (default: "unknown") path -- the path to the media file score -- the score of the feature (eg laughter confidence score, [0, 1] = { x ∈ ℝ | 0 ≤ x ≤ 1 }) (default: 0.0) Notes: - score is notionally in the closed interval [0, 1], but this is not enforced -- it is up to the feature extractor to ensure this (or use scores outside this range if desired -- eg a feature manually selected by user input might have a score of 2.0 so it is sorted 'above' other features) - sorts based on interval, then source, then score - path should never be unknown, since we need it to make clips from """ # TODO: consider renaming score to something more generic def __init__(self, interval=None, source=None, score=None, path=None): """Create a feature with an interval, source, and score Expects a ready-made interval; source and score are optional """ if interval is None: raise ValueError("Interval must be provided") self.interval = interval if path is None: raise ValueError("Path must be provided") self.path = path if source is None: source = "unknown" self.source = source if score is None: score = 0.0 self.score = score # classmethods for creating a feature with an interval directly # which delegate to the Interval class :) @classmethod def from_start(cls, start=None, source=None, score=None, path=None): return cls(interval=Interval.from_start(start), source=source, score=score, path=path) @classmethod def from_end(cls, end=None, source=None, score=None, path=None): return cls(interval=Interval.from_end(end), source=source, score=score, path=path) def __repr__(self): return f"Feature({self.interval}, {self.source}, {self.score})" def __lt__(self, other): """Sort based on interval, then source, then score""" if self.interval == other.interval: if self.source == other.source: return self.score < other.score return self.source < other.source return self.interval < other.interval def to_json(self): """Return a dict representation of the feature for JSON encoding @see video_producers.py:PipelineJSONEncoder """ return dict(interval=self.interval.to_json(), source=self.source.to_json(), feature_extractor=self.feature_extractor, score=self.score)