Shortcuts

Source code for minedojo.data.youtube_dataset

from __future__ import annotations

import json
import os

from .download import download as dl
from .download import get_fn


[docs]class YouTubeDataset: """ Class for MineDojo YouTube Database API. We follow PyTorch Dataset format but without actually inheriting from PyTorch dataset to keep the framework general. Args: download: If set to ``True`` and there is no existing cache directory, the data will be downloaded automatically. download_dir: Directory path where the downloaded data will be saved. Default: ``~/.minedojo/``. full: If ``True``, the full version of the YouTube database will be downlaoded. If ``False``, only the tutorial version of the YouTube database will be downloaded. Default: ``True``. Examples: >>> from minedojo.data import YouTubeDataset >>> youtube_dataset = YouTubeDataset() >>> print(youtube_dataset[0].keys()) dict_keys(['id', 'title', 'link', 'view_count', 'like_count', 'duration', 'fps']) """ def __init__( self, *, download: bool = True, download_dir: None | str = None, full: bool = True, ): if download_dir is None: download_dir = os.path.join(os.path.expanduser("~"), ".minedojo") if download: self.root = dl("youtube", download_dir, full) else: self.root, _, url = get_fn("youtube", download_dir, full) assert os.path.exists(self.root), ( f"YouTube data file {self.root} does not exist. " "Please set download=True or you can manually " f"download it from {url}." ) with open(self.root, "r") as f: self.data = json.load(f) def __len__(self): return len(self.data) def __getitem__(self, idx): return self.data[idx]