# Source code for minedojo.data.youtube_dataset

from __future__ import annotations

import json
import os

from .download import download as dl
from .download import get_fn


class YouTubeDataset:
    """
    Class for MineDojo YouTube Database API.
    We follow PyTorch Dataset format but without actually inheriting from PyTorch dataset to keep the framework general.

    Args:
        download: If set to ``True`` and there is no existing cache directory, the data will be downloaded automatically.

        download_dir: Directory path where the downloaded data will be saved.
                Default: ``~/.minedojo/``.

        full: If ``True``, the full version of the YouTube database will be downloaded.
                If ``False``, only the tutorial version of the YouTube database will be downloaded.
                Default: ``True``.

    Raises:
        AssertionError: If ``download`` is ``False`` and the expected data file
                does not exist on disk.

    Examples:
        >>> from minedojo.data import YouTubeDataset
        >>> youtube_dataset = YouTubeDataset()
        >>> print(youtube_dataset[0].keys())
        dict_keys(['id', 'title', 'link', 'view_count', 'like_count', 'duration', 'fps'])
    """

    def __init__(
        self,
        *,
        download: bool = True,
        download_dir: None | str = None,
        full: bool = True,
    ):
        if download_dir is None:
            download_dir = os.path.join(os.path.expanduser("~"), ".minedojo")

        if download:
            # The download helper's return value is used as the path of the data file.
            self.root = dl("youtube", download_dir, full)
        else:
            self.root, _, url = get_fn("youtube", download_dir, full)
            # Raised explicitly rather than via ``assert`` so the check (and its
            # helpful message) survives ``python -O``; the exception type is kept
            # as AssertionError for callers that already handle it.
            if not os.path.exists(self.root):
                raise AssertionError(
                    f"YouTube data file {self.root} does not exist. "
                    "Please set download=True or you can manually "
                    f"download it from {url}."
                )

        # JSON text is UTF-8; do not rely on the platform's locale encoding,
        # which can mis-decode or reject non-ASCII content (e.g. on Windows).
        with open(self.root, "r", encoding="utf-8") as f:
            # Presumably a list of per-video metadata dicts (see the class
            # example) -- the exact schema is defined by the data file.
            self.data = json.load(f)

    def __len__(self) -> int:
        """Return the number of entries in the loaded data."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the entry of the loaded data at ``idx``."""
        return self.data[idx]