KaggleDataSource

Requires the kagglehub library.

Bases: DataSource

A DataSource for reading Kaggle datasets in Spark.

This data source allows reading datasets from Kaggle directly into Spark DataFrames.

Name: kaggle

Options
  • handle: The dataset handle on Kaggle, in the form of {owner_slug}/{dataset_slug} or {owner_slug}/{dataset_slug}/versions/{version_number}.
  • path: The path to a file within the dataset.
  • username: The Kaggle username for authentication.
  • key: The Kaggle API key for authentication.
Notes
  • The kagglehub library is required to use this data source. Make sure it is installed.
  • To read private datasets or datasets that require user authentication, username and key must be provided.
  • Currently all data is read from a single partition.

Examples:

Register the data source.

>>> from pyspark_datasources import KaggleDataSource
>>> spark.dataSource.register(KaggleDataSource)

Load a public dataset from Kaggle.

>>> spark.read.format("kaggle").options(handle="yasserh/titanic-dataset").load("Titanic-Dataset.csv").select("Name").show()
+--------------------+
|                Name|
+--------------------+
|Braund, Mr. Owen ...|
|Cumings, Mrs. Joh...|
|...                 |
+--------------------+

Load a private dataset with authentication.

>>> spark.read.format("kaggle").options(
...     username="myaccount",
...     key="<token>",
...     handle="myaccount/my-private-dataset",
... ).load("file.csv").show()
Source code in pyspark_datasources/kaggle.py
import tempfile
from functools import cached_property
from typing import TYPE_CHECKING

from pyspark.sql.datasource import DataSource
from pyspark.sql.pandas.types import from_arrow_schema
from pyspark.sql.types import StructType

if TYPE_CHECKING:
    import pyarrow as pa


class KaggleDataSource(DataSource):
    """
    A DataSource for reading Kaggle datasets in Spark.

    This data source allows reading datasets from Kaggle directly into Spark DataFrames.

    Name: `kaggle`

    Options
    -------
    - `handle`: The dataset handle on Kaggle, in the form of `{owner_slug}/{dataset_slug}`
        or `{owner_slug}/{dataset_slug}/versions/{version_number}`
    - `path`: The path to a file within the dataset.
    - `username`: The Kaggle username for authentication.
    - `key`: The Kaggle API key for authentication.

    Notes
    -----
    - The `kagglehub` library is required to use this data source. Make sure it is installed.
    - To read private datasets or datasets that require user authentication, `username` and `key` must be provided.
    - Currently all data is read from a single partition.

    Examples
    --------
    Register the data source.

    >>> from pyspark_datasources import KaggleDataSource
    >>> spark.dataSource.register(KaggleDataSource)

    Load a public dataset from Kaggle.

    >>> spark.read.format("kaggle").options(handle="yasserh/titanic-dataset").load("Titanic-Dataset.csv").select("Name").show()
    +--------------------+
    |                Name|
    +--------------------+
    |Braund, Mr. Owen ...|
    |Cumings, Mrs. Joh...|
    |...                 |
    +--------------------+

    Load a private dataset with authentication.

    >>> spark.read.format("kaggle").options(
    ...     username="myaccount",
    ...     key="<token>",
    ...     handle="myaccount/my-private-dataset",
    ... ).load("file.csv").show()
    """

    @classmethod
    def name(cls) -> str:
        return "kaggle"

    @cached_property
    def _data(self) -> "pa.Table":
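        # Download the dataset once per instance and cache the result as a
        # PyArrow table.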
        import ast
        import os

        import pyarrow as pa

        handle = self.options.pop("handle")
        path = self.options.pop("path")
        username = self.options.pop("username", None)
        key = self.options.pop("key", None)
        if username or key:
            if not (username and key):
                raise ValueError(
                    "Both username and key must be provided to authenticate."
                )
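            # kagglehub reads Kaggle credentials from these environment variables.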
            os.environ["KAGGLE_USERNAME"] = username
            os.environ["KAGGLE_KEY"] = key

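        # Any remaining options are parsed as Python literals and forwarded
        # as keyword arguments to kagglehub.dataset_load.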
        kwargs = {k: ast.literal_eval(v) for k, v in self.options.items()}

        # Cache in a temporary directory to avoid writing to ~ which may be read-only
        with tempfile.TemporaryDirectory() as tmpdir:
            os.environ["KAGGLEHUB_CACHE"] = tmpdir
            import kagglehub

            df = kagglehub.dataset_load(
                kagglehub.KaggleDatasetAdapter.PANDAS,
                handle,
                path,
                **kwargs,
            )
            return pa.Table.from_pandas(df)

    def schema(self) -> StructType:
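        # Deriving the schema from the Arrow table triggers the dataset
        # download on the driver the first time it is accessed.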
        return from_arrow_schema(self._data.schema)

    def reader(self, schema: StructType) -> "KaggleDataReader":
        return KaggleDataReader(self)
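
The KaggleDataReader class returned above is not shown on this page. A minimal sketch of a compatible reader, assuming (consistent with the single-partition note) that it simply streams the cached Arrow table's record batches; this is an illustration, not the library's exact source:

from pyspark.sql.datasource import DataSourceReader


class KaggleDataReader(DataSourceReader):
    def __init__(self, source: KaggleDataSource):
        self.source = source

    def read(self, partition):
        # PySpark's Python data source API accepts pyarrow.RecordBatch
        # objects yielded from read(); all rows come from one partition.
        yield from self.source._data.to_batches()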