# Source code for tensorbay.opendataset.Newsgroups20.loader

#!/usr/bin/env python3
#
# Copyright 2021 Graviti. Licensed under MIT License.
#
# pylint: disable=invalid-name

"""Dataloader of Newsgroups20 dataset."""

import os

from tensorbay.dataset import Data, Dataset
from tensorbay.label import Classification
from tensorbay.opendataset._utility import glob

DATASET_NAME = "Newsgroups20"

# Sentence tail shared by the two "bydate" segment descriptions.
_BYDATE_NOTE = "which is sorted by date and has duplicates and some headers removed"

# Maps every known segment folder name to its human-readable description.
# Insertion order matters: the loader walks the segments in this order.
SEGMENT_DESCRIPTION_DICT = {
    "20_newsgroups": "Original 20 Newsgroups data set",
    "20news-bydate-train": (
        "Training set of the second version of 20 Newsgroups, " + _BYDATE_NOTE
    ),
    "20news-bydate-test": (
        "Test set of the second version of 20 Newsgroups, " + _BYDATE_NOTE
    ),
    "20news-18828": (
        "The third version of 20 Newsgroups, which has duplicates removed"
        " and includes only 'From' and 'Subject' headers"
    ),
}


def Newsgroups20(path: str) -> Dataset:
    """`20 Newsgroups <http://qwone.com/~jason/20Newsgroups/>`_ dataset.

    One segment is created for every known segment folder (the keys of
    ``SEGMENT_DESCRIPTION_DICT``) that exists under ``path``; folders that
    are missing are skipped. Inside a segment folder, every entry located at
    ``<category>/<name>`` becomes one data item whose classification label is
    the name of its parent directory and whose remote path is
    ``<category>/<name>.txt``.

    The folder structure should be like::

        <path>
            20news-18828/
                alt.atheism/
                    49960
                    51060
                    51119
                    51120
                    ...
                comp.graphics/
                comp.os.ms-windows.misc/
                comp.sys.ibm.pc.hardware/
                comp.sys.mac.hardware/
                comp.windows.x/
                misc.forsale/
                rec.autos/
                rec.motorcycles/
                rec.sport.baseball/
                rec.sport.hockey/
                sci.crypt/
                sci.electronics/
                sci.med/
                sci.space/
                soc.religion.christian/
                talk.politics.guns/
                talk.politics.mideast/
                talk.politics.misc/
                talk.religion.misc/
            20news-bydate-test/
            20news-bydate-train/
            20_newsgroups/

    Arguments:
        path: The root directory of the dataset.

    Returns:
        Loaded :class:`~tensorbay.dataset.dataset.Dataset` instance.

    """
    # Accept "~"-prefixed and relative paths alike.
    root_path = os.path.abspath(os.path.expanduser(path))
    dataset = Dataset(DATASET_NAME)
    # The catalog file is expected to sit next to this module.
    dataset.load_catalog(os.path.join(os.path.dirname(__file__), "catalog.json"))

    for segment_name, segment_description in SEGMENT_DESCRIPTION_DICT.items():
        segment_path = os.path.join(root_path, segment_name)
        # Only load the dataset versions that are actually present on disk.
        if not os.path.isdir(segment_path):
            continue

        segment = dataset.create_segment(segment_name)
        segment.description = segment_description

        # Match entries exactly two levels down: <category>/<name>.
        text_paths = glob(os.path.join(segment_path, "*", "*"))
        for text_path in text_paths:
            # The parent directory name is the newsgroup, i.e. the class label.
            category = os.path.basename(os.path.dirname(text_path))
            file_name = os.path.basename(text_path)

            # The local entries carry no extension; ".txt" is appended for the
            # remote path only.
            data = Data(text_path, target_remote_path=f"{category}/{file_name}.txt")
            data.label.classification = Classification(category)
            segment.append(data)

    return dataset