#!/usr/bin/env python3
#
# Copyright 2021 Graviti. Licensed under MIT License.
#
# pylint: disable=invalid-name

"""Dataloader of THCHS-30 dataset."""

import os
from itertools import islice
from typing import List

from tensorbay.dataset import Data, Dataset
from tensorbay.label import LabeledSentence, SentenceSubcatalog, Word
from tensorbay.opendataset._utility import glob

DATASET_NAME = "THCHS-30"
_SEGMENT_NAME_LIST = ("train", "dev", "test")


def THCHS30(path: str) -> Dataset:
    """`THCHS-30 <http://166.111.134.19:7777/data/thchs30/README.html>`_ dataset.

    The file structure should be like::

        <path>
            lm_word/
                lexicon.txt
            data/
                A11_0.wav.trn
                ...
            dev/
                A11_101.wav
                ...
            train/
            test/

    Arguments:
        path: The root directory of the dataset.

    Returns:
        Loaded :class:`~tensorbay.dataset.dataset.Dataset` instance.

    """
    dataset = Dataset(DATASET_NAME)
    dataset.catalog.sentence = _get_subcatalog(os.path.join(path, "lm_word", "lexicon.txt"))
    for segment_name in _SEGMENT_NAME_LIST:
        segment = dataset.create_segment(segment_name)
        for filename in glob(os.path.join(path, segment_name, "*.wav")):
            data = Data(filename)
            label_file = os.path.join(path, "data", os.path.basename(filename) + ".trn")
            data.label.sentence = _get_label(label_file)
            segment.append(data)
    return dataset
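

# For reference, a THCHS-30 transcription (.trn) file holds three lines per
# utterance -- the Chinese sentence, its pinyin spelling, and its phoneme
# sequence -- which _get_label below unpacks, in order, into the sentence,
# spell, and phone fields of a LabeledSentence. A minimal sketch of such a
# file (the contents are illustrative, not copied from the dataset):
#
#     绿 是 阳春 烟 景 大块 文章 的 底色
#     lv4 shi4 yang2 chun1 yan1 jing3 da4 kuai4 wen2 zhang1 de5 di3 se4
#     l v4 sh ix4 ...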
def _get_label(label_file: str) -> List[LabeledSentence]:
    with open(label_file, encoding="utf-8") as fp:
        # Each line of the .trn file becomes one tier of words; unpacking the
        # tiers fills the positional fields of LabeledSentence in order.
        labels = ((Word(text=text) for text in texts.split()) for texts in fp)
        return [LabeledSentence(*labels)]


def _get_subcatalog(lexicon_path: str) -> SentenceSubcatalog:
    subcatalog = SentenceSubcatalog()
    with open(lexicon_path, encoding="utf-8") as fp:
        # Skip the four header lines, then register each word/pronunciation
        # pair of the lexicon in the subcatalog.
        for line in islice(fp, 4, None):
            subcatalog.append_lexicon(line.strip().split())
    return subcatalog
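

# A minimal usage sketch, assuming the dataset archives have been extracted
# into the layout shown in the THCHS30 docstring; "path/to/THCHS-30" is a
# hypothetical placeholder, not a real path.
if __name__ == "__main__":
    dataset = THCHS30("path/to/THCHS-30")
    for segment in dataset:
        print(f"{segment.name}: {len(segment)} audio files")
    # Every Data carries one LabeledSentence parsed from its .trn file.
    first_label = dataset[0][0].label.sentence[0]
    print(" ".join(word.text for word in first_label.sentence))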