-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathdatasets.yaml
107 lines (105 loc) · 5.64 KB
/
datasets.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#
# Copyright 2020--2021 IBM Corp. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Schema/API identifier for the catalog format this file conforms to.
api_name: com.ibm.pardata.v1
# Human-readable name of this dataset collection.
name: 'dax'
# NOTE(review): unquoted ISO date — YAML parsers load this as a date object,
# not a string; presumably the consumer expects that — confirm.
last_updated: 2020-10-08
datasets:
gmb:
"1.0.2":
name: Groningen Meaning Bank Modified
published: 2019-12-19
homepage: https://developer.ibm.com/exchanges/data/all/groningen-meaning-bank/
download_url: https://dax-cdn.cdn.appdomain.cloud/dax-groningen-meaning-bank-modified/1.0.2/groningen-meaning-bank-modified.tar.gz
sha512sum: 4b0e6c445bf5be0573ae411f8e0ba307b884300ab6b5473ea0d455dd82b8cf4dc06fb77a9a606850f3b283357f22fd516e91850cea7e45de19ce5625fda2c001
license: cdla_sharing
estimated_size: 10M
description: "A dataset of multi-sentence texts, together with annotations for parts-of-speech, named entities, lexical categories and other natural language structural phenomena."
subdatasets:
gmb_subset_full:
name: GMB Subset Full
description: A full version of the raw dataset. Used to train MAX model – Named Entity Tagger.
format: text/plain
path: groningen_meaning_bank_modified/gmb_subset_full.txt
wikitext103:
"1.0.1":
name: WikiText-103
published: 2020-03-17
homepage: https://developer.ibm.com/exchanges/data/all/wikitext-103/
download_url: https://dax-cdn.cdn.appdomain.cloud/dax-wikitext-103/1.0.1/wikitext-103.tar.gz
sha512sum: c8186919aa1840af6b734ea41abc580574ea8efe2fafda220f5d01002464d17566d84be5199b875136c9593f0e0678fb5d7c84bb2231de8b4151cb9c83fa2109
license: cc_by_30
estimated_size: 181M
description: "The WikiText-103 dataset is a collection of over 100 million tokens extracted from the set of verified ‘Good’ and ‘Featured’ articles on Wikipedia."
subdatasets:
train:
name: Train Tokens
description: Tokens in the training subset
format: text/plain
path: wikitext-103/wiki.train.tokens
valid:
name: Validation Tokens
description: Tokens in the validation subset
format: text/plain
path: wikitext-103/wiki.valid.tokens
test:
name: Test Tokens
description: Tokens in the testing subset
format: text/plain
path: wikitext-103/wiki.test.tokens
noaa_jfk:
"1.1.4":
name: NOAA Weather Data – JFK Airport
published: 2019-09-12
homepage: https://developer.ibm.com/exchanges/data/all/jfk-weather-data/
download_url: https://dax-cdn.cdn.appdomain.cloud/dax-noaa-weather-data-jfk-airport/1.1.4/noaa-weather-data-jfk-airport.tar.gz
sha512sum: e3f27a8fcc0db5289df356e3f48aef6df56236798d5b3ae3889d358489ec6609d2d797e4c4932b86016d2ce4a379ac0a0749b6fb2c293ebae4e585ea1c8422ac
license: CDLA-Sharing-1.0
estimated_size: 3.2M
description: "The NOAA JFK dataset contains 114,546 hourly observations of various local climatological variables (including visibility, temperature, wind speed and direction, humidity, dew point, and pressure). The data was collected by a NOAA weather station located at the John F. Kennedy International Airport in Queens, New York."
subdatasets:
jfk_weather_cleaned:
name: Cleaned JFK Weather Data
description: Cleaned version of the JFK weather data.
format:
id: table/csv
options:
encoding: 'UTF-8'
delimiter: ','
columns:
DATE: 'datetime'
# Would have been int in pandas if unspecified. Put this here to have some dtype processing code ran more frequently in test code
HOURLYPressureTendencyCons: 'float'
path: noaa-weather-data-jfk-airport/jfk_weather_cleaned.csv
tensorflow_speech_commands:
"1.0.1":
name: TensorFlow Speech Commands
published: 2020-03-17
homepage: https://developer.ibm.com/exchanges/data/all/speech-commands/
# The following URL is a sample dataset for testing purpose only
download_url: https://github.com/CODAIT/pardata/raw/test-data/tensorflow-speech-commands.tar.xz
sha512sum: c96487b79d65fdf9068163e8b6a02511f08252f96600997072cc0a541a05aeefb8124cb8813f6d736d8d979cc5ad47757ed588598c0252e178d5f0cf25ca70c8
license: cc_by_40
estimated_size: 33M
description: "TensorFlow Speech Command dataset is a set of one-second .wav audio files, each containing a single spoken English word. These words are from a small set of commands, and are spoken by a variety of different speakers. 20 of the words are core words, while 10 words are auxiliary words that could act as tests for algorithms in ignoring speeches that do not contain triggers. Included along with the 30 words is a collection of background noise audio files. The dataset was originally designed for limited vocabulary speech recognition tasks. The audio clips were originally collected by Google, and recorded by volunteers in uncontrolled locations around the world."
subdatasets:
house:
name: house
description: house folder
format:
id: audio/wav
path:
type: regex
value: "TensorFlow-Speech-Commands/house/.*\\.wav"