From d9687a2b3731ca08562d7b08389757653cda104b Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Thu, 18 Feb 2021 18:00:27 -0800 Subject: [PATCH 1/3] checkpoint --- test/asset/raw_datasets.json | 2 ++ torchtext/datasets/__init__.py | 2 ++ torchtext/datasets/trec.py | 39 ++++++++++++++++++++++++++++++++++ 3 files changed, 43 insertions(+) create mode 100644 torchtext/datasets/trec.py diff --git a/test/asset/raw_datasets.json b/test/asset/raw_datasets.json index 5b4a20b4e3..2eacc3254f 100644 --- a/test/asset/raw_datasets.json +++ b/test/asset/raw_datasets.json @@ -45,3 +45,5 @@ {"dataset_name": "SQuAD2", "split": "train", "NUM_LINES": 130319, "MD5": {"train": "62108c273c268d70893182d5cf8df740", "dev": "246adae8b7002f8679c027697b0b7cf8"}, "URL": {"train": "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json", "dev": "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json"}, "first_line": ["Beyonc\u00e9 Giselle Knowles-Carter (/bi\u02d0\u02c8j\u0252nse\u026a/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyonc\u00e9's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles \"Crazy in Love\" and \"Baby Boy\".", "When did Beyonce start becoming popular?", ["in the late 1990s"], [269]]} {"dataset_name": "SQuAD2", "split": "dev", "NUM_LINES": 11873, "MD5": {"train": "62108c273c268d70893182d5cf8df740", "dev": "246adae8b7002f8679c027697b0b7cf8"}, "URL": {"train": "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json", "dev": "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json"}, "first_line": ["The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (\"Norman\" comes from \"Norseman\") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.", "In what country is Normandy located?", ["France", "France", "France", "France"], [159, 159, 159, 159]]} {"dataset_name": "EnWik9", "split": "train", "NUM_LINES": 13147026, "MD5": "3e773f8a1577fda2e27f871ca17f31fd", "URL": "http://mattmahoney.net/dc/enwik9.zip", "first_line": "\n"} +{"dataset_name": "Trec", "split": "train", "NUM_LINES": 5452, "MD5": "073462e3fcefaae31e00edb1f18d2d02", "URL": "http://cogcomp.org/Data/QA/QC/train_5500.label", "first_line": ["manner", "How did serfdom develop in and then leave Russia ?\n"]} +{"dataset_name": "Trec", "split": "test", "NUM_LINES": 500, "MD5": "323a3554401d86e650717e2d2f942589", "URL": "http://cogcomp.org/Data/QA/QC/TREC_10.label", "first_line": ["dist", "How far is it from Denver to Aspen ?\n"]} diff --git a/torchtext/datasets/__init__.py b/torchtext/datasets/__init__.py index 35daf648d7..e11cbddb49 100644 --- a/torchtext/datasets/__init__.py +++ b/torchtext/datasets/__init__.py @@ -12,6 +12,7 @@ from .sogounews import SogouNews from .squad1 import SQuAD1 from .squad2 import SQuAD2 +from .trec import Trec from .udpos import UDPOS from .wikitext103 import WikiText103 from .wikitext2 import WikiText2 @@ -35,6 +36,7 @@ 'SQuAD1': SQuAD1, 'SQuAD2': SQuAD2, 'SogouNews': SogouNews, + 'Trec': Trec, 'UDPOS': UDPOS, 'WMT14': WMT14, 'WMTNewsCrawl': WMTNewsCrawl, diff --git a/torchtext/datasets/trec.py b/torchtext/datasets/trec.py new file mode 100644 index 0000000000..510062c397 --- /dev/null +++ b/torchtext/datasets/trec.py @@ -0,0 +1,39 @@ +from torchtext.utils import download_from_url +from torchtext.datasets.common import RawTextIterableDataset +from torchtext.datasets.common import wrap_split_argument +from torchtext.datasets.common import add_docstring_header +import os + +URL = { + 'train': 'http://cogcomp.org/Data/QA/QC/train_5500.label', + 'test': 'http://cogcomp.org/Data/QA/QC/TREC_10.label', +} + +MD5 = { + 'train': '073462e3fcefaae31e00edb1f18d2d02', + 'test': '323a3554401d86e650717e2d2f942589', +} + +NUM_LINES = { + 'train': 5452, + 'test': 500, +} + + +@wrap_split_argument +@add_docstring_header() +def Trec(root='.data', split=('train', 'test'), offset=0): + def _create_data_from_file(data_path): + for line in open(os.path.expanduser(data_path), 'rb'): + # there is one non-ASCII byte: sisterBADBYTEcity; replaced with space + label, _, text = line.replace(b'\xf0', b' ').decode().partition(' ') + label = label.split(":")[1] + yield label, text + + datasets = [] + for item in split: + data_path = download_from_url(URL[item], root=root, + hash_value=MD5[item], hash_type='md5') + datasets.append(RawTextIterableDataset("Trec", NUM_LINES[item], + _create_data_from_file(data_path), offset=offset)) + return datasets From 154870ee72686665f686d2dbfcb1ccf409dba12f Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Thu, 18 Feb 2021 18:32:01 -0800 Subject: [PATCH 2/3] checkpoint --- test/asset/raw_datasets.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/asset/raw_datasets.json b/test/asset/raw_datasets.json index 2eacc3254f..4d9c9c534f 100644 --- a/test/asset/raw_datasets.json +++ b/test/asset/raw_datasets.json @@ -45,5 +45,5 @@ {"dataset_name": "SQuAD2", "split": "train", "NUM_LINES": 130319, "MD5": {"train": "62108c273c268d70893182d5cf8df740", "dev": "246adae8b7002f8679c027697b0b7cf8"}, "URL": {"train": "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json", "dev": "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json"}, "first_line": ["Beyonc\u00e9 Giselle Knowles-Carter (/bi\u02d0\u02c8j\u0252nse\u026a/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyonc\u00e9's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles \"Crazy in Love\" and \"Baby Boy\".", "When did Beyonce start becoming popular?", ["in the late 1990s"], [269]]} {"dataset_name": "SQuAD2", "split": "dev", "NUM_LINES": 11873, "MD5": {"train": "62108c273c268d70893182d5cf8df740", "dev": "246adae8b7002f8679c027697b0b7cf8"}, "URL": {"train": "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json", "dev": "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json"}, "first_line": ["The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (\"Norman\" comes from \"Norseman\") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.", "In what country is Normandy located?", ["France", "France", "France", "France"], [159, 159, 159, 159]]} {"dataset_name": "EnWik9", "split": "train", "NUM_LINES": 13147026, "MD5": "3e773f8a1577fda2e27f871ca17f31fd", "URL": "http://mattmahoney.net/dc/enwik9.zip", "first_line": "\n"} -{"dataset_name": "Trec", "split": "train", "NUM_LINES": 5452, "MD5": "073462e3fcefaae31e00edb1f18d2d02", "URL": "http://cogcomp.org/Data/QA/QC/train_5500.label", "first_line": ["manner", "How did serfdom develop in and then leave Russia ?\n"]} -{"dataset_name": "Trec", "split": "test", "NUM_LINES": 500, "MD5": "323a3554401d86e650717e2d2f942589", "URL": "http://cogcomp.org/Data/QA/QC/TREC_10.label", "first_line": ["dist", "How far is it from Denver to Aspen ?\n"]} +{"dataset_name": "Trec", "split": "train", "NUM_LINES": 5452, "MD5": "073462e3fcefaae31e00edb1f18d2d02", "URL": {"train": "http://cogcomp.org/Data/QA/QC/train_5500.label", "test": "http://cogcomp.org/Data/QA/QC/TREC_10.label"}, "first_line": ["manner", "How did serfdom develop in and then leave Russia ?\n"]} +{"dataset_name": "Trec", "split": "test", "NUM_LINES": 500, "MD5": "323a3554401d86e650717e2d2f942589", "URL": {"train": "http://cogcomp.org/Data/QA/QC/train_5500.label", "test": "http://cogcomp.org/Data/QA/QC/TREC_10.label"}, "first_line": ["dist", "How far is it from Denver to Aspen ?\n"]} From 47f304432680f82a55d4a149c074a67d4ab5d502 Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Thu, 18 Feb 2021 19:09:09 -0800 Subject: [PATCH 3/3] checkpoint --- test/asset/raw_datasets.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/asset/raw_datasets.json b/test/asset/raw_datasets.json index 4d9c9c534f..cbed634c12 100644 --- a/test/asset/raw_datasets.json +++ b/test/asset/raw_datasets.json @@ -45,5 +45,5 @@ {"dataset_name": "SQuAD2", "split": "train", "NUM_LINES": 130319, "MD5": {"train": "62108c273c268d70893182d5cf8df740", "dev": "246adae8b7002f8679c027697b0b7cf8"}, "URL": {"train": "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json", "dev": "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json"}, "first_line": ["Beyonc\u00e9 Giselle Knowles-Carter (/bi\u02d0\u02c8j\u0252nse\u026a/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyonc\u00e9's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles \"Crazy in Love\" and \"Baby Boy\".", "When did Beyonce start becoming popular?", ["in the late 1990s"], [269]]} {"dataset_name": "SQuAD2", "split": "dev", "NUM_LINES": 11873, "MD5": {"train": "62108c273c268d70893182d5cf8df740", "dev": "246adae8b7002f8679c027697b0b7cf8"}, "URL": {"train": "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json", "dev": "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json"}, "first_line": ["The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (\"Norman\" comes from \"Norseman\") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.", "In what country is Normandy located?", ["France", "France", "France", "France"], [159, 159, 159, 159]]} {"dataset_name": "EnWik9", "split": "train", "NUM_LINES": 13147026, "MD5": "3e773f8a1577fda2e27f871ca17f31fd", "URL": "http://mattmahoney.net/dc/enwik9.zip", "first_line": "\n"} -{"dataset_name": "Trec", "split": "train", "NUM_LINES": 5452, "MD5": "073462e3fcefaae31e00edb1f18d2d02", "URL": {"train": "http://cogcomp.org/Data/QA/QC/train_5500.label", "test": "http://cogcomp.org/Data/QA/QC/TREC_10.label"}, "first_line": ["manner", "How did serfdom develop in and then leave Russia ?\n"]} -{"dataset_name": "Trec", "split": "test", "NUM_LINES": 500, "MD5": "323a3554401d86e650717e2d2f942589", "URL": {"train": "http://cogcomp.org/Data/QA/QC/train_5500.label", "test": "http://cogcomp.org/Data/QA/QC/TREC_10.label"}, "first_line": ["dist", "How far is it from Denver to Aspen ?\n"]} +{"dataset_name": "Trec", "split": "train", "NUM_LINES": 5452, "MD5": {"train": "073462e3fcefaae31e00edb1f18d2d02", "test": "323a3554401d86e650717e2d2f942589"}, "URL": {"train": "http://cogcomp.org/Data/QA/QC/train_5500.label", "test": "http://cogcomp.org/Data/QA/QC/TREC_10.label"}, "first_line": ["manner", "How did serfdom develop in and then leave Russia ?\n"]} +{"dataset_name": "Trec", "split": "test", "NUM_LINES": 500, "MD5": {"train": "073462e3fcefaae31e00edb1f18d2d02", "test": "323a3554401d86e650717e2d2f942589"}, "URL": {"train": "http://cogcomp.org/Data/QA/QC/train_5500.label", "test": "http://cogcomp.org/Data/QA/QC/TREC_10.label"}, "first_line": ["dist", "How far is it from Denver to Aspen ?\n"]}