# universal_tags.py
'''
Interface for converting POS tags from various treebanks
to the universal tagset of Petrov, Das, & McDonald.
The tagset consists of the following 12 coarse tags:
VERB - verbs (all tenses and modes)
NOUN - nouns (common and proper)
PRON - pronouns
ADJ - adjectives
ADV - adverbs
ADP - adpositions (prepositions and postpositions)
CONJ - conjunctions
DET - determiners
NUM - cardinal numbers
PRT - particles or other function words
X - other: foreign words, typos, abbreviations
. - punctuation
@see: http://arxiv.org/abs/1104.2086 and http://code.google.com/p/universal-pos-tags/
@author: Nathan Schneider (nschneid)
@since: 2011-05-06
'''
# Strive towards Python 3 compatibility
from __future__ import print_function, unicode_literals, division
from future_builtins import map, filter
import glob
import os
import re
from collections import defaultdict
# Directory holding the README and the '<fileid>.map' files. This is a
# relative path, so it is resolved against the current working directory.
MAP_DIR = 'universal_pos_tags.1.01'

# The 12 coarse tags of the universal tagset; every mapping file must map
# its original tags onto one of these.
COARSE_TAGS = (
    'VERB', 'NOUN', 'PRON', 'ADJ', 'ADV', 'ADP',
    'CONJ', 'DET', 'NUM', 'PRT', 'X', '.',
)

# Cache of loaded mappings: fileid -> {original tag: coarse tag}.
_MAPS = defaultdict(dict)
def readme():
    '''
    Returns the full contents of the README file located in MAP_DIR.
    '''
    readme_path = MAP_DIR + '/README'
    with open(readme_path) as handle:
        contents = handle.read()
    return contents
def fileids(lang=''):
    '''
    Optionally given a two-letter ISO language code, returns names of files
    containing mappings from a tagset from a treebank in that language to the
    universal tagset. If no language code is given, the names for all
    languages are returned.

    Names are returned without the directory or the '.map' extension,
    in sorted order.

    >>> fileids('en')
    [u'en-ptb']
    >>> fileids('zh')
    [u'zh-ctb6', u'zh-sinica']
    '''
    # An empty code has to become a wildcard: formatting '' into the pattern
    # would yield '-*.map', which only matches names starting with a hyphen.
    prefix = lang.lower() or '*'
    paths = glob.glob(os.path.join(MAP_DIR, '{}-*.map'.format(prefix)))
    # os.path copes with whichever separator glob uses on this platform, and
    # sorting makes the result independent of directory listing order.
    return sorted(os.path.splitext(os.path.basename(p))[0] for p in paths)
def _read(fileid):
    '''
    Loads the mapping file MAP_DIR/<fileid>.map and stores the result in
    _MAPS[fileid].

    Every non-blank line must consist of an original tag and a coarse tag
    separated by a single tab. The mapping is assembled in a local dict and
    stored in the cache only after the whole file has been validated, so a
    malformed file never leaves a partially filled entry behind.

    Raises AssertionError if a coarse tag is not in COARSE_TAGS or if an
    original tag occurs more than once. The checks are raised explicitly
    rather than written as assert statements so that they are not stripped
    when Python runs with -O. A line that does not split into exactly two
    tab-separated fields raises ValueError from the unpacking.
    '''
    entries = {}
    with open(MAP_DIR+'/'+fileid+'.map') as f:
        for ln in f:
            ln = ln.strip()
            if not ln:
                continue
            fine, coarse = ln.split('\t')
            if coarse not in COARSE_TAGS:
                raise AssertionError('Unexpected coarse tag: {}'.format(coarse))
            if fine in entries:
                raise AssertionError('Multiple entries for original tag: {}'.format(fine))
            entries[fine] = coarse
    # Publish only a complete, validated mapping.
    _MAPS[fileid] = entries
def mapping(fileid):
    '''
    Returns the dict of original tag -> universal tag for the given
    treebank. The mapping file is read via _read() only if fileid is not
    yet a key of the _MAPS cache; the cached dict itself is returned.

    >>> mapping('ru-rnc')=={'!': '.', 'A': 'ADJ', 'AD': 'ADV', 'C': 'CONJ', 'COMP': 'CONJ', 'IJ': 'X', 'NC': 'NUM', 'NN': 'NOUN', 'P': 'PRON', 'PTCL': 'PRT', 'V': 'VERB', 'VG': 'VERB', 'VI': 'VERB', 'VP': 'VERB', 'YES_NO_SENT': 'X', 'Z': 'X'}
    True
    '''
    already_loaded = fileid in _MAPS
    if not already_loaded:
        _read(fileid)
    return _MAPS[fileid]
def convert(fileid, originalTag):
    '''
    Looks up an original POS tag of the given treebank and returns the
    corresponding (coarse) universal tag. A tag that is absent from the
    treebank's mapping raises KeyError.

    >>> convert('en-ptb', 'VBZ')
    u'VERB'
    >>> convert('en-ptb', 'VBP')
    u'VERB'
    >>> convert('en-ptb', '``')
    u'.'
    '''
    tag_table = mapping(fileid)
    return tag_table[originalTag]
def test():
    '''
    Loads the mapping for every file reported by fileids(), which runs the
    validity checks performed while reading, then runs this module's
    doctests.
    '''
    for map_name in fileids():
        mapping(map_name)
    from doctest import testmod
    testmod()


# Run the self-test when the module is executed as a script.
if __name__ == '__main__':
    test()