-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathextractword.py
72 lines (56 loc) · 1.83 KB
/
extractword.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import re
import MeCab
class Word(object):
    """One token produced by the morphological analyzer.

    Attributes are plain and public:
      surface -- the token text as it appears in the input.
      feature -- the analyzer's feature string for the token.
                 NOTE(review): assumed to be comma-separated with the part
                 of speech first and its sub-category second (IPADIC-style
                 layout) -- confirm against the dictionary in use.
      connect -- True when this token should be glued onto the token that
                 precedes it.
    """

    def __init__(self, surface, feature, connect=False):
        self.surface = surface
        self.feature = feature
        self.connect = connect

    def is_prefix(self):
        """Return True when the feature string starts with u'接頭詞' (prefix)."""
        return self.feature.startswith(u'接頭詞')

    def is_postfix(self):
        """Return True when the second feature field is u'接尾' (suffix).

        A feature string with fewer than two comma-separated fields is
        treated as "not a suffix" instead of raising IndexError.
        """
        fields = self.feature.split(',')
        return len(fields) > 1 and fields[1] == u'接尾'

    def is_pp_particle(self):
        """Return True when the feature string starts with u'助詞' (particle)."""
        return self.feature.startswith(u'助詞')
class Sentence(object):
    """Tokenize text with MeCab and merge tokens that belong together.

    ``words`` holds one ``Word`` per token and keeps growing with every call
    to ``analysis_text``; ``get_words`` returns the surfaces with every token
    flagged ``connect`` appended to the entry before it.
    """

    def __init__(self):
        self.words = []
        self.mecab = MeCab.Tagger()

    @staticmethod
    def _to_text(value):
        """Return ``value`` as text, decoding byte strings as UTF-8.

        Replaces a bare ``unicode(value)`` call, which decodes with the
        interpreter's default codec and fails on non-ASCII bytes.
        """
        if isinstance(value, bytes):
            return value.decode("utf-8")
        return value

    def get_words(self):
        """Return the token surfaces with connected tokens joined.

        A token whose ``connect`` flag is set is concatenated onto the
        previous output entry. If there is no previous entry (the very first
        token is flagged), it starts a new entry instead of raising
        IndexError.
        """
        result = []
        for word in self.words:
            if word.connect and result:
                result[-1] += word.surface
            else:
                result.append(word.surface)
        return result

    def analysis_text(self, text):
        """Tokenize ``text`` and append the resulting words to ``self.words``.

        A token is flagged ``connect`` when any of these holds:
          * the previous token is a prefix,
          * the token itself is a suffix,
          * the previous token and the token itself are both particles.

        ``text`` may be text or an already UTF-8 encoded byte string.

        NOTE(review): ``self.words`` is never reset, so the "previous token"
        of the first token in a call is the last token of the preceding call.
        """
        words = self.words

        if str is bytes:
            # Python 2: the tagger is fed UTF-8 bytes; bytes pass through
            # untouched so non-ASCII command-line input does not hit an
            # implicit ASCII decode.
            source = text if isinstance(text, bytes) else text.encode("utf-8")
        else:
            # NOTE(review): Python 3 builds of the binding are assumed to
            # take ``str`` -- confirm for the installed version.
            source = self._to_text(text)

        # ``source`` stays bound for the whole walk on purpose: handing the
        # tagger a temporary lets the buffer be released while nodes may
        # still refer to it (reported behaviour of the SWIG binding).
        node = self.mecab.parseToNode(source)
        while node:
            surface = node.surface
            # Nodes with an empty surface carry no token text; skip them.
            if surface:
                words.append(Word(self._to_text(surface),
                                  self._to_text(node.feature)))
                if len(words) > 1:
                    previous = words[-2]
                    current = words[-1]
                    if (previous.is_prefix()
                            or current.is_postfix()
                            or (previous.is_pp_particle()
                                and current.is_pp_particle())):
                        current.connect = True
            node = node.next
if __name__ == "__main__":
    # Command-line entry point: tokenize the first argument and print one
    # merged word per line.
    import sys

    # Exit with a usage message instead of an IndexError traceback when the
    # text argument is missing.
    if len(sys.argv) < 2:
        sys.exit("usage: %s TEXT" % sys.argv[0])

    text = sys.argv[1]
    # Python 2 delivers argv as byte strings; decode so the analyzer always
    # receives text, since non-ASCII bytes cannot be implicitly re-encoded.
    # NOTE(review): assumes the terminal passes UTF-8 -- confirm.
    if isinstance(text, bytes):
        text = text.decode("utf-8")

    sentence = Sentence()
    sentence.analysis_text(text)
    for word in sentence.get_words():
        # Parenthesized single-argument form prints identically on
        # Python 2 and Python 3.
        print(word)