-
Notifications
You must be signed in to change notification settings - Fork 0
/
OM_processor.py
82 lines (70 loc) · 2.8 KB
/
OM_processor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import json
# Positive-opinion lexicon: every line of the file after the first one
# (presumably a header line), whitespace-stripped and collected into a set so
# that membership tests in getSentiment() are constant-time.
with open('om/updates/poz.final.txt', 'r', encoding='utf8') as lexicon_file:
    lexicon_file.readline()  # skip the first line of the file
    om_positive = {entry.strip() for entry in lexicon_file}
# Negative-opinion lexicon: every line of the file after the first one
# (presumably a header line), whitespace-stripped and collected into a set so
# that membership tests in getSentiment() are constant-time.
with open('om/updates/neg.final.txt', 'r', encoding='utf8') as lexicon_file:
    lexicon_file.readline()  # skip the first line of the file
    om_negative = {entry.strip() for entry in lexicon_file}
# Stop-word list: tokens that getSentiment() discards before counting.
# The first two lines of the file are skipped, and the last seven lines are
# left out as well, because the list ends with entries that DO carry sentiment
# (such as "Paldies", "😊", "👍") and must therefore not be filtered away.
with open('om/updates/stopwords.txt', 'r', encoding='utf8') as stopword_file:
    for _ in range(2):
        stopword_file.readline()
    remaining_lines = stopword_file.readlines()
om_stopwords = {line.strip() for line in remaining_lines[:-7]}
def getSentiment(text, *, positive_words=None, negative_words=None, stopwords=None):
    """Classify *text* as 'positive', 'negative' or 'neutral' by lexicon hits.

    The text is lower-cased and split on whitespace. Tokens contained in the
    stop-word collection are dropped; the remaining tokens are counted against
    the positive and the negative lexicon (a repeated token counts every time
    it occurs). The larger count decides the label; equal counts, including
    no hits at all, yield 'neutral'.

    Lexicon and stop-word entries are compared as-is against the lower-cased
    tokens, so an entry containing upper-case characters can never match.

    Args:
        text: The string to classify.
        positive_words: Collection of positive tokens. Defaults to the
            module-level ``om_positive`` lexicon.
        negative_words: Collection of negative tokens. Defaults to the
            module-level ``om_negative`` lexicon.
        stopwords: Collection of tokens to ignore. Defaults to the
            module-level ``om_stopwords`` set.

    Returns:
        One of the strings 'positive', 'negative' or 'neutral'.
    """
    # Defaults are resolved at call time so the module-level lexicons are only
    # required when the caller does not supply their own.
    if positive_words is None:
        positive_words = om_positive
    if negative_words is None:
        negative_words = om_negative
    if stopwords is None:
        stopwords = om_stopwords

    tokens = [token for token in text.lower().split() if token not in stopwords]
    positive = sum(token in positive_words for token in tokens)
    negative = sum(token in negative_words for token in tokens)

    if positive > negative:
        return 'positive'
    if negative > positive:
        return 'negative'
    return 'neutral'
# Evaluate getSentiment() against the gold labels of latvian-tweet-sentiment-corpus
# and print the fraction of items whose predicted label matches.
with open('latvian-tweet-sentiment-corpus/tweet_corpus.json', 'r', encoding='utf8') as corpus_file:
    dataset = json.load(corpus_file)

# Corpus tags translated to the label strings getSentiment() returns.
gold_labels = {"POZ": 'positive', "NEG": 'negative', "NEU": 'neutral'}

# One boolean per corpus item: True when the prediction equals the gold label.
hits = [gold_labels[item['sentiment']] == getSentiment(item['text']) for item in dataset]
correct = sum(hits)
total = len(hits)
print(f'latvian-tweet-sentiment-corpus accuracy: {correct / total}')
# Evaluate getSentiment() against the gold labels of LV-twitter-sentiment-corpus
# and print the fraction of items whose predicted label matches.
with open('LV-twitter-sentiment-corpus/tweet_corpus.json', 'r', encoding='utf8') as corpus_file:
    dataset = json.load(corpus_file)

# Corpus tags translated to the label strings getSentiment() returns.
gold_labels = {"POZ": 'positive', "NEG": 'negative', "NEU": 'neutral'}

# One boolean per corpus item: True when the prediction equals the gold label.
hits = [gold_labels[item['sentiment']] == getSentiment(item['text']) for item in dataset]
correct = sum(hits)
total = len(hits)
print(f'LV-twitter-sentiment-corpus accuracy: {correct / total}')
# Evaluate getSentiment() against the annotated test split of the Latvian
# Twitter Eater Corpus and print the fraction of matching labels.
# This corpus uses lower-case tags and stores the text under 'tweet_text'.
with open('Latvian-Twitter-Eater-Corpus/sub-corpora/sentiment-analysis/ltec-sentiment-annotated-test.json', 'r', encoding='utf8') as corpus_file:
    dataset = json.load(corpus_file)

# Corpus tags translated to the label strings getSentiment() returns.
gold_labels = {"pos": 'positive', "neg": 'negative', "neu": 'neutral'}

# One boolean per corpus item: True when the prediction equals the gold label.
hits = [gold_labels[item['sentiment']] == getSentiment(item['tweet_text']) for item in dataset]
correct = sum(hits)
total = len(hits)
print(f'Latvian Twitter Eater Corpus accuracy: {correct / total}')
# Evaluate getSentiment() against sikzinu_analize/viksna.json and print the
# fraction of matching labels. Items live under the top-level 'data' key; the
# gold label is derived from the numeric 'POS' and 'NEG' fields, whichever is
# larger, with a tie meaning 'neutral'.
with open('sikzinu_analize/viksna.json', 'r', encoding='utf8') as corpus_file:
    dataset = json.load(corpus_file)

correct = 0
total = 0
for entry in dataset['data']:
    margin = entry['POS'] - entry['NEG']
    if margin > 0:
        sentiment = 'positive'
    elif margin < 0:
        sentiment = 'negative'
    else:
        sentiment = 'neutral'
    om_sentiment = getSentiment(entry['text'])

    # Entries flagged NOT_LV (presumably non-Latvian text) are excluded.
    # NOTE(review): assumes such entries are left out of `total` as well as
    # `correct` - confirm against the intended nesting of the counters.
    if entry['NOT_LV']:
        continue
    total += 1
    if sentiment == om_sentiment:
        correct += 1
print(f'sikzinu_analize accuracy: {correct / total}')