#1. Get the text from a text file
#2. Analyze the retrieved text
import re
import collections
import math
import sys
import os

import numpy as np
def get_text():
    # Reads the source text from a file in the script's directory.
    try:
        dirpath = os.path.dirname(os.path.realpath(__file__))
        path = input("Name of source file (in the directory " + dirpath + "): ")
        path = os.path.join(dirpath, path)
        #/home/PycharmProjects/pyFormationNEW/pyFormation/samplesource.txt for testing
        with open(path, 'r') as myFile:
            data = myFile.read()
    except IOError:
        print("File error")
        sys.exit(1)
    return data
def split_text(input_text):
    # Returns a list of words.
    # First splits on every character that is not a letter or a digit
    # (parentheses, dots, etc.), then removes the empty strings.
    word_list = re.split(r'[^a-zA-Z0-9čžšćđšČŽŠĐĆüÜéÉâÂÁáàÀÅåêÊËëèÈïÏîÎìÌôÔöÖòÒÛûùÙå]', input_text)
    word_list = list(filter(None, word_list))
    return word_list
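# Example (illustrative, not part of the original script):
# split_text("The dog, the cat.") -> ['The', 'dog', 'the', 'cat']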
def word_counter(input_text):
    # Counts how many times each word is used and returns the result
    # as a Counter; also lowercases every word first.
    return collections.Counter(map(str.lower, input_text))
def own_probability(input_text, number_of_words):
    # Calculates each word's own (unigram) probability using the formula:
    # number of occurrences of the word divided by the total number of words.
    # number_of_words = number of non-unique words
    # input_text = mapping from each word to its number of occurrences
    return {word: count / number_of_words for word, count in input_text.items()}
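# Worked example (illustrative, not part of the original script):
# own_probability(collections.Counter({'the': 2, 'dog': 1}), 3)
#   -> {'the': 0.667, 'dog': 0.333} (values rounded)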
def bigram(all_words):
    # Builds bigrams and returns a Counter object with how many times
    # each bigram appears in the text.
    # A separate function because bigrams are also needed for generating words.
    # The Counter keys are unique: each bigram appears once, with its count.
    joined_words = [None] * (len(all_words) - 1)
    for x in range(1, len(all_words)):
        joined_words[x - 1] = all_words[x - 1] + " " + all_words[x]
    cnt = collections.Counter(map(str.lower, joined_words))
    return cnt
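# Example (illustrative): bigram(['the', 'dog', 'is', 'cute'])
#   -> Counter({'the dog': 1, 'dog is': 1, 'is cute': 1})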
def cond_prob(input_text, words, cnt):
    # Calculates the conditional probability of each word given the previous word.
    # The word list is first joined in pairs, e.g. [the, dog, is, cute] -> [the dog, dog is, is cute]
    # (done in bigram()), and we count how often each joined pair occurred in the text.
    # The estimate used is P(B|A) = count(A B) / count(A).
    # input_text = the raw word list (kept for the original call signature)
    # cnt = Counter of unique word pairs and how many times they appear
    # words = Counter of each word and the number of times it appears
    double_count = cnt
    lowercase_count = double_count.keys()
    # It is irrelevant how many words there are in total; all that matters is
    # how many times A is followed by B and how many times A occurs.
    own_probs = words
    # Degenerate case: the text contains a single word, so there are no bigrams.
    if sum(own_probs.values()) == 1:
        return 1.0
    # Take the first word of each bigram key, look it up in own_probs,
    # and divide the counts.
    cond_probs = {}
    for key in lowercase_count:
        first_word = key.split()[0]
        if first_word in own_probs:
            cond_probs[key] = double_count[key] / own_probs[first_word]
    return cond_probs
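# Worked example (illustrative): for the text "the dog the cat",
# the word counts are {'the': 2, 'dog': 1, 'cat': 1} and the bigram
# counts are {'the dog': 1, 'dog the': 1, 'the cat': 1}, so
# P('dog'|'the') = 1/2, P('the'|'dog') = 1/1, P('cat'|'the') = 1/2.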
def equal_entropy(all_length):
    # Calculates the per-word entropy under the assumption that
    # EVERY WORD HAS THE SAME PROBABILITY: probability = 1 / all_length.
    # Formula: H = -sum(p * log2(p)), which for a uniform distribution
    # reduces to log2(all_length). Log base 2, so the result is in bits.
    # all_length = number of unique words in the text
    if all_length <= 0:
        return 0
    probability = 1 / all_length
    # No for loop needed, because every probability is equal.
    return math.fabs(math.log(probability, 2))
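# Sanity check (illustrative): equal_entropy(8) returns 3.0,
# since eight equally likely words carry log2(8) = 3 bits each.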
def own_entropy(input_text):
    # Calculates entropy using each word's own probability.
    # Same formula as equal_entropy, but with per-word probabilities:
    # each term is -p * log2(p) (fabs supplies the sign flip).
    # Log base 2 is used, so the result is in bits!
    all_words = list(input_text.keys())
    all_values = list(input_text.values())
    if len(input_text) == 1:
        return 0
    for x in range(0, len(all_values)):
        all_values[x] = math.fabs(all_values[x] * math.log(all_values[x], 2))
    new_dict = dict(zip(all_words, all_values))
    entropy = sum(all_values)
    print(" - Sum of all entropies: " + str(entropy))
    return new_dict
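# Worked example (illustrative): for probabilities {'a': 0.5, 'b': 0.5}
# each term is |0.5 * log2(0.5)| = 0.5, so the printed sum is 1.0 bit.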
def first_word_generator(list_words, list_values):
    # Picks a first word by weighted random choice, using the unigram
    # word list and its list of values (probabilities).
    # Normalize the probabilities so they sum to 1, to avoid numpy's
    # "probabilities do not sum to 1" error.
    probabilities = np.array(list_values).astype(float)
    probabilities = probabilities / np.sum(probabilities)
    choiceX = np.random.choice(list_words, p=probabilities)
    return choiceX
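# Example (illustrative): first_word_generator(['a', 'b'], [0.9, 0.1])
# returns 'a' roughly 90% of the time.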
def next_word_generator(first_word, list_words, list_values, limit, first_word_words, first_word_values):
    # Repeatedly picks the next word by weighted random choice among the
    # bigrams that start with the current word.
    # Normalize the probabilities so they sum to 1, to avoid numpy's
    # "probabilities do not sum to 1" error.
    probabilities = np.array(list_values).astype(float)
    probabilities = probabilities / np.sum(probabilities)
    next_word = first_word
    try:
        for x in range(0, limit):
            indices = [i for i, s in enumerate(list_words) if s.startswith(first_word)]
            if not indices:
                # Pick a fresh first word if we hit a wall (the current word never starts a bigram)
                first_word = first_word_generator(first_word_words, first_word_values) + " "
            else:
                # e.g. among indices 14-17, choose one by weighted random, using their values from list_values
                temp = probabilities[indices] / np.sum(probabilities[indices])
                choiceX = np.random.choice(indices, p=temp)
                next_word = list_words[choiceX]
                next_word = next_word.split(" ", 1)[1]
                first_word = next_word + " "
            print(first_word, end="")
    except TypeError:
        print("Type error in next_word_generator")
    return next_word
def word_generator(words, bigrams, limit):
    # Generates words using the Markov chain principle!
    # Here `words` is the unigram probability dict and `bigrams` is the
    # conditional probability dict keyed by "first second" word pairs.
    # The first word is drawn from the unigram distribution; every following
    # word is drawn from the bigrams starting with the current word, weighted
    # by their conditional probabilities.
    # limit tells us how many words to generate.
    # Each dictionary is split into parallel lists so numpy.random.choice can
    # draw weighted random samples, and a numpy array lets us index the
    # probabilities with a list of positions, e.g. a[[0, 14, 33]].
    list_words = list(bigrams.keys())
    list_values = list(bigrams.values())
    first_word_words = list(words.keys())
    first_word_values = list(words.values())
    print("Generated text:")
    first_word = first_word_generator(first_word_words, first_word_values)
    first_word = first_word + " "
    #TODO: TEST WORD_GENERATOR SOME MORE
    print(first_word, end="")
    try:
        next_word_generator(first_word, list_words, list_values, limit, first_word_words, first_word_values)
    except IndexError:
        print("Index error")
    return "."
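# --- Main script: read the text, analyze it, and generate words ---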
text = get_text()
words = split_text(text)
word_length = len(words)
word_count = word_counter(words)
#word_count is a Counter of unique words and their counts
#words is just the split text as a list
#In case there are too many words, we print them to files for easier readability
own_probabilities = own_probability(word_count, word_length)
with open("Own_probs.txt", "w") as text_file:
    print(own_probabilities, file=text_file)
bigrams = bigram(words)
conditional_probabilities = cond_prob(words, word_count, bigrams)
with open("Cond_probs.txt", "w") as text_file:
    print(conditional_probabilities, file=text_file)
print("Equal entropy (equal for every word): " + str(equal_entropy(len(word_count))))
print("Entropy using the own probabilities:")
own_entropies = own_entropy(own_probabilities)
with open("Own_entropies.txt", "w") as text_file:
    print(own_entropies, file=text_file)
print("Conditional entropy:")
cond_entropies = own_entropy(conditional_probabilities)
with open("Cond_entropies.txt", "w") as text_file:
    print(cond_entropies, file=text_file)
print("Number of non-unique words:")
print(len(words))
print("Number of unique words:")
print(len(word_count))
print("Unique word pairs:")
print(len(conditional_probabilities))
if len(word_count) < 100:
    print("Words:")
    print(words)
    print("Word counter:")
    print(word_count)
    print("Own probabilities:")
    print(own_probabilities)
    print("Conditional probabilities:")
    print(conditional_probabilities)
    print("Own entropies [bit]:")
    print(own_entropies)
    print("Conditional entropies [bit]:")
    print(cond_entropies)
print("Word generator")
limit = int(input("How many words to print out? "))
limit = limit + 1
print(word_generator(own_probabilities, conditional_probabilities, limit))