-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path017 - Most frequently used words in a text.py
102 lines (79 loc) · 3.83 KB
/
017 - Most frequently used words in a text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
"""
Write a function that, given a string of text (possibly with punctuation and line-breaks),
returns an array of the top-3 most occurring words, in descending order of the number of occurrences.
Assumptions:
A word is a string of letters (A to Z) optionally containing one or more apostrophes (') in ASCII.
Apostrophes can appear at the start, middle or end of a word ('abc, abc', 'abc', ab'c are all valid)
Any other characters (e.g. #, \, / , . ...) are not part of a word and should be treated as whitespace.
Matches should be case-insensitive, and the words in the result should be lowercased.
Ties may be broken arbitrarily.
If a text contains fewer than three unique words, then either the top-2 or top-1 words should be returned,
or an empty array if a text contains no words.
Examples:
top_3_words("In a village of La Mancha, the name of which I have no desire to call to
mind, there lived not long since one of those gentlemen that keep a lance
in the lance-rack, an old buckler, a lean hack, and a greyhound for
coursing. An olla of rather more beef than mutton, a salad on most
nights, scraps on Saturdays, lentils on Fridays, and a pigeon or so extra
on Sundays, made away with three-quarters of his income.")
# => ["a", "of", "on"]
top_3_words("e e e e DDD ddd DdD: ddd ddd aa aA Aa, bb cc cC e e e")
# => ["e", "ddd", "aa"]
top_3_words(" //wont won't won't")
# => ["won't", "wont"]
"""
import re
import string
import unittest
from array import array
from collections import Counter
def top_3_words(text: str) -> list[str]:
    """Return up to three of the most frequent words in *text*, most frequent first.

    A word is a run of ASCII letters and apostrophes that contains at least
    one letter; every other character acts as a separator. Matching is
    case-insensitive and the returned words are lowercased.

    Args:
        text: The text to analyse; may contain punctuation and line breaks.

    Returns:
        The most frequent words in descending order of occurrence. Fewer than
        three words are returned when the text has fewer than three distinct
        words, and an empty list when it has none. Words with equal counts
        are ordered by first appearance in the text.
    """
    # Match on the original text and lowercase afterwards: lowercasing first
    # could turn some non-ASCII characters into ASCII letters.
    candidates = re.findall(r"[A-Za-z']+", text)
    counts = Counter(
        word.lower()
        for word in candidates
        # A token made only of apostrophes is not a word.
        if word.strip("'")
    )
    return [word for word, _ in counts.most_common(3)]
class TestTopWords(unittest.TestCase):
    """Checks top_3_words against the examples from the task description."""

    def test_return_top_threee_words_from_long_text(self):
        """A long passage with punctuation and hyphenated words."""
        passage = (
            "In a village of La Mancha, the name of which I have no desire to call to mind, "
            "there lived not long since one of those gentlemen that keep a lance in the lance-rack, "
            "an old buckler, a lean hack, and a greyhound for coursing. An olla of rather more "
            "beef than mutton, a salad on most nights, scraps on Saturdays, lentils on Fridays, "
            "and a pigeon or so extra on Sundays, made away with three-quarters of his income."
        )
        expected = ["a", "of", "on"]

        self.assertEqual(top_3_words(passage), expected)

    def test_return_top_threee_words_from_text_with_not_allwed_chars(self):
        """Slashes act as separators; an apostrophe makes a distinct word."""
        sample = " //wont won't won't"
        expected = ["won't", "wont"]

        self.assertEqual(top_3_words(sample), expected)

    def test_return_top_threee_words_from_text(self):
        """Differently-cased spellings are counted as the same word."""
        sample = "e e e e DDD ddd DdD: ddd ddd aa aA Aa, bb cc cC e e e"
        expected = ["e", "ddd", "aa"]

        self.assertEqual(top_3_words(sample), expected)
# Run the test suite only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    unittest.main()