-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathspec.ctr
130 lines (116 loc) · 4.19 KB
/
spec.ctr
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# Configure the Broom memory limit before anything else runs.
# The argument evaluates to 1024 * 1024 * 1024 * 12 = 12884901888
# (presumably bytes, i.e. 12 GiB - confirm the unit against the Broom documentation).
Broom memoryLimit: 1024 * 1024 * 1024 * 12.
import main: \*.
import
Library/Test: 'describe'
.
# Spec for Lexer: checks that the Lexer object resolves and that `split:`
# turns Persian text into the expected arrays of tokens.
describe Lexer do: {:it:let
# Merely referencing Lexer must not raise a String error.
it exists do: {
{ Lexer. } should not raiseError: String.
}.
# Bind the name `lexer` (evaluating to Lexer) in this block's context;
# the tests below refer to it by that name.
let[Reflect thisContext] lexer { Lexer }.
it ~ 'is not Nil' do: {
lexer should not = Nil.
}.
# Two space-separated words must come back as two tokens.
it can lex simple persian sentences do: {
(lexer split: 'کلمات فارسی')
should = ['کلمات', 'فارسی'].
}.
# The expected result shows parentheses and each '!' emitted as separate tokens.
# NOTE(review): the test name mentions half-width spaces; confirm the sample
# string actually contains them, since they are not visible in the literal.
it can lex persian sentences with half-width spaces do: {
(lexer split: 'این جمله (خیلی) پیچیده نیست!!!')
should = (['این' ,'جمله' ,'(' ,'خیلی' ,')' ,'پیچیده' ,'نیست', '!', '!', '!']).
}.
# A longer informal sentence; the Persian question mark is expected as its own token.
it can lex arbitrary persian text do: {
# Disabled debugging scaffold (see also the commented-out `.` at the end of this test).
# Pen writeln:
(lexer split: 'میشدید اون پارسر ترم رو دیدی؟ یه تیکه هست مچ رو درست پیدا میکنه ولی توشو خالی میده')
should = [
'میشدید', 'اون', 'پارسر',
'ترم', 'رو', 'دیدی', '؟', 'یه',
'تیکه', 'هست', 'مچ', 'رو', 'درست',
'پیدا', 'میکنه', 'ولی', 'توشو',
'خالی', 'میده'
].
# .
}.
}.
#:language XFrozen
# Installs a `toJSON` method on HashMap that renders the receiver as a JSON object string.
# For every key/value pair it builds one `"key": value` entry: the key is passed through
# `escape:` with the listed control/quote characters and wrapped in double quotes, the
# value is rendered with `JSON serialize:`. The collected entries are then formatted into
# `{...}` with the `%L` directive.
# The block declares a `self` parameter but the body addresses the receiver through `me`.
HashMap on: 'toJSON' do: {:self
var kvs is Array new.
# The iteration statement is terminated with `.` so that it is a statement of its own
# and does not run into the return expression on the next line.
me each: {:k:v kvs push: ('"%s": %s' % [(k escape: '\n\r\t\f\v\a\'"'), (JSON serialize: v)]).}.
^'{%L}' % [kvs].
}.
# Spec for TokenAnalyser. The tests share state: each one publishes its results into the
# describe-level context (`glblctx`) with `let[glblctx] ...`, and the following tests read
# those names (`allTokens`, `consolidated`, `consolidateAll`), so they depend on running
# in the order written here.
describe TokenAnalyser do: {:it:let
# Merely referencing TokenAnalyser must not raise a String error.
it exists do: {
{ TokenAnalyser. } should not raiseError: String.
}.
# Shared fixtures: the lexer, an analyser built on top of it, and a handle on this
# block's context that the tests use as the target of their `let[glblctx]` bindings.
let[Reflect thisContext] lexer { Lexer }.
let[Reflect thisContext] analyser { TokenAnalyser with: lexer }.
let[Reflect thisContext] glblctx { Reflect thisContext }.
it can tokenise do: {:thisTest
# Read the file 'orders', split it on '\n', and split every line on ':'.
# The parts are bound to `label`, `name` and `_`; each line is mapped to a pair of
# [label, contents of the file called `name`].
# NOTE(review): presumably every line has exactly three ':'-separated fields and the
# file has no trailing empty line - confirm against the data file.
var allTargetTexts is (
File
new: 'orders', read split: '\n',
fmap: {\:name ['label', 'name', '_'] letEqual: (name split: ':') in: {
^[label, (File new: name, read)].
}.}
# groupBy: \:x x head
).
# Progress reporting: the total is the number of labelled texts; `pre:` advances the
# progress for each index and `post:` calls `show` on the test.
thisTest tests: 'Tokeniser'.
thisTest progressesUpTo: allTargetTexts count.
# The analyser returns a pair that is destructured into `allTokens` and `consolidated`.
[allTokens, consolidated] is analyser
consolidateAllWithLabels: allTargetTexts
pre: {:i thisTest progress: i + 1. }
post: { thisTest show. }.
allTokens should not = Nil.
consolidated should not = Nil.
# Publish the results for the tests that follow.
let[glblctx] allTokens { allTokens }.
let[glblctx] consolidated { consolidated }.
let[glblctx] allTargetTexts { allTargetTexts }.
}.
it can calculate categorised TF-IDF do: {:thisTest
# Uses `allTokens` and `consolidated` published by the tokenise test.
thisTest tests: 'TF-IDF'.
thisTest progressesUpTo: allTokens count.
var consolidateAll is analyser # We don't need TF-IDF, since there are only N categories
TFconsolidate: consolidated
withTokens: allTokens
pre: {:i thisTest progress: i + 1. }
post: { thisTest show. }.
consolidateAll should not = Nil.
# Persist the result as JSON in the file 'dists' and publish it for the next test.
File new: 'dists', write: (JSON serialize: consolidateAll).
let[glblctx] consolidateAll { consolidateAll }.
}.
it can detect keywords do: {:thisTest
# Read the file 'checks', split it on '\n' and each line on ':', and build a Map from
# the resulting arrays. Below, the map's keys are passed as the documents and its
# values as the keywords.
var suggestedKeywords is Map fromArray: (
File
new: 'checks', read split: '\n',
fmap: {\:x x split: ':'.}
).
thisTest tests: 'Keyword detection'.
# NOTE(review): the progress total is disabled here although the `pre:` callback below
# still reports progress - confirm this is intended.
# thisTest progressesUpTo: allTargetTexts count.
var kws is suggestedKeywords values.
var keywords-for-docs is analyser
listKeywords: consolidateAll
forKeywords: kws
andDocuments: suggestedKeywords keys
pre: {:i thisTest progress: i + 1. }
post: { thisTest show. }.
# One result entry is expected per suggested keyword entry.
keywords-for-docs count should = suggestedKeywords count.
# Walk the suggested keywords while stepping `idx` through the results in parallel,
# merging each result into `maps` under the current key.
var maps is Map new.
var idx is -1.
suggestedKeywords each: {:k:i
var thisres is keywords-for-docs at: (idx +=: 1). # ordered at kw order
maps merge: (Map new put: thisres at: k) with: \:k:v [i, k last + v last].
}.
# Turn the array stored for every key into a nested Map.
maps is maps kvmap: \:x [x head, Map fromArray: x last].
var topics is suggestedKeywords keys.
# Write one confidence line per (topic, keyword) pair into the file 'out', followed by
# the whole `maps` structure serialised as JSON.
var resultf is File new: 'out', open: 'w+'.
suggestedKeywords each: {:tp:kw
# `sum` adds up the value stored for `kw` under every topic; `kwval` is the value under
# the current topic, falling back to 0.
var sum is topics
fmap: \:x maps @ x @ kw,
foldl: {\:acc:x acc + x} accumulator: 0.
var kwval is maps @ tp @ kw or: 0.
# NOTE(review): if `sum` is 0 the interpolated `kwval / sum` divides by zero - confirm
# whether that can happen with the real data.
resultf write: 'Confidence for $$kw in $$tp is ${{kwval / sum}}$\n'.
}.
resultf write: (JSON serialize: maps).
resultf close.
}.
}.