# spec.ctr
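
# Specs for the Persian text Lexer and the TokenAnalyser, run with the
# 'describe' helper from Library/Test.
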
# Raise the memory limit to 8 GiB and enable automatic allocation.
Broom memoryLimit: 1024 * 1024 * 1024 * 8.
Broom autoAlloc: True.
import main: \*.
import
    Library/Test: 'describe'
.
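
# The Lexer should split raw Persian text into word and punctuation
# tokens once it has been primed with the collation table.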
describe Lexer do: {:it:let
    it exists do: {
        { Lexer. } should not raiseError: String.
    }.
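    # Shared fixtures: a lexer instance and the collation list, read from
    # the 'collation' file (each line split into at most two fields).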
    let[Reflect thisContext] lexer { Lexer }.
    let[Reflect thisContext] collist {
        File new: 'collation', read split: '\n',
            fmap: {:x ^x split: ' ' max: 1.}
    }.
    it ~ 'is not Nil' do: {
        lexer should not = Nil.
        # Prime the lexer with the collation table for the tests below.
        lexer collate: collist.
    }.
    it can lex simple Persian sentences do: {
        (lexer split: 'کلمات فارسی')
            should = ['کلمات', 'فارسی'].
    }.
    it can lex Persian sentences with half-width spaces do: {
        (lexer split: 'این جمله (خیلی) پیچیده نیست!!!')
            should = ['این', 'جمله', '(', 'خیلی', ')', 'پیچیده', 'نیست', '!', '!', '!'].
    }.
    it can lex arbitrary Persian text do: {
        # Pen writeln:
        (lexer split: 'میشدید اون پارسر ترم رو دیدی؟ یه تیکه هست مچ رو درست پیدا میکنه ولی توشو خالی میده')
            should = [
                'میشدید', 'اون', 'پارسر',
                'ترم', 'رو', 'دیدی', '؟', 'یه',
                'تیکه', 'هست', 'مچ', 'رو', 'درست',
                'پیدا', 'میکنه', 'ولی', 'توشو',
                'خالی', 'میده'
            ].
        # .
    }.
}.
#:language XFrozen
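
# The TokenAnalyser consolidates token statistics over the sample corpus,
# computes a relative standard deviation (RSD) per token, and derives
# stopword candidates from the RSD ranking.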
describe TokenAnalyser do: {:it:let
    it exists do: {
        { TokenAnalyser. } should not raiseError: String.
    }.
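    # Shared state: populated by the tokenise step below and consumed by
    # the RSD and stopword tests that follow.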
    let[Reflect thisContext] lexer { Lexer }.
    let[Reflect thisContext] analyser { TokenAnalyser with: lexer }.
    let[Reflect thisContext] glblctx { Reflect thisContext }.
    let[Reflect thisContext] consolidateAll { Nil }.
    let[Reflect thisContext] DFconsAll { Nil }.
    let[Reflect thisContext] tokenMap { Nil }.
    let[Reflect thisContext] allTargetTexts { Nil }.
    # sliceCount caps how many sample files are processed per run.
    let[Reflect thisContext] sliceCount { 300 }.
    it can tokenise do: {:thisTest
        # Load up to sliceCount sample files whose names start with 'sample'.
        allTargetTexts is (
            File
                list: 'samples',
                filter: (\:_:descr (descr @ 'file' startsWith: 'sample')),
                from: 0 length: sliceCount, asGenerator
                fmap: {:descr File new: 'samples/' + (descr @ 'file'), read.}
        ).
        thisTest tests: 'Tokeniser'.
        thisTest progressesUpTo: sliceCount.
        var res is analyser
            SWconsolidate: allTargetTexts
            pre: {:i thisTest progress: i. }
            post: { thisTest show. }.
        consolidateAll is res @ 0.
        DFconsAll is res @ 1.
        tokenMap is res @ 3.
        # consolidateAll display.
        # DFconsAll display.
    }.
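    # RSD (relative standard deviation: standard deviation divided by the
    # mean) is computed per token over the consolidated document counts.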
    it can calculate rsd do: {:thisTest
        thisTest tests: 'Relative Standard Deviation'.
        var size is consolidateAll size.
        thisTest progressesUpTo: size head.
        var rsd is analyser
            calculateRSD: consolidateAll
            withMap: tokenMap
            andDFVector: DFconsAll
            pre: {:i thisTest progress: i. }
            post: { thisTest show. }
        .
        # Expose rsd to the outer context so the stopword test can see it.
        let[glblctx] rsd { rsd }.
        # Dump rsd to a file for manual inspection.
        var fs is File new: 'rsd', open: 'w+'.
        rsd each: {:k:v
            fs write: '${{tokenMap at: k}}$\t${{v}}$\n'.
        }.
        fs close.
    }.
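    # The 300 tokens ranked by RSD are written out as stopword candidates,
    # one per line.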
    it can detect stopwords do: {:thisTest
        thisTest tests: 'Stopword detection'.
        var kfs is File new: 'gen-stopwords', open: 'w+'.
        kfs write: '%:L' % ['\n', (
            rsd
                imap: \:i:x [x, i],
                sort: {:x:y ^x head > y head.},
                take: 300,
                fmap!: \:x tokenMap at: x last
        )].
        kfs close.
    }.
}.