# config.toml

# Root-level resource paths. Every path here is written relative ("../...").
# NOTE(review): presumably they are resolved against the directory the
# program is started from -- confirm with the consumer.
stop-words = "../data/lemur-stopwords.txt"
libsvm-modules = "../deps/libsvm-modules/"
# Data directory. The `treebank` entries in [crf], [sequence] and [parser]
# are annotated "relative to data prefix" -- presumably this key.
prefix = "../data/"
function-words = "../data/function-words.txt"
# Sentence-boundary resources: a punctuation list plus sentence-start and
# sentence-end exception lists, all under data/sentence-boundaries/.
punctuation = "../data/sentence-boundaries/sentence-punctuation.txt"
start-exceptions = "../data/sentence-boundaries/sentence-start-exceptions.txt"
end-exceptions = "../data/sentence-boundaries/sentence-end-exceptions.txt"

# Corpus selection and indexing options.
# `dataset` names the dataset folder that the `corpus` file lives in.
dataset = "ceeaus"
corpus = "line.toml" # located inside dataset folder
# NOTE(review): presumably the name/location of the index built for the
# dataset -- confirm.
index = "ceeaus"
indexer-ram-budget = 1024 # **estimated** RAM budget for indexing in MB
                          # always set this lower than your physical RAM!
# indexer-num-threads = 8 # default value is system thread concurrency

# Each [[analyzers]] header appends one entry to the `analyzers` array.
# This file defines a single entry: word n-grams with n = 1 (unigrams),
# using the filter named "default-unigram-chain".
[[analyzers]]
method = "ngram-word"
ngram = 1
filter = "default-unigram-chain"

# Batch query settings. The query file referenced by `query-path` is not
# shipped and has to be created by the user; `query-judgements` stays
# commented out unless IR evaluation is wanted.
[query-runner]
#query-judgements = "../data/ceeaus-qrels.txt"  # uncomment to run IR eval
query-path = "../queries.txt"  # create this file!
trec-format = false            # default: false
max-results = 10               # default: 10
query-id-start = 1             # default: 1

# Ranking function used for retrieval, plus its parameters.
# NOTE(review): k1, b and k3 match the free parameters of the standard
# Okapi BM25 formulation (term-frequency saturation, document-length
# normalization, query-term-frequency saturation) -- confirm the consumer
# uses them that way.
[ranker]
method = "bm25"
k1 = 1.2
b = 0.75
k3 = 500

# Classifier: a "one-vs-all" wrapper whose underlying classifier is
# configured in the [classifier.base] sub-table.
[classifier]
method = "one-vs-all"
# Base classifier for the wrapper above: SGD with hinge loss.
[classifier.base]
method = "sgd"
loss = "hinge"

# Regressor: SGD with least-squares loss.
[regressor]
method = "sgd"
loss = "least-squares"

# LDA topic-model settings: inference method, iteration cap, number of
# topics and the prefix used for the model.
# NOTE(review): alpha and beta are presumably the Dirichlet prior
# hyperparameters (document-topic and topic-word respectively) -- confirm.
[lda]
inference = "gibbs"
max-iters = 1000
alpha = 1.0
beta = 1.0
topics = 4
model-prefix = "lda-model"

# CRF data settings: which treebank/corpus to read and how it is split
# into train/dev/test.
# NOTE(review): each *-sections value is a two-element array, presumably an
# inclusive [first, last] range of section numbers -- confirm.
# NOTE(review): section-size is presumably the maximum number of files per
# section -- confirm.
[crf]
prefix = "crf"
treebank = "penn-treebank" # relative to data prefix
corpus = "wsj"
section-size = 99
train-sections = [0, 18]
dev-sections = [19, 21]
test-sections = [22, 24]

# Language model: path to the ARPA file and the prefix used for its binary
# files.
# NOTE(review): binary-file-prefix has no directory component; where those
# files end up is not visible from this file.
[language-model]
arpa-file = "../data/english-sentences.arpa"
binary-file-prefix = "english-sentences-"

# Settings for the diff component: two integer limits (n-value, max-edits)
# and four float penalties, all explicitly set to their zero default here.
[diff]
n-value = 3
max-edits = 3
# penalty defaults are all zero (no penalty)
base-penalty = 0.0 # base penalty is for any edit
insert-penalty = 0.0
substitute-penalty = 0.0
remove-penalty = 0.0

# Feature selection using the "info-gain" method.
# NOTE(review): features-per-class is presumably the number of features
# kept for each class label -- confirm.
[features]
method = "info-gain"
prefix = "features"
features-per-class = 20

# Sequence tagger data settings (prefix "perceptron-tagger"): which
# treebank/corpus to read and how it is split into train/dev/test.
# NOTE(review): each *-sections value is a two-element array, presumably an
# inclusive [first, last] range of section numbers -- confirm.
[sequence]
prefix = "perceptron-tagger"
treebank = "penn-treebank" # relative to data prefix
corpus = "wsj"
section-size = 99
train-sections = [0, 18]
dev-sections = [19, 21]
test-sections = [22, 24]

# Parser data settings: which treebank/corpus to read and how it is split
# into train/dev/test. Note the split differs from [crf] and [sequence].
# NOTE(review): the [22, 22] / [23, 23] values suggest each array is an
# inclusive [first, last] section range, with a single section written as
# the same number twice -- confirm.
[parser]
prefix = "parser"
treebank = "penn-treebank" # relative to data prefix
corpus = "wsj"
section-size = 99
train-sections = [2, 21]
dev-sections = [22, 22]
test-sections = [23, 23]

# Word-embedding settings: output prefix, token filter chain and vector
# dimensionality.
[embeddings]
prefix = "word-embeddings"
# Filter chain, one filter per line; each element is an inline table.
filter = [
    { type = "icu-tokenizer", suppress-tags = true },
    { type = "lowercase" },
]
vector-size = 50

# Vocabulary constraints for the embeddings (minimum count, maximum size).
[embeddings.vocab]
min-count = 10
max-size = 500000