kp_gen_eval_transfer.py
# -*- encoding: utf-8 -*-
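"""
Continuously scan an experiment directory for new keyphrase-generation checkpoints,
run inference ('pred') on the configured test sets, score the resulting .pred files
('eval'), and aggregate all scores into CSV reports ('report'). Several instances can
run in parallel: stale or incomplete output files are detected via their line counts
and modification times, and are re-generated.

Illustrative invocation (a sketch; further OpenNMT translate/model options such as
beam size, batch size or the -data config may be required depending on the setup):
    python kp_gen_eval_transfer.py -tasks pred eval -exp_root_dir exp/ -data_dir data/ \
        -testsets kp20k duc -splits test --onepass
"""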
import codecs
import json
import random
import shutil
from onmt.constants import ModelTask
from onmt.translate.translator import build_translator
from onmt.utils.parse import ArgumentParser
import os
import datetime
import time
import numpy as np
import kp_evaluate
from onmt.utils import split_corpus
from onmt.utils.logging import init_logger
import onmt.opts as opts
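# maps each training dataset to the test sets it may be evaluated on
# (used with --pred_trained_only to skip train/test mismatches)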
train_test_mappings = {
'kp20k': ['kp20k', 'kp20k_valid2k', 'inspec', 'krapivin', 'semeval', 'nus', 'duc'],
'openkp': ['openkp', 'openkp_valid2k', 'jptimes', 'duc'],
'kptimes': ['kptimes', 'kptimes_valid2k', 'jptimes', 'duc'],
'stackex': ['stackex', 'stackex_valid2k', 'duc'],
}
def scan_new_checkpoints(ckpt_dir):
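    """Walk ckpt_dir recursively and return metadata (experiment dir/name, path, name and
    training step) for every *.pt checkpoint whose file name contains a step marker."""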
ckpts = []
for subdir, dirs, files in os.walk(ckpt_dir):
for file in files:
if file.endswith('.pt'):
ckpt_name = file[: file.find('.pt')]
                if 'step' not in ckpt_name:
                    # skip checkpoints whose name carries no training step, e.g. fairseq's 'checkpoint_last.pt' or 'checkpoint54.pt'
                    continue
ckpt_path = os.path.join(subdir, file)
exp_dir = os.path.dirname(subdir)
exp_name = exp_dir[exp_dir.rfind('/') + 1: ]
ckpt_dict = {
'exp_dir': exp_dir,
'exp_name': exp_name,
'ckpt_path': ckpt_path,
'ckpt_name': ckpt_name,
'step': int(ckpt_name[ckpt_name.rfind('step_')+5:])
}
ckpts.append(ckpt_dict)
return ckpts
def scan_predictions(exp_root_dir):
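    """Collect every *.pred file under exp_root_dir, parsing the checkpoint step and the
    dataset name from file names of the form '<ckpt_name>-data_<dataset>.pred'."""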
preds = []
for subdir, dirs, files in os.walk(exp_root_dir):
for file in files:
if file.endswith('.pred'):
try:
pred_name = file[: -5]
step, data = pred_name.split('-')
step = int(step[step.rfind('step_') + 5:])
data = data[5:]
pred = {
'ckpt_name': pred_name,
'pred_path': os.path.join(subdir, file),
'dataset': data,
'step': step
}
preds.append(pred)
                except Exception:
                    print('Invalid pred file name: %s' % file)
return preds
def _get_parser():
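    """Build an ArgumentParser that carries OpenNMT's translate, model and dynamic-data options."""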
parser = ArgumentParser(description='run_kp_eval_transfer.py')
opts.translate_opts(parser)
# opts.train_opts(parser)
opts.model_opts(parser)
opts.dynamic_prepare_opts(parser, build_vocab_only=False)
return parser
if __name__ == "__main__":
parser = _get_parser()
    parser.add_argument('--tasks', '-tasks', nargs='+', type=str,
                        required=True,
                        choices=['pred', 'eval', 'report'],
                        help='Specify which processes to run: generation (pred), evaluation (eval) and/or report aggregation (report).')
parser.add_argument('-exp_root_dir', type=str, required=True, help='Directory to all checkpoints')
parser.add_argument('-data_dir', type=str, required=True, help='Directory to datasets (ground-truth)')
    parser.add_argument('--step_base', '-step_base', type=int, default=1,
                        help='Only evaluate checkpoints whose step is a multiple of step_base; '
                             '1 means evaluate every checkpoint.')
    parser.add_argument('-test_interval', type=int, default=600, help='Minimum time (in seconds) to wait before treating a .pred file that is no longer being updated as abandoned by another job.')
parser.add_argument('-testsets', nargs='+', type=str, default=['kp20k', 'openkp', 'stackex', 'kptimes', 'jptimes'], help='Specify datasets to test on')
    parser.add_argument('--pred_trained_only', '-pred_trained_only', action='store_true', help='If set, only run inference on test sets that match the dataset the checkpoint was trained on.')
    parser.add_argument('--ignore_existing', '-ignore_existing', action='store_true', help='If set, ignore previously generated results and re-run prediction/evaluation.')
    parser.add_argument('--empirical_result', '-empirical_result', action='store_true', help='If set, export empirical results to CSV when generating the report.')
    parser.add_argument('-splits', nargs='+', type=str, default=['train', 'test', 'valid'], help='Specify which data splits to test on.')
parser.add_argument('-tokenizer', type=str, default='split_nopunc', choices=['spacy', 'split', 'split_nopunc'],
help='Specify what tokenizer used in evaluation')
    parser.add_argument('--onepass', '-onepass', action='store_true', help='If set, scan and generate only once; otherwise loop indefinitely, scanning for newly available checkpoints.')
    parser.add_argument('--wait_patience', '-wait_patience', type=int, default=2, help='Terminate after this many consecutive scans in which no new prediction/evaluation job is done.')
    parser.add_argument('--wait_time', '-wait_time', type=int, default=120, help='Maximum random delay (in seconds) before the first scan, to avoid conflicts with concurrently launched jobs.')
    parser.add_argument('--sleep_time', '-sleep_time', type=int, default=600, help='Time (in seconds) to sleep between two consecutive scans.')
opt = parser.parse_args()
if isinstance(opt.data, str):
setattr(opt, 'data', json.loads(opt.data.replace('\'', '"')))
setattr(opt, 'data_task', ModelTask.SEQ2SEQ)
if opt.data:
ArgumentParser._get_all_transform(opt)
ArgumentParser._validate_transforms_opts(opt)
ArgumentParser._validate_fields_opts(opt)
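    # batching/bucketing options for the data iterator; set manually since translate_opts does not appear to define them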
opt.__setattr__('valid_batch_size', opt.batch_size)
opt.__setattr__('batch_size_multiple', 1)
opt.__setattr__('bucket_size', 128)
opt.__setattr__('pool_factor', 256)
# np.random.seed()
wait_time = np.random.randint(opt.wait_time) if opt.wait_time > 0 else 0
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # "%Y-%m-%d_%H:%M:%S"
if not os.path.exists(os.path.join(opt.exp_root_dir, 'logs')):
os.makedirs(os.path.join(opt.exp_root_dir, 'logs'))
logger = init_logger(opt.exp_root_dir + '/logs/autoeval_%s.log'
% (current_time))
if not opt.onepass:
logger.info('Sleep for %d sec to avoid conflicting with other threads' % wait_time)
time.sleep(wait_time)
    # generate the summarized report and exit
if 'report' in opt.tasks:
report_dir = os.path.join(opt.exp_root_dir, 'report')
if not os.path.exists(report_dir): os.makedirs(report_dir)
kp_evaluate.gather_eval_results(eval_root_dir=opt.exp_root_dir,
report_csv_dir=report_dir,
tokenizer=opt.tokenizer,
empirical_result=opt.empirical_result)
        logger.warning('Report generation finished, exiting!')
exit(0)
logger.info(opt)
testset_path_dict = {}
for testset in opt.testsets:
for split in opt.splits:
if not os.path.exists(opt.data_dir + '/%s/%s.json' % (testset, split)):
logger.info("Data does not exist, skip: %s-%s" % (testset, split))
continue
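            # shard_size=-1 keeps the whole file in a single shard; src and tgt are read from the same json file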
src_shard = split_corpus(opt.data_dir + '/%s/%s.json' % (testset, split), shard_size=-1)
tgt_shard = split_corpus(opt.data_dir + '/%s/%s.json' % (testset, split), shard_size=-1)
src_shard, tgt_shard = list(zip(src_shard, tgt_shard))[0]
src_shard = [json.loads(l) for l in src_shard]
tgt_shard = [json.loads(l) for l in tgt_shard]
logger.info("Loaded data from %s-%s: #src=%d, #tgt=%d" % (testset, split, len(src_shard), len(tgt_shard)))
testset_path_dict[testset+'_'+split] = (opt.data_dir + '/%s/%s.json' % (testset, split),
opt.data_dir + '/%s/%s.json' % (testset, split),
src_shard, tgt_shard)
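    # main loop: each pass scans for new checkpoints/predictions; current_patience counts
    # consecutive passes in which no prediction/evaluation work was actually done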
current_patience = opt.wait_patience
while True:
pred_linecount_dict = {}
eval_linecount_dict = {}
job_done = False # a flag indicating if any real pred/eval job is done
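        # do prediction/generation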
if 'pred' in opt.tasks:
ckpts = scan_new_checkpoints(opt.exp_root_dir)
ckpts = sorted(ckpts, key=lambda x:x['step'])
random.shuffle(ckpts)
logger.info('Found %d checkpoints from %s!' % (len(ckpts), opt.exp_root_dir))
if opt.step_base is not None and opt.step_base > 1:
num_ckpts = len(ckpts)
ckpts = [ckpt for ckpt in ckpts if ckpt['step'] % opt.step_base == 0 and ckpt['step'] // opt.step_base > 0]
logger.info('After filtering non opt.step_base=%d ckpts, found %d/%d checkpoints!' %
(opt.step_base, len(ckpts), num_ckpts))
# iterate each ckpt and start predicting/evaluating
for ckpt_id, ckpt in enumerate(ckpts):
ckpt_path = ckpt['ckpt_path']
ckpt_name = ckpt['ckpt_name']
exp_dir = ckpt['exp_dir']
exp_name = ckpt['exp_name']
logger.info("[%d/%d] Checking checkpoint: %s" % (ckpt_id, len(ckpts), ckpt_path))
setattr(opt, 'models', [ckpt['ckpt_path']])
for datasplit_name, dataset in testset_path_dict.items():
                    # skip the current test set/split if it was not used to train this checkpoint
if opt.pred_trained_only:
pass_flag = False
cur_testname = datasplit_name[: datasplit_name.index('_')] if '_' in datasplit_name else datasplit_name
for trainname, testnames in train_test_mappings.items():
# training dataset name appears in exp_name
if trainname in exp_name and cur_testname in testnames:
pass_flag = True
break
if not pass_flag:
print('Skip predict/evaluate test=[%s] for ckpt=[%s] due to train/test mismatch.' % (datasplit_name, exp_name))
continue
src_path, tgt_path, src_shard, tgt_shard = dataset
decoding_method = 'beamsearch-width_%d-maxlen_%d' % (opt.beam_size, opt.max_length)
pred_dir = os.path.join(exp_dir, 'outputs', decoding_method, 'pred')
if not os.path.exists(pred_dir): os.makedirs(pred_dir)
pred_file = '%s-data_%s.pred' % (ckpt_name, datasplit_name)
pred_path = os.path.join(pred_dir, pred_file)
# skip translation for this dataset if previous pred exists
do_trans_flag = True
if os.path.exists(pred_path):
elapsed_time = time.time() - os.stat(pred_path).st_mtime
if pred_path in pred_linecount_dict and pred_linecount_dict[pred_path] == len(src_shard):
                            # already counted in pred_linecount_dict with a matching line count, so this prediction is done
do_trans_flag = False
elif elapsed_time < opt.test_interval:
do_trans_flag = False
logger.info("Skip translating because previous PRED file was generated only %d sec ago (<%d sec). PRED file: %s"
% (elapsed_time, opt.test_interval, pred_path))
else:
                            # count the lines of an older file to check whether this prediction is complete
try:
                                with open(pred_path, 'r') as pred_f:
                                    pred_linecount_dict[pred_path] = sum(1 for _ in pred_f)
                                # if the line count matches the source, the prediction is complete
if pred_linecount_dict[pred_path] == len(src_shard):
do_trans_flag = False
logger.info("Skip translating because previous PRED is complete. PRED file: %s" % (pred_path))
else:
                                    # if the file was modified less than opt.test_interval sec ago, another job may still be working on it; otherwise it is a bad result, delete it
if elapsed_time < opt.test_interval:
do_trans_flag = False
else:
os.remove(pred_path)
logger.info('Removed a bad PRED file, #(line)=%d, #(elapsed_time)=%ds. PRED file: %s'
% (pred_linecount_dict[pred_path], int(elapsed_time), pred_path))
except Exception as e:
logger.exception('Error while validating or deleting PRED file: %s' % pred_path)
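                    # point the 'valid' data paths at the current test file (src and tgt share the same json file)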
opt.data['valid']['path_src'] = src_path
opt.data['valid']['path_tgt'] = src_path
opt.data['valid']['path_align'] = None
# do translation
try:
if do_trans_flag or opt.ignore_existing:
logger.info("*" * 50)
logger.info("Start translating [data=%s] for [exp=%s]-[ckpt=%s]." % (datasplit_name, exp_name, ckpt_name))
logger.info("\t exporting PRED result to %s." % (pred_path))
logger.info("*" * 50)
                            # for BART-based (fairseq) checkpoints, OpenNMT needs the fairseq/BART-specific decoding options switched on
if 'bart' in exp_name.lower() and not exp_name.lower().startswith('transformer'):
opt.__setattr__('fairseq_model', True)
opt.__setattr__('encoder_type', 'bart')
opt.__setattr__('decoder_type', 'bart')
opt.__setattr__('pretrained_tokenizer', True)
opt.__setattr__('copy_attn', False)
opt.__setattr__('model_dtype', 'fp16')
else:
opt.__setattr__('fairseq_model', False)
translator = build_translator(opt, report_score=opt.verbose, logger=logger)
# create an empty file to indicate that the translator is working on it
codecs.open(pred_path, 'w+', 'utf-8').close()
# set output_file for each dataset (instead of outputting to opt.output)
translator.out_file = codecs.open(pred_path, 'w+', 'utf-8')
_, _ = translator.translate(
src=src_shard,
batch_size=opt.batch_size,
attn_debug=opt.attn_debug,
opt=opt
)
job_done = True
logger.info("Complete translating [%s], PRED file: %s." % (datasplit_name, pred_path))
else:
logger.info("Skip translating [%s] for %s, PRED file: %s." % (datasplit_name, ckpt_name, pred_path))
except Exception as e:
logger.exception('Error while translating [%s], PRED file: %s.' % (datasplit_name, pred_path))
# do evaluation
if 'eval' in opt.tasks:
new_preds = scan_predictions(opt.exp_root_dir)
new_preds = sorted(new_preds, key=lambda x: x['step'])
random.shuffle(new_preds)
logger.info('Found %d predictions from %s!' % (len(new_preds), opt.exp_root_dir))
for pred_id, pred in enumerate(new_preds):
ckpt_name = pred['ckpt_name']
pred_path = pred['pred_path']
datasplit_name = pred['dataset']
if datasplit_name not in testset_path_dict: continue
logger.info("[%d/%d] Checking prediction: %s" % (pred_id, len(new_preds), pred_path))
eval_path = pred_path[: -5] + '.%s.eval' % opt.tokenizer
printout_path = pred_path[: -5] + '.%s.report' % opt.tokenizer
do_eval_flag = True
try:
src_path, tgt_path, src_shard, tgt_shard = testset_path_dict[datasplit_name]
                    if pred_path not in pred_linecount_dict:  # may be out of date
                        with open(pred_path, 'r') as pred_f:
                            pred_linecount_dict[pred_path] = sum(1 for _ in pred_f)
num_pred = pred_linecount_dict[pred_path]
if num_pred != len(src_shard):
do_eval_flag = False
logger.info("Skip evaluating because current PRED file is not complete, #(line)=%d. PRED file: %s"
% (num_pred, pred_path))
elapsed_time = time.time() - os.stat(pred_path).st_mtime
if elapsed_time > opt.test_interval:
os.remove(pred_path)
logger.warning('Removed a bad PRED file, #(line)=%d, #(elapsed_time)=%ds. PRED file: %s'
% (num_pred, int(elapsed_time), pred_path))
del pred_linecount_dict[pred_path]
else:
# if pred is good, check if re-eval is necessary
if os.path.exists(eval_path):
elapsed_time = time.time() - os.stat(eval_path).st_mtime
if eval_path in eval_linecount_dict and eval_linecount_dict[eval_path] == len(src_shard):
do_eval_flag = False
elif elapsed_time < opt.test_interval:
                                # if the file was modified less than opt.test_interval sec ago, another job may still be processing it
do_eval_flag = False
logger.info("Skip evaluating because previous EVAL file was generated only %d sec ago (<%d sec). EVAL file: %s"
% (elapsed_time, opt.test_interval, eval_path))
else:
try:
score_dict = json.load(open(eval_path, 'r'))
if 'present_exact_correct@5' in score_dict:
num_eval = len(score_dict['present_exact_correct@5'])
else:
num_eval = 0
eval_linecount_dict[eval_path] = num_eval
if num_eval == len(src_shard):
do_eval_flag = False
logger.info("Skip evaluating because existing eval file is complete.")
else:
# it's a bad result and delete it
os.remove(eval_path)
logger.info('Removed a bad eval file, #(pred)=%d, #(eval)=%d, #(elapsed_time)=%ds: %s' % (num_pred, num_eval, int(elapsed_time), eval_path))
                                except Exception:
os.remove(eval_path)
logger.info('Removed a bad eval file: %s' % (eval_path))
except Exception as e:
logger.exception('Error while validating or deleting EVAL file: %s' % eval_path)
try:
if do_eval_flag or opt.ignore_existing:
logger.info("*" * 50)
logger.info("Start evaluating [%s] for %s" % (datasplit_name, ckpt_name))
logger.info("\t will export eval result to %s." % (eval_path))
logger.info("*" * 50)
codecs.open(eval_path, 'w+', 'utf-8').close()
score_dict = kp_evaluate.keyphrase_eval(datasplit_name,
src_path, tgt_path,
pred_path=pred_path, logger=logger,
verbose=opt.verbose,
report_path=printout_path,
tokenizer=opt.tokenizer
)
if score_dict is not None:
with open(eval_path, 'w') as output_json:
output_json.write(json.dumps(score_dict)+'\n')
job_done = True
else:
logger.info("Skip evaluating [%s] for %s, EVAL file: %s." % (datasplit_name, ckpt_name, eval_path))
except Exception as e:
logger.exception('Error while evaluating')
        if job_done:  # reset current_patience if any real pred/eval job was done in this iteration
current_patience = opt.wait_patience
else:
current_patience -= 1
if opt.onepass:
break
if current_patience <= 0:
break
else:
            # sleep before scanning again (opt.sleep_time sec, 600 by default)
sleep_time = opt.sleep_time
logger.info('Sleep for %d sec, current_patience=%d' % (sleep_time, current_patience))
logger.info('*' * 50)
time.sleep(sleep_time)