% reference.bib -- forked from ZihengZZH/awesome-multimodal-machine-translation
@inproceedings{jia2011learning,
  author       = {Jia, Yangqing and Salzmann, Mathieu and Darrell, Trevor},
  title        = {Learning cross-modality similarity for multinomial data},
  booktitle    = {2011 International Conference on Computer Vision},
  organization = {IEEE},
  pages        = {2407--2414},
  year         = {2011},
}
@article{mao2014explain,
  author  = {Mao, Junhua and Xu, Wei and Yang, Yi and Wang, Jiang and Yuille, Alan L},
  title   = {Explain images with multimodal recurrent neural networks},
  journal = {arXiv preprint arXiv:1410.1090},
  year    = {2014},
}
@article{kiros2014unifying,
  author  = {Kiros, Ryan and Salakhutdinov, Ruslan and Zemel, Richard S},
  title   = {Unifying visual-semantic embeddings with multimodal neural language models},
  journal = {arXiv preprint arXiv:1411.2539},
  year    = {2014},
}
@inproceedings{ma2015multimodal,
  author    = {Ma, Lin and Lu, Zhengdong and Shang, Lifeng and Li, Hang},
  title     = {Multimodal Convolutional Neural Networks for Matching Image and Sentence},
  booktitle = {The IEEE International Conference on Computer Vision (ICCV)},
  month     = dec,
  year      = {2015},
}
@article{ferraro2015survey,
  author  = {Ferraro, Francis and Mostafazadeh, Nasrin and Vanderwende, Lucy
             and Devlin, Jacob and Galley, Michel and Mitchell, Margaret and others},
  title   = {A survey of current datasets for vision and language research},
  journal = {arXiv preprint arXiv:1506.06833},
  year    = {2015},
}
% NOTE(review): the key keeps its "2014" prefix so existing citations still resolve.
% The arXiv identifier and the reading of the original month string "05/07/2015"
% as May were supplied during review -- confirm against the paper.
@article{mao2014deep,
  author  = {Mao, Junhua and Xu, Wei and Yang, Yi and Wang, Jiang and Huang, Zhiheng and Yuille, Alan},
  title   = {Deep Captioning with Multimodal Recurrent Neural Networks ({m-RNN})},
  journal = {arXiv preprint arXiv:1412.6632},
  number  = {033},
  month   = may,
  year    = {2015},
}
@inproceedings{qin2019look,
  author    = {Qin, Yu and Du, Jiajun and Zhang, Yonghua and Lu, Hongtao},
  title     = {Look Back and Predict Forward in Image Captioning},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages     = {8367--8375},
  year      = {2019},
}
@article{caglayan2016multimodality,
  author  = {Caglayan, Ozan and Aransa, Walid and Wang, Yaxing and Masana, Marc
             and Garc{\'\i}a-Mart{\'\i}nez, Mercedes and Bougares, Fethi
             and Barrault, Lo{\"\i}c and van de Weijer, Joost},
  title   = {Does multimodality help human and machine for translation and image captioning?},
  journal = {arXiv preprint arXiv:1605.09186},
  year    = {2016},
}
@article{caglayan2016multimodal,
  author  = {Caglayan, Ozan and Barrault, Lo{\"\i}c and Bougares, Fethi},
  title   = {Multimodal attention for neural machine translation},
  journal = {arXiv preprint arXiv:1609.03976},
  year    = {2016},
}
@inproceedings{huang2016attention,
  author    = {Huang, Po-Yao and Liu, Frederick and Shiang, Sz-Rung and Oh, Jean and Dyer, Chris},
  title     = {Attention-based multimodal neural machine translation},
  booktitle = {Proceedings of the First Conference on Machine Translation: Volume 2, Shared Task Papers},
  pages     = {639--645},
  year      = {2016},
}
@incollection{yang2016review,
  author    = {Yang, Zhilin and Yuan, Ye and Wu, Yuexin and Cohen, William W and Salakhutdinov, Ruslan R},
  title     = {Review Networks for Caption Generation},
  booktitle = {Advances in Neural Information Processing Systems 29},
  editor    = {D. D. Lee and M. Sugiyama and U. V. Luxburg and I. Guyon and R. Garnett},
  publisher = {Curran Associates, Inc.},
  pages     = {2361--2369},
  year      = {2016},
  url       = {http://papers.nips.cc/paper/6167-review-networks-for-caption-generation.pdf},
}
@inproceedings{yang2016hierarchical,
  author    = {Yang, Zichao and Yang, Diyi and Dyer, Chris and He, Xiaodong
               and Smola, Alex and Hovy, Eduard},
  title     = {Hierarchical Attention Networks for Document Classification},
  booktitle = {Proceedings of the 2016 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies},
  publisher = {Association for Computational Linguistics},
  address   = {San Diego, California},
  month     = jun,
  year      = {2016},
  pages     = {1480--1489},
  doi       = {10.18653/v1/N16-1174},
  url       = {https://www.aclweb.org/anthology/N16-1174},
}
@inproceedings{you2016image,
  author    = {You, Quanzeng and Jin, Hailin and Wang, Zhaowen and Fang, Chen and Luo, Jiebo},
  title     = {Image Captioning With Semantic Attention},
  booktitle = {The IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  month     = jun,
  year      = {2016},
}
@inproceedings{vaswani2017attention,
  author    = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob
               and Jones, Llion and Gomez, Aidan N and Kaiser, {\L}ukasz and Polosukhin, Illia},
  title     = {Attention is all you need},
  booktitle = {Advances in neural information processing systems},
  pages     = {5998--6008},
  year      = {2017},
}
@article{chen2017teacher,
  author  = {Chen, Yun and Liu, Yang and Cheng, Yong and Li, Victor O. K.},
  title   = {A teacher-student framework for zero-resource neural machine translation},
  journal = {arXiv preprint arXiv:1705.00753},
  year    = {2017},
}
@article{nakayama2017zeroresource,
  author    = {Nakayama, Hideki and Nishida, Noriki},
  title     = {Zero-resource machine translation by multimodal encoder--decoder network with multimedia pivot},
  journal   = {Machine Translation},
  publisher = {Springer},
  volume    = {31},
  number    = {1-2},
  pages     = {49--64},
  year      = {2017},
}
@article{delbrouck2017multimodal,
  author  = {Delbrouck, Jean-Benoit and Dupont, St{\'e}phane},
  title   = {Multimodal compact bilinear pooling for multimodal neural machine translation},
  journal = {arXiv preprint arXiv:1703.08084},
  year    = {2017},
}
@article{lala2017unraveling,
  author    = {Lala, Chiraag and Madhyastha, Pranava and Wang, Josiah and Specia, Lucia},
  title     = {Unraveling the contribution of image captioning and neural machine translation for multimodal machine translation},
  journal   = {The Prague Bulletin of Mathematical Linguistics},
  publisher = {De Gruyter Open},
  volume    = {108},
  number    = {1},
  pages     = {197--208},
  year      = {2017},
}
@article{elliott2017findings,
  author  = {Elliott, Desmond and Frank, Stella and Barrault, Lo{\"\i}c and Bougares, Fethi and Specia, Lucia},
  title   = {Findings of the second shared task on multimodal machine translation and multilingual image description},
  journal = {arXiv preprint arXiv:1710.07177},
  year    = {2017},
}
@incollection{xia2017delibertaion,
  author    = {Xia, Yingce and Tian, Fei and Wu, Lijun and Lin, Jianxin and Qin, Tao and Yu, Nenghai and Liu, Tie-Yan},
  title     = {Deliberation Networks: Sequence Generation Beyond One-Pass Decoding},
  booktitle = {Advances in Neural Information Processing Systems 30},
  editor    = {I. Guyon and U. V. Luxburg and S. Bengio and H. Wallach and R. Fergus and S. Vishwanathan and R. Garnett},
  publisher = {Curran Associates, Inc.},
  pages     = {1784--1794},
  year      = {2017},
  url       = {http://papers.nips.cc/paper/6775-deliberation-networks-sequence-generation-beyond-one-pass-decoding.pdf},
}
@article{elliott2017imagination,
  author  = {Elliott, Desmond and K{\'a}d{\'a}r, {\'A}kos},
  title   = {Imagination improves multimodal translation},
  journal = {arXiv preprint arXiv:1705.04350},
  year    = {2017},
}
@article{calixto2017doubly,
  author  = {Calixto, Iacer and Liu, Qun and Campbell, Nick},
  title   = {Doubly-attentive decoder for multi-modal neural machine translation},
  journal = {arXiv preprint arXiv:1702.01287},
  year    = {2017},
}
@inproceedings{libovicky2017attention,
  author    = {Libovick{\'y}, Jind{\v{r}}ich and Helcl, Jind{\v{r}}ich},
  title     = {Attention Strategies for Multi-Source Sequence-to-Sequence Learning},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)},
  publisher = {Association for Computational Linguistics},
  address   = {Vancouver, Canada},
  month     = jul,
  year      = {2017},
  pages     = {196--202},
  doi       = {10.18653/v1/P17-2031},
  url       = {https://www.aclweb.org/anthology/P17-2031},
}
@inproceedings{calixto2017incorporating,
  author    = {Calixto, Iacer and Liu, Qun},
  title     = {Incorporating Global Visual Features into Attention-based Neural Machine Translation},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  publisher = {Association for Computational Linguistics},
  address   = {Copenhagen, Denmark},
  month     = sep,
  year      = {2017},
  pages     = {992--1003},
  doi       = {10.18653/v1/D17-1105},
  url       = {https://www.aclweb.org/anthology/D17-1105},
}
% NOTE(review): venue, pages and identifiers were filled in during review -- confirm against the ACL Anthology record.
@inproceedings{barrault2018findings,
  author    = {Barrault, Lo{\"\i}c and Bougares, Fethi and Specia, Lucia and Lala, Chiraag and Elliott, Desmond and Frank, Stella},
  title     = {Findings of the third shared task on multimodal machine translation},
  booktitle = {Proceedings of the Third Conference on Machine Translation: Shared Task Papers},
  publisher = {Association for Computational Linguistics},
  address   = {Brussels, Belgium},
  month     = oct,
  year      = {2018},
  pages     = {304--323},
  doi       = {10.18653/v1/W18-6402},
  url       = {https://www.aclweb.org/anthology/W18-6402},
}
@inproceedings{caglayan2018LIUM-CVC,
  author    = {Caglayan, Ozan and Bardet, Adrien and Bougares, Fethi and Barrault, Lo{\"\i}c
               and Wang, Kai and Masana, Marc and Herranz, Luis and van de Weijer, Joost},
  title     = {{LIUM}-{CVC} Submissions for {WMT}18 Multimodal Translation Task},
  booktitle = {Proceedings of the Third Conference on Machine Translation: Shared Task Papers},
  publisher = {Association for Computational Linguistics},
  address   = {Brussels, Belgium},
  month     = oct,
  year      = {2018},
  pages     = {597--602},
  doi       = {10.18653/v1/W18-6438},
  url       = {https://www.aclweb.org/anthology/W18-6438},
}
@inproceedings{gronroos2018MeMAD,
  author    = {Gr{\"o}nroos, Stig-Arne and Huet, Benoit and Kurimo, Mikko and Laaksonen, Jorma
               and Merialdo, Bernard and Pham, Phu and Sj{\"o}berg, Mats and Sulubacak, Umut
               and Tiedemann, J{\"o}rg and Troncy, Raphael and V{\'a}zquez, Ra{\'u}l},
  title     = {The {M}e{MAD} Submission to the {WMT}18 Multimodal Translation Task},
  booktitle = {Proceedings of the Third Conference on Machine Translation: Shared Task Papers},
  publisher = {Association for Computational Linguistics},
  address   = {Brussels, Belgium},
  month     = oct,
  year      = {2018},
  pages     = {603--611},
  doi       = {10.18653/v1/W18-6439},
  url       = {https://www.aclweb.org/anthology/W18-6439},
}
@inproceedings{gwinnup2018AFRL-Ohio,
  author    = {Gwinnup, Jeremy and Sandvick, Joshua and Hutt, Michael and Erdmann, Grant
               and Duselis, John and Davis, James},
  title     = {The {AFRL}-Ohio State {WMT}18 Multimodal System: Combining Visual with Traditional},
  booktitle = {Proceedings of the Third Conference on Machine Translation: Shared Task Papers},
  publisher = {Association for Computational Linguistics},
  address   = {Brussels, Belgium},
  month     = oct,
  year      = {2018},
  pages     = {612--615},
  doi       = {10.18653/v1/W18-6440},
  url       = {https://www.aclweb.org/anthology/W18-6440},
}
@inproceedings{helcl2018CUNI,
  author    = {Helcl, Jind{\v{r}}ich and Libovick{\'y}, Jind{\v{r}}ich and Vari{\v{s}}, Du{\v{s}}an},
  title     = {{CUNI} System for the {WMT}18 Multimodal Translation Task},
  booktitle = {Proceedings of the Third Conference on Machine Translation: Shared Task Papers},
  publisher = {Association for Computational Linguistics},
  address   = {Brussels, Belgium},
  month     = oct,
  year      = {2018},
  pages     = {616--623},
  doi       = {10.18653/v1/W18-6441},
  url       = {https://www.aclweb.org/anthology/W18-6441},
}
@inproceedings{lala2018sheffield,
  author    = {Lala, Chiraag and Madhyastha, Pranava Swaroop and Scarton, Carolina and Specia, Lucia},
  title     = {{S}heffield Submissions for {WMT}18 Multimodal Translation Shared Task},
  booktitle = {Proceedings of the Third Conference on Machine Translation: Shared Task Papers},
  publisher = {Association for Computational Linguistics},
  address   = {Brussels, Belgium},
  month     = oct,
  year      = {2018},
  pages     = {624--631},
  doi       = {10.18653/v1/W18-6442},
  url       = {https://www.aclweb.org/anthology/W18-6442},
}
@inproceedings{zheng2018ensemble,
  author    = {Zheng, Renjie and Yang, Yilin and Ma, Mingbo and Huang, Liang},
  title     = {Ensemble Sequence Level Training for Multimodal {MT}: {OSU}-{B}aidu {WMT}18 Multimodal Machine Translation System Report},
  booktitle = {Proceedings of the Third Conference on Machine Translation: Shared Task Papers},
  publisher = {Association for Computational Linguistics},
  address   = {Brussels, Belgium},
  month     = oct,
  year      = {2018},
  pages     = {632--636},
  doi       = {10.18653/v1/W18-6443},
  url       = {https://www.aclweb.org/anthology/W18-6443},
}
@article{delbrouck2018UMONS,
  author  = {Delbrouck, Jean-Benoit and Dupont, St{\'e}phane},
  title   = {{UMONS} submission for {WMT18} multimodal translation task},
  journal = {arXiv preprint arXiv:1810.06233},
  year    = {2018},
}
@inproceedings{libovicky2018input,
  author    = {Libovick{\'y}, Jind{\v{r}}ich and Helcl, Jind{\v{r}}ich and Mare{\v{c}}ek, David},
  title     = {Input Combination Strategies for Multi-Source Transformer Decoder},
  booktitle = {Proceedings of the Third Conference on Machine Translation: Research Papers},
  publisher = {Association for Computational Linguistics},
  address   = {Brussels, Belgium},
  month     = oct,
  year      = {2018},
  pages     = {253--260},
  doi       = {10.18653/v1/W18-6326},
  url       = {https://www.aclweb.org/anthology/W18-6326},
}
@inproceedings{shin2018multi,
  author    = {Shin, Jaehun and Lee, Jong-Hyeok},
  title     = {Multi-encoder Transformer Network for Automatic Post-Editing},
  booktitle = {Proceedings of the Third Conference on Machine Translation: Shared Task Papers},
  publisher = {Association for Computational Linguistics},
  address   = {Brussels, Belgium},
  month     = oct,
  year      = {2018},
  pages     = {840--845},
  doi       = {10.18653/v1/W18-6470},
  url       = {https://www.aclweb.org/anthology/W18-6470},
}
@article{zhou2018visual,
  author  = {Zhou, Mingyang and Cheng, Runxiang and Lee, Yong Jae and Yu, Zhou},
  title   = {A visual attention grounding neural model for multimodal machine translation},
  journal = {arXiv preprint arXiv:1808.08266},
  year    = {2018},
}
@article{miculicich2018document,
  author  = {Miculicich, Lesly and Ram, Dhananjay and Pappas, Nikolaos and Henderson, James},
  title   = {Document-level neural machine translation with hierarchical attention networks},
  journal = {arXiv preprint arXiv:1809.01576},
  year    = {2018},
}
@article{devlin2018bert,
  author  = {Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
  title   = {{BERT}: Pre-training of deep bidirectional transformers for language understanding},
  journal = {arXiv preprint arXiv:1810.04805},
  year    = {2018},
}
@article{yang2018improving,
  author  = {Yang, Zhen and Chen, Wei and Wang, Feng and Xu, Bo},
  title   = {Improving neural machine translation with conditional sequence generative adversarial nets},
  journal = {arXiv preprint arXiv:1703.04887},
  year    = {2018},
}
@article{wu2018adversarial,
  author  = {Wu, Lijun and Xia, Yingce and Zhao, Li and Tian, Fei and Qin, Tao and Lai, Jianhuang and Liu, Tie-Yan},
  title   = {Adversarial neural machine translation},
  journal = {arXiv preprint arXiv:1704.06933},
  year    = {2018},
}
@inproceedings{anderson2018bottom,
  author    = {Anderson, Peter and He, Xiaodong and Buehler, Chris and Teney, Damien
               and Johnson, Mark and Gould, Stephen and Zhang, Lei},
  title     = {Bottom-Up and Top-Down Attention for Image Captioning and Visual Question Answering},
  booktitle = {The IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  month     = jun,
  year      = {2018},
}
@article{caglayan2019probing,
  author  = {Caglayan, Ozan and Madhyastha, Pranava and Specia, Lucia and Barrault, Lo{\"\i}c},
  title   = {Probing the Need for Visual Context in Multimodal Machine Translation},
  journal = {arXiv preprint arXiv:1903.08678},
  year    = {2019},
}
@inproceedings{su2019unsupervised,
  author    = {Su, Yuanhang and Fan, Kai and Bach, Nguyen and Kuo, C-C Jay and Huang, Fei},
  title     = {Unsupervised multi-modal neural machine translation},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages     = {10482--10491},
  year      = {2019},
}
@article{ive2019distilling,
  author  = {Ive, Julia and Madhyastha, Pranava and Specia, Lucia},
  title   = {Distilling Translations with Visual Awareness},
  journal = {arXiv preprint arXiv:1906.07701},
  year    = {2019},
}
@inproceedings{calixto2019latent,
  author    = {Calixto, Iacer and Rios, Miguel and Aziz, Wilker},
  title     = {Latent Variable Model for Multi-modal Translation},
  booktitle = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics},
  pages     = {6392--6405},
  year      = {2019},
}
@article{chen2019from,
  author  = {Chen, Shizhe and Jin, Qin and Fu, Jianlong},
  title   = {From Words to Sentences: A Progressive Learning Approach for Zero-resource Machine Translation with Visual Pivots},
  journal = {arXiv preprint arXiv:1906.00872},
  year    = {2019},
}
@article{dai2019transformerxl,
  author  = {Dai, Zihang and Yang, Zhilin and Yang, Yiming and Cohen, William W
             and Carbonell, Jaime and Le, Quoc V and Salakhutdinov, Ruslan},
  title   = {{Transformer-XL}: Attentive language models beyond a fixed-length context},
  journal = {arXiv preprint arXiv:1901.02860},
  year    = {2019},
}
@article{yang2019xlnet,
  author  = {Yang, Zhilin and Dai, Zihang and Yang, Yiming and Carbonell, Jaime
             and Salakhutdinov, Ruslan and Le, Quoc V},
  title   = {{XLNet}: Generalized Autoregressive Pretraining for Language Understanding},
  journal = {arXiv preprint arXiv:1906.08237},
  year    = {2019},
}
@inproceedings{liu2019hierarchical,
  author    = {Liu, Yang and Lapata, Mirella},
  title     = {Hierarchical Transformers for Multi-Document Summarization},
  booktitle = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics},
  publisher = {Association for Computational Linguistics},
  address   = {Florence, Italy},
  month     = jul,
  year      = {2019},
  pages     = {5070--5081},
  url       = {https://www.aclweb.org/anthology/P19-1500},
}
@article{pourdamghani2019translating,
  author        = {Pourdamghani, Nima and Aldarrab, Nada and Ghazvininejad, Marjan
                   and Knight, Kevin and May, Jonathan},
  title         = {Translating Translationese: {A} Two-Step Approach to Unsupervised Machine Translation},
  journal       = {CoRR},
  volume        = {abs/1906.05683},
  year          = {2019},
  archivePrefix = {arXiv},
  eprint        = {1906.05683},
}
@inproceedings{hirasawa2019debiasing,
  author    = {Hirasawa, Tosho and Komachi, Mamoru},
  title     = {Debiasing Word Embeddings Improves Multimodal Machine Translation},
  booktitle = {Proceedings of Machine Translation Summit XVII Volume 1: Research Track},
  publisher = {European Association for Machine Translation},
  address   = {Dublin, Ireland},
  month     = "19{--}23 " # aug,
  year      = {2019},
  pages     = {32--42},
  url       = {https://www.aclweb.org/anthology/W19-6604},
}
@article{mogadala2019trends,
  author        = {Mogadala, Aditya and Kalimuthu, Marimuthu and Klakow, Dietrich},
  title         = {Trends in Integration of Vision and Language Research: {A} Survey of Tasks, Datasets, and Methods},
  journal       = {CoRR},
  volume        = {abs/1907.09358},
  year          = {2019},
  url           = {http://arxiv.org/abs/1907.09358},
  archivePrefix = {arXiv},
  eprint        = {1907.09358},
}
@article{calixto2019error,
  author  = {Calixto, Iacer and Liu, Qun},
  title   = {An error analysis for image-based multi-modal neural machine translation},
  journal = {Machine Translation},
  volume  = {33},
  number  = {1},
  pages   = {155--177},
  day     = {01},
  month   = jun,
  year    = {2019},
}
@article{zhou2019synchronous,
  author    = {Zhou, Long and Zhang, Jiajun and Zong, Chengqing},
  title     = {Synchronous bidirectional neural machine translation},
  journal   = {Transactions of the Association for Computational Linguistics},
  publisher = {MIT Press},
  volume    = {7},
  pages     = {91--105},
  year      = {2019},
}
@article{hirasawa2019multimodal,
  author  = {Hirasawa, Tosho and Yamagishi, Hayahide and Matsumura, Yukio and Komachi, Mamoru},
  title   = {Multimodal Machine Translation with Embedding Prediction},
  journal = {arXiv preprint arXiv:1904.00639},
  year    = {2019},
}
@article{qian2018multimodal,
  author  = {Qian, Xin and Zhong, Ziyi and Zhou, Jieli},
  title   = {Multimodal Machine Translation with Reinforcement Learning},
  journal = {arXiv preprint arXiv:1805.02356},
  year    = {2018},
}
@inproceedings{nguyen2018improved,
  author    = {Nguyen, Duy-Kien and Okatani, Takayuki},
  title     = {Improved fusion of visual and language representations by dense symmetric co-attention for visual question answering},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages     = {6087--6096},
  year      = {2018},
}
@article{zoph2016multi,
  author  = {Zoph, Barret and Knight, Kevin},
  title   = {Multi-source neural translation},
  journal = {arXiv preprint arXiv:1601.00710},
  year    = {2016},
}
% NOTE(review): venue and volume were filled in during review -- confirm against the AAAI proceedings record.
@inproceedings{li2019beyond,
  author    = {Li, Xiangpeng and Song, Jingkuan and Gao, Lianli and Liu, Xianglong
               and Huang, Wenbing and He, Xiangnan and Gan, Chuang},
  title     = {Beyond {RNNs}: Positional Self-Attention with Co-Attention for Video Question Answering},
  booktitle = {Proceedings of the AAAI Conference on Artificial Intelligence},
  volume    = {33},
  year      = {2019},
}
@inproceedings{lu2016hierarchical,
  author    = {Lu, Jiasen and Yang, Jianwei and Batra, Dhruv and Parikh, Devi},
  title     = {Hierarchical question-image co-attention for visual question answering},
  booktitle = {Advances In Neural Information Processing Systems},
  pages     = {289--297},
  year      = {2016},
}
@inproceedings{wang2018object,
  author    = {Wang, Josiah and Madhyastha, Pranava Swaroop and Specia, Lucia},
  title     = {Object Counts! Bringing Explicit Detections Back into Image Captioning},
  booktitle = {Proceedings of the 2018 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers)},
  publisher = {Association for Computational Linguistics},
  address   = {New Orleans, Louisiana},
  month     = jun,
  year      = {2018},
  pages     = {2180--2193},
  doi       = {10.18653/v1/N18-1198},
  url       = {https://www.aclweb.org/anthology/N18-1198},
}
@inproceedings{yu2019deep,
  author    = {Yu, Zhou and Yu, Jun and Cui, Yuhao and Tao, Dacheng and Tian, Qi},
  title     = {Deep Modular Co-Attention Networks for Visual Question Answering},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages     = {6281--6290},
  year      = {2019},
}