reference.bib

@inproceedings{jia2011learning,
  author       = {Jia, Yangqing and Salzmann, Mathieu and Darrell, Trevor},
  title        = {Learning cross-modality similarity for multinomial data},
  booktitle    = {2011 International Conference on Computer Vision},
  organization = {IEEE},
  year         = {2011},
  pages        = {2407--2414},
}

@article{mao2014explain,
  author  = {Mao, Junhua and Xu, Wei and Yang, Yi and Wang, Jiang and Yuille, Alan L},
  title   = {Explain images with multimodal recurrent neural networks},
  journal = {arXiv preprint arXiv:1410.1090},
  year    = {2014},
}

@article{kiros2014unifying,
  author  = {Kiros, Ryan and Salakhutdinov, Ruslan and Zemel, Richard S},
  title   = {Unifying visual-semantic embeddings with multimodal neural language models},
  journal = {arXiv preprint arXiv:1411.2539},
  year    = {2014},
}

% month uses the standard BibTeX macro so the style decides how it is printed.
@inproceedings{ma2015multimodal,
  author    = {Ma, Lin and Lu, Zhengdong and Shang, Lifeng and Li, Hang},
  title     = {Multimodal Convolutional Neural Networks for Matching Image and Sentence},
  booktitle = {The IEEE International Conference on Computer Vision (ICCV)},
  month     = dec,
  year      = {2015},
}

@article{ferraro2015survey,
  author  = {Ferraro, Francis and Mostafazadeh, Nasrin and Vanderwende, Lucy and Devlin, Jacob and Galley, Michel and Mitchell, Margaret and others},
  title   = {A survey of current datasets for vision and language research},
  journal = {arXiv preprint arXiv:1506.06833},
  year    = {2015},
}

% NOTE(review): the original @article had no journal, and its month field held
% a full date (05/07/2015, read here as May 2015). Number 033 plus that date
% look like a CBMM Memo export, so the entry is typed as a technical report.
% Memo series, institution and the arXiv id (1412.6632) should be confirmed.
@techreport{mao2014deep,
  author        = {Mao, Junhua and Xu, Wei and Yang, Yi and Wang, Jiang and Huang, Zhiheng and Yuille, Alan},
  title         = {Deep Captioning with Multimodal Recurrent Neural Networks {(m-RNN)}},
  type          = {CBMM Memo},
  number        = {033},
  institution   = {Center for Brains, Minds and Machines (CBMM)},
  month         = may,
  year          = {2015},
  archivePrefix = {arXiv},
  eprint        = {1412.6632},
}

@inproceedings{qin2019look,
  author    = {Qin, Yu and Du, Jiajun and Zhang, Yonghua and Lu, Hongtao},
  title     = {Look Back and Predict Forward in Image Captioning},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  year      = {2019},
  pages     = {8367--8375},
}

% Surname particle written lowercase ("van de Weijer") to match the spelling
% used for the same author in caglayan2018LIUM-CVC.
@article{caglayan2016multimodality,
  author  = {Caglayan, Ozan and Aransa, Walid and Wang, Yaxing and Masana, Marc and Garc{\'\i}a-Mart{\'\i}nez, Mercedes and Bougares, Fethi and Barrault, Lo{\"\i}c and van de Weijer, Joost},
  title   = {Does multimodality help human and machine for translation and image captioning?},
  journal = {arXiv preprint arXiv:1605.09186},
  year    = {2016},
}

@article{caglayan2016multimodal,
  author  = {Caglayan, Ozan and Barrault, Lo{\"\i}c and Bougares, Fethi},
  title   = {Multimodal attention for neural machine translation},
  journal = {arXiv preprint arXiv:1609.03976},
  year    = {2016},
}

@inproceedings{huang2016attention,
  author    = {Huang, Po-Yao and Liu, Frederick and Shiang, Sz-Rung and Oh, Jean and Dyer, Chris},
  title     = {Attention-based multimodal neural machine translation},
  booktitle = {Proceedings of the First Conference on Machine Translation: Volume 2, Shared Task Papers},
  year      = {2016},
  pages     = {639--645},
}

@incollection{yang2016review,
  author    = {Yang, Zhilin and Yuan, Ye and Wu, Yuexin and Cohen, William W and Salakhutdinov, Ruslan R},
  title     = {Review Networks for Caption Generation},
  booktitle = {Advances in Neural Information Processing Systems 29},
  editor    = {D. D. Lee and M. Sugiyama and U. V. Luxburg and I. Guyon and R. Garnett},
  publisher = {Curran Associates, Inc.},
  year      = {2016},
  pages     = {2361--2369},
  url       = {http://papers.nips.cc/paper/6167-review-networks-for-caption-generation.pdf},
}

@inproceedings{yang2016hierarchical,
  author    = {Yang, Zichao and Yang, Diyi and Dyer, Chris and He, Xiaodong and Smola, Alex and Hovy, Eduard},
  title     = {Hierarchical Attention Networks for Document Classification},
  booktitle = {Proceedings of the 2016 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies},
  publisher = {Association for Computational Linguistics},
  address   = {San Diego, California},
  month     = jun,
  year      = {2016},
  pages     = {1480--1489},
  doi       = {10.18653/v1/N16-1174},
  url       = {https://www.aclweb.org/anthology/N16-1174},
}

% month uses the standard BibTeX macro so the style decides how it is printed.
@inproceedings{you2016image,
  author    = {You, Quanzeng and Jin, Hailin and Wang, Zhaowen and Fang, Chen and Luo, Jiebo},
  title     = {Image Captioning With Semantic Attention},
  booktitle = {The IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  month     = jun,
  year      = {2016},
}

@inproceedings{vaswani2017attention,
  author    = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and Kaiser, {\L}ukasz and Polosukhin, Illia},
  title     = {Attention is all you need},
  booktitle = {Advances in neural information processing systems},
  year      = {2017},
  pages     = {5998--6008},
}

% The last author's middle initials are separated ("O. K.") so that styles
% which abbreviate or re-space given names treat them as two initials.
@article{chen2017teacher,
  author  = {Chen, Yun and Liu, Yang and Cheng, Yong and Li, Victor O. K.},
  title   = {A teacher-student framework for zero-resource neural machine translation},
  journal = {arXiv preprint arXiv:1705.00753},
  year    = {2017},
}

% The double issue is written as a range with an en-dash (1--2).
@article{nakayama2017zeroresource,
  author    = {Nakayama, Hideki and Nishida, Noriki},
  title     = {Zero-resource machine translation by multimodal encoder--decoder network with multimedia pivot},
  journal   = {Machine Translation},
  publisher = {Springer},
  volume    = {31},
  number    = {1--2},
  pages     = {49--64},
  year      = {2017},
}

% Second author's given name carries the accent, matching delbrouck2018UMONS.
@article{delbrouck2017multimodal,
  author  = {Delbrouck, Jean-Benoit and Dupont, St{\'e}phane},
  title   = {Multimodal compact bilinear pooling for multimodal neural machine translation},
  journal = {arXiv preprint arXiv:1703.08084},
  year    = {2017},
}

@article{lala2017unraveling,
  author    = {Lala, Chiraag and Madhyastha, Pranava and Wang, Josiah and Specia, Lucia},
  title     = {Unraveling the contribution of image captioning and neural machine translation for multimodal machine translation},
  journal   = {The Prague Bulletin of Mathematical Linguistics},
  publisher = {De Gruyter Open},
  volume    = {108},
  number    = {1},
  pages     = {197--208},
  year      = {2017},
}

@article{elliott2017findings,
  author  = {Elliott, Desmond and Frank, Stella and Barrault, Lo{\"\i}c and Bougares, Fethi and Specia, Lucia},
  title   = {Findings of the second shared task on multimodal machine translation and multilingual image description},
  journal = {arXiv preprint arXiv:1710.07177},
  year    = {2017},
}

@incollection{xia2017delibertaion,
  author    = {Xia, Yingce and Tian, Fei and Wu, Lijun and Lin, Jianxin and Qin, Tao and Yu, Nenghai and Liu, Tie-Yan},
  title     = {Deliberation Networks: Sequence Generation Beyond One-Pass Decoding},
  booktitle = {Advances in Neural Information Processing Systems 30},
  editor    = {I. Guyon and U. V. Luxburg and S. Bengio and H. Wallach and R. Fergus and S. Vishwanathan and R. Garnett},
  publisher = {Curran Associates, Inc.},
  year      = {2017},
  pages     = {1784--1794},
  url       = {http://papers.nips.cc/paper/6775-deliberation-networks-sequence-generation-beyond-one-pass-decoding.pdf},
}

% Given name of the second author written with its accent, as the surname already is.
@article{elliott2017imagination,
  author  = {Elliott, Desmond and K{\'a}d{\'a}r, {\'A}kos},
  title   = {Imagination improves multimodal translation},
  journal = {arXiv preprint arXiv:1705.04350},
  year    = {2017},
}

@article{calixto2017doubly,
  author  = {Calixto, Iacer and Liu, Qun and Campbell, Nick},
  title   = {Doubly-attentive decoder for multi-modal neural machine translation},
  journal = {arXiv preprint arXiv:1702.01287},
  year    = {2017},
}

@inproceedings{libovicky2017attention,
  author    = {Libovick{\'y}, Jind{\v{r}}ich and Helcl, Jind{\v{r}}ich},
  title     = {Attention Strategies for Multi-Source Sequence-to-Sequence Learning},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)},
  publisher = {Association for Computational Linguistics},
  address   = {Vancouver, Canada},
  month     = jul,
  year      = {2017},
  pages     = {196--202},
  doi       = {10.18653/v1/P17-2031},
  url       = {https://www.aclweb.org/anthology/P17-2031},
}

% Title carries no trailing period; bibliography styles add their own punctuation.
@inproceedings{calixto2017incorporating,
  author    = {Calixto, Iacer and Liu, Qun},
  title     = {Incorporating Global Visual Features into Attention-based Neural Machine Translation},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  publisher = {Association for Computational Linguistics},
  address   = {Copenhagen, Denmark},
  month     = sep,
  year      = {2017},
  pages     = {992--1003},
  doi       = {10.18653/v1/D17-1105},
  url       = {https://www.aclweb.org/anthology/D17-1105},
}

% NOTE(review): the original entry had no booktitle (required for
% inproceedings). Venue fields follow the sibling WMT18 entries in this file;
% the anthology id (W18-6402) and page range should be confirmed.
@inproceedings{barrault2018findings,
  author    = {Barrault, Lo{\"\i}c and Bougares, Fethi and Specia, Lucia and Lala, Chiraag and Elliott, Desmond and Frank, Stella},
  title     = {Findings of the third shared task on multimodal machine translation},
  booktitle = {Proceedings of the Third Conference on Machine Translation: Shared Task Papers},
  publisher = {Association for Computational Linguistics},
  address   = {Belgium, Brussels},
  month     = oct,
  year      = {2018},
  pages     = {304--323},
  doi       = {10.18653/v1/W18-6402},
  url       = {https://www.aclweb.org/anthology/W18-6402},
}

@inproceedings{caglayan2018LIUM-CVC,
  author    = {Caglayan, Ozan and Bardet, Adrien and Bougares, Fethi and Barrault, Lo{\"\i}c and Wang, Kai and Masana, Marc and Herranz, Luis and van de Weijer, Joost},
  title     = {{LIUM}-{CVC} Submissions for {WMT}18 Multimodal Translation Task},
  booktitle = {Proceedings of the Third Conference on Machine Translation: Shared Task Papers},
  publisher = {Association for Computational Linguistics},
  address   = {Belgium, Brussels},
  month     = oct,
  year      = {2018},
  pages     = {597--602},
  doi       = {10.18653/v1/W18-6438},
  url       = {https://www.aclweb.org/anthology/W18-6438},
}

@inproceedings{gronroos2018MeMAD,
  author    = {Gr{\"o}nroos, Stig-Arne and Huet, Benoit and Kurimo, Mikko and Laaksonen, Jorma and Merialdo, Bernard and Pham, Phu and Sj{\"o}berg, Mats and Sulubacak, Umut and Tiedemann, J{\"o}rg and Troncy, Raphael and V{\'a}zquez, Ra{\'u}l},
  title     = {The {M}e{MAD} Submission to the {WMT}18 Multimodal Translation Task},
  booktitle = {Proceedings of the Third Conference on Machine Translation: Shared Task Papers},
  publisher = {Association for Computational Linguistics},
  address   = {Belgium, Brussels},
  month     = oct,
  year      = {2018},
  pages     = {603--611},
  doi       = {10.18653/v1/W18-6439},
  url       = {https://www.aclweb.org/anthology/W18-6439},
}

@inproceedings{gwinnup2018AFRL-Ohio,
  author    = {Gwinnup, Jeremy and Sandvick, Joshua and Hutt, Michael and Erdmann, Grant and Duselis, John and Davis, James},
  title     = {The {AFRL}-Ohio State {WMT}18 Multimodal System: Combining Visual with Traditional},
  booktitle = {Proceedings of the Third Conference on Machine Translation: Shared Task Papers},
  publisher = {Association for Computational Linguistics},
  address   = {Belgium, Brussels},
  month     = oct,
  year      = {2018},
  pages     = {612--615},
  doi       = {10.18653/v1/W18-6440},
  url       = {https://www.aclweb.org/anthology/W18-6440},
}

@inproceedings{helcl2018CUNI,
  author    = {Helcl, Jind{\v{r}}ich and Libovick{\'y}, Jind{\v{r}}ich and Vari{\v{s}}, Du{\v{s}}an},
  title     = {{CUNI} System for the {WMT}18 Multimodal Translation Task},
  booktitle = {Proceedings of the Third Conference on Machine Translation: Shared Task Papers},
  publisher = {Association for Computational Linguistics},
  address   = {Belgium, Brussels},
  month     = oct,
  year      = {2018},
  pages     = {616--623},
  doi       = {10.18653/v1/W18-6441},
  url       = {https://www.aclweb.org/anthology/W18-6441},
}

@inproceedings{lala2018sheffield,
  author    = {Lala, Chiraag and Madhyastha, Pranava Swaroop and Scarton, Carolina and Specia, Lucia},
  title     = {{S}heffield Submissions for {WMT}18 Multimodal Translation Shared Task},
  booktitle = {Proceedings of the Third Conference on Machine Translation: Shared Task Papers},
  publisher = {Association for Computational Linguistics},
  address   = {Belgium, Brussels},
  month     = oct,
  year      = {2018},
  pages     = {624--631},
  doi       = {10.18653/v1/W18-6442},
  url       = {https://www.aclweb.org/anthology/W18-6442},
}

@inproceedings{zheng2018ensemble,
  author    = {Zheng, Renjie and Yang, Yilin and Ma, Mingbo and Huang, Liang},
  title     = {Ensemble Sequence Level Training for Multimodal {MT}: {OSU}-{B}aidu {WMT}18 Multimodal Machine Translation System Report},
  booktitle = {Proceedings of the Third Conference on Machine Translation: Shared Task Papers},
  publisher = {Association for Computational Linguistics},
  address   = {Belgium, Brussels},
  month     = oct,
  year      = {2018},
  pages     = {632--636},
  doi       = {10.18653/v1/W18-6443},
  url       = {https://www.aclweb.org/anthology/W18-6443},
}

% Acronyms are restored and brace-protected so sentence-casing styles keep them.
@article{delbrouck2018UMONS,
  author  = {Delbrouck, Jean-Benoit and Dupont, St{\'e}phane},
  title   = {{UMONS} Submission for {WMT18} Multimodal Translation Task},
  journal = {arXiv preprint arXiv:1810.06233},
  year    = {2018},
}

@inproceedings{libovicky2018input,
  author    = {Libovick{\'y}, Jind{\v{r}}ich and Helcl, Jind{\v{r}}ich and Mare{\v{c}}ek, David},
  title     = {Input Combination Strategies for Multi-Source Transformer Decoder},
  booktitle = {Proceedings of the Third Conference on Machine Translation: Research Papers},
  publisher = {Association for Computational Linguistics},
  address   = {Belgium, Brussels},
  month     = oct,
  year      = {2018},
  pages     = {253--260},
  doi       = {10.18653/v1/W18-6326},
  url       = {https://www.aclweb.org/anthology/W18-6326},
}

@inproceedings{shin2018multi,
  author    = {Shin, Jaehun and Lee, Jong-Hyeok},
  title     = {Multi-encoder Transformer Network for Automatic Post-Editing},
  booktitle = {Proceedings of the Third Conference on Machine Translation: Shared Task Papers},
  publisher = {Association for Computational Linguistics},
  address   = {Belgium, Brussels},
  month     = oct,
  year      = {2018},
  pages     = {840--845},
  doi       = {10.18653/v1/W18-6470},
  url       = {https://www.aclweb.org/anthology/W18-6470},
}

@article{zhou2018visual,
  author  = {Zhou, Mingyang and Cheng, Runxiang and Lee, Yong Jae and Yu, Zhou},
  title   = {A visual attention grounding neural model for multimodal machine translation},
  journal = {arXiv preprint arXiv:1808.08266},
  year    = {2018},
}

@article{miculicich2018document,
  author  = {Miculicich, Lesly and Ram, Dhananjay and Pappas, Nikolaos and Henderson, James},
  title   = {Document-level neural machine translation with hierarchical attention networks},
  journal = {arXiv preprint arXiv:1809.01576},
  year    = {2018},
}

% The model name is an acronym; braces keep it uppercase under sentence-casing styles.
@article{devlin2018bert,
  author  = {Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
  title   = {{BERT}: Pre-training of deep bidirectional transformers for language understanding},
  journal = {arXiv preprint arXiv:1810.04805},
  year    = {2018},
}

@article{yang2018improving,
  author  = {Yang, Zhen and Chen, Wei and Wang, Feng and Xu, Bo},
  title   = {Improving neural machine translation with conditional sequence generative adversarial nets},
  journal = {arXiv preprint arXiv:1703.04887},
  year    = {2018},
}

@article{wu2018adversarial,
  author  = {Wu, Lijun and Xia, Yingce and Zhao, Li and Tian, Fei and Qin, Tao and Lai, Jianhuang and Liu, Tie-Yan},
  title   = {Adversarial neural machine translation},
  journal = {arXiv preprint arXiv:1704.06933},
  year    = {2018},
}

% month uses the standard BibTeX macro so the style decides how it is printed.
@inproceedings{anderson2018bottom,
  author    = {Anderson, Peter and He, Xiaodong and Buehler, Chris and Teney, Damien and Johnson, Mark and Gould, Stephen and Zhang, Lei},
  title     = {Bottom-Up and Top-Down Attention for Image Captioning and Visual Question Answering},
  booktitle = {The IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  month     = jun,
  year      = {2018},
}

@article{caglayan2019probing,
  author  = {Caglayan, Ozan and Madhyastha, Pranava and Specia, Lucia and Barrault, Lo{\"\i}c},
  title   = {Probing the Need for Visual Context in Multimodal Machine Translation},
  journal = {arXiv preprint arXiv:1903.08678},
  year    = {2019},
}

% The hyphenated initials are written with periods ("C.-C.") so styles that
% abbreviate given names handle them as initials.
@inproceedings{su2019unsupervised,
  author    = {Su, Yuanhang and Fan, Kai and Bach, Nguyen and Kuo, C.-C. Jay and Huang, Fei},
  title     = {Unsupervised multi-modal neural machine translation},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  year      = {2019},
  pages     = {10482--10491},
}

@article{ive2019distilling,
  author  = {Ive, Julia and Madhyastha, Pranava and Specia, Lucia},
  title   = {Distilling Translations with Visual Awareness},
  journal = {arXiv preprint arXiv:1906.07701},
  year    = {2019},
}

@inproceedings{calixto2019latent,
  author    = {Calixto, Iacer and Rios, Miguel and Aziz, Wilker},
  title     = {Latent Variable Model for Multi-modal Translation},
  booktitle = {Proceedings of the 57th Conference of the Association for Computational Linguistics},
  year      = {2019},
  pages     = {6392--6405},
}

@article{chen2019from,
  author  = {Chen, Shizhe and Jin, Qin and Fu, Jianlong},
  title   = {From Words to Sentences: A Progressive Learning Approach for Zero-resource Machine Translation with Visual Pivots},
  journal = {arXiv preprint arXiv:1906.00872},
  year    = {2019},
}

% The model name is restored to "Transformer-XL" and brace-protected against sentence-casing.
@article{dai2019transformerxl,
  author  = {Dai, Zihang and Yang, Zhilin and Yang, Yiming and Cohen, William W and Carbonell, Jaime and Le, Quoc V and Salakhutdinov, Ruslan},
  title   = {{Transformer-XL}: Attentive language models beyond a fixed-length context},
  journal = {arXiv preprint arXiv:1901.02860},
  year    = {2019},
}

% The model name is brace-protected so sentence-casing styles do not print "Xlnet".
@article{yang2019xlnet,
  author  = {Yang, Zhilin and Dai, Zihang and Yang, Yiming and Carbonell, Jaime and Salakhutdinov, Ruslan and Le, Quoc V},
  title   = {{XLNet}: Generalized Autoregressive Pretraining for Language Understanding},
  journal = {arXiv preprint arXiv:1906.08237},
  year    = {2019},
}

@inproceedings{liu2019hierarchical,
  author    = {Liu, Yang and Lapata, Mirella},
  title     = {Hierarchical Transformers for Multi-Document Summarization},
  booktitle = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics},
  publisher = {Association for Computational Linguistics},
  address   = {Florence, Italy},
  month     = jul,
  year      = {2019},
  pages     = {5070--5081},
  url       = {https://www.aclweb.org/anthology/P19-1500},
}

@article{pourdamghani2019translating,
  author        = {Nima Pourdamghani and Nada Aldarrab and Marjan Ghazvininejad and Kevin Knight and Jonathan May},
  title         = {Translating Translationese: {A} Two-Step Approach to Unsupervised Machine Translation},
  journal       = {CoRR},
  volume        = {abs/1906.05683},
  year          = {2019},
  eprint        = {1906.05683},
  archivePrefix = {arXiv},
}

@inproceedings{hirasawa2019debiasing,
  author    = {Hirasawa, Tosho and Komachi, Mamoru},
  title     = {Debiasing Word Embeddings Improves Multimodal Machine Translation},
  booktitle = {Proceedings of Machine Translation Summit XVII Volume 1: Research Track},
  publisher = {European Association for Machine Translation},
  address   = {Dublin, Ireland},
  month     = "19{--}23 " # aug,
  year      = {2019},
  pages     = {32--42},
  url       = {https://www.aclweb.org/anthology/W19-6604},
}

% archivePrefix needs an eprint id to be useful; the id is the one already
% present in this entry's volume and url fields.
@article{mogadala2019trends,
  author        = {Aditya Mogadala and Marimuthu Kalimuthu and Dietrich Klakow},
  title         = {Trends in Integration of Vision and Language Research: {A} Survey of Tasks, Datasets, and Methods},
  journal       = {CoRR},
  volume        = {abs/1907.09358},
  year          = {2019},
  url           = {http://arxiv.org/abs/1907.09358},
  archivePrefix = {arXiv},
  eprint        = {1907.09358},
}

% month uses the standard BibTeX macro instead of the literal string "Jun".
@article{calixto2019error,
  author  = {Calixto, Iacer and Liu, Qun},
  title   = {An error analysis for image-based multi-modal neural machine translation},
  journal = {Machine Translation},
  volume  = {33},
  number  = {1},
  pages   = {155--177},
  month   = jun,
  day     = {01},
  year    = {2019},
}

@article{zhou2019synchronous,
  author    = {Zhou, Long and Zhang, Jiajun and Zong, Chengqing},
  title     = {Synchronous bidirectional neural machine translation},
  journal   = {Transactions of the Association for Computational Linguistics},
  publisher = {MIT Press},
  volume    = {7},
  pages     = {91--105},
  year      = {2019},
}

@article{hirasawa2019multimodal,
  author  = {Hirasawa, Tosho and Yamagishi, Hayahide and Matsumura, Yukio and Komachi, Mamoru},
  title   = {Multimodal Machine Translation with Embedding Prediction},
  journal = {arXiv preprint arXiv:1904.00639},
  year    = {2019},
}

@article{qian2018multimodal,
  author  = {Qian, Xin and Zhong, Ziyi and Zhou, Jieli},
  title   = {Multimodal Machine Translation with Reinforcement Learning},
  journal = {arXiv preprint arXiv:1805.02356},
  year    = {2018},
}

@inproceedings{nguyen2018improved,
  author    = {Nguyen, Duy-Kien and Okatani, Takayuki},
  title     = {Improved fusion of visual and language representations by dense symmetric co-attention for visual question answering},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  year      = {2018},
  pages     = {6087--6096},
}

@article{zoph2016multi,
  author  = {Zoph, Barret and Knight, Kevin},
  title   = {Multi-source neural translation},
  journal = {arXiv preprint arXiv:1601.00710},
  year    = {2016},
}

% NOTE(review): the original entry had no booktitle (required for
% inproceedings). Venue, volume and pages are believed to be the AAAI-19
% proceedings record — confirm against the publisher page.
@inproceedings{li2019beyond,
  author    = {Li, Xiangpeng and Song, Jingkuan and Gao, Lianli and Liu, Xianglong and Huang, Wenbing and He, Xiangnan and Gan, Chuang},
  title     = {Beyond {RNNs}: Positional Self-Attention with Co-Attention for Video Question Answering},
  booktitle = {Proceedings of the AAAI Conference on Artificial Intelligence},
  volume    = {33},
  pages     = {8658--8665},
  year      = {2019},
}

% Proceedings title uses lowercase "in", in line with the other NeurIPS entries in this file.
@inproceedings{lu2016hierarchical,
  author    = {Lu, Jiasen and Yang, Jianwei and Batra, Dhruv and Parikh, Devi},
  title     = {Hierarchical question-image co-attention for visual question answering},
  booktitle = {Advances in Neural Information Processing Systems},
  year      = {2016},
  pages     = {289--297},
}

@inproceedings{wang2018object,
  author    = {Wang, Josiah and Madhyastha, Pranava Swaroop and Specia, Lucia},
  title     = {Object Counts! Bringing Explicit Detections Back into Image Captioning},
  booktitle = {Proceedings of the 2018 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers)},
  publisher = {Association for Computational Linguistics},
  address   = {New Orleans, Louisiana},
  month     = jun,
  year      = {2018},
  pages     = {2180--2193},
  doi       = {10.18653/v1/N18-1198},
  url       = {https://www.aclweb.org/anthology/N18-1198},
}

@inproceedings{yu2019deep,
  author    = {Yu, Zhou and Yu, Jun and Cui, Yuhao and Tao, Dacheng and Tian, Qi},
  title     = {Deep Modular Co-Attention Networks for Visual Question Answering},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  year      = {2019},
  pages     = {6281--6290},
}