{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,13]],"date-time":"2026-05-13T16:46:27Z","timestamp":1778690787933,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":39,"publisher":"ACM","license":[{"start":{"date-parts":[[2019,10,15]],"date-time":"2019-10-15T00:00:00Z","timestamp":1571097600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"The research work is supported by the National Key R&D Program with No.2016QY03D0503, 2016YFB081304, Strategic Priority Research Program of Chinese Academy of Sciences, Grant No.XDC02040400."}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2019,10,15]]},"DOI":"10.1145\/3343031.3350869","type":"proceedings-article","created":{"date-parts":[[2019,10,21]],"date-time":"2019-10-21T16:32:26Z","timestamp":1571675546000},"page":"3-11","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":172,"title":["Focus Your Attention"],"prefix":"10.1145","author":[{"given":"Chunxiao","family":"Liu","sequence":"first","affiliation":[{"name":"Institute of Information Engineering, Chinese Academy of Sciences &amp; University of Chinese Academy of Sciences, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhendong","family":"Mao","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"An-An","family":"Liu","sequence":"additional","affiliation":[{"name":"Tianjin University, Tianjin, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tianzhu","family":"Zhang","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Bin","family":"Wang","sequence":"additional","affiliation":[{"name":"Xiaomi AI Lab, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yongdong","family":"Zhang","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2019,10,15]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00636"},{"key":"e_1_3_2_1_2_1","volume-title":"Neural machine translation by jointly learning to align and translate. arXiv preprint arXiv:1409.0473","author":"Bahdanau Dzmitry","year":"2014","unstructured":"Dzmitry Bahdanau , Kyunghyun Cho , and Yoshua Bengio . 2014. Neural machine translation by jointly learning to align and translate. arXiv preprint arXiv:1409.0473 ( 2014 ). Dzmitry Bahdanau, Kyunghyun Cho, and Yoshua Bengio. 2014. Neural machine translation by jointly learning to align and translate. arXiv preprint arXiv:1409.0473 (2014)."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2798607"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"crossref","unstructured":"A. Eisenschtat and L. Wolf. 2017. Linking Image and Text with 2-Way Nets. In CVPR. 1855--1865.  A. Eisenschtat and L. Wolf. 2017. Linking Image and Text with 2-Way Nets. In CVPR. 1855--1865.","DOI":"10.1109\/CVPR.2017.201"},{"key":"e_1_3_2_1_5_1","unstructured":"Fartash Faghri David J. Fleet Jamie Kiros and Sanja Fidler. 2018. VSE  Fartash Faghri David J. Fleet Jamie Kiros and Sanja Fidler. 2018. VSE"},{"key":"e_1_3_2_1_6_1","unstructured":": Improving Visual-Semantic Embeddings with Hard Negatives. In BMVC .  : Improving Visual-Semantic Embeddings with Hard Negatives. In BMVC ."},{"key":"e_1_3_2_1_7_1","volume-title":"Stacked Latent Attention for Multimodal Reasoning. 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Fan Haoqi","year":"2018","unstructured":"Haoqi Fan and Jiatong Zhou . 2018 . Stacked Latent Attention for Multimodal Reasoning. 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2018), 1072--1080. Haoqi Fan and Jiatong Zhou. 2018. Stacked Latent Attention for Multimodal Reasoning. 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2018), 1072--1080."},{"key":"e_1_3_2_1_8_1","volume-title":"Imagine and Match: Improving Textual-Visual Cross-Modal Retrieval with Generative Models. CoRR","author":"Gu Jiuxiang","year":"2017","unstructured":"Jiuxiang Gu , Jianfei Cai , Shafiq R. Joty , Li Niu , and Gang Wang . 2017. Look , Imagine and Match: Improving Textual-Visual Cross-Modal Retrieval with Generative Models. CoRR , Vol. abs\/ 1711 .06420 ( 2017 ). Jiuxiang Gu, Jianfei Cai, Shafiq R. Joty, Li Niu, and Gang Wang. 2017. Look, Imagine and Match: Improving Textual-Visual Cross-Modal Retrieval with Generative Models. CoRR, Vol. abs\/1711.06420 (2017)."},{"key":"e_1_3_2_1_9_1","volume-title":"Deep Residual Learning for Image Recognition. 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","author":"He Kaiming","year":"2016","unstructured":"Kaiming He , Xiangyu Zhang , Shaoqing Ren , and Jian Sun . 2016 . Deep Residual Learning for Image Recognition. 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2016), 770--778. Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. 2016. Deep Residual Learning for Image Recognition. 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2016), 770--778."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2018.2882225"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.767"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00645"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"e_1_3_2_1_14_1","volume-title":"Deep Fragment Embeddings for Bidirectional Image Sentence Mapping","author":"Karpathy Andrej","year":"2014","unstructured":"Andrej Karpathy , Armand Joulin , and Li Fei-Fei . 2014. Deep Fragment Embeddings for Bidirectional Image Sentence Mapping ., Vol. 3 ( 2014 ), 1889--1897. Andrej Karpathy, Armand Joulin, and Li Fei-Fei. 2014. Deep Fragment Embeddings for Bidirectional Image Sentence Mapping., Vol. 3 (2014), 1889--1897."},{"key":"e_1_3_2_1_15_1","volume-title":"Unifying Visual-Semantic Embeddings with Multimodal Neural Language Models. 31st International Conference on Machine Learning, ICML 2014","volume":"3","author":"Kiros Ryan","year":"2014","unstructured":"Ryan Kiros , Ruslan Salakhutdinov , and Richard S. Zemel . 2014 . Unifying Visual-Semantic Embeddings with Multimodal Neural Language Models. 31st International Conference on Machine Learning, ICML 2014 , Vol. 3 (11 2014 ). Ryan Kiros, Ruslan Salakhutdinov, and Richard S. Zemel. 2014. Unifying Visual-Semantic Embeddings with Multimodal Neural Language Models. 31st International Conference on Machine Learning, ICML 2014, Vol. 3 (11 2014)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01225-0_13"},{"key":"e_1_3_2_1_17_1","volume-title":"Identity-Aware Textual-Visual Matching with Latent Co-attention. 2017 IEEE International Conference on Computer Vision (ICCV) (2017)","author":"Li Shuang","year":"2017","unstructured":"Shuang Li , Tong Xiao , Hongsheng Li , Wei Yang , and Xiaogang Wang . 2017 . Identity-Aware Textual-Visual Matching with Latent Co-attention. 2017 IEEE International Conference on Computer Vision (ICCV) (2017) , 1908--1917. Shuang Li, Tong Xiao, Hongsheng Li, Wei Yang, and Xiaogang Wang. 2017. Identity-Aware Textual-Visual Matching with Latent Co-attention. 2017 IEEE International Conference on Computer Vision (ICCV) (2017), 1908--1917."},{"key":"e_1_3_2_1_18_1","unstructured":"Tsung-Yi Lin Michael Maire Serge J. Belongie James Hays Pietro Perona Deva Ramanan Piotr Doll&#225;r and C. Lawrence Zitnick. 2014. Microsoft COCO: Common Objects in Context. In ECCV .  Tsung-Yi Lin Michael Maire Serge J. Belongie James Hays Pietro Perona Deva Ramanan Piotr Doll&#225;r and C. Lawrence Zitnick. 2014. Microsoft COCO: Common Objects in Context. In ECCV ."},{"key":"e_1_3_2_1_19_1","volume-title":"Lew","author":"Liu Yu","year":"2017","unstructured":"Yu Liu , Yanming Guo , Erwin M. Bakker , and Michael S . Lew . 2017 . Learning a Recurrent Residual Fusion Network for Multimodal Matching. In ICCV. 4127--4136. Yu Liu, Yanming Guo, Erwin M. Bakker, and Michael S. Lew. 2017. Learning a Recurrent Residual Fusion Network for Multimodal Matching. In ICCV. 4127--4136."},{"key":"e_1_3_2_1_20_1","volume-title":"Hierarchical Question-Image Co-Attention for Visual Question Answering. CoRR","author":"Lu Jiasen","year":"2016","unstructured":"Jiasen Lu , Jianwei Yang , Dhruv Batra , and Devi Parikh . 2016. Hierarchical Question-Image Co-Attention for Visual Question Answering. CoRR , Vol. abs\/ 1606 .00061 ( 2016 ). Jiasen Lu, Jianwei Yang, Dhruv Batra, and Devi Parikh. 2016. Hierarchical Question-Image Co-Attention for Visual Question Answering. CoRR, Vol. abs\/1606.00061 (2016)."},{"key":"e_1_3_2_1_21_1","volume-title":"Manning","author":"Luong Minh Thang","year":"2015","unstructured":"Minh Thang Luong , Hieu Pham , and Christopher D . Manning . 2015 . Effective Approaches to Attention-based Neural Machine Translation. Computer Science ( 2015). Minh Thang Luong, Hieu Pham, and Christopher D. Manning. 2015. Effective Approaches to Attention-based Neural Machine Translation. Computer Science (2015)."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"crossref","unstructured":"Lin Ma Zhengdong Lu Lifeng Shang and Hang Li. 2015. Multimodal Convolutional Neural Networks for Matching Image and Sentence. In ICCV. 2623--2631.  Lin Ma Zhengdong Lu Lifeng Shang and Hang Li. 2015. Multimodal Convolutional Neural Networks for Matching Image and Sentence. In ICCV. 2623--2631.","DOI":"10.1109\/ICCV.2015.301"},{"key":"e_1_3_2_1_23_1","volume-title":"Yuille","author":"Mao Junhua","year":"2014","unstructured":"Junhua Mao , Wei Xu , Yi Yang , Jiang Wang , and Alan L . Yuille . 2014 . Deep Captioning with Multimodal Recurrent Neural Networks. CoRR , Vol. abs\/ 1412 .6632 (2014). Junhua Mao, Wei Xu, Yi Yang, Jiang Wang, and Alan L. Yuille. 2014. Deep Captioning with Multimodal Recurrent Neural Networks. CoRR, Vol. abs\/1412.6632 (2014)."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.232"},{"key":"e_1_3_2_1_25_1","unstructured":"Zhenxing Niu Mo Zhou Le Wang Xinbo Gao and Gang Hua. 2017. Hierarchical Multimodal LSTM for Dense Visual-Semantic Embedding. In ICCV. 1899--1907.  Zhenxing Niu Mo Zhou Le Wang Xinbo Gao and Gang Hua. 2017. Hierarchical Multimodal LSTM for Dense Visual-Semantic Embedding. In ICCV. 1899--1907."},{"key":"e_1_3_2_1_26_1","volume-title":"Flickr30k Entities: Collecting Region-to-Phrase Correspondences for Richer Image-to-Sentence Models. ICCV","author":"Plummer Bryan A.","year":"2015","unstructured":"Bryan A. Plummer , Liwei Wang , Chris M. Cervantes , Juan C. Caicedo , Julia Hockenmaier , and Svetlana Lazebnik . 2015. Flickr30k Entities: Collecting Region-to-Phrase Correspondences for Richer Image-to-Sentence Models. ICCV ( 2015 ), 2641--2649. Bryan A. Plummer, Liwei Wang, Chris M. Cervantes, Juan C. Caicedo, Julia Hockenmaier, and Svetlana Lazebnik. 2015. Flickr30k Entities: Collecting Region-to-Phrase Correspondences for Richer Image-to-Sentence Models. ICCV (2015), 2641--2649."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2577031"},{"key":"e_1_3_2_1_28_1","volume-title":"Bidirectional attention flow for machine comprehension. arXiv preprint arXiv:1611.01603","author":"Seo Minjoon","year":"2016","unstructured":"Minjoon Seo , Aniruddha Kembhavi , Ali Farhadi , and Hannaneh Hajishirzi . 2016. Bidirectional attention flow for machine comprehension. arXiv preprint arXiv:1611.01603 ( 2016 ). Minjoon Seo, Aniruddha Kembhavi, Ali Farhadi, and Hannaneh Hajishirzi. 2016. Bidirectional attention flow for machine comprehension. arXiv preprint arXiv:1611.01603 (2016)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2797921"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00813"},{"key":"e_1_3_2_1_31_1","volume-title":"Bidirectional Retrieval Made Simple. 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Wehrmann Jonatas","year":"2018","unstructured":"Jonatas Wehrmann and Rodrigo C. Barros . 2018 . Bidirectional Retrieval Made Simple. 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition ( 2018 ), 7718--7726. Jonatas Wehrmann and Rodrigo C. Barros. 2018. Bidirectional Retrieval Made Simple. 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2018), 7718--7726."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.424"},{"key":"e_1_3_2_1_33_1","volume-title":"Learning Semantic Structure-preserved Embeddings for Cross-modal Retrieval. In 2018 ACM Multimedia Conference on Multimedia Conference. ACM, 825--833","author":"Wu Yiling","year":"2018","unstructured":"Yiling Wu , Shuhui Wang , and Qingming Huang . 2018 . Learning Semantic Structure-preserved Embeddings for Cross-modal Retrieval. In 2018 ACM Multimedia Conference on Multimedia Conference. ACM, 825--833 . Yiling Wu, Shuhui Wang, and Qingming Huang. 2018. Learning Semantic Structure-preserved Embeddings for Cross-modal Retrieval. In 2018 ACM Multimedia Conference on Multimedia Conference. ACM, 825--833."},{"key":"e_1_3_2_1_34_1","volume-title":"International conference on machine learning. 2048--2057","author":"Xu Kelvin","year":"2015","unstructured":"Kelvin Xu , Jimmy Ba , Ryan Kiros , Kyunghyun Cho , Aaron Courville , Ruslan Salakhudinov , Rich Zemel , and Yoshua Bengio . 2015 . Show, attend and tell: Neural image caption generation with visual attention . In International conference on machine learning. 2048--2057 . Kelvin Xu, Jimmy Ba, Ryan Kiros, Kyunghyun Cho, Aaron Courville, Ruslan Salakhudinov, Rich Zemel, and Yoshua Bengio. 2015. Show, attend and tell: Neural image caption generation with visual attention. In International conference on machine learning. 2048--2057."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00143"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.10"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.446"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01246-5_42"},{"key":"e_1_3_2_1_39_1","volume-title":"Dual-Path Convolutional Image-Text Embedding with Instance Loss. arXiv preprint arXiv:1711.05535","author":"Zheng Zhedong","year":"2017","unstructured":"Zhedong Zheng , Liang Zheng , Michael Garrett , Yi Yang , and Yi-Dong Shen . 2017. Dual-Path Convolutional Image-Text Embedding with Instance Loss. arXiv preprint arXiv:1711.05535 ( 2017 ). Zhedong Zheng, Liang Zheng, Michael Garrett, Yi Yang, and Yi-Dong Shen. 2017. Dual-Path Convolutional Image-Text Embedding with Instance Loss. arXiv preprint arXiv:1711.05535 (2017)."}],"event":{"name":"MM '19: The 27th ACM International Conference on Multimedia","location":"Nice France","acronym":"MM '19","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 27th ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3343031.3350869","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3343031.3350869","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T23:13:25Z","timestamp":1750202005000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3343031.3350869"}},"subtitle":["A Bidirectional Focal Attention Network for Image-Text Matching"],"short-title":[],"issued":{"date-parts":[[2019,10,15]]},"references-count":39,"alternative-id":["10.1145\/3343031.3350869","10.1145\/3343031"],"URL":"https:\/\/doi.org\/10.1145\/3343031.3350869","relation":{},"subject":[],"published":{"date-parts":[[2019,10,15]]},"assertion":[{"value":"2019-10-15","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}