{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,24]],"date-time":"2026-04-24T21:36:12Z","timestamp":1777066572185,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":54,"publisher":"ACM","license":[{"start":{"date-parts":[[2019,10,15]],"date-time":"2019-10-15T00:00:00Z","timestamp":1571097600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100000185","name":"Defense Advanced Research Projects Agency","doi-asserted-by":"publisher","award":["FA8750-18-2-0018,FA8750-19-2-0501"],"award-info":[{"award-number":["FA8750-18-2-0018,FA8750-19-2-0501"]}],"id":[{"id":"10.13039\/100000185","id-type":"DOI","asserted-by":"publisher"}]},{"name":"ARC DECRA","award":["DE190100626"],"award-info":[{"award-number":["DE190100626"]}]},{"DOI":"10.13039\/100011039","name":"Intelligence Advanced Research Projects Activity","doi-asserted-by":"publisher","award":["D17PC00340"],"award-info":[{"award-number":["D17PC00340"]}],"id":[{"id":"10.13039\/100011039","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2019,10,15]]},"DOI":"10.1145\/3343031.3350894","type":"proceedings-article","created":{"date-parts":[[2019,10,21]],"date-time":"2019-10-21T16:32:26Z","timestamp":1571675546000},"page":"1758-1767","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":13,"title":["Annotation Efficient Cross-Modal Retrieval with Adversarial Attentive Alignment"],"prefix":"10.1145","author":[{"given":"Po-Yao","family":"Huang","sequence":"first","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Guoliang","family":"Kang","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wenhe","family":"Liu","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiaojun","family":"Chang","sequence":"additional","affiliation":[{"name":"Monash University, Melbourne, Australia"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Alexander G.","family":"Hauptmann","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2019,10,15]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Waleed Abdulla. 2017. Mask R-CNN for object detection and instance segmentation on Keras and TensorFlow. https:\/\/github.com\/matterport\/Mask_RCNN .  Waleed Abdulla. 2017. Mask R-CNN for object detection and instance segmentation on Keras and TensorFlow. https:\/\/github.com\/matterport\/Mask_RCNN ."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"crossref","unstructured":"Peter Anderson Xiaodong He Chris Buehler Damien Teney Mark Johnson Stephen Gould and Lei Zhang. 2018. Bottom-Up and Top-Down Attention for Image Captioning and Visual Question Answering. In CVPR .  Peter Anderson Xiaodong He Chris Buehler Damien Teney Mark Johnson Stephen Gould and Lei Zhang. 2018. Bottom-Up and Top-Down Attention for Image Captioning and Visual Question Answering. In CVPR .","DOI":"10.1109\/CVPR.2018.00636"},{"key":"e_1_3_2_1_3_1","volume-title":"VQA: Visual Question Answering. In International Conference on Computer Vision (ICCV) .","author":"Antol Stanislaw","year":"2015"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.323"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2011.5995516"},{"key":"e_1_3_2_1_6_1","volume-title":"Jamie Ryan Kiros, and Sanja Fidler","author":"Faghri Fartash","year":"2018"},{"key":"e_1_3_2_1_7_1","unstructured":": Improving Visual-Semantic Embeddings with Hard Negatives. (2018). https:\/\/github.com\/fartashf\/vsepp  : Improving Visual-Semantic Embeddings with Hard Negatives. (2018). https:\/\/github.com\/fartashf\/vsepp"},{"key":"e_1_3_2_1_8_1","volume-title":"WordNet: An Electronic Lexical Database","author":"Fellbaum Christiane"},{"key":"e_1_3_2_1_9_1","volume-title":"Advances in Neural Information Processing Systems 26: 27th Annual Conference on Neural Information Processing Systems","author":"Frome Andrea","year":"2013"},{"key":"e_1_3_2_1_10_1","volume-title":"International Conference on Machine Learning . 1180--1189","author":"Ganin Yaroslav","year":"2015"},{"key":"e_1_3_2_1_11_1","volume-title":"Proceedings of the thirteenth international conference on artificial intelligence and statistics. 249--256","author":"Glorot Xavier","year":"2010"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10593-2_35"},{"key":"e_1_3_2_1_13_1","volume-title":"Making the V in VQA Matter: Elevating the Role of Image Understanding in Visual Question Answering. In Conference on Computer Vision and Pattern Recognition (CVPR) .","author":"Goyal Yash","year":"2017"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_15_1","volume-title":"Long short-term memory. Neural computation","author":"Hochreiter Sepp","year":"1997"},{"key":"e_1_3_2_1_16_1","volume-title":"Proceedings of the 2018 ACM on International Conference on Multimedia Retrieval (ICMR '18)","author":"Huang Po-Yao","year":"2060"},{"key":"e_1_3_2_1_17_1","volume-title":"Proceedings of the 2019 on International Conference on Multimedia Retrieval (ICMR '19)","author":"Huang Po-Yao"},{"key":"e_1_3_2_1_18_1","volume-title":"Instance-Aware Image and Sentence Matching with Selective Multimodal LSTM. In 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR). IEEE, 7254--7262","author":"Huang Yan","year":"2017"},{"key":"e_1_3_2_1_19_1","volume-title":"Learning semantic concepts and order for image and sentence matching. arXiv preprint arXiv:1712.02036","author":"Huang Yan","year":"2017"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01249-6_8"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/2733373.2806240"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"e_1_3_2_1_23_1","unstructured":"Andrej Karpathy Armand Joulin and Li F Fei-Fei. 2014a. Deep fragment embeddings for bidirectional image sentence mapping. In Advances in neural information processing systems. 1889--1897.  Andrej Karpathy Armand Joulin and Li F Fei-Fei. 2014a. Deep fragment embeddings for bidirectional image sentence mapping. In Advances in neural information processing systems. 1889--1897."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"crossref","unstructured":"Andrej Karpathy George Toderici Sanketh Shetty Thomas Leung Rahul Sukthankar and Li Fei-Fei. 2014b. Large-scale Video Classification with Convolutional Neural Networks. In CVPR .  Andrej Karpathy George Toderici Sanketh Shetty Thomas Leung Rahul Sukthankar and Li Fei-Fei. 2014b. Large-scale Video Classification with Convolutional Neural Networks. In CVPR .","DOI":"10.1109\/CVPR.2014.223"},{"key":"e_1_3_2_1_25_1","volume-title":"Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980","author":"Kingma Diederik P","year":"2014"},{"key":"e_1_3_2_1_26_1","volume-title":"Unifying visual-semantic embeddings with multimodal neural language models. arXiv preprint arXiv:1411.2539","author":"Kiros Ryan","year":"2014"},{"key":"e_1_3_2_1_27_1","volume-title":"Unifying Visual-Semantic Embeddings with Multimodal Neural Language Models. NIPS Workshop","author":"Kiros Ryan","year":"2014"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299073"},{"key":"e_1_3_2_1_29_1","volume-title":"Visual Genome: Connecting Language and Vision Using Crowdsourced Dense Image Annotations. https:\/\/arxiv.org\/abs\/1602.07332","author":"Krishna Ranjay","year":"2016"},{"key":"e_1_3_2_1_30_1","volume-title":"et almbox","author":"Kuznetsova Alina","year":"2018"},{"key":"e_1_3_2_1_31_1","volume-title":"Stacked Cross Attention for Image-Text Matching. arXiv preprint arXiv:1803.08024","author":"Lee Kuang-Huei","year":"2018"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3123432"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_1_34_1","volume-title":"Proceedings of the 2016 ACM on International Conference on Multimedia Retrieval (ICMR '16)","author":"Mettes Pascal","year":"1996"},{"key":"e_1_3_2_1_35_1","unstructured":"Tomas Mikolov Ilya Sutskever Kai Chen Greg S Corrado and Jeff Dean. 2013. Distributed representations of words and phrases and their compositionality. In Advances in neural information processing systems. 3111--3119.  Tomas Mikolov Ilya Sutskever Kai Chen Greg S Corrado and Jeff Dean. 2013. Distributed representations of words and phrases and their compositionality. In Advances in neural information processing systems. 3111--3119."},{"key":"e_1_3_2_1_36_1","volume-title":"Webly Supervised Joint Embedding for Cross-Modal Image-Text Retrieval. In 2018 ACM Multimedia Conference on Multimedia Conference. ACM","author":"Mithun Niluthpol Chowdhury","year":"2018"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.232"},{"key":"e_1_3_2_1_38_1","volume-title":"Manning","author":"Pennington Jeffrey","year":"2014"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2007.383266"},{"key":"e_1_3_2_1_40_1","unstructured":"Shaoqing Ren Kaiming He Ross Girshick and Jian Sun. 2015. Faster r-cnn: Towards real-time object detection with region proposal networks. In Advances in neural information processing systems. 91--99.  Shaoqing Ren Kaiming He Ross Girshick and Jian Sun. 2015. Faster r-cnn: Towards real-time object detection with region proposal networks. In Advances in neural information processing systems. 91--99."},{"key":"e_1_3_2_1_41_1","volume-title":"Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556","author":"Simonyan Karen","year":"2014"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/2733373.2806226"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.386"},{"key":"e_1_3_2_1_44_1","volume-title":"Order-embeddings of images and language. arXiv preprint arXiv:1511.06361","author":"Vendrov Ivan","year":"2015"},{"key":"e_1_3_2_1_45_1","volume-title":"Order-Embeddings of Images and Language. CoRR","author":"Vendrov Ivan","year":"2015"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"crossref","unstructured":"Bokun Wang Yang Yang Xing Xu Alan Hanjalic and Heng Tao Shen. 2017. Adversarial Cross-Modal Retrieval. In ACM MM.  Bokun Wang Yang Yang Xing Xu Alan Hanjalic and Heng Tao Shen. 2017. Adversarial Cross-Modal Retrieval. In ACM MM.","DOI":"10.1145\/3123266.3123326"},{"key":"e_1_3_2_1_47_1","volume-title":"A comprehensive survey on cross-modal retrieval. arXiv preprint arXiv:1607.06215","author":"Wang Kaiye","year":"2016"},{"key":"e_1_3_2_1_48_1","volume-title":"Learning two-branch neural networks for image-text matching tasks","author":"Wang Liwei","year":"2018"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.541"},{"key":"e_1_3_2_1_50_1","volume-title":"International conference on machine learning. 2048--2057","author":"Xu Kelvin","year":"2015"},{"key":"e_1_3_2_1_51_1","unstructured":"Xing Xu Li He Huimin Lu Lianli Gao and Yanli Ji. [n. d.]. Deep adversarial metric learning for cross-modal retrieval. World Wide Web ( [n. d.]) 1--16.  Xing Xu Li He Huimin Lu Lianli Gao and Yanli Ji. [n. d.]. Deep adversarial metric learning for cross-modal retrieval. World Wide Web ( [n. d.]) 1--16."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00166"},{"key":"e_1_3_2_1_53_1","volume-title":"Dual-Path Convolutional Image-Text Embedding. CoRR","author":"Zheng Zhedong","year":"2017"},{"key":"e_1_3_2_1_54_1","volume-title":"Advances in Neural Information Processing Systems 27","author":"Zhou Bolei"}],"event":{"name":"MM '19: The 27th ACM International Conference on Multimedia","location":"Nice France","acronym":"MM '19","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 27th ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3343031.3350894","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3343031.3350894","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3343031.3350894","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T23:13:17Z","timestamp":1750201997000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3343031.3350894"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019,10,15]]},"references-count":54,"alternative-id":["10.1145\/3343031.3350894","10.1145\/3343031"],"URL":"https:\/\/doi.org\/10.1145\/3343031.3350894","relation":{},"subject":[],"published":{"date-parts":[[2019,10,15]]},"assertion":[{"value":"2019-10-15","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}