{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,5]],"date-time":"2026-06-05T11:40:39Z","timestamp":1780659639161,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":60,"publisher":"ACM","license":[{"start":{"date-parts":[[2018,10,15]],"date-time":"2018-10-15T00:00:00Z","timestamp":1539561600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["IIS-1746031, CNS-1544969"],"award-info":[{"award-number":["IIS-1746031, CNS-1544969"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2018,10,15]]},"DOI":"10.1145\/3240508.3240712","type":"proceedings-article","created":{"date-parts":[[2018,10,18]],"date-time":"2018-10-18T17:52:08Z","timestamp":1539885128000},"page":"1856-1864","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":57,"title":["Webly Supervised Joint Embedding for Cross-Modal Image-Text Retrieval"],"prefix":"10.1145","author":[{"given":"Niluthpol Chowdhury","family":"Mithun","sequence":"first","affiliation":[{"name":"University of California, Riverside, Riverside, CA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Rameswar","family":"Panda","sequence":"additional","affiliation":[{"name":"University of California, Riverside, Riverside, CA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Evangelos E.","family":"Papalexakis","sequence":"additional","affiliation":[{"name":"University of California, Riverside, Riverside, CA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Amit 
K.","family":"Roy-Chowdhury","sequence":"additional","affiliation":[{"name":"University of California, Riverside, Riverside, CA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2018,10,15]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/1553374.1553380"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/E17-2026"},{"key":"e_1_3_2_1_3_1","unstructured":"Xinlei Chen Hao Fang Tsung-Yi Lin Ramakrishna Vedantam Saurabh Gupta Piotr Doll\u00e1r and C Lawrence Zitnick. 2015. Microsoft COCO captions: Data collection and evaluation server. arXiv preprint arXiv:1504.00325 (2015).  Xinlei Chen Hao Fang Tsung-Yi Lin Ramakrishna Vedantam Saurabh Gupta Piotr Doll\u00e1r and C Lawrence Zitnick. 2015. Microsoft COCO captions: Data collection and evaluation server. arXiv preprint arXiv:1504.00325 (2015)."},{"key":"e_1_3_2_1_4_1","unstructured":"Jinsoo Choi Tae-Hyun Oh and In So Kweon. 2017. Textually Customized Video Summaries. arXiv preprint arXiv:1702.01528 (2017).  Jinsoo Choi Tae-Hyun Oh and In So Kweon. 2017. Textually Customized Video Summaries. arXiv preprint arXiv:1702.01528 (2017)."},{"key":"e_1_3_2_1_5_1","unstructured":"Junyoung Chung Caglar Gulcehre KyungHyun Cho and Yoshua Bengio. 2014. Empirical evaluation of gated recurrent neural networks on sequence modeling. arXiv preprint arXiv:1412.3555 (2014).  Junyoung Chung Caglar Gulcehre KyungHyun Cho and Yoshua Bengio. 2014. Empirical evaluation of gated recurrent neural networks on sequence modeling. arXiv preprint arXiv:1412.3555 (2014)."},{"key":"e_1_3_2_1_6_1","unstructured":"Jianfeng Dong Xirong Li and Cees GM Snoek. 2016. Word2VisualVec: Image and video to sentence matching by visual feature prediction. CoRR abs\/1604.06838 (2016).  Jianfeng Dong Xirong Li and Cees GM Snoek. 2016. Word2VisualVec: Image and video to sentence matching by visual feature prediction. 
CoRR abs\/1604.06838 (2016)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.201"},{"key":"e_1_3_2_1_8_1","unstructured":"Fartash Faghri David J. Fleet Ryan Kiros and Sanja Fidler. 2017. VSE++: Improved Visual-Semantic Embeddings. CoRR abs\/1707.05612 (2017). arXiv:1707.05612 http:\/\/arxiv.org\/abs\/1707.05612  Fartash Faghri David J. Fleet Ryan Kiros and Sanja Fidler. 2017. VSE++: Improved Visual-Semantic Embeddings. CoRR abs\/1707.05612 (2017). arXiv:1707.05612 http:\/\/arxiv.org\/abs\/1707.05612"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.5555\/1888089.1888092"},{"key":"e_1_3_2_1_10_1","volume-title":"Devise: A deep visual-semantic embedding model. In Advances in neural information processing systems. 2121--2129.","author":"Frome Andrea","year":"2013"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3123296"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-013-0658-4"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10593-2_35"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3078971.3078991"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.5555\/2566972.2566993"},{"key":"e_1_3_2_1_17_1","unstructured":"Zeyuan Hu and Julia Strout. 2018. Exploring Stereotypes and Biased Data with the Crowd. arXiv preprint arXiv:1801.03261 (2018).  Zeyuan Hu and Julia Strout. 2018. Exploring Stereotypes and Biased Data with the Crowd. arXiv preprint arXiv:1801.03261 (2018)."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3126686.3126720"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.767"},{"key":"e_1_3_2_1_20_1","volume-title":"Learning Robust Visual-Semantic Embeddings. In IEEE Conference on Computer Vision and Pattern Recognition. 
3571--3580","author":"Hubert Tsai Yao-Hung","year":"2017"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/1460096.1460104"},{"key":"e_1_3_2_1_22_1","volume-title":"European Conference on Computer Vision. Springer, 67--84","author":"Joulin Armand"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"e_1_3_2_1_24_1","unstructured":"Andrej Karpathy Armand Joulin and Fei Fei F Li. 2014. Deep fragment embeddings for bidirectional image sentence mapping. In Advances in neural information processing systems. 1889--1897.   Andrej Karpathy Armand Joulin and Fei Fei F Li. 2014. Deep fragment embeddings for bidirectional image sentence mapping. In Advances in neural information processing systems. 1889--1897."},{"key":"e_1_3_2_1_25_1","unstructured":"Ronald Kemker Angelina Abitino Marc McClure and Christopher Kanan. 2017. Measuring Catastrophic Forgetting in Neural Networks. arXiv preprint arXiv:1708.02072 (2017).  Ronald Kemker Angelina Abitino Marc McClure and Christopher Kanan. 2017. Measuring Catastrophic Forgetting in Neural Networks. arXiv preprint arXiv:1708.02072 (2017)."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"crossref","unstructured":"Tom Kenter Alexey Borisov and Maarten de Rijke. 2016. Siamese cbow: Optimizing word embeddings for sentence representations. arXiv preprint arXiv:1606.04640 (2016).  Tom Kenter Alexey Borisov and Maarten de Rijke. 2016. Siamese cbow: Optimizing word embeddings for sentence representations. arXiv preprint arXiv:1606.04640 (2016).","DOI":"10.18653\/v1\/P16-1089"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/2806416.2806475"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-33718-5_12"},{"key":"e_1_3_2_1_29_1","volume-title":"Adam: A method for stochastic optimization. 
arXiv preprint arXiv:1412.6980","author":"Kingma Diederik","year":"2014"},{"key":"e_1_3_2_1_30_1","unstructured":"Ryan Kiros Ruslan Salakhutdinov and Richard S Zemel. 2014. Unifying visualsemantic embeddings with multimodal neural language models. arXiv preprint arXiv:1411.2539 (2014).  Ryan Kiros Ruslan Salakhutdinov and Richard S Zemel. 2014. Unifying visualsemantic embeddings with multimodal neural language models. arXiv preprint arXiv:1411.2539 (2014)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299073"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46487-9_19"},{"key":"e_1_3_2_1_33_1","volume-title":"International Conference on Computer Vision.","author":"Li Ang"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3123432"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_1_36_1","unstructured":"Junhua Mao Wei Xu Yi Yang Jiang Wang Zhiheng Huang and Alan Yuille. 2014. Deep captioning with multimodal recurrent neural networks (m-rnn). arXiv preprint arXiv:1412.6632 (2014).  Junhua Mao Wei Xu Yi Yang Jiang Wang Zhiheng Huang and Alan Yuille. 2014. Deep captioning with multimodal recurrent neural networks (m-rnn). arXiv preprint arXiv:1412.6632 (2014)."},{"key":"e_1_3_2_1_37_1","unstructured":"Tomas Mikolov Kai Chen Greg Corrado and Jeffrey Dean. 2013. Efficient estimation of word representations in vector space. arXiv preprint arXiv:1301.3781 (2013).  Tomas Mikolov Kai Chen Greg Corrado and Jeffrey Dean. 2013. Efficient estimation of word representations in vector space. arXiv preprint arXiv:1301.3781 (2013)."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3206025.3206064"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/2964284.2967285"},{"key":"e_1_3_2_1_40_1","volume-title":"Dual Attention Networks for Multimodal Reasoning and Matching. 
In IEEE Conference on Computer Vision and Pattern Recognition. 299--307","author":"Nam Hyeonseob","year":"2017"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.118"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.303"},{"key":"e_1_3_2_1_43_1","unstructured":"Sebastian Ruder. 2017. An overview of multi-task learning in deep neural networks. arXiv preprint arXiv:1706.05098 (2017).  Sebastian Ruder. 2017. An overview of multi-task learning in deep neural networks. arXiv preprint arXiv:1706.05098 (2017)."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298682"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.107"},{"key":"e_1_3_2_1_46_1","unstructured":"Karen Simonyan and Andrew Zisserman. 2014. Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556 (2014).  Karen Simonyan and Andrew Zisserman. 2014. Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556 (2014)."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2010.5540112"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/2733373.2806226"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2011.5995347"},{"key":"e_1_3_2_1_50_1","unstructured":"Emiel van Miltenburg. 2016. Stereotyping and bias in the flickr30k dataset. arXiv preprint arXiv:1605.06083 (2016).  Emiel van Miltenburg. 2016. Stereotyping and bias in the flickr30k dataset. arXiv preprint arXiv:1605.06083 (2016)."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neunet.2009.06.042"},{"key":"e_1_3_2_1_52_1","unstructured":"Ivan Vendrov Ryan Kiros Sanja Fidler and Raquel Urtasun. 2015. Orderembeddings of images and language. arXiv preprint arXiv:1511.06361 (2015).  Ivan Vendrov Ryan Kiros Sanja Fidler and Raquel Urtasun. 2015. 
Orderembeddings of images and language. arXiv preprint arXiv:1511.06361 (2015)."},{"key":"e_1_3_2_1_53_1","unstructured":"Liwei Wang Yin Li Jing Huang and Svetlana Lazebnik. 2018. Learning twobranch neural networks for image-text matching tasks. IEEE Transactions on Pattern Analysis and Machine Intelligence (2018).  Liwei Wang Yin Li Jing Huang and Svetlana Lazebnik. 2018. Learning twobranch neural networks for image-text matching tasks. IEEE Transactions on Pattern Analysis and Machine Intelligence (2018)."},{"key":"e_1_3_2_1_54_1","unstructured":"Liwei Wang Yin Li Jing Huang and Svetlana Lazebnik. 2018. Learning twobranch neural networks for image-text matching tasks. IEEE Transactions on Pattern Analysis and Machine Intelligence (2018).  Liwei Wang Yin Li Jing Huang and Svetlana Lazebnik. 2018. Learning twobranch neural networks for image-text matching tasks. IEEE Transactions on Pattern Analysis and Machine Intelligence (2018)."},{"key":"e_1_3_2_1_55_1","volume-title":"IEEE Conference on Computer Vision and Pattern Recognition. IEEE, 5005--5013","author":"Wang Liwei","year":"2016"},{"key":"e_1_3_2_1_56_1","volume-title":"Sampling Matters in Deep Embedding Learning. In IEEE International Conference on Computer Vision. IEEE, 2859--2867","author":"Wu Chao-Yuan","year":"2017"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298966"},{"key":"e_1_3_2_1_58_1","unstructured":"Lei Yu Karl Moritz Hermann Phil Blunsom and Stephen Pulman. 2014. Deep learning for answer sentence selection. arXiv preprint arXiv:1412.1632 (2014).  Lei Yu Karl Moritz Hermann Phil Blunsom and Stephen Pulman. 2014. Deep learning for answer sentence selection. arXiv preprint arXiv:1412.1632 (2014)."},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"crossref","unstructured":"Jieyu Zhao Tianlu Wang Mark Yatskar Vicente Ordonez and Kai-Wei Chang. 2017. Men also like shopping: Reducing gender bias amplification using corpuslevel constraints. 
arXiv preprint arXiv:1707.09457 (2017).  Jieyu Zhao Tianlu Wang Mark Yatskar Vicente Ordonez and Kai-Wei Chang. 2017. Men also like shopping: Reducing gender bias amplification using corpuslevel constraints. arXiv preprint arXiv:1707.09457 (2017).","DOI":"10.18653\/v1\/D17-1323"},{"key":"e_1_3_2_1_60_1","unstructured":"Zhedong Zheng Liang Zheng Michael Garrett Yi Yang and Yi-Dong Shen. 2017. Dual-Path Convolutional Image-Text Embedding. arXiv preprint arXiv:1711.05535 (2017).  Zhedong Zheng Liang Zheng Michael Garrett Yi Yang and Yi-Dong Shen. 2017. Dual-Path Convolutional Image-Text Embedding. arXiv preprint arXiv:1711.05535 (2017)."}],"event":{"name":"MM '18: ACM Multimedia Conference","location":"Seoul Republic of Korea","acronym":"MM '18","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 26th ACM international conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3240508.3240712","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3240508.3240712","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3240508.3240712","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T00:43:32Z","timestamp":1750207412000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3240508.3240712"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018,10,15]]},"references-count":60,"alternative-id":["10.1145\/3240508.3240712","10.1145\/3240508"],"URL":"https:\/\/doi.org\/10.1145\/3240508.3240712","relation":{},"subject":[],"published":{"date-parts":[[2018,10,15]]},"assertion":[{"value":"2018-10-15","order":2,"name":"published","label":"Published","group":{"name":"publication_
history","label":"Publication History"}}]}}