{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,14]],"date-time":"2026-04-14T01:56:41Z","timestamp":1776131801808,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":36,"publisher":"ACM","license":[{"start":{"date-parts":[[2021,10,17]],"date-time":"2021-10-17T00:00:00Z","timestamp":1634428800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Key R&D Program of China","award":["No. 2020AAA0104500"],"award-info":[{"award-number":["No. 2020AAA0104500"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2021,10,17]]},"DOI":"10.1145\/3474085.3475634","type":"proceedings-article","created":{"date-parts":[[2021,10,18]],"date-time":"2021-10-18T06:21:10Z","timestamp":1634538070000},"page":"5185-5193","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":51,"title":["Structured Multi-modal Feature Embedding and Alignment for Image-Sentence Retrieval"],"prefix":"10.1145","author":[{"given":"Xuri","family":"Ge","sequence":"first","affiliation":[{"name":"University of Glasgow, Glasgow, United Kingdom"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Fuhai","family":"Chen","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Joemon M.","family":"Jose","sequence":"additional","affiliation":[{"name":"University of Glasgow, Glasgow, United Kingdom"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhilong","family":"Ji","sequence":"additional","affiliation":[{"name":"Tomorrow Advancing Life, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhongqin","family":"Wu","sequence":"additional","affiliation":[{"name":"Tomorrow Advancing Life, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiao","family":"Liu","sequence":"additional","affiliation":[{"name":"Tomorrow Advancing Life, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2021,10,17]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00636"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.5555\/3454287.3454460"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3123275"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01267"},{"key":"e_1_3_2_1_5_1","volume-title":"Expressing Objects just like Words: Recurrent Visual Embedding for Image-Text Matching. arXiv preprint arXiv:2002.08510","author":"Chen Tianlang","year":"2020","unstructured":"Tianlang Chen and Jiebo Luo . 2020. Expressing Objects just like Words: Recurrent Visual Embedding for Image-Text Matching. arXiv preprint arXiv:2002.08510 ( 2020 ). Tianlang Chen and Jiebo Luo. 2020. Expressing Objects just like Words: Recurrent Visual Embedding for Image-Text Matching. arXiv preprint arXiv:2002.08510 (2020)."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00419"},{"key":"e_1_3_2_1_7_1","volume-title":"Jamie Ryan Kiros, and Sanja Fidler","author":"Faghri Fartash","year":"2017","unstructured":"Fartash Faghri , David J Fleet , Jamie Ryan Kiros, and Sanja Fidler . 2017 . Vse : Improving visual-semantic embeddings with hard negatives. arXiv preprint arXiv:1707.05612(2017). Fartash Faghri, David J Fleet, Jamie Ryan Kiros, and Sanja Fidler. 2017. Vse: Improving visual-semantic embeddings with hard negatives. arXiv preprint arXiv:1707.05612(2017)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.5555\/2999792.2999849"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00645"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.5555\/2969033.2969038"},{"key":"e_1_3_2_1_13_1","volume-title":"Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980","author":"Kingma Diederik P","year":"2014","unstructured":"Diederik P Kingma and Jimmy Ba . 2014 . Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014). Diederik P Kingma and Jimmy Ba. 2014. Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01225-0_13"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00475"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2020.3042193"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3350869"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01093"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3240508.3240684"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.3115\/1118108.1118117"},{"key":"e_1_3_2_1_22_1","volume-title":"Deep captioning with multimodal recurrent neural networks (m-rnn). arXiv preprint arXiv:1412.6632","author":"Mao Junhua","year":"2014","unstructured":"Junhua Mao , Wei Xu , Yi Yang , Jiang Wang , Zhiheng Huang , and Alan Yuille . 2014. Deep captioning with multimodal recurrent neural networks (m-rnn). arXiv preprint arXiv:1412.6632 ( 2014 ). Junhua Mao, Wei Xu, Yi Yang, Jiang Wang, Zhiheng Huang, and Alan Yuille. 2014. Deep captioning with multimodal recurrent neural networks (m-rnn). arXiv preprint arXiv:1412.6632 (2014)."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.232"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.208"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.5555\/2969239.2969250"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/78.650093"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.5555\/3104482.3104499"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.5555\/3295222.3295349"},{"key":"e_1_3_2_1_29_1","volume-title":"Order-embeddings of images and language. arXiv preprint arXiv:1511.06361","author":"Vendrov Ivan","year":"2015","unstructured":"Ivan Vendrov , Ryan Kiros , Sanja Fidler , and Raquel Urtasun . 2015. Order-embeddings of images and language. arXiv preprint arXiv:1511.06361 ( 2015 ). Ivan Vendrov, Ryan Kiros, Sanja Fidler, and Raquel Urtasun. 2015. Order-embeddings of images and language. arXiv preprint arXiv:1511.06361 (2015)."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58586-0_2"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2797921"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.541"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.5555\/3367471.3367568"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01095"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00166"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00359"}],"event":{"name":"MM '21: ACM Multimedia Conference","location":"Virtual Event China","acronym":"MM '21","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 29th ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3474085.3475634","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3474085.3475634","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T20:48:24Z","timestamp":1750193304000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3474085.3475634"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,10,17]]},"references-count":36,"alternative-id":["10.1145\/3474085.3475634","10.1145\/3474085"],"URL":"https:\/\/doi.org\/10.1145\/3474085.3475634","relation":{},"subject":[],"published":{"date-parts":[[2021,10,17]]},"assertion":[{"value":"2021-10-17","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}