{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,9]],"date-time":"2025-10-09T00:20:03Z","timestamp":1759969203133,"version":"build-2065373602"},"publisher-location":"New York, NY, USA","reference-count":37,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,5,8]],"date-time":"2025-05-08T00:00:00Z","timestamp":1746662400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,5,8]]},"DOI":"10.1145\/3701716.3717564","type":"proceedings-article","created":{"date-parts":[[2025,5,23]],"date-time":"2025-05-23T16:12:56Z","timestamp":1748016776000},"page":"2153-2160","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["The DenseCap-Guided Attention Network For Image-Text Matching"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-3925-4951","authenticated-orcid":false,"given":"Xuri","family":"Ge","sequence":"first","affiliation":[{"name":"Shandong University, jinan, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-2184-9971","authenticated-orcid":false,"given":"Linqing","family":"Li","sequence":"additional","affiliation":[{"name":"Central China Normal University, Wuhan, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-5735-8674","authenticated-orcid":false,"given":"Songpei","family":"Xu","sequence":"additional","affiliation":[{"name":"University of Glasgow, Glasgow, United Kingdom"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-2516-8407","authenticated-orcid":false,"given":"Kaiwen","family":"Zheng","sequence":"additional","affiliation":[{"name":"University of Glasgow, Glasgow, United Kingdom"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-7131-4581","authenticated-orcid":false,"given":"Yaoqin","family":"He","sequence":"additional","affiliation":[{"name":"Tomorrow Advancing Life, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4759-2042","authenticated-orcid":false,"given":"Junchen","family":"Fu","sequence":"additional","affiliation":[{"name":"University of Glasgow, Glasgow, United Kingdom"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9228-1759","authenticated-orcid":false,"given":"Joemon M.","family":"Jose","sequence":"additional","affiliation":[{"name":"University of Glasgow, Glasgow, United Kingdom"}]}],"member":"320","published-online":{"date-parts":[[2025,5,23]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00636"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.279"},{"key":"e_1_3_2_1_3_1","unstructured":"Fuhai Chen Rongrong Ji Jiayi Ji Xiaoshuai Sun Baochang Zhang Xuri Ge Yongjian Wu Feiyue Huang and Yan Wang. 2019. Variational Structured Semantic Inference for Diverse Image Captioning. In Advances in Neural Information Processing Systems. 1931--1941."},{"key":"e_1_3_2_1_4_1","volume-title":"Jamie Ryan Kiros, and Sanja Fidler","author":"Faghri Fartash","year":"2017","unstructured":"Fartash Faghri, David J Fleet, Jamie Ryan Kiros, and Sanja Fidler. 2017. Vse: Improving visual-semantic embeddings with hard negatives. arXiv preprint arXiv:1707.05612 (2017)."},{"key":"e_1_3_2_1_5_1","volume-title":"Devise: A deep visual-semantic embedding model. In Advances in Neural Information Processing Systems. 2121--2129.","author":"Frome Andrea","year":"2013","unstructured":"Andrea Frome, Greg S Corrado, Jon Shlens, Samy Bengio, Jeff Dean, Marc'Aurelio Ranzato, and Tomas Mikolov. 2013. Devise: A deep visual-semantic embedding model. In Advances in Neural Information Processing Systems. 2121--2129."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475634"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICME.2019.00069"},{"key":"e_1_3_2_1_8_1","volume-title":"Proceedings of the IEEE\/CVF winter conference on applications of computer vision. 1022--1031","author":"Ge Xuri","year":"2023","unstructured":"Xuri Ge, Fuhai Chen, Songpei Xu, Fuxiang Tao, and Joemon M Jose. 2023. Crossmodal semantic enhanced interaction for image-sentence retrieval. In Proceedings of the IEEE\/CVF winter conference on applications of computer vision. 1022--1031."},{"key":"e_1_3_2_1_9_1","volume-title":"Hire: Hybrid-modal interaction with multiple relational enhancements for image-text matching. ACM Transactions on Intelligent Systems and Technology","author":"Ge Xuri","year":"2020","unstructured":"Xuri Ge, Fuhai Chen, Songpei Xu, Fuxiang Tao, Jie Wang, and Joemon M Jose. 2020. Hire: Hybrid-modal interaction with multiple relational enhancements for image-text matching. ACM Transactions on Intelligent Systems and Technology (2020)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.ipm.2024.103716"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00645"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00585"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.494"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"e_1_3_2_1_17_1","unstructured":"Andrej Karpathy Armand Joulin and Li F Fei-Fei. 2014. Deep fragment embeddings for bidirectional image sentence mapping. In Advances in Neural Information Processing Systems. 1889--1897."},{"key":"e_1_3_2_1_18_1","volume-title":"Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980","author":"Kingma Diederik P","year":"2014","unstructured":"Diederik P Kingma and Jimmy Ba. 2014. Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-016-0981-7"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01225-0_13"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3350869"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01093"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.442"},{"key":"e_1_3_2_1_25_1","volume-title":"Deep captioning with multimodal recurrent neural networks (m-rnn). arXiv preprint arXiv:1412.6632","author":"Mao Junhua","year":"2014","unstructured":"Junhua Mao,Wei Xu, Yi Yang, JiangWang, Zhiheng Huang, and Alan Yuille. 2014. Deep captioning with multimodal recurrent neural networks (m-rnn). arXiv preprint arXiv:1412.6632 (2014)."},{"key":"e_1_3_2_1_26_1","volume-title":"Fine-grained Visual Textual Alignment for Cross-Modal Retrieval using Transformer Encoders. arXiv preprint arXiv:2008.05231","author":"Messina Nicola","year":"2020","unstructured":"Nicola Messina, Giuseppe Amato, Andrea Esuli, Fabrizio Falchi, Claudio Gennaro, and St\u00e9phane Marchand-Maillet. 2020. Fine-grained Visual Textual Alignment for Cross-Modal Retrieval using Transformer Encoders. arXiv preprint arXiv:2008.05231 (2020)."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.232"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.208"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/78.650093"},{"key":"e_1_3_2_1_30_1","volume-title":"Orderembeddings of images and language. arXiv preprint arXiv:1511.06361","author":"Vendrov Ivan","year":"2015","unstructured":"Ivan Vendrov, Ryan Kiros, Sanja Fidler, and Raquel Urtasun. 2015. Orderembeddings of images and language. arXiv preprint arXiv:1511.06361 (2015)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"e_1_3_2_1_32_1","volume-title":"Consensus- Aware Visual-Semantic Embedding for Image-Text Matching. arXiv preprint arXiv:2007.08883","author":"Zhang Ying","year":"2020","unstructured":"HaoranWang, Ying Zhang, Zhong Ji, Yanwei Pang, and Lin Ma. 2020. Consensus- Aware Visual-Semantic Embedding for Image-Text Matching. arXiv preprint arXiv:2007.08883 (2020)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2797921"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.541"},{"key":"e_1_3_2_1_35_1","volume-title":"Position focused attention network for image-text matching. arXiv preprint arXiv:1907.09748","author":"Wang Yaxiong","year":"2019","unstructured":"Yaxiong Wang, Hao Yang, Xueming Qian, Lin Ma, Jing Lu, Biao Li, and Xin Fan. 2019. Position focused attention network for image-text matching. arXiv preprint arXiv:1907.09748 (2019)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00166"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00359"}],"event":{"name":"WWW '25: The ACM Web Conference 2025","sponsor":["SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"],"location":"Sydney NSW Australia","acronym":"WWW '25"},"container-title":["Companion Proceedings of the ACM on Web Conference 2025"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3701716.3717564","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3701716.3717564","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,8]],"date-time":"2025-10-08T03:04:42Z","timestamp":1759892682000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3701716.3717564"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,5,8]]},"references-count":37,"alternative-id":["10.1145\/3701716.3717564","10.1145\/3701716"],"URL":"https:\/\/doi.org\/10.1145\/3701716.3717564","relation":{},"subject":[],"published":{"date-parts":[[2025,5,8]]},"assertion":[{"value":"2025-05-23","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}