{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,9]],"date-time":"2026-01-09T19:19:31Z","timestamp":1767986371488,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":49,"publisher":"ACM","license":[{"start":{"date-parts":[[2021,10,17]],"date-time":"2021-10-17T00:00:00Z","timestamp":1634428800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Sichuan Science and Technology Program","award":["2019YJ0176,2019YJ0177,2019YFQ0005"],"award-info":[{"award-number":["2019YJ0176,2019YJ0177,2019YFQ0005"]}]},{"name":"Key-Area Research and Development Program of Guangdong Province","award":["2019B010136003"],"award-info":[{"award-number":["2019B010136003"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2021,10,17]]},"DOI":"10.1145\/3474085.3475567","type":"proceedings-article","created":{"date-parts":[[2021,10,18]],"date-time":"2021-10-18T05:04:15Z","timestamp":1634533455000},"page":"4277-4286","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":13,"title":["Exploring Graph-Structured Semantics for Cross-Modal Retrieval"],"prefix":"10.1145","author":[{"given":"Lei","family":"Zhang","sequence":"first","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, China"}]},{"given":"Leiting","family":"Chen","sequence":"additional","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, China"}]},{"given":"Chuan","family":"Zhou","sequence":"additional","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, China"}]},{"given":"Fan","family":"Yang","sequence":"additional","affiliation":[{"name":"AIQ, Abu Dhabi, UAE"}]},{"given":"Xin","family":"Li","sequence":"additional","affiliation":[{"name":"AIQ, Abu Dhabi, UAE"}]}],"member":"320","published-online":{"date-parts":[[2021,10,17]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"Bottom-Up and Top-Down Attention for Image Captioning and Visual Question Answering. In 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). IEEE","author":"Anderson Peter","year":"2018"},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.5555\/3042817.3043076"},{"key":"e_1_3_2_2_3_1","unstructured":"Joan Bruna Wojciech Zaremba Arthur Szlam and Yann LeCun. 2014. Spectral Networks and Locally Connected Networks on Graphs. arxiv: 1312.6203 [cs.LG]  Joan Bruna Wojciech Zaremba Arthur Szlam and Yann LeCun. 2014. Spectral Networks and Locally Connected Networks on Graphs. arxiv: 1312.6203 [cs.LG]"},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP.2012.6467268"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/1646396.1646452"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2013.142"},{"key":"e_1_3_2_2_7_1","series-title":"Series B (Methodological)","volume-title":"Maximum Likelihood From Incomplete Data Via The EM algorithm. Journal of the Royal Statistical Society","author":"Dempster Arthur","year":"1977"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00808"},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/2647868.2654902"},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-013-0658-4"},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.5555\/2969033.2969125"},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.1162\/0899766042321814"},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.5555\/3157096.3157188"},{"key":"e_1_3_2_2_14_1","volume-title":"Biometrika","volume":"28","author":"Hotelling Harold","year":"1935"},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3331184.3331213"},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.93"},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCYB.2018.2879846"},{"key":"e_1_3_2_2_18_1","volume-title":"MMBERT: Multimodal BERT Pretraining for Improved Medical VQA. In 2021 IEEE 18th International Symposium on Biomedical Imaging (ISBI). IEEE","author":"Khare Yash","year":"2021"},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1181"},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58610-2_21"},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6839"},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.5555\/3104482.3104569"},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.5555\/3061053.3061157"},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3284750"},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2017.2742704"},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2015.2400779"},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.5555\/1866696.1866717"},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/1873951.1873987"},{"key":"e_1_3_2_2_29_1","volume-title":"Edit and Tell: A Framework for Editing Image Captions. In 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). IEEE","author":"Sammani Fawaz","year":"2020"},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2011.5995350"},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.5555\/2354409.2355114"},{"key":"e_1_3_2_2_32_1","unstructured":"Karen Simonyan and Andrew Zisserman. 2014. Very deep convolutional networks for large-scale image recognition.  Karen Simonyan and Andrew Zisserman. 2014. Very deep convolutional networks for large-scale image recognition."},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.1162\/089976600300015349"},{"key":"e_1_3_2_2_34_1","article-title":"Viualizing data using t-SNE","volume":"9","author":"van der Maaten Laurens","year":"2008","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3123326"},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00192"},{"key":"e_1_3_2_2_37_1","first-page":"449","article-title":"Cross-Modal Retrieval With CNN Visual Features: A New Baseline","volume":"47","author":"Wei Yunchao","year":"2017","journal-title":"IEEE Transactions on Cybernetics"},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413822"},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"publisher","DOI":"10.5555\/3367032.3367172"},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298966"},{"key":"e_1_3_2_2_41_1","volume-title":"CPR-GCN: Conditional Partial-Residual Graph Convolutional Network in Automated Anatomical Labeling of Coronary Arteries. In 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Yang Han","year":"2020"},{"key":"e_1_3_2_2_42_1","volume-title":"DualGAN: Unsupervised Dual Learning for Image-to-Image Translation. In 2017 IEEE International Conference on Computer Vision (ICCV). IEEE","author":"Yi Zili","year":"2017"},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-00776-8_21"},{"key":"e_1_3_2_2_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01280"},{"key":"e_1_3_2_2_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2013.2276704"},{"key":"e_1_3_2_2_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2013.2276704"},{"key":"e_1_3_2_2_47_1","unstructured":"Ye Zhang and Byron Wallace. 2015. A Sensitivity Analysis of (and Practitioners' Guide to) Convolutional Neural Networks for Sentence Classification.  Ye Zhang and Byron Wallace. 2015. A Sensitivity Analysis of (and Practitioners' Guide to) Convolutional Neural Networks for Sentence Classification."},{"key":"e_1_3_2_2_48_1","volume-title":"Deep Supervised Cross-Modal Retrieval. In 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). IEEE","author":"Zhen Liangli","year":"2019"},{"key":"e_1_3_2_2_49_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413607"}],"event":{"name":"MM '21: ACM Multimedia Conference","location":"Virtual Event China","acronym":"MM '21","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 29th ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3474085.3475567","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3474085.3475567","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T20:49:11Z","timestamp":1750193351000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3474085.3475567"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,10,17]]},"references-count":49,"alternative-id":["10.1145\/3474085.3475567","10.1145\/3474085"],"URL":"https:\/\/doi.org\/10.1145\/3474085.3475567","relation":{},"subject":[],"published":{"date-parts":[[2021,10,17]]},"assertion":[{"value":"2021-10-17","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}