{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T01:10:25Z","timestamp":1755825025027,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":38,"publisher":"ACM","funder":[{"name":"Fundamental Research Funds for the Central Universities, CHD","award":["300102244202"],"award-info":[{"award-number":["300102244202"]}]},{"name":"Postdoctoral Fellowship Program of CPSF","award":["GZC20241447"],"award-info":[{"award-number":["GZC20241447"]}]},{"name":"National Key R\\&D Program of China","award":["2023YFB4301800"],"award-info":[{"award-number":["2023YFB4301800"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,30]]},"DOI":"10.1145\/3731715.3734427","type":"proceedings-article","created":{"date-parts":[[2025,6,25]],"date-time":"2025-06-25T18:29:43Z","timestamp":1750876183000},"page":"1983-1987","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["MGSGM: Multi-Granularity Selective Graph Mamba for Image-Text Retrieval"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-6462-6087","authenticated-orcid":false,"given":"Yongle","family":"Huang","sequence":"first","affiliation":[{"name":"Chang'an University, Xi'an, Shaanxi, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-8898-4308","authenticated-orcid":false,"given":"Yongfeng","family":"Bu","sequence":"additional","affiliation":[{"name":"Chang'an University, Xi'an, Shaanxi, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-0699-3992","authenticated-orcid":false,"given":"Keyu","family":"Guo","sequence":"additional","affiliation":[{"name":"Chang'an University, Xi'an, Shaanxi, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-0345-804X","authenticated-orcid":false,"given":"Zedong","family":"Liu","sequence":"additional","affiliation":[{"name":"Chang'an University, Xi'an, Shaanxi, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5550-6354","authenticated-orcid":false,"given":"Xiangyu","family":"Song","sequence":"additional","affiliation":[{"name":"Chang'an University, Xi'an, Shaanxi, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4043-8448","authenticated-orcid":false,"given":"Shijie","family":"Sun","sequence":"additional","affiliation":[{"name":"Chang'an University, Xi'an, Shaanxi, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,6,30]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.279"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3499027"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i2.16209"},{"key":"e_1_3_2_1_4_1","volume-title":"Proceedings of the British Machine Vision Conference (BMVC). https:\/\/github.com\/fartashf\/vsepp","author":"Faghri Fartash","year":"2018","unstructured":"Fartash Faghri, David J Fleet, Jamie Ryan Kiros, and Sanja Fidler. 2018. VSE: Improving Visual-Semantic Embeddings with Hard Negatives. In Proceedings of the British Machine Vision Conference (BMVC). https:\/\/github.com\/fartashf\/vsepp"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01455"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3062794"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00108"},{"key":"e_1_3_2_1_8_1","volume-title":"Mamba: Linear-time sequence modeling with selective state spaces. arXiv preprint arXiv:2312.00752","author":"Gu Albert","year":"2023","unstructured":"Albert Gu and Tri Dao. 2023. Mamba: Linear-time sequence modeling with selective state spaces. arXiv preprint arXiv:2312.00752 (2023)."},{"key":"e_1_3_2_1_9_1","volume-title":"Image captioning: Transforming objects into words. Advances in neural information processing systems","author":"Herdade Simao","year":"2019","unstructured":"Simao Herdade, Armin Kappeler, Kofi Boakye, and Joao Soares. 2019. Image captioning: Transforming objects into words. Advances in neural information processing systems, Vol. 32 (2019)."},{"key":"e_1_3_2_1_10_1","volume-title":"International conference on machine learning. PMLR, 3734--3743","author":"Lee Junhyun","year":"2019","unstructured":"Junhyun Lee, Inyeop Lee, and Jaewoo Kang. 2019. Self-attention graph pooling. In International conference on machine learning. PMLR, 3734--3743."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01225-0_13"},{"key":"e_1_3_2_1_12_1","volume-title":"European Conference on Computer Vision. Springer, 237--255","author":"Li Kunchang","year":"2024","unstructured":"Kunchang Li, Xinhao Li, Yi Wang, Yinan He, Yali Wang, Limin Wang, and Yu Qiao. 2024a. Videomamba: State space model for efficient video understanding. In European Conference on Computer Vision. Springer, 237--255."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00475"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2024.111457"},{"key":"e_1_3_2_1_15_1","volume-title":"Proceedings, Part V 13","author":"Lin Tsung-Yi","year":"2014","unstructured":"Tsung-Yi Lin, Michael Maire, Serge Belongie, James Hays, Pietro Perona, Deva Ramanan, Piotr Doll\u00e1r, and C Lawrence Zitnick. 2014. Microsoft coco: Common objects in context. In Computer Vision--ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6--12, 2014, Proceedings, Part V 13. Springer, 740--755."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01093"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.ipm.2023.103546"},{"key":"e_1_3_2_1_18_1","volume-title":"Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu. 2019. Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692, Vol. 364 (2019)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/P14-5010"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33014602"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01847"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00160"},{"key":"e_1_3_2_1_23_1","volume-title":"Vl-mamba: Exploring state space models for multimodal learning. arXiv preprint arXiv:2403.13600","author":"Qiao Yanyuan","year":"2024","unstructured":"Yanyuan Qiao, Zheng Yu, Longteng Guo, Sihan Chen, Zijia Zhao, Mingzhen Sun, Qi Wu, and Jing Liu. 2024. Vl-mamba: Exploring state space models for multimodal learning. arXiv preprint arXiv:2403.13600 (2024)."},{"key":"e_1_3_2_1_24_1","volume-title":"International conference on machine learning. Pmlr, 8821--8831","author":"Ramesh Aditya","year":"2021","unstructured":"Aditya Ramesh, Mikhail Pavlov, Gabriel Goh, Scott Gray, Chelsea Voss, Alec Radford, Mark Chen, and Ilya Sutskever. 2021. Zero-shot text-to-image generation. In International conference on machine learning. Pmlr, 8821--8831."},{"key":"e_1_3_2_1_25_1","volume-title":"Faster R-CNN: Towards real-time object detection with region proposal networks","author":"Ren Shaoqing","year":"2016","unstructured":"Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. 2016. Faster R-CNN: Towards real-time object detection with region proposal networks. IEEE transactions on pattern analysis and machine intelligence, Vol. 39, 6 (2016), 1137--1149."},{"key":"e_1_3_2_1_26_1","volume-title":"Graph attention networks. arXiv preprint arXiv:1710.10903","author":"Veli\u010dkovi\u0107 Petar","year":"2017","unstructured":"Petar Veli\u010dkovi\u0107, Guillem Cucurull, Arantxa Casanova, Adriana Romero, Pietro Lio, and Yoshua Bengio. 2017. Graph attention networks. arXiv preprint arXiv:1710.10903 (2017)."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.sigpro.2016.08.012"},{"key":"e_1_3_2_1_28_1","volume-title":"Fvqa: Fact-based visual question answering","author":"Wang Peng","year":"2017","unstructured":"Peng Wang, Qi Wu, Chunhua Shen, Anthony Dick, and Anton Van Den Hengel. 2017b. Fvqa: Fact-based visual question answering. IEEE transactions on pattern analysis and machine intelligence, Vol. 40, 10 (2017), 2413--2427."},{"key":"e_1_3_2_1_29_1","volume-title":"Inter-Modality and Intra-Sample Alignment for Multi-Modal Emotion Recognition. In ICASSP 2024--2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 8301--8305","author":"Wang Yusong","year":"2024","unstructured":"Yusong Wang, Dongyuan Li, and Jialun Shen. 2024. Inter-Modality and Intra-Sample Alignment for Multi-Modal Emotion Recognition. In ICASSP 2024--2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 8301--8305."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413877"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01302"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01095"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00166"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00359"},{"key":"e_1_3_2_1_35_1","volume-title":"Cross-modal Prominent Fragments Enhancement Aligning Network for Image-text Retrieval. In 2024 IEEE International Conference on Multimedia and Expo (ICME). IEEE, 1--6.","author":"Zhang Yang","year":"2024","unstructured":"Yang Zhang, Yue Zhou, Zonghao Yang, and Ao Chen. 2024. Cross-modal Prominent Fragments Enhancement Aligning Network for Image-text Retrieval. In 2024 IEEE International Conference on Multimedia and Expo (ICME). IEEE, 1--6."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3539618.3592025"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01064"},{"key":"e_1_3_2_1_38_1","volume-title":"Vision mamba: Efficient visual representation learning with bidirectional state space model. arXiv preprint arXiv:2401.09417","author":"Zhu Lianghui","year":"2024","unstructured":"Lianghui Zhu, Bencheng Liao, Qian Zhang, Xinlong Wang, Wenyu Liu, and Xinggang Wang. 2024. Vision mamba: Efficient visual representation learning with bidirectional state space model. arXiv preprint arXiv:2401.09417 (2024)."}],"event":{"name":"ICMR '25: International Conference on Multimedia Retrieval","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Chicago IL USA","acronym":"ICMR '25"},"container-title":["Proceedings of the 2025 International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3731715.3734427","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T04:12:00Z","timestamp":1755749520000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3731715.3734427"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,30]]},"references-count":38,"alternative-id":["10.1145\/3731715.3734427","10.1145\/3731715"],"URL":"https:\/\/doi.org\/10.1145\/3731715.3734427","relation":{},"subject":[],"published":{"date-parts":[[2025,6,30]]},"assertion":[{"value":"2025-06-30","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}