{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T01:10:24Z","timestamp":1755825024945,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":56,"publisher":"ACM","funder":[{"name":"Nos.62176144,Nos.ZR2019ZD03,ZR2024MF043,ZR2024ZD08,No.ts20190924"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,30]]},"DOI":"10.1145\/3731715.3733460","type":"proceedings-article","created":{"date-parts":[[2025,6,25]],"date-time":"2025-06-25T18:29:43Z","timestamp":1750876183000},"page":"265-274","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Video Frame Enhancement based Text Semantic Fusion for Cross-modal Text-video Retrieval"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-2226-8811","authenticated-orcid":false,"given":"Kang","family":"Du","sequence":"first","affiliation":[{"name":"Shandong Normal University, Jinan, Shandong Province, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6259-7533","authenticated-orcid":false,"given":"Huaxiang","family":"Zhang","sequence":"additional","affiliation":[{"name":"Shandong Normal University, Jinan, Shandong Province, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9121-5124","authenticated-orcid":false,"given":"Li","family":"Liu","sequence":"additional","affiliation":[{"name":"Shandong Normal University, Jinan, Shandong Province, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8061-1797","authenticated-orcid":false,"given":"Dongmei","family":"Liu","sequence":"additional","affiliation":[{"name":"Shandong Normal University, Jinan, Shandong Province, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-4500-2519","authenticated-orcid":false,"given":"Hao","family":"Du","sequence":"additional","affiliation":[{"name":"Shandong Normal University, Jinan, Shandong Province, China"}]}],"member":"320","published-online":{"date-parts":[[2025,6,30]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i8.16822"},{"key":"e_1_3_2_1_2_1","volume-title":"GQE: Generalized Query Expansion for Enhanced Text-Video Retrieval. arXiv preprint arXiv:2408.07249","author":"Bai Zechen","year":"2024","unstructured":"Zechen Bai, Tianjun Xiao, Tong He, Pichao Wang, Zheng Zhang, Thomas Brox, and Mike Zheng Shou. 2024. GQE: Generalized Query Expansion for Enhanced Text-Video Retrieval. arXiv preprint arXiv:2408.07249 (2024)."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.02080"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00513"},{"key":"e_1_3_2_1_6_1","volume-title":"RAP: Efficient Text-Video Retrieval with Sparse-and-Correlated Adapter. arXiv preprint arXiv:2405.19465","author":"Cao Meng","year":"2024","unstructured":"Meng Cao, Haoran Tang, Jinfa Huang, Peng Jin, Can Zhang, Ruyang Liu, Long Chen, Xiaodan Liang, Li Yuan, and Ge Li. 2024. RAP: Efficient Text-Video Retrieval with Sparse-and-Correlated Adapter. arXiv preprint arXiv:2405.19465 (2024)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.5555\/2002472.2002497"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01065"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i1.25113"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01434"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01262"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611756"},{"key":"e_1_3_2_1_13_1","volume-title":"Proceedings, Part IV 16","author":"Gabeur Valentin","year":"2020","unstructured":"Valentin Gabeur, Chen Sun, Karteek Alahari, and Cordelia Schmid. 2020. Multi-modal transformer for video retrieval. In Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part IV 16. Springer, 214--229."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01569"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3422622"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00495"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01107"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2024.3411298"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00244"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00234"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02563"},{"key":"e_1_3_2_1_22_1","volume-title":"Proceedings of naacL-HLT","volume":"1","author":"Ming-Wei Chang Jacob Devlin","year":"2019","unstructured":"Jacob Devlin Ming-Wei Chang Kenton and Lee Kristina Toutanova. 2019. Bert: Pre-training of deep bidirectional transformers for language understanding. In Proceedings of naacL-HLT, Vol. 1. Minneapolis, Minnesota, 2."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00379"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3132068"},{"key":"e_1_3_2_1_25_1","volume-title":"Deep collaborative embedding for social image understanding","author":"Li Zechao","year":"2018","unstructured":"Zechao Li, Jinhui Tang, and Tao Mei. 2018. Deep collaborative embedding for social image understanding. IEEE transactions on pattern analysis and machine intelligence, Vol. 41, 9 (2018), 2070--2083."},{"key":"e_1_3_2_1_26_1","volume-title":"Use what you have: Video retrieval using representations from collaborative experts. arXiv preprint arXiv:1907.13487","author":"Liu Yang","year":"2019","unstructured":"Yang Liu, Samuel Albanie, Arsha Nagrani, and Andrew Zisserman. 2019. Use what you have: Video retrieval using representations from collaborative experts. arXiv preprint arXiv:1907.13487 (2019)."},{"key":"e_1_3_2_1_27_1","volume-title":"Clip4clip: An empirical study of clip for end to end video clip retrieval. arXiv preprint arXiv:2104.08860","author":"Luo Huaishao","year":"2021","unstructured":"Huaishao Luo, Lei Ji, Ming Zhong, Yang Chen, Wen Lei, Nan Duan, and Tianrui Li. 2021. Clip4clip: An empirical study of clip for end to end video clip retrieval. arXiv preprint arXiv:2104.08860 (2021)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547910"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00272"},{"key":"e_1_3_2_1_30_1","volume-title":"Glide: Towards photorealistic image generation and editing with text-guided diffusion models. arXiv preprint arXiv:2112.10741","author":"Nichol Alex","year":"2021","unstructured":"Alex Nichol, Prafulla Dhariwal, Aditya Ramesh, Pranav Shyam, Pamela Mishkin, Bob McGrew, Ilya Sutskever, and Mark Chen. 2021. Glide: Towards photorealistic image generation and editing with text-guided diffusion models. arXiv preprint arXiv:2112.10741 (2021)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.naacl-main.261"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01820"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.4249\/scholarpedia.1883"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-77004-4_1"},{"key":"e_1_3_2_1_35_1","volume-title":"Caiming Xiong, Silvio Savarese, et al.","author":"Qin Can","year":"2023","unstructured":"Can Qin, Shu Zhang, Ning Yu, Yihao Feng, Xinyi Yang, Yingbo Zhou, Huan Wang, Juan Carlos Niebles, Caiming Xiong, Silvio Savarese, et al. 2023. Unicontrol: A unified diffusion model for controllable visual generation in the wild. arXiv preprint arXiv:2305.11147 (2023)."},{"key":"e_1_3_2_1_36_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_37_1","unstructured":"Alec Radford Jeffrey Wu Rewon Child David Luan Dario Amodei Ilya Sutskever et al. 2019. Language models are unsupervised multitask learners. OpenAI blog Vol. 1 8 (2019) 9."},{"key":"e_1_3_2_1_38_1","volume-title":"Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125","author":"Ramesh Aditya","year":"2022","unstructured":"Aditya Ramesh, Prafulla Dhariwal, Alex Nichol, Casey Chu, and Mark Chen. 2022. Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125 , Vol. 1, 2 (2022), 3."},{"key":"e_1_3_2_1_39_1","first-page":"71","article-title":"Dall-e: Creating images from text","volume":"8","author":"Murahari Reddy Mr D","year":"2021","unstructured":"Mr D Murahari Reddy, Mr Sk Masthan Basha, Mr M Chinnaiahgari Hari, and Mr N Penchalaiah. 2021. Dall-e: Creating images from text. UGC Care Group I Journal , Vol. 8, 14 (2021), 71--75.","journal-title":"UGC Care Group I Journal"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-016-0987-1"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_42_1","volume-title":"Burcu Karagol Ayan, Tim Salimans, et al.","author":"Saharia Chitwan","year":"2022","unstructured":"Chitwan Saharia, William Chan, Saurabh Saxena, Lala Li, Jay Whang, Emily L Denton, Kamyar Ghasemipour, Raphael Gontijo Lopes, Burcu Karagol Ayan, Tim Salimans, et al. 2022. Photorealistic text-to-image diffusion models with deep language understanding. Advances in neural information processing systems, Vol. 35 (2022), 36479--36494."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.displa.2024.102801"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01602"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01739"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i6.28327"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01566"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3204444"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1007\/s13735-024-00335-7"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01031"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.571"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00143"},{"key":"e_1_3_2_1_53_1","volume-title":"Clip-vip: Adapting pre-trained image-text model to video-language representation alignment. arXiv preprint arXiv:2209.06430","author":"Xue Hongwei","year":"2022","unstructured":"Hongwei Xue, Yuchong Sun, Bei Liu, Jianlong Fu, Ruihua Song, Houqiang Li, and Jiebo Luo. 2022. Clip-vip: Adapting pre-trained image-text model to video-language representation alignment. arXiv preprint arXiv:2209.06430 (2022)."},{"key":"e_1_3_2_1_54_1","volume-title":"Classification is a strong baseline for deep metric learning. arXiv preprint arXiv:1811.12649","author":"Zhai Andrew","year":"2018","unstructured":"Andrew Zhai and Hao-Yu Wu. 2018. Classification is a strong baseline for deep metric learning. arXiv preprint arXiv:1811.12649 (2018)."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1145\/3477495.3531950"}],"event":{"name":"ICMR '25: International Conference on Multimedia Retrieval","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Chicago IL USA","acronym":"ICMR '25"},"container-title":["Proceedings of the 2025 International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3731715.3733460","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T04:12:32Z","timestamp":1755749552000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3731715.3733460"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,30]]},"references-count":56,"alternative-id":["10.1145\/3731715.3733460","10.1145\/3731715"],"URL":"https:\/\/doi.org\/10.1145\/3731715.3733460","relation":{},"subject":[],"published":{"date-parts":[[2025,6,30]]},"assertion":[{"value":"2025-06-30","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}