{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,10]],"date-time":"2025-09-10T21:54:00Z","timestamp":1757541240319,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":43,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,5,30]],"date-time":"2024-05-30T00:00:00Z","timestamp":1717027200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Beijing Natural Science Foundation","award":["L233008"],"award-info":[{"award-number":["L233008"]}]},{"DOI":"10.13039\/501100006374","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62072462"],"award-info":[{"award-number":["62072462"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,5,30]]},"DOI":"10.1145\/3652583.3658038","type":"proceedings-article","created":{"date-parts":[[2024,6,7]],"date-time":"2024-06-07T06:30:40Z","timestamp":1717741840000},"page":"1034-1042","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["UBiSS: A Unified Framework for Bimodal Semantic Summarization of Videos"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-6036-8687","authenticated-orcid":false,"given":"Yuting","family":"Mei","sequence":"first","affiliation":[{"name":"Renmin University of China, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9809-8864","authenticated-orcid":false,"given":"Linli","family":"Yao","sequence":"additional","affiliation":[{"name":"Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6486-6020","authenticated-orcid":false,"given":"Qin","family":"Jin","sequence":"additional","affiliation":[{"name":"Renmin University of China, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2024,6,7]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Spice: Semantic propositional image caption evaluation. In Computer Vision-ECCV 2016: 14th European Conference","author":"Anderson Peter","year":"2016","unstructured":"Peter Anderson, Basura Fernando, Mark Johnson, and Stephen Gould. 2016. Spice: Semantic propositional image caption evaluation. In Computer Vision-ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11-14, 2016, Proceedings, Part V 14. Springer, 382--398."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISM52913.2021.00045"},{"key":"e_1_3_2_1_3_1","volume-title":"Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization. 65--72","author":"Banerjee Satanjeev","year":"2005","unstructured":"Satanjeev Banerjee and Alon Lavie. 2005. METEOR: An automatic metric for MT evaluation with improved correlation with human judgments. In Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization. 65--72."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"crossref","unstructured":"Bor-Chun Chen Yan-Ying Chen and Francine Chen. 2017. Video to Text Summary: Joint Video Summarization and Captioning with Recurrent Neural Networks.. In Bmvc.","DOI":"10.5244\/C.31.118"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298981"},{"key":"e_1_3_2_1_6_1","volume-title":"Antonio da Luz Jr, and Arnaldo de Albuquerque Ara\u00fajo.","author":"Fontes De Avila Sandra Eliza","year":"2011","unstructured":"Sandra Eliza Fontes De Avila, Ana Paula Brandao Lopes, Antonio da Luz Jr, and Arnaldo de Albuquerque Ara\u00fajo. 2011. VSUMM: A mechanism designed to produce static video summaries and a novel evaluation method. Pattern recognition letters, Vol. 32, 1 (2011), 56--68."},{"key":"e_1_3_2_1_7_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-21074-8_4"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10584-0_33"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.595"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/582415.582418"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2020.04.132"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2019.2904996"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01590"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1093\/biomet\/33.3.239"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.83"},{"key":"e_1_3_2_1_17_1","volume-title":"International conference on multimedia modeling. Springer, 67--79","author":"Casas Luis Lebron","year":"2018","unstructured":"Luis Lebron Casas and Eugenia Koblents. 2018. Video summarization with LSTM and deep attention models. In International conference on multimedia modeling. Springer, 67--79."},{"key":"e_1_3_2_1_18_1","first-page":"11846","article-title":"Detecting moments and highlights in videos via natural language queries","volume":"34","author":"Lei Jie","year":"2021","unstructured":"Jie Lei, Tamara L Berg, and Mohit Bansal. 2021. Detecting moments and highlights in videos via natural language queries. Advances in Neural Information Processing Systems, Vol. 34 (2021), 11846--11858.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_19_1","volume-title":"Rouge: A package for automatic evaluation of summaries. In Text summarization branches out. 74--81.","author":"Lin Chin-Yew","year":"2004","unstructured":"Chin-Yew Lin. 2004. Rouge: A package for automatic evaluation of summaries. In Text summarization branches out. 74--81."},{"key":"e_1_3_2_1_20_1","volume-title":"Videoxum: Cross-modal visual and textural summarization of videos","author":"Lin Jingyang","year":"2023","unstructured":"Jingyang Lin, Hang Hua, Ming Chen, Yikang Li, Jenhao Hsiao, Chiuman Ho, and Jiebo Luo. 2023. Videoxum: Cross-modal visual and textural summarization of videos. IEEE Transactions on Multimedia (2023)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01742"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00305"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00320"},{"key":"e_1_3_2_1_24_1","volume-title":"Decoupled Weight Decay Regularization. In International Conference on Learning Representations.","author":"Loshchilov Ilya","year":"2018","unstructured":"Ilya Loshchilov and Frank Hutter. 2018. Decoupled Weight Decay Regularization. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19830-4_31"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00778"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00205"},{"key":"e_1_3_2_1_28_1","volume-title":"Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311--318","author":"Papineni Kishore","year":"2002","unstructured":"Kishore Papineni, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. Bleu: a method for automatic evaluation of machine translation. In Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311--318."},{"key":"e_1_3_2_1_29_1","volume-title":"NeuralNDCG: Direct Optimisation of a Ranking Metric via Differentiable Relaxation of Sorting. ArXiv","author":"Pobrotyn Przemyslaw","year":"2021","unstructured":"Przemyslaw Pobrotyn and Radoslaw Bialobrzeski. 2021. NeuralNDCG: Direct Optimisation of a Ranking Metric via Differentiable Relaxation of Sorting. ArXiv, Vol. abs\/2102.07831 (2021)."},{"key":"e_1_3_2_1_30_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00174"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01109"},{"key":"e_1_3_2_1_33_1","volume-title":"Proceedings of the IEEE conference on computer vision and pattern recognition. 5179--5187","author":"Song Yale","year":"2015","unstructured":"Yale Song, Jordi Vallmitjana, Amanda Stent, and Alejandro Jaimes. 2015. Tvsum: Summarizing web videos using titles. In Proceedings of the IEEE conference on computer vision and pattern recognition. 5179--5187."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.515"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3350992"},{"key":"e_1_3_2_1_38_1","volume-title":"GIT: A Generative Image-to-text Transformer for Vision and Language. Transactions on Machine Learning Research","author":"Wang Jianfeng","year":"2022","unstructured":"Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, and Lijuan Wang. 2022. GIT: A Generative Image-to-text Transformer for Vision and Language. Transactions on Machine Learning Research (2022)."},{"key":"e_1_3_2_1_39_1","volume-title":"International Conference on Machine Learning. PMLR, 38728--38748","author":"Xu Haiyang","year":"2023","unstructured":"Haiyang Xu, Qinghao Ye, Ming Yan, Yaya Shi, Jiabo Ye, Yuanhong Xu, Chenliang Li, Bin Bi, Qi Qian, Wei Wang, et al. 2023. mplug-2: A modularized multi-modal foundation model across text, image and video. In International Conference on Machine Learning. PMLR, 38728--38748."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01032"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.112"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46478-7_47"},{"volume-title":"CRC standard probability and statistics tables and formulae","author":"Zwillinger Daniel","key":"e_1_3_2_1_43_1","unstructured":"Daniel Zwillinger and Stephen Kokoska. 1999. CRC standard probability and statistics tables and formulae. Crc Press."}],"event":{"name":"ICMR '24: International Conference on Multimedia Retrieval","sponsor":["SIGMM ACM Special Interest Group on Multimedia","SIGSOFT ACM Special Interest Group on Software Engineering"],"location":"Phuket Thailand","acronym":"ICMR '24"},"container-title":["Proceedings of the 2024 International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3652583.3658038","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3652583.3658038","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T08:50:20Z","timestamp":1755766220000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3652583.3658038"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,30]]},"references-count":43,"alternative-id":["10.1145\/3652583.3658038","10.1145\/3652583"],"URL":"https:\/\/doi.org\/10.1145\/3652583.3658038","relation":{},"subject":[],"published":{"date-parts":[[2024,5,30]]},"assertion":[{"value":"2024-06-07","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}