{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,7]],"date-time":"2026-03-07T17:49:08Z","timestamp":1772905748127,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":55,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,12,3]],"date-time":"2024-12-03T00:00:00Z","timestamp":1733184000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,12,3]]},"DOI":"10.1145\/3696409.3700273","type":"proceedings-article","created":{"date-parts":[[2024,12,28]],"date-time":"2024-12-28T09:55:23Z","timestamp":1735379723000},"page":"1-1","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["Language-Guided Self-Supervised Video Summarization Using Text Semantic Matching Considering the Diversity of the Video"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-3375-8028","authenticated-orcid":false,"given":"Tomoya","family":"Sugihara","sequence":"first","affiliation":[{"name":"The University of Tokyo, Bunkyo-ku, Tokyo, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-2490-9557","authenticated-orcid":false,"given":"Shuntaro","family":"Masuda","sequence":"additional","affiliation":[{"name":"The University of Tokyo, Bunkyo-ku, Tokyo, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4650-8841","authenticated-orcid":false,"given":"Ling","family":"Xiao","sequence":"additional","affiliation":[{"name":"The university of Tokyo, Bunkyo-ku, Tokyo, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1784-2314","authenticated-orcid":false,"given":"Toshihiko","family":"Yamasaki","sequence":"additional","affiliation":[{"name":"The University of Tokyo, Bunkyo-ku, Tokyo, Japan"}]}],"member":"320","published-online":{"date-parts":[[2024,12,28]]},"reference":[{"key":"e_1_3_3_2_2_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP49359.2023.10222350"},{"key":"e_1_3_3_2_3_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.newsum-1.7"},{"key":"e_1_3_3_2_4_2","doi-asserted-by":"publisher","DOI":"10.1145\/3512527.3531404"},{"key":"e_1_3_3_2_5_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00796"},{"key":"e_1_3_3_2_6_2","doi-asserted-by":"publisher","DOI":"10.1142\/9789812797926_0003"},{"key":"e_1_3_3_2_7_2","doi-asserted-by":"publisher","DOI":"10.1145\/3338533.3366583"},{"key":"e_1_3_3_2_8_2","first-page":"7871","volume-title":"Proceedings of the Conference of the North American Chapter of the Association for Computational Linguistics","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of deep bidirectional transformers for language understanding. In Proceedings of the Conference of the North American Chapter of the Association for Computational Linguistics. 7871\u20137880."},{"key":"e_1_3_3_2_9_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICME51207.2021.9428318"},{"key":"e_1_3_3_2_10_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10584-0_33"},{"key":"e_1_3_3_2_11_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01428"},{"key":"e_1_3_3_2_12_2","first-page":"15","volume-title":"Pacific-Rim Symposium on Image and Video Technology","author":"Huynh-Lam Hai-Dang","year":"2023","unstructured":"Hai-Dang Huynh-Lam, Ngoc-Phuong Ho-Thi, Minh-Triet Tran, and Trung-Nghia Le. 2023. Cluster-Based Video Summarization with Temporal Context Awareness. In Pacific-Rim Symposium on Image and Video Technology. 15\u201328."},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01590"},{"key":"e_1_3_3_2_14_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33018537"},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58595-2_11"},{"key":"e_1_3_3_2_16_2","first-page":"7482","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","author":"Kendall Alex","year":"2018","unstructured":"Alex Kendall, Yarin Gal, and Roberto Cipolla. 2018. Multi-Task Learning Using Uncertainty to Weigh Losses for Scene Geometry and Semantics. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. 7482\u20137491."},{"key":"e_1_3_3_2_17_2","doi-asserted-by":"crossref","unstructured":"Maurice\u00a0G Kendall. 1945. The Treatment of Ties in Ranking Problems. Biometrika 33 3 (1945) 239\u2013251.","DOI":"10.1093\/biomet\/33.3.239"},{"key":"e_1_3_3_2_18_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.703"},{"key":"e_1_3_3_2_19_2","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00554"},{"key":"e_1_3_3_2_20_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1387"},{"key":"e_1_3_3_2_21_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.318"},{"key":"e_1_3_3_2_22_2","first-page":"462","volume-title":"Proceedings of Neural Information Processing Systems","author":"Meng Yu","year":"2022","unstructured":"Yu Meng, Jiaxin Huang, Yu Zhang, and Jiawei Han. 2022. Generating training data with language models: Towards zero-shot language understanding. In Proceedings of Neural Information Processing Systems. 462\u2013477."},{"key":"e_1_3_3_2_23_2","first-page":"13988","volume-title":"Proceedings of Neural Information Processing Systems","author":"Narasimhan Medhini","year":"2021","unstructured":"Medhini Narasimhan, Anna Rohrbach, and Trevor Darrell. 2021. Clip-it! language-guided video summarization. In Proceedings of Neural Information Processing Systems. 13988\u201314000."},{"key":"e_1_3_3_2_24_2","unstructured":"OpenAI. 2023. GPT-4 Technical Report. arxiv:https:\/\/arXiv.org\/abs\/2303.08774"},{"key":"e_1_3_3_2_25_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00778"},{"key":"e_1_3_3_2_26_2","first-page":"12097","volume-title":"Multimedia Tools and Applications","author":"Otani Mayu","year":"2017","unstructured":"Mayu Otani, Yuta Nakashima, Tomokazu Sato, and Naokazu Yokoya. 2017. Video summarization using textual descriptions for authoring video blogs. In Multimedia Tools and Applications. 12097\u201312115."},{"key":"e_1_3_3_2_27_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10599-4_35"},{"key":"e_1_3_3_2_28_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1410"},{"key":"e_1_3_3_2_29_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01783"},{"key":"e_1_3_3_2_30_2","first-page":"5179","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","author":"Song Yale","year":"2015","unstructured":"Yale Song, Jordi Vallmitjana, Amanda Stent, and Alejandro Jaimes. 2015. TVSum: Summarizing web videos using titles. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. 5179\u20135187."},{"key":"e_1_3_3_2_31_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.531"},{"key":"e_1_3_3_2_32_2","first-page":"40542","volume-title":"Proceedings of Neural Information Processing Systems","volume":"36","author":"Sul Jinhwan","year":"2023","unstructured":"Jinhwan Sul, Jihoon Han, and Joonseok Lee. 2023. Mr. HiSum: A Large-scale Dataset for Video Highlight Detection and Summarization. In Proceedings of Neural Information Processing Systems , Vol.\u00a036. 40542\u201340555."},{"key":"e_1_3_3_2_33_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-acl.322"},{"key":"e_1_3_3_2_34_2","unstructured":"Gemini Team Rohan Anil Sebastian Borgeaud Yonghui Wu Jean-Baptiste Alayrac Jiahui Yu Radu Soricut Johan Schalkwyk Andrew\u00a0M Dai Anja Hauth et\u00a0al. 2023. Gemini: a family of highly capable multimodal models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2312.11805 (2023)."},{"key":"e_1_3_3_2_35_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW59228.2023.00316"},{"key":"e_1_3_3_2_36_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10448236"},{"key":"e_1_3_3_2_37_2","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale et\u00a0al. 2023. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2307.09288 (2023)."},{"key":"e_1_3_3_2_38_2","first-page":"5998","volume-title":"Proceedings of Neural Information Processing Systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan\u00a0N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In Proceedings of Neural Information Processing Systems. 5998\u20136008."},{"key":"e_1_3_3_2_39_2","first-page":"21665","volume-title":"Proceedings of Neural Information Processing Systems","author":"Vyas Apoorv","year":"2020","unstructured":"Apoorv Vyas, Angelos Katharopoulos, and Fran\u00e7ois Fleuret. 2020. Fast transformers with clustered attention. In Proceedings of Neural Information Processing Systems. 21665\u201321674."},{"key":"e_1_3_3_2_40_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447504"},{"key":"e_1_3_3_2_41_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i2.25353"},{"key":"e_1_3_3_2_42_2","unstructured":"Jianfeng Wang Zhengyuan Yang Xiaowei Hu Linjie Li Kevin Lin Zhe Gan Zicheng Liu Ce Liu and Lijuan Wang. 2022. GIT: A Generative Image-to-text Transformer for Vision and Language. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2205.14100 (2022)."},{"key":"e_1_3_3_2_43_2","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612087"},{"key":"e_1_3_3_2_44_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.11297"},{"key":"e_1_3_3_2_45_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i10.21409"},{"key":"e_1_3_3_2_46_2","volume-title":"Proceedings of the IEEE International Conference on Visual Communications and Image Processing","author":"Yamanishi Hiromasa","year":"2024","unstructured":"Hiromasa Yamanishi, Ling Xiao, and Toshihiko Yamasaki. 2024. LLaVA-Tour: A Large Multimodal Model for Japanese Tourist Spot Prediction and Review Generation. In Proceedings of the IEEE International Conference on Visual Communications and Image Processing."},{"key":"e_1_3_3_2_47_2","volume-title":"Proceedings of the ACM International Conference on Recommender Systems Workshops","author":"Yamanishi Hiromasa","year":"2024","unstructured":"Hiromasa Yamanishi, Ling Xiao, and Toshihiko Yamasaki. 2024. A Multimodal Dataset and Benchmark for Tourism Review Generation. In Proceedings of the ACM International Conference on Recommender Systems Workshops."},{"key":"e_1_3_3_2_48_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.findings-emnlp.192"},{"key":"e_1_3_3_2_49_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46478-7_47"},{"key":"e_1_3_3_2_50_2","unstructured":"Bin Zhao Haopeng Li Xiaoqiang Lu and Xuelong Li. 2021. Reconstructive sequence-graph network for video summarization. IEEE Transactions on Pattern Analysis and Machine Intelligence 44 5 (2021) 2793\u20132801."},{"key":"e_1_3_3_2_51_2","unstructured":"Wayne\u00a0Xin Zhao Kun Zhou Junyi Li Tianyi Tang Xiaolei Wang Yupeng Hou Yingqian Min Beichen Zhang Junjie Zhang Zican Dong et\u00a0al. 2023. A survey of large language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2303.18223 (2023)."},{"key":"e_1_3_3_2_52_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00637"},{"key":"e_1_3_3_2_53_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.552"},{"key":"e_1_3_3_2_54_2","doi-asserted-by":"crossref","unstructured":"Sheng-Hua Zhong Jingxu Lin Jianglin Lu Ahmed Fares and Tongwei Ren. 2022. Deep semantic and attentive network for unsupervised video summarization. ACM Transactions on Multimedia Computing Communications and Applications 18 2 (2022) 1\u201321.","DOI":"10.1145\/3477538"},{"key":"e_1_3_3_2_55_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12255"},{"key":"e_1_3_3_2_56_2","doi-asserted-by":"crossref","unstructured":"Daniel Zwillinger and Stephen Kokoska. 1999. CRC standard probability and statistics tables and formulae. Crc Press (1999).","DOI":"10.1201\/9780367802417"}],"event":{"name":"MMAsia '24: ACM Multimedia Asia","location":"Auckland New Zealand","acronym":"MMAsia '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 6th ACM International Conference on Multimedia in Asia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3696409.3700273","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3696409.3700273","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:10:16Z","timestamp":1750295416000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3696409.3700273"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,3]]},"references-count":55,"alternative-id":["10.1145\/3696409.3700273","10.1145\/3696409"],"URL":"https:\/\/doi.org\/10.1145\/3696409.3700273","relation":{},"subject":[],"published":{"date-parts":[[2024,12,3]]},"assertion":[{"value":"2024-12-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}