{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T05:04:03Z","timestamp":1765343043689,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":38,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2022ZD0160300"],"award-info":[{"award-number":["2022ZD0160300"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3758252","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:37:21Z","timestamp":1761377841000},"page":"13030-13037","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["DreamFrame: Enhancing Video Understanding via Automatically Generated QA and Style-Consistent Keyframes"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-3659-0305","authenticated-orcid":false,"given":"Zhende","family":"Song","sequence":"first","affiliation":[{"name":"College of Intelligent Robotics and Advanced Manufacturing, Fudan University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-3663-8884","authenticated-orcid":false,"given":"Chenchen","family":"Wang","sequence":"additional","affiliation":[{"name":"College of Future Information Technology, Fudan University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-6073-8723","authenticated-orcid":false,"given":"Jiamu","family":"Sheng","sequence":"additional","affiliation":[{"name":"College of Intelligent Robotics and Advanced Manufacturing, Fudan University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6344-2824","authenticated-orcid":false,"given":"Chi","family":"Zhang","sequence":"additional","affiliation":[{"name":"AGI Lab, Westlake University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-3456-8310","authenticated-orcid":false,"given":"Shengji","family":"Tang","sequence":"additional","affiliation":[{"name":"College of Future Information Technology, Fudan University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4983-1353","authenticated-orcid":false,"given":"Jiayuan","family":"Fan","sequence":"additional","affiliation":[{"name":"College of Intelligent Robotics and Advanced Manufacturing, Fudan University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0779-9818","authenticated-orcid":false,"given":"Tao","family":"Chen","sequence":"additional","affiliation":[{"name":"College of Future Information Technology, Fudan University, Shanghai, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Proceedings of the 36th International Conference on Neural Information Processing Systems","author":"Alayrac Jean-Baptiste","year":"2022","unstructured":"Jean-Baptiste Alayrac, Jeff Donahue, Pauline Luc, Antoine Miech, Iain Barr, Yana Hasson, Karel Lenc, Arthur Mensch, Katie Millicah, Malcolm Reynolds, Roman Ring, Eliza Rutherford, Serkan Cabi, Tengda Han, Zhitao Gong, Sina Samangooei, Marianne Monteiro, Jacob Menick, Sebastian Borgeaud, Andrew Brock, Aida Nematzadeh, Sahand Sharifzadeh, Mikolaj Binkowski, Ricardo Barreira, Oriol Vinyals, Andrew Zisserman, and Karen Simonyan. 2022. Flamingo: a visual language model for few-shot learning. In Proceedings of the 36th International Conference on Neural Information Processing Systems (New Orleans, LA, USA) (NIPS '22). Curran Associates Inc., Red Hook, NY, USA, Article 1723, 21 pages."},{"key":"e_1_3_2_1_2_1","unstructured":"Jinze Bai Shuai Bai Yunfei Chu Zeyu Cui Kai Dang Xiaodong Deng Yang Fan Wenhang Ge Yu Han Fei Huang Binyuan Hui Luo Ji Mei Li Junyang Lin Runji Lin Dayiheng Liu Gao Liu Chengqiang Lu K. Lu Jianxin Ma Rui Men Xingzhang Ren Xuancheng Ren Chuanqi Tan Sinan Tan Jianhong Tu Peng Wang Shijie Wang Wei Wang Shengguang Wu Benfeng Xu Jin Xu An Yang Hao Yang Jian Yang Jian Yang Shusheng Yang Yang Yao Bowen Yu Yu Bowen Hongyi Yuan Zheng Yuan Jianwei Zhang Xing Zhang Yichang Zhang Zhenru Zhang Chang Zhou Jingren Zhou Xiaohuan Zhou and Tianhang Zhu. 2023a. Qwen Technical Report. ArXiv Vol. abs\/2309.16609 (2023). https:\/\/arxiv.org\/abs\/2309.16609"},{"key":"e_1_3_2_1_3_1","unstructured":"Jinze Bai Shuai Bai Shusheng Yang Shijie Wang Sinan Tan Peng Wang Junyang Lin Chang Zhou and Jingren Zhou. 2023b. Qwen-VL: A Versatile Vision-Language Model for Understanding Localization Text Reading and Beyond. https:\/\/arxiv.org\/abs\/2308.12966"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"e_1_3_2_1_5_1","volume-title":"Proceedings of the Twelfth Language Resources and Evaluation Conference. 4352-4358","author":"Castro Santiago","year":"2020","unstructured":"Santiago Castro, Mahmoud Azab, Jonathan Stroud, Cristina Noujaim, Ruoyao Wang, Jia Deng, and Rada Mihalcea. 2020. LifeQA: A Real-life Dataset for Video Question Answering. In Proceedings of the Twelfth Language Resources and Evaluation Conference. 4352-4358."},{"key":"e_1_3_2_1_6_1","volume-title":"Proceedings of the 29th International Conference on Computational Linguistics. 5613-5635","author":"Castro Santiago","year":"2022","unstructured":"Santiago Castro, Naihao Deng, Pingxuan Huang, Mihai Burzo, and Rada Mihalcea. 2022. In-the-Wild Video Question Answering. In Proceedings of the 29th International Conference on Computational Linguistics. 5613-5635."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.5555\/2002472.2002497"},{"key":"e_1_3_2_1_8_1","volume-title":"Yuwei Fang, Hsin-Ying Lee, Jian Ren, Ming-Hsuan Yang, et al.","author":"Chen Tsai-Shien","year":"2024","unstructured":"Tsai-Shien Chen, Aliaksandr Siarohin, Willi Menapace, Ekaterina Deyneka, Hsiang-wei Chao, Byung Eun Jeon, Yuwei Fang, Hsin-Ying Lee, Jian Ren, Ming-Hsuan Yang, et al., 2024. Panda-70M: Captioning 70M Videos with Multiple Cross-Modality Teachers. arXiv preprint arXiv:2402.19479 (2024)."},{"key":"e_1_3_2_1_9_1","first-page":"24185","volume-title":"2024 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Chen Zhe","year":"2023","unstructured":"Zhe Chen, Jiannan Wu, Wenhai Wang, Weijie Su, Guo Chen, Sen Xing, Zhong Muyan, Qinglong Zhang, Xizhou Zhu, Lewei Lu, Bin Li, Ping Luo, Tong Lu, Yu Qiao, and Jifeng Dai. 2023. Intern VL: Scaling up Vision Foundation Models and Aligning for Generic Visual-Linguistic Tasks. 2024 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2023), 24185-24198."},{"key":"e_1_3_2_1_10_1","first-page":"49250","article-title":"InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning","volume":"36","author":"Dai Wenliang","year":"2023","unstructured":"Wenliang Dai, Junnan Li, DONGXU LI, Anthony Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale N Fung, and Steven Hoi. 2023. InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning. In Advances in Neural Information Processing Systems, Vol. 36. 49250-49267.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_11_1","unstructured":"DeepSeek-AI Daya Guo Dejian Yang Haowei Zhang Jun-Mei Song Ruoyu Zhang Runxin Xu Qihao Zhu Shirong Ma Peiyi Wang Xiaoling Bi Xiaokang Zhang Xingkai Yu Yu Wu Z. F. Wu Zhibin Gou Zhihong Shao Zhuoshu Li Ziyi Gao Aixin Liu Bing Xue Bing-Li Wang Bochao Wu Bei Feng Chengda Lu Chenggang Zhao Chengqi Deng Chenyu Zhang Chong Ruan Damai Dai Deli Chen Dong-Li Ji Erhang Li Fangyun Lin Fucong Dai Fuli Luo Guangbo Hao Guanting Chen Guowei Li H. Zhang Han Bao Hanwei Xu Haocheng Wang Honghui Ding Huajian Xin Huazuo Gao Hui Qu Hui Li Jianzhong Guo Jiashi Li Jiawei Wang Jingchang Chen Jingyang Yuan Junjie Qiu Junlong Li Jiong Cai Jiaqi Ni Jian Liang Jin Chen Kai Dong Kai Hu Kaige Gao Kang Guan Kexin Huang Kuai Yu Lean Wang Lecong Zhang Liang Zhao Litong Wang Liyue Zhang Lei Xu Leyi Xia Mingchuan Zhang Minghua Zhang M. Tang Meng Li Miaojun Wang Mingming Li Ning Tian Panpan Huang Peng Zhang Qiancheng Wang Qinyu Chen Qiushi Du Ruiqi Ge Ruisong Zhang Ruizhe Pan Runji Wang R. J. Chen Ruiqi Jin Ruyi Chen Shanghao Lu Shangyan Zhou Shanhuang Chen Shengfeng Ye Shiyu Wang Shuiping Yu Shunfeng Zhou Shuting Pan S. S. Li Shuang Zhou Shao-Kang Wu Tao Yun Tian Pei Tianyu Sun T. Wang Wangding Zeng Wanjia Zhao Wen Liu Wenfeng Liang Wenjun Gao Wen-Xia Yu Wentao Zhang Wangding Xiao Wei An Xiaodong Liu Xiaohan Wang Xiaokang Chen Xiaotao Nie Xin Cheng Xin Liu Xin Xie Xingchao Liu Xinyu Yang Xinyuan Li Xuecheng Su Xuheng Lin X. Q. Li Xiangyu Jin Xi-Cheng Shen Xiaosha Chen Xiaowen Sun Xiaoxiang Wang Xinnan Song Xinyi Zhou Xianzu Wang Xinxia Shan Y. K. Li Y. Q. Wang Y. X. Wei Yang Zhang Yanhong Xu Yao Li Yao Zhao Yaofeng Sun Yaohui Wang Yi Yu Yichao Zhang Yifan Shi Yi Xiong Ying He Yishi Piao Yisong Wang Yixuan Tan Yiyang Ma Yiyuan Liu Yongqiang Guo Yuan Ou Yuduan Wang Yue Gong Yu-Jing Zou Yujia He Yunfan Xiong Yu-Wei Luo Yu mei You Yuxuan Liu Yuyang Zhou Y. X. Zhu Yanping Huang Yao Li Yi Zheng Yuchen Zhu Yunxiang Ma Ying Tang Yukun Zha Yuting Yan Zehui Ren Zehui Ren Zhangli Sha Zhe Fu Zhean Xu Zhenda Xie Zhen guo Zhang Zhewen Hao Zhicheng Ma Zhigang Yan Zhiyu Wu Zihui Gu Zijia Zhu Zijun Liu Zi-An Li Ziwei Xie Ziyang Song Zizheng Pan Zhen Huang Zhipeng Xu Zhongyu Zhang and Zhen Zhang. 2025. DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning. ArXiv Vol. abs\/2501.12948 (2025)."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01855"},{"key":"e_1_3_2_1_13_1","volume-title":"Ronan Le Bras, and Yejin Choi","author":"Hessel Jack","year":"2021","unstructured":"Jack Hessel, Ari Holtzman, Maxwell Forbes, Ronan Le Bras, and Yejin Choi. 2021. CLIPScore: A Reference-free Evaluation Metric for Image Captioning. In Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing, Marie-Francine Moens, Xuanjing Huang, Lucia Specia, and Scott Wen-tau Yih (Eds.). Association for Computational Linguistics, Online and Punta Cana, Dominican Republic, 7514-7528."},{"key":"e_1_3_2_1_14_1","volume-title":"Chat-univi: Unified visual representation empowers large language models with image and video understanding. arXiv preprint arXiv:2311.08046","author":"Jin Peng","year":"2023","unstructured":"Peng Jin, Ryuichi Takanobu, Caiwan Zhang, Xiaochun Cao, and Li Yuan. 2023. Chat-univi: Unified visual representation empowers large language models with image and video understanding. arXiv preprint arXiv:2311.08046 (2023)."},{"key":"e_1_3_2_1_15_1","volume-title":"Video-LaVIT: Unified Video-Language Pre-training with Decoupled Visual-Motional Tokenization. In International Conference on Machine Learning. PMLR, 22185-22209","author":"Jin Yang","year":"2024","unstructured":"Yang Jin, Zhicheng Sun, Kun Xu, Kun Xu, Liwei Chen, Hao Jiang, Quzhe Huang, Chengru Song, Yuliang Liu, Di Zhang, Yang Song, Kun Gai, and Yadong Mu. 2024. Video-LaVIT: Unified Video-Language Pre-training with Decoupled Visual-Motional Tokenization. In International Conference on Machine Learning. PMLR, 22185-22209."},{"key":"e_1_3_2_1_16_1","volume-title":"Otter: A multi-modal model with in-context instruction tuning. arXiv preprint arXiv:2305.03726","author":"Li Bo","year":"2023","unstructured":"Bo Li, Yuanhan Zhang, Liangyu Chen, Jinghao Wang, Jingkang Yang, and Ziwei Liu. 2023d. Otter: A multi-modal model with in-context instruction tuning. arXiv preprint arXiv:2305.03726 (2023)."},{"key":"e_1_3_2_1_17_1","volume-title":"International Conference on Machine Learning. PMLR","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023b. BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models. In International Conference on Machine Learning. PMLR, 19730-19742."},{"key":"e_1_3_2_1_18_1","volume-title":"Videochat: Chat-centric video understanding. arXiv preprint arXiv:2305.06355","author":"Li KunChang","year":"2023","unstructured":"KunChang Li, Yinan He, Yi Wang, Yizhuo Li, Wenhai Wang, Ping Luo, Yali Wang, Limin Wang, and Yu Qiao. 2023a. Videochat: Chat-centric video understanding. arXiv preprint arXiv:2305.06355 (2023)."},{"key":"e_1_3_2_1_19_1","first-page":"22195","volume-title":"MVBench: A Comprehensive Multi-modal Video Understanding Benchmark. 2024 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Li Kunchang","year":"2023","unstructured":"Kunchang Li, Yali Wang, Yinan He, Yizhuo Li, Yi Wang, Yi Liu, Zun Wang, Jilan Xu, Guo Chen, Ping Luo, Limin Wang, and Yu Qiao. 2023c. MVBench: A Comprehensive Multi-modal Video Understanding Benchmark. 2024 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2023), 22195-22206."},{"key":"e_1_3_2_1_20_1","volume-title":"Proceedings, Part XLVI (Lecture Notes in Computer Science","volume":"340","author":"Li Yanwei","year":"2024","unstructured":"Yanwei Li, Chengyao Wang, and Jiaya Jia. 2024. LLaMA-VID: An Image is Worth 2 Tokens in Large Language Models. In Computer Vision - ECCV 2024 - 18th European Conference, Milan, Italy, September 29-October 4, 2024, Proceedings, Part XLVI (Lecture Notes in Computer Science, Vol. 15104). Springer, 323-340."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.342"},{"key":"e_1_3_2_1_22_1","first-page":"34892","article-title":"Visual Instruction Tuning","volume":"36","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2023. Visual Instruction Tuning. In Advances in Neural Information Processing Systems, Vol. 36. 34892-34916.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_23_1","first-page":"8731","volume-title":"ACL 2024, Bangkok, Thailand and virtual meeting","author":"Liu Yuanxin","year":"2024","unstructured":"Yuanxin Liu, Shicheng Li, Yi Liu, Yuxiang Wang, Shuhuai Ren, Lei Li, Sishuo Chen, Xu Sun, and Lu Hou. 2024. TempCompass: Do Video LLMs Really Understand Videos?. In Findings of the Association for Computational Linguistics, ACL 2024, Bangkok, Thailand and virtual meeting, August 11-16, 2024. Association for Computational Linguistics, 8731-8772."},{"key":"e_1_3_2_1_24_1","volume-title":"Valley: Video assistant with large language model enhanced ability. arXiv preprint arXiv:2306.07207","author":"Luo Ruipu","year":"2023","unstructured":"Ruipu Luo, Ziwang Zhao, Min Yang, Junwei Dong, Minghui Qiu, Pengcheng Lu, Tao Wang, and Zhongyu Wei. 2023. Valley: Video assistant with large language model enhanced ability. arXiv preprint arXiv:2306.07207 (2023)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.679"},{"key":"e_1_3_2_1_26_1","volume-title":"Video-Bench: A Comprehensive Benchmark and Toolkit for Evaluating Video-based Large Language Models. ArXiv","author":"Ning Munan","year":"2023","unstructured":"Munan Ning, Bin Zhu, Yujia Xie, Bin Lin, Jiaxi Cui, Lu Yuan, Dongdong Chen, and Li Yuan. 2023. Video-Bench: A Comprehensive Benchmark and Toolkit for Evaluating Video-based Large Language Models. ArXiv, Vol. abs\/2311.16103 (2023)."},{"key":"e_1_3_2_1_27_1","unstructured":"OpenAI. 2023. GPT-4 Technical Report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02155"},{"key":"e_1_3_2_1_29_1","unstructured":"Xingwu Sun Yanfeng Chen Yiqing Huang Ruobing Xie Jiaqi Zhu Kai Zhang Shuaipeng Li Zhen Yang Jonny Han Xiaobo Shu Jiahao Bu Zhongzhi Chen Xuemeng Huang Feng Lian Saiyong Yang Jianfeng Yan Yuyuan Zeng Xiaoqing Ren Chao Yu Lulu Wu Yue Mao Jun Xia Tao Yang Suncong Zheng Kan Wu Dian Jiao Jinbao Xue Xipeng Zhang Decheng Wu Kai Liu Dengpeng Wu Guanghui Xu Shaohua Chen Shuang Chen Xiaowei Feng Yigeng Hong Junqiang Zheng Chengcheng Xu Zong-Rui Li Xi Kuang Jian hua Hu Yiqi Chen Yuchi Deng Guiyang Li Ao Liu Chenchen Zhang Shi-He Hu Zilong Zhao Zi-Hao Wu Yao Ding Weichao Wang Han Liu Roberts Wang Haoyang Fei Peijie Yu Ze Zhao Xun Cao Hai Wang Fusheng Xiang Meng-Sheng Huang Zhiyu Xiong Bin Hu Xue yan Hou Lei Jiang Jia bing Ma Jiajia Wu Yaping Deng Yi Shen Qian Wang Weijie Liu Jie Liu Meng Chen Liang Dong Wei Jia Hu Chen Feifei Liu Ruixin Yuan Huilin Xu Zhenxiang Yan Tengfei Cao Zhichao Hu Xinhua Feng Dong Du Ting-Ting Yu Yang-Dan Tao Feng Zhang Jianchen Zhu Chengzhong Xu Xirui Li Chong Zha Ouyang Wen Yi Xia Xiang Li Ze He Rongpeng Chen Jiawei Song Ruibin Chen Fan Jiang Chongqing Zhao and Bo Wang. 2024. Hunyuan-Large: An Open-Source MoE Model with 52 Billion Activated Parameters by Tencent. ArXiv Vol. abs\/2411.02265 (2024)."},{"key":"e_1_3_2_1_30_1","volume-title":"Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, and Guillaume Lample. 2023. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3292266"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.571"},{"key":"e_1_3_2_1_33_1","unstructured":"Qinghao Ye Haiyang Xu Guohai Xu Jiabo Ye Ming Yan Yiyang Zhou Junyang Wang Anwen Hu Pengcheng Shi Yaya Shi et al. 2023. mplug-owl: Modularization empowers large language models with multimodality. arXiv preprint arXiv:2304.14178 (2023)."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3339661"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-demo.49"},{"key":"e_1_3_2_1_36_1","volume-title":"LLaMA-Adapter: Efficient Fine-tuning of Large Language Models with Zero-initialized Attention. In International Conference on Learning Representations.","author":"Zhang Renrui","year":"2024","unstructured":"Renrui Zhang, Jiaming Han, Chris Liu, Aojun Zhou, Pan Lu, Yu Qiao, Hongsheng Li, and Peng Gao. 2024. LLaMA-Adapter: Efficient Fine-tuning of Large Language Models with Zero-initialized Attention. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00068"},{"volume-title":"Towards Automatic Learning of Procedures From Web Instructional Videos. In AAAI Conference on Artificial Intelligence.","author":"Zhou Luowei","key":"e_1_3_2_1_38_1","unstructured":"Luowei Zhou, Chenliang Xu, and Jason J. Corso. 2017. Towards Automatic Learning of Procedures From Web Instructional Videos. In AAAI Conference on Artificial Intelligence."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3758252","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:59:34Z","timestamp":1765342774000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3758252"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":38,"alternative-id":["10.1145\/3746027.3758252","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3758252","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}