{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,12]],"date-time":"2025-12-12T02:26:31Z","timestamp":1765506391305,"version":"3.48.0"},"publisher-location":"New York, NY, USA","reference-count":39,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,10]]},"DOI":"10.1145\/3746252.3761621","type":"proceedings-article","created":{"date-parts":[[2025,11,8]],"date-time":"2025-11-08T00:52:37Z","timestamp":1762563157000},"page":"6340-6345","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["VideoAVE: A Multi-Attribute Video-to-Text Attribute Value Extraction Dataset and Benchmark Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-8475-2331","authenticated-orcid":false,"given":"Ming","family":"Cheng","sequence":"first","affiliation":[{"name":"Virginia Tech, Blacksburg, VA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-8310-6501","authenticated-orcid":false,"given":"Tong","family":"Wu","sequence":"additional","affiliation":[{"name":"Virginia Tech, Blacksburg, VA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-6950-1910","authenticated-orcid":false,"given":"Jiazhen","family":"Hu","sequence":"additional","affiliation":[{"name":"Virginia Tech, Blacksburg, VA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8945-6909","authenticated-orcid":false,"given":"Jiaying","family":"Gong","sequence":"additional","affiliation":[{"name":"Virginia Tech, Blacksburg, VA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9712-6667","authenticated-orcid":false,"given":"Hoda","family":"Eldardiry","sequence":"additional","affiliation":[{"name":"Virginia Tech, Blacksburg, VA, USA"}]}],"member":"320","published-online":{"date-parts":[[2025,11,10]]},"reference":[{"unstructured":"Jean-Baptiste Alayrac Jeff Donahue Pauline Luc Antoine Miech Iain Barr Yana Hasson Karel Lenc Arthur Mensch Katherine Millican Malcolm Reynolds et al. 2022. Flamingo: a visual language model for few-shot learning. Advances in neural information processing systems Vol. 35 (2022) 23716-23736.","key":"e_1_3_2_1_1_1"},{"unstructured":"AskVideos. 2024. AskVideos-VideoCLIP: Language-grounded video embeddings. GitHub. https:\/\/github.com\/AskYoutubeAI\/AskVideos-VideoCLIP","key":"e_1_3_2_1_2_1"},{"key":"e_1_3_2_1_3_1","volume-title":"Using llms for the extraction and normalization of product attribute values. arXiv e-prints","author":"Brinkmann Alexander","year":"2024","unstructured":"Alexander Brinkmann, Nick Baumann, and Christian Bizer. 2024. Using llms for the extraction and normalization of product attribute values. arXiv e-prints (2024), arXiv-2403."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_4_1","DOI":"10.1145\/3626772.3661357"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_5_1","DOI":"10.1109\/WACV56688.2023.00363"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_6_1","DOI":"10.1145\/3583780.3615142"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_7_1","DOI":"10.18653\/v1\/2025.naacl-industry.38"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_8_1","DOI":"10.18653\/v1\/2025.acl-industry.80"},{"key":"e_1_3_2_1_9_1","volume-title":"Bridging language and items for retrieval and recommendation. arXiv preprint arXiv:2403.03952","author":"Hou Yupeng","year":"2024","unstructured":"Yupeng Hou, Jiacheng Li, Zhankui He, An Yan, Xiusi Chen, and Julian McAuley. 2024. 
Bridging language and items for retrieval and recommendation. arXiv preprint arXiv:2403.03952 (2024)."},{"key":"e_1_3_2_1_10_1","volume-title":"Video-lavit: Unified video-language pre-training with decoupled visual-motional tokenization. arXiv preprint arXiv:2402.03161","author":"Jin Yang","year":"2024","unstructured":"Yang Jin, Zhicheng Sun, Kun Xu, Liwei Chen, Hao Jiang, Quzhe Huang, Chengru Song, Yuliang Liu, Di Zhang, Yang Song, et al., 2024. Video-lavit: Unified video-language pre-training with decoupled visual-motional tokenization. arXiv preprint arXiv:2402.03161 (2024)."},{"key":"e_1_3_2_1_11_1","volume-title":"Videopoet: A large language model for zero-shot video generation. arXiv preprint arXiv:2312.14125","author":"Kondratyuk Dan","year":"2023","unstructured":"Dan Kondratyuk, Lijun Yu, Xiuye Gu, Jos\u00e9 Lezama, Jonathan Huang, Grant Schindler, Rachel Hornung, Vighnesh Birodkar, Jimmy Yan, Ming-Chang Chiu, et al., 2023. Videopoet: A large language model for zero-shot video generation. arXiv preprint arXiv:2312.14125 (2023)."},{"key":"e_1_3_2_1_12_1","volume-title":"International conference on machine learning. PMLR","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023b. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In International conference on machine learning. PMLR, 19730-19742."},{"key":"e_1_3_2_1_13_1","volume-title":"Videochat: Chat-centric video understanding. arXiv preprint arXiv:2305.06355","author":"Li KunChang","year":"2023","unstructured":"KunChang Li, Yinan He, Yi Wang, Yizhuo Li, Wenhai Wang, Ping Luo, Yali Wang, Limin Wang, and Yu Qiao. 2023a. Videochat: Chat-centric video understanding. arXiv preprint arXiv:2305.06355 (2023)."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_14_1","DOI":"10.18653\/v1\/2024.emnlp-main.342"},{"key":"e_1_3_2_1_15_1","volume-title":"Visual instruction tuning. Advances in neural information processing systems","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2023b. Visual instruction tuning. Advances in neural information processing systems, Vol. 36 (2023), 34892-34916."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_16_1","DOI":"10.1145\/3539597.3570423"},{"key":"e_1_3_2_1_17_1","volume-title":"Multimodal attribute extraction. arXiv preprint arXiv:1711.11118","author":"Robert L","year":"2017","unstructured":"Robert L Logan IV, Samuel Humeau, and Sameer Singh. 2017. Multimodal attribute extraction. arXiv preprint arXiv:1711.11118 (2017)."},{"key":"e_1_3_2_1_18_1","volume-title":"Video-chatgpt: Towards detailed video understanding via large vision and language models. arXiv preprint arXiv:2306.05424","author":"Maaz Muhammad","year":"2023","unstructured":"Muhammad Maaz, Hanoona Rasheed, Salman Khan, and Fahad Shahbaz Khan. 2023. Video-chatgpt: Towards detailed video understanding via large vision and language models. arXiv preprint arXiv:2306.05424 (2023)."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_19_1","DOI":"10.1007\/978-3-031-19772-7_1"},{"key":"e_1_3_2_1_20_1","volume-title":"Kosmos-2: Grounding multimodal large language models to the world. arXiv preprint arXiv:2306.14824","author":"Peng Zhiliang","year":"2023","unstructured":"Zhiliang Peng, Wenhui Wang, Li Dong, Yaru Hao, Shaohan Huang, Shuming Ma, and Furu Wei. 2023. Kosmos-2: Grounding multimodal large language models to the world. 
arXiv preprint arXiv:2306.14824 (2023)."},{"key":"e_1_3_2_1_21_1","volume-title":"Luke Zettlemoyer, and Lili Yu.","author":"Shi Weijia","year":"2024","unstructured":"Weijia Shi, Xiaochuang Han, Chunting Zhou, Weixin Liang, Xi Victoria Lin, Luke Zettlemoyer, and Lili Yu. 2024. LlamaFusion: Adapting Pretrained Language Models for Multimodal Generation. arXiv preprint arXiv:2412.15188 (2024)."},{"key":"e_1_3_2_1_22_1","volume-title":"NER-MQMRC: formulating named entity recognition as multi question machine reading comprehension. arXiv preprint arXiv:2205.05904","author":"Shrimal Anubhav","year":"2022","unstructured":"Anubhav Shrimal, Avi Jain, Kartik Mehta, and Promod Yenigalla. 2022. NER-MQMRC: formulating named entity recognition as multi question machine reading comprehension. arXiv preprint arXiv:2205.05904 (2022)."},{"key":"e_1_3_2_1_23_1","volume-title":"Internvid: A large-scale video-text dataset for multimodal understanding and generation. arXiv preprint arXiv:2307.06942","author":"Wang Yi","year":"2023","unstructured":"Yi Wang, Yinan He, Yizhuo Li, Kunchang Li, Jiashuo Yu, Xin Ma, Xinhao Li, Guo Chen, Xinyuan Chen, Yaohui Wang, et al., 2023. Internvid: A large-scale video-text dataset for multimodal understanding and generation. arXiv preprint arXiv:2307.06942 (2023)."},{"unstructured":"Yi Wang Xinhao Li Ziang Yan Yinan He Jiashuo Yu Xiangyu Zeng Chenting Wang Changlian Ma Haian Huang Jianfei Gao et al. 2025. InternVideo2. 5: Empowering Video MLLMs with Long and Rich Context Modeling. arXiv preprint arXiv:2501.12386 (2025).","key":"e_1_3_2_1_24_1"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_25_1","DOI":"10.1145\/3664647.3681464"},{"key":"e_1_3_2_1_26_1","volume-title":"Liquid: Language models are scalable multi-modal generators. arXiv preprint arXiv:2412.04332","author":"Wu Junfeng","year":"2024","unstructured":"Junfeng Wu, Yi Jiang, Chuofan Ma, Yuliang Liu, Hengshuang Zhao, Zehuan Yuan, Song Bai, and Xiang Bai. 2024a. Liquid: Language models are scalable multi-modal generators. arXiv preprint arXiv:2412.04332 (2024)."},{"key":"e_1_3_2_1_27_1","first-page":"69925","article-title":"Visionllm v2: An end-to-end generalist multimodal large language model for hundreds of vision-language tasks","volume":"37","author":"Wu Jiannan","year":"2024","unstructured":"Jiannan Wu, Muyan Zhong, Sen Xing, Zeqiang Lai, Zhaoyang Liu, Zhe Chen, Wenhai Wang, Xizhou Zhu, Lewei Lu, Tong Lu, et al., 2024b. Visionllm v2: An end-to-end generalist multimodal large language model for hundreds of vision-language tasks. Advances in Neural Information Processing Systems, Vol. 37 (2024), 69925-69975.","journal-title":"Advances in Neural Information Processing Systems"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_28_1","DOI":"10.18653\/v1\/P19-1514"},{"key":"e_1_3_2_1_29_1","volume-title":"Adatag: Multi-attribute value extraction from product profiles with adaptive decoding. arXiv preprint arXiv:2106.02318","author":"Yan Jun","year":"2021","unstructured":"Jun Yan, Nasser Zalmout, Yan Liang, Christan Grant, Xiang Ren, and Xin Luna Dong. 2021. Adatag: Multi-attribute value extraction from product profiles with adaptive decoding. arXiv preprint arXiv:2106.02318 (2021)."},{"unstructured":"An Yang Baosong Yang Beichen Zhang Binyuan Hui Bo Zheng Bowen Yu Chengyuan Li Dayiheng Liu Fei Huang Haoran Wei et al. 2024. Qwen2. 5 technical report. 
arXiv preprint arXiv:2412.15115 (2024).","key":"e_1_3_2_1_30_1"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_31_1","DOI":"10.1145\/3488560.3498377"},{"key":"e_1_3_2_1_32_1","volume-title":"VideoLLaMA 3: Frontier Multimodal Foundation Models for Image and Video Understanding. arXiv preprint arXiv:2501.13106","author":"Zhang Boqiang","year":"2025","unstructured":"Boqiang Zhang, Kehan Li, Zesen Cheng, Zhiqiang Hu, Yuqian Yuan, Guanzheng Chen, Sicong Leng, Yuming Jiang, Hang Zhang, Xin Li, Peng Jin, Wenqi Zhang, Fan Wang, Lidong Bing, and Deli Zhao. 2025. VideoLLaMA 3: Frontier Multimodal Foundation Models for Image and Video Understanding. arXiv preprint arXiv:2501.13106 (2025). https:\/\/arxiv.org\/abs\/2501.13106"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_33_1","DOI":"10.1145\/3485447.3512035"},{"key":"e_1_3_2_1_34_1","first-page":"13139","article-title":"Pay attention to implicit attribute values: A multi-modal generative framework for AVE task. In Findings of the association for computational linguistics","volume":"2023","author":"Zhang Yupeng","year":"2023","unstructured":"Yupeng Zhang, Shensi Wang, Peiguang Li, Guanting Dong, Sirui Wang, Yunsen Xian, Zhoujun Li, and Hongzhi Zhang. 2023a. Pay attention to implicit attribute values: A multi-modal generative framework for AVE task. In Findings of the association for computational linguistics: ACL 2023. 13139-13151.","journal-title":"ACL"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_35_1","DOI":"10.18653\/v1\/2023.findings-acl.831"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_36_1","DOI":"10.1145\/3219819.3219839"},{"key":"e_1_3_2_1_37_1","volume-title":"Minigpt-4: Enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592","author":"Zhu Deyao","year":"2023","unstructured":"Deyao Zhu, Jun Chen, Xiaoqian Shen, Xiang Li, and Mohamed Elhoseiny. 2023. Minigpt-4: Enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592 (2023)."},{"key":"e_1_3_2_1_38_1","volume-title":"Multimodal joint attribute prediction and value extraction for e-commerce product. arXiv preprint arXiv:2009.07162","author":"Zhu Tiangang","year":"2020","unstructured":"Tiangang Zhu, Yue Wang, Haoran Li, Youzheng Wu, Xiaodong He, and Bowen Zhou. 2020. Multimodal joint attribute prediction and value extraction for e-commerce product. arXiv preprint arXiv:2009.07162 (2020)."},{"key":"e_1_3_2_1_39_1","volume-title":"Implicitave: An open-source dataset and multimodal llms benchmark for implicit attribute value extraction. arXiv preprint arXiv:2404.15592","author":"Zou Henry Peng","year":"2024","unstructured":"Henry Peng Zou, Vinay Samuel, Yue Zhou, Weizhi Zhang, Liancheng Fang, Zihe Song, Philip S Yu, and Cornelia Caragea. 2024. Implicitave: An open-source dataset and multimodal llms benchmark for implicit attribute value extraction. 
arXiv preprint arXiv:2404.15592 (2024)."}],"event":{"sponsor":["SIGIR ACM Special Interest Group on Information Retrieval","SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"],"acronym":"CIKM '25","name":"CIKM '25: The 34th ACM International Conference on Information and Knowledge Management","location":"Seoul Republic of Korea"},"container-title":["Proceedings of the 34th ACM International Conference on Information and Knowledge Management"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746252.3761621","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,12]],"date-time":"2025-12-12T02:24:25Z","timestamp":1765506265000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746252.3761621"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,10]]},"references-count":39,"alternative-id":["10.1145\/3746252.3761621","10.1145\/3746252"],"URL":"https:\/\/doi.org\/10.1145\/3746252.3761621","relation":{},"subject":[],"published":{"date-parts":[[2025,11,10]]},"assertion":[{"value":"2025-11-10","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
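The record above is the standard response shape of the public Crossref REST API (GET https://api.crossref.org/works/{DOI}: a top-level envelope with "status" and "message-type", and the work itself under "message"). A minimal sketch of fetching and parsing the fields shown above follows; the endpoint and field names come from the record itself, while the `requests` dependency and the placeholder mailto address are assumptions.

```python
# Minimal sketch: fetch this work record from the Crossref REST API and
# extract a few of the fields visible in the record above.
import requests

DOI = "10.1145/3746252.3761621"  # the VideoAVE proceedings-article

resp = requests.get(
    f"https://api.crossref.org/works/{DOI}",
    # Placeholder contact address; Crossref asks for one to route requests
    # into its "polite" pool.
    params={"mailto": "you@example.com"},
    timeout=30,
)
resp.raise_for_status()
work = resp.json()["message"]  # the envelope's "message" holds the record

title = work["title"][0]                       # "title" is a list
venue = work["container-title"][0]             # proceedings name
year = work["published"]["date-parts"][0][0]   # [[2025, 11, 10]] -> 2025
authors = ", ".join(
    f'{a["given"]} {a["family"]}' for a in work.get("author", [])
)

print(f"{authors}. {year}. {title}. In {venue}, pp. {work['page']}.")
print(f"References deposited: {work['references-count']}")
```

Run as-is, this would print a citation line assembled from the metadata ("Ming Cheng, Tong Wu, ... In Proceedings of the 34th ACM International Conference on Information and Knowledge Management, pp. 6340-6345.") followed by "References deposited: 39", matching the "references-count" field in the record.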