{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,13]],"date-time":"2026-04-13T15:00:53Z","timestamp":1776092453259,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":10,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3688865.3689482","type":"proceedings-article","created":{"date-parts":[[2024,10,20]],"date-time":"2024-10-20T14:10:52Z","timestamp":1729433452000},"page":"3-5","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["AssistGPT: Towards Multi-modal Agent for Human-Centric AI Assistant"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8494-3492","authenticated-orcid":false,"given":"Difei","family":"Gao","sequence":"first","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-9741-5028","authenticated-orcid":false,"given":"Siyuan","family":"Hu","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2568-2346","authenticated-orcid":false,"given":"Qinghong","family":"Lin","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7681-2166","authenticated-orcid":false,"given":"Mike Zheng","family":"Shou","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Jinze Bai Shuai Bai Yunfei Chu Zeyu Cui Kai Dang Xiaodong Deng Yang Fan Wenbin Ge Yu Han Fei Huang et al. 2023. Qwen technical report. arXiv preprint arXiv:2309.16609 (2023)."},{"key":"e_1_3_2_1_2_1","volume-title":"Qwen-vl: A frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966","author":"Bai Jinze","year":"2023","unstructured":"Jinze Bai, Shuai Bai, Shusheng Yang, Shijie Wang, Sinan Tan, Peng Wang, Junyang Lin, Chang Zhou, and Jingren Zhou. 2023. Qwen-vl: A frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966 (2023)."},{"key":"e_1_3_2_1_3_1","volume-title":"https:\/\/cloud.google.com\/vision\/docs\/ocr","author":"Optical Character OCR","year":"2024","unstructured":"Google. 2024. OCR (Optical Character Recognition) with world-class Google Cloud AI. (2024). https:\/\/cloud.google.com\/vision\/docs\/ocr."},{"key":"e_1_3_2_1_4_1","volume-title":"VILA: On Pre-training for Visual Language Models. arxiv: 2312.07533 [cs.CV]","author":"Lin Ji","year":"2023","unstructured":"Ji Lin, Hongxu Yin, Wei Ping, Yao Lu, Pavlo Molchanov, Andrew Tao, Huizi Mao, Jan Kautz, Mohammad Shoeybi, and Song Han. 2023. VILA: On Pre-training for Visual Language Models. arxiv: 2312.07533 [cs.CV]"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00262"},{"key":"e_1_3_2_1_6_1","unstructured":"Haotian Liu Chunyuan Li Qingyang Wu and Yong Jae Lee. 2023. Visual Instruction Tuning."},{"key":"e_1_3_2_1_7_1","unstructured":"OpenAI. 2023. Gpt-4v(ision) system card."},{"key":"e_1_3_2_1_8_1","volume-title":"Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. arxiv: 2103.00020 [cs.CV] https:\/\/arxiv.org\/abs\/2103.00020"},{"key":"e_1_3_2_1_9_1","volume-title":"Tao Xu, Greg Brockman, Christine McLeavey, and Ilya Sutskever.","author":"Radford Alec","year":"2022","unstructured":"Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, and Ilya Sutskever. 2022. Robust Speech Recognition via Large-Scale Weak Supervision. arxiv: 2212.04356 [eess.AS] https:\/\/arxiv.org\/abs\/2212.04356"},{"key":"e_1_3_2_1_10_1","unstructured":"Gemini Team Rohan Anil Sebastian Borgeaud Yonghui Wu Jean-Baptiste Alayrac Jiahui Yu Radu Soricut Johan Schalkwyk Andrew M Dai Anja Hauth et al. 2023. Gemini: a family of highly capable multimodal models. arXiv preprint arXiv:2312.11805 (2023)."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 5th International Workshop on Human-centric Multimedia Analysis"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3688865.3689482","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3688865.3689482","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,23]],"date-time":"2025-08-23T18:38:51Z","timestamp":1755974331000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3688865.3689482"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":10,"alternative-id":["10.1145\/3688865.3689482","10.1145\/3688865"],"URL":"https:\/\/doi.org\/10.1145\/3688865.3689482","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}