{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,24]],"date-time":"2025-11-24T15:10:47Z","timestamp":1763997047501,"version":"3.45.0"},"reference-count":76,"publisher":"Association for Computing Machinery (ACM)","issue":"11","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":["ACM Trans. Multimedia Comput. Commun. Appl."],"published-print":{"date-parts":[[2025,11,30]]},"DOI":"10.1145\/3762666","type":"journal-article","created":{"date-parts":[[2025,9,5]],"date-time":"2025-09-05T15:10:51Z","timestamp":1757085051000},"page":"1-13","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Introduction to the Special Issue on Deep Multimodal Generation and Retrieval"],"prefix":"10.1145","volume":"21","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3026-6347","authenticated-orcid":false,"given":"Hao","family":"Fei","sequence":"first","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8106-9768","authenticated-orcid":false,"given":"Wei","family":"Ji","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1791-3159","authenticated-orcid":false,"given":"Yinwei","family":"Wei","sequence":"additional","affiliation":[{"name":"Monash University, Melbourne, Australia"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2434-9050","authenticated-orcid":false,"given":"Zhedong","family":"Zheng","sequence":"additional","affiliation":[{"name":"University of Macau, Taipa, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4560-8509","authenticated-orcid":false,"given":"Jialie","family":"Shen","sequence":"additional","affiliation":[{"name":"City St George\u2019s, University of London, London, UK"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5771-2549","authenticated-orcid":false,"given":"Alan","family":"Hanjalic","sequence":"additional","affiliation":[{"name":"Delft University of Technology, Delft, Netherlands"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7410-2590","authenticated-orcid":false,"given":"Roger","family":"Zimmermann","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,11,10]]},"reference":[{"key":"e_1_3_1_2_2","doi-asserted-by":"publisher","DOI":"10.1145\/3724122"},{"key":"e_1_3_1_3_2","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548399"},{"key":"e_1_3_1_4_2","doi-asserted-by":"publisher","DOI":"10.1145\/3688804"},{"key":"e_1_3_1_5_2","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2019.2940693"},{"key":"e_1_3_1_6_2","doi-asserted-by":"publisher","DOI":"10.1145\/3698772"},{"key":"e_1_3_1_7_2","doi-asserted-by":"publisher","DOI":"10.1145\/3724397"},{"key":"e_1_3_1_8_2","doi-asserted-by":"publisher","DOI":"10.1145\/3539597.3570405"},{"key":"e_1_3_1_9_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.329"},{"key":"e_1_3_1_10_2","volume-title":"Proceedings of the International Conference on Machine Learning","author":"Fei Hao","year":"2024","unstructured":"Hao Fei, Shengqiong Wu, Wei Ji, Hanwang Zhang, Meishan Zhang, Mong-Li Lee, and Wynne Hsu. 2024. Video-of-thought: Step-by-step video reasoning from perception to cognition. In Proceedings of the International Conference on Machine Learning."},{"key":"e_1_3_1_11_2","volume-title":"Proceedings of the 36th International Conference on Neural Information Processing Systems","author":"Fei Hao","year":"2022","unstructured":"Hao Fei, Shengqiong Wu, Jingye Li, Bobo Li, Fei Li, Libo Qin, Meishan Zhang, Min Zhang, and Tat-Seng Chua. 2022. LasUIE: Unifying information extraction with latent adaptive structure-aware generative language model. In Proceedings of the 36th International Conference on Neural Information Processing Systems."},{"key":"e_1_3_1_12_2","volume-title":"Proceedings of the Advances in Neural Information Processing Systems","author":"Fei Hao","year":"2024","unstructured":"Hao Fei, Shengqiong Wu, Hanwang Zhang, Tat-Seng Chua, and Shuicheng Yan. 2024. VITRON: A unified pixel-level vision LLM for understanding, generating, segmenting, editing. In Proceedings of the Advances in Neural Information Processing Systems."},{"key":"e_1_3_1_13_2","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3393452"},{"key":"e_1_3_1_14_2","volume-title":"Proceedings of the International Conference on Machine Learning","author":"Fei Hao","year":"2025","unstructured":"Hao Fei, Yuan Zhou, Juncheng Li, Xiangtai Li, Qingshan Xu, Bobo Li, Shengqiong Wu, Yaoting Wang, Junbao Zhou, Jiahao Meng, et al. 2025. On path to multimodal generalist: General-level and general-bench. In Proceedings of the International Conference on Machine Learning."},{"key":"e_1_3_1_15_2","doi-asserted-by":"publisher","DOI":"10.1145\/3700770"},{"key":"e_1_3_1_16_2","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3143712"},{"key":"e_1_3_1_17_2","doi-asserted-by":"publisher","DOI":"10.1109\/TCYB.2020.2995496"},{"key":"e_1_3_1_18_2","volume-title":"Proceedings of the International Joint Conference on Artificial Intelligence","author":"Huang Zhikun","year":"2021","unstructured":"Zhikun Huang, Zhedong Zheng, Chenggang Yan, Hongtao Xie, Yaoqi Sun, Jianzhong Wang, and Jiyong Zhang. 2021. Real-world automatic makeup via identity preservation makeup net. In Proceedings of the International Joint Conference on Artificial Intelligence. International Joint Conference on Artificial Intelligence."},{"key":"e_1_3_1_19_2","unstructured":"Wei Ji Long Chen Yinwei Wei Yiming Wu and Tat-Seng Chua. 2022. MRTNet: Multi-resolution temporal network for video sentence grounding. arXiv:2212.13163. Retrieved from https:\/\/arxiv.org\/abs\/2212.13163"},{"key":"e_1_3_1_20_2","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2019.2962216"},{"key":"e_1_3_1_21_2","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3479232"},{"key":"e_1_3_1_22_2","doi-asserted-by":"crossref","unstructured":"Wei Ji Renjie Liang Zhedong Zheng Wenqiao Zhang Shengyu Zhang Juncheng Li Mengze Li and Tat-seng Chua. 2023. Are binary annotations sufficient? Video moment retrieval via hierarchical uncertainty-based active learning. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition.","DOI":"10.1109\/CVPR52729.2023.02204"},{"key":"e_1_3_1_23_2","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413653"},{"key":"e_1_3_1_24_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA55743.2025.11127895"},{"key":"e_1_3_1_25_2","doi-asserted-by":"publisher","DOI":"10.1145\/3715698"},{"key":"e_1_3_1_26_2","doi-asserted-by":"publisher","DOI":"10.1145\/3689646"},{"key":"e_1_3_1_27_2","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612053"},{"key":"e_1_3_1_28_2","doi-asserted-by":"publisher","DOI":"10.1145\/3708348"},{"key":"e_1_3_1_29_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00475"},{"key":"e_1_3_1_30_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00294"},{"key":"e_1_3_1_31_2","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2022.3175601"},{"key":"e_1_3_1_32_2","doi-asserted-by":"publisher","DOI":"10.1145\/3656475"},{"key":"e_1_3_1_33_2","doi-asserted-by":"publisher","DOI":"10.1145\/3700596"},{"key":"e_1_3_1_34_2","article-title":"Self-supervised correlation learning for cross-modal retrieval","author":"Liu Yaxin","year":"2022","unstructured":"Yaxin Liu, Jianlong Wu, Leigang Qu, Tian Gan, Jianhua Yin, and Liqiang Nie. 2022. Self-supervised correlation learning for cross-modal retrieval. IEEE Transactions on Multimedia (2022).","journal-title":"IEEE Transactions on Multimedia"},{"key":"e_1_3_1_35_2","doi-asserted-by":"publisher","DOI":"10.1145\/3687475"},{"key":"e_1_3_1_36_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i12.26689"},{"key":"e_1_3_1_37_2","doi-asserted-by":"publisher","DOI":"10.1145\/3697838"},{"key":"e_1_3_1_38_2","first-page":"8748","volume-title":"Proceedings of the 38th International Conference on Machine Learning","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In Proceedings of the 38th International Conference on Machine Learning, 8748\u20138763."},{"key":"e_1_3_1_39_2","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475263"},{"key":"e_1_3_1_40_2","doi-asserted-by":"publisher","DOI":"10.1145\/3695255"},{"key":"e_1_3_1_41_2","doi-asserted-by":"publisher","DOI":"10.1145\/3715137"},{"key":"e_1_3_1_42_2","doi-asserted-by":"publisher","DOI":"10.1145\/3712063"},{"key":"e_1_3_1_43_2","doi-asserted-by":"publisher","DOI":"10.1145\/3648368"},{"key":"e_1_3_1_44_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1514"},{"key":"e_1_3_1_45_2","doi-asserted-by":"publisher","DOI":"10.1145\/3699959"},{"key":"e_1_3_1_46_2","doi-asserted-by":"publisher","DOI":"10.1145\/3690642"},{"key":"e_1_3_1_47_2","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3256092"},{"key":"e_1_3_1_48_2","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3204444"},{"key":"e_1_3_1_49_2","unstructured":"Yaoting Wang Shengqiong Wu Yuecheng Zhang Shuicheng Yan Ziwei Liu Jiebo Luo and Hao Fei. 2025. Multimodal chain-of-thought reasoning: A comprehensive survey. arXiv:2503.12605. Retrieved from https:\/\/arxiv.org\/abs\/2503.12605"},{"key":"e_1_3_1_50_2","doi-asserted-by":"publisher","DOI":"10.1145\/3715136"},{"key":"e_1_3_1_51_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i3.20167"},{"key":"e_1_3_1_52_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.823"},{"key":"e_1_3_1_53_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.01321"},{"key":"e_1_3_1_54_2","unstructured":"Shengqiong Wu Hao Fei Xiangtai Li Jiayi Ji Hanwang Zhang Tat-Seng Chua and Shuicheng Yan. 2024. Towards semantic equivalence of tokenization in multimodal LLM. arXiv:2406.05127. Retrieved from https:\/\/arxiv.org\/abs\/2406.05127"},{"key":"e_1_3_1_55_2","first-page":"53366","volume-title":"Proceedings of the International Conference on Machine Learning","author":"Wu Shengqiong","year":"2024","unstructured":"Shengqiong Wu, Hao Fei, Leigang Qu, Wei Ji, and Tat-Seng Chua. 2024. NExT-GPT: Any-to-any multimodal LLM. In Proceedings of the International Conference on Machine Learning, 53366\u201353397."},{"key":"e_1_3_1_56_2","doi-asserted-by":"publisher","DOI":"10.5555\/3666122.3669590"},{"key":"e_1_3_1_57_2","doi-asserted-by":"publisher","DOI":"10.1145\/3715915"},{"key":"e_1_3_1_58_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i3.20184"},{"key":"e_1_3_1_59_2","unstructured":"Shuyu Yang Yaxiong Wang Yongrui Li Li Zhu and Zhedong Zheng. 2025. Minimizing the pretraining gap: Domain-aligned text-based person retrieval. arXiv:2507.10195. Retrieved from https:\/\/arxiv.org\/abs\/2507.10195"},{"key":"e_1_3_1_60_2","doi-asserted-by":"publisher","DOI":"10.1145\/3404835.3462823"},{"key":"e_1_3_1_61_2","doi-asserted-by":"publisher","DOI":"10.1145\/3477495.3532063"},{"key":"e_1_3_1_62_2","doi-asserted-by":"publisher","DOI":"10.1109\/TIFS.2025.3565392"},{"key":"e_1_3_1_63_2","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413640"},{"key":"e_1_3_1_64_2","unstructured":"Guiyu Zhang Huan-ang Gao Zijian Jiang Hao Zhao and Zhedong Zheng. 2024. Ctrl-u: Robust conditional image generation via uncertainty-aware reward modeling. arXiv:2410.11236. Retrieved from https:\/\/arxiv.org\/abs\/2410.11236"},{"key":"e_1_3_1_65_2","doi-asserted-by":"publisher","DOI":"10.1145\/3705322"},{"key":"e_1_3_1_66_2","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-023-01805-x"},{"key":"e_1_3_1_67_2","doi-asserted-by":"publisher","DOI":"10.1145\/3702999"},{"key":"e_1_3_1_68_2","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612096"},{"key":"e_1_3_1_69_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.442"},{"key":"e_1_3_1_70_2","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2020.3014488"},{"key":"e_1_3_1_71_2","article-title":"Parameter-efficient person re-identification in the 3D space","author":"Zheng Zhedong","year":"2022","unstructured":"Zhedong Zheng, Xiaohan Wang, Nenggan Zheng, and Yi Yang. 2022. Parameter-efficient person re-identification in the 3D space. IEEE Transactions on Neural Networks and Learning Systems (2022).","journal-title":"IEEE Transactions on Neural Networks and Learning Systems"},{"key":"e_1_3_1_72_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00224"},{"key":"e_1_3_1_73_2","doi-asserted-by":"publisher","DOI":"10.1145\/3383184"},{"key":"e_1_3_1_74_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.405"},{"key":"e_1_3_1_75_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.432"},{"key":"e_1_3_1_76_2","doi-asserted-by":"publisher","DOI":"10.1145\/3712597"},{"key":"e_1_3_1_77_2","article-title":"Scale up composed image retrieval learning via modification text generation","author":"Zhou Yinan","year":"2025","unstructured":"Yinan Zhou, Yaxiong Wang, Haokun Lin, Chen Ma, Li Zhu, and Zhedong Zheng. 2025. Scale up composed image retrieval learning via modification text generation. IEEE Transactions on Multimedia (2025).","journal-title":"IEEE Transactions on Multimedia"}],"container-title":["ACM Transactions on Multimedia Computing, Communications, and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3762666","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,24]],"date-time":"2025-11-24T15:06:52Z","timestamp":1763996812000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3762666"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,10]]},"references-count":76,"journal-issue":{"issue":"11","published-print":{"date-parts":[[2025,11,30]]}},"alternative-id":["10.1145\/3762666"],"URL":"https:\/\/doi.org\/10.1145\/3762666","relation":{},"ISSN":["1551-6857","1551-6865"],"issn-type":[{"type":"print","value":"1551-6857"},{"type":"electronic","value":"1551-6865"}],"subject":[],"published":{"date-parts":[[2025,11,10]]},"assertion":[{"value":"2025-08-13","order":0,"name":"received","label":"Received","group":{"name":"publication_history","label":"Publication History"}},{"value":"2025-08-14","order":2,"name":"accepted","label":"Accepted","group":{"name":"publication_history","label":"Publication History"}},{"value":"2025-11-10","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}