{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:40:59Z","timestamp":1765309259993,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":31,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3762000","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T06:56:44Z","timestamp":1761375404000},"page":"13797-13798","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["MIRAGE25: ACM MM25 Multimodal Interleaved Reasoning and Generation Challenge"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-4859-1757","authenticated-orcid":false,"given":"Dong","family":"Chen","sequence":"first","affiliation":[{"name":"Zhengzhou University, Zhengzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0144-2815","authenticated-orcid":false,"given":"Fei","family":"Gao","sequence":"additional","affiliation":[{"name":"Zhengzhou University, Zhengzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-7293-2735","authenticated-orcid":false,"given":"Zhengqing","family":"Hu","sequence":"additional","affiliation":[{"name":"Zhengzhou University, Zhengzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7778-8807","authenticated-orcid":false,"given":"Xiaojun","family":"Chang","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Oncel Tuzel, Ping Huang, Jiulong Shan, Jianjun Shi, and Meng Cao.","author":"Bai Haoping","year":"2023","unstructured":"Haoping Bai, Shancong Mou, Tatiana Likhomanenko, Ramazan Gokberk Cinbis, Oncel Tuzel, Ping Huang, Jiulong Shan, Jianjun Shi, and Meng Cao. 2023. VISION Datasets: A Benchmark for Vision-based InduStrial InspectiON. arXiv preprint arXiv:2306.07890 (2023)."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00437"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01164"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01600"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01391"},{"key":"e_1_3_2_1_6_1","volume-title":"Neural naturalist: generating fine-grained image comparisons. arXiv preprint arXiv:1909.04101","author":"Forbes Maxwell","year":"2019","unstructured":"Maxwell Forbes, Christine Kaeser-Chen, Piyush Sharma, and Serge Belongie. 2019. Neural naturalist: generating fine-grained image comparisons. arXiv preprint arXiv:1909.04101 (2019)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01237-3_37"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.163"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00275"},{"key":"e_1_3_2_1_10_1","volume-title":"Workshop on faces in'Real-Life'Images: detection, alignment, and recognition.","author":"Huang Gary B","year":"2008","unstructured":"Gary B Huang, Marwan Mattar, Tamara Berg, and Eric Learned-Miller. 2008. Labeled faces in the wild: A database forstudying face recognition in unconstrained environments. 
In Workshop on faces in'Real-Life'Images: detection, alignment, and recognition."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N16-1147"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298744"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.686"},{"key":"e_1_3_2_1_14_1","volume-title":"Learning to describe differences between pairs of similar images. arXiv preprint arXiv:1808.10584","author":"Jhamtani Harsh","year":"2018","unstructured":"Harsh Jhamtani and Taylor Berg-Kirkpatrick. 2018. Learning to describe differences between pairs of similar images. arXiv preprint arXiv:1808.10584 (2018)."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.571"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00649"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.290"},{"key":"e_1_3_2_1_18_1","first-page":"740","volume-title":"Zurich","author":"Lin Tsung-Yi","year":"2014","unstructured":"Tsung-Yi Lin, Michael Maire, Serge Belongie, James Hays, Pietro Perona, Deva Ramanan, Piotr Doll\u00e1r, and C Lawrence Zitnick. 2014. Microsoft coco: Common objects in context. In Computer Vision-ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V 13. Springer, 740-755."},{"key":"e_1_3_2_1_19_1","first-page":"70","volume-title":"Tel Aviv","author":"Maharana Adyasha","year":"2022","unstructured":"Adyasha Maharana, Darryl Hannan, and Mohit Bansal. 2022. Storydall-e: Adapting pretrained text-to-image transformers for story continuation. In Computer Vision-ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23-27, 2022, Proceedings, Part XXXVII. Springer, 70-87."},{"key":"e_1_3_2_1_20_1","first-page":"12","article-title":"The kth-tips2 database. Computational Vision and Active Perception Laboratory, Stockholm","volume":"11","author":"Mallikarjuna P","year":"2006","unstructured":"P Mallikarjuna, Alireza Tavakoli Targhi, Mario Fritz, Eric Hayman, Barbara Caputo, and Jan-Olof Eklundh. 2006. The kth-tips2 database. Computational Vision and Active Perception Laboratory, Stockholm, Sweden, Vol. 11 (2006), 12.","journal-title":"Sweden"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV48630.2021.00225"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDAR.2019.00156"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00206"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2018.00262"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1167\/9.8.784"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01075"},{"key":"e_1_3_2_1_27_1","volume-title":"A corpus for reasoning about natural language grounded in photographs. arXiv preprint arXiv:1811.00491","author":"Suhr Alane","year":"2018","unstructured":"Alane Suhr, Stephanie Zhou, Ally Zhang, Iris Zhang, Huajun Bai, and Yoav Artzi. 2018. A corpus for reasoning about natural language grounded in photographs. arXiv preprint arXiv:1811.00491 (2018)."},{"key":"e_1_3_2_1_28_1","volume-title":"Multimodalqa: Complex question answering over text, tables and images. arXiv preprint arXiv:2104.06039","author":"Talmor Alon","year":"2021","unstructured":"Alon Talmor, Ori Yoran, Amnon Catav, Dan Lahav, Yizhong Wang, Akari Asai, Gabriel Ilharco, Hannaneh Hajishirzi, and Jonathan Berant. 
2021. Multimodalqa: Complex question answering over text, tables and images. arXiv preprint arXiv:2104.06039 (2021)."},{"key":"e_1_3_2_1_29_1","volume-title":"Expressing visual relationships via language. arXiv preprint arXiv:1906.07689","author":"Tan Hao","year":"2019","unstructured":"Hao Tan, Franck Dernoncourt, Zhe Lin, Trung Bui, and Mohit Bansal. 2019. Expressing visual relationships via language. arXiv preprint arXiv:1906.07689 (2019)."},{"key":"e_1_3_2_1_30_1","volume-title":"SlideVQA: A Dataset for Document Visual Question Answering on Multiple Images. arXiv preprint arXiv:2301.04883","author":"Tanaka Ryota","year":"2023","unstructured":"Ryota Tanaka, Kyosuke Nishida, Kosuke Nishida, Taku Hasegawa, Itsumi Saito, and Kuniko Saito. 2023. SlideVQA: A Dataset for Document Visual Question Answering on Multiple Images. arXiv preprint arXiv:2301.04883 (2023)."},{"key":"e_1_3_2_1_31_1","volume-title":"Recipeqa: A challenge dataset for multimodal comprehension of cooking recipes. arXiv preprint arXiv:1809.00812","author":"Yagcioglu Semih","year":"2018","unstructured":"Semih Yagcioglu, Aykut Erdem, Erkut Erdem, and Nazli Ikizler-Cinbis. 2018. Recipeqa: A challenge dataset for multimodal comprehension of cooking recipes. arXiv preprint arXiv:1809.00812 (2018)."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3762000","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:37:22Z","timestamp":1765309042000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3762000"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":31,"alternative-id":["10.1145\/3746027.3762000","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3762000","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
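For readers who want to regenerate or post-process a record like the one above, here is a minimal sketch (Python standard library only) that retrieves the same work from the public Crossref REST API works route, https://api.crossref.org/works/{DOI}, and prints a few fields. The DOI and the field names ("title", "author", "references-count") are taken directly from the record above; network access and the unauthenticated public endpoint are assumed, and error handling is omitted for brevity.

import json
import urllib.request

# DOI of the MIRAGE25 challenge paper, copied from the record above.
DOI = "10.1145/3746027.3762000"

# Standard Crossref REST API route for a single work record.
url = f"https://api.crossref.org/works/{DOI}"

with urllib.request.urlopen(url) as resp:
    record = json.load(resp)

# The payload wraps the work metadata under "message", as in the record above.
work = record["message"]

title = work["title"][0]                      # "title" is a list of strings
authors = ", ".join(
    f'{a["given"]} {a["family"]}' for a in work["author"]
)

print("Title:     ", title)
print("Authors:   ", authors)
print("References:", work["references-count"])

Running this against the live API should print the paper title, the four authors listed above, and a reference count of 31, matching the deposited record.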