{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,22]],"date-time":"2026-01-22T02:00:38Z","timestamp":1769047238543,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":43,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100006374","name":"Science Foundation Ireland","doi-asserted-by":"publisher","award":["13\/RC\/21-06_P2"],"award-info":[{"award-number":["13\/RC\/21-06_P2"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,30]]},"DOI":"10.1145\/3731715.3733263","type":"proceedings-article","created":{"date-parts":[[2025,6,25]],"date-time":"2025-06-25T18:31:04Z","timestamp":1750876264000},"page":"1303-1312","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["A RAG Approach for Multi-Modal Open-ended Lifelog Question-Answering"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-5409-0916","authenticated-orcid":false,"given":"Quang-Linh","family":"Tran","sequence":"first","affiliation":[{"name":"ADAPT Centre, School of Computing, Dublin City University, Dublin, Ireland"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-4453-7246","authenticated-orcid":false,"given":"Ngo Ngoc Diep","family":"Pham","sequence":"additional","affiliation":[{"name":"University of Science, Vietnam National University, Ho Chi Minh City, Vietnam"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-7868-1401","authenticated-orcid":false,"given":"Quoc Trung","family":"Truong","sequence":"additional","affiliation":[{"name":"University of Science, Vietnam National University, Ho Chi Minh City, Vietnam"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-1870-1528","authenticated-orcid":false,"given":"Minh Hung","family":"Nguyen","sequence":"additional","affiliation":[{"name":"University of Science, Vietnam National University, Ho Chi Minh City, Vietnam"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-7839-5709","authenticated-orcid":false,"given":"Hong Cat","family":"Le","sequence":"additional","affiliation":[{"name":"University of Science, Vietnam National University, Ho Chi Minh City, Vietnam"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-1040-970X","authenticated-orcid":false,"given":"Dang Khoi","family":"Vu","sequence":"additional","affiliation":[{"name":"University of Science, Vietnam National University, Ho Chi Minh City, Vietnam"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-8325-1222","authenticated-orcid":false,"given":"Van Minh Thien","family":"Nguyen","sequence":"additional","affiliation":[{"name":"University of Science, Vietnam National University, Ho Chi Minh City, Vietnam"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-1617-0520","authenticated-orcid":false,"given":"Van Kinh","family":"Nguyen","sequence":"additional","affiliation":[{"name":"University of Science, Vietnam National University, Ho Chi Minh City, Vietnam"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-2208-2640","authenticated-orcid":false,"given":"Luu Phuong Ngoc Lam","family":"Nguyen","sequence":"additional","affiliation":[{"name":"University of Science, Vietnam National University, Ho Chi Minh City, Vietnam"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-0739-7075","authenticated-orcid":false,"given":"Tan","family":"Le","sequence":"additional","affiliation":[{"name":"University of Science, Vietnam National University, Ho Chi Minh City, Vietnam"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-5114-0580","authenticated-orcid":false,"given":"Minh Phuc","family":"Dang","sequence":"additional","affiliation":[{"name":"University of Science, Vietnam National University, Ho Chi Minh City, Vietnam"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5249-9702","authenticated-orcid":false,"given":"Binh","family":"Nguyen","sequence":"additional","affiliation":[{"name":"University of Science, Vietnam National University, Ho Chi Minh City, Vietnam"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2923-8365","authenticated-orcid":false,"given":"Gareth J. F.","family":"Jones","sequence":"additional","affiliation":[{"name":"ADAPT Centre, School of Computing, Dublin City University, Dublin, Ireland"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2903-3968","authenticated-orcid":false,"given":"Cathal","family":"Gurrin","sequence":"additional","affiliation":[{"name":"ADAPT Centre, School of Computing, Dublin City University, Dublin, Ireland"}]}],"member":"320","published-online":{"date-parts":[[2025,6,30]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.procs.2024.09.178"},{"key":"e_1_3_2_1_2_1","unstructured":"Hangbo Bao Li Dong Furu Wei Wenhui Wang Nan Yang Xiaodong Liu Yu Wang Songhao Piao Jianfeng Gao Ming Zhou and Hsiao-Wuen Hon. 2020. UniLMv2: Pseudo-Masked Language Models for Unified Language Model Pre-Training. arXiv:2002.12804 [cs.CL] https:\/\/arxiv.org\/abs\/2002.12804"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"crossref","unstructured":"Danqi Chen Adam Fisch Jason Weston and Antoine Bordes. 2017. Reading Wikipedia to Answer Open-Domain Questions. arXiv:1704.00051 [cs.CL] https:\/\/arxiv.org\/abs\/1704.00051","DOI":"10.18653\/v1\/P17-1171"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3640544.3645216"},{"key":"e_1_3_2_1_5_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_6_1","unstructured":"Abhimanyu Dubey et al. 2024. The Llama 3 Herd of Models. arXiv:2407.21783 [cs.AI] https:\/\/arxiv.org\/abs\/2407.21783"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1561\/1500000033"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3652583.3658891"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3512527.3531439"},{"key":"e_1_3_2_1_10_1","unstructured":"Albert Q. Jiang Alexandre Sablayrolles Arthur Mensch Chris Bamford Devendra Singh Chaplot Diego de las Casas Florian Bressand Gianna Lengyel Guillaume Lample Lucile Saulnier L\u00e9lio Renard Lavaud Marie-Anne Lachaux Pierre Stock Teven Le Scao Thibaut Lavril Thomas Wang Timoth\u00e9e Lacroix and William El Sayed. 2023. Mistral 7B. arXiv:2310.06825 [cs.CL] https:\/\/arxiv.org\/abs\/2310.06825"},{"key":"e_1_3_2_1_11_1","unstructured":"Dian Jiao Li Cai Jingsheng Huang Wenqiao Zhang Siliang Tang and Yueting Zhuang. 2024. DuetRAG: Collaborative Retrieval-Augmented Generation. arXiv:2405.13002 [cs.CL] https:\/\/arxiv.org\/abs\/2405.13002"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.550"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1504\/IJCIH.2010.037460"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2018.2805304"},{"key":"e_1_3_2_1_15_1","volume-title":"Berg","author":"Lei Jie","year":"2019","unstructured":"Jie Lei, Licheng Yu, Mohit Bansal, and Tamara L. Berg. 2019. TVQA: Localized, Compositional Video Question Answering. arXiv:1809.01696 [cs.CL] https:\/\/arxiv.org\/abs\/1809.01696"},{"key":"e_1_3_2_1_16_1","volume-title":"BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension. arXiv:1910.13461 [cs.CL] https:\/\/arxiv.org\/abs\/1910.13461","author":"Lewis Mike","year":"2019","unstructured":"Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov, and Luke Zettlemoyer. 2019. BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension. arXiv:1910.13461 [cs.CL] https:\/\/arxiv.org\/abs\/1910.13461"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","unstructured":"Mike Lewis Yinhan Liu Naman Goyal Marjan Ghazvininejad Abdelrahman Mohamed Omer Levy Veselin Stoyanov and Luke Zettlemoyer. 2020. BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation Translation and Comprehension. In Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics Dan Jurafsky Joyce Chai Natalie Schluter and Joel Tetreault (Eds.). Association for Computational Linguistics Online 7871--7880. doi:10.18653\/v1\/2020.acl-main.703","DOI":"10.18653\/v1\/2020.acl-main.703"},{"key":"e_1_3_2_1_18_1","volume-title":"Tim Rockt\u00e4schel, Sebastian Riedel, and Douwe Kiela.","author":"Lewis Patrick","year":"2021","unstructured":"Patrick Lewis, Ethan Perez, Aleksandra Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich K\u00fcttler, Mike Lewis, Wen tau Yih, Tim Rockt\u00e4schel, Sebastian Riedel, and Douwe Kiela. 2021. Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks. arXiv:2005.11401 [cs.CL] https:\/\/arxiv.org\/abs\/2005.11401"},{"key":"e_1_3_2_1_19_1","unstructured":"Junnan Li Dongxu Li Silvio Savarese and Steven Hoi. 2023. BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models. arXiv:2301.12597 [cs.CV] https:\/\/arxiv.org\/abs\/2301.12597"},{"key":"e_1_3_2_1_20_1","volume-title":"ROUGE: A Package for Automatic Evaluation of Summaries. In Text Summarization Branches Out","author":"Lin Chin-Yew","year":"2004","unstructured":"Chin-Yew Lin. 2004. ROUGE: A Package for Automatic Evaluation of Summaries. In Text Summarization Branches Out. Association for Computational Linguistics, Barcelona, Spain, 74--81. https:\/\/aclanthology.org\/W04--1013"},{"key":"e_1_3_2_1_21_1","volume-title":"TAPEX: Table Pre-training via Learning a Neural SQL Executor. arXiv:2107.07653 [cs.CL] https:\/\/arxiv.org\/abs\/2107.07653","author":"Liu Qian","year":"2022","unstructured":"Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, and Jian-Guang Lou. 2022. TAPEX: Table Pre-training via Learning a Neural SQL Executor. arXiv:2107.07653 [cs.CL] https:\/\/arxiv.org\/abs\/2107.07653"},{"key":"e_1_3_2_1_22_1","volume-title":"MTEB: Massive Text Embedding Benchmark. arXiv:2210.07316 [cs.CL] https:\/\/arxiv.org\/abs\/2210.07316","author":"Muennighoff Niklas","year":"2023","unstructured":"Niklas Muennighoff, Nouamane Tazi, Lo\u00efc Magne, and Nils Reimers. 2023. MTEB: Massive Text Embedding Benchmark. arXiv:2210.07316 [cs.CL] https:\/\/arxiv.org\/abs\/2210.07316"},{"key":"e_1_3_2_1_23_1","unstructured":"OpenAI. 2024. Hello GPT-4o. https:\/\/openai.com\/index\/hello-gpt-4o\/ Accessed: 2024-09--13."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3643489.3661123"},{"key":"e_1_3_2_1_25_1","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel Colin","year":"2020","unstructured":"Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, and Peter J. Liu. 2020. Exploring the limits of transfer learning with a unified text-to-text transformer. J. Mach. Learn. Res. 21, 1, Article 140 (Jan. 2020), 67 pages.","journal-title":"J. Mach. Learn. Res."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1561\/1500000019"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00530"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-acl.6"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-98358-1_18"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3643489.3661128"},{"key":"e_1_3_2_1_31_1","volume-title":"Interactive Question Answering for Multimodal Lifelog Retrieval. In International Conference on Multimedia Modeling. Springer, 68--81","author":"Tran Ly-Duyen","year":"2024","unstructured":"Ly-Duyen Tran, Liting Zhou, Binh Nguyen, and Cathal Gurrin. 2024. Interactive Question Answering for Multimodal Lifelog Retrieval. In International Conference on Multimedia Modeling. Springer, 68--81."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3643489.3661114"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3643479.3662050"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3592573.3593101"},{"key":"e_1_3_2_1_35_1","volume-title":"Tatsunori Hashimoto, Oriol Vinyals, Percy Liang, Jeff Dean, and William Fedus.","author":"Wei Jason","year":"2022","unstructured":"Jason Wei, Yi Tay, Rishi Bommasani, Colin Raffel, Barret Zoph, Sebastian Borgeaud, Dani Yogatama, Maarten Bosma, Denny Zhou, Donald Metzler, Ed H. Chi, Tatsunori Hashimoto, Oriol Vinyals, Percy Liang, Jeff Dean, and William Fedus. 2022. Emergent Abilities of Large Language Models. arXiv:2206.07682 [cs.CL] https:\/\/arxiv.org\/abs\/2206.07682"},{"key":"e_1_3_2_1_36_1","volume-title":"Chi, Quoc Le, and Denny Zhou","author":"Wei Jason","year":"2023","unstructured":"Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Brian Ichter, Fei Xia, Ed Chi, Quoc Le, and Denny Zhou. 2023. Chain-of-Thought Prompting Elicits Reasoning in Large Language Models. arXiv:2201.11903 [cs.CL] https:\/\/arxiv.org\/abs\/2201.11903"},{"key":"e_1_3_2_1_37_1","volume-title":"InstructRAG: Instructing Retrieval-Augmented Generation with Explicit Denoising. arXiv preprint arXiv:2406.13629","author":"Wei Zhepei","year":"2024","unstructured":"Zhepei Wei, Wei-Lin Chen, and Yu Meng. 2024. InstructRAG: Instructing Retrieval-Augmented Generation with Explicit Denoising. arXiv preprint arXiv:2406.13629 (2024)."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.372"},{"key":"e_1_3_2_1_39_1","unstructured":"Guangzhi Xiong Qiao Jin Zhiyong Lu and Aidong Zhang. 2024. Benchmarking Retrieval-Augmented Generation for Medicine. arXiv:2402.13178 [cs.CL] https:\/\/arxiv.org\/abs\/2402.13178"},{"key":"e_1_3_2_1_40_1","volume-title":"HotpotQA: A dataset for diverse, explainable multi-hop question answering. arXiv preprint arXiv:1809.09600","author":"Yang Zhilin","year":"2018","unstructured":"Zhilin Yang, Peng Qi, Saizheng Zhang, Yoshua Bengio, WilliamWCohen, Ruslan Salakhutdinov, and Christopher D Manning. 2018. HotpotQA: A dataset for diverse, explainable multi-hop question answering. arXiv preprint arXiv:1809.09600 (2018)."},{"key":"e_1_3_2_1_41_1","volume-title":"Financial report chunking for effective retrieval augmented generation. arXiv preprint arXiv:2402.05131","author":"Yepes Antonio Jimeno","year":"2024","unstructured":"Antonio Jimeno Yepes, Yao You, Jan Milczek, Sebastian Laverde, and Renyu Li. 2024. Financial report chunking for effective retrieval augmented generation. arXiv preprint arXiv:2402.05131 (2024)."},{"key":"e_1_3_2_1_42_1","unstructured":"Tianyi Zhang Varsha Kishore Felix Wu Kilian Q. Weinberger and Yoav Artzi. 2020. BERTScore: Evaluating Text Generation with BERT. arXiv:1904.09675 [cs.CL] https:\/\/arxiv.org\/abs\/1904.09675"},{"key":"e_1_3_2_1_43_1","volume-title":"Proceedings of the 17th NTCIR Conference on Evaluation of Information Access Technologies (NTCIR-17)","author":"Zhou Liting","year":"2023","unstructured":"Liting Zhou, Cathal Gurrin, Duc-Tien Dang-Nguyen, Graham Healy, Chenyang Lyu, Tianbo Ji, Longyue Wang, Joho Hideo, Ly-Duyen Tran, and Naushad Alam. 2023. Overview of the NTCIR-17 Lifelog-5 Task. In Proceedings of the 17th NTCIR Conference on Evaluation of Information Access Technologies (NTCIR-17). Tokyo, Japan."}],"event":{"name":"ICMR '25: International Conference on Multimedia Retrieval","location":"Chicago IL USA","acronym":"ICMR '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 2025 International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3731715.3733263","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T04:08:06Z","timestamp":1755749286000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3731715.3733263"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,30]]},"references-count":43,"alternative-id":["10.1145\/3731715.3733263","10.1145\/3731715"],"URL":"https:\/\/doi.org\/10.1145\/3731715.3733263","relation":{},"subject":[],"published":{"date-parts":[[2025,6,30]]},"assertion":[{"value":"2025-06-30","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}