{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,14]],"date-time":"2026-03-14T09:51:23Z","timestamp":1773481883752,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":48,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,6,9]],"date-time":"2024-06-09T00:00:00Z","timestamp":1717891200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,6,9]]},"DOI":"10.1145\/3665601.3669846","type":"proceedings-article","created":{"date-parts":[[2024,6,7]],"date-time":"2024-06-07T12:20:36Z","timestamp":1717762836000},"page":"16-25","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["CMDBench: A Benchmark for Coarse-to-fine Multimodal Data Discovery in Compound AI Systems"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-7696-9020","authenticated-orcid":false,"given":"Yanlin","family":"Feng","sequence":"first","affiliation":[{"name":"Megagon Labs, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4210-1582","authenticated-orcid":false,"given":"Sajjadur","family":"Rahman","sequence":"additional","affiliation":[{"name":"Megagon Labs, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-2918-1462","authenticated-orcid":false,"given":"Aaron","family":"Feng","sequence":"additional","affiliation":[{"name":"Megagon Labs, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-9111-2401","authenticated-orcid":false,"given":"Vincent","family":"Chen","sequence":"additional","affiliation":[{"name":"Megagon Labs, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-3509-5502","authenticated-orcid":false,"given":"Eser","family":"Kandogan","sequence":"additional","affiliation":[{"name":"Megagon Labs, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,6,9]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Proceedings of CIDR, Vol.\u00a08. 28","author":"Armbrust Michael","year":"2021","unstructured":"Michael Armbrust, Ali Ghodsi, Reynold Xin, and Matei Zaharia. 2021. Lakehouse: a new generation of open platforms that unify data warehousing and advanced analytics. In Proceedings of CIDR, Vol.\u00a08. 28."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/2501511.2501516"},{"key":"e_1_3_2_1_3_1","volume-title":"Translating embeddings for modeling multi-relational data. Advances in neural information processing systems 26","author":"Bordes Antoine","year":"2013","unstructured":"Antoine Bordes, Nicolas Usunier, Alberto Garcia-Duran, Jason Weston, and Oksana Yakhnenko. 2013. Translating embeddings for modeling multi-relational data. Advances in neural information processing systems 26 (2013)."},{"key":"e_1_3_2_1_4_1","unstructured":"Shannon Bradshaw Eoin Brazil and Kristina Chodorow. 2019. MongoDB: the definitive guide: powerful and scalable data storage. O\u2019Reilly Media."},{"key":"e_1_3_2_1_5_1","volume-title":"KQA pro: A dataset with explicit compositional programs for complex question answering over knowledge base. arXiv preprint arXiv:2007.03875","author":"Cao Shulin","year":"2020","unstructured":"Shulin Cao, Jiaxin Shi, Liangming Pan, Lunyiu Nie, Yutong Xiang, Lei Hou, Juanzi Li, Bin He, and Hanwang Zhang. 2020. KQA pro: A dataset with explicit compositional programs for complex question answering over knowledge base. arXiv preprint arXiv:2007.03875 (2020)."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.422"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1146\/annurev.polisci.8.081404.075608"},{"key":"e_1_3_2_1_8_1","volume-title":"Proceedings of the Eleventh International Conference on Language Resources and Evaluation (LREC","author":"Elsahar Hady","year":"2018","unstructured":"Hady Elsahar, Pavlos Vougiouklis, Arslen Remaci, Christophe Gravier, Jonathon Hare, Frederique Laforest, and Elena Simperl. 2018. T-rex: A large scale alignment of natural language with knowledge base triples. In Proceedings of the Eleventh International Conference on Language Resources and Evaluation (LREC 2018)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.14778\/3611479.3611533"},{"key":"e_1_3_2_1_10_1","volume-title":"Ragas: Automated evaluation of retrieval augmented generation. arXiv preprint arXiv:2309.15217","author":"Es Shahul","year":"2023","unstructured":"Shahul Es, Jithin James, Luis Espinosa-Anke, and Steven Schockaert. 2023. Ragas: Automated evaluation of retrieval augmented generation. arXiv preprint arXiv:2309.15217 (2023)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.14778\/3587136.3587146"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.99"},{"key":"e_1_3_2_1_13_1","volume-title":"2018 IEEE 34th International Conference on Data Engineering (ICDE). IEEE, 1001\u20131012","author":"Fernandez Raul\u00a0Castro","year":"2018","unstructured":"Raul\u00a0Castro Fernandez, Ziawasch Abedjan, Famien Koko, Gina Yuan, Samuel Madden, and Michael Stonebraker. 2018. Aurum: A data discovery system. In 2018 IEEE 34th International Conference on Data Engineering (ICDE). IEEE, 1001\u20131012."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/2948674.2948675"},{"key":"e_1_3_2_1_15_1","volume-title":"Exploiting linked data and knowledge graphs in large organisations","author":"Gomez-Perez Jose\u00a0Manuel","unstructured":"Jose\u00a0Manuel Gomez-Perez, Jeff\u00a0Z Pan, Guido Vetere, and Honghan Wu. 2017. Enterprise knowledge graph: An introduction. In Exploiting linked data and knowledge graphs in large organisations. Springer, 1\u201314."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P17-1147"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3588689"},{"key":"e_1_3_2_1_18_1","volume-title":"An LLM compiler for parallel function calling. arXiv preprint arXiv:2312.04511","author":"Kim Sehoon","year":"2023","unstructured":"Sehoon Kim, Suhong Moon, Ryan Tabrizi, Nicholas Lee, Michael\u00a0W Mahoney, Kurt Keutzer, and Amir Gholami. 2023. An LLM compiler for parallel function calling. arXiv preprint arXiv:2312.04511 (2023)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00276"},{"key":"e_1_3_2_1_20_1","first-page":"9459","article-title":"Retrieval-augmented generation for knowledge-intensive nlp tasks","volume":"33","author":"Lewis Patrick","year":"2020","unstructured":"Patrick Lewis, Ethan Perez, Aleksandra Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich K\u00fcttler, Mike Lewis, Wen-tau Yih, Tim Rockt\u00e4schel, 2020. Retrieval-augmented generation for knowledge-intensive nlp tasks. Advances in Neural Information Processing Systems 33 (2020), 9459\u20139474.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_21_1","volume-title":"Hani Itani, Dmitrii Khizbullin, and Bernard Ghanem.","author":"Li Guohao","year":"2023","unstructured":"Guohao Li, Hasan Abed Al\u00a0Kader Hammoud, Hani Itani, Dmitrii Khizbullin, and Bernard Ghanem. 2023. Camel: Communicative agents for\" mind\" exploration of large scale language model society. arXiv preprint arXiv:2303.17760 (2023)."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","unstructured":"Xue Li Weibin Zeng Zhibin Wang Diwen Zhu Jingbo Xu Wenyuan Yu and Jingren Zhou. 2023. Enhancing Data Lakes with GraphAr: Efficient Graph Data Management with a Specialized Storage Scheme. (2023). https:\/\/doi.org\/10.48550\/ARXIV.2312.09577 arXiv:2312.09577","DOI":"10.48550\/ARXIV.2312.09577"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","unstructured":"Jerry Liu. 2022. LlamaIndex. https:\/\/doi.org\/10.5281\/zenodo.1234","DOI":"10.5281\/zenodo.1234"},{"key":"e_1_3_2_1_24_1","unstructured":"Sports\u00a0Reference LLC.2022. Basketball Statistics and History. https:\/\/www.basketball-reference.com\/."},{"key":"e_1_3_2_1_25_1","volume-title":"BGE Landmark Embedding: A Chunking-Free Embedding Method For Retrieval Augmented Long-Context Large Language Models. arXiv preprint arXiv:2402.11573","author":"Luo Kun","year":"2024","unstructured":"Kun Luo, Zheng Liu, Shitao Xiao, and Kang Liu. 2024. BGE Landmark Embedding: A Chunking-Free Embedding Method For Retrieval Augmented Long-Context Large Language Models. arXiv preprint arXiv:2402.11573 (2024)."},{"key":"e_1_3_2_1_26_1","volume-title":"Proceedings of the southern association for information systems conference","author":"Miller J","year":"2013","unstructured":"Justin\u00a0J Miller. 2013. Graph database applications and concepts with Neo4j. In Proceedings of the southern association for information systems conference, Atlanta, GA, USA, Vol.\u00a02324. 141\u2013147."},{"key":"e_1_3_2_1_27_1","volume-title":"Characterizing Large Language Models as Rationalizers of Knowledge-intensive Tasks. arXiv preprint arXiv:2311.05085","author":"Mishra Aditi","year":"2023","unstructured":"Aditi Mishra, Sajjadur Rahman, Hannah Kim, Kushan Mitra, and Estevam Hruschka. 2023. Characterizing Large Language Models as Rationalizers of Knowledge-intensive Tasks. arXiv preprint arXiv:2311.05085 (2023)."},{"key":"e_1_3_2_1_28_1","unstructured":"Raghunath\u00a0Othayoth Nambiar and Meikel Poess. 2006. The Making of TPC-DS.. In VLDB Vol.\u00a06. 1049\u20131058."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.5121\/ijdms.2019.11301"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"crossref","unstructured":"Fabio Petroni Aleksandra Piktus Angela Fan Patrick Lewis Majid Yazdani Nicola\u00a0De Cao James Thorne Yacine Jernite Vassilis Plachouras Tim Rocktaschel and Sebastian Riedel. 2020. KILT: a Benchmark for Knowledge Intensive Language Tasks. In North American Chapter of the Association for Computational Linguistics. https:\/\/api.semanticscholar.org\/CorpusID:221507798","DOI":"10.18653\/v1\/2021.naacl-main.200"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.naacl-main.200"},{"key":"e_1_3_2_1_32_1","volume-title":"The probabilistic relevance framework: BM25 and beyond. Foundations and Trends\u00ae in Information Retrieval 3, 4","author":"Robertson Stephen","year":"2009","unstructured":"Stephen Robertson, Hugo Zaragoza, 2009. The probabilistic relevance framework: BM25 and beyond. Foundations and Trends\u00ae in Information Retrieval 3, 4 (2009), 333\u2013389."},{"key":"e_1_3_2_1_33_1","volume-title":"Ares: An automated evaluation framework for retrieval-augmented generation systems. arXiv preprint arXiv:2311.09476","author":"Saad-Falcon Jon","year":"2023","unstructured":"Jon Saad-Falcon, Omar Khattab, Christopher Potts, and Matei Zaharia. 2023. Ares: An automated evaluation framework for retrieval-augmented generation systems. arXiv preprint arXiv:2311.09476 (2023)."},{"key":"e_1_3_2_1_34_1","volume-title":"RAPTOR: Recursive Abstractive Processing for Tree-Organized Retrieval. In The Twelfth International Conference on Learning Representations.","author":"Sarthi Parth","year":"2023","unstructured":"Parth Sarthi, Salman Abdullah, Aditi Tuli, Shubh Khanna, Anna Goldie, and Christopher\u00a0D Manning. 2023. RAPTOR: Recursive Abstractive Processing for Tree-Organized Retrieval. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3211954.3211955"},{"key":"e_1_3_2_1_36_1","volume-title":"LakeBench: Benchmarks for Data Discovery over Data Lakes. arXiv preprint arXiv:2307.04217","author":"Srinivas Kavitha","year":"2023","unstructured":"Kavitha Srinivas, Julian Dolby, Ibrahim Abdelaziz, Oktie Hassanzadeh, Harsha Kokel, Aamod Khatiwada, Tejaswini Pedapati, Subhajit Chaudhury, and Horst Samulowitz. 2023. LakeBench: Benchmarks for Data Discovery over Data Lakes. arXiv preprint arXiv:2307.04217 (2023)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/16856.16888"},{"key":"e_1_3_2_1_38_1","volume-title":"VerifAI: Verified Generative AI. arXiv preprint arXiv:2307.02796","author":"Tang Nan","year":"2023","unstructured":"Nan Tang, Chenyu Yang, Ju Fan, and Lei Cao. 2023. VerifAI: Verified Generative AI. arXiv preprint arXiv:2307.02796 (2023)."},{"key":"e_1_3_2_1_39_1","volume-title":"Elasticsearch: The Definitive Guide. O\u2019Reilly.","author":"Tong Zachary","year":"2015","unstructured":"Zachary Tong. 2015. Elasticsearch: The Definitive Guide. O\u2019Reilly."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/1958824.1958906"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/2629489"},{"key":"e_1_3_2_1_42_1","volume-title":"DBCopilot: Scaling Natural Language Querying to Massive Databases. arXiv preprint arXiv:2312.03463","author":"Wang Tianshu","year":"2023","unstructured":"Tianshu Wang, Hongyu Lin, Xianpei Han, Le Sun, Xiaoyang Chen, Hao Wang, and Zhenyu Zeng. 2023. DBCopilot: Scaling Natural Language Querying to Massive Databases. arXiv preprint arXiv:2312.03463 (2023)."},{"key":"e_1_3_2_1_43_1","volume-title":"Autogen: Enabling next-gen llm applications via multi-agent conversation framework. arXiv preprint arXiv:2308.08155","author":"Wu Qingyun","year":"2023","unstructured":"Qingyun Wu, Gagan Bansal, Jieyu Zhang, Yiran Wu, Shaokun Zhang, Erkang Zhu, Beibin Li, Li Jiang, Xiaoyun Zhang, and Chi Wang. 2023. Autogen: Enabling next-gen llm applications via multi-agent conversation framework. arXiv preprint arXiv:2308.08155 (2023)."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1259"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.naacl-main.45"},{"key":"e_1_3_2_1_46_1","unstructured":"Matei Zaharia Omar Khattab Lingjiao Chen Jared\u00a0Quincy Davis Heather Miller Chris Potts James Zou Michael Carbin Jonathan Frankle Naveen Rao and Ali Ghodsi. 2024. The Shift from Models to Compound AI Systems. https:\/\/bair.berkeley.edu\/blog\/2024\/02\/18\/compound-ai-systems\/."},{"key":"e_1_3_2_1_47_1","volume-title":"Proceedings of the Twenty-Ninth International Conference on International Joint Conferences on Artificial Intelligence. 4039\u20134045","author":"Zhao Yang","year":"2021","unstructured":"Yang Zhao, Jiajun Zhang, Yu Zhou, and Chengqing Zong. 2021. Knowledge graphs enhanced neural machine translation. In Proceedings of the Twenty-Ninth International Conference on International Joint Conferences on Artificial Intelligence. 4039\u20134045."},{"key":"e_1_3_2_1_48_1","volume-title":"Seq2SQL: Generating Structured Queries from Natural Language using Reinforcement Learning. CoRR abs\/1709.00103","author":"Zhong Victor","year":"2017","unstructured":"Victor Zhong, Caiming Xiong, and Richard Socher. 2017. Seq2SQL: Generating Structured Queries from Natural Language using Reinforcement Learning. CoRR abs\/1709.00103 (2017)."}],"event":{"name":"SIGMOD\/PODS '24: International Conference on Management of Data","location":"Santiago AA Chile","acronym":"SIGMOD\/PODS '24","sponsor":["SIGMOD ACM Special Interest Group on Management of Data"]},"container-title":["Proceedings of the Conference on Governance, Understanding and Integration of Data for Effective and Responsible AI"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3665601.3669846","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3665601.3669846","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,23]],"date-time":"2025-08-23T18:35:47Z","timestamp":1755974147000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3665601.3669846"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,6,9]]},"references-count":48,"alternative-id":["10.1145\/3665601.3669846","10.1145\/3665601"],"URL":"https:\/\/doi.org\/10.1145\/3665601.3669846","relation":{},"subject":[],"published":{"date-parts":[[2024,6,9]]},"assertion":[{"value":"2024-06-09","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}