{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,12]],"date-time":"2025-12-12T13:09:53Z","timestamp":1765544993008,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":48,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681010","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:33Z","timestamp":1729925973000},"page":"9234-9243","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["AdaCoder: Adaptive Prompt Compression for Programmatic Visual Question Answering"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-6266-9477","authenticated-orcid":false,"given":"Mahiro","family":"Ukai","sequence":"first","affiliation":[{"name":"Institute of Science Tokyo, Tokyo, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7415-3120","authenticated-orcid":false,"given":"Shuhei","family":"Kurita","sequence":"additional","affiliation":[{"name":"National Institute of Informatics, Tokyo, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0799-4269","authenticated-orcid":false,"given":"Atsushi","family":"Hashimoto","sequence":"additional","affiliation":[{"name":"OMRON SINIC X, Tokyo, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9014-1389","authenticated-orcid":false,"given":"Yoshitaka","family":"Ushiku","sequence":"additional","affiliation":[{"name":"OMRON SINIC X, Tokyo, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9761-4142","authenticated-orcid":false,"given":"Nakamasa","family":"Inoue","sequence":"additional","affiliation":[{"name":"Institute of Science Tokyo, Tokyo, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00636"},{"key":"e_1_3_2_1_2_1","unstructured":"Anthropic Claude API. 2023. https:\/\/claude.ai."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.279"},{"key":"e_1_3_2_1_4_1","volume-title":"Proc. International Conference on Learning Representations (ICLR).","author":"Bolya Daniel","year":"2023","unstructured":"Daniel Bolya, Cheng-Yang Fu, Xiaoliang Dai, Peizhao Zhang, Christoph Feichtenhofer, and Judy Hoffman. 2023. Token Merging: Your ViT But Faster. In Proc. International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_5_1","volume-title":"Proc. Annual Conference on Neural Information Processing Systems (NeurIPS). 1877--1901","author":"Brown Tom","year":"2020","unstructured":"Tom Brown, Benjamin Mann, Nick Ryder, et al. 2020. Language models are few-shot learners. In Proc. Annual Conference on Neural Information Processing Systems (NeurIPS). 1877--1901."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612536"},{"key":"e_1_3_2_1_7_1","volume-title":"VTQA2023: ACM Multimedia 2023 Visual Text Question Answering Challenge. In Proc. ACM International Conference on Multimedia (ACMMM). 9646--9650","author":"Chen Kang","year":"2023","unstructured":"Kang Chen, Tianli Zhao, and Xiangqian Wu. 2023. VTQA2023: ACM Multimedia 2023 Visual Text Question Answering Challenge. In Proc. ACM International Conference on Multimedia (ACMMM). 9646--9650."},{"key":"e_1_3_2_1_8_1","unstructured":"Mark Chen Jerry Tworek et al. 2021. Evaluating Large Language Models Trained on Code. arXiv2107.03374 (2021)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.232"},{"key":"e_1_3_2_1_10_1","volume-title":"Proc. European Conference on Computer Vision (ECCV).","author":"Fu Xiang","year":"2024","unstructured":"Xiang Fu, Yuxin Hu, Baoxu Li, Yasheng Feng, Hao Wang, Xian Lin, Dan Roth, Noah A. Smith, Wei-Chiu Ma, and Ranjay Krishna. 2024. BLINK: Multimodal Large Language Models Can See but Not Perceive. In Proc. European Conference on Computer Vision (ECCV)."},{"key":"e_1_3_2_1_11_1","volume-title":"Proc. European Conference on Computer Vision (ECCV).","author":"Ge Jiaxin","year":"2024","unstructured":"Jiaxin Ge, Sanjay Subramanian, Baifeng Shi, Roei Herzig, and Trevor Darrell. 2024. Recursive Visual Programming. In Proc. European Conference on Computer Vision (ECCV)."},{"key":"e_1_3_2_1_12_1","volume-title":"Proc. Annual Conference on Neural Information Processing Systems (NeurIPS).","author":"Ge Tao","year":"2022","unstructured":"Tao Ge, Jing Hu, Li Dong, Shaoguang Mao, Yan Xia, Xun Wang, Si-Qing Chen, and Furu Wei. 2022. Extensible Prompts for Language Models on Zero-shot Language Style Customization. In Proc. Annual Conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_2_1_13_1","volume-title":"Proc. International Conference on Machine Learning (ICML). 3690--3699","author":"Goyal Saurabh","year":"2020","unstructured":"Saurabh Goyal, Anamitra Roy Choudhury, Saurabh Raje, Venkatesan T. Chakaravarthy, Yogish Sabharwal, and Ashish Verma. 2020. Power-bert: Accelerating BERT inference via progressive word-vector elimination. In Proc. International Conference on Machine Learning (ICML). 3690--3699."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.670"},{"key":"e_1_3_2_1_15_1","volume-title":"Proc. IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR).","author":"Gupta Tanmay","year":"2022","unstructured":"Tanmay Gupta and Aniruddha Kembhavi. 2022. Visual Programming: Compositional visual reasoning without training. In Proc. IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00916"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP42928.2021.9506539"},{"volume-title":"Proc. IEEE\/CVF International Conference on Computer Vision (ICCV).","author":"Drew","key":"e_1_3_2_1_18_1","unstructured":"Drew A. Hudson and Christopher D. Manning. 2019. GQA: A New Dataset for Real-World Visual Reasoning and Compositional Question Answering. In Proc. IEEE\/CVF International Conference on Computer Vision (ICCV)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.825"},{"key":"e_1_3_2_1_20_1","volume-title":"Proc. Findings of the Association for Computational Linguistics (ACL Findings).","author":"Jin Ziqi","year":"2023","unstructured":"Ziqi Jin and Wei Lu. 2023. Tab-CoT: Zero-shot Tabular Chain of Thought. In Proc. Findings of the Association for Computational Linguistics (ACL Findings)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.508"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3534678.3539260"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612389"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP42928.2021.9506796"},{"key":"e_1_3_2_1_25_1","volume-title":"Proc. Annual Conference of the North American Chapter of the Association for Computational Linguistics (NAACL).","author":"Li Liunian Harold","year":"2021","unstructured":"Liunian Harold Li, Haoxuan You, Zhecan Wang, et al. 2021. Unsupervised Visionand-Language Pre-training Without Parallel Images and Captions. In Proc. Annual Conference of the North American Chapter of the Association for Computational Linguistics (NAACL)."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01069"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548387"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.9"},{"key":"e_1_3_2_1_29_1","volume-title":"Proc. Annual Conference on Neural Information Processing Systems (NeurIPS).","author":"Mu Jesse","year":"2023","unstructured":"Jesse Mu, Xiang Lisa Li, and Noah Goodman. 2023. Learning to Compress Prompts with Gist Tokens. In Proc. Annual Conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_2_1_30_1","volume-title":"LLMLingua-2: Data Distillation for Efficient and Faithful Task-Agnostic Prompt Compression. arXiv preprint arXiv:2403.12968","author":"Pan Zhuoshi","year":"2024","unstructured":"Zhuoshi Pan, Qianhui Wu, Huiqiang Jiang, Menglin Xia, Xufang Luo, Jue Zhang, Qingwei Lin, Victor R\u00fchle, Yuqing Yang, Chin-Yew Lin, H. Vicky Zhao, Lili Qiu, and Dongmei Zhang. 2024. LLMLingua-2: Data Distillation for Efficient and Faithful Task-Agnostic Prompt Compression. arXiv preprint arXiv:2403.12968 (2024)."},{"key":"e_1_3_2_1_31_1","unstructured":"Baptiste Roziere Jonas Gehring Fabian Gloeckle et al. 2023. Code llama: Open foundation models for code. arXiv preprint arXiv:2308.12950 (2023)."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP46576.2022.9897381"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP51287.2024.10648180"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-short.65"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1644"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01092"},{"key":"e_1_3_2_1_37_1","volume-title":"Gemini: A family of highly capable multimodal models. arXiv preprint arXiv:2312.11805","author":"Team Gemini","year":"2023","unstructured":"Gemini Team, Rohan Anil, et al. 2023. Gemini: A family of highly capable multimodal models. arXiv preprint arXiv:2312.11805 (2023)."},{"key":"e_1_3_2_1_38_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone et al. 2023. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)."},{"key":"e_1_3_2_1_39_1","volume-title":"Proc. ACM International Conference on Multimedia (ACMMM). 2281--2289","author":"Wang Qingqing","year":"2022","unstructured":"Qingqing Wang, Liqiang Xiao, Yue Lu, Yaohui Jin, and Hao He. 2022. Towards Reasoning Ability in Scene Text Visual Question Answering. In Proc. ACM International Conference on Multimedia (ACMMM). 2281--2289."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.findings-emnlp.412"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP46576.2022.9897277"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548291"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.10"},{"key":"e_1_3_2_1_44_1","volume-title":"Proc. Annual Conference on Neural Information Processing Systems (NeurIPS).","author":"Yao Shunyu","year":"2023","unstructured":"Shunyu Yao, Dian Yu, Jeffrey Zhao, et al. 2023. Tree of Thoughts: Deliberate Problem Solving with Large Language Models. In Proc. Annual Conference on Neural Information Processing Systems (NeurIPS)."},{"volume-title":"Proc. European Conference on Computer Vision (ECCV). 69--85","author":"Yu Licheng","key":"e_1_3_2_1_45_1","unstructured":"Licheng Yu, Patrick Poirson, Shan Yang, Alexander C. Berg, and Tamara L. Berg. 2016. Modeling context in referring expressions. In Proc. European Conference on Computer Vision (ECCV). 69--85."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612222"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP46576.2022.9897563"},{"key":"e_1_3_2_1_48_1","volume-title":"Proc. Annual Conference on Neural Information Processing Systems (NeurIPS).","author":"Zhang Kai","year":"2023","unstructured":"Kai Zhang, Li Mo, Wei Chen, Hao Sun, and Yejin Su. 2023. MagicBrush: A manually annotated dataset for instruction-guided image editing. In Proc. Annual Conference on Neural Information Processing Systems (NeurIPS)."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Melbourne VIC Australia","acronym":"MM '24"},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681010","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681010","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:36Z","timestamp":1750295856000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681010"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":48,"alternative-id":["10.1145\/3664647.3681010","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681010","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}