{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T20:10:07Z","timestamp":1765311007753,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":86,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100003725","name":"National Research Foundation of Korea","doi-asserted-by":"publisher","award":["RS-2024-00338772"],"award-info":[{"award-number":["RS-2024-00338772"]}],"id":[{"id":"10.13039\/501100003725","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100014188","name":"Ministry of Science and ICT, South Korea","doi-asserted-by":"publisher","award":["RS-2024-00425354"],"award-info":[{"award-number":["RS-2024-00425354"]}],"id":[{"id":"10.13039\/501100014188","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Institute of Information & communications Technology Planning & Evaluation","award":["IITP-2025-RS-2023-00254129"],"award-info":[{"award-number":["IITP-2025-RS-2023-00254129"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755106","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:30:51Z","timestamp":1761377451000},"page":"1288-1297","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["RA-Touch: Retrieval-Augmented Touch Understanding with Enriched Visual Data"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-7179-9291","authenticated-orcid":false,"given":"Yoorhim","family":"Cho","sequence":"first","affiliation":[{"name":"Sungkyunkwan University, Seoul, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-5890-4102","authenticated-orcid":false,"given":"Hongyeob","family":"Kim","sequence":"additional","affiliation":[{"name":"Sungkyunkwan University, Seoul, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-2630-4654","authenticated-orcid":false,"given":"Semin","family":"Kim","sequence":"additional","affiliation":[{"name":"Sungkyunkwan University, Seoul, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5671-3232","authenticated-orcid":false,"given":"Youjia","family":"Zhang","sequence":"additional","affiliation":[{"name":"Sungkyunkwan University, Seoul, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9971-1501","authenticated-orcid":false,"given":"YunSeok","family":"Choi","sequence":"additional","affiliation":[{"name":"Sungkyunkwan University, Seoul, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1774-9168","authenticated-orcid":false,"given":"Sungeun","family":"Hong","sequence":"additional","affiliation":[{"name":"Sungkyunkwan University, Seoul, Republic of Korea"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Josh Achiam Steven Adler Sandhini Agarwal Lama Ahmad Ilge Akkaya Florencia Leoni Aleman Diogo Almeida Janko Altenschmidt Sam Altman Shyamal Anadkat et al. 2023. Gpt-4 Technical Report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_1_2_1","unstructured":"Parishad BehnamGhader Vaibhav Adlakha Marius Mosbach Dzmitry Bahdanau Nicolas Chapados and Siva Reddy. 2024. LLM2Vec: Large Language Models Are Secretly Powerful Text Encoders. In COLM."},{"key":"e_1_3_2_1_3_1","unstructured":"Andreas Blattmann and Jonas M\u00fcller Bj\u00f6rn Ommer Robin Rombach Kaan Oktay. 2022. Semi-Parametric Neural Image Synthesis. In NeurIPS."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"crossref","unstructured":"J. Bresciani Franziska Dammeier and M. Ernst. 2006. Vision and Touch are Automatically Integrated for the Perception of Sequences of Events. In Journal of Vision.","DOI":"10.1167\/6.5.2"},{"key":"e_1_3_2_1_5_1","volume-title":"Gregory P. Meyer, Yuning Chai, Dennis Park, and Yong Jae Lee.","author":"Cai Mu","year":"2024","unstructured":"Mu Cai, Haotian Liu, Siva Karthik Mustikovela, Gregory P. Meyer, Yuning Chai, Dennis Park, and Yong Jae Lee. 2024. Making Large Multimodal Models Understand Arbitrary Visual Prompts. In CVPR."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2018.2852779"},{"key":"e_1_3_2_1_7_1","unstructured":"R. Calandra Andrew Owens M. Upadhyaya Wenzhen Yuan Justin Lin E. Adelson and S. Levine. 2017. The Feeling of Success: Does Touch Sensing Help Predict Grasp Outcomes?. In CoRL."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"crossref","unstructured":"I. Camponogara and R. Volcic. 2020. Integration of Haptics and Vision in Human Multisensory Grasping. In Cortex.","DOI":"10.1101\/2020.05.12.090647"},{"key":"e_1_3_2_1_9_1","volume-title":"Cohen","author":"Chen Wenhu","year":"2022","unstructured":"Wenhu Chen, Hexiang Hu, Xi Chen, Pat Verga, and William W. Cohen. 2022a. MuRAG: Multimodal Retrieval-Augmented Generator for Open Question Answering over Images and Text. In EMNLP."},{"key":"e_1_3_2_1_10_1","volume-title":"Cohen","author":"Chen Wenhu","year":"2023","unstructured":"Wenhu Chen, Hexiang Hu, Chitwan Saharia, and William W. Cohen. 2023. Re-Imagen: Retrieval-Augmented Text-to-Image Generator. In ICLR."},{"key":"e_1_3_2_1_11_1","volume-title":"Mark Van der Merwe, and Nima Fazeli","author":"Chen Yizhou","year":"2022","unstructured":"Yizhou Chen, Andrea Sipos, Mark Van der Merwe, and Nima Fazeli. 2022b. Visuo-Tactile Transformers for Manipulation. In CoRL."},{"key":"e_1_3_2_1_12_1","volume-title":"Touch100k: A Large-Scale Touch-Language-Vision Dataset for Touch-Centric Multimodal Representation. arXiv preprint arXiv:2406.03813","author":"Cheng Ning","year":"2024","unstructured":"Ning Cheng, Changhao Guan, Jing Gao, Weihao Wang, You Li, Fandong Meng, Jie Zhou, Bin Fang, Jinan Xu, and Wenjuan Han. 2024a. Touch100k: A Large-Scale Touch-Language-Vision Dataset for Touch-Centric Multimodal Representation. arXiv preprint arXiv:2406.03813 (2024)."},{"key":"e_1_3_2_1_13_1","unstructured":"Ning Cheng You Li Jing Gao Bin Fang Jinan Xu and Wenjuan Han. 2024b. Towards Comprehensive Multimodal Perception: Introducing the Touch-Language-Vision Dataset In ICIC. arXiv preprint arXiv:2403.09813."},{"key":"e_1_3_2_1_14_1","volume-title":"Xing","author":"Chiang Wei-Lin","year":"2023","unstructured":"Wei-Lin Chiang, Zhuohan Li, Zi Lin, Ying Sheng, Zhanghao Wu, Hao Zhang, Lianmin Zheng, Siyuan Zhuang, Yonghao Zhuang, Joseph E. Gonzalez, Ion Stoica, and Eric P. Xing. 2023. Vicuna: An Open-Source Chatbot Impressing GPT-4 with 90%* ChatGPT Quality."},{"key":"e_1_3_2_1_15_1","volume-title":"DisCoRD: Discrete Tokens to Continuous Motion via Rectified Flow Decoding. ICCV","author":"Cho Jungbin","year":"2025","unstructured":"Jungbin Cho, Junwan Kim, Jisoo Kim, Minseo Kim, Mingu Kang, Sungeun Hong, Tae-Hyun Oh, and Youngjae Yu. 2025. DisCoRD: Discrete Tokens to Continuous Motion via Rectified Flow Decoding. ICCV (2025)."},{"key":"e_1_3_2_1_16_1","first-page":"217","article-title":"Intra-Inter Modal Attention Blocks for RGB-D Semantic Segmentation","author":"Choi Soyun","year":"2023","unstructured":"Soyun Choi, Youjia Zhang, and Sungeun Hong. 2023. Intra-Inter Modal Attention Blocks for RGB-D Semantic Segmentation. In ICMR. 217-225.","journal-title":"ICMR."},{"key":"e_1_3_2_1_17_1","volume-title":"Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, and Steven Hoi.","author":"Dai Wenliang","year":"2023","unstructured":"Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, and Steven Hoi. 2023. InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning. In NeurIPS."},{"key":"e_1_3_2_1_18_1","first-page":"1","article-title":"CLAP Learning Audio Concepts from Natural Language Supervision","author":"Elizalde Benjamin","year":"2023","unstructured":"Benjamin Elizalde, Soham Deshmukh, Mahmoud Al Ismail, and Huaming Wang. 2023. CLAP Learning Audio Concepts from Natural Language Supervision. In ICASSP. 1-5.","journal-title":"ICASSP."},{"key":"e_1_3_2_1_19_1","first-page":"28","article-title":"Not All Inputs Are Valid: Towards Open-Set Video Moment Retrieval using Language","author":"Fang Xiang","year":"2024","unstructured":"Xiang Fang, Wanlong Fang, Daizong Liu, Xiaoye Qu, Jianfeng Dong, Pan Zhou, Renfu Li, Zichuan Xu, Lixing Chen, Panpan Zheng, et al., 2024. Not All Inputs Are Valid: Towards Open-Set Video Moment Retrieval using Language. In ACM MM. 28-37.","journal-title":"ACM MM."},{"key":"e_1_3_2_1_20_1","unstructured":"Ruoxuan Feng Di Hu Wenke Ma and Xuelong Li. 2024. Play to the Score: Stage-Guided Dynamic Multi-Sensory Fusion for Robotic Manipulation. In CoRL."},{"key":"e_1_3_2_1_21_1","unstructured":"Letian Fu Gaurav Datta Huang Huang Will Panitch Jaimyn Drake Joseph Ortiz Mustafa Mukadam Mike Lambeta Roberto Calandra and Ken Goldberg. 2024. A Touch Vision and Language Dataset for Multimodal Alignment. In ICML."},{"key":"e_1_3_2_1_22_1","unstructured":"Ruohan Gao Zilin Si Yen-Yu Chang Samuel Clarke Jeannette Bohg Li Fei-Fei Wenzhen Yuan and Jiajun Wu. 2022. ObjectFolder 2.0: A Multisensory Object Dataset for Sim2Real Transfer. In CVPR."},{"key":"e_1_3_2_1_23_1","volume-title":"REALM: Retrieval-Augmented Language Model Pre-Training. In ICML.","author":"Guu Kelvin","year":"2020","unstructured":"Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat, and Ming-Wei Chang. 2020. REALM: Retrieval-Augmented Language Model Pre-Training. In ICML."},{"key":"e_1_3_2_1_24_1","first-page":"353","article-title":"CBVMR","author":"Hong Sungeun","year":"2018","unstructured":"Sungeun Hong, Woobin Im, and Hyun S Yang. 2018. CBVMR: Content-Based Video-Music Retrieval Using Soft Intra-Modal Structure Constraint. In ICMR. 353-361.","journal-title":"Content-Based Video-Music Retrieval Using Soft Intra-Modal Structure Constraint. In ICMR."},{"key":"e_1_3_2_1_25_1","unstructured":"Aaron Hurst Adam Lerer Adam P Goucher Adam Perelman Aditya Ramesh Aidan Clark AJ Ostrow Akila Welihinda Alan Hayes Alec Radford et al. 2024. Gpt-4o System Card. arXiv preprint arXiv:2410.21276 (2024)."},{"key":"e_1_3_2_1_26_1","unstructured":"Gabriel Ilharco Mitchell Wortsman Nicholas Carlini Rohan Taori Achal Dave Vaishaal Shankar Hongseok Namkoong John Miller Hannaneh Hajishirzi Ali Farhadi and Ludwig Schmidt. 2021. OpenCLIP."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"crossref","unstructured":"M. Ittyerah and L. Marks. 2007. Memory for Curvature of Objects: Haptic Touch vs. Vision. In British Journal of Psychology.","DOI":"10.1348\/000712606X171531"},{"key":"e_1_3_2_1_28_1","unstructured":"M. G. Jones Alexandra Bokinsky T. Tretter and Atsuko Negishi. 2005. A Comparison of Learning with Haptic and Visual Modalities."},{"key":"e_1_3_2_1_29_1","volume-title":"Self-Supervised Visuo-Tactile Pretraining to Locate and Follow Garment Features. arXiv preprint arXiv:2209.13042","author":"Kerr Justin","year":"2022","unstructured":"Justin Kerr, Huang Huang, Albert Wilcox, Ryan Hoque, Jeffrey Ichnowski, Roberto Calandra, and Ken Goldberg. 2022. Self-Supervised Visuo-Tactile Pretraining to Locate and Follow Garment Features. arXiv preprint arXiv:2209.13042 (2022)."},{"key":"e_1_3_2_1_30_1","first-page":"13681","article-title":"Question-Aware Gaussian Experts for Audio-Visual Question Answering","author":"Kim Hongyeob","year":"2025","unstructured":"Hongyeob Kim, Inyoung Jung, Dayoon Suh, Youjia Zhang, Sangmin Lee, and Sungeun Hong. 2025. Question-Aware Gaussian Experts for Audio-Visual Question Answering. In CVPR. 13681-13690.","journal-title":"CVPR."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2020.2977257"},{"key":"e_1_3_2_1_32_1","unstructured":"Visual Layer. 2024. imagenet-1k-vl-enriched. https:\/\/huggingface.co\/datasets\/visual-layer\/imagenet-1k-vl-enriched."},{"key":"e_1_3_2_1_33_1","unstructured":"Chankyu Lee Rajarshi Roy Mengyao Xu Jonathan Raiman Mohammad Shoeybi Bryan Catanzaro and Wei Ping. 2025. NV-Embed: Improved Techniques for Training LLMs as Generalist Embedding Models. In ICLR."},{"key":"e_1_3_2_1_34_1","unstructured":"Jiyoung Lee Seungryong Kim Sunok Kim Jungin Park and K. Sohn. 2019. Context-Aware Emotion Recognition Networks. In ICCV."},{"key":"e_1_3_2_1_35_1","volume-title":"FVTTS: Face Based Voice Synthesis for Text-to-Speech. INTERSPEECH","author":"Lee Minyoung","year":"2024","unstructured":"Minyoung Lee, Eunil Park, and Sungeun Hong. 2024. FVTTS: Face Based Voice Synthesis for Text-to-Speech. INTERSPEECH (2024), 4953-4957."},{"key":"e_1_3_2_1_36_1","volume-title":"Tim Rockt\u00e4schel, Sebastian Riedel, and Douwe Kiela.","author":"Lewis Patrick","year":"2020","unstructured":"Patrick Lewis, Ethan Perez, Aleksandra Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich K\u00fcttler, Mike Lewis, Wen tau Yih, Tim Rockt\u00e4schel, Sebastian Riedel, and Douwe Kiela. 2020. Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks. In NeurIPS."},{"key":"e_1_3_2_1_37_1","volume-title":"Boosting Visual Question Answering with Context-aware Knowledge Aggregation. ACM MM","author":"Li Guohao","year":"2020","unstructured":"Guohao Li, Xin Wang, and Wenwu Zhu. 2020. Boosting Visual Question Answering with Context-aware Knowledge Aggregation. ACM MM (2020), 1227-1235."},{"volume-title":"ViHOPE: Visuotactile In-Hand Object 6D Pose Estimation With Shape Completion","author":"Li Hongyu","key":"e_1_3_2_1_38_1","unstructured":"Hongyu Li, Snehal Dikhale, Soshi Iba, and Nawid Jamali. 2023a. ViHOPE: Visuotactile In-Hand Object 6D Pose Estimation With Shape Completion. In IEEE Robotics and Automation Letters."},{"key":"e_1_3_2_1_39_1","unstructured":"Hao Li Yizhi Zhang Junzhe Zhu Shaoxiong Wang Michelle A. Lee Huazhe Xu E. Adelson Li Fei-Fei Ruohan Gao and Jiajun Wu. 2022. See Hear and Feel: Smart Sensory Fusion for Robotic Manipulation. In CoRL."},{"key":"e_1_3_2_1_40_1","unstructured":"Junnan Li Dongxu Li Silvio Savarese and Steven Hoi. 2023b. BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models. In ICML."},{"key":"e_1_3_2_1_41_1","unstructured":"Jiapeng Li Ping Wei Wenjuan Han and Lifeng Fan. 2023c. IntentQA: Context-Aware Video Intent Reasoning. In ICCV."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"crossref","unstructured":"Rui Li Robert W. Platt Wenzhen Yuan A. T. Pas Nathan Roscup M. Srinivasan and E. Adelson. 2014. Localization and Manipulation of Small Parts using GelSight Tactile Sensing. In IROS.","DOI":"10.1109\/IROS.2014.6943123"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2023.102000"},{"key":"e_1_3_2_1_44_1","unstructured":"Yunzhu Li Jun-Yan Zhu Russ Tedrake and Antonio Torralba. 2019. Connecting Touch and Vision via Cross-Modal Prediction. In CVPR."},{"key":"e_1_3_2_1_45_1","unstructured":"Weizhe Lin Jinghong Chen Jingbiao Mei Alexandru Coca and Bill Byrne. 2023b. Fine-grained Late-interaction Multi-modal Retrieval for Retrieval Augmented Visual Question Answering. In NeurIPS."},{"key":"e_1_3_2_1_46_1","first-page":"2227","article-title":"Relaxing Contrastiveness in Multimodal Representation Learning","author":"Lin Zudi","year":"2023","unstructured":"Zudi Lin, Erhan Bas, Kunwar Yashraj Singh, Gurumurthy Swaminathan, and Rahul Bhotika. 2023a. Relaxing Contrastiveness in Multimodal Representation Learning. In WACV. 2227-2236.","journal-title":"WACV."},{"key":"e_1_3_2_1_47_1","first-page":"26296","article-title":"Improved Baselines with Visual Instruction Tuning","author":"Liu Haotian","year":"2024","unstructured":"Haotian Liu, Chunyuan Li, Yuheng Li, and Yong Jae Lee. 2024. Improved Baselines with Visual Instruction Tuning. In CVPR. 26296-26306.","journal-title":"CVPR."},{"key":"e_1_3_2_1_48_1","first-page":"34892","article-title":"Visual Instruction Tuning","volume":"36","author":"Liu Haotian","year":"2023","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2023a. Visual Instruction Tuning. In NeurIPS, Vol. 36. 34892-34916.","journal-title":"NeurIPS"},{"key":"e_1_3_2_1_49_1","unstructured":"Zhenghao Liu Chenyan Xiong Yuanhuiyi Lv Zhiyuan Liu and Ge Yu. 2023b. Universal Vision-Language Dense Retrieval: Learning A Unified Representation Space for Multi-Modal Retrieval. In ICLR."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/UR61395.2024.10597462"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1038\/s41467-024-51261-5"},{"key":"e_1_3_2_1_52_1","unstructured":"Do June Min Karel Mundnich Andy Lapastora Erfan Soltanmohammadi S. Ronanki and Kyu J Han. 2025. Speech Retrieval-Augmented Generation without Automatic Speech Recognition. In ICASSP."},{"volume-title":"Multiresolution Gray-Scale and Rotation Invariant Texture Classification with Local Binary Patterns","author":"Ojala T.","key":"e_1_3_2_1_53_1","unstructured":"T. Ojala, M. Pietik\u00e4inen, and Topi M\u00e4enp\u00e4\u00e4. 2002. Multiresolution Gray-Scale and Rotation Invariant Texture Classification with Local Binary Patterns. In IEEE TPAMI."},{"key":"e_1_3_2_1_54_1","volume-title":"Representation Learning with Contrastive Predictive Coding. arXiv preprint arXiv:1807.03748","author":"van den Oord Aaron","year":"2018","unstructured":"Aaron van den Oord, Yazhe Li, and Oriol Vinyals. 2018. Representation Learning with Contrastive Predictive Coding. arXiv preprint arXiv:1807.03748 (2018)."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1186\/s40486-023-00191-w"},{"key":"e_1_3_2_1_56_1","first-page":"3987","article-title":"Visual-Tactile Multimodality for Following Deformable Linear Objects using Reinforcement Learning","author":"Pecyna Leszek","year":"2022","unstructured":"Leszek Pecyna, Siyuan Dong, and Shan Luo. 2022. Visual-Tactile Multimodality for Following Deformable Linear Objects using Reinforcement Learning. In IROS. IEEE, 3987-3994.","journal-title":"IROS. IEEE"},{"key":"e_1_3_2_1_57_1","volume-title":"Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning Transferable Visual Models from Natural Language Supervision. In ICML."},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"crossref","unstructured":"Rita Ramos Desmond Elliott and Bruno Martins. 2023. Retrieval-Augmented Image Captioning. In ACL.","DOI":"10.18653\/v1\/2023.eacl-main.266"},{"key":"e_1_3_2_1_59_1","first-page":"5399","article-title":"Retrieval-Based Knowledge Augmented Vision Language Pre-Training","author":"Rao Jiahua","year":"2023","unstructured":"Jiahua Rao, Zifei Shan, Longpo Liu, Yao Zhou, and Yuedong Yang. 2023. Retrieval-Based Knowledge Augmented Vision Language Pre-Training. In ACM MM. 5399-5409.","journal-title":"ACM MM."},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-015-0816-y"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"crossref","unstructured":"Carmelo Sferrazza and R. D'Andrea. 2019. Design Motivation and Evaluation of a Full-Resolution Optical Tactile Sensor. In Sensors.","DOI":"10.3390\/s19040928"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"crossref","unstructured":"K. Shimonomura. 2019. Tactile Image Sensors Employing Camera: A Review. In Sensors.","DOI":"10.3390\/s19183933"},{"key":"e_1_3_2_1_63_1","first-page":"14193","article-title":"3D Shape Reconstruction from Vision and Touch","volume":"33","author":"Smith Edward","year":"2020","unstructured":"Edward Smith, Roberto Calandra, Adriana Romero, Georgia Gkioxari, David Meger, Jitendra Malik, and Michal Drozdzal. 2020. 3D Shape Reconstruction from Vision and Touch. NeurIPS, Vol. 33 (2020), 14193-14206.","journal-title":"NeurIPS"},{"key":"e_1_3_2_1_64_1","volume-title":"Gonzalez","author":"Stone K.","year":"2015","unstructured":"K. Stone and Claudia L. R. Gonzalez. 2015. The Contributions of Vision and Haptics to Reaching and Grasping. In Frontiers in Psychology."},{"key":"e_1_3_2_1_65_1","first-page":"7073","article-title":"ShapeMap 3-D","author":"Suresh Sudharshan","year":"2022","unstructured":"Sudharshan Suresh, Zilin Si, Joshua G Mangelson, Wenzhen Yuan, and Michael Kaess. 2022. ShapeMap 3-D: Efficient Shape Mapping Through Dense Touch and Vision. In ICRA. IEEE, 7073-7080.","journal-title":"Efficient Shape Mapping Through Dense Touch and Vision. In ICRA. IEEE"},{"key":"e_1_3_2_1_66_1","volume-title":"Salman Khan, Michael Felsberg, Mubarak Shah, and Fahad Shahbaz Khan.","author":"Thawakar Omkar","year":"2024","unstructured":"Omkar Thawakar, Muzammal Naseer, Rao Muhammad Anwer, Salman Khan, Michael Felsberg, Mubarak Shah, and Fahad Shahbaz Khan. 2024. Composed Video Retrieval via Enriched Context and Discriminative Embeddings. In CVPR."},{"key":"e_1_3_2_1_67_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale et al. 2023. Llama 2: Open Foundation and Fine-Tuned Chat Models. arXiv preprint arXiv:2307.09288 (2023)."},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2021.11.017"},{"volume-title":"UniIR: Training and Benchmarking Universal Multimodal Information Retrievers","author":"Wei Cong","key":"e_1_3_2_1_69_1","unstructured":"Cong Wei, Yang Chen, Haonan Chen, Hexiang Hu, Ge Zhang, Jie Fu, Alan Ritter, and Wenhu Chen. 2024. UniIR: Training and Benchmarking Universal Multimodal Information Retrievers. In ECCV. Springer, 387-404."},{"key":"e_1_3_2_1_70_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664524.3675369"},{"key":"e_1_3_2_1_71_1","unstructured":"Chen-Wei Xie Siyang Sun Xiong Xiong Yun Zheng Deli Zhao and Jingren Zhou. 2023. RA-CLIP: Retrieval Augmented Contrastive Language-Image Pre-Training. In CVPR."},{"key":"e_1_3_2_1_72_1","volume-title":"Wan Shou, Dongyi Wang, and Yu She.","author":"Xu Zhengtong","year":"2024","unstructured":"Zhengtong Xu, Raghava Uppuluri, Xinwei Zhang, Cael Fitch, Philip Glen Crandall, Wan Shou, Dongyi Wang, and Yu She. 2024. UniT: Unified Tactile Representation for Robot Learning. In arXiv.org."},{"volume-title":"IEEE-RAS International Conference on Humanoid Robots.","author":"Yamaguchi Akihiko","key":"e_1_3_2_1_73_1","unstructured":"Akihiko Yamaguchi and C. Atkeson. 2016. Combining Finger Vision and Optical Tactile Sensing: Reducing and Handling Errors while Cutting Vegetables. In IEEE-RAS International Conference on Humanoid Robots."},{"key":"e_1_3_2_1_74_1","doi-asserted-by":"crossref","unstructured":"Fengyu Yang Chao Feng Ziyang Chen Hyoungseob Park Daniel Wang Yiming Dou Ziyao Zeng Xien Chen Rit Gangopadhyay Andrew Owens and Alex Wong. 2024. Binding Touch to Everything: Learning Unified Multimodal Tactile Representations. In CVPR.","DOI":"10.1109\/CVPR52733.2024.02488"},{"key":"e_1_3_2_1_75_1","unstructured":"Fengyu Yang Chenyang Ma Jiacheng Zhang Jing Zhu Wenzhen Yuan and Andrew Owens. 2022. Touch and Go: Learning from Human-Collected Vision and Touch. In NeurIPS."},{"key":"e_1_3_2_1_76_1","volume-title":"Octopi: Object Property Reasoning with Large Tactile-Language Models. arXiv preprint arXiv:2405.02794","author":"Yu Samson","year":"2024","unstructured":"Samson Yu, Kelvin Lin, Anxing Xiao, Jiafei Duan, and Harold Soh. 2024. Octopi: Object Property Reasoning with Large Tactile-Language Models. arXiv preprint arXiv:2405.02794 (2024)."},{"key":"e_1_3_2_1_77_1","doi-asserted-by":"crossref","unstructured":"Wenzhen Yuan Siyuan Dong and E. Adelson. 2017. GelSight: High-Resolution Robot Tactile Sensors for Estimating Geometry and Force. In Sensors.","DOI":"10.3390\/s17122762"},{"key":"e_1_3_2_1_78_1","first-page":"12310","article-title":"Barlow Twins: Self-Supervised Learning via Redundancy Reduction","author":"Zbontar Jure","year":"2021","unstructured":"Jure Zbontar, Li Jing, Ishan Misra, Yann LeCun, and St\u00e9phane Deny. 2021. Barlow Twins: Self-Supervised Learning via Redundancy Reduction. In ICML. PMLR, 12310-12320.","journal-title":"ICML. PMLR"},{"key":"e_1_3_2_1_79_1","first-page":"710","volume-title":"IEEE TPAMI","volume":"44","author":"Zha Zheng-Jun","year":"2019","unstructured":"Zheng-Jun Zha, Daqing Liu, Hanwang Zhang, Yongdong Zhang, and Feng Wu. 2019. Context-Aware Visual Policy Network for Fine-Grained Image Captioning. IEEE TPAMI, Vol. 44, 2 (2019), 710-722."},{"key":"e_1_3_2_1_80_1","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2023.3296371"},{"key":"e_1_3_2_1_81_1","doi-asserted-by":"crossref","unstructured":"Mingyuan Zhang Xinying Guo Liang Pan Zhongang Cai Fangzhou Hong Huirong Li Lei Yang and Ziwei Liu. 2023. ReMoDiffuse: Retrieval-Augmented Motion Diffusion Model. In ICCV.","DOI":"10.1109\/ICCV51070.2023.00040"},{"key":"e_1_3_2_1_82_1","doi-asserted-by":"crossref","unstructured":"Qi Zhang Zhen Lei Zhaoxiang Zhang and S. Li. 2020. Context-Aware Attention Network for Image-Text Retrieval. In CVPR.","DOI":"10.1109\/CVPR42600.2020.00359"},{"key":"e_1_3_2_1_83_1","unstructured":"Renrui Zhang Jiaming Han Chris Liu Aojun Zhou Pan Lu Yu Qiao Hongsheng Li and Peng Gao. 2024. LLaMA-Adapter: Efficient Fine-tuning of Large Language Models with Zero-initialized Attention. In ICLR."},{"key":"e_1_3_2_1_84_1","first-page":"90","article-title":"Spatio-Channel Attention Blocks for Cross-Modal Crowd Counting","author":"Zhang Youjia","year":"2022","unstructured":"Youjia Zhang, Soyun Choi, and Sungeun Hong. 2022. Spatio-Channel Attention Blocks for Cross-Modal Crowd Counting. In ACCV. 90-107.","journal-title":"ACCV."},{"key":"e_1_3_2_1_85_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2025.111376"},{"key":"e_1_3_2_1_86_1","volume-title":"CAT-TPT: Class-Agnostic Text-based Test-time Prompt Tuning for Vision-Language Models. IJCV","author":"Zhang Youjia","year":"2025","unstructured":"Youjia Zhang, Huiling Liu, Youngeun Kim, and Sungeun Hong. 2025b. CAT-TPT: Class-Agnostic Text-based Test-time Prompt Tuning for Vision-Language Models. IJCV (2025), 1-23."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755106","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T20:05:06Z","timestamp":1765310706000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755106"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":86,"alternative-id":["10.1145\/3746027.3755106","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755106","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}