{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:46:55Z","timestamp":1765309615573,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":40,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755268","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:26:38Z","timestamp":1761377198000},"page":"3972-3980","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Video-to-Image Affordance Grounding via Visual Conceptual Learning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-4092-0633","authenticated-orcid":false,"given":"Zhiyuan","family":"Fan","sequence":"first","affiliation":[{"name":"School of Computer Science, Northwestern Polytechnical University, Xi'an, Shaanxi, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-7877-2556","authenticated-orcid":false,"given":"Keyi","family":"Liang","sequence":"additional","affiliation":[{"name":"School of Computer Science, Northwestern Polytechnical University, Xi'an, Shaanxi, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Can Visual Foundation Models Achieve Long-term Point Tracking? arXiv preprint arXiv:2408.13575","author":"Aydemir G\u00f6rkay","year":"2024","unstructured":"G\u00f6rkay Aydemir, Weidi Xie, and Fatma G\u00fcney. 2024. Can Visual Foundation Models Achieve Long-term Point Tracking? arXiv preprint arXiv:2408.13575 (2024)."},{"key":"e_1_3_2_1_2_1","volume-title":"The affordance-matching hypothesis: how objects guide action understanding and prediction. Frontiers in human neuroscience","author":"Bach Patric","year":"2014","unstructured":"Patric Bach, Toby Nicholson, and Matthew Hudson. 2014. The affordance-matching hypothesis: how objects guide action understanding and prediction. Frontiers in human neuroscience, Vol. 8 (2014), 254."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01324"},{"key":"e_1_3_2_1_4_1","volume-title":"What do different evaluation metrics tell us about saliency models? IEEE transactions on pattern analysis and machine intelligence","author":"Bylinskii Zoya","year":"2018","unstructured":"Zoya Bylinskii, Tilke Judd, Aude Oliva, Antonio Torralba, and Fr\u00e9do Durand. 2018. What do different evaluation metrics tell us about saliency models? IEEE transactions on pattern analysis and machine intelligence, Vol. 41, 3 (2018), 740-757."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00657"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00950"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2020.2991965"},{"key":"e_1_3_2_1_9_1","volume-title":"Efros","author":"Delaitre Vincent","year":"2012","unstructured":"Vincent Delaitre, David F. Fouhey, Ivan Laptev, Josef Sivic, Abhinav Gupta, and Alexei A. Efros. 2012. Scene Semantics from Long-Term Observation of People. In Computer Vision - ECCV 2012, Andrew Fitzgibbon, Svetlana Lazebnik, Pietro Perona, Yoichi Sato, and Cordelia Schmid (Eds.). Springer Berlin Heidelberg, Berlin, Heidelberg, 284-298."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01377"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01871"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00228"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10161571"},{"volume-title":"The Ecological Approach to Visual Perception. Houghton Mifflin","author":"Gibson James J.","key":"e_1_3_2_1_14_1","unstructured":"James J. Gibson. 1979. The Ecological Approach to Visual Perception. Houghton Mifflin, Boston."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3446370"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01878"},{"key":"e_1_3_2_1_18_1","volume-title":"Anticipating human activities using object affordances for reactive robotic response","author":"Koppula Hema S","year":"2015","unstructured":"Hema S Koppula and Ashutosh Saxena. 2015. Anticipating human activities using object affordances for reactive robotic response. IEEE transactions on pattern analysis and machine intelligence, Vol. 38, 1 (2015), 14-29."},{"key":"e_1_3_2_1_19_1","volume-title":"Jianghao Li, and Marcelo H Ang Jr.","author":"Lee Min Young","year":"2024","unstructured":"Min Young Lee, Christina Dao Wen Lee, Jianghao Li, and Marcelo H Ang Jr. 2024. DINO-MOT: 3D Multi-Object Tracking with Visual Foundation Model for Pedestrian Re-Identification using Visual Memory Mechanism. IEEE Robotics and Automation Letters (2024)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01051"},{"key":"e_1_3_2_1_21_1","volume-title":"Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101","author":"Loshchilov Ilya","year":"2017","unstructured":"Ilya Loshchilov and Frank Hutter. 2017. Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00229"},{"key":"e_1_3_2_1_23_1","volume-title":"Learning visual affordance grounding from demonstration videos","author":"Luo Hongchen","year":"2023","unstructured":"Hongchen Luo, Wei Zhai, Jing Zhang, Yang Cao, and Dacheng Tao. 2023. Learning visual affordance grounding from demonstration videos. IEEE Transactions on Neural Networks and Learning Systems (2023)."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-023-01962-z"},{"key":"e_1_3_2_1_25_1","first-page":"4373","article-title":"Learning relational affordance models for robots in multi-object manipulation tasks. In 2012 ieee international conference on robotics and automation","author":"Moldovan Bogdan","year":"2012","unstructured":"Bogdan Moldovan, Plinio Moreno, Martijn Van Otterlo, Jos\u00e9 Santos-Victor, and Luc De Raedt. 2012. Learning relational affordance models for robots in multi-object manipulation tasks. In 2012 ieee international conference on robotics and automation. IEEE, 4373-4378.","journal-title":"IEEE"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00878"},{"volume-title":"The Psychology of Everyday Things","author":"Norman Donald A.","key":"e_1_3_2_1_27_1","unstructured":"Donald A. Norman. 1988. The Psychology of Everyday Things. Basic Books, New York. Page 9."},{"key":"e_1_3_2_1_28_1","unstructured":"Maxime Oquab Timoth\u00e9e Darcet Th\u00e9o Moutakanni Huy Vo Marc Szafraniec Vasil Khalidov Pierre Fernandez Daniel Haziza Francisco Massa Alaaeldin El-Nouby et al. 2023. Dinov2: Learning robust visual features without supervision. arXiv preprint arXiv:2304.07193 (2023)."},{"key":"e_1_3_2_1_29_1","volume-title":"Tool use and affordance: Manipulation-based versus reasoning-based approaches. Psychological review","author":"Osiurak Fran\u00e7ois","year":"2016","unstructured":"Fran\u00e7ois Osiurak and Arnaud Badets. 2016. Tool use and affordance: Manipulation-based versus reasoning-based approaches. Psychological review, Vol. 123, 5 (2016), 534."},{"key":"e_1_3_2_1_30_1","volume-title":"International conference on machine learning. PmLR, 8748-8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PmLR, 8748-8763."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10514-018-9787-5"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.69"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46493-0_12"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-019-01228-7"},{"key":"e_1_3_2_1_35_1","volume-title":"European Conference on Computer Vision. Springer, 367-385","author":"Tumanyan Narek","year":"2024","unstructured":"Narek Tumanyan, Assaf Singer, Shai Bagon, and Tali Dekel. 2024. Dino-tracker: Taming dino for self-supervised point tracking in a single video. In European Conference on Computer Vision. Springer, 367-385."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2021.3062560"},{"key":"e_1_3_2_1_37_1","volume-title":"TB-HSU: Hierarchical 3D Scene Understanding with Contextual Affordances. arXiv preprint arXiv:2412.05596","author":"Xu Wenting","year":"2024","unstructured":"Wenting Xu, Viorela Ila, Luping Zhou, and Craig T Jin. 2024. TB-HSU: Hierarchical 3D Scene Understanding with Contextual Affordances. arXiv preprint arXiv:2412.05596 (2024)."},{"key":"e_1_3_2_1_38_1","unstructured":"Weirui Ye Fangchen Liu Zheng Ding Yang Gao Oleh Rybkin and Pieter Abbeel. 2025. Video2Policy: Scaling up Manipulation Tasks in Simulation through Internet Videos. arXiv:2502.09886 [cs.RO] https:\/\/arxiv.org\/abs\/2502.09886"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1007\/s00521-019-04336-0"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.210"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755268","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:44:04Z","timestamp":1765309444000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755268"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":40,"alternative-id":["10.1145\/3746027.3755268","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755268","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}