{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T05:06:07Z","timestamp":1765343167083,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":70,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100013076","name":"National Major Science and Technology Projects of China","doi-asserted-by":"publisher","award":["2022ZD0119501"],"award-info":[{"award-number":["2022ZD0119501"]}],"id":[{"id":"10.13039\/501100013076","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["52374221"],"award-info":[{"award-number":["52374221"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Science and Technology Development Fund of Shandong Province of China","award":["ZR2023MF097"],"award-info":[{"award-number":["ZR2023MF097"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3758221","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:37:21Z","timestamp":1761377841000},"page":"12799-12806","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["PhysLab: A Benchmark Dataset for Multi-Granularity Visual Parsing of Physics Experiments"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-0239-3990","authenticated-orcid":false,"given":"Minghao","family":"Zou","sequence":"first","affiliation":[{"name":"Shandong University of Science and Technology, Qingdao, China and Cardiff University, Cardiff, United Kingdom"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6421-8223","authenticated-orcid":false,"given":"Qingtian","family":"Zeng","sequence":"additional","affiliation":[{"name":"Shandong University of Science and Technology, Qingdao, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-7210-1938","authenticated-orcid":false,"given":"Yongping","family":"Miao","sequence":"additional","affiliation":[{"name":"Shandong University of Science and Technology, Qingdao, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5728-5092","authenticated-orcid":false,"given":"Shangkun","family":"Liu","sequence":"additional","affiliation":[{"name":"Shandong University of Science and Technology, Qingdao, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-1876-0851","authenticated-orcid":false,"given":"Zilong","family":"Wang","sequence":"additional","affiliation":[{"name":"Shandong University of Science and Technology, Qingdao, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4544-3481","authenticated-orcid":false,"given":"Hantao","family":"Liu","sequence":"additional","affiliation":[{"name":"Cardiff University, Cardiff, United Kingdom"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3641-1429","authenticated-orcid":false,"given":"Wei","family":"Zhou","sequence":"additional","affiliation":[{"name":"Cardiff University, Cardiff, United Kingdom"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","first-page":"11367","volume-title":"ATTACH Dataset: Annotated Two-Handed Assembly Actions for Human Action Understanding. In IEEE International Conference on Robotics and Automation. IEEE","author":"Aganian Dustin","year":"2023","unstructured":"Dustin Aganian, Benedict Stephan, Markus Eisenbach, Corinna Stretz, and Horst-Michael Gross. 2023. ATTACH Dataset: Annotated Two-Handed Assembly Actions for Human Action Understanding. In IEEE International Conference on Robotics and Automation. IEEE, London, 11367-11373."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00982"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01532"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3674980"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV.2018.00048"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV61041.2025.00912"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1038\/s41597-022-01843-z"},{"key":"e_1_3_2_1_10_1","first-page":"720","volume-title":"Proceedings of the European Conference on Computer Vision. Springer Nature Switzerland AG","author":"Damen Dima","year":"2018","unstructured":"Dima Damen, Hazel Doughty, Giovanni Maria Farinella, Sanja Fidler, Antonino Furnari, Evangelos Kazakos, Davide Moltisanti, Jonathan Munro, Toby Perrett, Will Price, et al., 2018. Scaling Egocentric Vision: The EPIC-KITCHENS Dataset. In Proceedings of the European Conference on Computer Vision. Springer Nature Switzerland AG, Munich, 720-736."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02065"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3327284"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00929"},{"key":"e_1_3_2_1_14_1","volume-title":"Visual Semantic Role Labeling. arXiv preprint arXiv:1505.04474","author":"Gupta Saurabh","year":"2015","unstructured":"Saurabh Gupta and Jitendra Malik. 2015. Visual Semantic Role Labeling. arXiv preprint arXiv:1505.04474 (2015), 1-11."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/TETCI.2024.3518613"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.engappai.2023.106223"},{"key":"e_1_3_2_1_17_1","first-page":"50750","article-title":"Noisy Ostracods: A Fine-Grained, Imbalanced Real-World Dataset for Benchmarking Robust Machine Learning and Label Correction Methods","volume":"37","author":"Hu Jiamian","year":"2024","unstructured":"Jiamian Hu, Hong Yuanyuan, Yihua Chen, He Wang, and Moriaki Yasuhara. 2024. Noisy Ostracods: A Fine-Grained, Imbalanced Real-World Dataset for Benchmarking Robust Machine Learning and Label Correction Methods. Advances in Neural Information Processing Systems, Vol. 37 (2024), 50750-50771.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02084"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2016.10.018"},{"key":"e_1_3_2_1_20_1","first-page":"1","article-title":"Visual Question Answering: A Survey of Methods","volume":"57","author":"Kim Byeong Su","year":"2025","unstructured":"Byeong Su Kim, Jieun Kim, Deokwoo Lee, and Beakcheol Jang. 2025. Visual Question Answering: A Survey of Methods, Datasets, Evaluation, and Challenges. Comput. Surveys, Vol. 57, 10 (2025), 1-35.","journal-title":"Datasets, Evaluation, and Challenges. Comput. Surveys"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-016-0981-7"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.105"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-020-01316-z"},{"key":"e_1_3_2_1_25_1","volume-title":"PhysicsAssistant: An LLM-Powered Interactive Learning Robot for Physics Lab Investigations. In IEEE International Conference on Robot and Human Interactive Communication. IEEE, Pasadena, 864-871","author":"Latif Ehsan","year":"2024","unstructured":"Ehsan Latif, Ramviyas Parasuraman, and Xiaoming Zhai. 2024. PhysicsAssistant: An LLM-Powered Interactive Learning Robot for Physics Lab Investigations. In IEEE International Conference on Robot and Human Interactive Communication. IEEE, Pasadena, 864-871."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01765"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00634"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW63382.2024.00324"},{"key":"e_1_3_2_1_29_1","first-page":"21158","article-title":"Neural-Logic Human-Object Interaction Detection","volume":"36","author":"Li Liulei","year":"2023","unstructured":"Liulei Li, Jianan Wei, Wenguan Wang, and Yi Yang. 2023. Neural-Logic Human-Object Interaction Detection. Advances in Neural Information Processing Systems, Vol. 36 (2023), 21158-21171.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01949"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681153"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2022.3217368"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00264"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00798"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01928"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3331738"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2020.3034487"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10462-022-10358-3"},{"key":"e_1_3_2_1_40_1","first-page":"135626","article-title":"CaptainCook4D: A Dataset for Understanding Errors in Procedural Activities","volume":"37","author":"Peddi Rohith","year":"2024","unstructured":"Rohith Peddi, Shivvrat Arya, Bharath Challa, Likhitha Pallapothula, Akshay Vyas, Bhavya Gouripeddi, Qifan Zhang, Jikai Wang, Vasundhara Komaragiri, Eric Ragan, et al., 2024. CaptainCook4D: A Dataset for Understanding Errors in Procedural Activities. Advances in Neural Information Processing Systems, Vol. 37 (2024), 135626-135679.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/356698.356702"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1038\/s41562-022-01394-8"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1103\/PhysRevPhysEducRes.20.010117"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00431"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.02042"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00852"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01722"},{"key":"e_1_3_2_1_48_1","volume-title":"Amir Roshan Zamir, and Mubarak Shah","author":"Soomro Khurram","year":"2012","unstructured":"Khurram Soomro, Amir Roshan Zamir, and Mubarak Shah. 2012. UCF101: A Dataset of 101 Human Actions Classes from Videos in the Wild. arXiv preprint arXiv:1212.0402 (2012), 1-7."},{"key":"e_1_3_2_1_49_1","first-page":"3200","article-title":"Human Action Recognition from Various Data Modalities: A review","volume":"45","author":"Sun Zehua","year":"2022","unstructured":"Zehua Sun, Qiuhong Ke, Hossein Rahmani, Mohammed Bennamoun, Gang Wang, and Jun Liu. 2022. Human Action Recognition from Various Data Modalities: A review. IEEE Transactions on Pattern Analysis and Machine Intelligence, Vol. 45, 3 (2022), 3200-3225.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2020.103107"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00130"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3330794"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02159"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2024.3358952"},{"key":"e_1_3_2_1_55_1","volume-title":"ELAN: A Professional Framework for Multimodality Research. In International Conference on Language Resources and Evaluation. ELRA, Genoa, 1556-1559","author":"Wittenburg Peter","year":"2006","unstructured":"Peter Wittenburg, Hennie Brugman, Albert Russel, Alex Klassmann, and Han Sloetjes. 2006. ELAN: A Professional Framework for Multimodality Research. In International Conference on Language Resources and Evaluation. ELRA, Genoa, 1556-1559."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01754"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i17.29907"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547869"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612311"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i3.20229"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01947"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00955"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01894"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.544"},{"key":"e_1_3_2_1_65_1","unstructured":"Peilin Zhou Bruce Leon Xiang Ying Can Zhang Yifan Shao Qichen Ye Dading Chong Zhiling Jin Chenxuan Xie Meng Cao et al. 2025b. BrowseComp-ZH: Benchmarking Web Browsing Ability of Large Language Models in Chinese. arXiv preprint arXiv:2504.19314 (2025) 1-14."},{"key":"e_1_3_2_1_66_1","volume-title":"Patrick Le Callet, and Alan C Bovik","author":"Zhou Wei","year":"2025","unstructured":"Wei Zhou, Hadi Amirpour, Christian Timmerer, Guangtao Zhai, Patrick Le Callet, and Alan C Bovik. 2025a. Perceptual Visual Quality Assessment: Principles, Methods, and Future Directions. arXiv preprint arXiv:2503.00625 (2025), 1-6."},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3680818"},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00365"},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01165"},{"key":"e_1_3_2_1_70_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2024.3358547"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3758221","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T05:02:55Z","timestamp":1765342975000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3758221"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":70,"alternative-id":["10.1145\/3746027.3758221","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3758221","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}