{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,11]],"date-time":"2025-09-11T16:36:52Z","timestamp":1757608612183,"version":"3.44.0"},"reference-count":59,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,5,19]],"date-time":"2025-05-19T00:00:00Z","timestamp":1747612800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,5,19]],"date-time":"2025-05-19T00:00:00Z","timestamp":1747612800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,5,19]]},"DOI":"10.1109\/icra55743.2025.11127898","type":"proceedings-article","created":{"date-parts":[[2025,9,2]],"date-time":"2025-09-02T17:28:56Z","timestamp":1756834136000},"page":"5593-5600","source":"Crossref","is-referenced-by-count":0,"title":["Ego-$A^{\\mathbf{3}}$: Adaptive Fusion-Based Disentangled Transformer for Egocentric Action Anticipation"],"prefix":"10.1109","author":[{"given":"Minhyuk","family":"Kim","sequence":"first","affiliation":[{"name":"Chonnam National University,Department of Artificial Intelligence Convergence,Gwangju,South Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jong Won","family":"Jung","sequence":"additional","affiliation":[{"name":"Chonnam National University,Department of Artificial Intelligence Convergence,Gwangju,South Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Eungi","family":"Lee","sequence":"additional","affiliation":[{"name":"Chonnam National University,Department of Artificial Intelligence Convergence,Gwangju,South Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Seok Bong","family":"Yoo","sequence":"additional","affiliation":[{"name":"Chonnam National University,Department of Artificial Intelligence Convergence,Gwangju,South Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","first-page":"2442","article-title":"First-person vision","volume-title":"Proceedings of the IEEE 100","volume":"8","author":"Takeo","year":"2012"},{"key":"ref2","first-page":"229248","article-title":"Robust multidimensional motion features for first-person vision activity recognition","volume":"149","author":"Girmaw","year":"2016","journal-title":"Computer Vision and Image Understanding"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1016\/j.pmcj.2014.11.004"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR.2010.999"},{"key":"ref5","doi-asserted-by":"crossref","first-page":"743","DOI":"10.1007\/s11042-012-1117-x","article-title":"Hierarchical Hidden Markov Model in detecting activities of daily living in wearable videos for studies of dementia.","volume":"69","author":"Svebor","year":"2014","journal-title":"Multimedia Tools and Applications"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096198"},{"key":"ref7","article-title":"What would you expect? anticipating egocentric actions with rolling-unrolling lstms and modality attention","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","author":"Antonino","year":"2019"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01325"},{"key":"ref9","article-title":"Temporal aggregate representations for long-range video understanding","volume-title":"Computer Vision-ECCV 2020: 16th European Conference","author":"Fadime","year":"2020"},{"key":"ref10","article-title":"Interaction region visual transformer for egocentric action anticipation","volume-title":"Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision","author":"Debaditya","year":"2024"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/wacv56688.2023.00601"},{"journal-title":"Disentangled action recognition with knowledge bases.","year":"2022","author":"Zhekun","key":"ref12"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-71278-5_12"},{"key":"ref14","article-title":"Memory-and-anticipation transformer for online action understanding","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","author":"Jiahao","year":"2023"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01799"},{"key":"ref16","article-title":"Forecasting human-object interaction: joint prediction of motor attention and actions in first person video","volume-title":"Computer Vi-sion-ECCV 2020: 16th European Conference","author":"Miao","year":"2020"},{"key":"ref17","article-title":"Quo vadis, action recognition? a new model and the kinetics dataset","volume-title":"proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","author":"Joao","year":"2017"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01240"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2023.3251843"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2022.3188101"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2022.3150855"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA46639.2022.9812079"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2023.3301307"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/icra57147.2024.10610283"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i6.28442"},{"key":"ref26","article-title":"Real-time online video detection with temporal smoothing transformers","volume-title":"European Conference on Computer Vision","author":"Yue","year":"2022"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/wacv57701.2024.00846"},{"journal-title":"Antgpt: Can large language models help long-term action anticipation from videos?.","year":"2023","author":"Qi","key":"ref28"},{"key":"ref29","article-title":"Vlmah: Visual-linguistic modeling of action history for effective action anticipation.","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","author":"Victoria","year":"2023"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2022.3178804"},{"key":"ref31","article-title":"A hybrid egocentric activity anticipation framework via memory-augmented recurrent and one-shot representation forecasting.","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Tianshan","year":"2022"},{"key":"ref32","article-title":"Audio-visual grouping network for sound localization from mixtures","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Shentong","year":"2023"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA40945.2020.9197566"},{"key":"ref34","article-title":"Localizing visual sounds the easy way","volume-title":"European Conference on Computer Vision","author":"Shentong","year":"2022"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA40945.2020.9196829"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02194"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747867"},{"key":"ref38","article-title":"Mix and localize: Localizing sound sources in mixtures","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Xixi","year":"2022"},{"key":"ref39","article-title":"Event-specific audio-visual fusion layers: A simple and new perspective on video understanding.","volume-title":"Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision","author":"Arda","year":"2023"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i01.5361"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i3.16354"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i4.16403"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i2.20073"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00559"},{"key":"ref45","article-title":"Domain generalization through audio-visual relative norm alignment in first person action recognition.","volume-title":"Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision","author":"Mirco","year":"2022"},{"key":"ref46","first-page":"26492","article-title":"Temporally disentangled representation learning.","volume":"35","author":"Weiran","year":"2022","journal-title":"Advances in Neural Information Processing Systems"},{"journal-title":"Disentangled representation learning.","year":"2022","author":"Xin","key":"ref47"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00022"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1016\/j.adhoc.2020.102380"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/tip.2023.3341297"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00337"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00138"},{"journal-title":"Learning phrase representations using RNN encoder-decoder for statistical machine translation","year":"2014","author":"Kyunghyun","key":"ref53"},{"key":"ref54","article-title":"Scaling egocentric vision: The epic-kitchens dataset.","volume-title":"Proceedings of the European Conference on Computer Vision (ECCV)","author":"Dima","year":"2018"},{"key":"ref55","article-title":"Integrating human gaze into attention for egocentric activity recognition.","volume-title":"Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision","author":"Kyle","year":"2021"},{"journal-title":"Adam: A method for stochastic optimization","year":"2014","author":"Diederik P.","key":"ref56"},{"key":"ref57","first-page":"1086","article-title":"Long short-term transformer for online action detection.","volume":"34","author":"Mingze","year":"2021","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref58","article-title":"Temporal segment networks: Towards good practices for deep action recognition.","volume-title":"European Conference on Computer Vision","author":"Limin","year":"2016"},{"key":"ref59","article-title":"Grad-cam: Visual explanations from deep networks via gradient-based localization.","volume-title":"Proceedings of the IEEE International Conference on Computer Vision","author":"Ramprasaath R.","year":"2017"}],"event":{"name":"2025 IEEE International Conference on Robotics and Automation (ICRA)","start":{"date-parts":[[2025,5,19]]},"location":"Atlanta, GA, USA","end":{"date-parts":[[2025,5,23]]}},"container-title":["2025 IEEE International Conference on Robotics and Automation (ICRA)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11127273\/11127223\/11127898.pdf?arnumber=11127898","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,3]],"date-time":"2025-09-03T06:03:29Z","timestamp":1756879409000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11127898\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,5,19]]},"references-count":59,"URL":"https:\/\/doi.org\/10.1109\/icra55743.2025.11127898","relation":{},"subject":[],"published":{"date-parts":[[2025,5,19]]}}}