{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,18]],"date-time":"2026-02-18T08:21:05Z","timestamp":1771402865705,"version":"3.50.1"},"reference-count":20,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,9,14]],"date-time":"2025-09-14T00:00:00Z","timestamp":1757808000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,9,14]],"date-time":"2025-09-14T00:00:00Z","timestamp":1757808000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,9,14]]},"DOI":"10.1109\/icipw68931.2025.11385906","type":"proceedings-article","created":{"date-parts":[[2026,2,17]],"date-time":"2026-02-17T21:05:43Z","timestamp":1771362343000},"page":"333-338","source":"Crossref","is-referenced-by-count":0,"title":["Semanticbox: Bounding Box-Guided Caption Enhanced Action Recognition for Instructional Videos"],"prefix":"10.1109","author":[{"given":"Jonathan","family":"McGee","sequence":"first","affiliation":[{"name":"University of Virginia,C.L. Brown,Dept. of Electrical and Computer Engineering,VA,USA"}]},{"given":"Chongyu","family":"He","sequence":"additional","affiliation":[{"name":"University of Virginia,C.L. Brown,Dept. of Electrical and Computer Engineering,VA,USA"}]},{"given":"Peter","family":"Youngs","sequence":"additional","affiliation":[{"name":"University of Virginia,Department of Curriculum, Instruction, and Special Education,VA,USA"}]},{"given":"Scott T.","family":"Acton","sequence":"additional","affiliation":[{"name":"University of Virginia,C.L. Brown,Dept. of Electrical and Computer Engineering,VA,USA"}]},{"given":"Matthew","family":"Korban","sequence":"additional","affiliation":[{"name":"University of Virginia,C.L. Brown,Dept. of Electrical and Computer Engineering,VA,USA"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3377192"},{"key":"ref2","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"International conference on machine learning","author":"Radford"},{"key":"ref3","article-title":"Actionclip: A new paradigm for video action recognition","author":"Wang","year":"2021","journal-title":"arXiv preprint arXiv:2109.08472"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2025.111409"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01328"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW53098.2021.00471"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19775-8_20"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01323"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00461"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i6.28361"},{"key":"ref11","article-title":"Bounding box embedding for single shot person instance segmentation","author":"Richeimer","year":"2018","journal-title":"arXiv preprint arXiv:1807.07674"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.472"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00300"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547910"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19833-5_7"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02206"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i6.28361"},{"key":"ref18","article-title":"Actionclip: A new paradigm for video action recognition","volume":"abs\/2109.08472","author":"Wang","year":"2021","journal-title":"CoRR"},{"key":"ref19","first-page":"10078","article-title":"Videomae: Masked autoencoders are data-efficient learners for self-supervised video pre-training","volume":"35","author":"Tong","year":"2022","journal-title":"Advances in neural information processing systems"},{"key":"ref20","article-title":"Decoupled weight decay regularization","author":"Loshchilov","year":"2017","journal-title":"arXiv preprint arXiv:1711.05101"}],"event":{"name":"2025 IEEE International Conference on Image Processing Workshops (ICIPW)","location":"Anchorage, AK, USA","start":{"date-parts":[[2025,9,14]]},"end":{"date-parts":[[2025,9,17]]}},"container-title":["2025 IEEE International Conference on Image Processing Workshops (ICIPW)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11385856\/11385840\/11385906.pdf?arnumber=11385906","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,2,18]],"date-time":"2026-02-18T07:28:38Z","timestamp":1771399718000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11385906\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,9,14]]},"references-count":20,"URL":"https:\/\/doi.org\/10.1109\/icipw68931.2025.11385906","relation":{},"subject":[],"published":{"date-parts":[[2025,9,14]]}}}