{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T17:25:22Z","timestamp":1763227522750},"reference-count":21,"publisher":"IEEE","license":[{"start":{"date-parts":[[2022,7,18]],"date-time":"2022-07-18T00:00:00Z","timestamp":1658102400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2022,7,18]],"date-time":"2022-07-18T00:00:00Z","timestamp":1658102400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022,7,18]]},"DOI":"10.1109\/icme52920.2022.9859690","type":"proceedings-article","created":{"date-parts":[[2022,8,26]],"date-time":"2022-08-26T19:45:18Z","timestamp":1661543118000},"page":"01-06","source":"Crossref","is-referenced-by-count":0,"title":["Semantic-Driven Saliency-Context Separation for Video Captioning"],"prefix":"10.1109","author":[{"given":"Heming","family":"Jing","sequence":"first","affiliation":[{"name":"School of Computer Science, Shanghai Key Laboratory of Intelligent Information Processing, Fudan University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yuejie","family":"Zhang","sequence":"additional","affiliation":[{"name":"School of Computer Science, Shanghai Key Laboratory of Intelligent Information Processing, Fudan University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Rui","family":"Feng","sequence":"additional","affiliation":[{"name":"School of Computer Science, Shanghai Key Laboratory of Intelligent Information Processing, Fudan University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Rui-Wei","family":"Zhao","sequence":"additional","affiliation":[{"name":"Fudan University,Academy for Engineering and Technology"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tao","family":"Zhang","sequence":"additional","affiliation":[{"name":"School of Information Management and Engineering, Shanghai Key Laboratory of Financial Information Technology, Shanghai University of Finance and Economics"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xuequan","family":"Lu","sequence":"additional","affiliation":[{"name":"School of Information Technology, Deakin University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shang","family":"Gao","sequence":"additional","affiliation":[{"name":"School of Information Technology, Deakin University"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref10","article-title":"Categorical reparameterization with gumbel-softmax","author":"jang","year":"2017","journal-title":"ICLRE"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.571"},{"key":"ref12","first-page":"190","article-title":"Collecting highly parallel data for paraphrase evaluation","author":"chen","year":"0","journal-title":"Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics Hu-man Language Technologies"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v31i1.11231"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"ref15","first-page":"2641","article-title":"Controllable video captioning with POS sequence guidance based on gated fusion network","author":"wang","year":"2019","journal-title":"ICCV"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00854"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2020.2969330"},{"key":"ref18","first-page":"2514","article-title":"Semantic grouping network for video captioning","author":"ryu","year":"2021","journal-title":"AAAI"},{"key":"ref19","first-page":"1","article-title":"Hierarchical rep-resentation network with auxiliary tasks for video captioning","author":"lei","year":"2021","journal-title":"ICME"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/IWCMC48107.2020.9148294"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ICME46284.2020.9102967"},{"key":"ref6","first-page":"3119","article-title":"Non-autoregressive coarse-to-fine video captioning","author":"yang","year":"2021","journal-title":"AAAI"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3351060"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01311"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00157"},{"key":"ref2","first-page":"5523","article-title":"Middle-out decoding","author":"mehri","year":"2018","journal-title":"NIPS"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/N15-1173"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/W14-4012"},{"key":"ref20","first-page":"2641","article-title":"Controllable video captioning with POS sequence guidance based on gated fusion network","author":"wang","year":"2019","journal-title":"ICCV"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00854"}],"event":{"name":"2022 IEEE International Conference on Multimedia and Expo (ICME)","start":{"date-parts":[[2022,7,18]]},"location":"Taipei, Taiwan","end":{"date-parts":[[2022,7,22]]}},"container-title":["2022 IEEE International Conference on Multimedia and Expo (ICME)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9859562\/9858923\/09859690.pdf?arnumber=9859690","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,9,19]],"date-time":"2022-09-19T20:23:55Z","timestamp":1663619035000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9859690\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,7,18]]},"references-count":21,"URL":"https:\/\/doi.org\/10.1109\/icme52920.2022.9859690","relation":{},"subject":[],"published":{"date-parts":[[2022,7,18]]}}}