{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,5,18]],"date-time":"2025-05-18T18:21:15Z","timestamp":1747592475582},"reference-count":30,"publisher":"IEEE","license":[{"start":{"date-parts":[[2021,9,19]],"date-time":"2021-09-19T00:00:00Z","timestamp":1632009600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2021,9,19]],"date-time":"2021-09-19T00:00:00Z","timestamp":1632009600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021,9,19]]},"DOI":"10.1109\/icip42928.2021.9506267","type":"proceedings-article","created":{"date-parts":[[2021,8,23]],"date-time":"2021-08-23T21:08:41Z","timestamp":1629752921000},"page":"1334-1338","source":"Crossref","is-referenced-by-count":6,"title":["Semantic Role Aware Correlation Transformer For Text To Video Retrieval"],"prefix":"10.1109","author":[{"given":"Burak","family":"Satar","sequence":"first","affiliation":[{"name":"Institute for Infocomm Research,A&#x002A;STAR,Singapore"}]},{"given":"Zhu","family":"Hongyuan","sequence":"additional","affiliation":[{"name":"Institute for Infocomm Research,A&#x002A;STAR,Singapore"}]},{"given":"Xavier","family":"Bresson","sequence":"additional","affiliation":[{"name":"National University,Department of Computer Science,Singapore"}]},{"given":"Joo Hwee","family":"Lim","sequence":"additional","affiliation":[{"name":"Institute for Infocomm Research,A&#x002A;STAR,Singapore"}]}],"member":"263","reference":[{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1162"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00208"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01234-2_29"},{"key":"ref12","article-title":"Attention is all you need","author":"vaswani","year":"2017","journal-title":"NIPS"},{"key":"ref13","article-title":"Multimodal Transformer for Video Retrieval","author":"gabeur","year":"2020","journal-title":"ECCV"},{"key":"ref14","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","author":"devlin","year":"2019","journal-title":"HLT-NAACL"},{"article-title":"wman: Weakly-supervised moment alignment network for text based video segment retrieval","year":"2020","author":"tan","key":"ref15"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00756"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00877"},{"article-title":"Self-supervised multimodal versatile networks","year":"2020","author":"alayrac","key":"ref18"},{"key":"ref19","first-page":"776","article-title":"Audio set: An ontology and human-labeled dataset for audio events","author":"gemmeke","year":"2017","journal-title":"ICASSP"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1145\/3206025.3206064"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299073"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01065"},{"key":"ref27","article-title":"Multi-modal dense video captioning","author":"iashin","year":"2020","journal-title":"CVPR Workshops"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00272"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12342"},{"key":"ref5","article-title":"Learning a text-video embedding from incomplete and heterogeneous data","author":"miech","year":"2018","journal-title":"arXiv 1804 02516"},{"key":"ref8","article-title":"Use what you have: Video retrieval using representations from collaborative experts","author":"liu","year":"2019","journal-title":"ArXiv"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00990"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1145\/2578726.2578746"},{"key":"ref1","article-title":"Semantic concept discovery for large-scale zero-shot event detection","author":"chang","year":"2015","journal-title":"IJCAI"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00957"},{"article-title":"Simple bert models for relation extraction and semantic role labeling","year":"2019","author":"shi","key":"ref20"},{"key":"ref22","first-page":"248","article-title":"Imagenet: A large-scale hierarchical image database","author":"deng","year":"2009","journal-title":"CVPR"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"ref23","first-page":"6546","article-title":"Can spatiotemporal 3d cnns retrace the history of 2d cnns and imagenet?","author":"hara","year":"2018","journal-title":"CVPR"},{"article-title":"Ms coco: Common objects in context","year":"2015","author":"lin","key":"ref26"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2577031"}],"event":{"name":"2021 IEEE International Conference on Image Processing (ICIP)","start":{"date-parts":[[2021,9,19]]},"location":"Anchorage, AK, USA","end":{"date-parts":[[2021,9,22]]}},"container-title":["2021 IEEE International Conference on Image Processing (ICIP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9506008\/9506009\/09506267.pdf?arnumber=9506267","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,12,6]],"date-time":"2022-12-06T23:55:50Z","timestamp":1670370950000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9506267\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,9,19]]},"references-count":30,"URL":"https:\/\/doi.org\/10.1109\/icip42928.2021.9506267","relation":{},"subject":[],"published":{"date-parts":[[2021,9,19]]}}}