{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T08:55:47Z","timestamp":1765356947263,"version":"3.37.3"},"reference-count":19,"publisher":"IEEE","license":[{"start":{"date-parts":[[2021,7,5]],"date-time":"2021-07-05T00:00:00Z","timestamp":1625443200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2021,7,5]],"date-time":"2021-07-05T00:00:00Z","timestamp":1625443200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100006190","name":"Research and Development","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100006190","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021,7,5]]},"DOI":"10.1109\/icme51207.2021.9428325","type":"proceedings-article","created":{"date-parts":[[2021,6,9]],"date-time":"2021-06-09T21:14:21Z","timestamp":1623273261000},"page":"1-6","source":"Crossref","is-referenced-by-count":6,"title":["What Matters: Attentive and Relational Feature Aggregation Network for Video-Text Retrieval"],"prefix":"10.1109","author":[{"given":"Xiaoshuai","family":"Hao","sequence":"first","affiliation":[{"name":"University of Chinese Academy of Sciences,School of Cyber Security,Beijing,China,100049"}]},{"given":"Yucan","family":"Zhou","sequence":"additional","affiliation":[{"name":"Chinese Academy of Sciences,Institute of Information Engineering,Beijing,China,100093"}]},{"given":"Dayan","family":"Wu","sequence":"additional","affiliation":[{"name":"Chinese Academy of Sciences,Institute of Information Engineering,Beijing,China,100093"}]},{"given":"Wanqian","family":"Zhang","sequence":"additional","affiliation":[{"name":"University of Chinese Academy of Sciences,School of Cyber Security,Beijing,China,100049"}]},{"given":"Bo","family":"Li","sequence":"additional","affiliation":[{"name":"Chinese Academy of Sciences,Institute of Information Engineering,Beijing,China,100093"}]},{"given":"Weiping","family":"Wang","sequence":"additional","affiliation":[{"name":"Chinese Academy of Sciences,Institute of Information Engineering,Beijing,China,100093"}]},{"given":"Dan","family":"Meng","sequence":"additional","affiliation":[{"name":"Chinese Academy of Sciences,Institute of Information Engineering,Beijing,China,100093"}]}],"member":"263","reference":[{"key":"ref10","first-page":"19","article-title":"Learning joint embedding with multimodal cues for cross-modal video-text retrieval","author":"chowdhury mithun","year":"2018","journal-title":"ICMR"},{"article-title":"Learning a text-video embedding from incomplete and heterogeneous data","year":"2018","author":"miech","key":"ref11"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.572"},{"key":"ref13","article-title":"Semi-supervised classification with graph convolutional networks","author":"welling","year":"2017","journal-title":"ICLRE"},{"key":"ref14","article-title":"Efficient estimation of word representations in vector space","author":"mikolov","year":"2013","journal-title":"ICLRE"},{"key":"ref15","article-title":"Vse++: Improving visual-semantic embeddings with hard negatives","author":"faghri","year":"2018","journal-title":"BMVC"},{"key":"ref16","article-title":"Unifying visual-semantic embeddings with multimodal neural language models","author":"kiros","year":"2014","journal-title":"NeurIPS"},{"key":"ref17","article-title":"Learning joint representations of videos and sentences with web image search","author":"yokoya","year":"2016","journal-title":"ECCV"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.571"},{"key":"ref19","first-page":"190","article-title":"Collecting highly parallel data for paraphrase evaluation","author":"chen","year":"2011","journal-title":"ACL"},{"article-title":"Word2visualvec: Image and video to sentence matching by visual feature prediction","year":"2016","author":"dong","key":"ref4"},{"key":"ref3","article-title":"Use what you have: Video retrieval using representations from collaborative experts","author":"liu","year":"2019","journal-title":"BMVC"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/ICME46284.2020.9102760"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01065"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1145\/3078971.3079041"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.341"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00957"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2018.2832602"},{"key":"ref9","article-title":"Improving semantic video indexing: Efforts in waseda trecvid 2015 sin system","author":"kobayashi","year":"2016","journal-title":"ICASSP"}],"event":{"name":"2021 IEEE International Conference on Multimedia and Expo (ICME)","start":{"date-parts":[[2021,7,5]]},"location":"Shenzhen, China","end":{"date-parts":[[2021,7,9]]}},"container-title":["2021 IEEE International Conference on Multimedia and Expo (ICME)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9428049\/9428068\/09428325.pdf?arnumber=9428325","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,6,27]],"date-time":"2022-06-27T21:28:37Z","timestamp":1656365317000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9428325\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,7,5]]},"references-count":19,"URL":"https:\/\/doi.org\/10.1109\/icme51207.2021.9428325","relation":{},"subject":[],"published":{"date-parts":[[2021,7,5]]}}}