{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,4]],"date-time":"2025-10-04T08:03:52Z","timestamp":1759565032235},"reference-count":15,"publisher":"Institute of Electronics, Information and Communications Engineers (IEICE)","issue":"10","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEICE Trans. Inf. &amp; Syst."],"published-print":{"date-parts":[[2022,10,1]]},"DOI":"10.1587\/transinf.2022edl8018","type":"journal-article","created":{"date-parts":[[2022,9,30]],"date-time":"2022-09-30T22:33:44Z","timestamp":1664577224000},"page":"1825-1828","source":"Crossref","is-referenced-by-count":1,"title":["An Efficient Multimodal Aggregation Network for Video-Text Retrieval"],"prefix":"10.1587","volume":"E105.D","author":[{"given":"Zhi","family":"LIU","sequence":"first","affiliation":[{"name":"North China University of Technology"}]},{"given":"Fangyuan","family":"ZHAO","sequence":"additional","affiliation":[{"name":"North China University of Technology"}]},{"given":"Mengmeng","family":"ZHANG","sequence":"additional","affiliation":[{"name":"North China University of Technology"},{"name":"Beijing Polytechnic College"}]}],"member":"532","reference":[{"key":"1","unstructured":"[1] A. Radford, J.W. Kim, C. Hallacy, A. Ramesh, G. Goh, S. Agarwal, G. Sastry, A. Askell, P. Mishkin, J. Clark, G. Krueger, and I. Sutskever, \u201cLearning transferable visual models from natural language supervision,\u201d arXiv preprint arXiv:2103.00020. 10.48550\/arXiv.2103.00020"},{"key":"2","unstructured":"[2] A. Miech, I. Laptev, and J. Sivic, \u201cLearning a text-video embedding from incomplete and heterogeneous data,\u201d arXiv preprint arXiv:1804.02516. 10.48550\/arXiv.1804.02516"},{"key":"3","unstructured":"[3] Y. Liu, S. Albanie, A. Nagrani, and A. Zisserman, \u201cUse what you have: Video retrieval using representations from collaborative experts,\u201d arXiv preprint arXiv.1907.13487."},{"key":"4","doi-asserted-by":"crossref","unstructured":"[4] V. Gabeur, C. Sun, K. Alahari, and C. Schmid, \u201cMulti-modal transformer for video retrieval,\u201d Proc. 16th European Conference on Computer Vision, ECCV 2020, Glasgow, UK, Aug. 23-28, pp.214-229, 2020. 10.1007\/978-3-030-58548-8_13","DOI":"10.1007\/978-3-030-58548-8_13"},{"key":"5","unstructured":"[5] S. Liu, H. Fan, S. Qian, Y. Chen, W. Ding, and Z. Wang, \u201cHit: Hierarchical transformer with momentum contrast for video-text retrieval,\u201d arXiv preprint arXiv.2103.15049."},{"key":"6","doi-asserted-by":"crossref","unstructured":"[6] R. Arandjelovic, P. Gronat, A. Torii, T. Pajdla, and J. Sivic, \u201cNetVLAD: CNN architecture for weakly supervised place recognition,\u201d Proc. IEEE Conference on Computer Vision and Pattern Recognition, pp.5297-5307, 2016. 10.1109\/CVPR.2016.572","DOI":"10.1109\/CVPR.2016.572"},{"key":"7","unstructured":"[7] A. Vaswani, N. Shazeer, N. Parmar, J. Uszkoreit, L. Jones, A.N. Gomez, \u0141. Kaiser, and I. Polosukhin, \u201cAttention is all you need,\u201d Advances in Neural Information Processing Systems, Vol.30, pp.5998-6008, 2017."},{"key":"8","doi-asserted-by":"crossref","unstructured":"[8] J. Xu, T. Mei, T. Yao, and Y. Rui, \u201cMSR-VTT: A large video description dataset for bridging video and language,\u201d Proc. IEEE Conference on Computer Vision And Pattern Recognition, pp.5288-5296, 2016. 10.1109\/CVPR.2016.571","DOI":"10.1109\/CVPR.2016.571"},{"key":"9","doi-asserted-by":"crossref","unstructured":"[9] R. Krishna, K. Hata, F. Ren, L. Fei-Fei, and J.C. Niebles, \u201cDense-captioning events in videos,\u201d Proc. IEEE International Conference on Computer Vision, pp.706-715, 2017. 10.1109\/iccv.2017.83","DOI":"10.1109\/ICCV.2017.83"},{"key":"10","unstructured":"[10] T. Wang and P. Isola, \u201cUnderstanding contrastive representation learning through alignment and uniformity on the hypersphere,\u201d International Conference on Machine Learning, PMLR 119, pp.9929-9939, 2020."},{"key":"11","doi-asserted-by":"crossref","unstructured":"[11] F. Wang and H. Liu, \u201cUnderstanding the behaviour of contrastive loss,\u201d Proc. IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2021. 10.1109\/cvpr46437.2021.00252","DOI":"10.1109\/CVPR46437.2021.00252"},{"key":"12","doi-asserted-by":"crossref","unstructured":"[12] Y. Yu, J. Kim, and G. Kim, \u201cA joint sequence fusion model for video question answering and retrieval,\u201d Proc. European Conference on Computer Vision (ECCV), pp.487-503, 2018. 10.1007\/978-3-030-01234-2_29","DOI":"10.1007\/978-3-030-01234-2_29"},{"key":"13","doi-asserted-by":"crossref","unstructured":"[13] B. Zhang, H. Hu, and F. Sha, \u201cCross-modal and hierarchical modeling of video and text,\u201d Proc. European Conference on Computer Vision (ECCV), pp.385-401, 2018. 10.1007\/978-3-030-01261-8_23","DOI":"10.1007\/978-3-030-01261-8_23"},{"key":"14","unstructured":"[14] J. Devlin, M.W. Chang, K. Lee, and K. Toutanova, \u201cBERT: Pre-training of deep bidirectional transformers for language understanding,\u201d arXiv preprint arXiv:1810.04805. 10.48550\/arXiv.1810.04805"},{"key":"15","doi-asserted-by":"crossref","unstructured":"[15] M. Dzabraev, M. Kalashnikov, S. Komkov, and A. Petiushko, \u201cMDMMT: Multidomain multimodal transformer for video retrieval,\u201d Proc. IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp.3354-3363, 2021. 10.1109\/cvprw53098.2021.00374","DOI":"10.1109\/CVPRW53098.2021.00374"}],"container-title":["IEICE Transactions on Information and Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/www.jstage.jst.go.jp\/article\/transinf\/E105.D\/10\/E105.D_2022EDL8018\/_pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,5,9]],"date-time":"2024-05-09T04:56:00Z","timestamp":1715230560000},"score":1,"resource":{"primary":{"URL":"https:\/\/www.jstage.jst.go.jp\/article\/transinf\/E105.D\/10\/E105.D_2022EDL8018\/_article"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,10,1]]},"references-count":15,"journal-issue":{"issue":"10","published-print":{"date-parts":[[2022]]}},"URL":"https:\/\/doi.org\/10.1587\/transinf.2022edl8018","relation":{},"ISSN":["0916-8532","1745-1361"],"issn-type":[{"value":"0916-8532","type":"print"},{"value":"1745-1361","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022,10,1]]},"article-number":"2022EDL8018"}}