{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,15]],"date-time":"2025-10-15T10:32:46Z","timestamp":1760524366517,"version":"3.37.3"},"reference-count":40,"publisher":"Springer Science and Business Media LLC","issue":"24","license":[{"start":{"date-parts":[[2022,5,30]],"date-time":"2022-05-30T00:00:00Z","timestamp":1653868800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,5,30]],"date-time":"2022-05-30T00:00:00Z","timestamp":1653868800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimed Tools Appl"],"published-print":{"date-parts":[[2022,10]]},"DOI":"10.1007\/s11042-022-13048-y","type":"journal-article","created":{"date-parts":[[2022,5,30]],"date-time":"2022-05-30T03:23:26Z","timestamp":1653881006000},"page":"34367-34386","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["Multi-grained encoding and joint embedding space fusion for video and text cross-modal retrieval"],"prefix":"10.1007","volume":"81","author":[{"given":"Xiaotao","family":"Cui","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5242-7909","authenticated-orcid":false,"given":"Jing","family":"Xiao","sequence":"additional","affiliation":[]},{"given":"Yang","family":"Cao","sequence":"additional","affiliation":[]},{"given":"Jia","family":"Zhu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,5,30]]},"reference":[{"key":"13048_CR1","unstructured":"Andrew G, Arora R, Bilmes J, Livescu K (2013) Deep canonical correlation analysis. In: the 30th International conference on machine learning, pp 1247\u20131255"},{"key":"13048_CR2","doi-asserted-by":"crossref","unstructured":"Carreira J , Zisserman A (2017) Quo vadis, action recognition: a new model and the kinetics dataset. In: IEEE conference on computer vision and pattern recognition, pp 6299\u20136308","DOI":"10.1109\/CVPR.2017.502"},{"key":"13048_CR3","unstructured":"Chen D L, Dolan W B (2011) Collecting highly parallel data for paraphrase evaluation. In: the 49th annual meeting of the association for computational linguistics: human language technologies, proceedings of the conference, pp 190\u2013200"},{"key":"13048_CR4","doi-asserted-by":"crossref","unstructured":"Chi J, Peng Y (2018) Dual adversarial networks for zero-shot cross-media retrieval. In: the 27th international joint conference on artificial intelligence, pp 663\u2013669","DOI":"10.24963\/ijcai.2018\/92"},{"key":"13048_CR5","unstructured":"Dong J, Li X, Snoek Cees GM (2016) Word2visualvec: Image and video to sentence matching by visual feature prediction. arXiv:1604.06838"},{"issue":"12","key":"13048_CR6","doi-asserted-by":"publisher","first-page":"3377","DOI":"10.1109\/TMM.2018.2832602","volume":"20","author":"J Dong","year":"2018","unstructured":"Dong J, Li X, Snoek CGM (2018) Predicting visual features from text for image and video caption retrieval. IEEE Trans Multimed 20(12):3377\u20133388","journal-title":"IEEE Trans Multimed"},{"key":"13048_CR7","doi-asserted-by":"crossref","unstructured":"Dong J, Li X, Xu C, Ji S, He Y, Yang G, Wang X (2019) Dual encoding for zero-example video retrieval. 
In: the IEEE conference on computer vision and pattern recognition, pp 9346\u20139355","DOI":"10.1109\/CVPR.2019.00957"},{"key":"13048_CR8","unstructured":"Faghri F, Fleet D J, Kiros J R, Fidler S (2017) Vse++: Improved visual-semantic embeddings. arXiv:1707.05612"},{"key":"13048_CR9","doi-asserted-by":"crossref","unstructured":"He K, Zhang X, Ren S, Sun J (2016) Deep residual learning for image recognition. In: the IEEE conference on computer vision and pattern recognition, pp 770\u2013778","DOI":"10.1109\/CVPR.2016.90"},{"key":"13048_CR10","doi-asserted-by":"publisher","first-page":"853","DOI":"10.1613\/jair.3994","volume":"47","author":"M Hodosh","year":"2013","unstructured":"Hodosh M, Young P, Hockenmaier J (2013) Framing image description as a ranking task: Data, models and evaluation metrics. J Artif Intell Res 47:853\u2013899","journal-title":"J Artif Intell Res"},{"key":"13048_CR11","doi-asserted-by":"crossref","unstructured":"Kim Y (2014) Convolutional neural networks for sentence classification. In: the 2014 conference on empirical methods in natural language processing, pp 1746\u20131751","DOI":"10.3115\/v1\/D14-1181"},{"key":"13048_CR12","unstructured":"Kiros R, Salakhutdinov R, Zemel R S (2014) Unifying visual-semantic embeddings with multimodal neural language models. arXiv:1411.2539"},{"key":"13048_CR13","unstructured":"Kiros R, Zhu Y, Salakhutdinov R, Zemel R S, Urtasun R, Torralba A, Fidler S (2015) Skip-thought vectors. In: advances in neural information processing systems 28: annual conference on neural information processing systems 2015, pp 3294\u20133302"},{"issue":"1","key":"13048_CR14","doi-asserted-by":"publisher","first-page":"46","DOI":"10.1002\/asi.24373","volume":"72","author":"W Li","year":"2020","unstructured":"Li W, Zheng Y, Zhang Y, Feng R, Zhang T, Fan W (2020) Cross-modal retrieval with dual multi-angle self-attention. J Assoc Inf Sci Technol 72 (1):46\u201365","journal-title":"J Assoc Inf Sci Technol"},{"key":"13048_CR15","unstructured":"Liu Y, Albanie S, Nagrani A, Zisserman A (2019) Use what you have: Video retrieval using representations from collaborative experts. arXiv:1907.13487"},{"key":"13048_CR16","doi-asserted-by":"crossref","unstructured":"Markatopoulou F, Galanopoulos D, Mezaris V, Patras I (2017) Query and keyframe representations for ad-hoc video search. In: the 2017 ACM on international conference on multimedia retrieval, pp 407\u2013411","DOI":"10.1145\/3078971.3079041"},{"key":"13048_CR17","doi-asserted-by":"crossref","unstructured":"Mc Donald K, Smeaton A F (2005) A comparison of score, rank and probability-based fusion methods for video shot retrieval. In: the 4th international conference on image and video retrieval, pp 61\u201370","DOI":"10.1007\/11526346_10"},{"key":"13048_CR18","unstructured":"Miech A, Laptev I, Sivic J (2018) Learning a text-video embedding from incomplete and heterogeneous data. arXiv:1804.02516"},{"key":"13048_CR19","doi-asserted-by":"crossref","unstructured":"Miech A, Zhukov D, Alayrac J-B, Tapaswi M, Laptev I, Sivic J (2019) Howto100m: Learning a text-video embedding by watching hundred million narrated video clips. arXiv:1906.03327","DOI":"10.1109\/ICCV.2019.00272"},{"key":"13048_CR20","doi-asserted-by":"crossref","unstructured":"Mithun N C, Li J, Metze F, Roy-Chowdhury A K (2018) Learning joint embedding with multimodal cues for cross-modal video-text retrieval. 
In: the 2018 ACM on international conference on multimedia retrieval, pp 19\u201327","DOI":"10.1145\/3206025.3206064"},{"key":"13048_CR21","unstructured":"Molchanov P, Tyree S, Karras T, Aila T, Kautz J (2016) Pruning convolutional neural networks for resource efficient inference. arXiv:1611.06440"},{"key":"13048_CR22","doi-asserted-by":"crossref","unstructured":"Otani M, Nakashima Y, Rahtu E, Heikkil\u00e4 J, Yokoya N (2016) Learning joint representations of videos and sentences with web image search. In: the european conference on computer vision, pp 651\u2013667","DOI":"10.1007\/978-3-319-46604-0_46"},{"key":"13048_CR23","doi-asserted-by":"crossref","unstructured":"Pan Y, Mei T, Yao T, Li H, Rui Y (2016) Jointly modeling embedding and translation to bridge video and language. In: the IEEE conference on computer vision and pattern recognition, pp 4594\u20134602","DOI":"10.1109\/CVPR.2016.497"},{"key":"13048_CR24","unstructured":"Peng Y, Huang X, Qi J (2016) Cross-media shared representation by hierarchical learning with multiple deep networks. In: the 25th international joint conference on artificial intelligence, pp 3846\u20133853"},{"issue":"11","key":"13048_CR25","doi-asserted-by":"publisher","first-page":"5585","DOI":"10.1109\/TIP.2018.2852503","volume":"27","author":"Y Peng","year":"2018","unstructured":"Peng Y, Qi J, Yuan Y (2018) Modality-specific cross-modal similarity measurement with recurrent attention network. IEEE Trans Image Process 27(11):5585\u20135599","journal-title":"IEEE Trans Image Process"},{"issue":"12","key":"13048_CR26","doi-asserted-by":"publisher","first-page":"4275","DOI":"10.1109\/TCYB.2016.2606441","volume":"47","author":"X Shen","year":"2016","unstructured":"Shen X, Shen F, Sun Q-S, Yang Y, Yuan Y-H, Shen H T (2016) Semi-paired discrete hashing: Learning latent hash codes for semi-paired cross-view retrieval. IEEE Trans Cybern 47(12):4275\u20134288","journal-title":"IEEE Trans Cybern"},{"key":"13048_CR27","doi-asserted-by":"crossref","unstructured":"Socher R, Li F (2010) Connecting modalities: Semi-supervised segmentation and annotation of images using unaligned text corpora. In: the IEEE conference on computer vision and pattern recognition, pp 966\u2013973","DOI":"10.1109\/CVPR.2010.5540112"},{"key":"13048_CR28","unstructured":"Ueki K, Hirakawa K, Kikuchi K, Ogawa T, Kobayashi T (2017) Waseda meisei at trecvid 2017: Ad-hoc video search. In: the 2017 TREC video retrieval evaluation"},{"key":"13048_CR29","doi-asserted-by":"crossref","unstructured":"Xu J, Mei T, Yao T, Rui Y (2016) Msr-vtt: A large video description dataset for bridging video and language. In: the IEEE conference on computer vision and pattern recognition, pp 5288\u20135296","DOI":"10.1109\/CVPR.2016.571"},{"key":"13048_CR30","doi-asserted-by":"crossref","unstructured":"Xu R, Xiong C, Chen W, Corso J J (2015) Jointly modeling deep video and compositional text to bridge vision and language in a unified framework. In: the 29th AAAI conference on artificial intelligence, pp 2346\u20132352","DOI":"10.1609\/aaai.v29i1.9512"},{"key":"13048_CR31","doi-asserted-by":"crossref","unstructured":"Xu R, Li C, Yan J, Deng C, Liu X (2019) Graph convolutional network hashing for cross-modal retrieval. 
In: the 28th international joint conference on artificial intelligence, pp 10\u201316","DOI":"10.24963\/ijcai.2019\/138"},{"issue":"6","key":"13048_CR32","doi-asserted-by":"publisher","first-page":"2400","DOI":"10.1109\/TCYB.2019.2928180","volume":"50","author":"X Xu","year":"2019","unstructured":"Xu X, Lu H, Song J, Yang Y, Shen H T, Li X (2019) Ternary adversarial networks with self-supervision for zero-shot cross-modal retrieval. IEEE Trans Cybern 50(6):2400\u20132413","journal-title":"IEEE Trans Cybern"},{"key":"13048_CR33","doi-asserted-by":"crossref","unstructured":"Xu X, Song J, Lu H, Yang Y, Shen F, Huang Z (2018) Modal-adversarial semantic learning network for extendable cross-modal retrieval. In: the 2018 ACM on international conference on multimedia retrieval, pp 46\u201354","DOI":"10.1145\/3206025.3206033"},{"issue":"11","key":"13048_CR34","doi-asserted-by":"publisher","first-page":"5563","DOI":"10.1109\/TIP.2018.2859820","volume":"27","author":"H Xue","year":"2018","unstructured":"Xue H, Chu W, Zhao Z, Cai D (2018) A better way to attend: Attention with trees for video question answering. IEEE Trans Image Process 27 (11):5563\u20135574","journal-title":"IEEE Trans Image Process"},{"issue":"11","key":"13048_CR35","doi-asserted-by":"publisher","first-page":"5600","DOI":"10.1109\/TIP.2018.2855422","volume":"27","author":"Y Yang","year":"2018","unstructured":"Yang Y, Zhou J, Ai J, Bin Y, Hanjalic A, Shen H T, Ji Y (2018) Video captioning by adversarial lstm. IEEE Trans Image Process 27(11):5600\u20135611","journal-title":"IEEE Trans Image Process"},{"key":"13048_CR36","doi-asserted-by":"crossref","unstructured":"Yu Y, Kim J, Kim G (2018) A joint sequence fusion model for video question answering and retrieval. In: the european conference on computer vision, pp 471\u2013487","DOI":"10.1007\/978-3-030-01234-2_29"},{"key":"13048_CR37","doi-asserted-by":"crossref","unstructured":"Yu Y, Ko H, Choi J, Kim G (2017) End-to-end concept word detection for video captioning, retrieval, and question answering. In: the IEEE conference on computer vision and pattern recognition, pp 3165\u20133173","DOI":"10.1109\/CVPR.2017.347"},{"issue":"2","key":"13048_CR38","doi-asserted-by":"publisher","first-page":"489","DOI":"10.1109\/TCYB.2018.2868826","volume":"50","author":"J Zhang","year":"2018","unstructured":"Zhang J, Peng Y, Yuan M (2018) Sch-gan: Semi-supervised cross-modal hashing by generative adversarial network. IEEE Trans Cybern 50(2):489\u2013502","journal-title":"IEEE Trans Cybern"},{"key":"13048_CR39","doi-asserted-by":"crossref","unstructured":"Zhang X, Zhou S, Feng J, Lai H, Li B, Pan Y, Yin J, Yan S (2017) Hashgan: attention-aware deep adversarial hashing for cross modal retrieval. arXiv:1711.09347","DOI":"10.1007\/978-3-030-01267-0_36"},{"key":"13048_CR40","doi-asserted-by":"publisher","first-page":"237","DOI":"10.1109\/TIP.2019.2930152","volume":"29","author":"T Zhuo","year":"2019","unstructured":"Zhuo T, Cheng Z, Zhang P, Wong Y, Kankanhalli M (2019) Unsupervised online video object segmentation with motion property understanding. 
IEEE Trans Image Process 29:237\u2013249","journal-title":"IEEE Trans Image Process"}],"container-title":["Multimedia Tools and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-022-13048-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11042-022-13048-y\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-022-13048-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,9,22]],"date-time":"2022-09-22T10:51:48Z","timestamp":1663843908000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11042-022-13048-y"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,5,30]]},"references-count":40,"journal-issue":{"issue":"24","published-print":{"date-parts":[[2022,10]]}},"alternative-id":["13048"],"URL":"https:\/\/doi.org\/10.1007\/s11042-022-13048-y","relation":{},"ISSN":["1380-7501","1573-7721"],"issn-type":[{"type":"print","value":"1380-7501"},{"type":"electronic","value":"1573-7721"}],"subject":[],"published":{"date-parts":[[2022,5,30]]},"assertion":[{"value":"27 August 2020","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"30 November 2021","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"4 April 2022","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"30 May 2022","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that there is no conflict of interest regarding the publication of this article.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of Interests"}}]}}
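The record above is a standard Crossref works payload (message-type "work"), retrievable from the public Crossref REST API at https://api.crossref.org/works/{DOI}. Below is a minimal sketch of fetching and parsing it with only the Python standard library; the field accesses mirror the keys visible in the record, but treat this as an illustrative sketch rather than a complete client (no retries, and it omits the optional polite-pool mailto/User-Agent header Crossref recommends).

```python
# Minimal sketch: fetch this Crossref work record and read a few fields.
# Assumes network access; uses only the Python standard library.
import json
import urllib.request

DOI = "10.1007/s11042-022-13048-y"
url = f"https://api.crossref.org/works/{DOI}"

with urllib.request.urlopen(url) as resp:
    payload = json.load(resp)

# Top-level envelope, as in the record above.
assert payload["status"] == "ok" and payload["message-type"] == "work"
work = payload["message"]

print(work["title"][0])             # title is a list of strings
print(work["container-title"][0])   # "Multimedia Tools and Applications"
print(work["volume"], work["issue"], work["page"])
print("references:", work["reference-count"])

# Reference entries carry a "key" plus either a DOI or an unstructured string.
for ref in work.get("reference", [])[:3]:
    print(ref["key"], ref.get("DOI", ref.get("unstructured", ""))[:60])
```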