{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,24]],"date-time":"2026-03-24T11:48:46Z","timestamp":1774352926237,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":50,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,7,18]],"date-time":"2023-07-18T00:00:00Z","timestamp":1689638400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"name":"AI4Media - A European Excellence Centre for Media, Society, and Democracy","award":["EC, H2020 n. 951911"],"award-info":[{"award-number":["EC, H2020 n. 951911"]}]},{"name":"SUN - Social and hUman ceNtered XR","award":["EC, Horizon Europe n. 101092612"],"award-info":[{"award-number":["EC, Horizon Europe n. 101092612"]}]},{"name":"ERDF CyberSecurity, CyberCrime and Critical Information Infrastructures Center of Excellence","award":["CZ.02.1.01\/0.0\/0.0\/16_019\/0000822"],"award-info":[{"award-number":["CZ.02.1.01\/0.0\/0.0\/16_019\/0000822"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,7,19]]},"DOI":"10.1145\/3539618.3592069","type":"proceedings-article","created":{"date-parts":[[2023,7,19]],"date-time":"2023-07-19T00:22:23Z","timestamp":1689726143000},"page":"2420-2425","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":13,"title":["Text-to-Motion Retrieval: Towards Joint Understanding of Human Motion Data and Natural Language"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3011-2487","authenticated-orcid":false,"given":"Nicola","family":"Messina","sequence":"first","affiliation":[{"name":"ISTI-CNR, Pisa, Italy"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7668-8521","authenticated-orcid":false,"given":"Jan","family":"Sedmidubsky","sequence":"additional","affiliation":[{"name":"Masaryk 
University, Brno, Czech Rep"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6258-5313","authenticated-orcid":false,"given":"Fabrizio","family":"Falchi","sequence":"additional","affiliation":[{"name":"ISTI-CNR, Pisa, Italy"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2331-7671","authenticated-orcid":false,"given":"Tom\u00e1\u0161","family":"Rebok","sequence":"additional","affiliation":[{"name":"Masaryk University, Brno, Czech Rep"}]}],"member":"320","published-online":{"date-parts":[[2023,7,18]]},"reference":[{"key":"e_1_3_2_2_1_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2004.08692"},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46454-1_24"},{"key":"e_1_3_2_2_3_1","volume-title":"ViViT: A Video Vision Transformer. In IEEE\/CVF International Conference on Computer Vision (ICCV). 6836--6846","author":"Arnab Anurag","year":"2021","unstructured":"Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lu\u010di\u0107, and Cordelia Schmid. 2021. ViViT: A Video Vision Transformer. In IEEE\/CVF International Conference on Computer Vision (ICCV). 6836--6846."},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/TVCG.2013.178"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3460426.3463646"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11042-019-07827-3"},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10791-017-9318-6"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICME51207.2021.9428459"},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3444685.3446289"},{"key":"e_1_3_2_2_10_1","volume-title":"Symposium on Interactive 3D Graphics (SI3D). ACM, 191--198","author":"Deng Z.","unstructured":"Z. Deng, Q. Gu, and Q. Li. 2009. Perceptually consistent example-based human motion retrieval. In Symposium on Interactive 3D Graphics (SI3D). 
ACM, 191--198."},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2210.05895"},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.1007\/s00530-022-00980-0"},{"key":"e_1_3_2_2_13_1","volume-title":"Clip2video: Mastering video-text retrieval via image clip. arXiv preprint arXiv:2106.11097","author":"Fang Han","year":"2021","unstructured":"Han Fang, Pengfei Xiong, Luhui Xu, and Yu Chen. 2021. Clip2video: Mastering video-text retrieval via image clip. arXiv preprint arXiv:2106.11097 (2021)."},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00143"},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00509"},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19833-5_34"},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413635"},{"key":"e_1_3_2_2_18_1","volume-title":"Symposium on Interactive 3D Graphics and Games (I3D). ACM, 19--28","author":"Kapadia M.","unstructured":"M. Kapadia, I-K. Chiang, T. Thomas, N.I. Badler, and J. T. Kider Jr. 2013. Efficient motion retrieval in large motion databases. In Symposium on Interactive 3D Graphics and Games (I3D). ACM, 19--28."},{"key":"e_1_3_2_2_19_1","volume-title":"Proceedings of NAACL-HLT. 4171--4186","author":"Ming-Wei Chang Jacob Devlin","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In Proceedings of NAACL-HLT. 4171--4186."},{"key":"e_1_3_2_2_20_1","volume-title":"Learning Joint Representation of Human Motion and Language. arXiv preprint arXiv:2210.15187","author":"Kim Jihoon","year":"2022","unstructured":"Jihoon Kim, Youngjae Yu, Seungyoun Shin, Taehyun Byun, and Sungjoon Choi. 2022. Learning Joint Representation of Human Motion and Language. 
arXiv preprint arXiv:2210.15187 (2022)."},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413548"},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3477495.3531776"},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2022.07.028"},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413505"},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00554"},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3451390"},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CBMI50038.2021.9461890"},{"key":"e_1_3_2_2_28_1","volume-title":"Andrea Esuli, and Fabrizio Falchi.","author":"Messina Nicola","year":"2022","unstructured":"Nicola Messina, Davide Alessandro Coccomini, Andrea Esuli, and Fabrizio Falchi. 2022a. Transformer-Based Multi-modal Proposal and Re-Rank for Wikipedia Image-Caption Matching. arXiv preprint arXiv:2206.10436 (2022)."},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR48806.2021.9413172"},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3549555.3549576"},{"key":"e_1_3_2_2_31_1","volume-title":"Eurographics\/ACM SIGGRAPH Symposium on Computer Animation (SCA). Eurographics Assoc., 157--166","author":"Numaguchi N.","unstructured":"N. Numaguchi, A. Nakazawa, T. Shiratori, and J. K. Hodgins. 2011. A Puppet Interface for Retrieval of Motion Capture Data. In Eurographics\/ACM SIGGRAPH Symposium on Computer Animation (SCA). 
Eurographics Assoc., 157--166."},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-29888-3_3"},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2021.107921"},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01080"},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","DOI":"10.1089\/big.2016.0028"},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2103.00020"},{"key":"e_1_3_2_2_37_1","unstructured":"Alec Radford Jeffrey Wu Rewon Child David Luan Dario Amodei Ilya Sutskever et al. 2019. Language models are unsupervised multitask learners. OpenAI blog Vol. 1 8 (2019) 9."},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-45439-5_35"},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-28238-6_8"},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2021.3075766"},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01939"},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2018.2818328"},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/HUMANOIDS.2014.7041470"},{"key":"e_1_3_2_2_44_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2209.14916"},{"key":"e_1_3_2_2_45_1","volume-title":"Attention is all you need. Advances in neural information processing systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems, Vol. 
30 (2017)."},{"key":"e_1_3_2_2_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2022.3194350"},{"key":"e_1_3_2_2_47_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2301.06052"},{"key":"e_1_3_2_2_48_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2208.15001"},{"key":"e_1_3_2_2_49_1","volume-title":"Contrastive learning of medical visual representations from paired images and text. arXiv preprint arXiv:2010.00747","author":"Zhang Yuhao","year":"2020","unstructured":"Yuhao Zhang, Hang Jiang, Yasuhide Miura, Christopher D Manning, and Curtis P Langlotz. 2020. Contrastive learning of medical visual representations from paired images and text. arXiv preprint arXiv:2010.00747 (2020)."},{"key":"e_1_3_2_2_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/3477495.3531950"}],"event":{"name":"SIGIR '23: The 46th International ACM SIGIR Conference on Research and Development in Information Retrieval","location":"Taipei Taiwan","acronym":"SIGIR '23","sponsor":["SIGIR ACM Special Interest Group on Information Retrieval"]},"container-title":["Proceedings of the 46th International ACM SIGIR Conference on Research and Development in Information 
Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3539618.3592069","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3539618.3592069","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T16:38:03Z","timestamp":1750178283000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3539618.3592069"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,7,18]]},"references-count":50,"alternative-id":["10.1145\/3539618.3592069","10.1145\/3539618"],"URL":"https:\/\/doi.org\/10.1145\/3539618.3592069","relation":{},"subject":[],"published":{"date-parts":[[2023,7,18]]},"assertion":[{"value":"2023-07-18","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}