{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T10:57:46Z","timestamp":1765018666121,"version":"3.46.0"},"publisher-location":"Cham","reference-count":48,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031736353"},{"type":"electronic","value":"9783031736360"}],"license":[{"start":{"date-parts":[[2024,11,5]],"date-time":"2024-11-05T00:00:00Z","timestamp":1730764800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,5]],"date-time":"2024-11-05T00:00:00Z","timestamp":1730764800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73636-0_19","type":"book-chapter","created":{"date-parts":[[2024,11,4]],"date-time":"2024-11-04T15:04:55Z","timestamp":1730732695000},"page":"323-339","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["Chronologically Accurate Retrieval for\u00a0Temporal Grounding of\u00a0Motion-Language Models"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-2205-6115","authenticated-orcid":false,"given":"Kent","family":"Fujiwara","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8259-6658","authenticated-orcid":false,"given":"Mikihiro","family":"Tanaka","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6965-9581","authenticated-orcid":false,"given":"Qing","family":"Yu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,5]]},"reference":[{"key":"19_CR1","doi-asserted-by":"crossref","unstructured":"Athanasiou, N., Petrovich, M., Black, M.J., Varol, G.: Teach: temporal action composition for 3D humans. In: 3DV (2022)","DOI":"10.1109\/3DV57658.2022.00053"},{"key":"19_CR2","doi-asserted-by":"crossref","unstructured":"Borsos, Z., et\u00a0al.: AudioLM: a language modeling approach to audio generation. IEEE\/ACM TASLP (2023)","DOI":"10.1109\/TASLP.2023.3288409"},{"key":"19_CR3","unstructured":"Brown, T., et\u00a0al.: Language models are few-shot learners. In: NeurIPS (2020)"},{"key":"19_CR4","unstructured":"Burgert, R., Ranasinghe, K., Li, X., Ryoo, M.S.: Peekaboo: text to image diffusion models are zero-shot segmentors. arXiv preprint arXiv:2211.13224 (2022)"},{"key":"19_CR5","doi-asserted-by":"crossref","unstructured":"Chefer, H., Alaluf, Y., Vinker, Y., Wolf, L., Cohen-Or, D.: Attend-and-excite: attention-based semantic guidance for text-to-image diffusion models. TOG (2023)","DOI":"10.1145\/3592116"},{"key":"19_CR6","doi-asserted-by":"crossref","unstructured":"Chen, X., et al.: Executing your commands via motion diffusion in latent space. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.01726"},{"key":"19_CR7","unstructured":"Dhariwal, P., Nichol, A.: Diffusion models beat GANs on image synthesis. In: NeurIPS (2021)"},{"key":"19_CR8","doi-asserted-by":"crossref","unstructured":"Duan, H., Zhao, Y., Chen, K., Lin, D., Dai, B.: Revisiting skeleton-based action recognition. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.00298"},{"key":"19_CR9","doi-asserted-by":"crossref","unstructured":"Ghosh, A., Cheema, N., Oguz, C., Theobalt, C., Slusallek, P.: Synthesis of compositional animations from textual descriptions. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.00143"},{"key":"19_CR10","doi-asserted-by":"crossref","unstructured":"Guo, C., et al.: Generating diverse and natural 3D human motions from text. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.00509"},{"key":"19_CR11","doi-asserted-by":"crossref","unstructured":"Guo, C., et al.: Action2motion: conditioned generation of 3D human motions. In: ACM MM (2020)","DOI":"10.1145\/3394171.3413635"},{"key":"19_CR12","doi-asserted-by":"crossref","unstructured":"Ionescu, C., Papava, D., Olaru, V., Sminchisescu, C.: Human3. 6m: large scale datasets and predictive methods for 3D human sensing in natural environments. IEEE TPAMI (2013)","DOI":"10.1109\/TPAMI.2013.248"},{"key":"19_CR13","unstructured":"Jung, M., Jang, Y., Choi, S., Kim, J., Kim, J.H., Zhang, B.T.: Overcoming weak visual-textual alignment for video moment retrieval. arXiv preprint arXiv:2306.02728 (2023)"},{"key":"19_CR14","doi-asserted-by":"crossref","unstructured":"Kalakonda, S.S., Maheshwari, S., Sarvadevabhatla, R.K.: Action-GPT: leveraging large-scale language models for improved and generalized zero shot action generation. arXiv preprint arXiv:2211.15603 (2022)","DOI":"10.1109\/ICME55011.2023.00014"},{"key":"19_CR15","doi-asserted-by":"crossref","unstructured":"Kirillov, A., et\u00a0al.: Segment anything. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"19_CR16","doi-asserted-by":"crossref","unstructured":"Ko, D., et al.: Video-text representation learning via differentiable weak temporal alignment. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.00496"},{"key":"19_CR17","unstructured":"Kreuk, F., et al.: AudioGen: textually guided audio generation. In: ICLR (2023)"},{"key":"19_CR18","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: Blip-2: bootstrapping language-image pre-training with frozen image encoders and large language models. In: ICML (2023)"},{"key":"19_CR19","unstructured":"Li, J., Li, D., Xiong, C., Hoi, S.: Blip: bootstrapping language-image pre-training for unified vision-language understanding and generation. In: ICML (2022)"},{"key":"19_CR20","unstructured":"Lin, J., et al.: Motion-X: a large-scale 3D expressive whole-body human motion dataset. In: NeurIPS (2023)"},{"key":"19_CR21","doi-asserted-by":"crossref","unstructured":"Liu, X., Yin, J., Liu, H., Yin, Y.: PISEP 2: pseudo-image sequence evolution-based 3D pose prediction. Vis. Comput. (2022)","DOI":"10.1007\/s00371-021-02135-0"},{"key":"19_CR22","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. In: ICLR (2019)"},{"key":"19_CR23","doi-asserted-by":"crossref","unstructured":"Mahmood, N., Ghorbani, N., Troje, N.F., Pons-Moll, G., Black, M.J.: Amass: archive of motion capture as surface shapes. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00554"},{"key":"19_CR24","doi-asserted-by":"crossref","unstructured":"Miki, D., Chen, S., Demachi, K.: Weakly supervised graph convolutional neural network for human action localization. In: WACV (2020)","DOI":"10.1109\/WACV45572.2020.9093551"},{"key":"19_CR25","doi-asserted-by":"crossref","unstructured":"Petrovich, M., Black, M.J., Varol, G.: Action-conditioned 3D human motion synthesis with transformer VAE. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.01080"},{"key":"19_CR26","doi-asserted-by":"crossref","unstructured":"Petrovich, M., Black, M.J., Varol, G.: TEMOS: generating diverse human motions from textual descriptions. In: ECCV (2022)","DOI":"10.1007\/978-3-031-20047-2_28"},{"key":"19_CR27","doi-asserted-by":"crossref","unstructured":"Petrovich, M., Black, M.J., Varol, G.: TMR: text-to-motion retrieval using contrastive 3D human motion synthesis. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.00870"},{"key":"19_CR28","doi-asserted-by":"crossref","unstructured":"Petrovich, M., et al.: Multi-track timeline control for text-driven 3D human motion generation (2024). arXiv preprint arXiv:2401.08559","DOI":"10.1109\/CVPRW63382.2024.00197"},{"key":"19_CR29","doi-asserted-by":"crossref","unstructured":"Plappert, M., Mandery, C., Asfour, T.: The KIT motion-language dataset. Big Data (2016)","DOI":"10.1089\/big.2016.0028"},{"key":"19_CR30","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: ICML (2021)"},{"issue":"1","key":"19_CR31","first-page":"5485","volume":"21","author":"C Raffel","year":"2020","unstructured":"Raffel, C., et al.: Exploring the limits of transfer learning with a unified text-to-text transformer. J. Mach. Learn. Res. 21(1), 5485\u20135551 (2020)","journal-title":"J. Mach. Learn. Res."},{"key":"19_CR32","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"19_CR33","unstructured":"Sanh, V., Debut, L., Chaumond, J., Wolf, T.: DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter. In: NeurIPS-W (2019)"},{"key":"19_CR34","unstructured":"Shafir, Y., Tevet, G., Kapon, R., Bermano, A.H.: PriorMDM: human motion diffusion as a generative prior. In: ICLR (2024)"},{"key":"19_CR35","doi-asserted-by":"crossref","unstructured":"Shahroudy, A., Liu, J., Ng, T.T., Wang, G.: NTU RGB+ D: a large scale dataset for 3D human activity analysis. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.115"},{"key":"19_CR36","doi-asserted-by":"crossref","unstructured":"Shi, L., Zhang, Y., Cheng, J., Lu, H.: Two-stream adaptive graph convolutional networks for skeleton-based action recognition. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.01230"},{"key":"19_CR37","doi-asserted-by":"crossref","unstructured":"Taheri, O., Ghorbani, N., Black, M.J., Tzionas, D.: Grab: a dataset of whole-body human grasping of objects. In: ECCV (2020)","DOI":"10.1007\/978-3-030-58548-8_34"},{"key":"19_CR38","doi-asserted-by":"crossref","unstructured":"Tevet, G., Gordon, B., Hertz, A., Bermano, A.H., Cohen-Or, D.: MotionCLIP: exposing human motion generation to clip space. In: ECCV (2022)","DOI":"10.1007\/978-3-031-20047-2_21"},{"key":"19_CR39","unstructured":"Tevet, G., Raab, S., Gordon, B., Shafir, Y., Cohen-Or, D., Bermano, A.H.: Human motion diffusion model. In: ICLR (2023)"},{"key":"19_CR40","unstructured":"Touvron, H., et\u00a0al.: Llama 2: open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)"},{"key":"19_CR41","doi-asserted-by":"crossref","unstructured":"Yan, S., Xiong, Y., Lin, D.: Spatial temporal graph convolutional networks for skeleton-based action recognition. In: AAAI (2018)","DOI":"10.1609\/aaai.v32i1.12328"},{"key":"19_CR42","doi-asserted-by":"crossref","unstructured":"Yu, Q., Fujiwara, K.: Frame-level label refinement for skeleton-based weakly-supervised action recognition. In: AAAI (2023)","DOI":"10.1609\/aaai.v37i3.25439"},{"key":"19_CR43","doi-asserted-by":"crossref","unstructured":"Yu, Q., Tanaka, M., Fujiwara, K.: Exploring vision transformers for 3D human motion-language models with motion patches. In: CVPR (2024)","DOI":"10.1109\/CVPR52733.2024.00095"},{"key":"19_CR44","unstructured":"Yuksekgonul, M., Bianchi, F., Kalluri, P., Jurafsky, D., Zou, J.: When and why vision-language models behave like bag-of-words models, and what to do about it? In: ICLR (2023)"},{"key":"19_CR45","doi-asserted-by":"crossref","unstructured":"Zhang, H., Liu, D., Lv, Z., Su, B., Tao, D.: Exploring temporal concurrency for video-language representation learning. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.01427"},{"key":"19_CR46","doi-asserted-by":"crossref","unstructured":"Zhang, J., et al.: T2M-GPT: generating human motion from textual descriptions with discrete representations. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.01415"},{"key":"19_CR47","unstructured":"Zhang, M., et al.: MotionDiffuse: text-driven human motion generation with diffusion model. arXiv preprint arXiv:2208.15001 (2022)"},{"key":"19_CR48","doi-asserted-by":"crossref","unstructured":"Zhang, M., et al.: RemoDiffuse: retrieval-augmented motion diffusion model. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.00040"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73636-0_19","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T10:53:07Z","timestamp":1765018387000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73636-0_19"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,5]]},"ISBN":["9783031736353","9783031736360"],"references-count":48,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73636-0_19","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,11,5]]},"assertion":[{"value":"5 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}