{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,27]],"date-time":"2025-03-27T20:30:15Z","timestamp":1743107415175,"version":"3.40.3"},"publisher-location":"Cham","reference-count":40,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031666933"},{"type":"electronic","value":"9783031666940"}],"license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024]]},"DOI":"10.1007\/978-3-031-66694-0_16","type":"book-chapter","created":{"date-parts":[[2024,8,22]],"date-time":"2024-08-22T06:09:21Z","timestamp":1724306961000},"page":"259-276","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Action Conditioned Attention Encoder-Decoder and\u00a0Discriminator for\u00a0Human Motion Generation"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-7339-8425","authenticated-orcid":false,"given":"Chaitanya","family":"Bandi","sequence":"first","affiliation":[]},{"given":"Ulrike","family":"Thomas","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,8,21]]},"reference":[{"key":"16_CR1","doi-asserted-by":"crossref","unstructured":"Ahn, H., Ha, T., Choi, Y., Yoo, H., Oh, S.: Text2action: generative adversarial synthesis from language to action. In: 2018 IEEE International Conference on Robotics and Automation (ICRA), pp.\u00a01\u20135 (2017). https:\/\/api.semanticscholar.org\/CorpusID:10408622","DOI":"10.1109\/ICRA.2018.8460608"},{"key":"16_CR2","doi-asserted-by":"crossref","unstructured":"Ahuja, C., Morency, L.P.: Language2pose: natural language grounded pose forecasting. In: 2019 International Conference on 3D Vision (3DV), pp. 719\u2013728 (2019). https:\/\/api.semanticscholar.org\/CorpusID:195776094","DOI":"10.1109\/3DV.2019.00084"},{"key":"16_CR3","doi-asserted-by":"crossref","unstructured":"Aksan, E., Kaufmann, M., Hilliges, O.: Structured prediction helps 3d human motion modelling. In: 2019 IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 7143\u20137152 (2019). https:\/\/api.semanticscholar.org\/CorpusID:204800967","DOI":"10.1109\/ICCV.2019.00724"},{"key":"16_CR4","unstructured":"Arjovsky, M., Chintala, S., Bottou, L.: Wasserstein GAN. ArXiv abs\/1701.07875 (2017). https:\/\/api.semanticscholar.org\/CorpusID:13943041"},{"key":"16_CR5","doi-asserted-by":"publisher","unstructured":"Bandi, C., Thomas, U.: Skeleton-based action recognition for human-robot interaction using self-attention mechanism. In: 2021 16th IEEE International Conference on Automatic Face and Gesture Recognition (FG 2021), pp.\u00a01\u20138 (2021). https:\/\/doi.org\/10.1109\/FG52635.2021.9666948","DOI":"10.1109\/FG52635.2021.9666948"},{"key":"16_CR6","doi-asserted-by":"crossref","unstructured":"Bao, J., Chen, D., Wen, F., Li, H., Hua, G.: CVAE-GAN: fine-grained image generation through asymmetric training. In: 2017 IEEE International Conference on Computer Vision (ICCV), pp. 2764\u20132773 (2017). https:\/\/api.semanticscholar.org\/CorpusID:266050344","DOI":"10.1109\/ICCV.2017.299"},{"key":"16_CR7","unstructured":"Chu, X., Tian, Z., Zhang, B., Wang, X., Shen, C.: Conditional positional encodings for vision transformers (2021). https:\/\/api.semanticscholar.org\/CorpusID:256827775"},{"key":"16_CR8","unstructured":"Company: ART\u00a0solutions, G.: Advanced real-time tracking system,art-human motion capture assistance tool. https:\/\/ar-tracking.com\/en\/product-program\/art-human"},{"key":"16_CR9","doi-asserted-by":"crossref","unstructured":"Dabral, R., Mughal, M.H., Golyanik, V., Theobalt, C.: Mofusion: a framework for denoising-diffusion-based motion synthesis. In: Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.00941"},{"key":"16_CR10","doi-asserted-by":"crossref","unstructured":"Degardin, B., Neves, J.C., Lopes, V., de\u00a0Brito, J.B., Yaghoubi, E., Proencca, H.M.: Generative adversarial graph convolutional networks for human action synthesis. In: 2022 IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV), pp. 2753\u20132762 (2021). https:\/\/api.semanticscholar.org\/CorpusID:239050181","DOI":"10.1109\/WACV51458.2022.00281"},{"key":"16_CR11","doi-asserted-by":"crossref","unstructured":"Fragkiadaki, K., Levine, S., Malik, J.: Recurrent network models for kinematic tracking. ArXiv abs\/1508.00271 (2015). https:\/\/api.semanticscholar.org\/CorpusID:15294095","DOI":"10.1109\/ICCV.2015.494"},{"key":"16_CR12","unstructured":"Goodfellow, I.J., et al.: Generative adversarial nets. In: Neural Information Processing Systems (2014). https:\/\/api.semanticscholar.org\/CorpusID:261560300"},{"key":"16_CR13","doi-asserted-by":"crossref","unstructured":"Guo, C., et al.: Action2motion: conditioned generation of 3d human motions. In: Proceedings of the 28th ACM International Conference on Multimedia (2020). https:\/\/api.semanticscholar.org\/CorpusID:220870974","DOI":"10.1145\/3394171.3413635"},{"key":"16_CR14","doi-asserted-by":"crossref","unstructured":"Habibie, I., Holden, D., Schwarz, J., Yearsley, J., Komura, T.: A recurrent variational autoencoder for human motion synthesis. In: British Machine Vision Conference (2017). https:\/\/api.semanticscholar.org\/CorpusID:9123693","DOI":"10.5244\/C.31.119"},{"key":"16_CR15","unstructured":"Higgins, I., et al.: beta-VAE: learning basic visual concepts with a constrained variational framework. In: International Conference on Learning Representations (2017). https:\/\/openreview.net\/forum?id=Sy2fzU9gl"},{"key":"16_CR16","doi-asserted-by":"crossref","unstructured":"Jain, A., Zamir, A.R., Savarese, S., Saxena, A.: Structural-RNN: deep learning on spatio-temporal graphs. In: 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 5308\u20135317 (2015). https:\/\/api.semanticscholar.org\/CorpusID:563473","DOI":"10.1109\/CVPR.2016.573"},{"key":"16_CR17","doi-asserted-by":"crossref","unstructured":"Karunratanakul, K., Preechakul, K., Suwajanakorn, S., Tang, S.: Guided motion diffusion for controllable human motion synthesis. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2151\u20132162 (2023)","DOI":"10.1109\/ICCV51070.2023.00205"},{"key":"16_CR18","unstructured":"Lee, H.Y., Yang, X., Liu, M.Y., Wang, T.C., Lu, Y.D., Yang, M.H., Kautz, J.: Dancing to music. In: Neural Information Processing Systems (2019). https:\/\/api.semanticscholar.org\/CorpusID:207780070"},{"key":"16_CR19","doi-asserted-by":"crossref","unstructured":"Li, R., Yang, S., Ross, D.A., Kanazawa, A.: Ai choreographer: music conditioned 3d dance generation with aist++. In: 2021 IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 13381\u201313392 (2021). https:\/\/api.semanticscholar.org\/CorpusID:236882798","DOI":"10.1109\/ICCV48922.2021.01315"},{"key":"16_CR20","doi-asserted-by":"crossref","unstructured":"Liu, J., Shahroudy, A., Perez, M., Wang, G., yu\u00a0Duan, L., Kot, A.C.: NTU RGB+D 120: a large-scale benchmark for 3d human activity understanding. IEEE Trans. Pattern Anal. Mach. Intell. 42, 2684\u20132701 (2019). https:\/\/api.semanticscholar.org\/CorpusID:152282878","DOI":"10.1109\/TPAMI.2019.2916873"},{"key":"16_CR21","doi-asserted-by":"crossref","unstructured":"Loper, M., Mahmood, N., Romero, J., Pons-Moll, G., Black, M.J.: SMPL: a skinned multi-person linear model. ACM Trans. Graphics (Proc. SIGGRAPH Asia) 34(6), 248:1\u2013248:16 (2015)","DOI":"10.1145\/2816795.2818013"},{"key":"16_CR22","doi-asserted-by":"crossref","unstructured":"Lu, Q., Zhang, Y., Lu, M., Roychowdhury, V.P.: Action-conditioned on-demand motion generation. In: Proceedings of the 30th ACM International Conference on Multimedia (2022). https:\/\/api.semanticscholar.org\/CorpusID:250627031","DOI":"10.1145\/3503161.3548287"},{"key":"16_CR23","doi-asserted-by":"crossref","unstructured":"Mao, W., Liu, M., Salzmann, M., Li, H.: Learning trajectory dependencies for human motion prediction. In: 2019 IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 9488\u20139496 (2019)","DOI":"10.1109\/ICCV.2019.00958"},{"key":"16_CR24","doi-asserted-by":"crossref","unstructured":"Martinez, J., Black, M.J., Romero, J.: On human motion prediction using recurrent neural networks. In: 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 4674\u20134683 (2017). https:\/\/api.semanticscholar.org\/CorpusID:645845","DOI":"10.1109\/CVPR.2017.497"},{"key":"16_CR25","doi-asserted-by":"crossref","unstructured":"Petrovich, M., Black, M.J., Varol, G.: Action-conditioned 3d human motion synthesis with transformer VAE. In: 2021 IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 10965\u201310975 (2021). https:\/\/api.semanticscholar.org\/CorpusID:233210075","DOI":"10.1109\/ICCV48922.2021.01080"},{"key":"16_CR26","doi-asserted-by":"crossref","unstructured":"Shahroudy, A., Liu, J., Ng, T.T., Wang, G.: NTU RGB+D: a large scale dataset for 3d human activity analysis. 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 1010\u20131019 (2016). https:\/\/api.semanticscholar.org\/CorpusID:15928602","DOI":"10.1109\/CVPR.2016.115"},{"key":"16_CR27","unstructured":"Sohn, K., Lee, H., Yan, X.: Learning structured output representation using deep conditional generative models. In: Cortes, C., Lawrence, N., Lee, D., Sugiyama, M., Garnett, R. (eds.) Advances in Neural Information Processing Systems, vol.\u00a028. Curran Associates, Inc. (2015)"},{"key":"16_CR28","doi-asserted-by":"crossref","unstructured":"Szegedy, C., Vanhoucke, V., Ioffe, S., Shlens, J., Wojna, Z.: Rethinking the inception architecture for computer vision. In: 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 2818\u20132826 (2015). https:\/\/api.semanticscholar.org\/CorpusID:206593880","DOI":"10.1109\/CVPR.2016.308"},{"key":"16_CR29","unstructured":"Tevet, G., Raab, S., Gordon, B., Shafir, Y., Cohen-or, D., Bermano, A.H.: Human motion diffusion model. In: The Eleventh International Conference on Learning Representations (2023). https:\/\/openreview.net\/forum?id=SJ1kSyO2jwu"},{"key":"16_CR30","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Neural Information Processing Systems (2017). https:\/\/api.semanticscholar.org\/CorpusID:13756489"},{"key":"16_CR31","doi-asserted-by":"crossref","unstructured":"Xu, L., et al.: ActFormer: a GAN-based transformer towards general action-conditioned 3d human motion generation. In: 2023 IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 2228\u20132238 (2022). https:\/\/api.semanticscholar.org\/CorpusID:253801491","DOI":"10.1109\/ICCV51070.2023.00212"},{"key":"16_CR32","doi-asserted-by":"publisher","unstructured":"Yan, S., Li, Z., Xiong, Y., Yan, H., Lin, D.: Convolutional sequence generation for skeleton-based action synthesis. In: 2019 IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 4393\u20134401 (2019). https:\/\/doi.org\/10.1109\/ICCV.2019.00449","DOI":"10.1109\/ICCV.2019.00449"},{"key":"16_CR33","doi-asserted-by":"crossref","unstructured":"Yan, S., Xiong, Y., Lin, D.: Spatial temporal graph convolutional networks for skeleton-based action recognition. In: AAAI Conference on Artificial Intelligence (2018). https:\/\/api.semanticscholar.org\/CorpusID:19167105","DOI":"10.1609\/aaai.v32i1.12328"},{"key":"16_CR34","unstructured":"Yan, X., et al.: MT-VAE: learning motion transformations to generate multimodal human dynamics. ArXiv abs\/1808.04545 (2018). https:\/\/api.semanticscholar.org\/CorpusID:52003444"},{"key":"16_CR35","unstructured":"Yu, T., Yin, H., Zhu, Z.: Spatio-temporal graph convolutional neural network: a deep learning framework for traffic forecasting. ArXiv abs\/1709.04875 (2017). https:\/\/api.semanticscholar.org\/CorpusID:4972291"},{"key":"16_CR36","unstructured":"Zhang, M., et al.: Motiondiffuse: text-driven human motion generation with diffusion model. arXiv preprint arXiv:2208.15001 (2022)"},{"key":"16_CR37","unstructured":"Zhang, Y., Black, M.J., Tang, S.: Perpetual motion: generating unbounded human motion. ArXiv abs\/2007.13886 (2020). https:\/\/api.semanticscholar.org\/CorpusID:220831301"},{"key":"16_CR38","doi-asserted-by":"crossref","unstructured":"Zhong, C., Hu, L., Zhang, Z., Xia, S.: Attt2m: text-driven human motion generation with multi-perspective attention mechanism. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 509\u2013519, October 2023","DOI":"10.1109\/ICCV51070.2023.00053"},{"key":"16_CR39","unstructured":"Zou, S., Zuo, X., Qian, Y., Wang, S., Xu, C., Gong, M., Cheng, L.: Polarization human shape and pose dataset. ArXiv abs\/2004.14899 (2020). https:\/\/api.semanticscholar.org\/CorpusID:216867173"},{"key":"16_CR40","doi-asserted-by":"publisher","first-page":"1617","DOI":"10.1109\/TMM.2020.3001506","volume":"23","author":"X Zuo","year":"2021","unstructured":"Zuo, X., et al.: SparseFusion: Dynamic human avatar modeling from sparse RGBD images. IEEE Trans. Multimedia 23, 1617\u20131629 (2021)","journal-title":"IEEE Trans. Multimedia"}],"container-title":["Communications in Computer and Information Science","Deep Learning Theory and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-66694-0_16","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,8,22]],"date-time":"2024-08-22T06:12:34Z","timestamp":1724307154000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-66694-0_16"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"ISBN":["9783031666933","9783031666940"],"references-count":40,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-66694-0_16","relation":{},"ISSN":["1865-0929","1865-0937"],"issn-type":[{"type":"print","value":"1865-0929"},{"type":"electronic","value":"1865-0937"}],"subject":[],"published":{"date-parts":[[2024]]},"assertion":[{"value":"21 August 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"DeLTA","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Deep Learning Theory and Applications","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Dijon","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"France","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"10 July 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"11 July 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"5","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"delta2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/delta.scitevents.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}