{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,28]],"date-time":"2025-03-28T03:35:08Z","timestamp":1743132908726,"version":"3.40.3"},"publisher-location":"Cham","reference-count":31,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783031301070"},{"type":"electronic","value":"9783031301087"}],"license":[{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023]]},"DOI":"10.1007\/978-3-031-30108-7_33","type":"book-chapter","created":{"date-parts":[[2023,4,12]],"date-time":"2023-04-12T04:03:04Z","timestamp":1681272184000},"page":"391-402","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Enhance Gesture Recognition via\u00a0Visual-Audio Modal Embedding"],"prefix":"10.1007","author":[{"given":"Yiting","family":"Cao","sequence":"first","affiliation":[]},{"given":"Yuchun","family":"Fang","sequence":"additional","affiliation":[]},{"given":"Shiwei","family":"Xiao","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,4,13]]},"reference":[{"key":"33_CR1","doi-asserted-by":"crossref","unstructured":"Abavisani, M., Joze, H.R.V., Patel, V.M.: Improving the performance of unimodal dynamic hand-gesture recognition with multimodal training. In: CVPR, pp. 1165\u20131174 (2019)","DOI":"10.1109\/CVPR.2019.00126"},{"issue":"12","key":"33_CR2","doi-asserted-by":"publisher","first-page":"8717","DOI":"10.1109\/TPAMI.2018.2889052","volume":"44","author":"T Afouras","year":"2018","unstructured":"Afouras, T., Chung, J.S., Senior, A., Vinyals, O., Zisserman, A.: Deep audio-visual speech recognition. IEEE Trans. Pattern Anal. Mach. Intell. 44(12), 8717\u20138727 (2018)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"33_CR3","doi-asserted-by":"crossref","unstructured":"Brousmiche, M., Rouat, J., Dupont, S.: Audio-visual fusion and conditioning with neural networks for event recognition. In: MLSP, pp. 1\u20136 (2019)","DOI":"10.1109\/MLSP.2019.8918712"},{"key":"33_CR4","doi-asserted-by":"crossref","unstructured":"Chang, J.Y., Tejero-de Pablos, A., Harada, T.: Improved optical flow for gesture-based human-robot interaction. In: ICRA, pp. 7983\u20137989 (2019)","DOI":"10.1109\/ICRA.2019.8793825"},{"key":"33_CR5","first-page":"1109","volume":"11","author":"G Chechik","year":"2010","unstructured":"Chechik, G.: Sharma, varun, Shalit, Uri, Bengio, Samy: large scale online learning of image similarity through ranking. J. Mach. Learn. Res. 11, 1109\u20131135 (2010)","journal-title":"J. Mach. Learn. Res."},{"key":"33_CR6","doi-asserted-by":"crossref","unstructured":"Chen, X., Guo, H., Wang, G., Zhang, L.: Motion feature augmented recurrent neural network for skeleton-based dynamic hand gesture recognition. In: ICIP, pp. 2881\u20132885 (2017)","DOI":"10.1109\/ICIP.2017.8296809"},{"key":"33_CR7","doi-asserted-by":"crossref","unstructured":"Cho, K., Van Merri\u00ebnboer, B., Bahdanau, D., Bengio, Y.: On the properties of neural machine translation: Encoder-decoder approaches. arXiv preprint arXiv:1409.1259 (2014)","DOI":"10.3115\/v1\/W14-4012"},{"issue":"7","key":"33_CR8","doi-asserted-by":"publisher","first-page":"1880","DOI":"10.1109\/TMM.2018.2889563","volume":"21","author":"R Cui","year":"2019","unstructured":"Cui, R., Liu, H., Zhang, C.: A deep neural framework for continuous sign language recognition by iterative training. IEEE Trans. Multimedia 21(7), 1880\u20131891 (2019)","journal-title":"IEEE Trans. Multimedia"},{"issue":"4","key":"33_CR9","doi-asserted-by":"publisher","first-page":"1002","DOI":"10.1109\/TPAMI.2017.2700390","volume":"40","author":"C Ding","year":"2016","unstructured":"Ding, C., Tao, D.: Trunk-branch ensemble convolutional neural networks for video-based face recognition. IEEE Trans. Pattern Anal. Mach. Intell. 40(4), 1002\u20131014 (2016)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"33_CR10","doi-asserted-by":"crossref","unstructured":"Eyben, F., W\u00f6llmer, M., Schuller, B.: Opensmile: the munich versatile and fast open-source audio feature extractor. In: Proceedings of the 18th ACM International Conference on Multimedia, pp. 1459\u20131462 (2010)","DOI":"10.1145\/1873951.1874246"},{"issue":"5","key":"33_CR11","doi-asserted-by":"publisher","first-page":"473","DOI":"10.1162\/089892999563544","volume":"11","author":"P Giard","year":"1999","unstructured":"Giard, P.: Auditory-visual integration during multimodal object recognition in humans: a behavioral and electrophysiological study. J. Cogn. Neurosci. 11(5), 473\u2013490 (1999)","journal-title":"J. Cogn. Neurosci."},{"key":"33_CR12","unstructured":"Goldstein, E.B., Brockmole, J.: Sensation and perception. In: Cengage Learning (2016)"},{"issue":"5","key":"33_CR13","doi-asserted-by":"publisher","first-page":"737","DOI":"10.3813\/AAA.919214","volume":"104","author":"J Han","year":"2018","unstructured":"Han, J., Zhang, Z., Keren, G., Schuller, B.: Emotion recognition in speech with latent discriminative representations learning. Acta Acustica united with Acustica 104(5), 737\u2013740 (2018)","journal-title":"Acta Acustica united with Acustica"},{"key":"33_CR14","doi-asserted-by":"crossref","unstructured":"Huang, J., gang Zhou, W., Li, H., Li, W.: Attention-based 3d-cnns for large-vocabulary sign language recognition. IEEE Trans. Circ. Syst. Video Technol. 29, 2822\u20132832 (2019)","DOI":"10.1109\/TCSVT.2018.2870740"},{"issue":"12","key":"33_CR15","doi-asserted-by":"publisher","first-page":"12549","DOI":"10.1016\/j.aej.2022.05.043","volume":"61","author":"A Khan","year":"2022","unstructured":"Khan, A., et al.: Packerrobo: model-based robot vision self supervised learning in cart. Alexandria Eng. J. 61(12), 12549\u201312566 (2022)","journal-title":"Alexandria Eng. J."},{"key":"33_CR16","doi-asserted-by":"publisher","first-page":"4342","DOI":"10.1109\/TMM.2021.3115626","volume":"24","author":"M Kim","year":"2021","unstructured":"Kim, M., Hong, J., Park, S.J., Ro, Y.M.: Cromm-vsr: cross-modal memory augmented visual speech recognition. IEEE Trans. Multimedia 24, 4342\u20134355 (2021)","journal-title":"IEEE Trans. Multimedia"},{"key":"33_CR17","doi-asserted-by":"publisher","first-page":"84","DOI":"10.1145\/3065386","volume":"60","author":"A Krizhevsky","year":"2012","unstructured":"Krizhevsky, A., Sutskever, I., Hinton, G.E.: ImageNet classification with deep convolutional neural networks. Commun. ACM 60, 84\u201390 (2012)","journal-title":"Commun. ACM"},{"key":"33_CR18","doi-asserted-by":"crossref","unstructured":"Kumar, A., Khadkevich, M., F\u00fcgen, C.: Knowledge transfer from weakly labeled audio using convolutional neural network for sound events and scenes. In: ICASSP, pp. 326\u2013330. IEEE (2018)","DOI":"10.1109\/ICASSP.2018.8462200"},{"key":"33_CR19","doi-asserted-by":"crossref","unstructured":"Liu, J., Furusawa, K., Tateyama, T., Iwamoto, Y., Chen, Y.W.: An improved hand gesture recognition with two-stage convolution neural networks using a hand color image and its pseudo-depth image. In: ICIP, pp. 375\u2013379 (2019)","DOI":"10.1109\/ICIP.2019.8802970"},{"key":"33_CR20","doi-asserted-by":"crossref","unstructured":"Mar\u00e9chal, C., et al.: Survey on AI-based multimodal methods for emotion detection. In: High-Performance Modelling and Simulation for Big Data Applications (2019)","DOI":"10.1007\/978-3-030-16272-6_11"},{"key":"33_CR21","doi-asserted-by":"crossref","unstructured":"McFee, B., et al.: librosa: Audio and music signal analysis in python. In: Proceedings of the 14th Python in Science Conference, vol. 8, pp. 18\u201325 (2015)","DOI":"10.25080\/Majora-7b98e3ed-003"},{"key":"33_CR22","doi-asserted-by":"crossref","unstructured":"Mullick, K., Namboodiri, A.M.: Learning deep and compact models for gesture recognition. In: ICIP (2017)","DOI":"10.1109\/ICIP.2017.8297033"},{"key":"33_CR23","doi-asserted-by":"crossref","unstructured":"Nguyen, X.S., Brun, L., L\u00e9zoray, O., Bougleux, S.: A neural network based on SPD manifold learning for skeleton-based hand gesture recognition. In: CVPR, pp. 12036\u201312045 (2019)","DOI":"10.1109\/CVPR.2019.01231"},{"key":"33_CR24","doi-asserted-by":"crossref","unstructured":"Praveen, R.G., Granger, E., Cardinal, P.: Cross attentional audio-visual fusion for dimensional emotion recognition. In: FG 2021, pp. 1\u20138 (2021)","DOI":"10.1109\/FG52635.2021.9667055"},{"issue":"1","key":"33_CR25","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1007\/s10462-012-9356-9","volume":"43","author":"SS Rautaray","year":"2015","unstructured":"Rautaray, S.S., Agrawal, A.: Vision based hand gesture recognition for human computer interaction: a survey. Artif. Intell. Rev. 43(1), 1\u201354 (2015)","journal-title":"Artif. Intell. Rev."},{"key":"33_CR26","doi-asserted-by":"crossref","unstructured":"Schroff, F., Kalenichenko, D., Philbin, J.: Facenet: a unified embedding for face recognition and clustering. In: CVPR, pp. 815\u2013823 (2015)","DOI":"10.1109\/CVPR.2015.7298682"},{"key":"33_CR27","doi-asserted-by":"crossref","unstructured":"Shi, L., Zhang, Y., Hu, J., Cheng, J., Lu, H.: Gesture recognition using spatiotemporal deformable convolutional representation. In: ICIP, pp. 1900\u20131904 (2019)","DOI":"10.1109\/ICIP.2019.8803152"},{"key":"33_CR28","unstructured":"Simonyan, K., Zisserman, A.: Very deep convolutional networks for large-scale image recognition. In: ICLR (2015). http:\/\/arxiv.org\/abs\/1409.1556"},{"key":"33_CR29","doi-asserted-by":"publisher","first-page":"21","DOI":"10.1016\/j.patcog.2018.02.011","volume":"80","author":"J Tang","year":"2018","unstructured":"Tang, J., Cheng, H., Zhao, Y., Guo, H.: Structured dynamic time warping for continuous hand trajectory gesture recognition. Pattern Recogn. 80, 21\u201331 (2018)","journal-title":"Pattern Recogn."},{"key":"33_CR30","doi-asserted-by":"crossref","unstructured":"Tran, D., Bourdev, L., Fergus, R., Torresani, L., Paluri, M.: Learning spatiotemporal features with 3d convolutional networks. In: ICCV, pp. 4489\u20134497 (2015)","DOI":"10.1109\/ICCV.2015.510"},{"issue":"8","key":"33_CR31","doi-asserted-by":"publisher","first-page":"1583","DOI":"10.1109\/TPAMI.2016.2537340","volume":"38","author":"D Wu","year":"2016","unstructured":"Wu, D., et al.: Deep dynamic neural networks for multimodal gesture segmentation and recognition. IEEE Trans. Pattern Anal. Mach. Intell. 38(8), 1583\u20131597 (2016)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."}],"container-title":["Lecture Notes in Computer Science","Neural Information Processing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-30108-7_33","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,4,12]],"date-time":"2023-04-12T04:09:47Z","timestamp":1681272587000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-30108-7_33"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023]]},"ISBN":["9783031301070","9783031301087"],"references-count":31,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-30108-7_33","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2023]]},"assertion":[{"value":"13 April 2023","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICONIP","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Neural Information Processing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"New Delhi","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"India","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2022","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"22 November 2022","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"26 November 2022","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"iconip2022","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/iconip2022.apnns.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Single-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Easy Chair","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"810","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"359","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"44% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"2.65","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"ICONIP 2022 consists of a two-volume set, LNCS & CCIS, which includes 146 and 213 papers","order":10,"name":"additional_info_on_review_process","label":"Additional Info on Review Process","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}