{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,7,27]],"date-time":"2025-07-27T07:14:53Z","timestamp":1753600493489,"version":"3.40.3"},"publisher-location":"Singapore","reference-count":37,"publisher":"Springer Nature Singapore","isbn-type":[{"type":"print","value":"9789819980789"},{"type":"electronic","value":"9789819980796"}],"license":[{"start":{"date-parts":[[2023,11,14]],"date-time":"2023-11-14T00:00:00Z","timestamp":1699920000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,11,14]],"date-time":"2023-11-14T00:00:00Z","timestamp":1699920000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024]]},"DOI":"10.1007\/978-981-99-8079-6_14","type":"book-chapter","created":{"date-parts":[[2023,11,13]],"date-time":"2023-11-13T16:02:42Z","timestamp":1699891362000},"page":"173-187","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["Multimodal Isotropic Neural Architecture with\u00a0Patch Embedding"],"prefix":"10.1007","author":[{"given":"Hubert","family":"Truchan","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Evgenii","family":"Naumov","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Rezaul","family":"Abedin","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Gregory","family":"Palmer","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zahra","family":"Ahmadi","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2023,11,14]]},"reference":[{"key":"14_CR1","doi-asserted-by":"crossref","unstructured":"van Amsterdam, B., Kadkhodamohammadi, A., Luengo, I., Stoyanov, D.: Aspnet: action segmentation with shared-private representation of multiple data sources. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 2384\u20132393 (2023)","DOI":"10.1109\/CVPR52729.2023.00236"},{"key":"14_CR2","doi-asserted-by":"crossref","unstructured":"Aslam, M.H., Zeeshan, M.O., Pedersoli, M., Koerich, A.L., Bacon, S., Granger, E.: Privileged knowledge distillation for dimensional emotion recognition in the wild. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 3337\u20133346 (2023)","DOI":"10.1109\/CVPRW59228.2023.00336"},{"key":"14_CR3","unstructured":"Bai, S., Kolter, J.Z., Koltun, V.: An empirical evaluation of generic convolutional and recurrent networks for sequence modeling. arXiv:1803.01271 (2018)"},{"key":"14_CR4","unstructured":"Bonner, L.E.R., Buhl, D.D., Kristensen, K., Navarro-Guerrero, N.: Au dataset for visuo-haptic object recognition for robots. arXiv preprint arXiv:2112.13761 (2021)"},{"key":"14_CR5","first-page":"5834","volume":"34","author":"S Chen","year":"2021","unstructured":"Chen, S., Guhur, P.L., Schmid, C., Laptev, I.: History aware multimodal transformer for vision-and-language navigation. Adv. Neural Inform. Process. Syst. (NeurIPS) 34, 5834\u20135847 (2021)","journal-title":"Adv. Neural Inform. Process. Syst. (NeurIPS)"},{"key":"14_CR6","doi-asserted-by":"publisher","first-page":"259","DOI":"10.1016\/j.inffus.2019.02.010","volume":"51","author":"JH Choi","year":"2019","unstructured":"Choi, J.H., Lee, J.S.: Embracenet: a robust deep learning architecture for multimodal classification. Inform. Fusion 51, 259\u2013270 (2019)","journal-title":"Inform. Fusion"},{"issue":"1","key":"14_CR7","doi-asserted-by":"publisher","first-page":"745","DOI":"10.1038\/s41597-022-01843-z","volume":"9","author":"G Cicirelli","year":"2022","unstructured":"Cicirelli, G., et al.: The ha4m dataset: multi-modal monitoring of an assembly task for human action recognition in manufacturing. Sci. Data 9(1), 745 (2022)","journal-title":"Sci. Data"},{"key":"14_CR8","doi-asserted-by":"crossref","unstructured":"Cubuk, E.D., Zoph, B., Mane, D., Vasudevan, V., Le, Q.V.: Autoaugment: Learning augmentation policies from data. arXiv preprint arXiv:1805.09501 (2018)","DOI":"10.1109\/CVPR.2019.00020"},{"key":"14_CR9","unstructured":"Dosovitskiy, A., et al.: An image is worth 16x16 words: transformers for image recognition at scale. In: International Conference on Learning Representation (ICLR) (2021)"},{"issue":"18","key":"14_CR10","doi-asserted-by":"publisher","first-page":"7429","DOI":"10.1007\/s11042-014-1986-2","volume":"74","author":"C Eroglu Erdem","year":"2015","unstructured":"Eroglu Erdem, C., Turan, C., Aydin, Z.: Baum-2: a multilingual audio-visual affective face database. Multimed. Tools Appl. 74(18), 7429\u20137459 (2015)","journal-title":"Multimed. Tools Appl."},{"issue":"1","key":"14_CR11","doi-asserted-by":"publisher","first-page":"537","DOI":"10.1038\/s41597-022-01643-5","volume":"9","author":"S Gashi","year":"2022","unstructured":"Gashi, S., Min, C., Montanari, A., Santini, S., Kawsar, F.: A multidevice and multimodal dataset for human energy expenditure estimation using wearable devices. Sci. Data 9(1), 537 (2022)","journal-title":"Sci. Data"},{"key":"14_CR12","doi-asserted-by":"crossref","unstructured":"Geng, T., Wang, T., Duan, J., Cong, R., Zheng, F.: Dense-localizing audio-visual events in untrimmed videos: A large-scale benchmark and baseline. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 22942\u201322951 (2023)","DOI":"10.1109\/CVPR52729.2023.02197"},{"key":"14_CR13","doi-asserted-by":"crossref","unstructured":"Girdhar, R., et al.: Imagebind: one embedding space to bind them all. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 15180\u201315190 (2023)","DOI":"10.1109\/CVPR52729.2023.01457"},{"key":"14_CR14","doi-asserted-by":"crossref","unstructured":"Gong, X., et al.: MMG-ego4D: multimodal generalization in egocentric action recognition. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 6481\u20136491 (2023)","DOI":"10.1109\/CVPR52729.2023.00627"},{"key":"14_CR15","unstructured":"Hendrycks, D., Gimpel, K.: Gaussian error linear units (gelus). arXiv preprint arXiv:1606.08415 (2016)"},{"key":"14_CR16","unstructured":"Lee, S.H., Lee, S., Song, B.C.: Vision transformer for small-size datasets. arXiv preprint arXiv:2112.13492 (2021)"},{"key":"14_CR17","doi-asserted-by":"crossref","unstructured":"Li, Y., Quan, R., Zhu, L., Yang, Y.: Efficient multimodal fusion via interactive prompting. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 2604\u20132613 (2023)","DOI":"10.1109\/CVPR52729.2023.00256"},{"key":"14_CR18","doi-asserted-by":"crossref","unstructured":"Lialin, V., Rawls, S., Chan, D., Ghosh, S., Rumshisky, A., Hamza, W.: Scalable and accurate self-supervised multimodal representation learning without aligned video and text data. In: IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV), pp. 390\u2013400 (2023)","DOI":"10.1109\/WACVW58289.2023.00043"},{"key":"14_CR19","doi-asserted-by":"crossref","unstructured":"Lin, Y.B., Sung, Y.L., Lei, J., Bansal, M., Bertasius, G.: Vision transformers are parameter-efficient audio-visual learners. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 2299\u20132309 (2023)","DOI":"10.1109\/CVPR52729.2023.00228"},{"key":"14_CR20","first-page":"11449","volume":"34","author":"YB Lin","year":"2021","unstructured":"Lin, Y.B., Tseng, H.Y., Lee, H.Y., Lin, Y.Y., Yang, M.H.: Exploring cross-video and cross-modality signals for weakly-supervised audio-visual video parsing. Adv. Neural Inform. Process. Syst. (NeurIPS) 34, 11449\u201311461 (2021)","journal-title":"Adv. Neural Inform. Process. Syst. (NeurIPS)"},{"key":"14_CR21","unstructured":"Liu, K., Li, Y., Xu, N., Natarajan, P.: Learn to combine modalities in multimodal deep learning. arXiv preprint arXiv:1805.11730 (2018)"},{"key":"14_CR22","doi-asserted-by":"crossref","unstructured":"Liu, X., Lu, H., Yuan, J., Li, X.: Cat: causal audio transformer for audio classification. In: IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 1\u20135 (2023)","DOI":"10.1109\/ICASSP49357.2023.10096787"},{"key":"14_CR23","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. In: International Conference on Learning Representation (ICLR) (2018)"},{"key":"14_CR24","doi-asserted-by":"crossref","unstructured":"Ramazanova, M., Escorcia, V., Caba, F., Zhao, C., Ghanem, B.: Owl (observe, watch, listen): Audiovisual temporal context for localizing actions in egocentric videos. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 4879\u20134889 (2023)","DOI":"10.1109\/CVPRW59228.2023.00516"},{"key":"14_CR25","doi-asserted-by":"crossref","unstructured":"Ranganathan, H., Chakraborty, S., Panchanathan, S.: Multimodal emotion recognition using deep learning architectures. In: IEEE winter conference on Applications of Computer Vision (WACV), pp. 1\u20139 (2016)","DOI":"10.1109\/WACV.2016.7477679"},{"key":"14_CR26","doi-asserted-by":"crossref","unstructured":"Ryan, F., Jiang, H., Shukla, A., Rehg, J.M., Ithapu, V.K.: Egocentric auditory attention localization in conversations. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 14663\u201314674 (2023)","DOI":"10.1109\/CVPR52729.2023.01409"},{"key":"14_CR27","doi-asserted-by":"crossref","unstructured":"Senocak, A., Kim, J., Oh, T.H., Li, D., Kweon, I.S.: Event-specific audio-visual fusion layers: a simple and new perspective on video understanding. In: IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV), pp. 2237\u20132247 (2023)","DOI":"10.1109\/WACV56688.2023.00227"},{"key":"14_CR28","unstructured":"Wightman, R., Touvron, H., J\u00e9gou, H.: Resnet strikes back: an improved training procedure in timm. arXiv preprint arXiv:2110.00476 (2021)"},{"key":"14_CR29","unstructured":"Wijekoon, A., Wiratunga, N., Cooper, K.: Mex: multi-modal exercises dataset for human activity recognition. arXiv preprint arXiv:1908.08992 (2019)"},{"key":"14_CR30","doi-asserted-by":"crossref","unstructured":"Wu, H., et al.: Cvt: introducing convolutions to vision transformers. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 22\u201331 (2021)","DOI":"10.1109\/ICCV48922.2021.00009"},{"key":"14_CR31","doi-asserted-by":"crossref","unstructured":"Xiao, Y., Ma, Y., Li, S., Zhou, H., Liao, R., Li, X.: Semanticac: semantics-assisted framework for audio classification. In: IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 1\u20135 (2023)","DOI":"10.1109\/ICASSP49357.2023.10096319"},{"key":"14_CR32","doi-asserted-by":"crossref","unstructured":"Xu, R., Feng, R., Zhang, S.X., Hu, D.: Mmcosine: multi-modal cosine loss towards balanced audio-visual fine-grained learning. In: IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 1\u20135 (2023)","DOI":"10.1109\/ICASSP49357.2023.10096655"},{"key":"14_CR33","doi-asserted-by":"crossref","unstructured":"Xue, Z., Marculescu, R.: Dynamic multimodal fusion. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 2574\u20132583 (2023)","DOI":"10.1109\/CVPRW59228.2023.00256"},{"key":"14_CR34","doi-asserted-by":"crossref","unstructured":"Zhang, X., Tang, X., Zong, L., Liu, X., Mu, J.: Deep multimodal clustering with cross reconstruction. In: Pacific-Asia Conference on Knowledge Discovery and Data Mining (PAKDD), pp. 305\u2013317 (2020)","DOI":"10.1007\/978-3-030-47426-3_24"},{"key":"14_CR35","doi-asserted-by":"crossref","unstructured":"Zhang, Z., et al.: Abaw5 challenge: a facial affect recognition approach utilizing transformer encoder and audiovisual fusion. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 5724\u20135733 (2023)","DOI":"10.1109\/CVPRW59228.2023.00607"},{"key":"14_CR36","doi-asserted-by":"crossref","unstructured":"Zhong, Z., Schneider, D., Voit, M., Stiefelhagen, R., Beyerer, J.: Anticipative feature fusion transformer for multi-modal action anticipation. In: IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV), pp. 6068\u20136077 (2023)","DOI":"10.1109\/WACV56688.2023.00601"},{"key":"14_CR37","doi-asserted-by":"crossref","unstructured":"Zhu, W., Omar, M.: Multiscale audio spectrogram transformer for efficient audio classification. In: IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 1\u20135 (2023)","DOI":"10.1109\/ICASSP49357.2023.10096513"}],"container-title":["Lecture Notes in Computer Science","Neural Information Processing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-99-8079-6_14","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,3,12]],"date-time":"2024-03-12T16:31:59Z","timestamp":1710261119000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-99-8079-6_14"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,11,14]]},"ISBN":["9789819980789","9789819980796"],"references-count":37,"URL":"https:\/\/doi.org\/10.1007\/978-981-99-8079-6_14","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2023,11,14]]},"assertion":[{"value":"14 November 2023","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICONIP","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Neural Information Processing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Changsha","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2023","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"20 November 2023","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 November 2023","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"30","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"iconip2023","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/iconip2023.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Single-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"EasyChair","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1274","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"650","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"51% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"4.14","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"2.46","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}