{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,9]],"date-time":"2026-06-09T10:57:35Z","timestamp":1781002655500,"version":"3.54.1"},"publisher-location":"Singapore","reference-count":25,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819698684","type":"print"},{"value":"9789819698691","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-981-96-9869-1_15","type":"book-chapter","created":{"date-parts":[[2025,7,24]],"date-time":"2025-07-24T13:43:59Z","timestamp":1753364639000},"page":"169-180","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Multimodal-Guided Perceptual Image Compression via Joint Text and Audio"],"prefix":"10.1007","author":[{"given":"Genhong","family":"Wang","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Wen","family":"Tan","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Youneng","family":"Bao","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Fanyang","family":"Meng","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yongsheng","family":"Liang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2025,7,25]]},"reference":[{"key":"15_CR1","unstructured":"Bellard, F.: BPG Image Format. https:\/\/bellard.org\/bpg"},{"key":"15_CR2","unstructured":"Ohm, J.-R., Sullivan, G.J.: Versatile video coding\u2013towards the next generation of video compression. In: Picture Coding Symposium (PCS), pp.124\u2013125 (2018)"},{"key":"15_CR3","unstructured":"Blau, Y., Michaeli, T.: Rethinking lossy compression: the rate-distortion-perception tradeoff. In: International Conference on Machine Learning (ICML), pp. 675\u2013685 (2019)"},{"key":"15_CR4","unstructured":"Mentzer, F., Toderici, G.D., Tschannen, M., Agustsson, E.: High-fidelity generative image compression. In: Advances in Neural Information Processing Systems (NIPS), pp. 11913\u201311924 (2020)"},{"key":"15_CR5","doi-asserted-by":"crossref","unstructured":"Jiang, X., Tan, W., Tan, T., et al.: Multi-modality deep network for extreme learned image compression. In: Proceedings of the AAAI Conference on Artificial Intelligence (AAAI) 37(1), pp. 1033\u20131041 (2023)","DOI":"10.1609\/aaai.v37i1.25184"},{"key":"15_CR6","doi-asserted-by":"crossref","unstructured":"Qin, S., Chen, B., Huang, Y., et al.: Perceptual Image Compression with Cooperative Cross-Modal Side Information. arXiv preprint arXiv: 2311.13847 (2023)","DOI":"10.2139\/ssrn.4919564"},{"key":"15_CR7","unstructured":"Lee, H., Kim, M., Kim, J.H., et al.: Neural image compression with text-guided encoding for both pixel-level and perceptual fidelity. In: International Conference on Machine Learning (ICML), pp. 26715\u201326730 (2024)"},{"key":"15_CR8","doi-asserted-by":"crossref","unstructured":"Agustsson, E., Tschannen, M., Mentzer, F., et al.: Generative adversarial networks for extreme learned image compression. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 221\u2013231 (2019)","DOI":"10.1109\/ICCV.2019.00031"},{"key":"15_CR9","unstructured":"Tschannen, M., Agustsson, E., Lucic, M.: Deep generative models for distribution preserving lossy compression. In: Advances in Neural Information Processing Systems (NIPS), pp.1\u201312(2018)"},{"key":"15_CR10","unstructured":"Muckley, M.J., El-Nouby, A., Ullrich, K., et al.: Improving statistical fidelity for neural image compression with implicit local likelihood models. In: International Conference on Machine Learning (ICML), pp. 25426\u201325443 (2023)"},{"key":"15_CR11","unstructured":"Careil, M., Muckley, M.J., Verbeek, J., et al.: Towards image compression with perfect realism at ultra-low bitrates. In: International Conference on Learning Representations (ICLR), pp.1\u201312(2024)"},{"key":"15_CR12","doi-asserted-by":"crossref","unstructured":"Bordin, T., Maugey, T.: Semantic based generative compression of images for extremely low bitrates. In: 2023 IEEE 25th International Workshop on Multimedia Signal Processing (MMSP), pp. 1\u20136 (2023)","DOI":"10.1109\/MMSP59012.2023.10337734"},{"key":"15_CR13","doi-asserted-by":"crossref","unstructured":"Lu, G., Zhong, T., Geng, J., et al.: Learning based multi-modality image and video compression. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 6083\u20136092 (2022)","DOI":"10.1109\/CVPR52688.2022.00599"},{"key":"15_CR14","doi-asserted-by":"crossref","unstructured":"Peng, B., Jing, Y., Jin, D., et al.: Texture-guided end-to-end depth map compression. In: 2022 IEEE International Conference on Image Processing (ICIP), pp. 2386\u20132390 (2022)","DOI":"10.1109\/ICIP46576.2022.9897569"},{"key":"15_CR15","doi-asserted-by":"crossref","unstructured":"He, D., Yang, Z., Peng, W., et al.: ELIC: Efficient learned image compression with unevenly grouped space-channel contextual adaptive coding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 5718\u20135727 (2022)","DOI":"10.1109\/CVPR52688.2022.00563"},{"key":"15_CR16","doi-asserted-by":"crossref","unstructured":"Liu, J., Sun, H., Katto, J.: Learned image compression with mixed transformer-CNN architectures. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 1\u201310 (2023)","DOI":"10.1109\/CVPR52729.2023.01383"},{"key":"15_CR17","doi-asserted-by":"crossref","unstructured":"Xu, T., et al.: AttnGAN: fine-grained text to image generation with attentional generative adversarial networks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 1316\u20131324 (2018)","DOI":"10.1109\/CVPR.2018.00143"},{"issue":"3","key":"15_CR18","doi-asserted-by":"publisher","first-page":"517","DOI":"10.1109\/JSTSP.2020.2987417","volume":"14","author":"J Li","year":"2020","unstructured":"Li, J., et al.: Direct speech-to-image translation. IEEE J. Selected Topics in Signal Processing 14(3), 517\u2013529 (2020)","journal-title":"IEEE J. Selected Topics in Signal Processing"},{"key":"15_CR19","unstructured":"Vaswani, A.: Attention is all you need. In: Advances in Neural Information Processing Systems (NIPS), pp. 6000\u20136010 (2017)"},{"issue":"8","key":"15_CR20","doi-asserted-by":"publisher","first-page":"7484","DOI":"10.1109\/TCSVT.2024.3370578","volume":"34","author":"D Zhan","year":"2024","unstructured":"Zhan, D., Wu, J., Luo, X., Jin, Z.: Learning from text: a multimodal face inpainting network for irregular holes. IEEE Trans. Circuits Syst. Video Technol. 34(8), 7484\u20137497 (2024)","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"15_CR21","doi-asserted-by":"crossref","unstructured":"Krizhevsky, A., Sutskever, I., Hinton, G.E.: ImageNet classification with deep convolutional neural networks. In: Advances in Neural Information Processing Systems (NIPS), pp. 84\u201390 (2012)","DOI":"10.1145\/3065386"},{"key":"15_CR22","unstructured":"Wah, C., Branson, S., Welinder, P., Perona, P., Belongie, S.: The Caltech-UCSD Birds-200\u20132011 dataset. California Institute of Technology (2011)"},{"key":"15_CR23","doi-asserted-by":"crossref","unstructured":"Nilsback, M.-E., Zisserman, A.: A visual vocabulary for flower classification. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 1447\u20131454 (2006)","DOI":"10.1109\/CVPR.2006.42"},{"key":"15_CR24","doi-asserted-by":"crossref","unstructured":"Lin, T.-Y., Maire, M., Belongie, S., et al.: Microsoft COCO: Common objects in context. In: European Conference on Computer Vision (ECCV), pp. 740\u2013755 (2014)","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"15_CR25","doi-asserted-by":"crossref","unstructured":"Havard, W., Besacier, L., Rosec, O.: SPEECH-COCO: 600 k Visually grounded spoken captions aligned to MSCOCO data set. In: International Workshop on Grounding Language Understanding (GLU), pp. 42\u201346 (2017)","DOI":"10.21437\/GLU.2017-9"}],"container-title":["Lecture Notes in Computer Science","Advanced Intelligent Computing Technology and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-96-9869-1_15","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,6,9]],"date-time":"2026-06-09T10:01:31Z","timestamp":1780999291000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-96-9869-1_15"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9789819698684","9789819698691"],"references-count":25,"URL":"https:\/\/doi.org\/10.1007\/978-981-96-9869-1_15","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"25 July 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICIC","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Intelligent Computing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Ningbo","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"26 July 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 July 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"21","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icic2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/www.ic-icc.cn\/icg\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}