{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T02:42:22Z","timestamp":1775011342403,"version":"3.50.1"},"publisher-location":"Singapore","reference-count":21,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819688883","type":"print"},{"value":"9789819688890","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T00:00:00Z","timestamp":1751328000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T00:00:00Z","timestamp":1751328000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-981-96-8889-0_12","type":"book-chapter","created":{"date-parts":[[2025,6,30]],"date-time":"2025-06-30T08:57:24Z","timestamp":1751273844000},"page":"137-148","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["SYNCAD: Synchronised Yields from\u00a0Narrative Cross Modal Audio and\u00a0Data"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-8873-1792","authenticated-orcid":false,"given":"Aditya Vardhan","family":"Madivada","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-6488-5630","authenticated-orcid":false,"given":"Madhav Kartheek","family":"Bhumireddi","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7463-5936","authenticated-orcid":false,"given":"K. E.","family":"Srinivasa Desikan","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,7,1]]},"reference":[{"key":"12_CR1","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. arXiv preprint arXiv:2006.11239 (2020)"},{"key":"12_CR2","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. arXiv preprint arXiv:2112.10752 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"12_CR3","unstructured":"Wang, T., Liu, X., Zhang, Y.: Neural audio generation with diffusion models. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2023)"},{"key":"12_CR4","unstructured":"Sheffer, A., Adi, M.: Diffusion models for music generation. In: Proceedings of the 2023 International Conference on Learning Representations (ICLR) (2023)"},{"key":"12_CR5","unstructured":"Copet, J., et al.: DiffWave: a versatile diffusion model for audio synthesis. In: Proceedings of the 2023 IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP) (2023)"},{"key":"12_CR6","unstructured":"Kreuk, F., et al.: A diffusion model for speech synthesis and audio generation. arXiv preprint arXiv:2210.09632 (2022)"},{"key":"12_CR7","unstructured":"Hassid, A., et al.: Multimodal audio-visual generation using diffusion models. In: Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing (EMNLP) (2023)"},{"key":"12_CR8","unstructured":"Touvron, H., et al.: LLaMA: open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)"},{"key":"12_CR9","unstructured":"Brown, T.B., et al.: Language models are few-shot learners. In: Advances in Neural Information Processing Systems (NeurIPS) (2020)"},{"key":"12_CR10","unstructured":"Mama, Y., et al.: Audio-driven talking head generation using visual-audio consistency loss. In: Proceedings of the 2021 IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP) (2021)"},{"key":"12_CR11","unstructured":"Park, J., et al.: Speech-driven video generation of talking heads using temporal smoothing and attention. IEEE Trans. Pattern Anal. Mach. Intell. (TPAMI) (2022)"},{"key":"12_CR12","unstructured":"Kumar, A., et al.: Voice2Face: speech-driven facial reenactment using temporal consistency. In: Proceedings of the European Conference on Computer Vision (ECCV) (2020)"},{"key":"12_CR13","unstructured":"Chatterjee, R., Cherian, A.: Audio-driven video continuation using visual-temporal alignment. In: Proceedings of the 2020 Conference on Computer Vision and Pattern Recognition (CVPR) (2020)"},{"key":"12_CR14","unstructured":"Ge, Y., et al.: Audio-visual scene generation with diffusion models. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2022)"},{"key":"12_CR15","unstructured":"Hao, X., Guan, Z., Zhang, Y.: Audio-driven video synthesis using neural generative models. In: Proceedings of the IEEE Conference on Multimedia Information Retrieval (ICMR) (2022)"},{"key":"12_CR16","unstructured":"Ruan, Z., et al.: Generative models for real-time audio-visual synthesis. In: Proceedings of the 2023 International Conference on Multimedia and Expo (ICME) (2023)"},{"key":"12_CR17","doi-asserted-by":"crossref","unstructured":"Chen, X., et al.: VGGSound: a large-scale dataset for audio-visual event detection. In: Proceedings of the IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP) (2020)","DOI":"10.1109\/ICASSP40776.2020.9053174"},{"key":"12_CR18","doi-asserted-by":"crossref","unstructured":"Gemmeke, J.F., et al.: AudioSet: an ontology and human-labeled dataset for audio events. In: Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (2017)","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"12_CR19","unstructured":"Lee, J., et al.: Landscape dataset for generative audio-visual models. In: Proceedings of the 2022 Conference on Computer Vision and Pattern Recognition (CVPR) (2022)"},{"key":"12_CR20","unstructured":"Bosch, J.J., Janer, J., Fuhrmann, F., Herrera, P.: A comparison of sound segregation techniques for predominant instrument recognition in musical audio signals. In: International Symposium\/Conference on Music Information Retrieval, pp. 559\u2013564 (2012)"},{"key":"12_CR21","doi-asserted-by":"publisher","unstructured":"Dynamic time warping. In Springer eBooks, pp. 69\u201384 (2007). https:\/\/doi.org\/10.1007\/978-3-540-74048-3_4","DOI":"10.1007\/978-3-540-74048-3_4"}],"container-title":["Lecture Notes in Computer Science","Advances and Trends in Artificial Intelligence. Theory and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-96-8889-0_12","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T01:33:54Z","timestamp":1775007234000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-96-8889-0_12"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7,1]]},"ISBN":["9789819688883","9789819688890"],"references-count":21,"URL":"https:\/\/doi.org\/10.1007\/978-981-96-8889-0_12","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,7,1]]},"assertion":[{"value":"1 July 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"IEA\/AIE","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Industrial, Engineering and Other Applications of Applied Intelligent Systems","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Kytakyushu","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Japan","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"1 July 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 July 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"38","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ieaaie2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/www.i-somet.org\/iea-aie2025\/committees.html","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}