{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,5,29]],"date-time":"2025-05-29T04:01:45Z","timestamp":1748491305404,"version":"3.41.0"},"reference-count":34,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,4,6]],"date-time":"2025-04-06T00:00:00Z","timestamp":1743897600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,4,6]],"date-time":"2025-04-06T00:00:00Z","timestamp":1743897600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,4,6]]},"DOI":"10.1109\/icasspw65056.2025.11011192","type":"proceedings-article","created":{"date-parts":[[2025,5,27]],"date-time":"2025-05-27T17:05:14Z","timestamp":1748365514000},"page":"1-5","source":"Crossref","is-referenced-by-count":0,"title":["IndicST: Indian Multilingual Translation Corpus For Evaluating Speech Large Language Models"],"prefix":"10.1109","author":[{"given":"Sanket","family":"Shah","sequence":"first","affiliation":[{"name":"Krutrim AI,Bangalore"}]},{"given":"Kavya Ranjan","family":"Saxena","sequence":"additional","affiliation":[{"name":"IIT Kanpur"}]},{"given":"Kancharana Manideep","family":"Bharadwaj","sequence":"additional","affiliation":[{"name":"Krutrim AI,Bangalore"}]},{"given":"Sharath","family":"Adavanne","sequence":"additional","affiliation":[{"name":"Krutrim AI,Bangalore"}]},{"given":"Nagaraj","family":"Adiga","sequence":"additional","affiliation":[{"name":"Krutrim AI,Bangalore"}]}],"member":"263","reference":[{"article-title":"Listen, think, and understand","year":"2023","author":"Gong","key":"ref1"},{"article-title":"Macaw-llm: Multi-modal language modeling with image, audio, video, and text integration","year":"2023","author":"Lyu","key":"ref2"},{"article-title":"X-llm: Bootstrapping advanced large language models by treating multi-modalities as foreign languages","year":"2023","author":"Chen","key":"ref3"},{"article-title":"Pandagpt: One model to instruction-follow them all","year":"2023","author":"Su","key":"ref4"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-demo.49"},{"article-title":"Audiopalm: A large language model that can speak and listen","year":"2023","author":"Rubenstein","key":"ref6"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.1055"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-emnlp.263"},{"article-title":"Qwen-audio: Advancing universal audio understanding via unified large-scale audio-language models","year":"2023","author":"Chu","key":"ref9"},{"article-title":"Salmonn: Towards generic hearing abilities for large language models","year":"2023","author":"Tang","key":"ref10"},{"key":"ref11","first-page":"28 492","article-title":"Robust speech recognition via large-scale weak supervision","volume-title":"International conference on machine learning","author":"Radford"},{"key":"ref12","first-page":"5178","article-title":"Beats: Audio pre-training with acoustic tokenizers","volume-title":"International Conference on Machine Learning","author":"Chen"},{"issue":"3","key":"ref13","first-page":"6","article-title":"Vicuna: An open-source chatbot impressing gpt-4 with 90%* chatgpt quality","volume":"2","author":"Chiang","year":"2023"},{"key":"ref14","first-page":"19 730","article-title":"Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","volume-title":"International conference on machine learning","author":"Li"},{"article-title":"Lora: Low-rank adaptation of large language models","volume-title":"International Conference on Learning Representations","author":"Hu","key":"ref15"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i11.26521"},{"article-title":"Spring-inx: A multilingual indian language speech corpus by spring lab, iit madras","year":"2023","author":"Gangwar","key":"ref17"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.639"},{"article-title":"Audiobench: A universal benchmark for audio large language models","year":"2024","author":"Wang","key":"ref19"},{"article-title":"Indictrans2: Towards high-quality and accessible machine translation models for all 22 scheduled indian languages","year":"2023","author":"Gala","key":"ref20"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2021-1339"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2012-659"},{"key":"ref23","first-page":"4211","article-title":"Common voice: A massively-multilingual speech corpus","volume-title":"Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020)","author":"Ardila"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-2376"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096933"},{"article-title":"Google crowdsourced speech corpora and related open-source resources for low-resource languages and dialects: an overview","year":"2020","author":"Butryna","key":"ref26"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/SLT54892.2023.10023141"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-2588"},{"key":"ref29","article-title":"Styletts 2: Towards human-level text-to-speech through style diffusion and adversarial training with large speech language models","volume":"36","author":"Li","year":"2024","journal-title":"Advances in Neural Information Processing Systems"},{"article-title":"Nemo: A toolkit for building ai applications using neural modules.(2019)","year":"1909","author":"Kuchaiev","key":"ref30"},{"key":"ref31","first-page":"1298","article-title":"Data2vec: A general framework for self-supervised learning in speech, vision and language","volume-title":"International Conference on Machine Learning","author":"Baevski"},{"article-title":"Llama: Open and efficient foundation language models","year":"2023","author":"Touvron","key":"ref32"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2024-938"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.3115\/1225403.1225421"}],"event":{"name":"2025 IEEE International Conference on Acoustics, Speech, and Signal Processing Workshops (ICASSPW)","start":{"date-parts":[[2025,4,6]]},"location":"Hyderabad, India","end":{"date-parts":[[2025,4,11]]}},"container-title":["2025 IEEE International Conference on Acoustics, Speech, and Signal Processing Workshops (ICASSPW)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11010992\/11010997\/11011192.pdf?arnumber=11011192","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,5,28]],"date-time":"2025-05-28T04:53:41Z","timestamp":1748408021000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11011192\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4,6]]},"references-count":34,"URL":"https:\/\/doi.org\/10.1109\/icasspw65056.2025.11011192","relation":{},"subject":[],"published":{"date-parts":[[2025,4,6]]}}}