{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T19:44:58Z","timestamp":1776887098397,"version":"3.51.2"},"reference-count":63,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["92048205"],"award-info":[{"award-number":["92048205"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Shanghai Municipal Science and Technology Major Project","award":["2021SHZDZX0102"],"award-info":[{"award-number":["2021SHZDZX0102"]}]},{"name":"Jiangsu Technology Project","award":["BE2022059-2"],"award-info":[{"award-number":["BE2022059-2"]}]},{"name":"Guangxi major science and technology project","award":["AA23062062"],"award-info":[{"award-number":["AA23062062"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Multimedia"],"published-print":{"date-parts":[[2024]]},"DOI":"10.1109\/tmm.2024.3443614","type":"journal-article","created":{"date-parts":[[2024,8,23]],"date-time":"2024-08-23T17:36:56Z","timestamp":1724434616000},"page":"11126-11138","source":"Crossref","is-referenced-by-count":9,"title":["Towards Weakly Supervised Text-to-Audio Grounding"],"prefix":"10.1109","volume":"26","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8718-1278","authenticated-orcid":false,"given":"Xuenan","family":"Xu","sequence":"first","affiliation":[{"name":"Department of Computer Science and Engineering, Shanghai Jiao Tong University, Shanghai, China"}]},{"given":"Ziyang","family":"Ma","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5599-8707","authenticated-orcid":false,"given":"Mengyue","family":"Wu","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7102-9826","authenticated-orcid":false,"given":"Kai","family":"Yu","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, Shanghai Jiao Tong University, Shanghai, China"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2024.3353578"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095687"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2024.3369529"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414834"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096526"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.303"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.563"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00674"},{"key":"ref9","first-page":"1","article-title":"Weakly-supervised video object grounding from text by loss weighting and object interaction","volume-title":"Proc. Brit. Mach. Vis. Conf.","author":"Zhou","year":"2018"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2021.3096087"},{"key":"ref11","first-page":"15039","article-title":"Advancing visual grounding with scene knowledge: Benchmark and method","volume-title":"Proc. IEEE\/CVF Comput. Soc. Conf. Comput. Vis. Pattern Recognit.","author":"Song","year":"2023"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01813"},{"key":"ref13","first-page":"119","article-title":"AudioCaps: Generating captions for audios in the wild","volume-title":"Proc. Conf. North Amer. Chapter Assoc. Comput. Linguist.","author":"Kim","year":"2019"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9052990"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747336"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSPW59220.2023.10192960"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00434"},{"key":"ref18","first-page":"49542","article-title":"Exploiting contextual objects and relations for 3D visual grounding","volume-title":"Proc. Int. Conf. Adv. Neural Inf. Process. Syst.","volume":"36","author":"Yang","year":"2024"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1168"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1145\/3560815"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2023.3311917"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00269"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3481539"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01186"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46448-0_49"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00270"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58580-8_44"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2020.2991592"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1049\/cit2.12216"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2020.3044997"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3054313"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-330"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01805"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1016\/j.dsp.2023.104347"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1810.04805"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1410"},{"key":"ref37","first-page":"1298","article-title":"Data2vec: A general framework for self-supervised learning in speech, vision and language","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Baevski","year":"2022"},{"key":"ref38","first-page":"1","article-title":"vq-wav2vec: Self-supervised learning of discrete speech representations","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Baevski","year":"2019"},{"key":"ref39","first-page":"12449","article-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","volume-title":"Proc. Int. Adv. Conf. Neural Inf. Process. Syst.","author":"Baevski","year":"2020"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-822"},{"key":"ref42","first-page":"1","article-title":"BEiT: BERT pre-training of image transformers","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Bao","year":"2021"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D17-1096"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095969"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2024.3419446"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10448504"},{"key":"ref48","first-page":"1889","article-title":"Deep fragment embeddings for bidirectional image sentence mapping","volume":"27","author":"Karpathy","year":"2014","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682847"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01070"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1470"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2020.3030497"},{"key":"ref55","article-title":"The SJTU system for DCASE2022 challenge task 6: Audio captioning with audio-text retrieval pre-training","author":"Xu","year":"2022"},{"key":"ref56","article-title":"Query-graph with cross-gating attention model for text-to-audio grounding","author":"Tang","year":"2021"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9052995"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747556"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053396"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746312"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054478"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414579"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3120633"}],"container-title":["IEEE Transactions on Multimedia"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/6046\/10384483\/10645318.pdf?arnumber=10645318","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,27]],"date-time":"2024-11-27T00:29:56Z","timestamp":1732667396000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10645318\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"references-count":63,"URL":"https:\/\/doi.org\/10.1109\/tmm.2024.3443614","relation":{},"ISSN":["1520-9210","1941-0077"],"issn-type":[{"value":"1520-9210","type":"print"},{"value":"1941-0077","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024]]}}}