{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,1]],"date-time":"2025-11-01T22:28:59Z","timestamp":1762036139282,"version":"build-2065373602"},"reference-count":29,"publisher":"IEEE","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021,1,24]]},"DOI":"10.1109\/iscslp49672.2021.9362117","type":"proceedings-article","created":{"date-parts":[[2021,3,1]],"date-time":"2021-03-01T23:37:05Z","timestamp":1614641825000},"page":"1-5","source":"Crossref","is-referenced-by-count":8,"title":["Audio Caption in a Car Setting with a Sentence-Level Loss"],"prefix":"10.1109","author":[{"given":"Xuenan","family":"Xu","sequence":"first","affiliation":[]},{"given":"Heinrich","family":"Dinkel","sequence":"additional","affiliation":[]},{"given":"Mengyue","family":"Wu","sequence":"additional","affiliation":[]},{"given":"Kai","family":"Yu","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P17-1117"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.450"},{"key":"ref12","first-page":"2528","article-title":"Multimodal attention for fusion of audio and spatiotemporal features for video description","author":"hori","year":"2018","journal-title":"Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition Workshops (CVPRW)"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2017.8268961"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.548"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1162"},{"key":"ref16","first-page":"1188","article-title":"Distributed representations of sentences and documents","author":"le","year":"2014","journal-title":"Proc International Conference on Machine Learning (ICML)"},{"key":"ref17","first-page":"6294","article-title":"Learned in translation: Contextualized word vectors","author":"mccann","year":"2017","journal-title":"Proc Conference on Neural Information Processing Systems (NIPS)"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N18-1202"},{"article-title":"Improving language understanding by generative pre-training","year":"2018","author":"radford","key":"ref19"},{"key":"ref28","first-page":"8026","article-title":"PyTorch: An Imperative Style, High-Performance Deep Learning Library","author":"paszke","year":"0"},{"key":"ref4","article-title":"The SJTU submission for DCASE2020 task 6: A CRNN-GRU based reinforcement learning approach to audiocaption","author":"xu","year":"2020","journal-title":"Tech Rep DCASE2016 Challenge"},{"key":"ref27","first-page":"1","article-title":"Adam: A Method for Stochastic Optimization","author":"kingma","year":"0"},{"key":"ref3","article-title":"Automated audio captioning with temporal attention","author":"wang","year":"2020","journal-title":"Tech Rep DCASE2016 Challenge"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461809"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.131"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682377"},{"key":"ref8","first-page":"119","article-title":"AudioCaps: Generating captions for audios in the wild","author":"kim","year":"0"},{"article-title":"Sequence level training with recurrent neural networks","year":"2015","author":"ranzato","key":"ref7"},{"key":"ref2","article-title":"Audio captioning based on transformer and pre-training for 2020 DCASE audio captioning challenge","author":"wu","year":"2020","journal-title":"Tech Rep DCASE2016 Challenge"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9052990"},{"key":"ref1","article-title":"The NTT DCASE2020 challenge task 6 system: Automated audio captioning with keywords and sentence length estimation","author":"koizumi","year":"2020","journal-title":"Tech Rep DCASE2016 Challenge"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N18-1049"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"ref21","first-page":"4171","article-title":"Bert: Pre-training of deep bidirectional transformers for language under-standing","author":"devlin","year":"2019","journal-title":"Proc Conference of the North American Chapter of the Association for Computational Linguistics (NAACL)"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"ref23","first-page":"2048","article-title":"Show, attend and tell: Neural image caption generation with visual attention","author":"xu","year":"2015","journal-title":"Proc International Conference on Machine Learning (ICML)"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/K18-2016"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1613\/jair.3994"}],"event":{"name":"2021 12th International Symposium on Chinese Spoken Language Processing (ISCSLP)","start":{"date-parts":[[2021,1,24]]},"location":"Hong Kong","end":{"date-parts":[[2021,1,27]]}},"container-title":["2021 12th International Symposium on Chinese Spoken Language Processing (ISCSLP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9362048\/9362049\/09362117.pdf?arnumber=9362117","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2021,3,29]],"date-time":"2021-03-29T22:56:29Z","timestamp":1617058589000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9362117\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,1,24]]},"references-count":29,"URL":"https:\/\/doi.org\/10.1109\/iscslp49672.2021.9362117","relation":{},"subject":[],"published":{"date-parts":[[2021,1,24]]}}}