{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,26]],"date-time":"2025-03-26T04:08:38Z","timestamp":1742962118672,"version":"3.40.3"},"publisher-location":"Singapore","reference-count":41,"publisher":"Springer Nature Singapore","isbn-type":[{"type":"print","value":"9789819609130"},{"type":"electronic","value":"9789819609147"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-981-96-0914-7_9","type":"book-chapter","created":{"date-parts":[[2025,1,22]],"date-time":"2025-01-22T15:07:32Z","timestamp":1737558452000},"page":"129-146","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Audio-Guided Visual Knowledge Representation"],"prefix":"10.1007","author":[{"given":"Fei","family":"Yu","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhiguo","family":"Wan","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yuehua","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,1,23]]},"reference":[{"issue":"12","key":"9_CR1","doi-asserted-by":"publisher","first-page":"1551","DOI":"10.1631\/FITEE.2100463","volume":"22","author":"Y Yang","year":"2021","unstructured":"Yang, Y., Zhuang, Y., Pan, Y.: Multiple knowledge representation for big data artificial intelligence: framework, applications, and case studies. Front. Inf. Technol. Electron. Eng. 22(12), 1551\u20131558 (2021)","journal-title":"Front. Inf. Technol. Electron. Eng."},{"issue":"8","key":"9_CR2","doi-asserted-by":"publisher","first-page":"1021","DOI":"10.1631\/FITEE.1910001","volume":"20","author":"Y Pan","year":"2019","unstructured":"Pan, Y.: On visual knowledge. Front. Inf. Technol. Electron. Eng. 20(8), 1021\u20131025 (2019)","journal-title":"Front. Inf. Technol. Electron. Eng."},{"issue":"2","key":"9_CR3","doi-asserted-by":"publisher","first-page":"423","DOI":"10.1109\/TPAMI.2018.2798607","volume":"41","author":"T Baltru\u0161aitis","year":"2018","unstructured":"Baltru\u0161aitis, T., Ahuja, C., Morency, L.-P.: Multimodal machine learning: a survey and taxonomy. IEEE Trans. Pattern Anal. Mach. Intell. 41(2), 423\u2013443 (2018)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"9_CR4","doi-asserted-by":"crossref","unstructured":"Ye, H., Li, G., Qi, Y., Wang, S., Huang, Q., Yang, M.H.: Hierarchical modular network for video captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 17939\u201317948 (2022)","DOI":"10.1109\/CVPR52688.2022.01741"},{"issue":"10","key":"9_CR5","doi-asserted-by":"publisher","first-page":"6642","DOI":"10.1109\/TCSVT.2022.3177320","volume":"32","author":"L Yan","year":"2022","unstructured":"Yan, L., et al.: Video captioning using global-local representation. IEEE Trans. Circuits Syst. Video Technol. 32(10), 6642\u20136656 (2022)","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"9_CR6","doi-asserted-by":"crossref","unstructured":"Ryu, H., Kang, S., Kang, H., Yoo, C.D.: Semantic grouping network for video captioning. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a035, pp. 2514\u20132522 (2021)","DOI":"10.1609\/aaai.v35i3.16353"},{"key":"9_CR7","doi-asserted-by":"crossref","unstructured":"Iashin, V., Rahtu, E.: A better use of audio-visual cues: dense video captioning with bi-modal transformer. arXiv preprint arXiv:2005.08271 (2020)","DOI":"10.1109\/CVPRW50498.2020.00487"},{"key":"9_CR8","doi-asserted-by":"crossref","unstructured":"Shen, X., et al.: Fine-grained audible video description. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10585\u201310596 (2023)","DOI":"10.1109\/CVPR52729.2023.01020"},{"key":"9_CR9","unstructured":"Kim, C.D., Kim, B., Lee, H., Kim, G.: Audiocaps: generating captions for audios in the wild. In: Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers), pp. 119\u2013132 (2019)"},{"key":"9_CR10","doi-asserted-by":"crossref","unstructured":"Drossos, K., Lipping, S., Virtanen, T.: Clotho: an audio captioning dataset. In: ICASSP 2020-2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 736\u2013740. IEEE (2020)","DOI":"10.1109\/ICASSP40776.2020.9052990"},{"key":"9_CR11","doi-asserted-by":"crossref","unstructured":"Wu, Y.,\u00a0Chen, K., Zhang, T., Hui, Y., Berg-Kirkpatrick, T., Dubnov, S.: Large-scale contrastive language-audio pretraining with feature fusion and keyword-to-caption augmentation. In: ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 1\u20135. IEEE (2023)","DOI":"10.1109\/ICASSP49357.2023.10095969"},{"key":"9_CR12","doi-asserted-by":"crossref","unstructured":"Mei, X., et al.: Wavcaps: a ChatGPT-assisted weakly-labelled audio captioning dataset for audio-language multimodal research. arXiv preprint arXiv:2303.17395 (2023)","DOI":"10.1109\/TASLP.2024.3419446"},{"key":"9_CR13","unstructured":"Chen, D., Dolan, W.B.: Collecting highly parallel data for paraphrase evaluation. In: Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies, pp. 190\u2013200 (2011)"},{"key":"9_CR14","doi-asserted-by":"crossref","unstructured":"Xu, J., Mei, T., Yao, T., Rui, Y.: MSR-VTT: a large video description dataset for bridging video and language. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 5288\u20135296 (2016)","DOI":"10.1109\/CVPR.2016.571"},{"key":"9_CR15","unstructured":"Torabi, A., Pal, C., Larochelle, H., Courville, A.: Using descriptive video services to create a large data source for video annotation research. arXiv preprint arXiv:1503.01070 (2015)"},{"key":"9_CR16","doi-asserted-by":"crossref","unstructured":"Rohrbach, A., Rohrbach, M., Tandon, N., Schiele, B.: A dataset for movie description. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3202\u20133212 (2015)","DOI":"10.1109\/CVPR.2015.7298940"},{"key":"9_CR17","doi-asserted-by":"crossref","unstructured":"Das, P., Xu, C., Doell, R.F., Corso, J.J.: A thousand frames in just a few words: lingual description of videos through latent topics and sparse object stitching. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2634\u20132641 (2013)","DOI":"10.1109\/CVPR.2013.340"},{"key":"9_CR18","doi-asserted-by":"crossref","unstructured":"Zhou, L., Xu, C., Corso, J.: Towards automatic learning of procedures from web instructional videos. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 32 (2018)","DOI":"10.1609\/aaai.v32i1.12342"},{"key":"9_CR19","doi-asserted-by":"crossref","unstructured":"Sigurdsson, G.A., Varol, G., Wang, X., Farhadi, A., Laptev, I., Gupta, A.: Hollywood in homes: crowdsourcing data collection for activity understanding. In: Computer Vision\u2013ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11\u201314, 2016, Proceedings, Part I 14, pp. 510\u2013526. Springer (2016)","DOI":"10.1007\/978-3-319-46448-0_31"},{"key":"9_CR20","doi-asserted-by":"crossref","unstructured":"Ji, J., Krishna, R., Fei-Fei, L., Niebles, J.C.: Action genome: actions as compositions of spatio-temporal scene graphs. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10236\u201310247 (2020)","DOI":"10.1109\/CVPR42600.2020.01025"},{"key":"9_CR21","doi-asserted-by":"crossref","unstructured":"Gella, S., Lewis, M., Rohrbach, M.: A dataset for telling the stories of social media videos. In: Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing, pp. 968\u2013974 (2018)","DOI":"10.18653\/v1\/D18-1117"},{"key":"9_CR22","doi-asserted-by":"crossref","unstructured":"Lei, J., Yu, L., Berg, T.L., Bansal, M.: TVR: a large-scale dataset for video-subtitle moment retrieval. In: Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XXI 16, pp. 447\u2013463. Springer (2020)","DOI":"10.1007\/978-3-030-58589-1_27"},{"key":"9_CR23","doi-asserted-by":"crossref","unstructured":"Monfort, M., et al.: Spoken moments: Learning joint audio-visual representations from video descriptions. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14871\u201314881 (2021)","DOI":"10.1109\/CVPR46437.2021.01463"},{"key":"9_CR24","doi-asserted-by":"crossref","unstructured":"Rohrbach, M., Amin, S., Andriluka, M., Schiele, B.: A database for fine grained activity detection of cooking activities. In: 2012 IEEE Conference on Computer Vision and Pattern Recognition, pp. 1194\u20131201. IEEE (2012)","DOI":"10.1109\/CVPR.2012.6247801"},{"key":"9_CR25","doi-asserted-by":"crossref","unstructured":"Krishna, R., Hata, K., Ren, F., Fei-Fei, L., Niebles, R.C.: Dense-captioning events in videos. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 706\u2013715 (2017)","DOI":"10.1109\/ICCV.2017.83"},{"key":"9_CR26","doi-asserted-by":"crossref","unstructured":"Wang, X., Wu, J., Chen, J., Li, L., Wang, Y.F., Wang, W.Y.: Vatex: a large-scale, high-quality multilingual dataset for video-and-language research. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4581\u20134591 (2019)","DOI":"10.1109\/ICCV.2019.00468"},{"key":"9_CR27","doi-asserted-by":"crossref","unstructured":"Miech, A., Zhukov, D., Alayrac, J.B., Tapaswi, M., Laptev, I., Sivic, J.: Howto100m: learning a text-video embedding by watching hundred million narrated video clips. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2630\u20132640 (2019)","DOI":"10.1109\/ICCV.2019.00272"},{"key":"9_CR28","doi-asserted-by":"crossref","unstructured":"Iashin, V., Rahtu, E.: Multi-modal dense video captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshops, pp. 958\u2013959 (2020)","DOI":"10.1109\/CVPRW50498.2020.00487"},{"key":"9_CR29","unstructured":"Simonyan, K., Zisserman, A.: Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556 (2014)"},{"key":"9_CR30","doi-asserted-by":"crossref","unstructured":"Pennington, J., Socher, R., Manning, C.D.: Glove: global vectors for word representation. In: Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP), pp. 1532\u20131543 (2014)","DOI":"10.3115\/v1\/D14-1162"},{"key":"9_CR31","doi-asserted-by":"crossref","unstructured":"Tian, Y., Shi, J., Li, B., Duan, Z., Xu, C.: Audio-visual event localization in unconstrained videos. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 247\u2013263 (2018)","DOI":"10.1007\/978-3-030-01216-8_16"},{"key":"9_CR32","doi-asserted-by":"crossref","unstructured":"Cong, Y., Liao, W., Ackermann, H., Rosenhahn, B., Yang, M.Y.: Spatial-temporal transformer for dynamic scene graph generation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 16372\u201316382 (2021)","DOI":"10.1109\/ICCV48922.2021.01606"},{"key":"9_CR33","doi-asserted-by":"crossref","unstructured":"Gong, Y., Chung, Y.A., Glass, J.: AST: audio spectrogram transformer. arXiv preprint arXiv:2104.01778 (2021)","DOI":"10.21437\/Interspeech.2021-698"},{"key":"9_CR34","doi-asserted-by":"crossref","unstructured":"Lin, K., et al.: Swinbert: end-to-end transformers with sparse attention for video captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 17949\u201317958 (2022)","DOI":"10.1109\/CVPR52688.2022.01742"},{"key":"9_CR35","doi-asserted-by":"crossref","unstructured":"Li, J., Su, X., Gao, G.: Teast: temporal knowledge graph embedding via archimedean spiral timeline. In: Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 15460\u201315474 (2023)","DOI":"10.18653\/v1\/2023.acl-long.862"},{"key":"9_CR36","doi-asserted-by":"crossref","unstructured":"Yi, X., Junjie, O., Hui, X., Luoyi, F.: Temporal knowledge graph reasoning with historical contrastive learning. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 37, pp. 4765\u20134773 (2023)","DOI":"10.1609\/aaai.v37i4.25601"},{"key":"9_CR37","doi-asserted-by":"crossref","unstructured":"Li, Y., Sun, S., Zhao, J.: Tirgn: time-guided recurrent graph network with local-global historical patterns for temporal knowledge graph reasoning. In: Proceedings of the Thirty-First International Joint Conference on Artificial Intelligence, IJCAI 2022, Vienna, Austria, 23\u201329 July 2022, pp. 2152\u20132158. ijcai. org (2022)","DOI":"10.24963\/ijcai.2022\/299"},{"key":"9_CR38","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.J.: Bleu: a method for automatic evaluation of machine translation. In: Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics, pp. 311\u2013318 (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"9_CR39","unstructured":"Banerjee, S., Lavie, A.: Meteor: an automatic metric for MT evaluation with improved correlation with human judgments. In: Proceedings of the ACL Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and\/or Summarization, pp. 65\u201372 (2005)"},{"key":"9_CR40","doi-asserted-by":"crossref","unstructured":"Lin, C.Y., Och, F.J.: Automatic evaluation of machine translation quality using longest common subsequence and skip-bigram statistics. In: Proceedings of the 42nd Annual Meeting of the Association for Computational Linguistics (ACL-04), pp. 605\u2013612 (2004)","DOI":"10.3115\/1218955.1219032"},{"key":"9_CR41","doi-asserted-by":"crossref","unstructured":"Hershey, S., et al.: CNN architectures for large-scale audio classification. In: 2017 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 131\u2013135. IEEE (2017)","DOI":"10.1109\/ICASSP.2017.7952132"}],"container-title":["Lecture Notes in Computer Science","Database Systems for Advanced Applications. DASFAA 2024 International Workshops"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-96-0914-7_9","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,22]],"date-time":"2025-01-22T15:08:01Z","timestamp":1737558481000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-96-0914-7_9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9789819609130","9789819609147"],"references-count":41,"URL":"https:\/\/doi.org\/10.1007\/978-981-96-0914-7_9","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"23 January 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"DASFAA","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Database Systems for Advanced Applications","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Gifu","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Japan","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2 July 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"5 July 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"dasfaa2024a","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/www.dasfaa2024.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}