{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,26]],"date-time":"2025-03-26T05:59:41Z","timestamp":1742968781853,"version":"3.40.3"},"publisher-location":"Cham","reference-count":23,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031843556"},{"type":"electronic","value":"9783031843563"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-84356-3_21","type":"book-chapter","created":{"date-parts":[[2025,2,25]],"date-time":"2025-02-25T13:47:21Z","timestamp":1740491241000},"page":"258-269","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Combining Audio and\u00a0Image Sequence for\u00a0Video Moment Retrieval by\u00a0Natural Language"],"prefix":"10.1007","author":[{"given":"Lu\u00eds G.","family":"de Souza","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3635-7477","authenticated-orcid":false,"given":"S\u00edlvio R. R.","family":"Sanches","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9421-9254","authenticated-orcid":false,"given":"Pedro H.","family":"Bugatti","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4870-4766","authenticated-orcid":false,"given":"Priscila T. M.","family":"Saito","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,2,17]]},"reference":[{"key":"21_CR1","doi-asserted-by":"crossref","unstructured":"Alayrac, J.B., Bojanowski, P., Agrawal, N., Sivic, J., Laptev, I., Lacoste-Julien, S.: Unsupervised learning from narrated instruction videos. In: IEEE Conference on Computer Vision and Pattern Recognition, pp. 4575\u20134583 (2016)","DOI":"10.1109\/CVPR.2016.495"},{"key":"21_CR2","doi-asserted-by":"crossref","unstructured":"Anne\u00a0Hendricks, L., Wang, O., Shechtman, E., Sivic, J., Darrell, T., Russell, B.: Localizing moments in video with natural language. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 5803\u20135812 (2017)","DOI":"10.1109\/ICCV.2017.618"},{"key":"21_CR3","doi-asserted-by":"crossref","unstructured":"Bojanowski, P., et al.: Weakly-supervised alignment of video with text. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 4462\u20134470 (2015)","DOI":"10.1109\/ICCV.2015.507"},{"key":"21_CR4","doi-asserted-by":"crossref","unstructured":"Chen, J., Chen, X., Ma, L., Jie, Z., Chua, T.S.: Temporally grounding natural sentence in video. In: Conference on Empirical Methods in Natural Language Processing, pp. 162\u2013171. Association for Computational Linguistics (2018)","DOI":"10.18653\/v1\/D18-1015"},{"key":"21_CR5","doi-asserted-by":"crossref","unstructured":"Hauptmann, A.G., Jin, R., Ng, T.D.: Multi-modal information retrieval from broadcast video using OCR and speech recognition. In: 2nd ACMIEEE-CS Joint Conference on Digital Libraries, pp. 160\u2013161 (2002)","DOI":"10.1145\/544220.544252"},{"issue":"5","key":"21_CR6","doi-asserted-by":"publisher","first-page":"14","DOI":"10.1109\/MSP.2008.926652","volume":"25","author":"X He","year":"2008","unstructured":"He, X., Deng, L., Chou, W.: Discriminative learning in sequential pattern recognition. IEEE Signal Process. Mag. 25(5), 14\u201336 (2008)","journal-title":"IEEE Signal Process. Mag."},{"key":"21_CR7","doi-asserted-by":"crossref","unstructured":"Lin, D., Fidler, S., Kong, C., Urtasun, R.: Visual semantic search: Retrieving videos via complex textual queries. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2657\u20132664 (2014)","DOI":"10.1109\/CVPR.2014.340"},{"key":"21_CR8","doi-asserted-by":"crossref","unstructured":"Lin, Z., Zhao, Z., Zhang, Z., Wang, Q., Liu, H.: Weakly-supervised video moment retrieval via semantic completion network. arXiv preprint arXiv:1911.08199 (2020)","DOI":"10.1609\/aaai.v34i07.6820"},{"key":"21_CR9","unstructured":"Liu, Y., Albanie, S., Nagrani, A., Zisserman, A.: Use what you have: video retrieval using representations from collaborative experts. In: British Machine Vision Conference (BMVC) (2019)"},{"key":"21_CR10","unstructured":"Miech, A., Laptev, I., Sivic, J.: Learning a text-video embedding from incomplete and heterogeneous data. arXiv preprint arXiv:1804.02516 (2018)"},{"key":"21_CR11","doi-asserted-by":"crossref","unstructured":"Mithun, N.C., Paul, S., Roy-Chowdhury, A.K.: Weakly supervised video moment retrieval from text queries. In: IEEE Conference on Computing Vision and Pattern Recognition, pp. 11592\u201311601 (2019)","DOI":"10.1109\/CVPR.2019.01186"},{"key":"21_CR12","doi-asserted-by":"crossref","unstructured":"Otani, M., Nakashima, Y., Rahtu, E., Heikkil\u00e4, J., Yokoya, N.: Learning joint representations of videos and sentences with web image search. In: European Conference on Computer Vision (ECCV), pp. 651\u2013667 (2016)","DOI":"10.1007\/978-3-319-46604-0_46"},{"key":"21_CR13","unstructured":"Parker, R., Graff, D., Kong, J., Chen, K., Maeda, K.: English gigaword fifth edition, linguistic data consortium (2011), lDC Catalog No. LDC2011T07"},{"key":"21_CR14","doi-asserted-by":"crossref","unstructured":"Pennington, J., Socher, R., Manning, C.: GloVe: Global vectors for word representation. In: Conference on Empirical Methods in Natural Language Processing (EMNLP), pp. 1532\u20131543. Association for Computational Linguistics (2014)","DOI":"10.3115\/v1\/D14-1162"},{"key":"21_CR15","doi-asserted-by":"crossref","unstructured":"Radha, N.: Video retrieval using speech and text in video. In: 2016 International Conference on Inventive Computation Technologies (ICICT). vol.\u00a02, pp.\u00a01\u20136 (2016)","DOI":"10.1109\/INVENTIVE.2016.7824801"},{"key":"21_CR16","doi-asserted-by":"crossref","unstructured":"Sener, O., Zamir, A.R., Savarese, S., Saxena, A.: Unsupervised semantic parsing of video collections. In: IEEE International Conference on Computing Vision, pp. 4480\u20134488 (2015)","DOI":"10.1109\/ICCV.2015.509"},{"key":"21_CR17","doi-asserted-by":"crossref","unstructured":"Takahashi, N., Gygli, M., Van\u00a0Gool, L.: Aenet: learning deep audio features for video analysis. arXiv:1701.00599v2 (2017)","DOI":"10.1109\/TMM.2017.2751969"},{"key":"21_CR18","doi-asserted-by":"crossref","unstructured":"Tan, R., Xu, H., Saenko, K., Plummer, B.A.: Logan: latent graph co-attention network for weakly-supervised video moment retrieval. arXiv preprint arXiv:1909.13784 (2020)","DOI":"10.1109\/WACV48630.2021.00213"},{"key":"21_CR19","doi-asserted-by":"crossref","unstructured":"Tellex, S., Roy, D.: Towards surveillance video search by natural language query. In: ACM International Conference on Image and Video Retrieval, pp.\u00a01\u20138 (2009)","DOI":"10.1145\/1646396.1646442"},{"key":"21_CR20","unstructured":"Torabi, A., Tandon, N., Sigal, L.: Learning language-visual embedding for movie understanding with natural-language. arXiv preprint arXiv:1609.08124 (2016)"},{"key":"21_CR21","doi-asserted-by":"crossref","unstructured":"Xu, H., Li, B., Ramanishka, V., Sigal, L., Saenko, K.: Joint event detection and description in continuous video streams. In: 2019 IEEE Winter Applications of Computer Vision Workshops (WACVW), pp. 25\u201326 (2019)","DOI":"10.1109\/WACVW.2019.00011"},{"key":"21_CR22","doi-asserted-by":"crossref","unstructured":"Xu, R., Xiong, C., Chen, W., Corso, J.J.: Jointly modeling deep video and compositional text to bridge vision and language in a unified framework. In: Twenty-Ninth AAAI Conference on Artificial Intelligence (2015)","DOI":"10.1609\/aaai.v29i1.9512"},{"key":"21_CR23","doi-asserted-by":"crossref","unstructured":"Yamamoto, N., Ogata, J., Ariki, Y.: Topic segmentation and retrieval system for lecture videos based on spontaneous speech recognition. In: Eighth European Conference on Speech Communication and Technology (2003)","DOI":"10.21437\/Eurospeech.2003-333"}],"container-title":["Lecture Notes in Computer Science","Artificial Intelligence and Soft Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-84356-3_21","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,2,25]],"date-time":"2025-02-25T13:47:29Z","timestamp":1740491249000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-84356-3_21"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9783031843556","9783031843563"],"references-count":23,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-84356-3_21","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"17 February 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICAISC","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Artificial Intelligence and Soft Computing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Zakopane","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Poland","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"16 June 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"20 June 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icaisc2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/icaisc.eu\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}