{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,24]],"date-time":"2026-01-24T03:16:16Z","timestamp":1769224576471,"version":"3.49.0"},"publisher-location":"Singapore","reference-count":34,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819555666","type":"print"},{"value":"9789819555673","type":"electronic"}],"license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-981-95-5567-3_3","type":"book-chapter","created":{"date-parts":[[2026,1,22]],"date-time":"2026-01-22T21:13:02Z","timestamp":1769116382000},"page":"33-46","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["SegCL: Segmented Reasoning with\u00a0Global Visual-Audio Knowledge for\u00a0Complex Long Video Understanding"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-3707-7575","authenticated-orcid":false,"given":"Siqi","family":"Yang","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7436-2691","authenticated-orcid":false,"given":"Na","family":"Lu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0001-4237-9771","authenticated-orcid":false,"given":"Benqi","family":"Wang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0009-5360-0900","authenticated-orcid":false,"given":"Yikun","family":"Li","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0003-0059-6270","authenticated-orcid":false,"given":"Runxi","family":"Cui","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2026,1,23]]},"reference":[{"key":"3_CR1","unstructured":"Achiam, J., et\u00a0al.: Gpt-4 technical report. arXiv preprint arXiv:2303.08774 (2023)"},{"key":"3_CR2","unstructured":"Bai, S., et\u00a0al.: Qwen2. 5-vl technical report. arXiv preprint arXiv:2502.13923 (2025)"},{"key":"3_CR3","doi-asserted-by":"crossref","unstructured":"Carreira, J., Zisserman, A.: Quo vadis, action recognition? A new model and the kinetics dataset. In: 2017 IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2017, Honolulu, HI, USA, July 21-26, 2017, pp. 4724\u20134733. IEEE Computer Society (2017)","DOI":"10.1109\/CVPR.2017.502"},{"key":"3_CR4","doi-asserted-by":"publisher","DOI":"10.1007\/s11432-024-4231-5","volume-title":"How far are we to gpt-4v? closing the gap to commercial multimodal models with open-source suites","author":"Z Chen","year":"2024","unstructured":"Chen, Z., et al.: How far are we to gpt-4v? closing the gap to commercial multimodal models with open-source suites. Sci. China Inf, Sci (2024)"},{"key":"3_CR5","doi-asserted-by":"crossref","unstructured":"Deng, J., Guo, J., Xue, N., Zafeiriou, S.: Arcface: additive angular margin loss for deep face recognition. In: IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2019, Long Beach, CA, USA, June 16-20, 2019. Computer Vision Foundation\/IEEE (2019)","DOI":"10.1109\/CVPR.2019.00482"},{"key":"3_CR6","unstructured":"Ester, M., Kriegel, H., Sander, J., Xu, X.: A density-based algorithm for discovering clusters in large spatial databases with noise. AAAI Press (1996)"},{"key":"3_CR7","unstructured":"Grattafiori, A., et\u00a0al.: The llama 3 herd of models. arXiv preprint arXiv:2407.21783 (2024)"},{"key":"3_CR8","doi-asserted-by":"crossref","unstructured":"Han, T., Bain, M., Nagrani, A., Varol, G., Xie, W., Zisserman, A.: Autoad: movie description in context. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2023, Vancouver, BC, Canada, June 17-24, 2023. IEEE (2023)","DOI":"10.1109\/CVPR52729.2023.01815"},{"key":"3_CR9","doi-asserted-by":"crossref","unstructured":"Heilbron, F.C., Escorcia, V., Ghanem, B., Niebles, J.C.: Activitynet: a large-scale video benchmark for human activity understanding. In: IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2015, Boston, MA, USA, June 7-12, 2015, pp. 961\u2013970. IEEE Computer Society (2015)","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"3_CR10","doi-asserted-by":"crossref","unstructured":"Huang, Q., Xiong, Y., Rao, A., Wang, J., Lin, D.: Movienet: A holistic dataset for movie understanding. Springer (2020)","DOI":"10.1007\/978-3-030-58548-8_41"},{"key":"3_CR11","doi-asserted-by":"crossref","unstructured":"Islam, M.M., Ho, N., Yang, X., Nagarajan, T., Torresani, L., Bertasius, G.: Video recap: recursive captioning of hour-long videos. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2024, Seattle, WA, USA, June 16-22, 2024, pp. 18198\u201318208. IEEE (2024)","DOI":"10.1109\/CVPR52733.2024.01723"},{"key":"3_CR12","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.C.H.: BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models. In: Krause, A., Brunskill, E., Cho, K., Engelhardt, B., Sabato, S., Scarlett, J. (eds.) International Conference on Machine Learning, ICML 2023, 23-29 July 2023, Honolulu, Hawaii, USA. PMLR (2023)"},{"key":"3_CR13","unstructured":"Li, K., et al.: Videochat: Chat-centric video understanding. arXiv preprint arXiv:2305.06355 (2023)"},{"issue":"2","key":"3_CR14","doi-asserted-by":"publisher","first-page":"708","DOI":"10.1109\/TPAMI.2024.3479776","volume":"47","author":"J Liu","year":"2025","unstructured":"Liu, J., et al.: VALOR: vision-audio-language omni-perception pretraining model and dataset. IEEE Trans. Pattern Anal. Mach. Intell. 47(2), 708\u2013724 (2025)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"3_CR15","unstructured":"Mahon, L., Lapata, M.: Screenwriter: Automatic screenplay generation and movie summarisation. arXiv preprint arXiv:2410.19809 (2024)"},{"issue":"2","key":"3_CR16","doi-asserted-by":"publisher","first-page":"502","DOI":"10.1109\/TPAMI.2019.2901464","volume":"42","author":"M Monfort","year":"2020","unstructured":"Monfort, M., et al.: Moments in time dataset: one million videos for event understanding. IEEE Trans. Pattern Anal. Mach. Intell. 42(2), 502\u2013508 (2020)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"3_CR17","doi-asserted-by":"crossref","unstructured":"Mun, J., Shin, M., Han, G., Lee, S., Ha, S., Lee, J., Kim, E.: Bassl: Boundary-aware self-supervised learning for video scene segmentation. Springer (2022)","DOI":"10.1007\/978-3-031-26316-3_29"},{"key":"3_CR18","unstructured":"Ouyang, L., et al.: Training language models to follow instructions with human feedback. In: Advances in Neural Information Processing Systems 35: Annual Conference on Neural Information Processing Systems 2022, NeurIPS 2022, New Orleans, LA, USA, November 28 - December 9, 2022 (2022)"},{"key":"3_CR19","unstructured":"Shen, X., et\u00a0al.: Longvu: Spatiotemporal adaptive compression for long video-language understanding. arXiv preprint arXiv:2410.17434 (2024)"},{"key":"3_CR20","doi-asserted-by":"crossref","unstructured":"Song, E., et al.: Moviechat: from dense token to sparse memory for long video understanding. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2024, Seattle, WA, USA, June 16-22, 2024, pp. 18221\u201318232. IEEE (2024)","DOI":"10.1109\/CVPR52733.2024.01725"},{"key":"3_CR21","unstructured":"Team, G., et\u00a0al.: Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context. arXiv preprint arXiv:2403.05530 (2024)"},{"key":"3_CR22","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems 2017, December 4-9, 2017, Long Beach, CA, USA (2017)"},{"key":"3_CR23","doi-asserted-by":"crossref","unstructured":"Wang, H., Zheng, S., Chen, Y., Cheng, L., Chen, Q.: Cam++: A fast and efficient network for speaker verification using context-aware masking. arXiv preprint arXiv:2303.00332 (2023)","DOI":"10.21437\/Interspeech.2023-1513"},{"key":"3_CR24","unstructured":"Wang, P., et al.: Qwen2-vl: enhancing vision-language model\u2019s perception of the world at any resolution (2024). https:\/\/arxiv.org\/abs\/2409.12191"},{"key":"3_CR25","unstructured":"Wang, Y., et\u00a0al.: Internvideo: General video foundation models via generative and discriminative learning. arXiv preprint arXiv:2212.03191 (2022)"},{"key":"3_CR26","unstructured":"Wu, H., Li, D., Chen, B., Li, J.: Longvideobench: a benchmark for long-context interleaved video-language understanding. In: Advances in Neural Information Processing Systems 38: Annual Conference on Neural Information Processing Systems 2024, NeurIPS 2024, Vancouver, BC, Canada, December 10\u201315, 2024 (2024)"},{"key":"3_CR27","unstructured":"Xu, J., et\u00a0al.: Qwen2. 5-omni technical report. arXiv preprint arXiv:2503.20215 (2025)"},{"key":"3_CR28","unstructured":"Yang, A., Miech, A., Sivic, J., Laptev, I., Schmid, C.: Zero-shot video question answering via frozen bidirectional language models. In: Advances in Neural Information Processing Systems 35: Annual Conference on Neural Information Processing Systems 2022, NeurIPS 2022, New Orleans, LA, USA, November 28 - December 9, 2022 (2022)"},{"key":"3_CR29","unstructured":"Ye, Q., et\u00a0al.: mplug-owl: Modularization empowers large language models with multimodality. arXiv preprint arXiv:2304.14178 (2023)"},{"key":"3_CR30","doi-asserted-by":"crossref","unstructured":"You, Z., et al.: Towards long video understanding via fine-detailed video story generation. IEEE Transactions on Circuits and Systems for Video Technology (2024)","DOI":"10.1109\/TCSVT.2024.3514820"},{"key":"3_CR31","doi-asserted-by":"crossref","unstructured":"Zhai, X., Mustafa, B., Kolesnikov, A., Beyer, L.: Sigmoid loss for language image pre-training. In: IEEE\/CVF International Conference on Computer Vision, ICCV 2023, Paris, France, October 1-6, 2023, pp. 11941\u201311952. IEEE (2023)","DOI":"10.1109\/ICCV51070.2023.01100"},{"key":"3_CR32","doi-asserted-by":"crossref","unstructured":"Zhang, C., et al.: A simple LLM framework for long-range video question-answering. Association for Computational Linguistics (2024)","DOI":"10.18653\/v1\/2024.emnlp-main.1209"},{"key":"3_CR33","doi-asserted-by":"crossref","unstructured":"Zhang, C., et al.: Mm-narrator: narrating long-form videos with multimodal in-context learning. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2024, Seattle, WA, USA, June 16\u201322, 2024. IEEE (2024)","DOI":"10.1109\/CVPR52733.2024.01295"},{"key":"3_CR34","doi-asserted-by":"crossref","unstructured":"Zhou, L., Xu, C., Corso, J.J.: Towards automatic learning of procedures from web instructional videos. AAAI Press (2018)","DOI":"10.1609\/aaai.v32i1.12342"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition and Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-95-5567-3_3","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,22]],"date-time":"2026-01-22T21:13:10Z","timestamp":1769116390000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-95-5567-3_3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"ISBN":["9789819555666","9789819555673"],"references-count":34,"URL":"https:\/\/doi.org\/10.1007\/978-981-95-5567-3_3","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]},"assertion":[{"value":"23 January 2026","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"PRCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Chinese Conference on Pattern Recognition and Computer Vision  (PRCV)","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Shanghai","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15 October 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18 October 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ccprcv2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/2025.prcv.cn\/index.asp","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}