{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,3]],"date-time":"2026-06-03T14:08:21Z","timestamp":1780495701004,"version":"3.54.1"},"reference-count":54,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Pattern Recognition"],"published-print":{"date-parts":[[2026,11]]},"DOI":"10.1016\/j.patcog.2026.113526","type":"journal-article","created":{"date-parts":[[2026,3,22]],"date-time":"2026-03-22T23:01:17Z","timestamp":1774220477000},"page":"113526","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"PA","title":["Synergistic audio-textual cues: A cross-modal framework for weakly-supervised temporal action localization"],"prefix":"10.1016","volume":"179","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3748-5980","authenticated-orcid":false,"given":"Linkai","family":"Liu","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7041-559X","authenticated-orcid":false,"given":"Yuchen","family":"Zhou","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-6917-0478","authenticated-orcid":false,"given":"Zipeng","family":"Guo","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-2611-5686","authenticated-orcid":false,"given":"Lei","family":"Yu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4128-886X","authenticated-orcid":false,"given":"Chao","family":"Gou","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"78","reference":[{"key":"10.1016\/j.patcog.2026.113526_bib0001","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"7252","article-title":"DeepLocalization: using change point detection for temporal action localization","author":"Rahman","year":"2024"},{"key":"10.1016\/j.patcog.2026.113526_bib0002","doi-asserted-by":"crossref","first-page":"7728","DOI":"10.1109\/TPAMI.2024.3395778","article-title":"DeTAL: open-vocabulary temporal action localization with decoupled networks","volume":"46","author":"Li","year":"2024","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.patcog.2026.113526_bib0003","doi-asserted-by":"crossref","first-page":"9425","DOI":"10.1109\/TMM.2023.3252176","article-title":"Exploring action centers for temporal action localization","volume":"25","author":"Xia","year":"2023","journal-title":"IEEE Trans. Multimed."},{"key":"10.1016\/j.patcog.2026.113526_bib0004","doi-asserted-by":"crossref","first-page":"8476","DOI":"10.1109\/TMM.2024.3379887","article-title":"Integration of global and local knowledge for foreground enhancing in weakly supervised temporal action localization","volume":"Vol. 26","author":"Zhang","year":"2024","journal-title":"IEEE Trans. Multimed."},{"key":"10.1016\/j.patcog.2026.113526_bib0005","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"6908","article-title":"Weakly-supervised temporal action localization by inferring salient snippet-feature","volume":"Vol. 38","author":"Yun","year":"2024"},{"key":"10.1016\/j.patcog.2026.113526_bib0006","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"2704","article-title":"Weakly-supervised temporal action localization with multi-modal plateau transformers","author":"Hu","year":"2024"},{"key":"10.1016\/j.patcog.2026.113526_bib0007","series-title":"2017\u202fIEEE International Conference on Computer Vision (ICCV)","article-title":"Temporal context network for activity localization in videos","author":"Dai","year":"2017"},{"key":"10.1016\/j.patcog.2026.113526_bib0008","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"1513","article-title":"Actionness inconsistency-guided contrastive learning for weakly-supervised temporal action localization","volume":"Vol. 37","author":"Li","year":"2023"},{"key":"10.1016\/j.patcog.2026.113526_bib0009","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"14751","article-title":"Distilling vision-language pre-training to collaborate with weakly-supervised temporal action localization","author":"Ju","year":"2023"},{"key":"10.1016\/j.patcog.2026.113526_bib0010","series-title":"UntrimmedNets for Weakly Supervised Action Recognition and Detection","author":"Wang","year":"2017"},{"key":"10.1016\/j.patcog.2026.113526_bib0011","series-title":"Step-by-Step Erasion, One-by-One Collection: A Weakly Supervised Temporal Action Detector","author":"Zhong","year":"2018"},{"key":"10.1016\/j.patcog.2026.113526_bib0012","series-title":"Proceedings of the 29th ACM International Conference on Multimedia","article-title":"Cross-modal consensus network for weakly supervised temporal action localization","author":"Hong","year":"2021"},{"key":"10.1016\/j.patcog.2026.113526_bib0013","series-title":"International Conference on Learning Representations","article-title":"Cross-attentional audio-visual fusion for weakly-supervised action localization","author":"Lee","year":"2020"},{"key":"10.1016\/j.patcog.2026.113526_bib0014","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"10648","article-title":"Boosting weakly-supervised temporal action localization with text information","author":"Li","year":"2023"},{"key":"10.1016\/j.patcog.2026.113526_bib0015","series-title":"Proceedings of the European Conference on Computer Vision (ECCV)","first-page":"247","article-title":"Audio-visual event localization in unconstrained videos","author":"Tian","year":"2018"},{"key":"10.1016\/j.patcog.2026.113526_bib0016","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"8436","article-title":"Positive sample propagation along the audio-visual event line","author":"Zhou","year":"2021"},{"issue":"6","key":"10.1016\/j.patcog.2026.113526_bib0017","doi-asserted-by":"crossref","first-page":"7239","DOI":"10.1109\/TPAMI.2022.3223688","article-title":"Contrastive positive sample propagation along the audio-visual event line","volume":"45","author":"Zhou","year":"2022","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.patcog.2026.113526_bib0018","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"10905","article-title":"Dense audio-visual event localization under cross-modal consistency and multi-temporal granularity collaboration","volume":"39","author":"Zhou","year":"2025"},{"key":"10.1016\/j.patcog.2026.113526_bib0019","series-title":"Proceedings of the Computer Vision and Pattern Recognition Conference","first-page":"8362","article-title":"Towards open-vocabulary audio-visual event localization","author":"Zhou","year":"2025"},{"issue":"11","key":"10.1016\/j.patcog.2026.113526_bib0020","doi-asserted-by":"crossref","first-page":"5308","DOI":"10.1007\/s11263-024-02142-3","article-title":"Advancing weakly-supervised audio-visual video parsing via segment-wise pseudo labeling","volume":"132","author":"Zhou","year":"2024","journal-title":"Int. J. Comput. Vis."},{"key":"10.1016\/j.patcog.2026.113526_bib0021","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"10448","article-title":"Multimodal class-aware semantic enhancement network for audio-visual video parsing","volume":"39","author":"Zhao","year":"2025"},{"key":"10.1016\/j.patcog.2026.113526_bib0022","series-title":"European Conference on Computer Vision","first-page":"192","article-title":"Dual-evidential learning for weakly-supervised temporal action localization","author":"Chen","year":"2022"},{"key":"10.1016\/j.patcog.2026.113526_bib0023","first-page":"6000","article-title":"Attention is all you need","volume":"30","author":"Vaswani","year":"2017","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.patcog.2026.113526_bib0024","series-title":"Computer Vision and Pattern Recognition","article-title":"W-TALC: weakly-supervised temporal activity localization and classification","author":"Shrivastava","year":"2018"},{"key":"10.1016\/j.patcog.2026.113526_bib0025","first-page":"1854","article-title":"Weakly-supervised temporal action localization by uncertainty modeling","author":"Lee","year":"2022","journal-title":"Proceedings of the AAAI Conference on Artificial Intelligence"},{"key":"10.1016\/j.patcog.2026.113526_bib0026","series-title":"AutoLoc: weakly-supervised temporal action localization in untrimmed videos","first-page":"162","author":"Shou","year":"2018"},{"key":"10.1016\/j.patcog.2026.113526_bib0027","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1016\/j.cviu.2016.10.018","article-title":"The THUMOS challenge on action recognition for videos \u201cin the wild\u201d","author":"Idrees","year":"2017","journal-title":"Comput. Vision Image Understanding"},{"key":"10.1016\/j.patcog.2026.113526_bib0028","series-title":"2015\u202fIEEE Conference on Computer Vision and Pattern Recognition (CVPR)","article-title":"ActivityNet: a large-scale video benchmark for human activity understanding","author":"Heilbron","year":"2015"},{"key":"10.1016\/j.patcog.2026.113526_bib0029","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"23003","article-title":"Improving weakly supervised temporal action localization by bridging train-test gap in pseudo labels","author":"Zhou","year":"2023"},{"key":"10.1016\/j.patcog.2026.113526_bib0030","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"2394","article-title":"Proposal-based multiple instance learning for weakly-supervised temporal action localization","author":"Ren","year":"2023"},{"key":"10.1016\/j.patcog.2026.113526_bib0031","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"3889","article-title":"BMN: boundary-matching network for temporal action proposal generation","author":"Lin","year":"2019"},{"key":"10.1016\/j.patcog.2026.113526_bib0032","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"10156","article-title":"G-tad: sub-graph localization for temporal action detection","author":"Xu","year":"2020"},{"key":"10.1016\/j.patcog.2026.113526_bib0033","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2021.107831","article-title":"Weakly-supervised action localization via embedding-modeling iterative optimization","volume":"113","author":"Zhang","year":"2021","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.113526_bib0034","series-title":"Proceedings of the 30th ACM International Conference on Multimedia","first-page":"3820","article-title":"Dynamic graph modeling for weakly-supervised temporal action localization","author":"Shi","year":"2022"},{"key":"10.1016\/j.patcog.2026.113526_bib0035","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"19999","article-title":"Fine-grained temporal contrastive learning for weakly-supervised temporal action localization","author":"Gao","year":"2022"},{"key":"10.1016\/j.patcog.2026.113526_bib0036","doi-asserted-by":"crossref","first-page":"270","DOI":"10.1109\/TMM.2023.3263965","article-title":"Feature weakening, contextualization, and discrimination for weakly supervised temporal action localization","volume":"26","author":"Moniruzzaman","year":"2023","journal-title":"IEEE Trans. Multimed."},{"key":"10.1016\/j.patcog.2026.113526_bib0037","doi-asserted-by":"crossref","first-page":"6717","DOI":"10.1109\/TMM.2024.3355628","article-title":"Snippet-to-prototype contrastive consensus network for weakly supervised temporal action localization","volume":"26","author":"Shao","year":"2024","journal-title":"IEEE Trans. Multimed."},{"key":"10.1016\/j.patcog.2026.113526_bib0038","series-title":"2017\u202fIEEE Conference on Computer Vision and Pattern Recognition (CVPR)","article-title":"Quo vadis, action recognition? A new model and the kinetics dataset","author":"Carreira","year":"2017"},{"key":"10.1016\/j.patcog.2026.113526_bib0039","series-title":"The Kinetics Human Action Video Dataset","author":"Zisserman","year":"2017"},{"key":"10.1016\/j.patcog.2026.113526_bib0040","series-title":"Learning Transferable Visual Models From Natural Language Supervision","author":"Radford","year":"2021"},{"key":"10.1016\/j.patcog.2026.113526_bib0041","doi-asserted-by":"crossref","unstructured":"Y. Gong, Y.-A. Chung, J. Glass, Ast: Audio spectrogram transformer, (2021). arXiv preprint arXiv: 2104.01778.","DOI":"10.21437\/Interspeech.2021-698"},{"key":"10.1016\/j.patcog.2026.113526_bib0042","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2020.107686","article-title":"Deep snippet selective network for weakly supervised temporal action localization","volume":"110","author":"Ge","year":"2021","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.113526_bib0043","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2023.109426","article-title":"Complementary adversarial mechanisms for weakly-supervised temporal action localization","volume":"139","author":"Wang","year":"2023","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.113526_bib0044","series-title":"Proceedings of the 15th Asian Conference on Machine Learning","first-page":"470","article-title":"Temporal RPN learning for weakly-supervised temporal action localization","volume":"222","author":"Huang","year":"2024"},{"key":"10.1016\/j.patcog.2026.113526_bib0045","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops","first-page":"2704","article-title":"Weakly-supervised temporal action localization with multi-modal plateau transformers","author":"Hu","year":"2024"},{"key":"10.1016\/j.patcog.2026.113526_bib0046","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"19914","article-title":"Exploring denoised cross-video contrast for weakly-supervised temporal action localization","author":"Li","year":"2022"},{"key":"10.1016\/j.patcog.2026.113526_bib0047","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"11320","article-title":"Background suppression network for weakly-supervised temporal action localization","volume":"Vol. 34","author":"Lee","year":"2020"},{"key":"10.1016\/j.patcog.2026.113526_bib0048","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"2233","article-title":"ACSNet: action-context separation network for weakly supervised temporal action localization","volume":"Vol. 35","author":"Liu","year":"2021"},{"key":"10.1016\/j.patcog.2026.113526_bib0049","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"53","article-title":"Uncertainty guided collaborative training for weakly supervised temporal action detection","author":"Yang","year":"2021"},{"key":"10.1016\/j.patcog.2026.113526_bib0050","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"9969","article-title":"Action unit memory network for weakly supervised temporal action localization","author":"Luo","year":"2021"},{"key":"10.1016\/j.patcog.2026.113526_bib0051","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"8002","article-title":"Foreground-action consistency network for weakly supervised temporal action localization","author":"Huang","year":"2021"},{"key":"10.1016\/j.patcog.2026.113526_bib0052","doi-asserted-by":"crossref","first-page":"53","DOI":"10.1016\/0377-0427(87)90125-7","article-title":"Silhouettes: a graphical aid to the interpretation and validation of cluster analysis","volume":"20","author":"Rousseeuw","year":"1987","journal-title":"J. Comput. Appl. Math."},{"issue":"1","key":"10.1016\/j.patcog.2026.113526_bib0053","first-page":"1","article-title":"A dendrite method for cluster analysis","volume":"3","author":"Cali\u0144ski","year":"1974","journal-title":"Commun. Statistics-theory Methods"},{"issue":"2","key":"10.1016\/j.patcog.2026.113526_bib0054","doi-asserted-by":"crossref","first-page":"224","DOI":"10.1109\/TPAMI.1979.4766909","article-title":"A cluster separation measure","author":"Davies","year":"1979","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."}],"container-title":["Pattern Recognition"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0031320326004929?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0031320326004929?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,6,3]],"date-time":"2026-06-03T13:07:48Z","timestamp":1780492068000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0031320326004929"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,11]]},"references-count":54,"alternative-id":["S0031320326004929"],"URL":"https:\/\/doi.org\/10.1016\/j.patcog.2026.113526","relation":{},"ISSN":["0031-3203"],"issn-type":[{"value":"0031-3203","type":"print"}],"subject":[],"published":{"date-parts":[[2026,11]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Synergistic audio-textual cues: A cross-modal framework for weakly-supervised temporal action localization","name":"articletitle","label":"Article Title"},{"value":"Pattern Recognition","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.patcog.2026.113526","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier Ltd. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"113526"}}