{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,17]],"date-time":"2025-11-17T01:11:03Z","timestamp":1763341863773,"version":"3.45.0"},"reference-count":55,"publisher":"Tech Science Press","issue":"2","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["CMC"],"published-print":{"date-parts":[[2025]]},"DOI":"10.32604\/cmc.2025.061037","type":"journal-article","created":{"date-parts":[[2025,3,18]],"date-time":"2025-03-18T04:12:18Z","timestamp":1742271138000},"page":"3277-3301","source":"Crossref","is-referenced-by-count":0,"title":["Event-Driven Attention Network: A Cross-Modal Framework for Efficient Image-Text Retrieval in Mass Gathering Events"],"prefix":"10.32604","volume":"83","author":[{"given":"Kamil","family":"Yasen","sequence":"first","affiliation":[]},{"given":"Heyan","family":"Jin","sequence":"additional","affiliation":[]},{"given":"Sijie","family":"Yang","sequence":"additional","affiliation":[]},{"given":"Li","family":"Zhan","sequence":"additional","affiliation":[]},{"given":"Xuyang","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Ke","family":"Qin","sequence":"additional","affiliation":[]},{"given":"Ye","family":"Li","sequence":"additional","affiliation":[]}],"member":"17807","published-online":{"date-parts":[[2025]]},"reference":[{"key":"ref1","doi-asserted-by":"crossref","first-page":"57","DOI":"10.1201\/9781003053262-3","author":"Varghese","year":"2023","journal-title":"Intelligent image and video analytics"},{"key":"ref2","doi-asserted-by":"crossref","first-page":"1992","DOI":"10.1109\/TIP.2017.2670780","article-title":"Deep-cascade: cascading 3D deep neural networks for fast anomaly detection and localization in crowded scenes","volume":"26","author":"Sabokrou","year":"2017","journal-title":"IEEE Trans Image Process"},{"key":"ref3","first-page":"358","article-title":"Spatial-temporal convolutional neural networks for anomaly detection and 
localization in crowded scenes","volume":"47","author":"Zhou","year":"2016","journal-title":"Signal Process: Image Commun"},{"key":"ref4","series-title":"Intelligent Information and Database Systems: 11th Asian Conference, ACIIDS 2019","first-page":"613","article-title":"Violent crowd flow detection using deep learning","author":"Sumon","year":"2019 Apr 8\u201311"},{"key":"ref5","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"589","article-title":"Single-image crowd counting via multi-column convolutional neural network","author":"Zhang","year":"2016"},{"key":"ref6","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"5094","article-title":"Context-aware crowd counting","author":"Liu","year":"2019"},{"key":"ref7","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"1091","article-title":"CSRNet: dilated convolutional neural networks for understanding the highly congested scenes","author":"Li","year":"2018"},{"key":"ref8","first-page":"2576","article-title":"To choose or to fuse? 
Scale selection for crowd counting","volume":"35","author":"Song","year":"2021","journal-title":"Proc AAAI Conf Artif Intell"},{"key":"ref9","doi-asserted-by":"crossref","first-page":"1070","DOI":"10.1109\/TPAMI.2019.2944377","article-title":"Video anomaly detection with sparse coding inspired deep neural networks","volume":"43","author":"Luo","year":"2021","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"ref10","doi-asserted-by":"crossref","first-page":"107969","DOI":"10.1016\/j.patcog.2021.107969","article-title":"NM-GAN: noise-modulated generative adversarial network for video anomaly detection","volume":"116","author":"Chen","year":"2021","journal-title":"Pattern Recognit"},{"key":"ref11","series-title":"Proceedings of the IEEE Winter Conference on Applications of Computer Vision (WACV)","first-page":"1896","article-title":"Training adversarial discriminators for cross-channel abnormal event detection in crowds","author":"Ravanbakhsh","year":"2019"},{"key":"ref12","series-title":"Proceedings of the IEEE International Conference on Image Processing","first-page":"3464","article-title":"Simple online and realtime tracking","author":"Bewley","year":"2016"},{"key":"ref13","series-title":"Proceedings of the IEEE International Conference on Image Processing","first-page":"3645","article-title":"Simple online and realtime tracking with a deep association metric","author":"Wojke","year":"2017"},{"key":"ref14","series-title":"Proceedings of the European Conference on Computer Vision","first-page":"107","article-title":"Towards real-time multi-object tracking","author":"Wang","year":"2020"},{"key":"ref15","doi-asserted-by":"crossref","first-page":"12588","DOI":"10.1109\/JIOT.2021.3077449","article-title":"Deep-learning-enhanced multitarget detection for end-edge\u2013cloud surveillance in smart IoT","volume":"8","author":"Zhou","year":"2021","journal-title":"IEEE Internet Things J"},{"key":"ref16","first-page":"1","article-title":"The BEHAVE video dataset: 
ground truthed video for multi-person behavior classification","volume":"4","author":"Blunsden","year":"2010","journal-title":"Ann BMVA"},{"key":"ref17","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"6479","article-title":"Real-world anomaly detection in surveillance videos","author":"Sultani","year":"2018"},{"key":"ref18","doi-asserted-by":"crossref","first-page":"8651","DOI":"10.1109\/TMM.2024.3381040","article-title":"Crowd descriptors and interpretable gathering understanding","volume":"26","author":"Zhou","year":"2024","journal-title":"IEEE Trans Multimed"},{"key":"ref19","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"11583","article-title":"Learning cross-modal contrastive features for zero-shot action recognition and localization","author":"Meng","year":"2022"},{"key":"ref20","series-title":"British Machine Vision Conference (BMVC)","article-title":"VSE++: improving visual-semantic embeddings with hard negatives","author":"Faghri","year":"2017"},{"key":"ref21","series-title":"2015 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"3128","article-title":"Deep visual-semantic alignments for generating image descriptions","volume":"2015","author":"Karpathy","year":"2015 Jun 7\u201312"},{"key":"ref22","doi-asserted-by":"crossref","first-page":"212","DOI":"10.1007\/978-3-030-01225-0_13","author":"Lee","year":"2018","journal-title":"Computer vision-ECCV 2018"},{"key":"ref23","doi-asserted-by":"crossref","first-page":"3622","DOI":"10.1109\/TIP.2023.3286710","article-title":"Efficient token-guided image-text retrieval with consistent multimodal contrastive training","volume":"32","author":"Liu","year":"2023","journal-title":"IEEE Trans Image Process"},{"key":"ref24","doi-asserted-by":"crossref","first-page":"948","DOI":"10.1109\/TCYB.2022.3179020","article-title":"Learning relationship-enhanced 
semantic graph for fine-grained image-text matching","volume":"54","author":"Liu","year":"2024","journal-title":"IEEE Trans Cybern"},{"key":"ref25","doi-asserted-by":"crossref","first-page":"548","DOI":"10.1016\/j.neucom.2016.09.063","article-title":"Learning deep event models for crowd anomaly detection","volume":"219","author":"Feng","year":"2017","journal-title":"Neurocomputing"},{"key":"ref26","doi-asserted-by":"crossref","first-page":"5017","DOI":"10.1109\/TIP.2015.2475625","article-title":"PCANet: a simple deep learning baseline for image classification?","volume":"24","author":"Chan","year":"2015","journal-title":"IEEE Trans Image Process"},{"key":"ref27","doi-asserted-by":"crossref","first-page":"15695","DOI":"10.1007\/s10489-022-04233-5","article-title":"An efficient deep neural model for detecting crowd anomalies in videos","volume":"53","author":"Yang","year":"2023","journal-title":"Appl Intell"},{"key":"ref28","first-page":"1","article-title":"Where are they going? Predicting human behaviors in crowded scenes","volume":"17","author":"Zhang","year":"2021","journal-title":"ACM Trans Multimedia Comput Commun Appl"},{"key":"ref29","doi-asserted-by":"crossref","first-page":"e11038","DOI":"10.1016\/j.heliyon.2022.e11038","article-title":"A new approach for social group detection based on spatio-temporal interpersonal distance measurement","volume":"8","author":"Su","year":"2022","journal-title":"Heliyon"},{"key":"ref30","doi-asserted-by":"crossref","first-page":"4077","DOI":"10.1007\/s12652-021-03323-5","article-title":"Generative adversarial network based abnormal behavior detection in massive crowd videos: a Hajj case study","volume":"13","author":"Alafif","year":"2022","journal-title":"J Ambient Intell Humaniz Comput"},{"journal-title":"Advances in neural information processing systems","article-title":"DeViSE: a deep visual-semantic embedding model","author":"Frome","key":"ref31"},{"key":"ref32","series-title":"2018 IEEE\/CVF Conference on Computer Vision 
and Pattern Recognition","first-page":"6077","article-title":"Bottom-up and top-down attention for image captioning and visual question answering","author":"Anderson","year":"2018 Jun 18\u201323"},{"key":"ref33","doi-asserted-by":"crossref","first-page":"12363","DOI":"10.1007\/s11042-023-15798-9","article-title":"SAM: cross-modal semantic alignments module for image-text retrieval","volume":"83","author":"Park","year":"2023","journal-title":"Multimed Tools Appl"},{"key":"ref34","doi-asserted-by":"crossref","first-page":"74","DOI":"10.1007\/s11263-016-0965-7","article-title":"Flickr30k entities: collecting region-to-phrase correspondences for richer image-to-sentence models","volume":"123","author":"Plummer","year":"2017","journal-title":"Int J Comput Vis"},{"key":"ref35","doi-asserted-by":"crossref","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","author":"Lin","year":"2014","journal-title":"Computer vision-ECCV 2014"},{"key":"ref36","series-title":"Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)","first-page":"2556","article-title":"Conceptual captions: a cleaned, hypernymed, image alt-text dataset for automatic image captioning","author":"Sharma","year":"2018"},{"key":"ref37","doi-asserted-by":"crossref","first-page":"1137","DOI":"10.1109\/TPAMI.2016.2577031","article-title":"Faster R-CNN: towards real-time object detection with region proposal networks","volume":"39","author":"Ren","year":"2017","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"ref38","doi-asserted-by":"crossref","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","article-title":"Visual genome: connecting language and vision using crowdsourced dense image annotations","volume":"123","author":"Krishna","year":"2017","journal-title":"Int J Comput Vis"},{"key":"ref39","series-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language 
Technologies, Volume 1 (Long and Short Papers)","first-page":"4171","article-title":"BERT: pre-training of deep bidirectional transformers for language understanding","author":"Devlin"},{"key":"ref40","unstructured":"Mikolov T, Chen K, Corrado G, Dean J. Efficient estimation of word representations in vector space. arXiv:1301.3781. 2013."},{"key":"ref41","unstructured":"Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez AN, et al. Attention is all you need. arXiv:1706.03762. 2017."},{"key":"ref42","doi-asserted-by":"crossref","first-page":"10","DOI":"10.1016\/j.neucom.2020.12.018","article-title":"Triplet online instance matching loss for person re-identification","volume":"433","author":"Li","year":"2021","journal-title":"Neurocomputing"},{"key":"ref43","series-title":"International Conference on Machine Learning","article-title":"BLIP: bootstrapping language-image pre-training for unified vision-language understanding and generation","author":"Li","year":"2022"},{"key":"ref44","unstructured":"Brown T, Mann B, Ryder N, Subbiah M, Kaplan J, Dhariwal P, et al. Language models are few-shot learners. arXiv:2005.14165. 2020."},{"key":"ref45","series-title":"2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"10941","article-title":"Multi-modality cross attention network for image and sentence matching","volume":"2020","author":"Wei","year":"2020 Jun 13\u201319"},{"key":"ref46","series-title":"2017 IEEE International Conference on Computer Vision (ICCV)","first-page":"1869","article-title":"Class rectification hard mining for imbalanced deep learning","volume":"2017","author":"Dong","year":"2017 Oct 22\u201329"},{"key":"ref47","doi-asserted-by":"crossref","unstructured":"Luo G, Darrell T, Rohrbach A. NewsCLIPpings: Automatic generation of out-of-context multimodal media. arXiv:2104.05893. 
2021.","DOI":"10.18653\/v1\/2021.emnlp-main.545"},{"key":"ref48","first-page":"2579","article-title":"Visualizing data using t-SNE","volume":"9","author":"Maaten","year":"2008","journal-title":"J Mach Learn Res"},{"key":"ref49","series-title":"Proceedings of the 27th ACM International Conference on Multimedia","first-page":"12","article-title":"Matching images and text with multi-modal tensor fusion and re-ranking","author":"Wang","year":"2019"},{"key":"ref50","series-title":"2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","article-title":"Learning the best pooling strategy for visual semantic embedding","author":"Chen","year":"2021 Jun 20\u201325"},{"key":"ref51","series-title":"2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"15640","article-title":"Negative-aware attention framework for image-text matching","volume":"2022","author":"Zhang","year":"2022 Jun 18\u201324"},{"key":"ref52","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3451390","article-title":"Fine-grained visual textual alignment for cross-modal retrieval using transformer encoders","volume":"17","author":"Messina","year":"2021","journal-title":"ACM Trans Multimed Comput Commun Appl"},{"key":"ref53","first-page":"1218","article-title":"Similarity reasoning and filtration for image-text matching","volume":"35","author":"Diao","year":"2021","journal-title":"Proc AAAI Conf Artif Intell"},{"key":"ref54","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3499027","article-title":"Cross-modal graph matching network for image-text retrieval","volume":"18","author":"Cheng","year":"2022","journal-title":"ACM Trans Multimed Comput Commun Appl"},{"key":"ref55","series-title":"2023 IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV)","first-page":"1022","article-title":"Cross-modal semantic enhanced interaction for image-sentence retrieval","volume":"2023","author":"Ge","year":"2023 Jan 
2\u20137"}],"container-title":["Computers, Materials & Continua"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/cdn.techscience.cn\/files\/cmc\/2025\/TSP_CMC-83-2\/TSP_CMC_61037\/TSP_CMC_61037.pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,17]],"date-time":"2025-11-17T01:05:49Z","timestamp":1763341549000},"score":1,"resource":{"primary":{"URL":"https:\/\/www.techscience.com\/cmc\/v83n2\/60544"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"references-count":55,"journal-issue":{"issue":"2","published-online":{"date-parts":[[2025]]},"published-print":{"date-parts":[[2025]]}},"URL":"https:\/\/doi.org\/10.32604\/cmc.2025.061037","relation":{},"ISSN":["1546-2226"],"issn-type":[{"type":"electronic","value":"1546-2226"}],"subject":[],"published":{"date-parts":[[2025]]}}}