{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,20]],"date-time":"2026-05-20T20:37:59Z","timestamp":1779309479132,"version":"3.51.4"},"publisher-location":"Cham","reference-count":47,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031731150","type":"print"},{"value":"9783031731167","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:00:00Z","timestamp":1730332800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:00:00Z","timestamp":1730332800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73116-7_1","type":"book-chapter","created":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T15:15:38Z","timestamp":1730301338000},"page":"1-18","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":17,"title":["WTS: A Pedestrian-Centric Traffic Video Dataset for\u00a0Fine-Grained Spatial-Temporal Understanding"],"prefix":"10.1007","author":[{"given":"Quan","family":"Kong","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yuki","family":"Kawana","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Rajat","family":"Saini","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ashutosh","family":"Kumar","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jingjing","family":"Pan","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ta","family":"Gu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yohei","family":"Ozao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Balazs","family":"Opra","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yoichi","family":"Sato","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Norimasa","family":"Kobori","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,10,31]]},"reference":[{"key":"1_CR1","doi-asserted-by":"crossref","unstructured":"Anderson, P., Fernando, B., Johnson, M., Gould, S.: SPICE: semantic propositional image caption evaluation (2016)","DOI":"10.1007\/978-3-319-46454-1_24"},{"key":"1_CR2","unstructured":"Awad, G., et\u00a0al.: TRECVID 2020: a comprehensive campaign for evaluating video retrieval tasks across multiple application domains. arXiv preprint arXiv:2104.13473 (2021)"},{"key":"1_CR3","unstructured":"Bai, S., et al.: TouchStone: evaluating vision-language models by language models (2023)"},{"key":"1_CR4","unstructured":"Baid, A., et al.: GTSFM: Georgia tech structure from motion. https:\/\/github.com\/borglab\/gtsfm (2021)"},{"key":"1_CR5","unstructured":"Banerjee, S., Lavie, A.: METEOR: an automatic metric for MT evaluation with improved correlation with human judgments. In: Goldstein, J., Lavie, A., Lin, C.Y., Voss, C. (eds.) Proceedings of the ACL Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and\/or Summarization, Ann Arbor, Michigan, pp. 65\u201372. Association for Computational Linguistics (2005). https:\/\/aclanthology.org\/W05-0909"},{"key":"1_CR6","unstructured":"Chen, D., Dolan, W.: Collecting highly parallel data for paraphrase evaluation. In: Lin, D., Matsumoto, Y., Mihalcea, R. (eds.) Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies, Portland, Oregon, USA, pp. 190\u2013200. Association for Computational Linguistics (2011). https:\/\/aclanthology.org\/P11-1020"},{"key":"1_CR7","unstructured":"Chen, S., et al.: VALOR: vision-audio-language omni-perception pretraining model and dataset. arXiv preprint arXiv:2304.08345 (2023)"},{"key":"1_CR8","doi-asserted-by":"crossref","unstructured":"Cheng, H.K., Schwing, A.G.: XMem: long-term video object segmentation with an Atkinson-Shiffrin memory model. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) Computer Vision \u2013 ECCV 2022. Springer, Cham (2022)","DOI":"10.1007\/978-3-031-19815-1_37"},{"key":"1_CR9","doi-asserted-by":"crossref","unstructured":"Heilbron, F.C., Victor\u00a0Escorcia, B.G., Niebles, J.C.: ActivityNet: a large-scale video benchmark for human activity understanding. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 961\u2013970 (2015)","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"1_CR10","doi-asserted-by":"crossref","unstructured":"Fei, H., Ren, Y., Ji, D.: Improving text understanding via deep syntax-semantics communication. In: Findings (2020). https:\/\/api.semanticscholar.org\/CorpusID:226283615","DOI":"10.18653\/v1\/2020.findings-emnlp.8"},{"key":"1_CR11","doi-asserted-by":"crossref","unstructured":"Fu, J., Ng, S.K., Jiang, Z., Liu, P.: GPTScore: evaluate as you desire (2023)","DOI":"10.18653\/v1\/2024.naacl-long.365"},{"key":"1_CR12","doi-asserted-by":"crossref","unstructured":"Guzov, V., Mir, A., Sattler, T., Pons-Moll, G.: Human poseitioning system (HPS): 3D human pose estimation and self-localization in large scenes from body-mounted sensors. In: IEEE Conference on Computer Vision and Pattern Recognition (CVPR). IEEE (2021)","DOI":"10.1109\/CVPR46437.2021.00430"},{"key":"1_CR13","doi-asserted-by":"crossref","unstructured":"Hu, Z., Yang, Y., Zhai, X., Yang, D., Zhou, B., Liu, J.: GFIE: a dataset and baseline for gaze-following from 2D to 3D in indoor environments. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8907\u20138916 (2023)","DOI":"10.1109\/CVPR52729.2023.00860"},{"key":"1_CR14","doi-asserted-by":"crossref","unstructured":"Huang, J.Y., Huang, K.H., Chang, K.W.: Disentangling semantics and syntax in sentence embeddings with pre-trained language models (2021)","DOI":"10.18653\/v1\/2021.naacl-main.108"},{"key":"1_CR15","doi-asserted-by":"crossref","unstructured":"Kellnhofer, P., Recasens, A., Stent, S., Matusik, W., Torralba, A.: Gaze360: physically unconstrained gaze estimation in the wild. In: IEEE International Conference on Computer Vision (ICCV) (2019)","DOI":"10.1109\/ICCV.2019.00701"},{"key":"1_CR16","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"577","DOI":"10.1007\/978-3-030-01216-8_35","volume-title":"Computer Vision \u2013 ECCV 2018","author":"J Kim","year":"2018","unstructured":"Kim, J., Rohrbach, A., Darrell, T., Canny, J., Akata, Z.: Textual explanations for self-driving vehicles. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11206, pp. 577\u2013593. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01216-8_35"},{"key":"1_CR17","doi-asserted-by":"crossref","unstructured":"Kirillov, A., et al.: Segment anything. arXiv:2304.02643 (2023)","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"1_CR18","doi-asserted-by":"publisher","unstructured":"Krishna, K., Chang, Y., Wieting, J., Iyyer, M.: RankGen: improving text generation with large ranking models. In: Goldberg, Y., Kozareva, Z., Zhang, Y. (eds.) Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing, Abu Dhabi, United Arab Emirates, pp. 199\u2013232. Association for Computational Linguistics (2022). https:\/\/doi.org\/10.18653\/v1\/2022.emnlp-main.15, https:\/\/aclanthology.org\/2022.emnlp-main.15","DOI":"10.18653\/v1\/2022.emnlp-main.15"},{"key":"1_CR19","doi-asserted-by":"crossref","unstructured":"Li, K., et al.: MVBench: a comprehensive multi-modal video understanding benchmark (2024)","DOI":"10.1109\/CVPR52733.2024.02095"},{"key":"1_CR20","doi-asserted-by":"crossref","unstructured":"Li, Y., Song, Y., Cao, L., Tetreault, J., Goldberg, L., Jaimes, A., Luo, J.: TGIF: a new dataset and benchmark on animated gif description. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4641\u20134650 (2016)","DOI":"10.1109\/CVPR.2016.502"},{"key":"1_CR21","unstructured":"Lin, C.Y.: ROUGE: A package for automatic evaluation of summaries. In: Text Summarization Branches Out, Barcelona, Spain, pp. 74\u201381. Association for Computational Linguistics (2004). https:\/\/aclanthology.org\/W04-1013"},{"key":"1_CR22","doi-asserted-by":"crossref","unstructured":"Maaz, M., Rasheed, H., Khan, S., Khan, F.S.: Video-ChatGPT: towards detailed video understanding via large vision and language models. arXiv:2306.05424 (2023)","DOI":"10.18653\/v1\/2024.acl-long.679"},{"key":"1_CR23","doi-asserted-by":"publisher","unstructured":"Malla, S., Choi, C., Dwivedi, I., Choi, J.H., Li, J.: DRAMA: joint risk localization and captioning in driving. In: IEEE\/CVF Winter Conference on Applications of Computer Vision, WACV 2023, Waikoloa, HI, USA, 2\u20137 January 2023, pp. 1043\u20131052. IEEE (2023). https:\/\/doi.org\/10.1109\/WACV56688.2023.00110","DOI":"10.1109\/WACV56688.2023.00110"},{"key":"1_CR24","doi-asserted-by":"crossref","unstructured":"Miech, A., Zhukov, D., Alayrac, J.B., Tapaswi, M., Laptev, I., Sivic, J.: HowTo100M: learning a text-video embedding by watching hundred million narrated video clips. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00272"},{"key":"1_CR25","doi-asserted-by":"crossref","unstructured":"Nonaka, S., Nobuhara, S., Nishino, K.: Dynamic 3D gaze from afar: deep gaze estimation from temporal eye-head-body coordination. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2192\u20132201 (2022)","DOI":"10.1109\/CVPR52688.2022.00223"},{"key":"1_CR26","doi-asserted-by":"crossref","unstructured":"Onishi, Hirofumi, H.T.K.R.I.H., Murase, T.: Analysis of pedestrian-fatality statistics in Japan and the US and vehicle-pedestrian communication for vehicle-pedestrian crash-warnings. Int. J. Autom. Eng. 9(4), 231\u2013236 (2018)","DOI":"10.20485\/jsaeijae.9.4_231"},{"key":"1_CR27","doi-asserted-by":"publisher","DOI":"10.3758\/s13428-023-02173-7","author":"V Onkhar","year":"2023","unstructured":"Onkhar, V., Dodou, D., de Winter, J.: Evaluating the tobii pro glasses 2 and 3 in static and dynamic conditions. Behav. Res. Methods (2023). https:\/\/doi.org\/10.3758\/s13428-023-02173-7","journal-title":"Behav. Res. Methods"},{"key":"1_CR28","unstructured":"OpenAI: GPT-3.5 (2023). https:\/\/platform.openai.com\/docs\/models\/gpt-3-5-turbo"},{"key":"1_CR29","doi-asserted-by":"publisher","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.J.: BLEU: a method for automatic evaluation of machine translation. In: Isabelle, P., Charniak, E., Lin, D. (eds.) Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics, Philadelphia, Pennsylvania, USA, pp. 311\u2013318. Association for Computational Linguistics (2002). https:\/\/doi.org\/10.3115\/1073083.1073135, https:\/\/aclanthology.org\/P02-1040","DOI":"10.3115\/1073083.1073135"},{"issue":"10","key":"1_CR30","doi-asserted-by":"publisher","first-page":"14007","DOI":"10.1007\/s11042-018-7040-z","volume":"78","author":"S Pini","year":"2019","unstructured":"Pini, S., Cornia, M., Bolelli, F., Baraldi, L., Cucchiara, R.: M-VAD names: a dataset for video captioning with naming. Multimedia Tools Appl. 78(10), 14007\u201314027 (2019)","journal-title":"Multimedia Tools Appl."},{"key":"1_CR31","doi-asserted-by":"crossref","unstructured":"Regneri, M., Rohrbach, M., Wetzel, D., Thater, S., Schiele, B., Pinkal, M.: Grounding action descriptions in videos. Trans. Assoc. Comput. Linguist. 1, 25\u201336 (2013)","DOI":"10.1162\/tacl_a_00207"},{"key":"1_CR32","doi-asserted-by":"crossref","unstructured":"Rohrbach, A., Rohrbach, M., Tandon, N., Schiele, B.: A dataset for movie description. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2015)","DOI":"10.1109\/CVPR.2015.7298940"},{"key":"1_CR33","doi-asserted-by":"publisher","unstructured":"Oliveira\u00a0dos Santos, G., Colombini, E.L., Avila, S.: CIDEr-R: robust consensus-based image description evaluation. In: Xu, W., Ritter, A., Baldwin, T., Rahimi, A. (eds.) Proceedings of the Seventh Workshop on Noisy User-generated Text (W-NUT 2021), pp. 351\u2013360. Association for Computational Linguistics, Online (2021). https:\/\/doi.org\/10.18653\/v1\/2021.wnut-1.39, https:\/\/aclanthology.org\/2021.wnut-1.39","DOI":"10.18653\/v1\/2021.wnut-1.39"},{"key":"1_CR34","doi-asserted-by":"crossref","unstructured":"Sigurdsson, G.A., Gupta, A., Schmid, C., Farhadi, A., Alahari, K.: Actor and observer: joint modeling of first and third-person videos. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 7396\u20137404 (2018)","DOI":"10.1109\/CVPR.2018.00772"},{"key":"1_CR35","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"510","DOI":"10.1007\/978-3-319-46448-0_31","volume-title":"Computer Vision \u2013 ECCV 2016","author":"GA Sigurdsson","year":"2016","unstructured":"Sigurdsson, G.A., Varol, G., Wang, X., Farhadi, A., Laptev, I., Gupta, A.: Hollywood in homes: crowdsourcing data collection for activity understanding. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9905, pp. 510\u2013526. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46448-0_31"},{"key":"1_CR36","doi-asserted-by":"crossref","unstructured":"Sima, C., et al.: DriveLM: driving with graph visual question answering (2023)","DOI":"10.1007\/978-3-031-72943-0_15"},{"key":"1_CR37","unstructured":"Wang, T., et al.: Caption anything: Interactive image description with diverse multimodal controls. arXiv preprint arXiv:2305.02677 (2023)"},{"key":"1_CR38","doi-asserted-by":"crossref","unstructured":"Wang, T., Zhang, R., Lu, Z., Zheng, F., Cheng, R., Luo, P.: End-to-end dense video captioning with parallel decoding. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 6847\u20136857 (2021)","DOI":"10.1109\/ICCV48922.2021.00677"},{"key":"1_CR39","doi-asserted-by":"crossref","unstructured":"Wang, X., Wu, J., Chen, J., Li, L., Wang, Y.F., Wang, W.Y.: VATEX: a large-scale, high-quality multilingual dataset for video-and-language research. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4581\u20134591 (2019)","DOI":"10.1109\/ICCV.2019.00468"},{"key":"1_CR40","unstructured":"Xu, H., et al.: mPLUG-2: a modularized multi-modal foundation model across text, image and video. ArXiv abs\/2302.00402 (2023)"},{"key":"1_CR41","doi-asserted-by":"crossref","unstructured":"Xu, J., Mei, T., Yao, T., Rui, Y.: MSR-VTT: a large video description dataset for bridging video and language. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 5288\u20135296 (2016)","DOI":"10.1109\/CVPR.2016.571"},{"key":"1_CR42","doi-asserted-by":"crossref","unstructured":"Xu, Z., et al.: DriveGPT4: interpretable end-to-end autonomous driving via large language model. arXiv preprint arXiv:2310.01412 (2023)","DOI":"10.1109\/LRA.2024.3440097"},{"key":"1_CR43","doi-asserted-by":"crossref","unstructured":"Yang, A., et al.: Vid2Seq: large-scale pretraining of a visual language model for dense video captioning. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.01032"},{"key":"1_CR44","unstructured":"Yang, J., Gao, M., Li, Z., Gao, S., Wang, F., Zheng, F.: Track anything: segment anything meets videos (2023)"},{"key":"1_CR45","doi-asserted-by":"crossref","unstructured":"Yu, F., et al.: Bdd100k: a diverse driving dataset for heterogeneous multitask learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2020)","DOI":"10.1109\/CVPR42600.2020.00271"},{"key":"1_CR46","doi-asserted-by":"crossref","unstructured":"Yuan, Y., et al.: Osprey: pixel understanding with visual instruction tuning (2023)","DOI":"10.1109\/CVPR52733.2024.02664"},{"key":"1_CR47","doi-asserted-by":"crossref","unstructured":"Zhang, H., Li, X., Bing, L.: Video-LLaMA: an instruction-tuned audio-visual language model for video understanding. arXiv preprint arXiv:2306.02858 (2023)","DOI":"10.18653\/v1\/2023.emnlp-demo.49"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73116-7_1","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,30]],"date-time":"2024-11-30T14:12:08Z","timestamp":1732975928000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73116-7_1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,31]]},"ISBN":["9783031731150","9783031731167"],"references-count":47,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73116-7_1","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,31]]},"assertion":[{"value":"31 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}