{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,14]],"date-time":"2026-04-14T16:38:45Z","timestamp":1776184725706,"version":"3.50.1"},"reference-count":122,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"8","license":[{"start":{"date-parts":[[2023,8,1]],"date-time":"2023-08-01T00:00:00Z","timestamp":1690848000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2023,8,1]],"date-time":"2023-08-01T00:00:00Z","timestamp":1690848000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,8,1]],"date-time":"2023-08-01T00:00:00Z","timestamp":1690848000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"National Key R&#x0026;D Program of China","award":["2020AAA0108800"],"award-info":[{"award-number":["2020AAA0108800"]}]},{"DOI":"10.13039\/501100012226","name":"Fundamental Research Funds for the Central Universities","doi-asserted-by":"publisher","award":["226-2022-00051"],"award-info":[{"award-number":["226-2022-00051"]}],"id":[{"id":"10.13039\/501100012226","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Pattern Anal. Mach. Intell."],"published-print":{"date-parts":[[2023,8]]},"DOI":"10.1109\/tpami.2023.3262578","type":"journal-article","created":{"date-parts":[[2023,3,28]],"date-time":"2023-03-28T18:48:31Z","timestamp":1680029311000},"page":"10055-10069","source":"Crossref","is-referenced-by-count":85,"title":["Local-Global Context Aware Transformer for Language-Guided Video Segmentation"],"prefix":"10.1109","volume":"45","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-8026-0843","authenticated-orcid":false,"given":"Chen","family":"Liang","sequence":"first","affiliation":[{"name":"ReLER, CCAI, Zhejiang University, Hangzhou, Zhejiang, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0802-9567","authenticated-orcid":false,"given":"Wenguan","family":"Wang","sequence":"additional","affiliation":[{"name":"ReLER, CCAI, Zhejiang University, Hangzhou, Zhejiang, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5475-1473","authenticated-orcid":false,"given":"Tianfei","family":"Zhou","sequence":"additional","affiliation":[{"name":"ETH Zurich, Z&#x00FC;rich, Switzerland"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4238-8475","authenticated-orcid":false,"given":"Jiaxu","family":"Miao","sequence":"additional","affiliation":[{"name":"ReLER, CCAI, Zhejiang University, Hangzhou, Zhejiang, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7037-1806","authenticated-orcid":false,"given":"Yawei","family":"Luo","sequence":"additional","affiliation":[{"name":"ReLER, CCAI, Zhejiang University, Hangzhou, Zhejiang, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0512-880X","authenticated-orcid":false,"given":"Yi","family":"Yang","sequence":"additional","affiliation":[{"name":"ReLER, CCAI, Zhejiang University, Hangzhou, Zhejiang, China"}]}],"member":"263","reference":[{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2819173"},{"key":"ref56","first-page":"2491","article-title":"Associating objects with transformers for video object segmentation","author":"yang","year":"2021","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2020.2966453"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.238"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.372"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.5244\/C.31.116"},{"key":"ref55","article-title":"Decoupling features in hierarchical propagation for video object segmentation","author":"yang","year":"2022","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.336"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.565"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01355"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01089"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00478"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.777"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01005"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00205"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.375"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00206"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00438"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00579"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00996"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6895"},{"key":"ref9","first-page":"2228","article-title":"Zero-shot video object segmentation with co-attention siamese networks","volume":"44","author":"lu","year":"2022","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58555-6_13"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00404"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2020\/132"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"ref100","article-title":"Linformer: Self-attention with linear complexity","author":"wang","year":"2020"},{"key":"ref101","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1285"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46475-6_5"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01050"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58607-2_4"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.470"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01111"},{"key":"ref31","article-title":"Rethinking cross-modal interaction from a top-down perspective for referring video object segmentation","author":"liang","year":"2021"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00602"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01525"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01075"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.333"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.493"},{"key":"ref24","first-page":"213","article-title":"End-to-end object detection with transformers","author":"carion","year":"2020","journal-title":"Proc Eur Conf Comput Vis"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00493"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46448-0_7"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00863"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1631\/FITEE.2100463"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00492"},{"key":"ref21","article-title":"Towards data-and knowledge-driven artificial intelligence: A survey on neuro-symbolic computing","author":"wang","year":"2022"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01231-1_3"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.143"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01252-6_39"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01043"},{"key":"ref12","first-page":"5998","article-title":"Attention is all you need","author":"vaswani","year":"2017","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref15","first-page":"123","article-title":"Video object segmentation with language referring expressions","author":"khoreva","year":"2018","journal-title":"Proc Asian Conf Comput Vis"},{"key":"ref14","article-title":"The 3rd large-scale video object segmentation challenge","year":"2021"},{"key":"ref97","article-title":"Generating long sequences with sparse transformers","author":"child","year":"2019"},{"key":"ref96","article-title":"End-to-end memory networks","volume":"28","author":"sukhbaatar","year":"2015","journal-title":"Advances in neural information processing systems"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2020.3015894"},{"key":"ref99","first-page":"5156","article-title":"Transformers are RNNs: Fast autoregressive transformers with linear attention","author":"katharopoulos","year":"2020","journal-title":"Proc ACM Int Conf Mach Learn"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00412"},{"key":"ref98","article-title":"Longformer: The long-document transformer","author":"beltagy","year":"2020"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2017.2712608"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00417"},{"key":"ref19","article-title":"Hierarchical interaction network for video object segmentation from referring expressions","author":"yang","year":"2021","journal-title":"Proc Brit Mach Vis Conf"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00556"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-022-01721-6"},{"key":"ref92","article-title":"Neural map: Structured memory for deep reinforcement learning","author":"parisotto","year":"2019","journal-title":"Proc Int Conf Learn Represent"},{"key":"ref95","article-title":"Memory networks","author":"weston","year":"2015","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref94","article-title":"Neural machine translation by jointly learning to align and translate","author":"bahdanau","year":"2015","journal-title":"Proc Int Conf Learn Represent"},{"key":"ref91","first-page":"2790","article-title":"Control of memory, active perception, and action in Minecraft","author":"oh","year":"2016","journal-title":"Proc ACM Int Conf Mach Learn"},{"key":"ref90","article-title":"Neural turing machines","author":"graves","year":"2014"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00852"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1006\/jcss.1995.1013"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/W14-4012"},{"key":"ref88","article-title":"Recurrent space-time graph neural networks","author":"nicolicioiu","year":"2019","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref87","article-title":"Sequence to sequence learning with neural networks","author":"sutskever","year":"2014","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00180"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/JAS.2023.123456"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"ref83","doi-asserted-by":"crossref","first-page":"533","DOI":"10.1038\/323533a0","article-title":"Learning representations by back-propagating errors","volume":"323","author":"rumelhart","year":"1986","journal-title":"Nature"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00179"},{"key":"ref79","first-page":"19652","article-title":"Referring transformer: A one-step approach to multi-task visual grounding","author":"li","year":"2021","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref108","article-title":"Adam: A method for stochastic optimization","author":"kingma","year":"2015","journal-title":"Proc Int Conf Learn Represent"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01601"},{"key":"ref109","article-title":"GMMSeg: Gaussian mixture based generative semantic segmentation models","author":"liang","year":"2022","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref106","article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","author":"dosovitskiy","year":"2021","journal-title":"Proc Int Conf Learn Represent"},{"key":"ref107","article-title":"Bidirectional LSTM-CRF models for sequence tagging","author":"huang","year":"2015"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01315"},{"key":"ref104","first-page":"562","article-title":"Deeply-supervised nets","volume":"38","author":"lee","year":"2015","journal-title":"Proc Artif Intell Statist"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58523-5_20"},{"key":"ref105","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.164"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00877"},{"key":"ref102","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01512"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00756"},{"key":"ref103","article-title":"LANA: A language-capable navigator for instruction following and generation","author":"wang","year":"2023","journal-title":"Proc IEEE Conf Comput Vis Pattern Recognit"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00624"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3225573"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"ref111","first-page":"4171","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","author":"devlin","year":"2018","journal-title":"Proc Conf North Amer Chapter Assoc Comput Linguistics Hum Lang Technol"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6795"},{"key":"ref112","article-title":"Simple BERT models for relation extraction and semantic role labeling","author":"shi","year":"2019"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01001"},{"key":"ref72","article-title":"VL-BERT: Pre-training of generic visual-linguistic representations","author":"su","year":"2020","journal-title":"Proc Int Conf Learn Represent"},{"key":"ref110","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1548"},{"key":"ref68","first-page":"13","article-title":"ViLBERT: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks","author":"lu","year":"2019","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref119","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.85"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01248"},{"key":"ref117","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1514"},{"key":"ref118","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00320"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58548-8_13"},{"key":"ref115","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-demos.14"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.233"},{"key":"ref116","article-title":"Very deep convolutional networks for large-scale image recognition","author":"simonyan","year":"2015","journal-title":"Proc Int Conf Learn Represent"},{"key":"ref66","first-page":"4648","article-title":"MMFT-BERT: Multimodal fusion transformer with bert encodings for visual question answering","author":"khan","year":"2020","journal-title":"Proc Conf Empirical Methods Natural Process"},{"key":"ref113","first-page":"143","article-title":"Towards robust linguistic analysis using ontonotes","author":"pradhan","year":"2013","journal-title":"Proc 22nd Conf Comput Natural Lang Learn"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00970"},{"key":"ref114","doi-asserted-by":"publisher","DOI":"10.1162\/0891201053630264"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3115815"},{"key":"ref122","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.9"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58580-8_39"},{"key":"ref120","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i2.20017"},{"key":"ref61","first-page":"4701","article-title":"Collaborative video object segmentation by multi-scale foreground-background integration","volume":"44","author":"yang","year":"2022","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"ref121","first-page":"7","article-title":"Progressive multimodal interaction network for referring video object segmentation","author":"ding","year":"2021","journal-title":"The 2nd large-scale video object segmentation challenge"}],"container-title":["IEEE Transactions on Pattern Analysis and Machine Intelligence"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/34\/10169863\/10083244.pdf?arnumber=10083244","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,4,4]],"date-time":"2025-04-04T19:23:49Z","timestamp":1743794629000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10083244\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,8]]},"references-count":122,"journal-issue":{"issue":"8"},"URL":"https:\/\/doi.org\/10.1109\/tpami.2023.3262578","relation":{},"ISSN":["0162-8828","2160-9292","1939-3539"],"issn-type":[{"value":"0162-8828","type":"print"},{"value":"2160-9292","type":"electronic"},{"value":"1939-3539","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,8]]}}}