{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,24]],"date-time":"2026-01-24T17:01:58Z","timestamp":1769274118124,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":34,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Science Foundation and the Institute of Education Sciences U.S. Department of Educa","award":["2229873"],"award-info":[{"award-number":["2229873"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3611898","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:12Z","timestamp":1698391632000},"page":"5007-5016","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":7,"title":["Chain-of-Look Prompting for Verb-centric Surgical Triplet Recognition in Endoscopic Videos"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7334-7772","authenticated-orcid":false,"given":"Nan","family":"Xi","sequence":"first","affiliation":[{"name":"State University of New York at Buffalo, Buffalo, NY, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9776-4805","authenticated-orcid":false,"given":"Jingjing","family":"Meng","sequence":"additional","affiliation":[{"name":"Amazon, Seattle, WA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7324-7034","authenticated-orcid":false,"given":"Junsong","family":"Yuan","sequence":"additional","affiliation":[{"name":"State University of New York at Buffalo, Buffalo, NY, 
USA"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1016\/S2214-109X(15)70115-4"},{"key":"e_1_3_2_1_2_1","volume-title":"Constitutional AI: Harmlessness from AI Feedback. arXiv preprint arXiv:2212.08073","author":"Bai Yuntao","year":"2022","unstructured":"Yuntao Bai, Saurav Kadavath, Sandipan Kundu, Amanda Askell, Jackson Kernion, Andy Jones, Anna Chen, Anna Goldie, Azalia Mirhoseini, Cameron McKinnon, et al. 2022. Constitutional AI: Harmlessness from AI Feedback. arXiv preprint arXiv:2212.08073 (2022)."},{"key":"e_1_3_2_1_3_1","volume-title":"Empirical evaluation of gated recurrent neural networks on sequence modeling. arXiv preprint arXiv:1412.3555","author":"Chung Junyoung","year":"2014","unstructured":"Junyoung Chung, Caglar Gulcehre, KyungHyun Cho, and Yoshua Bengio. 2014. Empirical evaluation of gated recurrent neural networks on sequence modeling. arXiv preprint arXiv:1412.3555 (2014)."},{"key":"e_1_3_2_1_4_1","volume-title":"BioMed Language Model. https:\/\/huggingface.co\/stanford-crfm\/BioMedLM","author":"Stanford CRFM.","year":"2022","unstructured":"Stanford CRFM. 2022. BioMed Language Model. https:\/\/huggingface.co\/stanford-crfm\/BioMedLM (2022)."},{"key":"e_1_3_2_1_5_1","volume-title":"Clip-adapter: Better vision-language models with feature adapters. arXiv preprint arXiv:2110.04544","author":"Gao Peng","year":"2021","unstructured":"Peng Gao, Shijie Geng, Renrui Zhang, Teli Ma, Rongyao Fang, Yongfeng Zhang, Hongsheng Li, and Yu Qiao. 2021. Clip-adapter: Better vision-language models with feature adapters. arXiv preprint arXiv:2110.04544 (2021)."},{"key":"e_1_3_2_1_6_1","volume-title":"Making pre-trained language models better few-shot learners. arXiv preprint arXiv:2012.15723","author":"Gao Tianyu","year":"2020","unstructured":"Tianyu Gao, Adam Fisch, and Danqi Chen. 2020. Making pre-trained language models better few-shot learners. 
arXiv preprint arXiv:2012.15723 (2020)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_8_1","volume-title":"Unsupervised prompt learning for vision-language models. arXiv preprint arXiv:2204.03649","author":"Huang Tony","year":"2022","unstructured":"Tony Huang, Jack Chu, and Fangyun Wei. 2022. Unsupervised prompt learning for vision-language models. arXiv preprint arXiv:2204.03649 (2022)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV.2018.00081"},{"key":"e_1_3_2_1_10_1","volume-title":"Semi-supervised classification with graph convolutional networks. arXiv preprint arXiv:1609.02907","author":"Kipf Thomas N","year":"2016","unstructured":"Thomas N Kipf and Max Welling. 2016. Semi-supervised classification with graph convolutional networks. arXiv preprint arXiv:1609.02907 (2016)."},{"key":"e_1_3_2_1_11_1","volume-title":"International Conference on Machine Learning. PMLR, 12888--12900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International Conference on Machine Learning. PMLR, 12888--12900."},{"key":"e_1_3_2_1_12_1","volume-title":"Oscar: Object-semantics aligned pre-training for vision-language tasks. In Computer Vision-ECCV 2020: 16th European Conference","author":"Li Xiujun","year":"2020","unstructured":"Xiujun Li, Xi Yin, Chunyuan Li, Pengchuan Zhang, Xiaowei Hu, Lei Zhang, Lijuan Wang, Houdong Hu, Li Dong, Furu Wei, et al. 2020. Oscar: Object-semantics aligned pre-training for vision-language tasks. In Computer Vision-ECCV 2020: 16th European Conference, Glasgow, UK, August 23-28, 2020, Proceedings, Part XXX 16. Springer, 121--137."},{"key":"e_1_3_2_1_13_1","volume-title":"Prefix-tuning: Optimizing continuous prompts for generation. 
arXiv preprint arXiv:2101.00190","author":"Li Xiang Lisa","year":"2021","unstructured":"Xiang Lisa Li and Percy Liang. 2021. Prefix-tuning: Optimizing continuous prompts for generation. arXiv preprint arXiv:2101.00190 (2021)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-39899-8_29"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.3389\/fncom.2013.00065"},{"key":"e_1_3_2_1_16_1","volume-title":"UK","author":"Murahari Vishvak","year":"2020","unstructured":"Vishvak Murahari, Dhruv Batra, Devi Parikh, and Abhishek Das. 2020. Large-scale pretraining for visual dialog: A simple state-of-the-art baseline. In Computer Vision-ECCV 2020: 16th European Conference, Glasgow, UK, August 23-28, 2020, Proceedings, Part XVIII. Springer, 336--352."},{"key":"e_1_3_2_1_17_1","volume-title":"Adversarial NLI: A new benchmark for natural language understanding. arXiv preprint arXiv:1910.14599","author":"Nie Yixin","year":"2019","unstructured":"Yixin Nie, Adina Williams, Emily Dinan, Mohit Bansal, Jason Weston, and Douwe Kiela. 2019. Adversarial NLI: A new benchmark for natural language understanding. arXiv preprint arXiv:1910.14599 (2019)."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-59716-0_35"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.media.2022.102433"},{"key":"e_1_3_2_1_20_1","volume-title":"Abe Fingerhut, Robert D Fanelli, Horacio Asbun, Rajesh Aggarwal, et al.","author":"Pucher Philip H","year":"2018","unstructured":"Philip H Pucher, L Michael Brunt, Neil Davies, Ali Linsk, Amani Munshi, H Alejandro Rodriguez, Abe Fingerhut, Robert D Fanelli, Horacio Asbun, Rajesh Aggarwal, et al. 2018. Outcome trends and safety measures after 30 years of laparoscopic cholecystectomy: a systematic review and pooled data analysis. Surgical endoscopy 32 (2018), 2175--2183."},{"key":"e_1_3_2_1_21_1","volume-title":"International conference on machine learning. 
PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-21504-9_6"},{"key":"e_1_3_2_1_23_1","volume-title":"Chiara Amat di San Filippo, Federico Tombari, Mohamed Alsheakhali, Vasileios Belagiannis, Abouzar Eslami, and Nassir Navab.","author":"Rieke Nicola","year":"2016","unstructured":"Nicola Rieke, David Joseph Tan, Chiara Amat di San Filippo, Federico Tombari, Mohamed Alsheakhali, Vasileios Belagiannis, Abouzar Eslami, and Nassir Navab. 2016. Real-time localization of articulated surgical instruments in retinal micro-surgery. Medical image analysis 34 (2016), 82--100."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-23623-5_1"},{"key":"e_1_3_2_1_25_1","volume-title":"International Conference on Medical Image Computing and Computer-Assisted Intervention. Springer, 692--699","author":"Sznitman Raphael","year":"2014","unstructured":"Raphael Sznitman, Carlos Becker, and Pascal Fua. 2014. Fast part-based classification for instrument detection in minimally invasive surgery. In International Conference on Medical Image Computing and Computer-Assisted Intervention. Springer, 692--699."},{"key":"e_1_3_2_1_26_1","volume-title":"Intravascular imaging and computer assisted stenting and large-scale annotation of biomedical data and expert label synthesis","author":"Vardazaryan Armine","unstructured":"Armine Vardazaryan, Didier Mutter, Jacques Marescaux, and Nicolas Padoy. 2018. Weakly-supervised learning for tool localization in laparoscopic videos. 
In Intravascular imaging and computer assisted stenting and large-scale annotation of biomedical data and expert label synthesis. Springer, 169--179."},{"key":"e_1_3_2_1_27_1","volume-title":"Brian Lester, Nan Du, Andrew M Dai, and Quoc V Le.","author":"Wei Jason","year":"2021","unstructured":"Jason Wei, Maarten Bosma, Vincent Y Zhao, Kelvin Guu, Adams Wei Yu, Brian Lester, Nan Du, Andrew M Dai, and Quoc V Le. 2021. Finetuned language models are zero-shot learners. arXiv preprint arXiv:2109.01652 (2021)."},{"key":"e_1_3_2_1_28_1","volume-title":"Chi, Quoc Le, and Denny Zhou","author":"Wei Jason","year":"2022","unstructured":"Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Ed Chi, Quoc Le, and Denny Zhou. 2022. Chain of thought prompting elicits reasoning in large language models. arXiv preprint arXiv:2201.11903 (2022)."},{"key":"e_1_3_2_1_29_1","volume-title":"An estimation of the global","author":"Weiser Thomas G","year":"2008","unstructured":"Thomas G Weiser, Scott E Regenbogen, Katherine D Thompson, Alex B Haynes, Stuart R Lipsitz, William R Berry, and Atul A Gawande. 2008. An estimation of the global volume of surgery: a modelling strategy based on available data. 
The Lancet 372, 9633 (2008), 139--144."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2022.3191838"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3534678.3539300"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.555"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01631"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-022-01653-1"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3611898","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3611898","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:02:18Z","timestamp":1755820938000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3611898"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":34,"alternative-id":["10.1145\/3581783.3611898","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3611898","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}