{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,26]],"date-time":"2026-04-26T05:23:06Z","timestamp":1777180986410,"version":"3.51.4"},"reference-count":76,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2022,11,1]],"date-time":"2022-11-01T00:00:00Z","timestamp":1667260800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2022,11,1]],"date-time":"2022-11-01T00:00:00Z","timestamp":1667260800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2022,9,22]],"date-time":"2022-09-22T00:00:00Z","timestamp":1663804800000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Medical Image Analysis"],"published-print":{"date-parts":[[2022,11]]},"DOI":"10.1016\/j.media.2022.102630","type":"journal-article","created":{"date-parts":[[2022,9,17]],"date-time":"2022-09-17T18:57:14Z","timestamp":1663441034000},"page":"102630","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":22,"special_numbering":"C","title":["Gaze-assisted automatic captioning of fetal ultrasound videos using three-way multi-modal deep neural networks"],"prefix":"10.1016","volume":"82","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-2271-0578","authenticated-orcid":false,"given":"Mohammad","family":"Alsharid","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yifan","family":"Cai","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Harshita","family":"Sharma","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5588-1410","authenticated-orcid":false,"given":"Lior","family":"Drukker","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Aris T.","family":"Papageorghiou","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3060-3772","authenticated-orcid":false,"given":"J. Alison","family":"Noble","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"78","reference":[{"key":"10.1016\/j.media.2022.102630_b1","doi-asserted-by":"crossref","unstructured":"Allaouzi,\u00a0I., Ben\u00a0Ahmed,\u00a0M., Benamrou,\u00a0B., Ouardouz,\u00a0M., 2018. Automatic Caption Generation for Medical Images. In: Proceedings of the 3rd International Conference on Smart City Applications. pp. 1\u20136.","DOI":"10.1145\/3286606.3286863"},{"key":"10.1016\/j.media.2022.102630_b2","series-title":"Medical Ultrasound, and Preterm, Perinatal and Paediatric Image Analysis","first-page":"75","article-title":"A curriculum learning based approach to captioning ultrasound images","author":"Alsharid","year":"2020"},{"key":"10.1016\/j.media.2022.102630_b3","series-title":"2021 IEEE 18th International Symposium on Biomedical Imaging","first-page":"716","article-title":"A course-focused dual curriculum for image captioning","author":"Alsharid","year":"2021"},{"key":"10.1016\/j.media.2022.102630_b4","series-title":"International Conference on Medical Image Computing and Computer-Assisted Intervention","first-page":"338","article-title":"Captioning ultrasound images automatically","author":"Alsharid","year":"2019"},{"key":"10.1016\/j.media.2022.102630_b5","doi-asserted-by":"crossref","first-page":"409","DOI":"10.1613\/jair.4900","article-title":"Automatic description generation from images: A survey of models, datasets, and evaluation measures","volume":"55","author":"Bernardi","year":"2016","journal-title":"J. Artificial Intelligence Res."},{"key":"10.1016\/j.media.2022.102630_b6","series-title":"Natural Language Processing with Python: Analyzing Text with the Natural Language Toolkit","author":"Bird","year":"2009"},{"key":"10.1016\/j.media.2022.102630_b7","series-title":"Enriching word vectors with subword information","author":"Bojanowski","year":"2016"},{"key":"10.1016\/j.media.2022.102630_b8","series-title":"Deep Learning Sonographer Visual Attention","author":"Cai","year":"2019"},{"key":"10.1016\/j.media.2022.102630_b9","doi-asserted-by":"crossref","DOI":"10.1016\/j.media.2020.101762","article-title":"Spatio-temporal visual attention modelling of standard biometry plane-finding navigation","volume":"65","author":"Cai","year":"2020","journal-title":"Med. Image Anal."},{"key":"10.1016\/j.media.2022.102630_b10","series-title":"International Conference on Medical Image Computing and Computer-Assisted Intervention","first-page":"871","article-title":"Multi-task SonoEyeNet: detection of fetal standardized planes assisted by generated sonographer attention maps","author":"Cai","year":"2018"},{"key":"10.1016\/j.media.2022.102630_b11","series-title":"2018 IEEE 15th International Symposium on Biomedical Imaging (ISBI 2018)","first-page":"1475","article-title":"Sonoeyenet: Standardized fetal ultrasound plane detection informed by eye tracking","author":"Cai","year":"2018"},{"key":"10.1016\/j.media.2022.102630_b12","series-title":"Proceedings of the 34th International Conference on Machine Learning-Volume 70","first-page":"894","article-title":"Soft-DTW: a differentiable loss function for time-series","author":"Cuturi","year":"2017"},{"key":"10.1016\/j.media.2022.102630_b13","doi-asserted-by":"crossref","unstructured":"Damen,\u00a0D., Doughty,\u00a0H., Farinella,\u00a0G.M., Fidler,\u00a0S., Furnari,\u00a0A., Kazakos,\u00a0E., Moltisanti,\u00a0D., Munro,\u00a0J., Perrett,\u00a0T., Price,\u00a0W., et al., 2018. Scaling egocentric vision: The epic-kitchens dataset. In: Proceedings of the European Conference on Computer Vision. ECCV, pp. 720\u2013736.","DOI":"10.1007\/978-3-030-01225-0_44"},{"key":"10.1016\/j.media.2022.102630_b14","doi-asserted-by":"crossref","first-page":"90","DOI":"10.1016\/j.cviu.2017.10.001","article-title":"Human attention in visual question answering: Do humans and deep networks look at the same regions?","volume":"163","author":"Das","year":"2017","journal-title":"Comput. Vis. Image Underst."},{"key":"10.1016\/j.media.2022.102630_b15","series-title":"2009 IEEE Conference on Computer Vision and Pattern Recognition","first-page":"248","article-title":"Imagenet: A large-scale hierarchical image database","author":"Deng","year":"2009"},{"key":"10.1016\/j.media.2022.102630_b16","series-title":"Bert: Pre-training of deep bidirectional transformers for language understanding","author":"Devlin","year":"2018"},{"key":"10.1016\/j.media.2022.102630_b17","series-title":"Annual Conference on Medical Image Understanding and Analysis","first-page":"174","article-title":"Towards capturing sonographic experience: cognition-inspired ultrasound video saliency prediction","author":"Droste","year":"2019"},{"issue":"3","key":"10.1016\/j.media.2022.102630_b18","doi-asserted-by":"crossref","first-page":"375","DOI":"10.1002\/uog.21929","article-title":"Expected-value bias in routine third-trimester growth scans","volume":"55","author":"Drukker","year":"2020","journal-title":"Ultrasound Obstet. Gynecol."},{"issue":"02","key":"10.1016\/j.media.2022.102630_b19","doi-asserted-by":"crossref","first-page":"138","DOI":"10.1055\/a-1074-0722","article-title":"Safety indices of ultrasound: adherence to recommendations and awareness during routine obstetric ultrasound scanning","volume":"41","author":"Drukker","year":"2020","journal-title":"Ultraschall Der Medizin-European J. Ultrasound"},{"key":"10.1016\/j.media.2022.102630_b20","doi-asserted-by":"crossref","first-page":"235","DOI":"10.1002\/uog.22958","article-title":"VP40. 20: Standard biometric planes: what are the salient anatomical landmarks?","volume":"56","author":"Drukker","year":"2020","journal-title":"Ultrasound Obstet. Gynecol."},{"issue":"1","key":"10.1016\/j.media.2022.102630_b21","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1038\/s41598-021-92829-1","article-title":"Transforming obstetric ultrasound into data science using eye tracking, voice recording, transducer motion and ultrasound video","volume":"11","author":"Drukker","year":"2021","journal-title":"Sci. Rep."},{"key":"10.1016\/j.media.2022.102630_b22","doi-asserted-by":"crossref","unstructured":"Elliott,\u00a0D., Keller,\u00a0F., 2013. Image description using visual dependency representations. In: Proceedings of the 2013 Conference on Empirical Methods in Natural Language Processing. pp. 1292\u20131302.","DOI":"10.18653\/v1\/D13-1128"},{"key":"10.1016\/j.media.2022.102630_b23","series-title":"European Conference on Computer Vision","first-page":"15","article-title":"Every picture tells a story: Generating sentences from images","author":"Farhadi","year":"2010"},{"key":"10.1016\/j.media.2022.102630_b24","doi-asserted-by":"crossref","first-page":"345","DOI":"10.1613\/jair.4992","article-title":"A primer on neural network models for natural language processing","volume":"57","author":"Goldberg","year":"2016","journal-title":"J. Artificial Intelligence Res."},{"key":"10.1016\/j.media.2022.102630_b25","series-title":"Deep Learning","author":"Goodfellow","year":"2016"},{"key":"10.1016\/j.media.2022.102630_b26","series-title":"Google cloud speech-to-text - speech recognition","author":"Google Cloud","year":"2019"},{"key":"10.1016\/j.media.2022.102630_b27","doi-asserted-by":"crossref","unstructured":"Guadarrama,\u00a0S., Krishnamoorthy,\u00a0N., Malkarnenkar,\u00a0G., Venugopalan,\u00a0S., Mooney,\u00a0R., Darrell,\u00a0T., Saenko,\u00a0K., 2013. Youtube2text: Recognizing and describing arbitrary activities using semantic hierarchies and zero-shot recognition. In: Proceedings of the IEEE International Conference on Computer Vision. pp. 2712\u20132719.","DOI":"10.1109\/ICCV.2013.337"},{"key":"10.1016\/j.media.2022.102630_b28","doi-asserted-by":"crossref","unstructured":"He,\u00a0K., Zhang,\u00a0X., Ren,\u00a0S., Sun,\u00a0J., 2016. Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. pp. 770\u2013778.","DOI":"10.1109\/CVPR.2016.90"},{"key":"10.1016\/j.media.2022.102630_b29","unstructured":"Kingma,\u00a0D., 2015. Ba J. Adam: a method for stochastic optimization. In: The International Conference on Learning Representations."},{"key":"10.1016\/j.media.2022.102630_b30","article-title":"NHS fetal anomaly screening programme","volume":"18","author":"Kirwan","year":"2010","journal-title":"National Stand. Guid. Engl."},{"issue":"2","key":"10.1016\/j.media.2022.102630_b31","doi-asserted-by":"crossref","first-page":"171","DOI":"10.1023\/A:1020346032608","article-title":"Natural language description of human activities from video images based on concept hierarchy of actions","volume":"50","author":"Kojima","year":"2002","journal-title":"Int. J. Comput. Vis."},{"key":"10.1016\/j.media.2022.102630_b32","series-title":"A survey on biomedical image captioning","author":"Kougia","year":"2019"},{"issue":"4","key":"10.1016\/j.media.2022.102630_b33","doi-asserted-by":"crossref","first-page":"1234","DOI":"10.1093\/bioinformatics\/btz682","article-title":"Biobert: a pre-trained biomedical language representation model for biomedical text mining","volume":"36","author":"Lee","year":"2020","journal-title":"Bioinformatics"},{"key":"10.1016\/j.media.2022.102630_b34","series-title":"Text Summarization Branches Out","first-page":"74","article-title":"Rouge: A package for automatic evaluation of summaries","author":"Lin","year":"2004"},{"key":"10.1016\/j.media.2022.102630_b35","doi-asserted-by":"crossref","unstructured":"Lin,\u00a0T.-Y., Goyal,\u00a0P., Girshick,\u00a0R., He,\u00a0K., Doll\u00e1r,\u00a0P., 2017. Focal loss for dense object detection. In: Proceedings of the IEEE International Conference on Computer Vision. pp. 2980\u20132988.","DOI":"10.1109\/ICCV.2017.324"},{"key":"10.1016\/j.media.2022.102630_b36","series-title":"European Conference on Computer Vision","first-page":"740","article-title":"Microsoft coco: Common objects in context","author":"Lin","year":"2014"},{"key":"10.1016\/j.media.2022.102630_b37","series-title":"Univl: A unified video and language pre-training model for multimodal understanding and generation","author":"Luo","year":"2020"},{"key":"10.1016\/j.media.2022.102630_b38","doi-asserted-by":"crossref","unstructured":"Miech,\u00a0A., Zhukov,\u00a0D., Alayrac,\u00a0J.-B., Tapaswi,\u00a0M., Laptev,\u00a0I., Sivic,\u00a0J., 2019. Howto100m: Learning a text-video embedding by watching hundred million narrated video clips. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 2630\u20132640.","DOI":"10.1109\/ICCV.2019.00272"},{"key":"10.1016\/j.media.2022.102630_b39","series-title":"Efficient estimation of word representations in vector space","author":"Mikolov","year":"2013"},{"key":"10.1016\/j.media.2022.102630_b40","series-title":"Advances in Neural Information Processing Systems","first-page":"3111","article-title":"Distributed representations of words and phrases and their compositionality","author":"Mikolov","year":"2013"},{"key":"10.1016\/j.media.2022.102630_b41","series-title":"2021 International Joint Conference on Neural Networks","first-page":"1","article-title":"Bioalbert: A simple and effective pre-trained language model for biomedical named entity recognition","author":"Naseem","year":"2021"},{"key":"10.1016\/j.media.2022.102630_b42","series-title":"NHS fetal anomaly screening programme handbook 2018","author":"NHS","year":"2018"},{"key":"10.1016\/j.media.2022.102630_b43","series-title":"Advances in Neural Information Processing Systems","first-page":"1143","article-title":"Im2text: Describing images using 1 million captioned photographs","author":"Ordonez","year":"2011"},{"key":"10.1016\/j.media.2022.102630_b44","doi-asserted-by":"crossref","unstructured":"Pan,\u00a0Y., Mei,\u00a0T., Yao,\u00a0T., Li,\u00a0H., Rui,\u00a0Y., 2016a. Jointly modeling embedding and translation to bridge video and language. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. pp. 4594\u20134602.","DOI":"10.1109\/CVPR.2016.497"},{"key":"10.1016\/j.media.2022.102630_b45","doi-asserted-by":"crossref","unstructured":"Pan,\u00a0P., Xu,\u00a0Z., Yang,\u00a0Y., Wu,\u00a0F., Zhuang,\u00a0Y., 2016b. Hierarchical recurrent neural encoder for video representation with application to captioning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. pp. 1029\u20131038.","DOI":"10.1109\/CVPR.2016.117"},{"key":"10.1016\/j.media.2022.102630_b46","series-title":"Proceedings of the 40th Annual Meeting on Association for Computational Linguistics","first-page":"311","article-title":"BLEU: a method for automatic evaluation of machine translation","author":"Papineni","year":"2002"},{"key":"10.1016\/j.media.2022.102630_b47","first-page":"2825","article-title":"Scikit-learn: Machine learning in python","volume":"12","author":"Pedregosa","year":"2011","journal-title":"J. Mach. Learn. Res."},{"key":"10.1016\/j.media.2022.102630_b48","series-title":"Proceedings of the NAACL HLT 2010 Workshop on Creating Speech and Language Data with Amazon\u2019s Mechanical Turk","first-page":"139","article-title":"Collecting image annotations using amazon\u2019s mechanical turk","author":"Rashtchian","year":"2010"},{"key":"10.1016\/j.media.2022.102630_b49","doi-asserted-by":"crossref","unstructured":"Rohrbach,\u00a0M., Qiu,\u00a0W., Titov,\u00a0I., Thater,\u00a0S., Pinkal,\u00a0M., Schiele,\u00a0B., 2013. Translating video content to natural language descriptions. In: Proceedings of the IEEE International Conference on Computer Vision. pp. 433\u2013440.","DOI":"10.1109\/ICCV.2013.61"},{"key":"10.1016\/j.media.2022.102630_b50","series-title":"German Conference on Pattern Recognition","first-page":"184","article-title":"Coherent multi-sentence video description with variable level of detail","author":"Rohrbach","year":"2014"},{"key":"10.1016\/j.media.2022.102630_b51","series-title":"International Conference on Medical Image Computing and Computer-Assisted Intervention","first-page":"603","article-title":"Observational supervision for medical image classification using gaze data","author":"Saab","year":"2021"},{"key":"10.1016\/j.media.2022.102630_b52","series-title":"2019 IEEE 16th International Symposium on Biomedical Imaging (ISBI 2019)","first-page":"987","article-title":"Spatio-temporal partitioning and description of full-length routine fetal anomaly ultrasound scans","author":"Sharma","year":"2019"},{"key":"10.1016\/j.media.2022.102630_b53","doi-asserted-by":"crossref","DOI":"10.1016\/j.media.2021.101973","article-title":"Knowledge representation and learning of operator clinical workflow from full-length routine fetal ultrasound scan videos","volume":"69","author":"Sharma","year":"2021","journal-title":"Med. Image Anal."},{"key":"10.1016\/j.media.2022.102630_b54","doi-asserted-by":"crossref","unstructured":"Siersdorfer,\u00a0S., San\u00a0Pedro,\u00a0J., Sanderson,\u00a0M., 2009. Automatic video tagging using content redundancy. In: Proceedings of the 32nd International ACM SIGIR Conference on Research and Development in Information Retrieval. pp. 395\u2013402.","DOI":"10.1145\/1571941.1572010"},{"key":"10.1016\/j.media.2022.102630_b55","series-title":"Very deep convolutional networks for large-scale image recognition","author":"Simonyan","year":"2014"},{"key":"10.1016\/j.media.2022.102630_b56","series-title":"Seeing with humans: Gaze-assisted neural image captioning","author":"Sugano","year":"2016"},{"key":"10.1016\/j.media.2022.102630_b57","series-title":"What is the role of recurrent neural networks (RNNs) in an image caption generator?","author":"Tanti","year":"2017"},{"issue":"3","key":"10.1016\/j.media.2022.102630_b58","doi-asserted-by":"crossref","first-page":"467","DOI":"10.1017\/S1351324918000098","article-title":"Where to put the image in an image caption generator","volume":"24","author":"Tanti","year":"2018","journal-title":"Nat. Lang. Eng."},{"key":"10.1016\/j.media.2022.102630_b59","doi-asserted-by":"crossref","unstructured":"Vinyals,\u00a0O., Toshev,\u00a0A., Bengio,\u00a0S., Erhan,\u00a0D., 2015. Show and tell: A neural image caption generator. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. pp. 3156\u20133164.","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"10.1016\/j.media.2022.102630_b60","series-title":"Frontiers of Multimedia Research","first-page":"3","article-title":"Deep learning for video classification and captioning","author":"Wu","year":"2017"},{"key":"10.1016\/j.media.2022.102630_b61","series-title":"Advances in Neural Information Processing Systems","first-page":"802","article-title":"Convolutional LSTM network: A machine learning approach for precipitation nowcasting","author":"Xingjian","year":"2015"},{"key":"10.1016\/j.media.2022.102630_b62","series-title":"International Workshop on Machine Learning in Medical Imaging","first-page":"673","article-title":"Reinforced transformer for medical image captioning","author":"Xiong","year":"2019"},{"key":"10.1016\/j.media.2022.102630_b63","unstructured":"Xu,\u00a0K., Ba,\u00a0J., Kiros,\u00a0R., Cho,\u00a0K., Courville,\u00a0A., Salakhudinov,\u00a0R., Zemel,\u00a0R., Bengio,\u00a0Y., 2015a. Show, attend and tell: Neural image caption generation with visual attention. In: International Conference on Machine Learning. pp. 2048\u20132057."},{"key":"10.1016\/j.media.2022.102630_b64","doi-asserted-by":"crossref","unstructured":"Xu,\u00a0J., Mei,\u00a0T., Yao,\u00a0T., Rui,\u00a0Y., 2016. MSR-VTT: A large video description dataset for bridging video and language. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. pp. 5288\u20135296.","DOI":"10.1109\/CVPR.2016.571"},{"key":"10.1016\/j.media.2022.102630_b65","series-title":"AAAI","first-page":"6","article-title":"Jointly modeling deep video and compositional text to bridge vision and language in a unified framework","volume":"Vol. 5","author":"Xu","year":"2015"},{"key":"10.1016\/j.media.2022.102630_b66","doi-asserted-by":"crossref","unstructured":"Yao,\u00a0T., Mei,\u00a0T., Ngo,\u00a0C.-W., Li,\u00a0S., 2013. Annotation for free: Video tagging by mining user search behavior. In: Proceedings of the 21st ACM International Conference on Multimedia. pp. 977\u2013986.","DOI":"10.1145\/2502081.2502085"},{"key":"10.1016\/j.media.2022.102630_b67","doi-asserted-by":"crossref","unstructured":"You,\u00a0Q., Jin,\u00a0H., Wang,\u00a0Z., Fang,\u00a0C., Luo,\u00a0J., 2016. Image captioning with semantic attention. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. pp. 4651\u20134659.","DOI":"10.1109\/CVPR.2016.503"},{"key":"10.1016\/j.media.2022.102630_b68","doi-asserted-by":"crossref","first-page":"67","DOI":"10.1162\/tacl_a_00166","article-title":"From image descriptions to visual denotations: New similarity metrics for semantic inference over event descriptions","volume":"2","author":"Young","year":"2014","journal-title":"Trans. Assoc. Comput. Linguist."},{"key":"10.1016\/j.media.2022.102630_b69","doi-asserted-by":"crossref","unstructured":"Yu,\u00a0Y., Choi,\u00a0J., Kim,\u00a0Y., Yoo,\u00a0K., Lee,\u00a0S.-H., Kim,\u00a0G., 2017. Supervising neural attention models for video captioning by human gaze data. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. pp. 490\u2013498.","DOI":"10.1109\/CVPR.2017.648"},{"key":"10.1016\/j.media.2022.102630_b70","doi-asserted-by":"crossref","unstructured":"Yu,\u00a0H., Wang,\u00a0J., Huang,\u00a0Z., Yang,\u00a0Y., Xu,\u00a0W., 2016. Video paragraph captioning using hierarchical recurrent neural networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. pp. 4584\u20134593.","DOI":"10.1109\/CVPR.2016.496"},{"issue":"5","key":"10.1016\/j.media.2022.102630_b71","doi-asserted-by":"crossref","first-page":"1086","DOI":"10.1007\/s11390-018-1874-8","article-title":"Understanding and generating ultrasound image description","volume":"33","author":"Zeng","year":"2018","journal-title":"J. Comput. Sci. Tech."},{"key":"10.1016\/j.media.2022.102630_b72","article-title":"Deep learning for ultrasound image caption generation based on object detection","author":"Zeng","year":"2019","journal-title":"Neurocomputing"},{"issue":"1","key":"10.1016\/j.media.2022.102630_b73","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1038\/s41597-019-0055-0","article-title":"BioWordVec, improving biomedical word embeddings with subword information and MeSH","volume":"6","author":"Zhang","year":"2019","journal-title":"Sci. Data"},{"issue":"11","key":"10.1016\/j.media.2022.102630_b74","doi-asserted-by":"crossref","first-page":"5552","DOI":"10.1109\/TIP.2019.2916757","article-title":"CAM-RNN: Co-attention model based RNN for video captioning","volume":"28","author":"Zhao","year":"2019","journal-title":"IEEE Trans. Image Process."},{"key":"10.1016\/j.media.2022.102630_b75","doi-asserted-by":"crossref","unstructured":"Zhou,\u00a0L., Palangi,\u00a0H., Zhang,\u00a0L., Hu,\u00a0H., Corso,\u00a0J., Gao,\u00a0J., 2020. Unified vision-language pre-training for image captioning and vqa. In: Proceedings of the AAAI Conference on Artificial Intelligence. Vol. 34, (07), pp. 13041\u201313049.","DOI":"10.1609\/aaai.v34i07.7005"},{"issue":"5","key":"10.1016\/j.media.2022.102630_b76","doi-asserted-by":"crossref","first-page":"739","DOI":"10.3390\/app8050739","article-title":"Captioning transformer with stacked attention modules","volume":"8","author":"Zhu","year":"2018","journal-title":"Appl. Sci."}],"container-title":["Medical Image Analysis"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S1361841522002584?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S1361841522002584?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2025,10,23]],"date-time":"2025-10-23T18:51:51Z","timestamp":1761245511000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S1361841522002584"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,11]]},"references-count":76,"alternative-id":["S1361841522002584"],"URL":"https:\/\/doi.org\/10.1016\/j.media.2022.102630","relation":{},"ISSN":["1361-8415"],"issn-type":[{"value":"1361-8415","type":"print"}],"subject":[],"published":{"date-parts":[[2022,11]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Gaze-assisted automatic captioning of fetal ultrasound videos using three-way multi-modal deep neural networks","name":"articletitle","label":"Article Title"},{"value":"Medical Image Analysis","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.media.2022.102630","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2022 The Authors. Published by Elsevier B.V.","name":"copyright","label":"Copyright"}],"article-number":"102630"}}