Here are some resources about Text-to-Image and Image-to-Text modeling, understanding, and generation in Multi-Modal LLMs, a.k.a. Visual Language Models (VLMs)
tag: Janus
| DeepSeek
| Peking University
paper link: here
github link: here
citation:
@misc{wu2024janusdecouplingvisualencoding,
  author        = {Chengyue Wu and Xiaokang Chen and Zhiyu Wu and Yiyang Ma and Xingchao Liu and Zizheng Pan and Wen Liu and Zhenda Xie and Xingkai Yu and Chong Ruan and Ping Luo},
  title         = {Janus: Decoupling Visual Encoding for Unified Multimodal Understanding and Generation},
  year          = {2024},
  eprint        = {2410.13848},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
  url           = {https://arxiv.org/abs/2410.13848},
}
tag: Pixtral
| Mistral AI
paper link: here
blog link: here
github link: here
model link: here
citation:
@misc{agrawal2024pixtral12b,
  title         = {Pixtral {12B}},
  author        = {Pravesh Agrawal and Szymon Antoniak and Emma Bou Hanna and Baptiste Bout and Devendra Chaplot and Jessica Chudnovsky and Diogo Costa and Baudouin De Monicault and Saurabh Garg and Theophile Gervet and Soham Ghosh and Amélie Héliou and Paul Jacob and Albert Q. Jiang and Kartik Khandelwal and Timothée Lacroix and Guillaume Lample and Diego Las Casas and Thibaut Lavril and Teven Le Scao and Andy Lo and William Marshall and Louis Martin and Arthur Mensch and Pavankumar Muddireddy and Valera Nemychnikova and Marie Pellat and Patrick Von Platen and Nikhil Raghuraman and Baptiste Rozière and Alexandre Sablayrolles and Lucile Saulnier and Romain Sauvestre and Wendy Shang and Roman Soletskyi and Lawrence Stewart and Pierre Stock and Joachim Studnia and Sandeep Subramanian and Sagar Vaze and Thomas Wang and Sophia Yang},
  year          = {2024},
  eprint        = {2410.07073},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
  url           = {https://arxiv.org/abs/2410.07073},
}
tag: ARIA
| MoE
| Rhymes AI
paper link: here
blog link: here
github link: here
model link: here
homepage link: here
citation:
@misc{li2024ariaopenmultimodalnative,
  title         = {Aria: An Open Multimodal Native {Mixture-of-Experts} Model},
  author        = {Dongxu Li and Yudong Liu and Haoning Wu and Yue Wang and Zhiqi Shen and Bowen Qu and Xinyao Niu and Guoyin Wang and Bei Chen and Junnan Li},
  year          = {2024},
  eprint        = {2410.05993},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
  url           = {https://arxiv.org/abs/2410.05993},
}
tag: Molmo
| PixMo
| Allen AI
paper link: here
modelhub link: here
homepage link: here
citation:
@misc{deitke2024molmopixmoopenweights,
  title         = {Molmo and {PixMo}: Open Weights and Open Data for State-of-the-Art Multimodal Models},
  author        = {Matt Deitke and Christopher Clark and Sangho Lee and Rohun Tripathi and Yue Yang and Jae Sung Park and Mohammadreza Salehi and Niklas Muennighoff and Kyle Lo and Luca Soldaini and Jiasen Lu and Taira Anderson and Erin Bransom and Kiana Ehsani and Huong Ngo and YenSung Chen and Ajay Patel and Mark Yatskar and Chris Callison-Burch and Andrew Head and Rose Hendrix and Favyen Bastani and Eli VanderBilt and Nathan Lambert and Yvonne Chou and Arnavi Chheda and Jenna Sparks and Sam Skjonsberg and Michael Schmitz and Aaron Sarnat and Byron Bischoff and Pete Walsh and Chris Newell and Piper Wolters and Tanmay Gupta and Kuo-Hao Zeng and Jon Borchardt and Dirk Groeneveld and Jen Dumas and Crystal Nam and Sophie Lebrecht and Caitlin Wittlif and Carissa Schoenick and Oscar Michel and Ranjay Krishna and Luca Weihs and Noah A. Smith and Hannaneh Hajishirzi and Ross Girshick and Ali Farhadi and Aniruddha Kembhavi},
  year          = {2024},
  eprint        = {2409.17146},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
  url           = {https://arxiv.org/abs/2409.17146},
}
tag: NVLM
| Nvidia
paper link: here
homepage link: here
model link: here
citation:
@misc{dai2024nvlmopenfrontierclassmultimodal,
  title         = {{NVLM}: Open Frontier-Class Multimodal {LLMs}},
  author        = {Wenliang Dai and Nayeon Lee and Boxin Wang and Zhuolin Yang and Zihan Liu and Jon Barker and Tuomas Rintamaki and Mohammad Shoeybi and Bryan Catanzaro and Wei Ping},
  year          = {2024},
  eprint        = {2409.11402},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2409.11402},
}
tag: Eagle
| Nvidia
paper link: here
github link: here
model-hub link: here
citation:
@misc{shi2024eagleexploringdesignspace,
  title         = {Eagle: Exploring The Design Space for Multimodal {LLMs} with Mixture of Encoders},
  author        = {Min Shi and Fuxiao Liu and Shihao Wang and Shijia Liao and Subhashree Radhakrishnan and De-An Huang and Hongxu Yin and Karan Sapra and Yaser Yacoob and Humphrey Shi and Bryan Catanzaro and Andrew Tao and Jan Kautz and Zhiding Yu and Guilin Liu},
  year          = {2024},
  eprint        = {2408.15998},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
  url           = {https://arxiv.org/abs/2408.15998},
}
tag: Show-o
| ByteDance
| NUS
paper link: here
github link: here
homepage link: here
model link: here
citation:
@misc{xie2024showosingletransformerunify,
  author        = {Jinheng Xie and Weijia Mao and Zechen Bai and David Junhao Zhang and Weihao Wang and Kevin Qinghong Lin and Yuchao Gu and Zhijie Chen and Zhenheng Yang and Mike Zheng Shou},
  title         = {Show-o: One Single Transformer to Unify Multimodal Understanding and Generation},
  year          = {2024},
  eprint        = {2408.12528},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
  url           = {https://arxiv.org/abs/2408.12528},
}
tag: FViT
| CAI24
paper link: here
citation:
@inproceedings{cai2024fastvisiontransformervia,
  author    = {Wen, Yang and Chen, Samuel and Shrestha, Abhishek Krishna},
  title     = {Fast {Vision Transformer} via Additive Attention},
  booktitle = {2024 IEEE Conference on Artificial Intelligence (CAI)},
  year      = {2024},
  pages     = {573--574},
  keywords  = {Computer vision;Additives;Computational modeling;Memory management;Linearity;Convolutional neural networks;Task analysis;Fast Vision Transformer;Additive Attention},
  doi       = {10.1109/CAI59869.2024.00113},
}
tag: RLAIF-V
| Tsinghua University
| NUS
paper link: here
github link: here
citation:
@misc{yu2024rlaifvaligningmllmsopensource,
  title         = {{RLAIF-V}: Aligning {MLLMs} through Open-Source {AI} Feedback for Super {GPT-4V} Trustworthiness},
  author        = {Tianyu Yu and Haoye Zhang and Yuan Yao and Yunkai Dang and Da Chen and Xiaoman Lu and Ganqu Cui and Taiwen He and Zhiyuan Liu and Tat-Seng Chua and Maosong Sun},
  year          = {2024},
  eprint        = {2405.17220},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2405.17220},
}
tag: mPLUG-Owl 2
| Alibaba Group
paper link: here
github link: here
follow-up work: here
citation:
@misc{ye2023mplugowl2revolutionizingmultimodallarge,
  title         = {{mPLUG-Owl2}: Revolutionizing Multi-modal Large Language Model with Modality Collaboration},
  author        = {Qinghao Ye and Haiyang Xu and Jiabo Ye and Ming Yan and Anwen Hu and Haowei Liu and Qi Qian and Ji Zhang and Fei Huang and Jingren Zhou},
  year          = {2023},
  eprint        = {2311.04257},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2311.04257},
}
tag: AdaMV-MoE
| ICCV23
| Apple
| Google
paper link: here
citation:
@inproceedings{chen2023adamv,
  title     = {{AdaMV-MoE}: Adaptive Multi-Task Vision {Mixture-of-Experts}},
  author    = {Chen, Tianlong and Chen, Xuxi and Du, Xianzhi and Rashwan, Abdullah and Yang, Fan and Chen, Huizhong and Wang, Zhangyang and Li, Yeqing},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision},
  pages     = {17346--17357},
  year      = {2023},
}
tag: mPLUG-Owl
| DAMO Academy
| Alibaba Group
paper link: here
github link: here
follow-up work: here
citation:
@misc{ye2024mplugowlmodularizationempowerslarge,
  title         = {{mPLUG-Owl}: Modularization Empowers Large Language Models with Multimodality},
  author        = {Qinghao Ye and Haiyang Xu and Guohai Xu and Jiabo Ye and Ming Yan and Yiyang Zhou and Junyang Wang and Anwen Hu and Pengcheng Shi and Yaya Shi and Chenliang Li and Yuanhong Xu and Hehong Chen and Junfeng Tian and Qi Qian and Ji Zhang and Fei Huang and Jingren Zhou},
  year          = {2024},
  eprint        = {2304.14178},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2304.14178},
}
tag: LLaVA
| NIPS23
| Microsoft
paper link: here
github link: here
homepage link: here
citation:
@misc{liu2023visualinstructiontuning,
  author        = {Haotian Liu and Chunyuan Li and Qingyang Wu and Yong Jae Lee},
  title         = {Visual Instruction Tuning},
  year          = {2023},
  eprint        = {2304.08485},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
  url           = {https://arxiv.org/abs/2304.08485},
}
tag: DiT
| UCB
paper link: here
github link: here
homepage link: here
citation:
@misc{peebles2023scalablediffusionmodelstransformers,
  author        = {William Peebles and Saining Xie},
  title         = {Scalable Diffusion Models with Transformers},
  year          = {2023},
  eprint        = {2212.09748},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
  url           = {https://arxiv.org/abs/2212.09748},
}
tag: OpenCLIP
| LAION
| UCB
paper link: here
github link: here
citation:
@inproceedings{Cherti_2023,
  author    = {Cherti, Mehdi and Beaumont, Romain and Wightman, Ross and Wortsman, Mitchell and Ilharco, Gabriel and Gordon, Cade and Schuhmann, Christoph and Schmidt, Ludwig and Jitsev, Jenia},
  title     = {Reproducible Scaling Laws for Contrastive Language-Image Learning},
  booktitle = {2023 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  publisher = {IEEE},
  year      = {2023},
  month     = jun,
  doi       = {10.1109/cvpr52729.2023.00276},
}
tag: EVA
| CVPR23
| BAAI
| HUST
paper link: here
github link: here
citation:
@misc{fang2022evaexploringlimitsmasked,
  title         = {{EVA}: Exploring the Limits of Masked Visual Representation Learning at Scale},
  author        = {Yuxin Fang and Wen Wang and Binhui Xie and Quan Sun and Ledell Wu and Xinggang Wang and Tiejun Huang and Xinlong Wang and Yue Cao},
  year          = {2022},
  eprint        = {2211.07636},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
  url           = {https://arxiv.org/abs/2211.07636},
}
tag: Cold Diffusion
paper link: here
github link: here
citation:
@misc{bansal2022colddiffusioninvertingarbitrary,
  author        = {Arpit Bansal and Eitan Borgnia and Hong-Min Chu and Jie S. Li and Hamid Kazemi and Furong Huang and Micah Goldblum and Jonas Geiping and Tom Goldstein},
  title         = {Cold Diffusion: Inverting Arbitrary Image Transforms Without Noise},
  year          = {2022},
  eprint        = {2208.09392},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
  url           = {https://arxiv.org/abs/2208.09392},
}
tag: DaViT
| ECCV22
| Microsoft
| HKU
paper link: here
github link: here
citation:
@misc{ding2022davitdualattentionvision,
  title         = {{DaViT}: Dual Attention Vision Transformers},
  author        = {Mingyu Ding and Bin Xiao and Noel Codella and Ping Luo and Jingdong Wang and Lu Yuan},
  year          = {2022},
  eprint        = {2204.03645},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
  url           = {https://arxiv.org/abs/2204.03645},
}
tag: DeiT III
| ECCV22
| Meta
paper link: here
github link: here
citation:
@misc{touvron2022deitiiirevengevit,
  title         = {{DeiT III}: Revenge of the {ViT}},
  author        = {Hugo Touvron and Matthieu Cord and Hervé Jégou},
  year          = {2022},
  eprint        = {2204.07118},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
  url           = {https://arxiv.org/abs/2204.07118},
}
tag: Three Things
| ViT
| ECCV22
| Meta
paper link: here
github link: here
citation:
@misc{touvron2022thingsknowvisiontransformers,
  title         = {Three things everyone should know about {Vision Transformers}},
  author        = {Hugo Touvron and Matthieu Cord and Alaaeldin El-Nouby and Jakob Verbeek and Hervé Jégou},
  year          = {2022},
  eprint        = {2203.09795},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
  url           = {https://arxiv.org/abs/2203.09795},
}
tag: RQ-Transformer
| RQ-VAE
paper link: here
github link: here
citation:
@misc{lee2022autoregressiveimagegenerationusing,
  author        = {Doyup Lee and Chiheon Kim and Saehoon Kim and Minsu Cho and Wook-Shin Han},
  title         = {Autoregressive Image Generation using Residual Quantization},
  year          = {2022},
  eprint        = {2203.01941},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
  url           = {https://arxiv.org/abs/2203.01941},
}
tag: Latent Diffusion
| LDM
| Runway ML
paper link: here
github link: here
citation:
@misc{rombach2022highresolutionimagesynthesislatent,
  author        = {Robin Rombach and Andreas Blattmann and Dominik Lorenz and Patrick Esser and Björn Ommer},
  title         = {High-Resolution Image Synthesis with Latent Diffusion Models},
  year          = {2022},
  eprint        = {2112.10752},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
  url           = {https://arxiv.org/abs/2112.10752},
}
tag: ViT
| ViT-BN
| ViT-FFNBN
| ICCV21
| MSRA
| Tsinghua University
paper link: here
citation:
@inproceedings{yao2021leveraging,
  author    = {Yao, Zhuliang and Cao, Yue and Lin, Yutong and Liu, Ze and Zhang, Zheng and Hu, Han},
  title     = {Leveraging {Batch Normalization} for {Vision Transformers}},
  booktitle = {2021 IEEE/CVF International Conference on Computer Vision Workshops (ICCVW)},
  year      = {2021},
  pages     = {413--422},
  keywords  = {Training;Computer vision;Conferences;Computer architecture;Transformers;Computer crashes;Feeds},
  doi       = {10.1109/ICCVW54120.2021.00050},
}
tag: SDEdit
| Stanford University
| CMU
paper link: here
github link: here
homepage link: here
citation:
@misc{meng2022sdeditguidedimagesynthesis,
  title         = {{SDEdit}: Guided Image Synthesis and Editing with Stochastic Differential Equations},
  author        = {Chenlin Meng and Yutong He and Yang Song and Jiaming Song and Jiajun Wu and Jun-Yan Zhu and Stefano Ermon},
  year          = {2022},
  eprint        = {2108.01073},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
  url           = {https://arxiv.org/abs/2108.01073},
}
tag: Guided Diffusion
| OpenAI
paper link: here
github link: here
citation:
@misc{dhariwal2021diffusionmodelsbeatgans,
  title         = {Diffusion Models Beat {GANs} on Image Synthesis},
  author        = {Prafulla Dhariwal and Alex Nichol},
  year          = {2021},
  eprint        = {2105.05233},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2105.05233},
}
tag: CaiT
| ICCV21
| Meta
paper link: here
github link: here
citation:
@inproceedings{touvron2021goingdeeperimagetransformers,
  author    = {Touvron, Hugo and Cord, Matthieu and Sablayrolles, Alexandre and Synnaeve, Gabriel and J\'egou, Herv\'e},
  title     = {Going Deeper With Image Transformers},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)},
  month     = oct,
  year      = {2021},
  pages     = {32--42},
}
tag: Swin Transformer
| MSRA
paper link: here
github link: here
citation:
@misc{liu2021swintransformerhierarchicalvision,
  author        = {Ze Liu and Yutong Lin and Yue Cao and Han Hu and Yixuan Wei and Zheng Zhang and Stephen Lin and Baining Guo},
  title         = {Swin Transformer: Hierarchical Vision Transformer using Shifted Windows},
  year          = {2021},
  eprint        = {2103.14030},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
  url           = {https://arxiv.org/abs/2103.14030},
}
tag: CLIP
| OpenAI
paper link: here
github link: here
citation:
@misc{radford2021learningtransferablevisualmodels,
  author        = {Alec Radford and Jong Wook Kim and Chris Hallacy and Aditya Ramesh and Gabriel Goh and Sandhini Agarwal and Girish Sastry and Amanda Askell and Pamela Mishkin and Jack Clark and Gretchen Krueger and Ilya Sutskever},
  title         = {Learning Transferable Visual Models From Natural Language Supervision},
  year          = {2021},
  eprint        = {2103.00020},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
  url           = {https://arxiv.org/abs/2103.00020},
}
tag: DALL-E
| OpenAI
paper link: here
github link: here
citation:
@misc{ramesh2021zeroshottexttoimagegeneration,
  author        = {Aditya Ramesh and Mikhail Pavlov and Gabriel Goh and Scott Gray and Chelsea Voss and Alec Radford and Mark Chen and Ilya Sutskever},
  title         = {Zero-Shot Text-to-Image Generation},
  year          = {2021},
  eprint        = {2102.12092},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
  url           = {https://arxiv.org/abs/2102.12092},
}
tag: DeiT
| ICML21
| Meta
paper link: here
github link: here
citation:
@inproceedings{touvron2021trainingdataefficientimagetransformers,
  author    = {Touvron, Hugo and Cord, Matthieu and Douze, Matthijs and Massa, Francisco and Sablayrolles, Alexandre and Jegou, Herve},
  title     = {Training data-efficient image transformers \& distillation through attention},
  booktitle = {International Conference on Machine Learning},
  volume    = {139},
  pages     = {10347--10357},
  month     = jul,
  year      = {2021},
}
tag: Taming Transformer
| VQGAN
paper link: here
github link: here
citation:
@misc{esser2021tamingtransformershighresolutionimage,
  author        = {Patrick Esser and Robin Rombach and Björn Ommer},
  title         = {Taming Transformers for High-Resolution Image Synthesis},
  year          = {2021},
  eprint        = {2012.09841},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
  url           = {https://arxiv.org/abs/2012.09841},
}
tag: SDE
| Stanford University
| Google
paper link: here
github link: here
citation:
@misc{song2021scorebasedgenerativemodelingstochastic,
  author        = {Yang Song and Jascha Sohl-Dickstein and Diederik P. Kingma and Abhishek Kumar and Stefano Ermon and Ben Poole},
  title         = {Score-Based Generative Modeling through Stochastic Differential Equations},
  year          = {2021},
  eprint        = {2011.13456},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2011.13456},
}
tag: ViT
| Vision Transformer
| Google Brain
paper link: here
github link: here
citation:
@misc{dosovitskiy2021imageworth16x16words,
  author        = {Alexey Dosovitskiy and Lucas Beyer and Alexander Kolesnikov and Dirk Weissenborn and Xiaohua Zhai and Thomas Unterthiner and Mostafa Dehghani and Matthias Minderer and Georg Heigold and Sylvain Gelly and Jakob Uszkoreit and Neil Houlsby},
  title         = {An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale},
  year          = {2021},
  eprint        = {2010.11929},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
  url           = {https://arxiv.org/abs/2010.11929},
}
tag: DDIM
| Stanford University
paper link: here
citation:
@misc{song2022denoisingdiffusionimplicitmodels,
  author        = {Jiaming Song and Chenlin Meng and Stefano Ermon},
  title         = {Denoising Diffusion Implicit Models},
  year          = {2022},
  eprint        = {2010.02502},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2010.02502},
}
tag: DDPM
| UCB
paper link: here
github link: here
citation:
@misc{ho2020denoisingdiffusionprobabilisticmodels,
  author        = {Jonathan Ho and Ajay Jain and Pieter Abbeel},
  title         = {Denoising Diffusion Probabilistic Models},
  year          = {2020},
  eprint        = {2006.11239},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2006.11239},
}
tag: NCSN
| SMLD
| Score matching
| Score Function
| Stanford University
paper link: here
citation:
@misc{song2020generativemodelingestimatinggradients,
  author        = {Yang Song and Stefano Ermon},
  title         = {Generative Modeling by Estimating Gradients of the Data Distribution},
  year          = {2020},
  eprint        = {1907.05600},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/1907.05600},
}
tag: MMStar
| Shanghai AILab
paper link: here
github link: here
homepage link: here
dataset link: here
citation:
@misc{chen2024rightwayevaluatinglarge,
  author        = {Lin Chen and Jinsong Li and Xiaoyi Dong and Pan Zhang and Yuhang Zang and Zehui Chen and Haodong Duan and Jiaqi Wang and Yu Qiao and Dahua Lin and Feng Zhao},
  title         = {Are We on the Right Way for Evaluating Large Vision-Language Models?},
  year          = {2024},
  eprint        = {2403.20330},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
  url           = {https://arxiv.org/abs/2403.20330},
}
tag: MMMU
| CMU
paper link: here
github link: here
homepage link: here
dataset link: here
citation:
@misc{yue2024mmmumassivemultidisciplinemultimodal,
  title         = {{MMMU}: A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark for Expert {AGI}},
  author        = {Xiang Yue and Yuansheng Ni and Kai Zhang and Tianyu Zheng and Ruoqi Liu and Ge Zhang and Samuel Stevens and Dongfu Jiang and Weiming Ren and Yuxuan Sun and Cong Wei and Botao Yu and Ruibin Yuan and Renliang Sun and Ming Yin and Boyuan Zheng and Zhenzhu Yang and Yibo Liu and Wenhao Huang and Huan Sun and Yu Su and Wenhu Chen},
  year          = {2024},
  eprint        = {2311.16502},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2311.16502},
}
tag: MMBench
| Shanghai AILab
paper link: here
github link: here
dataset link: here
citation:
@misc{liu2024mmbenchmultimodalmodelallaround,
  title         = {{MMBench}: Is Your Multi-modal Model an All-around Player?},
  author        = {Yuan Liu and Haodong Duan and Yuanhan Zhang and Bo Li and Songyang Zhang and Wangbo Zhao and Yike Yuan and Jiaqi Wang and Conghui He and Ziwei Liu and Kai Chen and Dahua Lin},
  year          = {2024},
  eprint        = {2307.06281},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
  url           = {https://arxiv.org/abs/2307.06281},
}
tag: GPT-4V
| Gemini
| Awesome Multi-Modal LLMs
| Tencent Youtu Lab
| Shanghai AILab
paper link: here
github link: here
citation:
@misc{fu2023challengergpt4vearlyexplorations,
  title         = {A Challenger to {GPT-4V}? Early Explorations of {Gemini} in Visual Expertise},
  author        = {Chaoyou Fu and Renrui Zhang and Zihan Wang and Yubo Huang and Zhengye Zhang and Longtian Qiu and Gaoxiang Ye and Yunhang Shen and Mengdan Zhang and Peixian Chen and Sirui Zhao and Shaohui Lin and Deqiang Jiang and Di Yin and Peng Gao and Ke Li and Hongsheng Li and Xing Sun},
  year          = {2023},
  eprint        = {2312.12436},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
  url           = {https://arxiv.org/abs/2312.12436},
}
tag: Diffusion Survey
| Peking University
paper link: here
github link: here
citation:
@misc{yang2024diffusionmodelscomprehensivesurvey,
  author        = {Ling Yang and Zhilong Zhang and Yang Song and Shenda Hong and Runsheng Xu and Yue Zhao and Wentao Zhang and Bin Cui and Ming-Hsuan Yang},
  title         = {Diffusion Models: A Comprehensive Survey of Methods and Applications},
  year          = {2024},
  eprint        = {2209.00796},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2209.00796},
}