reference.bib

% ----------------TEXT-TO-VIDEO GENERATION---------------------
@misc{yan-etal-2021-videogpt,
      title={{VideoGPT: Video Generation using VQ-VAE and Transformers}},
      author={Wilson Yan and Yunzhi Zhang and Pieter Abbeel and Aravind Srinivas},
      year={2021},
      eprint={2104.10157},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

@inproceedings{han-etal-2022-show,
  author       = {Ligong Han and
                  Jian Ren and
                  Hsin{-}Ying Lee and
                  Francesco Barbieri and
                  Kyle Olszewski and
                  Shervin Minaee and
                  Dimitris N. Metaxas and
                  Sergey Tulyakov},
  title        = {{Show Me What and Tell Me How: Video Synthesis via Multimodal Conditioning}},
  booktitle    = {CVPR},
  pages        = {3605--3615},
  year         = {2022},
}

@article{an-etal-2023-latentshift,
  author       = {Jie An and
                  Songyang Zhang and
                  Harry Yang and
                  Sonal Gupta and
                  Jia{-}Bin Huang and
                  Jiebo Luo and
                  Xi Yin},
  title        = {{Latent-Shift: Latent Diffusion with Temporal Shift for Efficient Text-to-Video
                  Generation}},
  journal      = {CoRR},
  year         = {2023},
}

# arXiv 2023
@article{blattmann-etal-2023-svd,
  author       = {Andreas Blattmann and
                  Tim Dockhorn and
                  Sumith Kulal and
                  Daniel Mendelevitch and
                  Maciej Kilian and
                  Dominik Lorenz and
                  Yam Levi and
                  Zion English and
                  Vikram Voleti and
                  Adam Letts and
                  Varun Jampani and
                  Robin Rombach},
  title        = {{Stable Video Diffusion: Scaling Latent Video Diffusion Models to Large
                  Datasets}},
  journal      = {CoRR},
  year         = {2023},
}

# arXiv 2023
@article{chen-etal-2023-control,
  author       = {Weifeng Chen and
                  Jie Wu and
                  Pan Xie and
                  Hefeng Wu and
                  Jiashi Li and
                  Xin Xia and
                  Xuefeng Xiao and
                  Liang Lin},
  title        = {{Control-A-Video: Controllable Text-to-Video Generation with Diffusion
                  Models}},
  journal      = {CoRR},
  year         = {2023},
}

# arXiv 2023
@article{esser-etal-2023-structure,
  author       = {Patrick Esser and
                  Johnathan Chiu and
                  Parmida Atighehchian and
                  Jonathan Granskog and
                  Anastasis Germanidis},
  title        = {{Structure and Content-Guided Video Synthesis with Diffusion Models}},
  journal      = {CoRR},
  year         = {2023},
}

# arXiv 2022
@article{ho-etal-2023-imagen,
  author       = {Jonathan Ho and
                  William Chan and
                  Chitwan Saharia and
                  Jay Whang and
                  Ruiqi Gao and
                  Alexey A. Gritsenko and
                  Diederik P. Kingma and
                  Ben Poole and
                  Mohammad Norouzi and
                  David J. Fleet and
                  Tim Salimans},
  title        = {{Imagen Video: High Definition Video Generation with Diffusion Models}},
  journal      = {CoRR},
  year         = {2022},
}

# arXiv 2023
@article{khachatryan-etal-2023-text2video,
  author       = {Levon Khachatryan and
                  Andranik Movsisyan and
                  Vahram Tadevosyan and
                  Roberto Henschel and
                  Zhangyang Wang and
                  Shant Navasardyan and
                  Humphrey Shi},
  title        = {{Text2Video-Zero: Text-to-Image Diffusion Models are Zero-Shot Video
                  Generators}},
  journal      = {CoRR},
  year         = {2023},
}

# arXiv 2023
@article{li-etal-2023-videogen,
  author       = {Xin Li and
                  Wenqing Chu and
                  Ye Wu and
                  Weihang Yuan and
                  Fanglong Liu and
                  Qi Zhang and
                  Fu Li and
                  Haocheng Feng and
                  Errui Ding and
                  Jingdong Wang},
  title        = {{VideoGen: {A} Reference-Guided Latent Diffusion Approach for High
                  Definition Text-to-Video Generation}},
  journal      = {CoRR},
  year         = {2023},
}

# arXiv 2023
@article{ge-etal-2023-noise,
  author       = {Songwei Ge and
                  Seungjun Nah and
                  Guilin Liu and
                  Tyler Poon and
                  Andrew Tao and
                  Bryan Catanzaro and
                  David Jacobs and
                  Jia{-}Bin Huang and
                  Ming{-}Yu Liu and
                  Yogesh Balaji},
  title        = {{Preserve Your Own Correlation: {A} Noise Prior for Video Diffusion
                  Models}},
  journal      = {CoRR},
  year         = {2023},
}

@inproceedings{wang-etal-2023-videocomposer,
  title     = {{VideoComposer: Compositional Video Synthesis with Motion Controllability}},
  author    = {Xiang Wang and Hangjie Yuan and Shiwei Zhang and Dayou Chen and Jiuniu Wang and Yingya Zhang and Yujun Shen and Deli Zhao and Jingren Zhou},
  booktitle = {NeurIPS},
  year      = {2023},
}

# arXiv 2023
@article{wang-etal-2023-lavie,
  author       = {Yaohui Wang and
                  Xinyuan Chen and
                  Xin Ma and
                  Shangchen Zhou and
                  Ziqi Huang and
                  Yi Wang and
                  Ceyuan Yang and
                  Yinan He and
                  Jiashuo Yu and
                  Peiqing Yang and
                  Yuwei Guo and
                  Tianxing Wu and
                  Chenyang Si and
                  Yuming Jiang and
                  Cunjian Chen and
                  Chen Change Loy and
                  Bo Dai and
                  Dahua Lin and
                  Yu Qiao and
                  Ziwei Liu},
  title        = {{{LAVIE:} High-Quality Video Generation with Cascaded Latent Diffusion
                  Models}},
  journal      = {CoRR},
  year         = {2023},
}

# arXiv 2023
@article{wang-etal-2023-videofactory,
  author       = {Wenjing Wang and
                  Huan Yang and
                  Zixi Tuo and
                  Huiguo He and
                  Junchen Zhu and
                  Jianlong Fu and
                  Jiaying Liu},
  title        = {{VideoFactory: Swap Attention in Spatiotemporal Diffusions for Text-to-Video
                  Generation}},
  journal      = {CoRR},
  year         = {2023},
}

# arXiv 2023
@article{zhang-etal-2023-controlvideo,
  author       = {Yabo Zhang and
                  Yuxiang Wei and
                  Dongsheng Jiang and
                  Xiaopeng Zhang and
                  Wangmeng Zuo and
                  Qi Tian},
  title        = {{ControlVideo: Training-free Controllable Text-to-Video Generation}},
  journal      = {CoRR},
  year         = {2023},
}

# arXiv 2023
@article{zhang-etal-2023-show,
  author       = {David Junhao Zhang and
                  Jay Zhangjie Wu and
                  Jia{-}Wei Liu and
                  Rui Zhao and
                  Lingmin Ran and
                  Yuchao Gu and
                  Difei Gao and
                  Mike Zheng Shou},
  title        = {{Show-1: Marrying Pixel and Latent Diffusion Models for Text-to-Video
                  Generation}},
  journal      = {CoRR},
  year         = {2023},
}

# CVPR 2023
@inproceedings{blattmann-etal-2023-align,
  author       = {Andreas Blattmann and
                  Robin Rombach and
                  Huan Ling and
                  Tim Dockhorn and
                  Seung Wook Kim and
                  Sanja Fidler and
                  Karsten Kreis},
  title        = {{Align Your Latents: High-Resolution Video Synthesis with Latent Diffusion
                  Models}},
  booktitle    = {CVPR},
  pages        = {22563--22575},
  year         = {2023},
}

# CVPR 2023
@inproceedings{yu-etal-2023-video,
  author       = {Sihyun Yu and
                  Kihyuk Sohn and
                  Subin Kim and
                  Jinwoo Shin},
  title        = {{Video Probabilistic Diffusion Models in Projected Latent Space}},
  booktitle    = {CVPR},
  pages        = {18456--18466},
  year         = {2023},
}

# NeurIPS 2022
@inproceedings{ho-etal-2023-video,
  author       = {Jonathan Ho and
                  Tim Salimans and
                  Alexey A. Gritsenko and
                  William Chan and
                  Mohammad Norouzi and
                  David J. Fleet},
  title        = {{Video Diffusion Models}},
  booktitle    = {NeurIPS},
  year         = {2022},
}

# ICLR 2023
@inproceedings{hong-etal-2023-cogvideo,
  title={{CogVideo: Large-scale Pretraining for Text-to-Video Generation via Transformers}},
  author={Wenyi Hong and Ming Ding and Wendi Zheng and Xinghan Liu and Jie Tang},
  booktitle={ICLR},
  year={2023},
  pages={1--24}
}

# ICLR 2023
@inproceedings{singer-etal-2023-make,
  author       = {Uriel Singer and
                  Adam Polyak and
                  Thomas Hayes and
                  Xi Yin and
                  Jie An and
                  Songyang Zhang and
                  Qiyuan Hu and
                  Harry Yang and
                  Oron Ashual and
                  Oran Gafni and
                  Devi Parikh and
                  Sonal Gupta and
                  Yaniv Taigman},
  title        = {{Make-A-Video: Text-to-Video Generation without Text-Video Data}},
  booktitle    = {ICLR},
  year         = {2023},
  pages        = {1--13},
}

# ICLR 2023
@inproceedings{villegas-etal-2023-phenaki,
  author       = {Ruben Villegas and
                  Mohammad Babaeizadeh and
                  Pieter{-}Jan Kindermans and
                  Hernan Moraldo and
                  Han Zhang and
                  Mohammad Taghi Saffar and
                  Santiago Castro and
                  Julius Kunze and
                  Dumitru Erhan},
  title        = {{Phenaki: Variable Length Video Generation from Open Domain Textual
                  Descriptions}},
  booktitle    = {ICLR},
  year         = {2023},
  pages        = {1--14},
}

@misc{girdhar-etal-2023-emu,
      title={{Emu Video: Factorizing Text-to-Video Generation by Explicit Image Conditioning}},
      author={Rohit Girdhar and Mannat Singh and Andrew Brown and Quentin Duval and Samaneh Azadi and Sai Saketh Rambhatla and Akbar Shah and Xi Yin and Devi Parikh and Ishan Misra},
      year={2023},
      eprint={2311.10709},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

@misc{yuan-etal-2023-instructvideo,
      title={{InstructVideo: Instructing Video Diffusion Models with Human Feedback}},
      author={Hangjie Yuan and Shiwei Zhang and Xiang Wang and Yujie Wei and Tao Feng and Yining Pan and Yingya Zhang and Ziwei Liu and Samuel Albanie and Dong Ni},
      year={2023},
      eprint={2312.12490},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

@misc{chen-etal-2023-seine,
      title={{SEINE: Short-to-Long Video Diffusion Model for Generative Transition and Prediction}},
      author={Xinyuan Chen and Yaohui Wang and Lingjun Zhang and Shaobin Zhuang and Xin Ma and Jiashuo Yu and Yali Wang and Dahua Lin and Yu Qiao and Ziwei Liu},
      year={2023},
      eprint={2310.20700},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

@misc{wang-etal-2023-videolcm,
      title={{VideoLCM: Video Latent Consistency Model}},
      author={Xiang Wang and Shiwei Zhang and Han Zhang and Yu Liu and Yingya Zhang and Changxin Gao and Nong Sang},
      year={2023},
      eprint={2312.09109},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

@misc{wang-etal-2023-modelscopet2v,
      title={{ModelScope Text-to-Video Technical Report}},
      author={Jiuniu Wang and Hangjie Yuan and Dayou Chen and Yingya Zhang and Xiang Wang and Shiwei Zhang},
      year={2023},
      eprint={2308.06571},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

@misc{xing-etal-2023-vidiff,
      title={{VIDiff: Translating Videos via Multi-Modal Instructions with Diffusion Models}},
      author={Zhen Xing and Qi Dai and Zihao Zhang and Hui Zhang and Han Hu and Zuxuan Wu and Yu-Gang Jiang},
      year={2023},
      eprint={2311.18837},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

@article{wu-etal-2023-lamp,
  author       = {Ruiqi Wu and
                  Liangyu Chen and
                  Tong Yang and
                  Chunle Guo and
                  Chongyi Li and
                  Xiangyu Zhang},
  title        = {{{LAMP:} Learn {A} Motion Pattern for Few-Shot-Based Video Generation}},
  journal      = {CoRR},
  year         = {2023},
}

@inproceedings{du-etal-2023-learning,
  title     = {{Learning Universal Policies via Text-Guided Video Generation}},
  author    = {Yilun Du and Sherry Yang and Bo Dai and Hanjun Dai and Ofir Nachum and Josh Tenenbaum and Dale Schuurmans and Pieter Abbeel},
  booktitle = {NeurIPS},
  year      = {2023},
}

@misc{bartal-etal-2024-lumiere,
      title={{Lumiere: A Space-Time Diffusion Model for Video Generation}},
      author={Omer Bar-Tal and Hila Chefer and Omer Tov and Charles Herrmann and Roni Paiss and Shiran Zada and Ariel Ephrat and Junhwa Hur and Guanghui Liu and Amit Raj and Yuanzhen Li and Michael Rubinstein and Tomer Michaeli and Oliver Wang and Deqing Sun and Tali Dekel and Inbar Mosseri},
      year={2024},
      eprint={2401.12945},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

@misc{wang-etal-2024-boximator,
      title={{Boximator: Generating Rich and Controllable Motions for Video Synthesis}},
      author={Jiawei Wang and Yuchen Zhang and Jiaxin Zou and Yan Zeng and Guoqiang Wei and Liping Yuan and Hang Li},
      year={2024},
      eprint={2402.01566},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

@misc{liu-etal-2024-world,
      title={{World Model on Million-Length Video And Language With RingAttention}},
      author={Hao Liu and Wilson Yan and Matei Zaharia and Pieter Abbeel},
      year={2024},
      eprint={2402.08268},
      archivePrefix={arXiv},
      primaryClass={cs.LG}
}

@misc{yang-etal-2024-directavideo,
      title={{Direct-a-Video: Customized Video Generation with User-Directed Camera Movement and Object Motion}},
      author={Shiyuan Yang and Liang Hou and Haibin Huang and Chongyang Ma and Pengfei Wan and Di Zhang and Xiaodong Chen and Jing Liao},
      year={2024},
      eprint={2402.03162},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

@misc{zhuang-etal-2024-vlogger,
      title={{Vlogger: Make Your Dream A Vlog}},
      author={Shaobin Zhuang and Kunchang Li and Xinyuan Chen and Yaohui Wang and Ziwei Liu and Yu Qiao and Yali Wang},
      year={2024},
      eprint={2401.09414},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{zeng-etal-2023-make,
      title={{Make Pixels Dance: High-Dynamic Video Generation}},
      author={Yan Zeng and Guoqiang Wei and Jiani Zheng and Jiaxin Zou and Yang Wei and Yuchen Zhang and Hang Li},
      year={2023},
      eprint={2311.10982},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{qing-etal-2023-hierarchical,
      title={{Hierarchical Spatio-temporal Decoupling for Text-to-Video Generation}},
      author={Zhiwu Qing and Shiwei Zhang and Jiayu Wang and Xiang Wang and Yujie Wei and Yingya Zhang and Changxin Gao and Nong Sang},
      year={2023},
      eprint={2312.04483},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{chen2023gentron,
      title={{GenTron: Delving Deep into Diffusion Transformers for Image and Video Generation}},
      author={Shoufa Chen and Mengmeng Xu and Jiawei Ren and Yuren Cong and Sen He and Yanping Xie and Animesh Sinha and Ping Luo and Tao Xiang and Juan-Manuel Perez-Rua},
      year={2023},
      eprint={2312.04557},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@article{xing-etal-2023-simda,
  author       = {Zhen Xing and
                  Qi Dai and
                  Han Hu and
                  Zuxuan Wu and
                  Yu{-}Gang Jiang},
  title        = {{SimDA: Simple Diffusion Adapter for Efficient Video Generation}},
  journal      = {CoRR},
  year         = {2023},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{wang-etal-2023-microcinema,
      title={{MicroCinema: A Divide-and-Conquer Approach for Text-to-Video Generation}},
      author={Yanhui Wang and Jianmin Bao and Wenming Weng and Ruoyu Feng and Dacheng Yin and Tao Yang and Jingxu Zhang and Qi Dai and Zhiyuan Zhao and Chunyu Wang and Kai Qiu and Yuhui Yuan and Chuanxin Tang and Xiaoyan Sun and Chong Luo and Baining Guo},
      year={2023},
      eprint={2311.18829},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{cai-etal-2023-generativerendering,
      title={{Generative Rendering: Controllable 4D-Guided Video Generation with 2D Diffusion Models}},
      author={Shengqu Cai and Duygu Ceylan and Matheus Gadelha and Chun-Hao Paul Huang and Tuanfeng Yang Wang and Gordon Wetzstein},
      year={2023},
      eprint={2312.01409},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated  
@misc{jain-etal-2023-peekaboo,
      title={{PEEKABOO: Interactive Video Generation via Masked-Diffusion}},
      author={Yash Jain and Anshul Nasery and Vibhav Vineet and Harkirat Behl},
      year={2023},
      eprint={2312.07509},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{liu-etal-2023-evalcrafter,
      title={{EvalCrafter: Benchmarking and Evaluating Large Video Generation Models}},
      author={Yaofang Liu and Xiaodong Cun and Xuebo Liu and Xintao Wang and Yong Zhang and Haoxin Chen and Yang Liu and Tieyong Zeng and Raymond Chan and Ying Shan},
      year={2023},
      eprint={2310.11440},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{wang-etal-2023-recipe,
      title={{A Recipe for Scaling up Text-to-Video Generation with Text-free Videos}},
      author={Xiang Wang and Shiwei Zhang and Hangjie Yuan and Zhiwu Qing and Biao Gong and Yingya Zhang and Yujun Shen and Changxin Gao and Nong Sang},
      year={2023},
      eprint={2312.15770},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{shi-etal-2023-bivdiff,
      title={{BIVDiff: A Training-Free Framework for General-Purpose Video Synthesis via Bridging Image and Video Diffusion Models}},
      author={Fengyuan Shi and Jiaxi Gu and Hang Xu and Songcen Xu and Wei Zhang and Limin Wang},
      year={2023},
      eprint={2312.02813},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{menapace-etal-2024-mindthetime,
      title={{Snap Video: Scaled Spatiotemporal Transformers for Text-to-Video Synthesis}},
      author={Willi Menapace and Aliaksandr Siarohin and Ivan Skorokhodov and Ekaterina Deyneka and Tsai-Shien Chen and Anil Kag and Yuwei Fang and Aleksei Stoliar and Elisa Ricci and Jian Ren and Sergey Tulyakov},
      year={2024},
      eprint={2402.14797},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{hu-etal-2023-animateanyone,
      title={{Animate Anyone: Consistent and Controllable Image-to-Video Synthesis for Character Animation}},
      author={Li Hu and Xin Gao and Peng Zhang and Ke Sun and Bang Zhang and Liefeng Bo},
      year={2023},
      eprint={2311.17117},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

@inproceedings{xiang-etal-2024-versvideo,
  title={{VersVideo: Leveraging Enhanced Temporal Diffusion Models for Versatile Video Generation}},
  author={Jinxi Xiang and Ricong Huang and Jun Zhang and Guanbin Li and Xiao Han and Yang Wei},
  booktitle={ICLR},
  year={2024},
  pages={1--19}
}

@inproceedings{ma-etal-2024-follow-your-pose,
  author       = {Yue Ma and
                  Yingqing He and
                  Xiaodong Cun and
                  Xintao Wang and
                  Siran Chen and
                  Xiu Li and
                  Qifeng Chen},
  title        = {{Follow Your Pose: Pose-Guided Text-to-Video Generation Using Pose-Free
                  Videos}},
  booktitle    = {AAAI},
  pages        = {4117--4125},
  year         = {2024},
}

@inproceedings{qu-etal-2024-e2hqv,
  author       = {Qiang Qu and
                  Yiran Shen and
                  Xiaoming Chen and
                  Yuk Ying Chung and
                  Tongliang Liu},
  title        = {{{E2HQV:} High-Quality Video Generation from Event Camera via Theory-Inspired
                  Model-Aided Deep Learning}},
  booktitle    = {AAAI},
  pages        = {4632--4640},
  year         = {2024},
}

% NOTE: This reference will be replaced with AAAI version as soon as the metadata is updated
@misc{peng-etal-2023-conditionvideo,
      title={{ConditionVideo: Training-Free Condition-Guided Text-to-Video Generation}},
      author={Bo Peng and Xinyuan Chen and Yaohui Wang and Chaochao Lu and Yu Qiao},
      year={2023},
      eprint={2310.07697},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

% NOTE: This reference will be replaced with AAAI version as soon as the metadata is updated
@misc{su-etal-2023-f3pruning,
      title={{F3-Pruning: A Training-Free and Generalized Pruning Strategy towards Faster and Finer Text-to-Video Synthesis}},
      author={Sitong Su and Jianzhi Liu and Lianli Gao and Jingkuan Song},
      year={2023},
      eprint={2312.03459},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

@misc{wang-etal-2024-worlddreamer,
      title={{WorldDreamer: Towards General World Models for Video Generation via Predicting Masked Tokens}},
      author={Xiaofeng Wang and Zheng Zhu and Guan Huang and Boyuan Wang and Xinze Chen and Jiwen Lu},
      year={2024},
      eprint={2401.09985},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

@misc{wang-etal-2024-magicvideov2,
      title={{MagicVideo-V2: Multi-Stage High-Aesthetic Video Generation}},
      author={Weimin Wang and Jiawei Liu and Zhijie Lin and Jiangqiao Yan and Shuo Chen and Chetwin Low and Tuyen Hoang and Jie Wu and Jun Hao Liew and Hanshu Yan and Daquan Zhou and Jiashi Feng},
      year={2024},
      eprint={2401.04468},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

@misc{ma-etal-2024-latte,
      title={{Latte: Latent Diffusion Transformer for Video Generation}},
      author={Xin Ma and Yaohui Wang and Gengyun Jia and Xinyuan Chen and Ziwei Liu and Yuan-Fang Li and Cunjian Chen and Yu Qiao},
      year={2024},
      eprint={2401.03048},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

@misc{yuan-etal-2024-mora,
      title={{Mora: Enabling Generalist Video Generation via A Multi-Agent Framework}},
      author={Zhengqing Yuan and Ruoxi Chen and Zhaoxu Li and Haolong Jia and Lifang He and Chi Wang and Lichao Sun},
      year={2024},
      eprint={2403.13248},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

@misc{henschel-etal-2024-streamingt2v,
      title={{StreamingT2V: Consistent, Dynamic, and Extendable Long Video Generation from Text}},
      author={Roberto Henschel and Levon Khachatryan and Daniil Hayrapetyan and Hayk Poghosyan and Vahram Tadevosyan and Zhangyang Wang and Shant Navasardyan and Humphrey Shi},
      year={2024},
      eprint={2403.14773},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{zhao-etal-2023-motiondirector,
      title={{MotionDirector: Motion Customization of Text-to-Video Diffusion Models}},
      author={Rui Zhao and Yuchao Gu and Jay Zhangjie Wu and David Junhao Zhang and Jiawei Liu and Weijia Wu and Jussi Keppo and Mike Zheng Shou},
      year={2023},
      eprint={2310.08465},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

@misc{zhou-etal-2024-storydiffusion,
      title={{StoryDiffusion: Consistent Self-Attention for Long-Range Image and Video Generation}},
      author={Yupeng Zhou and Daquan Zhou and Ming-Ming Cheng and Jiashi Feng and Qibin Hou},
      year={2024},
      eprint={2405.01434},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

@misc{xu-etal-2024-easyanimate,
  author        = {Jiaqi Xu and
                   Xinyi Zou and
                   Kunzhe Huang and
                   Yunkuo Chen and
                   Bo Liu and
                   MengLi Cheng and
                   Xing Shi and
                   Jun Huang},
  title         = {{EasyAnimate: A High-Performance Long Video Generation Method based on Transformer Architecture}},
  year          = {2024},
  eprint        = {2405.18991},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@misc{lin-etal-2024-ctrladapter,
  author        = {Han Lin and
                   Jaemin Cho and
                   Abhay Zala and
                   Mohit Bansal},
  title         = {{Ctrl-Adapter: An Efficient and Versatile Framework for Adapting Diverse Controls to Any Diffusion Model}},
  year          = {2024},
  eprint        = {2404.09967},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@article{lee-etal-2024-grid,
  author        = {Lee, Taegyeong and Kwon, Soyeong and Kim, Taehwan},
  title         = {{Grid Diffusion Models for Text-to-Video Generation}},
  journal       = {CoRR},
  volume        = {abs/2404.00234},
  year          = {2024},
  eprint        = {2404.00234},
  archivePrefix = {arXiv},
}

% TODO: Missing reference for the paper `Hierarchical Patch-wise Diffusion Models for High-Resolution Video Generation'

% TODO: Missing reference for the paper `DiffPerformer: Iterative Learning of Consistent Latent Guidance for Diffusion-based Human Video Generation'

@misc{peng-etal-2024-controlnext,
  author        = {Bohao Peng and
                   Jian Wang and
                   Yuechen Zhang and
                   Wenbo Li and
                   Ming-Chang Yang and
                   Jiaya Jia},
  title         = {{ControlNeXt: Powerful and Efficient Control for Image and Video Generation}},
  year          = {2024},
  eprint        = {2408.06070},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@misc{feng-etal-2024-fancyvideo,
  author        = {Jiasong Feng and
                   Ao Ma and
                   Jing Wang and
                   Bo Cheng and
                   Xiaodan Liang and
                   Dawei Leng and
                   Yuhui Yin},
  title         = {{FancyVideo: Towards Dynamic and Consistent Video Generation via Cross-frame Textual Guidance}},
  year          = {2024},
  eprint        = {2408.08189},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@misc{yang-etal-2024-factorized-dreamer,
  author        = {Tao Yang and
                   Yangming Shi and
                   Yunwen Huang and
                   Feng Chen and
                   Yin Zheng and
                   Lei Zhang},
  title         = {{Factorized-Dreamer: Training A High-Quality Video Generator with Limited and Low-Quality Data}},
  year          = {2024},
  eprint        = {2408.10119},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@misc{chen-etal-2024-fine-grained,
      title={{Fine-gained Zero-shot Video Sampling}},
      author={Dengsheng Chen and Jie Hu and Xiaoming Wei and Enhua Wu},
      year={2024},
      eprint={2407.21475},
      archivePrefix={arXiv},
      primaryClass={cs.CV},
}

@misc{li-etal-2024-training-free,
  author        = {Wenhao Li and
                   Yichao Cao and
                   Xiu Su and
                   Xi Lin and
                   Shan You and
                   Mingkai Zheng and
                   Yi Chen and
                   Chang Xu},
  title         = {{Training-free Long Video Generation with Chain of Diffusion Model Experts}},
  year          = {2024},
  eprint        = {2408.13423},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@misc{liu-etal-2024-reconx,
  author        = {Fangfu Liu and
                   Wenqiang Sun and
                   Hanyang Wang and
                   Yikai Wang and
                   Haowen Sun and
                   Junliang Ye and
                   Jun Zhang and
                   Yueqi Duan},
  title         = {{ReconX: Reconstruct Any Scene from Sparse Views with Video Diffusion Model}},
  year          = {2024},
  eprint        = {2408.16767},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: Same work as `li-etal-2024-training-free` in this file (identical title,
% authors, and eprint 2408.13423). Citing both keys in one document will list the
% paper twice; prefer a single key in new citations.
@misc{li-etal-2024-confiner,
      title={{Training-free Long Video Generation with Chain of Diffusion Model Experts}}, 
      author={Wenhao Li and Yichao Cao and Xiu Su and Xi Lin and Shan You and Mingkai Zheng and Yi Chen and Chang Xu},
      year={2024},
      eprint={2408.13423},
      archivePrefix={arXiv},
      primaryClass={cs.CV},
}

% NOTE: This reference will be replaced with ECCV version as soon as the metadata is updated
@article{gupta-etal-2023-walt,
  title   = {{Photorealistic Video Generation with Diffusion Models}},
  author  = {Agrim Gupta and Lijun Yu and Kihyuk Sohn and Xiuye Gu and Meera Hahn and Li Fei{-}Fei and Irfan Essa and Lu Jiang and Jos{\'{e}} Lezama},
  journal = {CoRR},
  volume  = {abs/2312.06662},
  year    = {2023},
}

% TODO: Missing reference for the paper `MoVideo: Motion-Aware Video Generation with Diffusion Models'

@inproceedings{li-etal-2024-drivingdiffusion,
  title     = {{DrivingDiffusion: Layout-Guided Multi-view Driving Scenarios Video Generation with Latent Diffusion Model}},
  author    = {Xiaofan Li and Yifu Zhang and Xiaoqing Ye},
  booktitle = {ECCV},
  volume    = {15136},
  pages     = {469--485},
  year      = {2024},
}

@inproceedings{zhao-etal-2024-magdiff,
  title     = {{MagDiff: Multi-alignment Diffusion for High-Fidelity Video Generation and Editing}},
  author    = {Haoyu Zhao and Tianyi Lu and Jiaxi Gu and Xing Zhang and Qingping Zheng and Zuxuan Wu and Hang Xu and Yu{-}Gang Jiang},
  booktitle = {ECCV},
  volume    = {15076},
  pages     = {205--221},
  year      = {2024},
}

% TODO: Missing reference for the paper `HARIVO: Harnessing Text-to-Image Models for Video Generation'

@inproceedings{oh-etal-2024-mevg,
  title     = {{{MEVG:} Multi-event Video Generation with Text-to-Video Models}},
  author    = {Gyeongrok Oh and Jaehwan Jeong and Sieun Kim and Wonmin Byeon and Jinkyu Kim and Sungwoong Kim and Sangpil Kim},
  booktitle = {ECCV},
  volume    = {15101},
  pages     = {401--418},
  year      = {2024},
}

@misc{hyung-etal-2024-stg,
  author        = {Junha Hyung and
                   Kinam Kim and
                   Susung Hong and
                   Min-Jung Kim and
                   Jaegul Choo},
  title         = {{Spatiotemporal Skip Guidance for Enhanced Video Diffusion Sampling}},
  year          = {2024},
  eprint        = {2411.18664},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% ----------------IMAGE-TO-VIDEO GENERATION---------------------

@inproceedings{ardino-etal-2021-click-to-move,
  author       = {Pierfrancesco Ardino and
                  De Nadai, Marco and
                  Bruno Lepri and
                  Elisa Ricci and
                  St{\'{e}}phane Lathuili{\`{e}}re},
  title        = {{Click to Move: Controlling Video Generation with Sparse Motion}},
  booktitle    = {ICCV},
  pages        = {14729--14738},
  year         = {2021},
}

@inproceedings{hu-etal-2022-make,
  author       = {Yaosi Hu and
                  Chong Luo and
                  Zhenzhong Chen},
  title        = {{Make It Move: Controllable Image-to-Video Generation with Text Descriptions}},
  booktitle    = {CVPR},
  pages        = {18198--18207},
  year         = {2022},
}

# arXiv 2023
@article{zhang-etal-2023-i2vgenxl,
  author  = {Zhang, Shiwei and Wang, Jiayu and Zhang, Yingya and Zhao, Kang and
             Yuan, Hangjie and Qin, Zhiwu and Wang, Xiang and Zhao, Deli and
             Zhou, Jingren},
  title   = {{I2VGen-XL: High-Quality Image-to-Video Synthesis via Cascaded Diffusion Models}},
  journal = {CoRR},
  volume  = {abs/2311.04145},
  year    = {2023},
}

@article{guo-etal-2023-animatediff,
  author  = {Guo, Yuwei and Yang, Ceyuan and Rao, Anyi and Wang, Yaohui and
             Qiao, Yu and Lin, Dahua and Dai, Bo},
  title   = {{AnimateDiff: Animate Your Personalized Text-to-Image Diffusion Models without Specific Tuning}},
  journal = {CoRR},
  volume  = {abs/2307.04725},
  year    = {2023},
}

@misc{guo-etal-2024-i2vadapter,
  author        = {Guo, Xun and Zheng, Mingwu and Hou, Liang and Gao, Yuan and
                   Deng, Yufan and Wan, Pengfei and Zhang, Di and Liu, Yufan and
                   Hu, Weiming and Zha, Zhengjun and Huang, Haibin and Ma, Chongyang},
  title         = {{I2V-Adapter: A General Image-to-Video Adapter for Diffusion Models}},
  year          = {2024},
  eprint        = {2312.16693},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with ICLR version as soon as the metadata is updated
@misc{lu-etal-2023-vdt,
  author        = {Lu, Haoyu and Yang, Guoxing and Fei, Nanyi and Huo, Yuqi and
                   Lu, Zhiwu and Luo, Ping and Ding, Mingyu},
  title         = {{VDT: General-purpose Video Diffusion Transformers via Mask Modeling}},
  year          = {2023},
  eprint        = {2305.13311},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{jiang-etal-2023-videobooth,
  author        = {Jiang, Yuming and Wu, Tianxing and Yang, Shuai and Si, Chenyang and
                   Lin, Dahua and Qiao, Yu and Loy, Chen Change and Liu, Ziwei},
  title         = {{VideoBooth: Diffusion-based Video Generation with Image Prompts}},
  year          = {2023},
  eprint        = {2312.00777},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@inproceedings{shen-etal-2024-decouple,
  author    = {Shen, Cuifeng and Gan, Yulu and Chen, Chen and Zhu, Xiongwei and
               Cheng, Lele and Gao, Tingting and Wang, Jinzhi},
  title     = {{Decouple Content and Motion for Conditional Image-to-Video Generation}},
  booktitle = {AAAI},
  pages     = {4757--4765},
  year      = {2024},
}

@misc{ma-etal-2024-follow-your-click,
  author        = {Ma, Yue and He, Yingqing and Wang, Hongfa and Wang, Andong and
                   Qi, Chenyang and Cai, Chengfei and Li, Xiu and Li, Zhifeng and
                   Shum, Heung-Yeung and Liu, Wei and Chen, Qifeng},
  title         = {{Follow-Your-Click: Open-domain Regional Image Animation via Short Prompts}},
  year          = {2024},
  eprint        = {2403.08268},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@misc{gong-etal-2024-atomovideo,
  author        = {Gong, Litong and Zhu, Yiran and Li, Weijie and Kang, Xiaoyang and
                   Wang, Biao and Ge, Tiezheng and Zheng, Bo},
  title         = {{AtomoVideo: High Fidelity Image-to-Video Generation}},
  year          = {2024},
  eprint        = {2403.01800},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with ECCV version as soon as the metadata is updated
@misc{qian-etal-2024-rethinking,
  author        = {Qian, Rui and Ding, Shuangrui and Lin, Dahua},
  title         = {{Rethinking Image-to-Video Adaptation: An Object-centric Perspective}},
  year          = {2024},
  eprint        = {2407.06871},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@misc{yang-etal-2024-megactor,
  author        = {Yang, Shurong and Li, Huadong and Wu, Juhao and Jing, Minhao and
                   Li, Linze and Ji, Renhe and Liang, Jiajun and Fan, Haoqiang and
                   Wang, Jin},
  title         = {{MegActor-$\Sigma$: Unlocking Flexible Mixed-Modal Control in Portrait Animation with Diffusion Transformer}},
  year          = {2024},
  eprint        = {2408.14975},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@inproceedings{liu-etal-2024-physgen,
  author    = {Liu, Shaowei and Ren, Zhongzheng and Gupta, Saurabh and Wang, Shenlong},
  title     = {{PhysGen: Rigid-Body Physics-Grounded Image-to-Video Generation}},
  booktitle = {ECCV},
  pages     = {360--378},
  year      = {2024},
}

% ----------------AUDIO-TO-VIDEO GENERATION---------------------

@inproceedings{ruan-etal-2023-mmdiffusion,
  author    = {Ruan, Ludan and Ma, Yiyang and Yang, Huan and He, Huiguo and
               Liu, Bei and Fu, Jianlong and Yuan, Nicholas Jing and Jin, Qin and
               Guo, Baining},
  title     = {{MM-Diffusion: Learning Multi-Modal Diffusion Models for Joint Audio
               and Video Generation}},
  booktitle = {CVPR},
  pages     = {10219--10228},
  year      = {2023},
}

@inproceedings{ni-etal-2023-conditional,
  author    = {Ni, Haomiao and Shi, Changhao and Li, Kai and Huang, Sharon X. and
               Min, Martin Renqiang},
  title     = {{Conditional Image-to-Video Generation with Latent Flow Diffusion Models}},
  booktitle = {CVPR},
  pages     = {18444--18455},
  year      = {2023},
}

@inproceedings{yariv-etal-2024-diverse,
  author    = {Yariv, Guy and Gat, Itai and Benaim, Sagie and Wolf, Lior and
               Schwartz, Idan and Adi, Yossi},
  title     = {{Diverse and Aligned Audio-to-Video Generation via Text-to-Video Model
               Adaptation}},
  booktitle = {AAAI},
  pages     = {6639--6647},
  year      = {2024},
}

@misc{he-etal-2024-idanimator,
  author        = {He, Xuanhua and Liu, Quande and Qian, Shengju and Wang, Xin and
                   Hu, Tao and Cao, Ke and Yan, Keyu and Zhou, Man and Zhang, Jie},
  title         = {{ID-Animator: Zero-Shot Identity-Preserving Human Video Generation}},
  year          = {2024},
  eprint        = {2404.15275},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@misc{li-etal-2024-tuningfree,
  author        = {Li, Weijie and Gong, Litong and Zhu, Yiran and Fan, Fanda and
                   Wang, Biao and Ge, Tiezheng and Zheng, Bo},
  title         = {{Tuning-Free Noise Rectification for High Fidelity Image-to-Video Generation}},
  year          = {2024},
  eprint        = {2403.02827},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% ----------------PERSONALIZED VIDEO GENERATION---------------------
@misc{xiao-etal-2023-fastcomposer,
  author        = {Xiao, Guangxuan and Yin, Tianwei and Freeman, William T. and
                   Durand, Fr{\'{e}}do and Han, Song},
  title         = {{FastComposer: Tuning-Free Multi-Subject Image Generation with Localized Attention}},
  year          = {2023},
  eprint        = {2305.10431},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@article{xing-etal-2023-makeyourvideo,
  author  = {Xing, Jinbo and Xia, Menghan and Liu, Yuxin and Zhang, Yuechen and
             Zhang, Yong and He, Yingqing and Liu, Hanyuan and Chen, Haoxin and
             Cun, Xiaodong and Wang, Xintao and Shan, Ying and Wong, Tien{-}Tsin},
  title   = {{Make-Your-Video: Customized Video Generation Using Textual and Structural
             Guidance}},
  journal = {CoRR},
  year    = {2023},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{wang-etal-2024-highfidelity,
  author        = {Wang, Yibin and Zhang, Weizhong and Zheng, Jianwei and Jin, Cheng},
  title         = {{High-fidelity Person-centric Subject-to-Image Synthesis}},
  year          = {2024},
  eprint        = {2311.10329},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@misc{ma-etal-2024-magic-me,
  author        = {Ma, Ze and Zhou, Daquan and Yeh, Chun-Hsiao and Wang, Xue-She and
                   Li, Xiuyu and Yang, Huanrui and Dong, Zhen and Keutzer, Kurt and
                   Feng, Jiashi},
  title         = {{Magic-Me: Identity-Specific Video Customized Diffusion}},
  year          = {2024},
  eprint        = {2402.09368},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@misc{mou-etal-2024-revideo,
  author        = {Mou, Chong and Cao, Mingdeng and Wang, Xintao and Zhang, Zhaoyang and
                   Shan, Ying and Zhang, Jian},
  title         = {{ReVideo: Remake a Video with Motion and Content Control}},
  year          = {2024},
  eprint        = {2405.13865},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@inproceedings{zhong-etal-2024-posecrafter,
  author    = {Zhong, Yong and Zhao, Min and You, Zebin and Yu, Xiaofeng and
               Zhang, Changwang and Li, Chongxuan},
  title     = {{PoseCrafter: One-Shot Personalized Video Synthesis Following Flexible Pose Control}},
  booktitle = {ECCV},
  volume    = {15102},
  pages     = {243--260},
  year      = {2024},
}

% ----------------VIDEO EDITING---------------------
@inproceedings{bar-etal-2022-text2live,
  author    = {Bar-Tal, Omer and
               Ofri-Amar, Dolev and
               Fridman, Rafail and
               Kasten, Yoni and
               Dekel, Tali},
  title     = {{Text2LIVE: Text-Driven Layered Image and Video Editing}},
  booktitle = {ECCV},
  pages     = {707--723},
  year      = {2022},
}

@article{huang-etal-2023-styleavideo,
  author  = {Huang, Nisha and Zhang, Yuxin and Dong, Weiming},
  title   = {{Style-A-Video: Agile Diffusion for Arbitrary Text-based Video Style Transfer}},
  journal = {CoRR},
  volume  = {abs/2305.05464},
  year    = {2023},
}

@inproceedings{zhang-etal-2023-towards,
  author    = {Zhang, Zicheng and Li, Bonan and Nie, Xuecheng and Han, Congying and
               Guo, Tiande and Liu, Luoqi},
  title     = {{Towards Consistent Video Editing with Text-to-Image Diffusion Models}},
  booktitle = {NeurIPS},
  year      = {2023},
}

@article{karim-etal-2023-save,
  author  = {Karim, Nazmul and Khalid, Umar and Joneidi, Mohsen and Chen, Chen and
             Rahnavard, Nazanin},
  title   = {{{SAVE:} Spectral-Shift-Aware Adaptation of Image Diffusion Models
             for Text-guided Video Editing}},
  journal = {CoRR},
  year    = {2023},
}

@inproceedings{ceylan-etal-2023-pix2video,
  author    = {Ceylan, Duygu and Huang, Chun{-}Hao Paul and Mitra, Niloy J.},
  title     = {{Pix2Video: Video Editing using Image Diffusion}},
  booktitle = {ICCV},
  pages     = {23149--23160},
  year      = {2023},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{wu-etal-2023-fairy,
  author        = {Wu, Bichen and Chuang, Ching-Yao and Wang, Xiaoyan and Jia, Yichen and
                   Krishnakumar, Kapil and Xiao, Tong and Liang, Feng and Yu, Licheng and
                   Vajda, Peter},
  title         = {{Fairy: Fast Parallelized Instruction-Guided Video-to-Video Synthesis}},
  year          = {2023},
  eprint        = {2312.13834},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{feng-etal-2023-ccedit,
  author        = {Feng, Ruoyu and Weng, Wenming and Wang, Yanhui and Yuan, Yuhui and
                   Bao, Jianmin and Luo, Chong and Chen, Zhibo and Guo, Baining},
  title         = {{CCEdit: Creative and Controllable Video Editing via Diffusion Models}},
  year          = {2023},
  eprint        = {2309.16496},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{liu-etal-2023-dynvideoe,
  author        = {Liu, Jia-Wei and Cao, Yan-Pei and Wu, Jay Zhangjie and Mao, Weijia and
                   Gu, Yuchao and Zhao, Rui and Keppo, Jussi and Shan, Ying and
                   Shou, Mike Zheng},
  title         = {{DynVideo-E: Harnessing Dynamic NeRF for Large-Scale Motion- and View-Change Human-Centric Video Editing}},
  year          = {2023},
  eprint        = {2310.10624},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{liu-etal-2023-videop2p,
  author        = {Liu, Shaoteng and Zhang, Yuechen and Li, Wenbo and Lin, Zhe and Jia, Jiaya},
  title         = {{Video-P2P: Video Editing with Cross-attention Control}},
  year          = {2023},
  eprint        = {2303.04761},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{li-etal-2023-video,
  author        = {Li, Maomao and Li, Yu and Yang, Tianyu and Liu, Yunfei and
                   Yue, Dongxu and Lin, Zhihui and Xu, Dong},
  title         = {{A Video is Worth 256 Bases: Spatial-Temporal Expectation-Maximization Inversion for Zero-Shot Video Editing}},
  year          = {2023},
  eprint        = {2312.05856},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{ma-etal-2023-maskint,
  author        = {Ma, Haoyu and Mahdizadehaghdam, Shahin and Wu, Bichen and Fan, Zhipeng and
                   Gu, Yuchao and Zhao, Wenliang and Shapira, Lior and Xie, Xiaohui},
  title         = {{MaskINT: Video Editing via Interpolative Non-autoregressive Masked Transformers}},
  year          = {2023},
  eprint        = {2312.12468},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{li-etal-2023-vidtome,
  author        = {Li, Xirui and Ma, Chao and Yang, Xiaokang and Yang, Ming-Hsuan},
  title         = {{VidToMe: Video Token Merging for Zero-Shot Video Editing}},
  year          = {2023},
  eprint        = {2312.10656},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@inproceedings{lee-etal-2023-shape,
  author    = {Lee, Yao{-}Chih and Jang, Ji{-}Ze Genevieve and Chen, Yi{-}Ting and
               Qiu, Elizabeth and Huang, Jia{-}Bin},
  title     = {{Shape-Aware Text-Driven Layered Video Editing}},
  booktitle = {CVPR},
  pages     = {14317--14326},
  year      = {2023},
}

@inproceedings{jay-etal-2023-tuneavideo,
  author    = {Wu, Jay Zhangjie and Ge, Yixiao and Wang, Xintao and Lei, Stan Weixian and
               Gu, Yuchao and Shi, Yufei and Hsu, Wynne and Shan, Ying and
               Qie, Xiaohu and Shou, Mike Zheng},
  title     = {{Tune-A-Video: One-Shot Tuning of Image Diffusion Models for Text-to-Video Generation}},
  booktitle = {ICCV},
  pages     = {7589--7599},
  year      = {2023},
}

@inproceedings{yang-etal-2023-rerender-a-video,
  author    = {Yang, Shuai and
               Zhou, Yifan and
               Liu, Ziwei and
               Loy, Chen Change},
  title     = {{Rerender A Video: Zero-Shot Text-Guided Video-to-Video Translation}},
  booktitle = {SIGGRAPH Asia},
  year      = {2023},
  isbn      = {9798400703157},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{wu-etal-2024-languagedriven,
  author        = {Wu, Jianzong and Li, Xiangtai and Si, Chenyang and Zhou, Shangchen and
                   Yang, Jingkang and Zhang, Jiangning and Li, Yining and Chen, Kai and
                   Tong, Yunhai and Liu, Ziwei and Loy, Chen Change},
  title         = {{Towards Language-Driven Video Inpainting via Multimodal Large Language Models}},
  year          = {2024},
  eprint        = {2401.10226},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@inproceedings{jeong-etal-2024-groundavideo,
  author    = {Jeong, Hyeonho and Ye, Jong Chul},
  title     = {{Ground-A-Video: Zero-shot Grounded Video Editing using Text-to-image Diffusion Models}},
  booktitle = {ICLR},
  pages     = {1--28},
  year      = {2024},
}

@inproceedings{geyer-etal-2024-tokenflow,
  author    = {Geyer, Michal and Bar-Tal, Omer and Bagon, Shai and Dekel, Tali},
  title     = {{TokenFlow: Consistent Diffusion Features for Consistent Video Editing}},
  booktitle = {ICLR},
  pages     = {1--13},
  year      = {2024},
}

@misc{wu-etal-2024-draganything,
  author        = {Wu, Weijia and Li, Zhuang and Gu, Yuchao and Zhao, Rui and
                   He, Yefei and Zhang, David Junhao and Shou, Mike Zheng and Li, Yan and
                   Gao, Tingting and Zhang, Di},
  title         = {{DragAnything: Motion Control for Anything using Entity Representation}},
  year          = {2024},
  eprint        = {2403.07420},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{zhang-etal-2024-avid,
  author        = {Zhang, Zhixing and Wu, Bichen and Wang, Xiaoyan and Luo, Yaqiao and
                   Zhang, Luxin and Zhao, Yinan and Vajda, Peter and Metaxas, Dimitris and
                   Yu, Licheng},
  title         = {{AVID: Any-Length Video Inpainting with Diffusion Model}},
  year          = {2024},
  eprint        = {2312.03816},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{ku-etal-2024-anyv2v,
  author        = {Ku, Max and Wei, Cong and Ren, Weiming and Yang, Harry and Chen, Wenhu},
  title         = {{AnyV2V: A Plug-and-Play Framework For Any Video-to-Video Editing Tasks}},
  year          = {2024},
  eprint        = {2403.14468},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@inproceedings{cheng-etal-2024-consistent,
  author    = {Cheng, Jiaxin and Xiao, Tianjun and He, Tong},
  title     = {{Consistent Video-to-Video Transfer Using Synthetic Dataset}},
  booktitle = {ICLR},
  pages     = {1--13},
  year      = {2024},
}

% TODO: Missing reference for the paper `CAMEL: CAusal Motion Enhancement tailored for Lifting Text-driven Video Editing'

@misc{deng-etal-2024-dragvideo,
  author        = {Deng, Yufan and Wang, Ruida and Zhang, Yuhao and Tai, Yu-Wing and
                   Tang, Chi-Keung},
  title         = {{DragVideo: Interactive Drag-style Video Editing}},
  year          = {2024},
  eprint        = {2312.02216},
  archivePrefix = {arXiv},
  primaryClass  = {cs.GR},
}

@article{zi-etal-2024-cococo,
  author  = {Zi, Bojia and Zhao, Shihao and Qi, Xianbiao and Wang, Jianan and
             Shi, Yukai and Chen, Qianyu and Liang, Bin and Wong, Kam-Fai and
             Zhang, Lei},
  title   = {{CoCoCo: Improving Text-Guided Video Inpainting for Better Consistency, Controllability and Compatibility}},
  journal = {CoRR},
  volume  = {abs/2403.12305},
  year    = {2024},
}

% NOTE: This reference will be replaced with ECCV version as soon as the metadata is updated
@misc{singer-etal-2024-video,
  author        = {Singer, Uriel and Zohar, Amit and Kirstain, Yuval and Sheynin, Shelly and
                   Polyak, Adam and Parikh, Devi and Taigman, Yaniv},
  title         = {{Video Editing via Factorized Diffusion Distillation}},
  year          = {2024},
  eprint        = {2403.09334},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@article{danah-etal-2024-dmt,
  author  = {Yatim, Danah and Fridman, Rafail and Bar{-}Tal, Omer and Kasten, Yoni and
             Dekel, Tali},
  title   = {{Space-Time Diffusion Features for Zero-Shot Text-Driven Motion Transfer}},
  journal = {CoRR},
  volume  = {abs/2311.17009},
  year    = {2023},
}

@misc{peruzzo-etal-2024-vase,
  author        = {Peruzzo, Elia and Goel, Vidit and Xu, Dejia and Xu, Xingqian and
                   Jiang, Yifan and Wang, Zhangyang and Shi, Humphrey and Sebe, Nicu},
  title         = {{VASE: Object-Centric Appearance and Shape Manipulation of Real Videos}},
  year          = {2024},
  eprint        = {2401.02473},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@inproceedings{cong-etal-2024-flatten,
  author    = {Cong, Yuren and Xu, Mengmeng and Simon, Christian and Chen, Shoufa and
               Ren, Jiawei and Xie, Yanping and
               P{\'{e}}rez{-}R{\'{u}}a, Juan{-}Manuel and
               Rosenhahn, Bodo and Xiang, Tao and He, Sen},
  title     = {{{FLATTEN:} Optical FLow-guided ATTENtion for Consistent Text-to-Video Editing}},
  booktitle = {ICLR},
  year      = {2024},
}

@misc{wei-etal-2024-dreamvideo2,
  author        = {Wei, Yujie and Zhang, Shiwei and Yuan, Hangjie and Wang, Xiang and
                   Qiu, Haonan and Zhao, Rui and Feng, Yutong and Liu, Feng and
                   Huang, Zhizhong and Ye, Jiaxin and Zhang, Yingya and Shan, Hongming},
  title         = {{DreamVideo-2: Zero-Shot Subject-Driven Video Customization with Precise Motion Control}},
  year          = {2024},
  eprint        = {2410.13830},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@inproceedings{kahatapitiya-etal-2024-ocd,
  author    = {Kahatapitiya, Kumara and Karjauv, Adil and Abati, Davide and
               Porikli, Fatih and Asano, Yuki M. and Habibian, Amirhossein},
  title     = {{Object-Centric Diffusion for Efficient Video Editing}},
  booktitle = {ECCV},
  volume    = {15115},
  pages     = {91--108},
  year      = {2024},
}

@inproceedings{zhong-etal-2024-deco,
  author    = {Zhong, Xiaojing and Huang, Xinyi and Yang, Xiaofeng and Lin, Guosheng and
               Wu, Qingyao},
  title     = {{DeCo: Decoupled Human-Centered Diffusion Video Editing with Motion Consistency}},
  booktitle = {ECCV},
  volume    = {15102},
  pages     = {352--370},
  year      = {2024},
}

% NOTE: This reference will be replaced with ECCV version as soon as the metadata is updated
@article{song-etal-2023-save,
  author  = {Song, Yeji and Shin, Wonsik and Lee, Junsoo and Kim, Jeesoo and Kwak, Nojun},
  title   = {{{SAVE:} Protagonist Diversification with Structure Agnostic Video Editing}},
  journal = {CoRR},
  volume  = {abs/2312.02503},
  year    = {2023},
}

% TODO: Missing reference for the paper `WAVE: Warping DDIM Inversion Features for Zero-shot Text-to-Video Editing'

% NOTE: This reference will be replaced with ECCV version as soon as the metadata is updated
@article{fan-etal-2024-videoshop,
  author  = {Fan, Xiang and Bhattad, Anand and Krishna, Ranjay},
  title   = {{Videoshop: Localized Semantic Video Editing with Noise-Extrapolated Diffusion Inversion}},
  journal = {CoRR},
  volume  = {abs/2403.14617},
  year    = {2024},
}

@misc{liu-etal-2024-stablev2v,
  author        = {Liu, Chang and Li, Rui and Zhang, Kaidong and Lan, Yunwei and Liu, Dong},
  title         = {{StableV2V: Stablizing Shape Consistency in Video-to-Video Editing}},
  year          = {2024},
  eprint        = {2411.11045},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@inproceedings{yang-etal-2024-fresco,
  author    = {Yang, Shuai and Zhou, Yifan and Liu, Ziwei and Loy, Chen Change},
  title     = {{Fresco: Spatial-Temporal Correspondence for Zero-Shot Video Translation}},
  booktitle = {CVPR},
  pages     = {8703--8712},
  year      = {2024},
}

% ----------------DATASETS---------------------
@misc{ucf101,
  author        = {Soomro, Khurram and Zamir, Amir Roshan and Shah, Mubarak},
  title         = {{UCF101: A Dataset of 101 Human Actions Classes From Videos in The Wild}},
  year          = {2012},
  eprint        = {1212.0402},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@inproceedings{faceforensics,
  author    = {R{\"{o}}ssler, Andreas and Cozzolino, Davide and Verdoliva, Luisa and
               Riess, Christian and Thies, Justus and Nie{\ss}ner, Matthias},
  title     = {{FaceForensics++: Learning to Detect Manipulated Facial Images}},
  booktitle = {ICCV},
  pages     = {1--11},
  year      = {2019},
}

% Key names a dataset; the entry cites the paper below (presumably where the dataset was introduced).
@inproceedings{taichihd,
  author    = {Siarohin, Aliaksandr and Lathuili{\`{e}}re, St{\'{e}}phane and
               Tulyakov, Sergey and Ricci, Elisa and Sebe, Nicu},
  title     = {{First Order Motion Model for Image Animation}},
  booktitle = {NeurIPS},
  pages     = {7135--7145},
  year      = {2019},
}

% Key names a dataset; the entry cites the paper below (presumably where the dataset is used or introduced).
@inproceedings{skytimelapse,
  author    = {Zhang, Jiangning and Xu, Chao and Liu, Liang and Wang, Mengmeng and
               Wu, Xia and Liu, Yong and Jiang, Yunliang},
  title     = {{DTVNet: Dynamic Time-Lapse Video Generation via Single Still Image}},
  booktitle = {ECCV},
  volume    = {12350},
  pages     = {300--315},
  year      = {2020},
}

% Key names a dataset; the entry cites the paper below (presumably where the dataset was introduced).
@inproceedings{webvid,
  author    = {Bain, Max and Nagrani, Arsha and Varol, G{\"{u}}l and Zisserman, Andrew},
  title     = {{Frozen in Time: {A} Joint Video and Image Encoder for End-to-End Retrieval}},
  booktitle = {ICCV},
  pages     = {1708--1718},
  year      = {2021},
}


% Key names a dataset; the entry cites the paper below (presumably where the dataset was introduced).
@inproceedings{ros,
  author    = {Zhang, Qihang and Peng, Zhenghao and Zhou, Bolei},
  title     = {{Learning to Drive by Watching YouTube Videos: Action-Conditioned Contrastive
               Policy Pretraining}},
  booktitle = {ECCV},
  volume    = {13686},
  pages     = {111--128},
  year      = {2022},
}

% NOTE: For BibTeX reference of HD-VG-130M dataset, please cite `wang-etal-2023-videofactory'.

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{internvid,
  author        = {Wang, Yi and He, Yinan and Li, Yizhuo and Li, Kunchang and
                   Yu, Jiashuo and Ma, Xin and Li, Xinhao and Chen, Guo and
                   Chen, Xinyuan and Wang, Yaohui and He, Conghui and Luo, Ping and
                   Liu, Ziwei and Wang, Yali and Wang, Limin and Qiao, Yu},
  title         = {{InternVid: A Large-scale Video-Text Dataset for Multimodal Understanding and Generation}},
  year          = {2024},
  eprint        = {2307.06942},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@misc{vidprom,
  author        = {Wang, Wenhao and Yang, Yi},
  title         = {{VidProM: A Million-scale Real Prompt-Gallery Dataset for Text-to-Video Diffusion Models}},
  year          = {2024},
  eprint        = {2403.06098},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{panda70m,
  author        = {Chen, Tsai-Shien and Siarohin, Aliaksandr and Menapace, Willi and
                   Deyneka, Ekaterina and Chao, Hsiang-wei and Jeon, Byung Eun and
                   Fang, Yuwei and Lee, Hsin-Ying and Ren, Jian and Yang, Ming-Hsuan and
                   Tulyakov, Sergey},
  title         = {{Panda-70M: Captioning 70M Videos with Multiple Cross-Modality Teachers}},
  year          = {2024},
  eprint        = {2402.19479},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

@inproceedings{fetv,
  author    = {Liu, Yuanxin and Li, Lei and Ren, Shuhuai and Gao, Rundong and
               Li, Shicheng and Chen, Sishuo and Sun, Xu and Hou, Lu},
  title     = {{{FETV:} {A} Benchmark for Fine-Grained Evaluation of Open-Domain Text-to-Video
               Generation}},
  booktitle = {NeurIPS},
  year      = {2023},
}

@misc{DAVIS,
  author        = {Pont-Tuset, Jordi and Perazzi, Federico and Caelles, Sergi and
                   Arbel{\'{a}}ez, Pablo and Sorkine-Hornung, Alex and Van Gool, Luc},
  title         = {{The 2017 DAVIS Challenge on Video Object Segmentation}},
  year          = {2018},
  eprint        = {1704.00675},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}


% ----------------EVALUATION METRICS---------------------
@inproceedings{dover,
  author    = {Wu, Haoning and Zhang, Erli and Liao, Liang and Chen, Chaofeng and
               Hou, Jingwen and Wang, Annan and Sun, Wenxiu and Yan, Qiong and
               Lin, Weisi},
  title     = {{Exploring Video Quality Assessment on User Generated Contents from Aesthetic and Technical Perspectives}},
  booktitle = {ICCV},
  pages     = {20087--20097},
  year      = {2023},
}

@inproceedings{fvd,
  author    = {Unterthiner, Thomas and van Steenkiste, Sjoerd and Kurach, Karol and
               Marinier, Rapha{\"{e}}l and Michalski, Marcin and Gelly, Sylvain},
  title     = {{FVD:} {A} new Metric for Video Generation},
  booktitle = {ICLR},
  year      = {2019},
}