% E3M, ECCV 2024 conference paper. The acronyms E3M (title) and ECCV (booktitle)
% are brace-protected so styles that recase these fields keep their capitals.
@inproceedings{bao2024e3m,
  author    = {Bao, Peijun and Shao, Zihao and Yang, Wenhan and Ng, Boon Poh and Kot, Alex C.},
  title     = {{E3M}: Zero-Shot Spatio-Temporal Video Grounding with Expectation-Maximization Multimodal Modulation},
  booktitle = {European Conference on Computer Vision ({ECCV})},
  year      = {2024},
}