@inproceedings{shao2025holitom,
  title     = {HoliTom: Holistic Token Merging for Fast Video Large Language Models},
  author    = {Shao, Kele and Tao, Keda and Qin, Can and You, Haoxuan and Sui, Yang and Wang, Huan},
  booktitle = {NeurIPS},
  year      = {2025},
}
NeurIPS’25
Poison as Cure: Visual Noise for Mitigating Object Hallucinations in LVMs
@inproceedings{zhang2025poison,
  title     = {Poison as Cure: Visual Noise for Mitigating Object Hallucinations in LVMs},
  author    = {Zhang, Kejia and Tao, Keda and Tang, Jiasheng and Wang, Huan},
  booktitle = {NeurIPS},
  year      = {2025},
}
arXiv’25/07
When Tokens Talk Too Much: A Survey of Multimodal Long-Context Token Compression across Images, Videos, and Audios
@article{shao2025tokens,
  title      = {When Tokens Talk Too Much: A Survey of Multimodal Long-Context Token Compression across Images, Videos, and Audios},
  author     = {Shao, Kele and Tao, Keda and Zhang, Kejia and Feng, Sicheng and Cai, Mu and Shang, Yuzhang and You, Haoxuan and Qin, Can and Sui, Yang and Wang, Huan},
  journal    = {arXiv preprint arXiv:2507.20198},
  year       = {2025},
  database   = {https://oasis-paddleboat-fc1.notion.site/when-tokens-talk-too-much-database},
  paper_repo = {https://github.com/cokeshao/Awesome-Multimodal-Token-Compression},
}
arXiv’25/05
Can MLLMs Guide Me Home? A Benchmark Study on Fine-Grained Visual Reasoning from Transit Maps
Sicheng Feng*, Song Wang*, Shuyi Ouyang, Lingdong Kong, Zikai Song, Jianke Zhu, Huan Wang†, and Xinchao Wang
Multimodal large language models (MLLMs) have recently achieved significant progress in visual tasks, including semantic scene understanding and text-image alignment, with reasoning variants enhancing performance on complex tasks involving mathematics and logic. However, their capacity for reasoning tasks involving fine-grained visual understanding remains insufficiently evaluated. To address this gap, we introduce ReasonMap, a benchmark designed to assess the fine-grained visual understanding and spatial reasoning abilities of MLLMs. ReasonMap encompasses high-resolution transit maps from 30 cities across 13 countries and includes 1,008 question-answer pairs spanning two question types and three templates. Furthermore, we design a two-level evaluation pipeline that properly assesses answer correctness and quality. Comprehensive evaluations of 15 popular MLLMs, including both base and reasoning variants, reveal a counterintuitive pattern: among open-source models, base models outperform reasoning ones, while the opposite trend is observed in closed-source models. Additionally, performance generally degrades when visual inputs are masked, indicating that while MLLMs can leverage prior knowledge to answer some questions, fine-grained visual reasoning tasks still require genuine visual perception for strong performance. Our benchmark study offers new insights into visual reasoning and contributes to investigating the gap between open-source and closed-source models.
@article{feng2025canmllms,
  title   = {Can MLLMs Guide Me Home? A Benchmark Study on Fine-Grained Visual Reasoning from Transit Maps},
  author  = {Feng, Sicheng and Wang, Song and Ouyang, Shuyi and Kong, Lingdong and Song, Zikai and Zhu, Jianke and Wang, Huan and Wang, Xinchao},
  journal = {arXiv preprint arXiv:2505.18675},
  year    = {2025},
  dataset = {https://huggingface.co/datasets/FSCCS/ReasonMap},
  qbitai  = {https://mp.weixin.qq.com/s/sPJLQtHgl5DZghWLWa_H3Q},
}
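To try the benchmark directly, the released ReasonMap data can be pulled from the Hugging Face link in the entry above. Below is a minimal Python sketch using the datasets library; the dataset ID comes from that URL, while the split and field names are assumptions to verify against the dataset card.

from datasets import load_dataset

# Load ReasonMap; the ID "FSCCS/ReasonMap" is taken from the dataset URL above.
ds = load_dataset("FSCCS/ReasonMap")

# Inspect splits and features first -- the names below are not confirmed.
print(ds)

split = next(iter(ds))              # first available split, whatever it is named
for example in ds[split].select(range(3)):
    print(example)                  # each row should hold one question-answer pair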
CVPR’25
DyCoke: Dynamic Compression of Tokens for Fast Video Large Language Models
@inproceedings{tao2025dycoke,
  title     = {DyCoke: Dynamic Compression of Tokens for Fast Video Large Language Models},
  author    = {Tao, Keda and Qin, Can and You, Haoxuan and Sui, Yang and Wang, Huan},
  booktitle = {CVPR},
  year      = {2025},
}
ICLR’25
Accessing Vision Foundation Models at ImageNet-level Costs
@inproceedings{zhang2024accessing,
  title     = {Accessing Vision Foundation Models at ImageNet-level Costs},
  author    = {Zhang, Yitian and Ma, Xu and Bai, Yue and Wang, Huan and Fu, Yun},
  booktitle = {ICLR},
  year      = {2025},
}
arXiv’24/12
Is Oracle Pruning the True Oracle?
@article{feng2024oracle,
  title   = {Is Oracle Pruning the True Oracle?},
  author  = {Feng, Sicheng and Tao, Keda and Wang, Huan},
  journal = {arXiv preprint arXiv:2412.00143},
  year    = {2024},
}
ACM MM’24
Towards Real-time Video Compressive Sensing on Mobile Devices
@inproceedings{cao2024towards,
  title     = {Towards Real-time Video Compressive Sensing on Mobile Devices},
  author    = {Cao, Miao and Wang, Lishun and Wang, Huan and Wang, Guoqing and Yuan, Xin},
  booktitle = {ACM MM},
  year      = {2024},
}
ECCV’24 Oral
A Simple Low-bit Quantization Framework for Video Snapshot Compressive Imaging
@inproceedings{cao2024simple,
  title     = {A Simple Low-bit Quantization Framework for Video Snapshot Compressive Imaging},
  author    = {Cao, Miao and Wang, Lishun and Wang, Huan and Yuan, Xin},
  booktitle = {ECCV},
  year      = {2024},
}