@proceedings{7044,
  title = {Behavior-Driven Synthesis of Human Dynamics},
  year = {2021},
  url = {https://arxiv.org/abs/2103.04677},
  author = {Andreas Blattmann and Timo Milbich and Michael Dorkenwald and Bj{\"o}rn Ommer}
}

@article{7071,
  title = {Characterizing Generalization under Out-Of-Distribution Shifts in Deep Metric Learning},
  year = {2021},
  url = {https://arxiv.org/abs/2107.09562},
  author = {Timo Milbich and Karsten Roth and Samarth Sinha and Ludwig Schmidt and Marzyeh Ghassemi and Bj{\"o}rn Ommer}
}

@conference{7068,
  title = {iPOKE: Poking a Still Image for Controlled Stochastic Video Synthesis},
  booktitle = {Proceedings of the International Conference on Computer Vision (ICCV)},
  year = {2021},
  url = {https://arxiv.org/abs/2107.02790},
  author = {Andreas Blattmann and Timo Milbich and Michael Dorkenwald and Bj{\"o}rn Ommer}
}

@proceedings{7051,
  title = {S2SD: Simultaneous Similarity-based Self-Distillation for Deep Metric Learning},
  year = {2021},
  url = {https://arxiv.org/abs/2009.08348},
  author = {Karsten Roth and Timo Milbich and Bj{\"o}rn Ommer and Joseph Paul Cohen and Marzyeh Ghassemi}
}

@proceedings{7053,
  title = {Stochastic Image-to-Video Synthesis using cINNs},
  year = {2021},
  author = {Michael Dorkenwald and Timo Milbich and Andreas Blattmann and Robin Rombach and Konstantinos G. Derpanis and Bj{\"o}rn Ommer}
}

@proceedings{7063,
  title = {Understanding Object Dynamics for Interactive Image-to-Video Synthesis},
  year = {2021},
  abstract = {What would be the effect of locally poking a static scene? We present an approach that learns natural-looking global articulations caused by a local manipulation at a pixel level. Training requires only videos of moving objects but no information about the underlying manipulation of the physical scene. Our generative model learns to infer natural object dynamics as a response to user interaction and learns about the interrelations between different object body regions. Given a static image of an object and a local poking of a pixel, the approach then predicts how the object would deform over time. In contrast to existing work on video prediction, we do not synthesize arbitrary realistic videos but enable local interactive control of the deformation. Our model is not restricted to particular object categories and can transfer dynamics onto novel unseen object instances.
  Extensive experiments on diverse objects demonstrate the effectiveness of our approach compared to common video prediction frameworks.},
  url = {https://arxiv.org/abs/2106.11303v1},
  author = {Andreas Blattmann and Timo Milbich and Michael Dorkenwald and Bj{\"o}rn Ommer}
}

@proceedings{6934,
  title = {DiVA: Diverse Visual Feature Aggregation for Deep Metric Learning},
  year = {2020},
  url = {https://arxiv.org/abs/2004.13458},
  author = {Timo Milbich and Karsten Roth and Homanga Bharadhwaj and Samarth Sinha and Yoshua Bengio and Bj{\"o}rn Ommer and Joseph Paul Cohen}
}

@conference{6386,
  title = {PADS: Policy-Adapted Sampling for Visual Similarity Learning},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  volume = {1},
  year = {2020},
  chapter = {1},
  url = {https://arxiv.org/abs/2003.11113},
  author = {Timo Milbich and Karsten Roth and Bj{\"o}rn Ommer}
}

@proceedings{6390,
  title = {Revisiting Training Strategies and Generalization Performance in Deep Metric Learning},
  year = {2020},
  url = {https://arxiv.org/pdf/2002.08473.pdf},
  author = {Karsten Roth and Timo Milbich and Samarth Sinha and Prateek Gupta and Bj{\"o}rn Ommer and Joseph Paul Cohen}
}

@article{6389,
  title = {Sharing Matters for Generalization in Deep Metric Learning},
  journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI)},
  year = {2020},
  doi = {10.1109/TPAMI.2020.3009620},
  url = {https://arxiv.org/abs/2004.05582},
  author = {Timo Milbich and Karsten Roth and Biagio Brattoli and Bj{\"o}rn Ommer}
}

@article{6339,
  title = {Unsupervised Representation Learning by Discovering Reliable Image Relations},
  journal = {Pattern Recognition},
  volume = {102},
  year = {2020},
  month = {June 2020},
  url = {http://arxiv.org/abs/1911.07808},
  author = {Timo Milbich and Omair Ghori and Bj{\"o}rn Ommer}
}

@conference{6301,
  title = {Unsupervised Part-Based Disentangling of Object Shape and Appearance},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (Oral + Best paper finalist: top 45 / 5160 submissions)},
  year = {2019},
  author = {Dominik Lorenz and Leonard Bereska and Timo Milbich and Bj{\"o}rn Ommer}
}

@conference{6282,
  title = {Towards Learning a Realistic Rendering of Human Behavior},
  booktitle = {European Conference on Computer Vision (ECCV - HBUGEN)},
  year = {2018},
  abstract = {Realistic rendering of human behavior is of great interest for applications such as video animation, virtual reality and, more generally, gaming engines. Animations of persons performing actions are commonly rendered by articulating explicit 3D models based on sequences of coarse body shape representations that simulate a certain behavior. While the simulation of natural behavior can be efficiently learned from common video data, the corresponding 3D models are typically designed in manual, laborious processes or reconstructed from costly (multi-)sensor data. In this work, we present an approach towards a holistic learning framework for rendering human behavior in which all components are learned from easily available data. We utilize motion capture data to generate realistic motions that can be controlled by a user, and we learn to render characters using only RGB camera data. Our experiments show that we can further improve data efficiency by training on multiple characters at the same time.
  Overall, our approach opens a new path towards easily available, personalized avatar creation.},
  author = {Patrick Esser and Johannes Haux and Timo Milbich and Bj{\"o}rn Ommer}
}

@conference{6187,
  title = {Unsupervised Video Understanding by Reconciliation of Posture Similarities},
  booktitle = {Proceedings of the IEEE International Conference on Computer Vision (ICCV)},
  year = {2017},
  url = {https://hciweb.iwr.uni-heidelberg.de/compvis/research/tmilbich_iccv17},
  author = {Timo Milbich and Miguel Bautista and Ekaterina Sutter and Bj{\"o}rn Ommer}
}

@conference{antic:HACI:2013,
  title = {Less is More: Video Trimming for Action Recognition},
  booktitle = {Proceedings of the IEEE International Conference on Computer Vision, Workshop on Understanding Human Activities: Context and Interaction},
  year = {2013},
  pages = {515--521},
  publisher = {IEEE},
  organization = {IEEE},
  author = {Borislav Antic and Timo Milbich and Bj{\"o}rn Ommer}
}