From c3953a8ae7473624db9e2724575d7b5df0c7eaf6 Mon Sep 17 00:00:00 2001 From: ProGamerGov Date: Sat, 4 Jun 2022 17:30:04 -0600 Subject: [PATCH 1/2] Add the CLIP Text Feature Visualization Tutorial --- ...LIP_TextFeatureVisAndSearch_OptimViz.ipynb | 1960 +++++++++++++++++ 1 file changed, 1960 insertions(+) create mode 100644 tutorials/optimviz/CLIP_TextFeatureVisAndSearch_OptimViz.ipynb diff --git a/tutorials/optimviz/CLIP_TextFeatureVisAndSearch_OptimViz.ipynb b/tutorials/optimviz/CLIP_TextFeatureVisAndSearch_OptimViz.ipynb new file mode 100644 index 0000000000..c0c6c86737 --- /dev/null +++ b/tutorials/optimviz/CLIP_TextFeatureVisAndSearch_OptimViz.ipynb @@ -0,0 +1,1960 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "CLIP_TextFeatureVisAndSearch_OptimViz.ipynb", + "provenance": [], + "collapsed_sections": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "4d98f277c7b44d53b463d172ecec7d23": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_b0779c6d342b47caa6e22c036e2e13e2", + "IPY_MODEL_cf6d4296a2084c598a9646b22fe08ac3", + "IPY_MODEL_4cb7cf8553de4dcbabf33c9c0798a27c" + ], + "layout": "IPY_MODEL_c2f97e1a90b644118290a860d3fc3fb2" + } + }, + "b0779c6d342b47caa6e22c036e2e13e2": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + 
"_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_dd15379bd9534719ab4488c2270b077f", + "placeholder": "​", + "style": "IPY_MODEL_7e1bbde93d924f2b93c74c694678a0fd", + "value": "100%" + } + }, + "cf6d4296a2084c598a9646b22fe08ac3": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_da6745763b764c9c9d026e316c6e76dd", + "max": 1544, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_fd60edb03c1242569f3b5a17686ed2b0", + "value": 1544 + } + }, + "4cb7cf8553de4dcbabf33c9c0798a27c": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_693faf3b70ca489aafcb3095e04b97a3", + "placeholder": "​", + "style": "IPY_MODEL_74b0b256df6c46658082981f3d82f17a", + "value": " 1544/1544 [01:30<00:00, 17.33it/s]" + } + }, + "c2f97e1a90b644118290a860d3fc3fb2": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + 
"_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "dd15379bd9534719ab4488c2270b077f": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + 
"max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7e1bbde93d924f2b93c74c694678a0fd": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "da6745763b764c9c9d026e316c6e76dd": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + 
}, + "fd60edb03c1242569f3b5a17686ed2b0": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "693faf3b70ca489aafcb3095e04b97a3": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "74b0b256df6c46658082981f3d82f17a": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + 
"_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "2477cf00bf934608ba560d35b5086b04": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_60ffccf308304f0b9be9a7637dc3b673", + "IPY_MODEL_57aa9158b0a54edda567e76ec7ab26cc", + "IPY_MODEL_5a56829ac4724d3b848a77c43282d090" + ], + "layout": "IPY_MODEL_229423305a32404cb15dd1052b0f6f8d" + } + }, + "60ffccf308304f0b9be9a7637dc3b673": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_bacb750121b349d9b6276b79351c3179", + "placeholder": "​", + "style": "IPY_MODEL_8839b04f3c474d51ae3b90673a3f70bf", + "value": "100%" + } + }, + "57aa9158b0a54edda567e76ec7ab26cc": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + 
"bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_1b388192a3774856b551b942a6d98bd3", + "max": 1544, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_c06cd0c84cd541e0a0ee19584bcaf21d", + "value": 1544 + } + }, + "5a56829ac4724d3b848a77c43282d090": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b0f9363100834b32a895ea4ece0a26ec", + "placeholder": "​", + "style": "IPY_MODEL_ad418495991f4f769a643947f857aa45", + "value": " 1544/1544 [22:37<00:00, 1.11it/s]" + } + }, + "229423305a32404cb15dd1052b0f6f8d": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + 
"object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "bacb750121b349d9b6276b79351c3179": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8839b04f3c474d51ae3b90673a3f70bf": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "1b388192a3774856b551b942a6d98bd3": { + "model_module": "@jupyter-widgets/base", + 
"model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c06cd0c84cd541e0a0ee19584bcaf21d": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "b0f9363100834b32a895ea4ece0a26ec": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + 
"_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ad418495991f4f769a643947f857aa45": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "30f443aef4ff4653b1994ca2fdf6265f": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_fd4f3d842ae54b78b1a36c1f9506ad3d", + "IPY_MODEL_c631661c871e49e38ad2fc4323ab83f6", + "IPY_MODEL_3f54e63232e244a584ba99060bb39bd2" + ], + "layout": 
"IPY_MODEL_58b037efabc542a588408a66af1ee332" + } + }, + "fd4f3d842ae54b78b1a36c1f9506ad3d": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7e160f3270a845b3913690fa6374fbf8", + "placeholder": "​", + "style": "IPY_MODEL_c25761926edd47a28d3dbe25089c30ac", + "value": "100%" + } + }, + "c631661c871e49e38ad2fc4323ab83f6": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9313b7418c60479a81683f4106e07900", + "max": 1544, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_72f9820725eb4f3fa4bf23cda55377c2", + "value": 1544 + } + }, + "3f54e63232e244a584ba99060bb39bd2": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d63c8f2408874a29b0db10c18d4bbe0d", + "placeholder": "​", 
+ "style": "IPY_MODEL_177efc0cc3194fbcbb7602a1d37a6d0c", + "value": " 1544/1544 [01:32<00:00, 16.72it/s]" + } + }, + "58b037efabc542a588408a66af1ee332": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7e160f3270a845b3913690fa6374fbf8": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": 
null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c25761926edd47a28d3dbe25089c30ac": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "9313b7418c60479a81683f4106e07900": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": 
null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "72f9820725eb4f3fa4bf23cda55377c2": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "d63c8f2408874a29b0db10c18d4bbe0d": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, 
+ "visibility": null, + "width": null + } + }, + "177efc0cc3194fbcbb7602a1d37a6d0c": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + } + } + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Text Feature Visualization & Text Search\n", + "This tutorial demonstrates how to search layer channels with text & how to perform text feature visualization on the CLIP ResNet 50x4 model as described in the [Multimodal Neurons in Artificial Neural Networks](https://distill.pub/2021/multimodal-neurons/) research paper." + ], + "metadata": { + "id": "6PyoP2q9bNGJ" + } + }, + { + "cell_type": "code", + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "\n", + "import html\n", + "from typing import Callable, List, Optional, Tuple, Union\n", + "from warnings import warn\n", + "\n", + "import captum.optim as opt\n", + "import regex as re\n", + "import torch\n", + "from captum.optim.models import clip_resnet50x4_text, clip_resnet50x4_image\n", + "from tqdm.auto import tqdm\n", + "\n", + "device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")" + ], + "metadata": { + "id": "AFKTgxkmOG_U" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Setup\n", + "\n", + "To start off, we'll define multiple helper functions and classes." 
+ ], + "metadata": { + "id": "LWH8zkmZ7Gpn" + } + }, + { + "cell_type": "code", + "source": [ + "class PreprocessTextCLIP(torch.nn.Module):\n", + " \"\"\"\n", + " Preprocess text strings as per OpenAI's standard CLIP preprocessing / cleaning.\n", + "\n", + " See here for more information:\n", + " https://ftfy.readthedocs.io/en/latest/\n", + " https://docs.python.org/3/library/html.html#html.unescape\n", + " https://github.com/openai/CLIP/blob/main/clip/simple_tokenizer.py\n", + " \"\"\"\n", + "\n", + " __constants__ = [\"use_ftfy\"]\n", + "\n", + " def __init__(self) -> None:\n", + " super().__init__()\n", + " try:\n", + " import ftfy\n", + "\n", + " self.use_ftfy = True\n", + " except (ImportError, AssertionError):\n", + " warn(\n", + " \"Warning the ftfy library was not found, and thus heuristic unicode\"\n", + " + \" correction will not be used in the CLIPTokenizer preprocessing\"\n", + " + \" module. The library can be installed via 'pip install ftfy'\"\n", + " )\n", + " self.use_ftfy = False\n", + "\n", + " @torch.jit.ignore\n", + " def forward(self, x: List[str]) -> List[str]:\n", + " \"\"\"\n", + " Args:\n", + "\n", + " x (list of str): One or more strings to be cleaned.\n", + "\n", + " Returns:\n", + " x (list of str): A list of preprocessed / cleaned strings.\n", + " \"\"\"\n", + " assert all([isinstance(s, str) for s in x])\n", + " for i in range(len(x)):\n", + " # Heuristic unicode fixing (ex: mojibake)\n", + " if self.use_ftfy:\n", + " x[i] = ftfy.fix_text(x[i])\n", + "\n", + " # Convert named & numeric character references in HTML to unicode\n", + " x[i] = html.unescape(html.unescape(x[i]))\n", + "\n", + " # Remove duplicate whitespaces\n", + " x[i] = re.sub(r\"\\s+\", \" \", x[i].strip()).strip()\n", + "\n", + " # Only use lowercase characters\n", + " x[i] = x[i].lower()\n", + " return x\n", + "\n", + "\n", + "class CLIP_ResNet50x4(torch.nn.Module):\n", + " \"\"\"\n", + " Wrapper for combining the text and image portions of a CLIP model 
into the full\n", + " model.\n", + " \"\"\"\n", + "\n", + " def __init__(\n", + " self, image_model: torch.nn.Module, text_model: torch.nn.Module\n", + " ) -> None:\n", + " \"\"\"\n", + " Args:\n", + "\n", + " image_model (nn.Module): A PyTorch model instance that takes image inputs.\n", + " text_model (nn.Module): A PyTorch model instance that takes text inputs.\n", + " \"\"\"\n", + " super().__init__()\n", + " self.image_model = image_model\n", + " self.text_model = text_model\n", + "\n", + " def forward(\n", + " self, x: Union[Tuple[torch.Tensor, torch.Tensor], List[torch.Tensor]]\n", + " ) -> Tuple[torch.Tensor, torch.Tensor]:\n", + " \"\"\"\n", + " Args:\n", + "\n", + " x (tuple or list of torch.Tensor): A tuple or list of tensors, with the\n", + " format: [image_tensor, text_tensor].\n", + "\n", + " Returns:\n", + " logits (tuple of torch.Tensor): The (logits_per_image, logits_per_text) pair.\n", + " \"\"\"\n", + " assert len(x) == 2\n", + " image, text = x\n", + " image_features = self.image_model(image)\n", + " text_features = self.text_model(text)\n", + "\n", + " image_features = image_features / image_features.norm(dim=-1, keepdim=True)\n", + " text_features = text_features / text_features.norm(dim=-1, keepdim=True)\n", + "\n", + " logit_scale = self.text_model.logit_scale.exp()\n", + "\n", + " logits_per_image = logit_scale * image_features @ text_features.t()\n", + " logits_per_text = logit_scale * text_features @ image_features.t()\n", + "\n", + " return logits_per_image, logits_per_text\n", + "\n", + "\n", + "def get_text_layer_attr(\n", + " model: torch.nn.Module, layer_target: torch.nn.Module, text_inputs: torch.Tensor\n", + ") -> torch.Tensor:\n", + " \"\"\"\n", + " Args:\n", + "\n", + " model (nn.Module): A PyTorch model instance.\n", + " layer_target (nn.Module): A target layer instance.\n", + " text_inputs (torch.Tensor): A text input to pass through the text portion of\n", + " the model.\n", + "\n", + " Returns:\n", + " grad (torch.Tensor): Attributions for the target 
layer.\n", + " \"\"\"\n", + " grad = []\n", + " for i in range(text_inputs.shape[0]):\n", + " model_inputs = (\n", + " torch.nn.Parameter(torch.zeros(1, 3, 288, 288).to(text_inputs.device)),\n", + " text_inputs[i : i + 1].clone(),\n", + " )\n", + " attr_activations = opt.models.collect_activations(\n", + " model, [layer_target, model], model_inputs\n", + " )\n", + " target_activ = attr_activations[layer_target]\n", + " logit_activ = attr_activations[model][1]\n", + " grad_b = torch.autograd.grad(\n", + " outputs=logit_activ,\n", + " inputs=[target_activ],\n", + " grad_outputs=torch.ones_like(logit_activ),\n", + " )[0].detach()\n", + " grad.append(grad_b)\n", + " return torch.cat(grad, 0)\n", + "\n", + "\n", + "def int_token_tokenizer(\n", + " x: List[int],\n", + " context_length: int = 77,\n", + " start_token: int = 49406,\n", + " end_token: int = 49407,\n", + " padding_value: int = 0,\n", + " start_from_tokens: List[int] = [],\n", + " end_with_tokens: List[int] = [],\n", + ") -> torch.Tensor:\n", + " \"\"\"\n", + " Apply special tokens and padding to sets of tokens in integer list format.\n", + "\n", + " Args:\n", + "\n", + " x (list of int): A list of integer tokens. Each token is placed in its own\n", + " token set.\n", + " context_length (int, optional): The required context length for the model.\n", + " Inputs with lengths less than context_length will be padded with\n", + " padding_value.\n", + " Default: 77\n", + " start_token (int, optional): The starting token to place in front of each\n", + " text input. The start token is always added.\n", + " Default: 49406\n", + " end_token (int, optional): The ending token to place at the end of each\n", + " text input. 
The end token is always added.\n", + " Default: 49407\n", + " padding_value (int, optional): An integer value to use for padding token\n", + " sets to the desired context_length.\n", + " Default: 0\n", + " start_from_tokens (list of int, optional): Optionally add one or more\n", + " starting tokens to each input.\n", + " Default: []\n", + " end_with_tokens (list of int, optional): Optionally add one or more\n", + " ending tokens to each input.\n", + " Default: []\n", + "\n", + " Returns:\n", + " tokens (torch.Tensor): A tensor containing the token sets stacked across the\n", + " batch dimension.\n", + " \"\"\"\n", + " tokens = [\n", + " [start_token] + start_from_tokens + [t] + end_with_tokens + [end_token]\n", + " for t in x\n", + " ]\n", + " tokens = [\n", + " token_set + ([padding_value] * (context_length - len(token_set)))\n", + " for token_set in tokens\n", + " ]\n", + " return torch.as_tensor(tokens).int()" + ], + "metadata": { + "id": "uZSJVZRZOJAi" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "We load both the image and text models, and then place them inside our `CLIP_ResNet50x4` wrapper class to create the full CLIP model. We also load the CLIP tokenizer, and some additional variables." 
+ ], + "metadata": { + "id": "mAXbDI6i7cKw" + } + }, + { + "cell_type": "code", + "source": [ + "# Load the CLIP ResNet 50x4 model\n", + "clip_model_text = clip_resnet50x4_text(pretrained=True).eval().to(device)\n", + "clip_model_image = (\n", + " clip_resnet50x4_image(\n", + " pretrained=True, replace_relus_with_redirectedrelu=False, use_attnpool=True\n", + " )\n", + " .eval()\n", + " .to(device)\n", + ")\n", + "clip_model_full = CLIP_ResNet50x4(clip_model_image, clip_model_text)\n", + "\n", + "# Setup tokenizer\n", + "clip_tokenizer = opt.transforms.CLIPTokenizer(\n", + " pretrained_merges=True, preprocessing_module=PreprocessTextCLIP()\n", + ")\n", + "\n", + "# Setup tokenizer vocab range & logit scale\n", + "token_vocab_range = list(range(0, 49405)) # Standard CLIP tokens are [0-49405]\n", + "logit_scale = clip_model_text.logit_scale.exp()" + ], + "metadata": { + "id": "4bKGCAkAnS5c" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Searching CLIP Image Layer Channels With Text\n", + "\n", + "This portion of the tutorial demonstrates how to use the text portion of the CLIP ResNet 50x4 model to search layer channels in the image portion of the model." + ], + "metadata": { + "id": "3-KNjxksTSQJ" + } + }, + { + "cell_type": "markdown", + "source": [ + "Below we show how to search target image layers for channels that relate to our text inputs!" 
+ ], + "metadata": { + "id": "Z0sFRWGS7l7m" + } + }, + { + "cell_type": "code", + "source": [ + "text = \"kitten\" # Change to any text input or list of text inputs\n", + "text_inputs = clip_tokenizer(text).to(device)\n", + "\n", + "# Set target layer as penultimate image model layer\n", + "target = clip_model_full.image_model.layer4[5]\n", + "\n", + "# Get attributions for target layer in relation to given text inputs\n", + "layer_attr = get_text_layer_attr(clip_model_full, target, text_inputs)\n", + "\n", + "# Set the number of results to show\n", + "num_results = 5\n", + "\n", + "\n", + "for b in range(layer_attr.shape[0]):\n", + " # Sort results\n", + " channel_strengths = torch.stack(\n", + " [-torch.linalg.norm(layer_attr[b, i, :, :]) for i in range(layer_attr.shape[1])]\n", + " )\n", + " top_channels = torch.argsort(channel_strengths)[:num_results]\n", + "\n", + " # Show results\n", + " b_text = text if isinstance(text, str) else text[b]\n", + " print(\n", + " \"Top {} channels of the target layer for the text '{}' with the largest L2-norm: \\n {} \".format(\n", + " list(top_channels.size())[0], b_text, top_channels.tolist()\n", + " )\n", + " )\n", + " print(\" {}\".format(channel_strengths[top_channels].tolist()))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Bl1Tsk7izk7H", + "outputId": "f2805136-1733-487a-9f09-dee21d9d73b0" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Top 5 channels of the target layer for the text 'kitten' with the largest L2-norm: \n", + " [289, 1179, 607, 1543, 1124] \n", + " [-1.4196891784667969, -0.7648456692695618, -0.6109495759010315, -0.5101999044418335, -0.5019273161888123]\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "We can see that the text input `\"kitten\"` corresponds most strongly to channel number `289` in the target layer. 
As the attribution strength of the second strongest channel is significantly lower than that of the first, we can reasonably conclude that channel `289` is the image model's \"kitten\" channel." + ], + "metadata": { + "id": "V5B1jEBBGt4j" + } + }, + { + "cell_type": "markdown", + "source": [ + "## Similarity Search\n", + "\n", + "\n", + "CLIP models produce text & image embeddings that can be used to calculate the similarity between different images and text strings.\n", + "\n", + "Below we define a helper function for comparing embedding similarity, by searching through the model's entire vocab token range." + ], + "metadata": { + "id": "w9Cc8MolbtHB" + } + }, + { + "cell_type": "code", + "source": [ + "def embedding_token_search(\n", + " text_model: torch.nn.Module,\n", + " target_embeddings: torch.Tensor,\n", + " token_list: List[int],\n", + " batch_size: int = 32,\n", + " logit_scale: float = 100,\n", + " device: torch.device = torch.device(\"cpu\"),\n", + " start_from_tokens: List[int] = [],\n", + " end_with_tokens: List[int] = [],\n", + " tokenizer_fn: Callable[[List[int]], List[int]] = int_token_tokenizer,\n", + ") -> List[float]:\n", + " \"\"\"\n", + " Args:\n", + "\n", + " text_model (nn.Module): A PyTorch model instance.\n", + " target_embeddings (torch.Tensor): A set of normalized image or text embeddings\n", + " to find the maximal token for, with a shape of: [1, n_vals].\n", + " token_list (list of int): A list of tokens to search through.\n", + " batch_size (int, optional): The desired batch size to use.\n", + " Default: 32\n", + " logit_scale (float, optional): The value to multiply the similarity between the\n", + " text embeddings and target_embeddings by.\n", + " Default: 100\n", + " device (torch.device, optional): The desired device to use.\n", + " Default: torch.device(\"cpu\")\n", + " start_from_tokens (list of int, optional): A list of one or more tokens to use\n", + " as a prefix for the token search.\n", + " Default: []\n", + " end_with_tokens (list of int, optional): A list of one or more tokens to use\n", + " as a suffix for the token search.\n", + " Default: []\n", + " tokenizer_fn (callable, optional): A function that takes a list of 
integer\n", + " token sets and applies padding & special tokens.\n", + " Default: int_token_tokenizer\n", + "\n", + " Returns:\n", + " logits_text_list (list of float): A list of values corresponding to the order\n", + " in token_list.\n", + " \"\"\"\n", + " assert target_embeddings.dim() == 2 and target_embeddings.shape[0] == 1\n", + " logits_text_list = []\n", + "\n", + " for i in tqdm(range(0, len(token_list), batch_size)):\n", + " # Prepare input tokens\n", + " token_batch = token_list[i : i + batch_size]\n", + " token_set = tokenizer_fn(\n", + " token_batch,\n", + " start_from_tokens=start_from_tokens,\n", + " end_with_tokens=end_with_tokens,\n", + " ).to(device)\n", + "\n", + " text_embeddings = text_model(token_set).detach()\n", + " text_embeddings = text_embeddings / text_embeddings.norm(dim=-1, keepdim=True)\n", + "\n", + " logits_per_text = logit_scale * text_embeddings @ target_embeddings.t()\n", + " logits_text_list += logits_per_text[:, 0].tolist()\n", + "\n", + " return logits_text_list" + ], + "metadata": { + "id": "yNW2B9GNKwq_" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### Text Similarity\n", + "\n", + "The similarity of two different text embeddings produced by the text portion of the model can easily be determined in the same way similarity between image and text embeddings is calculated." 
+ ], + "metadata": { + "id": "MCBxsWuaK1Wm" + } + }, + { + "cell_type": "code", + "source": [ + "# Setup target embedding\n", + "text_input = \"machine learning\"\n", + "text_tokens = clip_tokenizer(text_input).to(device)\n", + "text_embeddings = clip_model_text(text_tokens).detach()\n", + "text_embeddings = text_embeddings / text_embeddings.norm(dim=-1, keepdim=True)\n", + "\n", + "# Compare target embedding with full token list\n", + "logits_text_list = embedding_token_search(\n", + " text_model=clip_model_text,\n", + " target_embeddings=text_embeddings,\n", + " token_list=token_vocab_range,\n", + " batch_size=32,\n", + " logit_scale=logit_scale,\n", + " device=device,\n", + ")\n", + "\n", + "# Sort results\n", + "num_tokens = 10\n", + "top_tokens_text = torch.argsort(torch.as_tensor(logits_text_list), descending=True)[\n", + " 0:num_tokens\n", + "]\n", + "\n", + "# Decode results\n", + "top_tokens_str = [clip_tokenizer.decode(t)[0] for t in top_tokens_text.unsqueeze(1)]\n", + "\n", + "# Display results\n", + "print(\n", + " \"Top {} most similar tokens for the input text is: \\n {} \".format(\n", + " num_tokens, top_tokens_text.tolist()\n", + " )\n", + ")\n", + "print(\"The top tokens decoded are: \\n {} \".format(top_tokens_str))" + ], + "metadata": { + "id": "8rCV0-_byeXf", + "outputId": "d3840f4e-ff78-4081-8a33-81e4ec671b16", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 120, + "referenced_widgets": [ + "30f443aef4ff4653b1994ca2fdf6265f", + "fd4f3d842ae54b78b1a36c1f9506ad3d", + "c631661c871e49e38ad2fc4323ab83f6", + "3f54e63232e244a584ba99060bb39bd2", + "58b037efabc542a588408a66af1ee332", + "7e160f3270a845b3913690fa6374fbf8", + "c25761926edd47a28d3dbe25089c30ac", + "9313b7418c60479a81683f4106e07900", + "72f9820725eb4f3fa4bf23cda55377c2", + "d63c8f2408874a29b0db10c18d4bbe0d", + "177efc0cc3194fbcbb7602a1d37a6d0c" + ] + } + }, + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + " 0%| | 
0/1544 [00:00" + ], + "image/png": "\n" + }, + "metadata": { + "needs_background": "light" + } + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "Next we search the full vocab token range with the image embeddings that we collected above." + ], + "metadata": { + "id": "vn5rHuwqsgfR" + } + }, + { + "cell_type": "code", + "source": [ + "# Collect text embedding similarities\n", + "logits_text_list = embedding_token_search(\n", + " text_model=clip_model_text,\n", + " target_embeddings=image_embedding,\n", + " token_list=token_vocab_range,\n", + " batch_size=32,\n", + " logit_scale=logit_scale,\n", + " device=device,\n", + ")\n", + "\n", + "# Sort results\n", + "num_tokens = 10\n", + "top_tokens_text = torch.argsort(torch.as_tensor(logits_text_list), descending=True)[\n", + " 0:num_tokens\n", + "]\n", + "\n", + "# Decode results\n", + "top_tokens_str = [clip_tokenizer.decode(t)[0] for t in top_tokens_text.unsqueeze(1)]\n", + "\n", + "# Display results\n", + "print(\n", + " \"Top {} most similar tokens for the input image is: \\n {} \".format(\n", + " num_tokens, top_tokens_text.tolist()\n", + " )\n", + ")\n", + "print(\"The top tokens decoded are: \\n {} \".format(top_tokens_str))" + ], + "metadata": { + "id": "Ey4YhZDxLCX-", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 120, + "referenced_widgets": [ + "4d98f277c7b44d53b463d172ecec7d23", + "b0779c6d342b47caa6e22c036e2e13e2", + "cf6d4296a2084c598a9646b22fe08ac3", + "4cb7cf8553de4dcbabf33c9c0798a27c", + "c2f97e1a90b644118290a860d3fc3fb2", + "dd15379bd9534719ab4488c2270b077f", + "7e1bbde93d924f2b93c74c694678a0fd", + "da6745763b764c9c9d026e316c6e76dd", + "fd60edb03c1242569f3b5a17686ed2b0", + "693faf3b70ca489aafcb3095e04b97a3", + "74b0b256df6c46658082981f3d82f17a" + ] + }, + "outputId": "fd87c707-f8a5-46db-8cb9-9f7314414195" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + " 0%| | 0/1544 [00:00 Union[List[float], 
List[List[float]]]:\n", + " \"\"\"\n", + " Args:\n", + "\n", + " full_model (nn.Module): A PyTorch model instance.\n", + " target (nn.Module): The target layer to collect attributions from.\n", + " channel_index (int, optional): The desired channel index to collect\n", + " attributions for, in the target layer. Set to None for all channels.\n", + " token_list (list of int): A list of tokens to search through.\n", + " batch_size (int, optional): The desired batch size to use.\n", + " Default: 32\n", + " device (torch.device, optional): The desired device to use.\n", + " Default: torch.device(\"cpu\")\n", + " start_from_tokens (list of int, optional): A list of one or more tokens to use\n", + " a prefix for the token search.\n", + " Default: []\n", + " end_with_tokens (list of int, optional): A list of one or more tokens to use\n", + " a suffix for the token search.\n", + " Default: []\n", + " tokenizer_fn (callable, optional): A function that takes a list of integer\n", + " token sets and applies padding & special tokens.\n", + " Default: int_token_tokenizer\n", + "\n", + " Returns:\n", + " logits_text_list (list of float or list of list of float): A list of values\n", + " corresponding to the order in token_list.\n", + " \"\"\"\n", + " logits_text_list = []\n", + "\n", + " for i in tqdm(range(0, len(token_list), batch_size)):\n", + " # Prepare input tokens\n", + " token_batch = token_list[i : i + batch_size]\n", + " token_set = tokenizer_fn(\n", + " token_batch,\n", + " start_from_tokens=start_from_tokens,\n", + " end_with_tokens=end_with_tokens,\n", + " ).to(device)\n", + "\n", + " layer_attr = get_text_layer_attr(full_model, target, token_set)\n", + " for b in range(layer_attr.shape[0]):\n", + "\n", + " if channel_index:\n", + " channel_strengths = -torch.linalg.norm(\n", + " layer_attr[b, channel_index, ...]\n", + " )\n", + " else:\n", + " channel_strengths = torch.stack(\n", + " [\n", + " -torch.linalg.norm(layer_attr[b, c, ...])\n", + " for c in 
range(layer_attr.shape[1])\n", + " ]\n", + " )\n", + " logits_text_list += [channel_strengths.tolist()]\n", + "\n", + " return logits_text_list" + ], + "metadata": { + "id": "pkiKrT8B9gB2" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "We can now collect attributions for the penultimate layer with a channel index of `289` for the image portion of the CLIP ResNet 50x4 model." + ], + "metadata": { + "id": "yU0Qp4sRKAPt" + } + }, + { + "cell_type": "code", + "source": [ + "# Desired target layer & channel index\n", + "target_layer = clip_model_full.image_model.layer4[5]\n", + "channel_index = 289\n", + "\n", + "\n", + "# Collect target attributions\n", + "logits_text_list = channel_token_search(\n", + " full_model=clip_model_full,\n", + " target=target_layer,\n", + " channel_index=channel_index,\n", + " token_list=token_vocab_range,\n", + " batch_size=32,\n", + " logit_scale=logit_scale,\n", + " device=device,\n", + ")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 49, + "referenced_widgets": [ + "2477cf00bf934608ba560d35b5086b04", + "60ffccf308304f0b9be9a7637dc3b673", + "57aa9158b0a54edda567e76ec7ab26cc", + "5a56829ac4724d3b848a77c43282d090", + "229423305a32404cb15dd1052b0f6f8d", + "bacb750121b349d9b6276b79351c3179", + "8839b04f3c474d51ae3b90673a3f70bf", + "1b388192a3774856b551b942a6d98bd3", + "c06cd0c84cd541e0a0ee19584bcaf21d", + "b0f9363100834b32a895ea4ece0a26ec", + "ad418495991f4f769a643947f857aa45" + ] + }, + "id": "Dizt021X7yBm", + "outputId": "96a585f5-6467-42d7-ed16-f4b1c7162db2" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + " 0%| | 0/1544 [00:00 Date: Sun, 10 Jul 2022 13:50:08 -0600 Subject: [PATCH 2/2] Move tutorial to clip directory --- .../{ => clip}/CLIP_TextFeatureVisAndSearch_OptimViz.ipynb | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tutorials/optimviz/{ => 
clip}/CLIP_TextFeatureVisAndSearch_OptimViz.ipynb (100%) diff --git a/tutorials/optimviz/CLIP_TextFeatureVisAndSearch_OptimViz.ipynb b/tutorials/optimviz/clip/CLIP_TextFeatureVisAndSearch_OptimViz.ipynb similarity index 100% rename from tutorials/optimviz/CLIP_TextFeatureVisAndSearch_OptimViz.ipynb rename to tutorials/optimviz/clip/CLIP_TextFeatureVisAndSearch_OptimViz.ipynb