Copy a token from your Hugging Face\ntokens page and paste it below. Immediately click login after copying\nyour token or it might be stored in plain text in this notebook file.
"
- }
- },
- "8dba487876124827919079519406ecb8": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "HTMLModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HTMLModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HTMLView",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_67fcc38a1e5d4eb39381685447e397de",
- "placeholder": "",
- "style": "IPY_MODEL_0b4bf8076fdf4d19843a3246c8bd61ac",
- "value": " 1.92k/1.92k [00:00<00:00, 63.2kB/s]"
- }
- },
- "94756148d2e94a93ae233baba20af683": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "PasswordModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "PasswordModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "PasswordView",
- "continuous_update": true,
- "description": "Token:",
- "description_tooltip": null,
- "disabled": false,
- "layout": "IPY_MODEL_b2be65e192384c948fb8987d4cfca505",
- "placeholder": "",
- "style": "IPY_MODEL_333b42ca7aa44788b1c22724eb11bcc3",
- "value": ""
- }
- },
- "99898e6ee64a46bd832af112e79b58b7": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "HTMLModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HTMLModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HTMLView",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_765485a1d3f941d28b79782dcffbf401",
- "placeholder": "",
- "style": "IPY_MODEL_3499ef4dd9f243d9bef00b396e78ed69",
- "value": "\nPro Tip: If you don't already have one, you can create a dedicated\n'notebooks' token with 'write' access, that you can then easily reuse for all\nnotebooks. "
- }
- },
- "a02030ba8f324d93a7ed6cc793d70a3b": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "a899f4bc6ed842d397723cca582669e6": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "HTMLModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HTMLModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HTMLView",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_341615c971b04033b7293d82fc40f35c",
- "placeholder": "",
- "style": "IPY_MODEL_17856a72e4e948039a66c51e8244cb50",
- "value": " 5.53M/5.53M [00:00<00:00, 21.7MB/s]"
- }
- },
- "ab32c7daa1d9404fb921f39fbc4fc05c": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "DescriptionStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "DescriptionStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "StyleView",
- "description_width": ""
- }
- },
- "aba21021d3bb4565a58ffa40049810db": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "ac0bcfa1ef6e4e78a7769c4cb2e8762f": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "HTMLModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HTMLModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HTMLView",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_549a30c85c47466eadedbd24da42e304",
- "placeholder": "",
- "style": "IPY_MODEL_bedc7d916b9745f097094c5c51a81f06",
- "value": " 500/500 [00:00<00:00, 5.05kB/s]"
- }
- },
- "ac2950d08fc145ba9eb9cf5824b1ee18": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "b26354d0278f447d92c7e1ad4c211d64": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "b2be65e192384c948fb8987d4cfca505": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "ba18cded436e486da34882d821d8f1eb": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "ButtonModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "ButtonModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "ButtonView",
- "button_style": "",
- "description": "Login",
- "disabled": false,
- "icon": "",
- "layout": "IPY_MODEL_0e382d66f09f4958a40baa7ab83c4ccb",
- "style": "IPY_MODEL_6a45ce374e2e47ba9457d02e02522748",
- "tooltip": ""
- }
- },
- "bacfb50c001047c4824a05c9f2ee2e40": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "bcf766d2a2c641f0aa2af596c7da1b18": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "FloatProgressModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "FloatProgressModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "ProgressView",
- "bar_style": "success",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_ee537ee5470f4d7b816a8c8f96948b4d",
- "max": 17719103,
- "min": 0,
- "orientation": "horizontal",
- "style": "IPY_MODEL_652e97509a914f3b914665c4889c6d11",
- "value": 17719103
- }
- },
- "bedc7d916b9745f097094c5c51a81f06": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "DescriptionStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "DescriptionStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "StyleView",
- "description_width": ""
- }
- },
- "bf299285318b4a04a88569cc581ecd75": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "HTMLModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HTMLModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HTMLView",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_bacfb50c001047c4824a05c9f2ee2e40",
- "placeholder": "",
- "style": "IPY_MODEL_c53a1cf68fcd4388abf1f0379891089a",
- "value": " 129k/129k [00:00<00:00, 155kB/s]"
- }
- },
- "c3358d32ac814ea6bc5714402c5bc62d": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "HBoxModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HBoxModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HBoxView",
- "box_style": "",
- "children": [
- "IPY_MODEL_ecd8e5e364d34ea8bfbba4fbd467384d",
- "IPY_MODEL_0125df9fa8e14b3db0e2bce299529812",
- "IPY_MODEL_e3169ca885e04536a709d5751173ce9a"
- ],
- "layout": "IPY_MODEL_70abdfd99be84f7b9b8d24fee9eec022"
- }
- },
- "c53a1cf68fcd4388abf1f0379891089a": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "DescriptionStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "DescriptionStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "StyleView",
- "description_width": ""
- }
- },
- "c8731777ce834e58a76a295076200cfc": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "VBoxModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "VBoxModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "VBoxView",
- "box_style": "",
- "children": [
- "IPY_MODEL_859b12a6d95b4c6f987791ca848122b9",
- "IPY_MODEL_94756148d2e94a93ae233baba20af683",
- "IPY_MODEL_ba18cded436e486da34882d821d8f1eb",
- "IPY_MODEL_99898e6ee64a46bd832af112e79b58b7"
- ],
- "layout": "IPY_MODEL_79184c8c2a6f4b7493bb7f6983f18a09"
- }
- },
- "c8e0c9a60ef34d2caee9d55a3c21c3d4": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "c9974003727a401797953ef2885db5a2": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "d12f07e25bf5422facc38c3463700994": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "HBoxModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HBoxModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HBoxView",
- "box_style": "",
- "children": [
- "IPY_MODEL_eae11f84c2644ada8295b445c924baec",
- "IPY_MODEL_bcf766d2a2c641f0aa2af596c7da1b18",
- "IPY_MODEL_74bf69aa6eaa4a8594b2ea9a0fb20957"
- ],
- "layout": "IPY_MODEL_2d7a0b901d7044d5b1f273a3e9bea560"
- }
- },
- "d13ba6030aff42bca48c72ff071c44c0": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "FloatProgressModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "FloatProgressModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "ProgressView",
- "bar_style": "success",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_c8e0c9a60ef34d2caee9d55a3c21c3d4",
- "max": 5534328,
- "min": 0,
- "orientation": "horizontal",
- "style": "IPY_MODEL_764aa53d75324d73ab06936c52fd8fc8",
- "value": 5534328
- }
- },
- "d182e37b4a404158bee8446fc2728bd9": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "HBoxModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HBoxModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HBoxView",
- "box_style": "",
- "children": [
- "IPY_MODEL_603e99f45afb4910a99f7684ffd21b6a",
- "IPY_MODEL_d13ba6030aff42bca48c72ff071c44c0",
- "IPY_MODEL_a899f4bc6ed842d397723cca582669e6"
- ],
- "layout": "IPY_MODEL_a02030ba8f324d93a7ed6cc793d70a3b"
- }
- },
- "d33fba0d78fb41f983c55f5cd2a0a740": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "d7071582bfbe4ec4b2c3c9843e5481ae": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "e1c9df12fa034c93a9b3530ea4a7c5aa": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "DescriptionStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "DescriptionStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "StyleView",
- "description_width": ""
- }
- },
- "e3169ca885e04536a709d5751173ce9a": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "HTMLModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HTMLModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HTMLView",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_358c3a67f8b54c4c899e095611fa116b",
- "placeholder": "",
- "style": "IPY_MODEL_e1c9df12fa034c93a9b3530ea4a7c5aa",
- "value": " 318/318 [00:00<00:00, 11.0kB/s]"
- }
- },
- "e4c1e9affaba4045a3ec903091b6f454": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "e7728d9c55e44274966f8f6dbc445c54": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "ProgressStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "ProgressStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "StyleView",
- "bar_color": null,
- "description_width": ""
- }
- },
- "e928540e99564d808cb2d12c92daa498": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "HTMLModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HTMLModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HTMLView",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_c9974003727a401797953ef2885db5a2",
- "placeholder": "",
- "style": "IPY_MODEL_77a361d1ff214e8799891bbeb28a0789",
- "value": "Downloading: 100%"
- }
- },
- "e98cf7a63c814ffd94f69928f0700ebf": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "HTMLModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HTMLModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HTMLView",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_aba21021d3bb4565a58ffa40049810db",
- "placeholder": "",
- "style": "IPY_MODEL_f7812fa7fbf744c1b261b985d085e28e",
- "value": "Downloading: 100%"
- }
- },
- "ea95ffd922c0455d957120f034e541f8": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "eae11f84c2644ada8295b445c924baec": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "HTMLModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HTMLModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HTMLView",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_2cbf0faadd4842c8b22e10541ff9de4e",
- "placeholder": "",
- "style": "IPY_MODEL_ab32c7daa1d9404fb921f39fbc4fc05c",
- "value": "Downloading: 100%"
- }
- },
- "ebc9801e164a44b3b6f8dc7f590e1c79": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "ecd8e5e364d34ea8bfbba4fbd467384d": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "HTMLModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HTMLModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HTMLView",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_554e567a83b348f88092c6ba01830930",
- "placeholder": "",
- "style": "IPY_MODEL_6e334cad2e94462cae6e722bd6f11a9e",
- "value": "Downloading: 100%"
- }
- },
- "ed169fd606274f2ebbb3e8f32ab42431": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "ee537ee5470f4d7b816a8c8f96948b4d": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "f21c0c6379d74898ac6aadcb6fc14a8a": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "f439c1de68ac4c799d81fdb29d053d10": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "DescriptionStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "DescriptionStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "StyleView",
- "description_width": ""
- }
- },
- "f7812fa7fbf744c1b261b985d085e28e": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "DescriptionStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "DescriptionStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "StyleView",
- "description_width": ""
- }
- },
- "f91dcd9f30c743d69f9d4b7e8d1beba5": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "HTMLModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HTMLModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HTMLView",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_2b2d7912186a49dd9891ae12c77482c7",
- "placeholder": "",
- "style": "IPY_MODEL_1600b9cd09c446e581b7912e35c9f56e",
- "value": " 83.3M/83.3M [00:01<00:00, 60.9MB/s]"
- }
- },
- "fc9a3c4ae0a947ec91a227360a80f602": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "FloatProgressModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "FloatProgressModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "ProgressView",
- "bar_style": "success",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_27f6f437c5264368bc2c679942ad1e53",
- "max": 83316686,
- "min": 0,
- "orientation": "horizontal",
- "style": "IPY_MODEL_e7728d9c55e44274966f8f6dbc445c54",
- "value": 83316686
- }
- },
- "fd47487fc8734594823f8afa00c4239d": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "DescriptionStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "DescriptionStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "StyleView",
- "description_width": ""
- }
- },
- "fee75343289f42fb8d6dfb4bf26fe368": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "HTMLModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HTMLModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HTMLView",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_16c0017f65b649f5ac5bebf1c955a1fd",
- "placeholder": "",
- "style": "IPY_MODEL_5e2c207db5424f91829bf5c52040a9f2",
- "value": " 1.92k/1.92k [00:00<00:00, 48.3kB/s]"
- }
- }
- }
+ "d13ba6030aff42bca48c72ff071c44c0": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "1.5.0",
+ "model_name": "FloatProgressModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "FloatProgressModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "1.5.0",
+ "_view_name": "ProgressView",
+ "bar_style": "success",
+ "description": "",
+ "description_tooltip": null,
+ "layout": "IPY_MODEL_c8e0c9a60ef34d2caee9d55a3c21c3d4",
+ "max": 5534328,
+ "min": 0,
+ "orientation": "horizontal",
+ "style": "IPY_MODEL_764aa53d75324d73ab06936c52fd8fc8",
+ "value": 5534328
+ }
+ },
+ "d182e37b4a404158bee8446fc2728bd9": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "1.5.0",
+ "model_name": "HBoxModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "HBoxModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "1.5.0",
+ "_view_name": "HBoxView",
+ "box_style": "",
+ "children": [
+ "IPY_MODEL_603e99f45afb4910a99f7684ffd21b6a",
+ "IPY_MODEL_d13ba6030aff42bca48c72ff071c44c0",
+ "IPY_MODEL_a899f4bc6ed842d397723cca582669e6"
+ ],
+ "layout": "IPY_MODEL_a02030ba8f324d93a7ed6cc793d70a3b"
+ }
+ },
+ "d33fba0d78fb41f983c55f5cd2a0a740": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "1.2.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "1.2.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "overflow_x": null,
+ "overflow_y": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "d7071582bfbe4ec4b2c3c9843e5481ae": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "1.2.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "1.2.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "overflow_x": null,
+ "overflow_y": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "e1c9df12fa034c93a9b3530ea4a7c5aa": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "1.5.0",
+ "model_name": "DescriptionStyleModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "DescriptionStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "StyleView",
+ "description_width": ""
+ }
+ },
+ "e3169ca885e04536a709d5751173ce9a": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "1.5.0",
+ "model_name": "HTMLModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "HTMLModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "1.5.0",
+ "_view_name": "HTMLView",
+ "description": "",
+ "description_tooltip": null,
+ "layout": "IPY_MODEL_358c3a67f8b54c4c899e095611fa116b",
+ "placeholder": "",
+ "style": "IPY_MODEL_e1c9df12fa034c93a9b3530ea4a7c5aa",
+ "value": " 318/318 [00:00<00:00, 11.0kB/s]"
+ }
+ },
+ "e4c1e9affaba4045a3ec903091b6f454": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "1.2.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "1.2.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "overflow_x": null,
+ "overflow_y": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "e7728d9c55e44274966f8f6dbc445c54": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "1.5.0",
+ "model_name": "ProgressStyleModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "ProgressStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "StyleView",
+ "bar_color": null,
+ "description_width": ""
+ }
+ },
+ "e928540e99564d808cb2d12c92daa498": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "1.5.0",
+ "model_name": "HTMLModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "HTMLModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "1.5.0",
+ "_view_name": "HTMLView",
+ "description": "",
+ "description_tooltip": null,
+ "layout": "IPY_MODEL_c9974003727a401797953ef2885db5a2",
+ "placeholder": "",
+ "style": "IPY_MODEL_77a361d1ff214e8799891bbeb28a0789",
+ "value": "Downloading: 100%"
+ }
+ },
+ "e98cf7a63c814ffd94f69928f0700ebf": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "1.5.0",
+ "model_name": "HTMLModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "HTMLModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "1.5.0",
+ "_view_name": "HTMLView",
+ "description": "",
+ "description_tooltip": null,
+ "layout": "IPY_MODEL_aba21021d3bb4565a58ffa40049810db",
+ "placeholder": "",
+ "style": "IPY_MODEL_f7812fa7fbf744c1b261b985d085e28e",
+ "value": "Downloading: 100%"
+ }
+ },
+ "ea95ffd922c0455d957120f034e541f8": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "1.2.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "1.2.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "overflow_x": null,
+ "overflow_y": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "eae11f84c2644ada8295b445c924baec": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "1.5.0",
+ "model_name": "HTMLModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "HTMLModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "1.5.0",
+ "_view_name": "HTMLView",
+ "description": "",
+ "description_tooltip": null,
+ "layout": "IPY_MODEL_2cbf0faadd4842c8b22e10541ff9de4e",
+ "placeholder": "",
+ "style": "IPY_MODEL_ab32c7daa1d9404fb921f39fbc4fc05c",
+ "value": "Downloading: 100%"
+ }
+ },
+ "ebc9801e164a44b3b6f8dc7f590e1c79": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "1.2.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "1.2.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "overflow_x": null,
+ "overflow_y": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "ecd8e5e364d34ea8bfbba4fbd467384d": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "1.5.0",
+ "model_name": "HTMLModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "HTMLModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "1.5.0",
+ "_view_name": "HTMLView",
+ "description": "",
+ "description_tooltip": null,
+ "layout": "IPY_MODEL_554e567a83b348f88092c6ba01830930",
+ "placeholder": "",
+ "style": "IPY_MODEL_6e334cad2e94462cae6e722bd6f11a9e",
+ "value": "Downloading: 100%"
+ }
+ },
+ "ed169fd606274f2ebbb3e8f32ab42431": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "1.2.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "1.2.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "overflow_x": null,
+ "overflow_y": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "ee537ee5470f4d7b816a8c8f96948b4d": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "1.2.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "1.2.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "overflow_x": null,
+ "overflow_y": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "f21c0c6379d74898ac6aadcb6fc14a8a": {
+ "model_module": "@jupyter-widgets/base",
+ "model_module_version": "1.2.0",
+ "model_name": "LayoutModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "1.2.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "overflow_x": null,
+ "overflow_y": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "f439c1de68ac4c799d81fdb29d053d10": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "1.5.0",
+ "model_name": "DescriptionStyleModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "DescriptionStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "StyleView",
+ "description_width": ""
+ }
+ },
+ "f7812fa7fbf744c1b261b985d085e28e": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "1.5.0",
+ "model_name": "DescriptionStyleModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "DescriptionStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "StyleView",
+ "description_width": ""
+ }
+ },
+ "f91dcd9f30c743d69f9d4b7e8d1beba5": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "1.5.0",
+ "model_name": "HTMLModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "HTMLModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "1.5.0",
+ "_view_name": "HTMLView",
+ "description": "",
+ "description_tooltip": null,
+ "layout": "IPY_MODEL_2b2d7912186a49dd9891ae12c77482c7",
+ "placeholder": "",
+ "style": "IPY_MODEL_1600b9cd09c446e581b7912e35c9f56e",
+ "value": " 83.3M/83.3M [00:01<00:00, 60.9MB/s]"
+ }
+ },
+ "fc9a3c4ae0a947ec91a227360a80f602": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "1.5.0",
+ "model_name": "FloatProgressModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "FloatProgressModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "1.5.0",
+ "_view_name": "ProgressView",
+ "bar_style": "success",
+ "description": "",
+ "description_tooltip": null,
+ "layout": "IPY_MODEL_27f6f437c5264368bc2c679942ad1e53",
+ "max": 83316686,
+ "min": 0,
+ "orientation": "horizontal",
+ "style": "IPY_MODEL_e7728d9c55e44274966f8f6dbc445c54",
+ "value": 83316686
+ }
+ },
+ "fd47487fc8734594823f8afa00c4239d": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "1.5.0",
+ "model_name": "DescriptionStyleModel",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "DescriptionStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "StyleView",
+ "description_width": ""
+ }
+ },
+ "fee75343289f42fb8d6dfb4bf26fe368": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_module_version": "1.5.0",
+ "model_name": "HTMLModel",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "HTMLModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "1.5.0",
+ "_view_name": "HTMLView",
+ "description": "",
+ "description_tooltip": null,
+ "layout": "IPY_MODEL_16c0017f65b649f5ac5bebf1c955a1fd",
+ "placeholder": "",
+ "style": "IPY_MODEL_5e2c207db5424f91829bf5c52040a9f2",
+ "value": " 1.92k/1.92k [00:00<00:00, 48.3kB/s]"
+ }
}
- },
- "nbformat": 4,
- "nbformat_minor": 0
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
}
From 26ca0514394ee8faef044b1da0fcf99e53cc22c1 Mon Sep 17 00:00:00 2001
From: Hervé BREDIN
Date: Mon, 16 Oct 2023 15:32:27 +0200
Subject: [PATCH 02/19] feat: add `TimingHook` and list of `Hooks` (#1503)
---
CHANGELOG.md | 6 ++
.../audio/pipelines/speaker_diarization.py | 3 +
pyannote/audio/pipelines/utils/hook.py | 89 ++++++++++++++++++-
3 files changed, 95 insertions(+), 3 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index d63c193d0..b9805e758 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,11 @@
# Changelog
+## `develop` branch
+
+ - feat(pipeline): add `TimingHook` for profiling processing time
+ - feat(pipeline): add support for list of hooks with `Hooks`
+ - fix(pipeline): add missing "embedding" hook call in `SpeakerDiarization`
+
## Version 3.0.1 (2023-09-28)
- fix(pipeline): fix WeSpeaker GPU support
diff --git a/pyannote/audio/pipelines/speaker_diarization.py b/pyannote/audio/pipelines/speaker_diarization.py
index 18b6565d3..d5cf04e05 100644
--- a/pyannote/audio/pipelines/speaker_diarization.py
+++ b/pyannote/audio/pipelines/speaker_diarization.py
@@ -332,6 +332,9 @@ def iter_waveform_and_mask():
embedding_batches = []
+ if hook is not None:
+ hook("embeddings", None, total=batch_count, completed=0)
+
for i, batch in enumerate(batches, 1):
waveforms, masks = zip(*filter(lambda b: b[0] is not None, batch))
diff --git a/pyannote/audio/pipelines/utils/hook.py b/pyannote/audio/pipelines/utils/hook.py
index cb150ea4a..86ecf1ec1 100644
--- a/pyannote/audio/pipelines/utils/hook.py
+++ b/pyannote/audio/pipelines/utils/hook.py
@@ -20,6 +20,7 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
+import time
from copy import deepcopy
from typing import Any, Mapping, Optional, Text
@@ -64,11 +65,9 @@ class ProgressHook:
"""
def __init__(self, transient: bool = False):
- super().__init__()
self.transient = transient
def __enter__(self):
-
self.progress = Progress(
TextColumn("[progress.description]{task.description}"),
BarColumn(),
@@ -90,7 +89,6 @@ def __call__(
total: Optional[int] = None,
completed: Optional[int] = None,
):
-
if completed is None:
completed = total = 1
@@ -103,3 +101,88 @@ def __call__(
# force refresh when completed
if completed >= total:
self.progress.refresh()
+
+
+class TimingHook:
+ """Hook to compute processing time of internal steps
+
+ Parameters
+ ----------
+ file_key: str, optional
+ Key used to store processing time in `file`.
+ Defaults to "timing_hook".
+
+ Usage
+ -----
+ >>> with TimingHook() as hook:
+ ... output = pipeline(file, hook=hook)
+ # file["timing_hook"] contains processing time for each step
+ """
+
+ def __init__(self, file_key: str = "timing_hook"):
+ self.file_key = file_key
+
+ def __enter__(self):
+ self._pipeline_start_time = time.time()
+ self._start_time = dict()
+ self._end_time = dict()
+ return self
+
+ def __exit__(self, *args):
+ _pipeline_end_time = time.time()
+ processing_time = dict()
+ processing_time["total"] = _pipeline_end_time - self._pipeline_start_time
+ for step_name, _start_time in self._start_time.items():
+ _end_time = self._end_time[step_name]
+ processing_time[step_name] = _end_time - _start_time
+
+ self._file[self.file_key] = processing_time
+
+ def __call__(
+ self,
+ step_name: Text,
+ step_artifact: Any,
+ file: Optional[Mapping] = None,
+ total: Optional[int] = None,
+ completed: Optional[int] = None,
+ ):
+ if not hasattr(self, "_file"):
+ self._file = file
+
+ if completed is None:
+ return
+
+ if completed == 0:
+ self._start_time[step_name] = time.time()
+
+ if completed >= total:
+ self._end_time[step_name] = time.time()
+
+
+class Hooks:
+ """List of hooks
+
+ Usage
+ -----
+ >>> with Hooks(ProgressHook(), TimingHook()) as hook:
+ ... output = pipeline("audio.wav", hook=hook)
+
+ """
+
+ def __init__(self, *hooks):
+ self.hooks = hooks
+
+ def __enter__(self):
+ for hook in self.hooks:
+ if hasattr(hook, "__enter__"):
+ hook.__enter__()
+ return self
+
+ def __exit__(self, *args):
+ for hook in self.hooks:
+ if hasattr(hook, "__exit__"):
+ hook.__exit__(*args)
+
+ def __call__(self, *args: Any, **kwds: Any) -> Any:
+ for hook in self.hooks:
+ hook(*args, **kwds)
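A minimal usage sketch of the `TimingHook` and `Hooks` classes added by this patch, assuming pyannote.audio 3.x, a valid Hugging Face token, and a local `audio.wav`; the checkpoint name is only an example, not something the patch prescribes:

    from pyannote.audio import Pipeline
    from pyannote.audio.pipelines.utils.hook import Hooks, ProgressHook, TimingHook

    pipeline = Pipeline.from_pretrained(
        "pyannote/speaker-diarization-3.0", use_auth_token="YOUR_HF_TOKEN"
    )

    # pass a dict (rather than a plain path) and keep a reference to it,
    # so that the timings written into it remain accessible afterwards
    file = {"audio": "audio.wav"}

    with Hooks(ProgressHook(), TimingHook()) as hook:
        diarization = pipeline(file, hook=hook)

    # with this patch, per-step durations (in seconds) end up under the
    # default key "timing_hook"
    print(file["timing_hook"])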
From 4b09aefe4093a681b258626523a1408d754dd90c Mon Sep 17 00:00:00 2001
From: Hervé BREDIN
Date: Sat, 21 Oct 2023 16:11:03 +0200
Subject: [PATCH 03/19] feat(pipeline): add ArtifactHook for saving internal
steps (#1511)
---
CHANGELOG.md | 169 +++++++++++++------------
pyannote/audio/pipelines/utils/hook.py | 57 +++++++--
2 files changed, 128 insertions(+), 98 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index b9805e758..fcdebb82c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,137 +2,138 @@
## `develop` branch
- - feat(pipeline): add `TimingHook` for profiling processing time
- - feat(pipeline): add support for list of hooks with `Hooks`
- - fix(pipeline): add missing "embedding" hook call in `SpeakerDiarization`
+- feat(pipeline): add `TimingHook` for profiling processing time
+- feat(pipeline): add `ArtifactHook` for saving internal steps
+- feat(pipeline): add support for list of hooks with `Hooks`
+- BREAKING(pipeline): remove `logging_hook` (use `ArtifactHook` instead)
+- fix(pipeline): add missing "embedding" hook call in `SpeakerDiarization`
## Version 3.0.1 (2023-09-28)
- - fix(pipeline): fix WeSpeaker GPU support
+- fix(pipeline): fix WeSpeaker GPU support
## Version 3.0.0 (2023-09-26)
### Features and improvements
- - feat(pipeline): send pipeline to device with `pipeline.to(device)`
- - feat(pipeline): add `return_embeddings` option to `SpeakerDiarization` pipeline
- - feat(pipeline): make `segmentation_batch_size` and `embedding_batch_size` mutable in `SpeakerDiarization` pipeline (they now default to `1`)
- - feat(pipeline): add progress hook to pipelines
- - feat(task): add [powerset](https://www.isca-speech.org/archive/interspeech_2023/plaquet23_interspeech.html) support to `SpeakerDiarization` task
- - feat(task): add support for multi-task models
- - feat(task): add support for label scope in speaker diarization task
- - feat(task): add support for missing classes in multi-label segmentation task
- - feat(model): add segmentation model based on torchaudio self-supervised representation
- - feat(pipeline): check version compatibility at load time
- - improve(task): load metadata as tensors rather than pyannote.core instances
- - improve(task): improve error message on missing specifications
+- feat(pipeline): send pipeline to device with `pipeline.to(device)`
+- feat(pipeline): add `return_embeddings` option to `SpeakerDiarization` pipeline
+- feat(pipeline): make `segmentation_batch_size` and `embedding_batch_size` mutable in `SpeakerDiarization` pipeline (they now default to `1`)
+- feat(pipeline): add progress hook to pipelines
+- feat(task): add [powerset](https://www.isca-speech.org/archive/interspeech_2023/plaquet23_interspeech.html) support to `SpeakerDiarization` task
+- feat(task): add support for multi-task models
+- feat(task): add support for label scope in speaker diarization task
+- feat(task): add support for missing classes in multi-label segmentation task
+- feat(model): add segmentation model based on torchaudio self-supervised representation
+- feat(pipeline): check version compatibility at load time
+- improve(task): load metadata as tensors rather than pyannote.core instances
+- improve(task): improve error message on missing specifications
### Breaking changes
- - BREAKING(task): rename `Segmentation` task to `SpeakerDiarization`
- - BREAKING(pipeline): pipeline defaults to CPU (use `pipeline.to(device)`)
- - BREAKING(pipeline): remove `SpeakerSegmentation` pipeline (use `SpeakerDiarization` pipeline)
- - BREAKING(pipeline): remove `segmentation_duration` parameter from `SpeakerDiarization` pipeline (defaults to `duration` of segmentation model)
- - BREAKING(task): remove support for variable chunk duration for segmentation tasks
- - BREAKING(pipeline): remove support for `FINCHClustering` and `HiddenMarkovModelClustering`
- - BREAKING(setup): drop support for Python 3.7
- - BREAKING(io): channels are now 0-indexed (used to be 1-indexed)
- - BREAKING(io): multi-channel audio is no longer downmixed to mono by default.
- You should update how `pyannote.audio.core.io.Audio` is instantiated:
- * replace `Audio()` by `Audio(mono="downmix")`;
- * replace `Audio(mono=True)` by `Audio(mono="downmix")`;
- * replace `Audio(mono=False)` by `Audio()`.
- - BREAKING(model): get rid of (flaky) `Model.introspection`
- If, for some weird reason, you wrote some custom code based on that,
- you should instead rely on `Model.example_output`.
- - BREAKING(interactive): remove support for Prodigy recipes
-
+- BREAKING(task): rename `Segmentation` task to `SpeakerDiarization`
+- BREAKING(pipeline): pipeline defaults to CPU (use `pipeline.to(device)`)
+- BREAKING(pipeline): remove `SpeakerSegmentation` pipeline (use `SpeakerDiarization` pipeline)
+- BREAKING(pipeline): remove `segmentation_duration` parameter from `SpeakerDiarization` pipeline (defaults to `duration` of segmentation model)
+- BREAKING(task): remove support for variable chunk duration for segmentation tasks
+- BREAKING(pipeline): remove support for `FINCHClustering` and `HiddenMarkovModelClustering`
+- BREAKING(setup): drop support for Python 3.7
+- BREAKING(io): channels are now 0-indexed (used to be 1-indexed)
+- BREAKING(io): multi-channel audio is no longer downmixed to mono by default.
+ You should update how `pyannote.audio.core.io.Audio` is instantiated:
+ - replace `Audio()` by `Audio(mono="downmix")`;
+ - replace `Audio(mono=True)` by `Audio(mono="downmix")`;
+ - replace `Audio(mono=False)` by `Audio()`.
+- BREAKING(model): get rid of (flaky) `Model.introspection`
+ If, for some weird reason, you wrote some custom code based on that,
+ you should instead rely on `Model.example_output`.
+- BREAKING(interactive): remove support for Prodigy recipes
### Fixes and improvements
- - fix(pipeline): fix reproducibility issue with Ampere CUDA devices
- - fix(pipeline): fix support for IOBase audio
- - fix(pipeline): fix corner case with no speaker
- - fix(train): prevent metadata preparation to happen twice
- - fix(task): fix support for "balance" option
- - improve(task): shorten and improve structure of Tensorboard tags
+- fix(pipeline): fix reproducibility issue with Ampere CUDA devices
+- fix(pipeline): fix support for IOBase audio
+- fix(pipeline): fix corner case with no speaker
+- fix(train): prevent metadata preparation to happen twice
+- fix(task): fix support for "balance" option
+- improve(task): shorten and improve structure of Tensorboard tags
### Dependencies update
- - setup: switch to torch 2.0+, torchaudio 2.0+, soundfile 0.12+, lightning 2.0+, torchmetrics 0.11+
- - setup: switch to pyannote.core 5.0+, pyannote.database 5.0+, and pyannote.pipeline 3.0+
- - setup: switch to speechbrain 0.5.14+
+- setup: switch to torch 2.0+, torchaudio 2.0+, soundfile 0.12+, lightning 2.0+, torchmetrics 0.11+
+- setup: switch to pyannote.core 5.0+, pyannote.database 5.0+, and pyannote.pipeline 3.0+
+- setup: switch to speechbrain 0.5.14+
## Version 2.1.1 (2022-10-27)
- - BREAKING(pipeline): rewrite speaker diarization pipeline
- - feat(pipeline): add option to optimize for DER variant
- - feat(clustering): add support for NeMo speaker embedding
- - feat(clustering): add FINCH clustering
- - feat(clustering): add min_cluster_size hparams to AgglomerativeClustering
- - feat(hub): add support for private/gated models
- - setup(hub): switch to latest huggingface_hub API
- - fix(pipeline): fix support for missing reference in Resegmentation pipeline
- - fix(clustering): fix corner case where HMM.fit finds too few states
+- BREAKING(pipeline): rewrite speaker diarization pipeline
+- feat(pipeline): add option to optimize for DER variant
+- feat(clustering): add support for NeMo speaker embedding
+- feat(clustering): add FINCH clustering
+- feat(clustering): add min_cluster_size hparams to AgglomerativeClustering
+- feat(hub): add support for private/gated models
+- setup(hub): switch to latest huggingface_hub API
+- fix(pipeline): fix support for missing reference in Resegmentation pipeline
+- fix(clustering): fix corner case where HMM.fit finds too few states
## Version 2.0.1 (2022-07-20)
- - BREAKING: complete rewrite
- - feat: much better performance
- - feat: Python-first API
- - feat: pretrained pipelines (and models) on Huggingface model hub
- - feat: multi-GPU training with pytorch-lightning
- - feat: data augmentation with torch-audiomentations
- - feat: Prodigy recipe for model-assisted audio annotation
+- BREAKING: complete rewrite
+- feat: much better performance
+- feat: Python-first API
+- feat: pretrained pipelines (and models) on Huggingface model hub
+- feat: multi-GPU training with pytorch-lightning
+- feat: data augmentation with torch-audiomentations
+- feat: Prodigy recipe for model-assisted audio annotation
## Version 1.1.2 (2021-01-28)
- - fix: make sure master branch is used to load pretrained models (#599)
+- fix: make sure master branch is used to load pretrained models (#599)
## Version 1.1 (2020-11-08)
- - last release before complete rewriting
+- last release before complete rewriting
## Version 1.0.1 (2018-07-19)
- - fix: fix regression in Precomputed.__call__ (#110, #105)
+- fix: fix regression in `Precomputed.__call__` (#110, #105)
## Version 1.0 (2018-07-03)
- - chore: switch from keras to pytorch (with tensorboard support)
- - improve: faster & better training (`AutoLR`, advanced learning rate schedulers, improved batch generators)
- - feat: add tunable speaker diarization pipeline (with its own tutorial)
- - chore: drop support for Python 2 (use Python 3.6 or later)
+- chore: switch from keras to pytorch (with tensorboard support)
+- improve: faster & better training (`AutoLR`, advanced learning rate schedulers, improved batch generators)
+- feat: add tunable speaker diarization pipeline (with its own tutorial)
+- chore: drop support for Python 2 (use Python 3.6 or later)
## Version 0.3.1 (2017-07-06)
- - feat: add python 3 support
- - chore: rewrite neural speaker embedding using autograd
- - feat: add new embedding architectures
- - feat: add new embedding losses
- - chore: switch to Keras 2
- - doc: add tutorial for (MFCC) feature extraction
- - doc: add tutorial for (LSTM-based) speech activity detection
- - doc: add tutorial for (LSTM-based) speaker change detection
- - doc: add tutorial for (TristouNet) neural speaker embedding
+- feat: add python 3 support
+- chore: rewrite neural speaker embedding using autograd
+- feat: add new embedding architectures
+- feat: add new embedding losses
+- chore: switch to Keras 2
+- doc: add tutorial for (MFCC) feature extraction
+- doc: add tutorial for (LSTM-based) speech activity detection
+- doc: add tutorial for (LSTM-based) speaker change detection
+- doc: add tutorial for (TristouNet) neural speaker embedding
## Version 0.2.1 (2017-03-28)
- - feat: add LSTM-based speech activity detection
- - feat: add LSTM-based speaker change detection
- - improve: refactor LSTM-based speaker embedding
- - feat: add librosa basic support
- - feat: add SMORMS3 optimizer
+- feat: add LSTM-based speech activity detection
+- feat: add LSTM-based speaker change detection
+- improve: refactor LSTM-based speaker embedding
+- feat: add librosa basic support
+- feat: add SMORMS3 optimizer
## Version 0.1.4 (2016-09-26)
- - feat: add 'covariance_type' option to BIC segmentation
+- feat: add 'covariance_type' option to BIC segmentation
## Version 0.1.3 (2016-09-23)
- - chore: rename sequence generator in preparation of the release of
- TristouNet reproducible research package.
+- chore: rename sequence generator in preparation of the release of
+ TristouNet reproducible research package.
## Version 0.1.2 (2016-09-22)
- - first public version
+- first public version
diff --git a/pyannote/audio/pipelines/utils/hook.py b/pyannote/audio/pipelines/utils/hook.py
index 86ecf1ec1..fc6e56734 100644
--- a/pyannote/audio/pipelines/utils/hook.py
+++ b/pyannote/audio/pipelines/utils/hook.py
@@ -33,20 +33,49 @@
)
-def logging_hook(
- step_name: Text,
- step_artifact: Any,
- file: Optional[Mapping] = None,
- completed: Optional[int] = None,
- total: Optional[int] = None,
-):
- """Hook to save step_artifact as file[step_name]
-
- Useful for debugging purposes
+class ArtifactHook:
+ """Hook to save artifacts of each internal step
+
+ Parameters
+ ----------
+ artifacts: list of str, optional
+ List of steps to save. Defaults to all steps.
+ file_key: str, optional
+ Key used to store artifacts in `file`.
+ Defaults to "artifact".
+
+ Usage
+ -----
+ >>> with ArtifactHook() as hook:
+ ... output = pipeline(file, hook=hook)
+ # file["artifact"] contains a dict with artifacts of each step
+
"""
- if completed is None:
- file[step_name] = deepcopy(step_artifact)
+ def __init__(self, *artifacts, file_key: str = "artifact"):
+ self.artifacts = artifacts
+ self.file_key = file_key
+
+ def __enter__(self):
+ return self
+
+ def __exit__(self, *args):
+ pass
+
+ def __call__(
+ self,
+ step_name: Text,
+ step_artifact: Any,
+ file: Optional[Mapping] = None,
+ total: Optional[int] = None,
+ completed: Optional[int] = None,
+ ):
+ if (step_artifact is None) or (
+ self.artifacts and step_name not in self.artifacts
+ ):
+ return
+
+ file.setdefault(self.file_key, dict())[step_name] = deepcopy(step_artifact)
class ProgressHook:
@@ -119,7 +148,7 @@ class TimingHook:
# file["timing_hook"] contains processing time for each step
"""
- def __init__(self, file_key: str = "timing_hook"):
+ def __init__(self, file_key: str = "timing"):
self.file_key = file_key
def __enter__(self):
@@ -164,7 +193,7 @@ class Hooks:
Usage
-----
- >>> with Hooks(ProgressHook(), TimingHook()) as hook:
+ >>> with Hooks(ProgressHook(), TimingHook(), ArtifactHook()) as hook:
... output = pipeline("audio.wav", hook=hook)
"""
From 03f826535995648d5205f5ec22203e2e3006ed61 Mon Sep 17 00:00:00 2001
From: Hervé BREDIN
Date: Sun, 22 Oct 2023 12:04:30 +0200
Subject: [PATCH 04/19] fix: fix of list of Hooks (#1514)
---
pyannote/audio/pipelines/utils/hook.py | 11 +++++++++--
1 file changed, 9 insertions(+), 2 deletions(-)
diff --git a/pyannote/audio/pipelines/utils/hook.py b/pyannote/audio/pipelines/utils/hook.py
index fc6e56734..2a675d1c9 100644
--- a/pyannote/audio/pipelines/utils/hook.py
+++ b/pyannote/audio/pipelines/utils/hook.py
@@ -212,6 +212,13 @@ def __exit__(self, *args):
if hasattr(hook, "__exit__"):
hook.__exit__(*args)
- def __call__(self, *args: Any, **kwds: Any) -> Any:
+ def __call__(
+ self,
+ step_name: Text,
+ step_artifact: Any,
+ file: Optional[Mapping] = None,
+ total: Optional[int] = None,
+ completed: Optional[int] = None,
+ ):
for hook in self.hooks:
- hook(*args, **kwds)
+ hook(step_name, step_artifact, file=file, total=total, completed=completed)
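This fix makes `Hooks` forward the exact keyword signature that every hook is expected to implement. For reference, a toy custom hook following that convention (not part of the patch) could look like this:

    from typing import Any, Mapping, Optional, Text

    class PrintHook:
        """Print a line whenever a step completes (illustrative only)."""

        def __call__(
            self,
            step_name: Text,
            step_artifact: Any,
            file: Optional[Mapping] = None,
            total: Optional[int] = None,
            completed: Optional[int] = None,
        ):
            if completed is not None and total is not None and completed >= total:
                print(f"{step_name}: done ({completed}/{total})")

    # __enter__ and __exit__ are optional: Hooks only calls them when they exist,
    # so PrintHook can be combined with the built-in hooks as-is:
    # with Hooks(PrintHook(), TimingHook()) as hook:
    #     diarization = pipeline(file, hook=hook)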
From 0b45103cb228a81a9d9d776cca92694cb30ddb41 Mon Sep 17 00:00:00 2001
From: Hervé BREDIN
Date: Sun, 22 Oct 2023 16:44:32 +0200
Subject: [PATCH 05/19] feat(utils): add "soft" option to
Powerset.to_multilabel conversion (#1516)
---
CHANGELOG.md | 1 +
pyannote/audio/utils/powerset.py | 22 ++++++++++++++--------
2 files changed, 15 insertions(+), 8 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index fcdebb82c..c63e65bea 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,7 @@
- feat(pipeline): add support for list of hooks with `Hooks`
- BREAKING(pipeline): remove `logging_hook` (use `ArtifactHook` instead)
- fix(pipeline): add missing "embedding" hook call in `SpeakerDiarization`
+- feat(utils): add `"soft"` option to `Powerset.to_multilabel`
## Version 3.0.1 (2023-09-28)
diff --git a/pyannote/audio/utils/powerset.py b/pyannote/audio/utils/powerset.py
index 0f5cfb5bc..810519829 100644
--- a/pyannote/audio/utils/powerset.py
+++ b/pyannote/audio/utils/powerset.py
@@ -84,26 +84,32 @@ def build_cardinality(self) -> torch.Tensor:
powerset_k += 1
return cardinality
- def to_multilabel(self, powerset: torch.Tensor) -> torch.Tensor:
- """Convert predictions from (soft) powerset to (hard) multi-label
+ def to_multilabel(self, powerset: torch.Tensor, soft: bool = False) -> torch.Tensor:
+ """Convert predictions from powerset to multi-label
Parameter
---------
powerset : (batch_size, num_frames, num_powerset_classes) torch.Tensor
Soft predictions in "powerset" space.
+ soft : bool, optional
+ Return soft multi-label predictions. Defaults to False (i.e. hard predictions)
+ Assumes that `powerset` are "logits" (not "probabilities").
Returns
-------
multi_label : (batch_size, num_frames, num_classes) torch.Tensor
- Hard predictions in "multi-label" space.
+ Predictions in "multi-label" space.
"""
- hard_powerset = torch.nn.functional.one_hot(
- torch.argmax(powerset, dim=-1),
- self.num_powerset_classes,
- ).float()
+ if soft:
+ powerset_probs = torch.exp(powerset)
+ else:
+ powerset_probs = torch.nn.functional.one_hot(
+ torch.argmax(powerset, dim=-1),
+ self.num_powerset_classes,
+ ).float()
- return torch.matmul(hard_powerset, self.mapping)
+ return torch.matmul(powerset_probs, self.mapping)
def forward(self, powerset: torch.Tensor) -> torch.Tensor:
"""Alias for `to_multilabel`"""
From 40fa67b1ead65d08963b9cfd85541cd833c072ec Mon Sep 17 00:00:00 2001
From: Hervé BREDIN
Date: Tue, 7 Nov 2023 09:38:45 +0100
Subject: [PATCH 06/19] fix(pipeline): compute fbank on selected device (#1529)
---
CHANGELOG.md | 1 +
pyannote/audio/pipelines/speaker_verification.py | 7 ++++---
2 files changed, 5 insertions(+), 3 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index c63e65bea..f52704595 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,7 @@
- BREAKING(pipeline): remove `logging_hook` (use `ArtifactHook` instead)
- fix(pipeline): add missing "embedding" hook call in `SpeakerDiarization`
- feat(utils): add `"soft"` option to `Powerset.to_multilabel`
+- improve(pipeline): compute `fbank` on GPU when requested
## Version 3.0.1 (2023-09-28)
diff --git a/pyannote/audio/pipelines/speaker_verification.py b/pyannote/audio/pipelines/speaker_verification.py
index 6b39679dd..8a36fc70c 100644
--- a/pyannote/audio/pipelines/speaker_verification.py
+++ b/pyannote/audio/pipelines/speaker_verification.py
@@ -556,6 +556,7 @@ def compute_fbank(
for waveform in waveforms
]
)
+
return features - torch.mean(features, dim=1, keepdim=True)
def __call__(
@@ -578,12 +579,12 @@ def __call__(
batch_size, num_channels, num_samples = waveforms.shape
assert num_channels == 1
- features = self.compute_fbank(waveforms)
+ features = self.compute_fbank(waveforms.to(self.device))
_, num_frames, _ = features.shape
if masks is None:
embeddings = self.session_.run(
- output_names=["embs"], input_feed={"feats": features.numpy()}
+ output_names=["embs"], input_feed={"feats": features.numpy(force=True)}
)[0]
return embeddings
@@ -606,7 +607,7 @@ def __call__(
embeddings[f] = self.session_.run(
output_names=["embs"],
- input_feed={"feats": masked_feature.numpy()[None]},
+ input_feed={"feats": masked_feature.numpy(force=True)[None]},
)[0][0]
return embeddings
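A quick illustration of the `.numpy(force=True)` calls above (available since torch 1.13): once fbank features are computed on `self.device`, a plain `.numpy()` would fail for CUDA or MPS tensors, while `force=True` copies them back to CPU first:

    import torch

    features = torch.randn(2, 100, 80)
    if torch.cuda.is_available():
        features = features.to("cuda")

    feats = features.numpy(force=True)  # detaches and copies to CPU if needed
    print(type(feats), feats.shape)     # <class 'numpy.ndarray'> (2, 100, 80)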
From 0e1a726ad2141bc6c4cc9672a2899ac2bb73fe73 Mon Sep 17 00:00:00 2001
From: Olivier <22832930+olvb@users.noreply.github.com>
Date: Tue, 7 Nov 2023 12:02:41 +0100
Subject: [PATCH 07/19] fix(pipeline): fix `AgglomerativeClustering` to honor
`num_clusters` when provided
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Co-authored-by: Olivier Birot
Co-authored-by: Hervé BREDIN
---
CHANGELOG.md | 1 +
pyannote/audio/pipelines/clustering.py | 3 ++-
tests/test_clustering.py | 29 ++++++++++++++++++++++++++
3 files changed, 32 insertions(+), 1 deletion(-)
create mode 100644 tests/test_clustering.py
diff --git a/CHANGELOG.md b/CHANGELOG.md
index f52704595..cb025c0bd 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,7 @@
- fix(pipeline): add missing "embedding" hook call in `SpeakerDiarization`
- feat(utils): add `"soft"` option to `Powerset.to_multilabel`
- improve(pipeline): compute `fbank` on GPU when requested
+- fix(pipeline): fix `AgglomerativeClustering` to honor `num_clusters` when provided
## Version 3.0.1 (2023-09-28)
diff --git a/pyannote/audio/pipelines/clustering.py b/pyannote/audio/pipelines/clustering.py
index a779016cb..c51cdcc50 100644
--- a/pyannote/audio/pipelines/clustering.py
+++ b/pyannote/audio/pipelines/clustering.py
@@ -386,7 +386,8 @@ def cluster(
elif num_large_clusters > max_clusters:
num_clusters = max_clusters
- if num_clusters is not None:
+ # look for perfect candidate if necessary
+ if num_clusters is not None and num_large_clusters != num_clusters:
# switch stopping criterion from "inter-cluster distance" stopping to "iteration index"
_dendrogram = np.copy(dendrogram)
_dendrogram[:, 2] = np.arange(num_embeddings - 1)
diff --git a/tests/test_clustering.py b/tests/test_clustering.py
new file mode 100644
index 000000000..535da47de
--- /dev/null
+++ b/tests/test_clustering.py
@@ -0,0 +1,29 @@
+import numpy as np
+
+from pyannote.audio.pipelines.clustering import AgglomerativeClustering
+
+
+def test_agglomerative_clustering_num_cluster():
+ """
+ Make sure AgglomerativeClustering doesn't "over-merge" clusters when initial
+ clustering already matches target num_clusters, cf
+ https://github.com/pyannote/pyannote-audio/issues/1525
+ """
+
+ # 2 embeddings different enough
+ embeddings = np.array([[1.0, 1.0, 1.0, 1.0], [1.0, 2.0, 1.0, 2.0]])
+
+ # clustering with params that should yield 1 cluster per embedding
+ clustering = AgglomerativeClustering().instantiate(
+ {
+ "method": "centroid",
+ "min_cluster_size": 0,
+ "threshold": 0.0,
+ }
+ )
+
+ # request 2 clusters
+ clusters = clustering.cluster(
+ embeddings=embeddings, min_clusters=2, max_clusters=2, num_clusters=2
+ )
+ assert np.array_equal(clusters, np.array([0, 1]))
From fb1c5c4bce604ecc2cf383f6669d13f1058c21c6 Mon Sep 17 00:00:00 2001
From: Hervé BREDIN
Date: Fri, 10 Nov 2023 15:58:52 +0100
Subject: [PATCH 08/19] improve: add support for "soft" option in
Powerset.forward
---
pyannote/audio/utils/powerset.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/pyannote/audio/utils/powerset.py b/pyannote/audio/utils/powerset.py
index 810519829..b75221e48 100644
--- a/pyannote/audio/utils/powerset.py
+++ b/pyannote/audio/utils/powerset.py
@@ -111,9 +111,9 @@ def to_multilabel(self, powerset: torch.Tensor, soft: bool = False) -> torch.Ten
return torch.matmul(powerset_probs, self.mapping)
- def forward(self, powerset: torch.Tensor) -> torch.Tensor:
+ def forward(self, powerset: torch.Tensor, soft: bool = False) -> torch.Tensor:
"""Alias for `to_multilabel`"""
- return self.to_multilabel(powerset)
+ return self.to_multilabel(powerset, soft=soft)
def to_powerset(self, multilabel: torch.Tensor) -> torch.Tensor:
"""Convert (hard) predictions from multi-label to powerset
From e0544b8ce16481001beac195275de55e4525521a Mon Sep 17 00:00:00 2001
From: Hervé BREDIN
Date: Fri, 10 Nov 2023 16:00:27 +0100
Subject: [PATCH 09/19] feat: use strings for track identifiers
---
pyannote/audio/pipelines/utils/diarization.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pyannote/audio/pipelines/utils/diarization.py b/pyannote/audio/pipelines/utils/diarization.py
index f494c6073..91413350b 100644
--- a/pyannote/audio/pipelines/utils/diarization.py
+++ b/pyannote/audio/pipelines/utils/diarization.py
@@ -197,7 +197,7 @@ def to_annotation(
min_duration_off=min_duration_off,
)
- return binarize(discrete_diarization)
+ return binarize(discrete_diarization).rename_tracks(generator="string")
@staticmethod
def to_diarization(
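For context, a tiny pyannote.core sketch of what `rename_tracks(generator="string")` changes: track identifiers become letters instead of integers, while segments and labels are left untouched:

    from pyannote.core import Annotation, Segment

    annotation = Annotation(uri="demo")
    annotation[Segment(0.0, 1.0), 0] = "SPEAKER_00"   # integer track names
    annotation[Segment(0.5, 2.0), 1] = "SPEAKER_01"

    renamed = annotation.rename_tracks(generator="string")
    for segment, track in renamed.itertracks():
        print(segment, track)   # tracks are now "A", "B", ...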
From 2787f123cc408dfc6a43fbf437d725fd93b7c537 Mon Sep 17 00:00:00 2001
From: Hervé BREDIN
Date: Mon, 13 Nov 2023 17:12:32 +0100
Subject: [PATCH 10/19] BREAKING(setup): prepare for getting rid of ONNX
runtime (#1541)
---
CHANGELOG.md | 3 +++
pyannote/audio/pipelines/speaker_verification.py | 16 +++++++++++-----
requirements.txt | 1 -
3 files changed, 14 insertions(+), 6 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index cb025c0bd..8cd56ad36 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,9 @@
- feat(utils): add `"soft"` option to `Powerset.to_multilabel`
- improve(pipeline): compute `fbank` on GPU when requested
- fix(pipeline): fix `AgglomerativeClustering` to honor `num_clusters` when provided
+- BREAKING(pipeline): rename `WeSpeakerPretrainedSpeakerEmbedding` to `ONNXWeSpeakerPretrainedSpeakerEmbedding`
+- BREAKING(setup): remove `onnxruntime` dependency.
+ You can still use ONNX `hbredin/wespeaker-voxceleb-resnet34-LM` but you will have to install `onnxruntime` yourself.
## Version 3.0.1 (2023-09-28)
diff --git a/pyannote/audio/pipelines/speaker_verification.py b/pyannote/audio/pipelines/speaker_verification.py
index 8a36fc70c..9f10f6e51 100644
--- a/pyannote/audio/pipelines/speaker_verification.py
+++ b/pyannote/audio/pipelines/speaker_verification.py
@@ -386,7 +386,7 @@ def __call__(
return embeddings
-class WeSpeakerPretrainedSpeakerEmbedding(BaseInference):
+class ONNXWeSpeakerPretrainedSpeakerEmbedding(BaseInference):
"""Pretrained WeSpeaker speaker embedding
Parameters
@@ -398,7 +398,7 @@ class WeSpeakerPretrainedSpeakerEmbedding(BaseInference):
Usage
-----
- >>> get_embedding = WeSpeakerPretrainedSpeakerEmbedding("hbredin/wespeaker-voxceleb-resnet34-LM")
+ >>> get_embedding = ONNXWeSpeakerPretrainedSpeakerEmbedding("hbredin/wespeaker-voxceleb-resnet34-LM")
>>> assert waveforms.ndim == 3
>>> batch_size, num_channels, num_samples = waveforms.shape
>>> assert num_channels == 1
@@ -418,7 +418,7 @@ def __init__(
):
if not ONNX_IS_AVAILABLE:
raise ImportError(
- f"'onnxruntime' must be installed to use '{embedding}' embeddings. "
+ f"'onnxruntime' must be installed to use '{embedding}' embeddings."
)
super().__init__()
@@ -745,7 +745,12 @@ def PretrainedSpeakerEmbedding(
>>> embeddings = get_embedding(waveforms, masks=masks)
"""
- if isinstance(embedding, str) and "speechbrain" in embedding:
+ if isinstance(embedding, str) and "pyannote" in embedding:
+ return PyannoteAudioPretrainedSpeakerEmbedding(
+ embedding, device=device, use_auth_token=use_auth_token
+ )
+
+ elif isinstance(embedding, str) and "speechbrain" in embedding:
return SpeechBrainPretrainedSpeakerEmbedding(
embedding, device=device, use_auth_token=use_auth_token
)
@@ -754,9 +759,10 @@ def PretrainedSpeakerEmbedding(
return NeMoPretrainedSpeakerEmbedding(embedding, device=device)
elif isinstance(embedding, str) and "wespeaker" in embedding:
- return WeSpeakerPretrainedSpeakerEmbedding(embedding, device=device)
+ return ONNXWeSpeakerPretrainedSpeakerEmbedding(embedding, device=device)
else:
+ # fallback to pyannote in case we are loading a local model
return PyannoteAudioPretrainedSpeakerEmbedding(
embedding, device=device, use_auth_token=use_auth_token
)
diff --git a/requirements.txt b/requirements.txt
index 7829ada37..7e71fe024 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,6 @@ einops >=0.6.0
huggingface_hub >= 0.13.0
lightning >= 2.0.1
omegaconf >=2.1,<3.0
-onnxruntime-gpu >= 1.16.0
pyannote.core >= 5.0.0
pyannote.database >= 5.0.1
pyannote.metrics >= 3.2
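A usage sketch of the patched dispatch in `PretrainedSpeakerEmbedding`; the checkpoint names are only examples, and gated checkpoints may additionally require `use_auth_token`:

    import torch
    from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding

    # "pyannote" in the checkpoint name now routes to the torch-only
    # PyannoteAudioPretrainedSpeakerEmbedding backend
    get_embedding = PretrainedSpeakerEmbedding("pyannote/embedding", device=torch.device("cpu"))

    # "wespeaker" checkpoints keep going through ONNXWeSpeakerPretrainedSpeakerEmbedding,
    # but only if onnxruntime is installed by hand:
    # get_embedding = PretrainedSpeakerEmbedding("hbredin/wespeaker-voxceleb-resnet34-LM")

    waveforms = torch.rand(2, 1, 3 * 16000)  # (batch, channel, sample) at 16 kHz
    embeddings = get_embedding(waveforms)    # -> (batch, dimension) numpy array
    print(embeddings.shape)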
From 884913e1ac838af7d1451e5def997878c91ea46f Mon Sep 17 00:00:00 2001
From: Clément Pagés <55240756+clement-pages@users.noreply.github.com>
Date: Tue, 14 Nov 2023 10:33:56 +0100
Subject: [PATCH 11/19] feat(model): add support for multi-speaker statistics
pooling (#1386)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Co-authored-by: Hervé BREDIN
---
CHANGELOG.md | 1 +
pyannote/audio/models/blocks/pooling.py | 109 ++++++++++++++------
tests/test_stats_pool.py | 131 ++++++++++++++++++++++++
3 files changed, 209 insertions(+), 32 deletions(-)
create mode 100644 tests/test_stats_pool.py
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8cd56ad36..3c8a4d8ff 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,6 +13,7 @@
- BREAKING(pipeline): rename `WeSpeakerPretrainedSpeakerEmbedding` to `ONNXWeSpeakerPretrainedSpeakerEmbedding`
- BREAKING(setup): remove `onnxruntime` dependency.
You can still use ONNX `hbredin/wespeaker-voxceleb-resnet34-LM` but you will have to install `onnxruntime` yourself.
+- feat(model): add support for multi-speaker statistics pooling
## Version 3.0.1 (2023-09-28)
diff --git a/pyannote/audio/models/blocks/pooling.py b/pyannote/audio/models/blocks/pooling.py
index debb05d13..b02f6f69e 100644
--- a/pyannote/audio/models/blocks/pooling.py
+++ b/pyannote/audio/models/blocks/pooling.py
@@ -1,6 +1,6 @@
# MIT License
#
-# Copyright (c) 2020 CNRS
+# Copyright (c) 2020- CNRS
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
@@ -26,6 +26,7 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
+from einops import rearrange
class StatsPool(nn.Module):
@@ -40,49 +41,93 @@ class StatsPool(nn.Module):
"""
- def forward(
- self, sequences: torch.Tensor, weights: Optional[torch.Tensor] = None
- ) -> torch.Tensor:
- """Forward pass
+ def _pool(self, sequences: torch.Tensor, weights: torch.Tensor) -> torch.Tensor:
+ """Helper function to compute statistics pooling
+
+ Assumes that weights are already interpolated to match the number of frames
+ in sequences and that they encode the activation of only one speaker.
Parameters
----------
- sequences : (batch, channel, frames) torch.Tensor
- Sequences.
- weights : (batch, frames) torch.Tensor, optional
- When provided, compute weighted mean and standard deviation.
+ sequences : (batch, features, frames) torch.Tensor
+ Sequences of features.
+ weights : (batch, frames) torch.Tensor
+ (Already interpolated) weights.
Returns
-------
- output : (batch, 2 * channel) torch.Tensor
+ output : (batch, 2 * features) torch.Tensor
Concatenation of mean and (unbiased) standard deviation.
"""
- if weights is None:
- mean = sequences.mean(dim=2)
- std = sequences.std(dim=2, unbiased=True)
+ weights = weights.unsqueeze(dim=1)
+ # (batch, 1, frames)
- else:
- weights = weights.unsqueeze(dim=1)
- # (batch, 1, frames)
+ v1 = weights.sum(dim=2) + 1e-8
+ mean = torch.sum(sequences * weights, dim=2) / v1
+
+ dx2 = torch.square(sequences - mean.unsqueeze(2))
+ v2 = torch.square(weights).sum(dim=2)
+
+ var = torch.sum(dx2 * weights, dim=2) / (v1 - v2 / v1)
+ std = torch.sqrt(var)
+
+ return torch.cat([mean, std], dim=1)
+
+ def forward(
+ self, sequences: torch.Tensor, weights: Optional[torch.Tensor] = None
+ ) -> torch.Tensor:
+ """Forward pass
- num_frames = sequences.shape[2]
- num_weights = weights.shape[2]
- if num_frames != num_weights:
- warnings.warn(
- f"Mismatch between frames ({num_frames}) and weights ({num_weights}) numbers."
- )
- weights = F.interpolate(
- weights, size=num_frames, mode="linear", align_corners=False
- )
+ Parameters
+ ----------
+ sequences : (batch, features, frames) torch.Tensor
+ Sequences of features.
+ weights : (batch, frames) or (batch, speakers, frames) torch.Tensor, optional
+ Compute weighted mean and standard deviation, using provided `weights`.
- v1 = weights.sum(dim=2)
- mean = torch.sum(sequences * weights, dim=2) / v1
+ Note
+ ----
+ `sequences` and `weights` might use a different number of frames, in which case `weights`
+ are interpolated linearly to reach the number of frames in `sequences`.
- dx2 = torch.square(sequences - mean.unsqueeze(2))
- v2 = torch.square(weights).sum(dim=2)
+ Returns
+ -------
+ output : (batch, 2 * features) or (batch, speakers, 2 * features) torch.Tensor
+ Concatenation of mean and (unbiased) standard deviation. When `weights` are
+ provided with the `speakers` dimension, `output` is computed for each speaker
+ separately and returned as (batch, speakers, 2 * channel)-shaped tensor.
+ """
- var = torch.sum(dx2 * weights, dim=2) / (v1 - v2 / v1)
- std = torch.sqrt(var)
+ if weights is None:
+ mean = sequences.mean(dim=-1)
+ std = sequences.std(dim=-1, correction=1)
+ return torch.cat([mean, std], dim=-1)
- return torch.cat([mean, std], dim=1)
+ if weights.dim() == 2:
+ has_speaker_dimension = False
+ weights = weights.unsqueeze(dim=1)
+ # (batch, frames) -> (batch, 1, frames)
+ else:
+ has_speaker_dimension = True
+
+ # interpolate weights if needed
+ _, _, num_frames = sequences.shape
+ _, _, num_weights = weights.shape
+ if num_frames != num_weights:
+ warnings.warn(
+ f"Mismatch between frames ({num_frames}) and weights ({num_weights}) numbers."
+ )
+ weights = F.interpolate(
+ weights, size=num_frames, mode="linear", align_corners=False
+ )
+
+ output = rearrange(
+ torch.vmap(self._pool, in_dims=(None, 1))(sequences, weights),
+ "speakers batch features -> batch speakers features",
+ )
+
+ if not has_speaker_dimension:
+ return output.squeeze(dim=1)
+
+ return output
diff --git a/tests/test_stats_pool.py b/tests/test_stats_pool.py
new file mode 100644
index 000000000..e30262eda
--- /dev/null
+++ b/tests/test_stats_pool.py
@@ -0,0 +1,131 @@
+# MIT License
+#
+# Copyright (c) 2023- CNRS
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+import torch
+
+from pyannote.audio.models.blocks.pooling import StatsPool
+
+
+def test_stats_pool_weightless():
+ x = torch.Tensor([[[2.0, 4.0], [2.0, 4.0]], [[1.0, 1.0], [1.0, 1.0]]])
+ # (batch = 2, features = 2, frames = 2)
+
+ stats_pool = StatsPool()
+
+ y = stats_pool(x)
+ # (batch = 2, features = 4)
+
+ assert torch.equal(
+ torch.round(y, decimals=4),
+ torch.Tensor([[3.0, 3.0, 1.4142, 1.4142], [1.0, 1.0, 0.0, 0.0]]),
+ )
+
+
+def test_stats_pool_one_speaker():
+ x = torch.Tensor([[[2.0, 4.0], [2.0, 4.0]], [[1.0, 1.0], [1.0, 1.0]]])
+ # (batch = 2, features = 2, frames = 2)
+
+ w = torch.Tensor(
+ [
+ [0.5, 0.01],
+ [0.2, 0.1],
+ ]
+ )
+ # (batch = 2, frames = 2)
+
+ stats_pool = StatsPool()
+
+ y = stats_pool(x, weights=w)
+ # (batch = 2, features = 4)
+
+ assert torch.equal(
+ torch.round(y, decimals=4),
+ torch.Tensor([[2.0392, 2.0392, 1.4142, 1.4142], [1.0, 1.0, 0.0, 0.0]]),
+ )
+
+
+def test_stats_pool_multi_speaker():
+ x = torch.Tensor([[[2.0, 4.0], [2.0, 4.0]], [[1.0, 1.0], [1.0, 1.0]]])
+ # (batch = 2, features = 2, frames = 2)
+
+ w = torch.Tensor([[[0.1, 0.2], [0.2, 0.3]], [[0.001, 0.001], [0.2, 0.3]]])
+ # (batch = 2, speakers = 2, frames = 2)
+
+ stats_pool = StatsPool()
+
+ y = stats_pool(x, weights=w)
+ # (batch = 2, speakers = 2, features = 4)
+
+ assert torch.equal(
+ torch.round(y, decimals=4),
+ torch.Tensor(
+ [
+ [[3.3333, 3.3333, 1.4142, 1.4142], [3.2, 3.2, 1.4142, 1.4142]],
+ [[1.0, 1.0, 0.0, 0.0], [1.0, 1.0, 0.0, 0.0]],
+ ]
+ ),
+ )
+
+
+def test_stats_pool_frame_mismatch():
+ x = torch.Tensor([[[2.0, 2.0], [2.0, 2.0]], [[1.0, 1.0], [1.0, 1.0]]])
+ # (batch = 2, features = 2, frames = 2)
+
+ stats_pool = StatsPool()
+ w = torch.Tensor(
+ [
+ [0.5, 0.5, 0.0],
+ [0.0, 0.5, 0.5],
+ ]
+ )
+ # (batch = 2, frames = 3)
+
+ y = stats_pool(x, weights=w)
+ # (batch = 2, features = 4)
+
+ assert torch.equal(
+ torch.round(y, decimals=4),
+ torch.Tensor([[2.0, 2.0, 0.0, 0.0], [1.0, 1.0, 0.0, 0.0]]),
+ )
+
+
+def test_stats_pool_all_zero_weights():
+ x = torch.Tensor([[[2.0, 4.0], [2.0, 4.0]], [[1.0, 1.0], [1.0, 1.0]]])
+ # (batch = 2, features = 2, frames = 2)
+
+ w = torch.Tensor(
+ [
+ [0.5, 0.01],
+ [0.0, 0.0], # all zero weights
+ ]
+ )
+ # (batch = 2, frames = 2)
+
+ stats_pool = StatsPool()
+
+ y = stats_pool(x, weights=w)
+ # (batch = 2, features = 4)
+
+ assert torch.equal(
+ torch.round(y, decimals=4),
+ torch.Tensor([[2.0392, 2.0392, 1.4142, 1.4142], [0.0, 0.0, 0.0, 0.0]]),
+ )
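The weighted statistics computed in `StatsPool._pool` can be checked by hand against `test_stats_pool_one_speaker` above (ignoring the 1e-8 stabilizers). For the first batch item, with one feature over two frames:

    x = [2.0, 4.0]   # feature values over 2 frames
    w = [0.5, 0.01]  # activation weights of a single speaker

    v1 = sum(w)                                           # 0.51
    mean = sum(wi * xi for wi, xi in zip(w, x)) / v1      # 1.04 / 0.51 ≈ 2.0392
    v2 = sum(wi * wi for wi in w)                         # 0.2501
    var = sum(wi * (xi - mean) ** 2 for wi, xi in zip(w, x)) / (v1 - v2 / v1)
    std = var ** 0.5                                      # ≈ 1.4142

    print(round(mean, 4), round(std, 4))                  # 2.0392 1.4142, as in the test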
From 15b847acfbb4d80ee6262f1814eba97fd2e8f47f Mon Sep 17 00:00:00 2001
From: Hervé BREDIN
Date: Wed, 15 Nov 2023 16:23:40 +0100
Subject: [PATCH 12/19] fix(model): fix pooling layer (#1546)
* fix(mps): fix support for MPS
* fix: fix (unlikely but possible) division by zero
---
pyannote/audio/models/blocks/pooling.py | 6 ++----
1 file changed, 2 insertions(+), 4 deletions(-)
diff --git a/pyannote/audio/models/blocks/pooling.py b/pyannote/audio/models/blocks/pooling.py
index b02f6f69e..22d736a03 100644
--- a/pyannote/audio/models/blocks/pooling.py
+++ b/pyannote/audio/models/blocks/pooling.py
@@ -69,7 +69,7 @@ def _pool(self, sequences: torch.Tensor, weights: torch.Tensor) -> torch.Tensor:
dx2 = torch.square(sequences - mean.unsqueeze(2))
v2 = torch.square(weights).sum(dim=2)
- var = torch.sum(dx2 * weights, dim=2) / (v1 - v2 / v1)
+ var = torch.sum(dx2 * weights, dim=2) / (v1 - v2 / v1 + 1e-8)
std = torch.sqrt(var)
return torch.cat([mean, std], dim=1)
@@ -118,9 +118,7 @@ def forward(
warnings.warn(
f"Mismatch between frames ({num_frames}) and weights ({num_weights}) numbers."
)
- weights = F.interpolate(
- weights, size=num_frames, mode="linear", align_corners=False
- )
+ weights = F.interpolate(weights, size=num_frames, mode="nearest")
output = rearrange(
torch.vmap(self._pool, in_dims=(None, 1))(sequences, weights),
From 343ce66d674c5a1d970bbaf4fc6177ad936c7d82 Mon Sep 17 00:00:00 2001
From: Hervé BREDIN
Date: Thu, 16 Nov 2023 09:22:45 +0100
Subject: [PATCH 13/19] feat(model): add WeSpeaker embedding wrapper based on
pytorch (#1540)
---
CHANGELOG.md | 1 +
pyannote/audio/models/embedding/__init__.py | 15 +-
.../embedding/wespeaker/LICENSE.WeSpeaker | 21 ++
.../models/embedding/wespeaker/__init__.py | 236 ++++++++++++++
.../models/embedding/wespeaker/convert.py | 62 ++++
.../models/embedding/wespeaker/resnet.py | 302 ++++++++++++++++++
.../audio/pipelines/speaker_verification.py | 2 +-
7 files changed, 637 insertions(+), 2 deletions(-)
create mode 100644 pyannote/audio/models/embedding/wespeaker/LICENSE.WeSpeaker
create mode 100644 pyannote/audio/models/embedding/wespeaker/__init__.py
create mode 100644 pyannote/audio/models/embedding/wespeaker/convert.py
create mode 100644 pyannote/audio/models/embedding/wespeaker/resnet.py
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3c8a4d8ff..2dff9c969 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,7 @@
## `develop` branch
+- feat(pipeline): add WeSpeaker embedding wrapper
- feat(pipeline): add `TimingHook` for profiling processing time
- feat(pipeline): add `ArtifactHook` for saving internal steps
- feat(pipeline): add support for list of hooks with `Hooks`
diff --git a/pyannote/audio/models/embedding/__init__.py b/pyannote/audio/models/embedding/__init__.py
index 08f8a576c..2819096c2 100644
--- a/pyannote/audio/models/embedding/__init__.py
+++ b/pyannote/audio/models/embedding/__init__.py
@@ -21,6 +21,19 @@
# SOFTWARE.
+from .wespeaker import (
+ WeSpeakerResNet34,
+ WeSpeakerResNet152,
+ WeSpeakerResNet221,
+ WeSpeakerResNet293,
+)
from .xvector import XVectorMFCC, XVectorSincNet
-__all__ = ["XVectorSincNet", "XVectorMFCC"]
+__all__ = [
+ "XVectorSincNet",
+ "XVectorMFCC",
+ "WeSpeakerResNet34",
+ "WeSpeakerResNet152",
+ "WeSpeakerResNet221",
+ "WeSpeakerResNet293",
+]
diff --git a/pyannote/audio/models/embedding/wespeaker/LICENSE.WeSpeaker b/pyannote/audio/models/embedding/wespeaker/LICENSE.WeSpeaker
new file mode 100644
index 000000000..136492006
--- /dev/null
+++ b/pyannote/audio/models/embedding/wespeaker/LICENSE.WeSpeaker
@@ -0,0 +1,21 @@
+Copyright (c) 2021 Shuai Wang (wsstriving@gmail.com)
+2022 Zhengyang Chen (chenzhengyang117@gmail.com)
+2023 Bing Han (hanbing97@sjtu.edu.cn)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+File `resnet.py` has been borrowed from WeSpeaker that is available under the Apache License, Version 2.0.
+
+The original file is available at https://github.com/wenet-e2e/wespeaker/blob/c20d765295359e681321625fbefc1a02e8794163/wespeaker/models/resnet.py
+
+Neither Shuai Wang (@wsstriving on Github) nor myself (Hervé Bredin, or @hbredin on Github) are lawyers, but we both agreed that putting this license file in this directory is enough to comply with the license. See https://github.com/pyannote/pyannote-audio/issues/1537#issuecomment-1808029836. If you know better about this potential MIT/Apache 2.0 compatibility issue, please let us know.
diff --git a/pyannote/audio/models/embedding/wespeaker/__init__.py b/pyannote/audio/models/embedding/wespeaker/__init__.py
new file mode 100644
index 000000000..603a88c64
--- /dev/null
+++ b/pyannote/audio/models/embedding/wespeaker/__init__.py
@@ -0,0 +1,236 @@
+# MIT License
+#
+# Copyright (c) 2023 CNRS
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+
+from functools import partial
+from typing import Optional
+
+import torch
+import torchaudio.compliance.kaldi as kaldi
+
+from pyannote.audio.core.model import Model
+from pyannote.audio.core.task import Task
+
+from .resnet import ResNet34, ResNet152, ResNet221, ResNet293
+
+
+class BaseWeSpeakerResNet(Model):
+ def __init__(
+ self,
+ sample_rate: int = 16000,
+ num_channels: int = 1,
+ num_mel_bins: int = 80,
+ frame_length: int = 25,
+ frame_shift: int = 10,
+ dither: float = 0.0,
+ window_type: str = "hamming",
+ use_energy: bool = False,
+ task: Optional[Task] = None,
+ ):
+ super().__init__(sample_rate=sample_rate, num_channels=num_channels, task=task)
+
+ self.save_hyperparameters(
+ "sample_rate",
+ "num_channels",
+ "num_mel_bins",
+ "frame_length",
+ "frame_shift",
+ "dither",
+ "window_type",
+ "use_energy",
+ )
+
+ self._fbank = partial(
+ kaldi.fbank,
+ num_mel_bins=self.hparams.num_mel_bins,
+ frame_length=self.hparams.frame_length,
+ frame_shift=self.hparams.frame_shift,
+ dither=self.hparams.dither,
+ sample_frequency=self.hparams.sample_rate,
+ window_type=self.hparams.window_type,
+ use_energy=self.hparams.use_energy,
+ )
+
+ def compute_fbank(self, waveforms: torch.Tensor) -> torch.Tensor:
+ """Extract fbank features
+
+ Parameters
+ ----------
+ waveforms : (batch_size, num_channels, num_samples)
+
+ Returns
+ -------
+ fbank : (batch_size, num_frames, num_mel_bins)
+
+ Source: https://github.com/wenet-e2e/wespeaker/blob/45941e7cba2c3ea99e232d02bedf617fc71b0dad/wespeaker/bin/infer_onnx.py#L30C1-L50
+ """
+
+ waveforms = waveforms * (1 << 15)
+
+ # fall back to CPU for FFT computation when using MPS
+ # until FFT is fixed in MPS
+ device = waveforms.device
+ fft_device = torch.device("cpu") if device.type == "mps" else device
+
+ features = torch.vmap(self._fbank)(waveforms.to(fft_device)).to(device)
+
+ return features - torch.mean(features, dim=1, keepdim=True)
+
+ def forward(
+ self, waveforms: torch.Tensor, weights: torch.Tensor = None
+ ) -> torch.Tensor:
+ """
+
+ Parameters
+ ----------
+ waveforms : torch.Tensor
+ Batch of waveforms with shape (batch, channel, sample)
+ weights : torch.Tensor, optional
+ Batch of weights with shape (batch, frame).
+ """
+
+ fbank = self.compute_fbank(waveforms)
+ return self.resnet(fbank, weights=weights)[1]
+
+
+class WeSpeakerResNet34(BaseWeSpeakerResNet):
+ def __init__(
+ self,
+ sample_rate: int = 16000,
+ num_channels: int = 1,
+ num_mel_bins: int = 80,
+ frame_length: int = 25,
+ frame_shift: int = 10,
+ dither: float = 0.0,
+ window_type: str = "hamming",
+ use_energy: bool = False,
+ task: Optional[Task] = None,
+ ):
+ super().__init__(
+ sample_rate=sample_rate,
+ num_channels=num_channels,
+ num_mel_bins=num_mel_bins,
+ frame_length=frame_length,
+ frame_shift=frame_shift,
+ dither=dither,
+ window_type=window_type,
+ use_energy=use_energy,
+ task=task,
+ )
+ self.resnet = ResNet34(
+ num_mel_bins, 256, pooling_func="TSTP", two_emb_layer=False
+ )
+
+
+class WeSpeakerResNet152(BaseWeSpeakerResNet):
+ def __init__(
+ self,
+ sample_rate: int = 16000,
+ num_channels: int = 1,
+ num_mel_bins: int = 80,
+ frame_length: int = 25,
+ frame_shift: int = 10,
+ dither: float = 0.0,
+ window_type: str = "hamming",
+ use_energy: bool = False,
+ task: Optional[Task] = None,
+ ):
+ super().__init__(
+ sample_rate=sample_rate,
+ num_channels=num_channels,
+ num_mel_bins=num_mel_bins,
+ frame_length=frame_length,
+ frame_shift=frame_shift,
+ dither=dither,
+ window_type=window_type,
+ use_energy=use_energy,
+ task=task,
+ )
+ self.resnet = ResNet152(
+ num_mel_bins, 256, pooling_func="TSTP", two_emb_layer=False
+ )
+
+
+class WeSpeakerResNet221(BaseWeSpeakerResNet):
+ def __init__(
+ self,
+ sample_rate: int = 16000,
+ num_channels: int = 1,
+ num_mel_bins: int = 80,
+ frame_length: int = 25,
+ frame_shift: int = 10,
+ dither: float = 0.0,
+ window_type: str = "hamming",
+ use_energy: bool = False,
+ task: Optional[Task] = None,
+ ):
+ super().__init__(
+ sample_rate=sample_rate,
+ num_channels=num_channels,
+ num_mel_bins=num_mel_bins,
+ frame_length=frame_length,
+ frame_shift=frame_shift,
+ dither=dither,
+ window_type=window_type,
+ use_energy=use_energy,
+ task=task,
+ )
+ self.resnet = ResNet221(
+ num_mel_bins, 256, pooling_func="TSTP", two_emb_layer=False
+ )
+
+
+class WeSpeakerResNet293(BaseWeSpeakerResNet):
+ def __init__(
+ self,
+ sample_rate: int = 16000,
+ num_channels: int = 1,
+ num_mel_bins: int = 80,
+ frame_length: int = 25,
+ frame_shift: int = 10,
+ dither: float = 0.0,
+ window_type: str = "hamming",
+ use_energy: bool = False,
+ task: Optional[Task] = None,
+ ):
+ super().__init__(
+ sample_rate=sample_rate,
+ num_channels=num_channels,
+ num_mel_bins=num_mel_bins,
+ frame_length=frame_length,
+ frame_shift=frame_shift,
+ dither=dither,
+ window_type=window_type,
+ use_energy=use_energy,
+ task=task,
+ )
+ self.resnet = ResNet293(
+ num_mel_bins, 256, pooling_func="TSTP", two_emb_layer=False
+ )
+
+
+__all__ = [
+ "WeSpeakerResNet34",
+ "WeSpeakerResNet152",
+ "WeSpeakerResNet221",
+ "WeSpeakerResNet293",
+]
diff --git a/pyannote/audio/models/embedding/wespeaker/convert.py b/pyannote/audio/models/embedding/wespeaker/convert.py
new file mode 100644
index 000000000..34aec6092
--- /dev/null
+++ b/pyannote/audio/models/embedding/wespeaker/convert.py
@@ -0,0 +1,62 @@
+# MIT License
+#
+# Copyright (c) 2023 CNRS
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# Script used to convert from WeSpeaker to pyannote.audio
+
+import sys
+from pathlib import Path
+
+import pytorch_lightning as pl
+import torch
+
+import pyannote.audio.models.embedding.wespeaker as wespeaker
+from pyannote.audio import Model
+from pyannote.audio.core.task import Problem, Resolution, Specifications
+
+wespeaker_checkpoint_dir = sys.argv[1] # /path/to/wespeaker_cnceleb-resnet34-LM
+
+wespeaker_checkpoint = Path(wespeaker_checkpoint_dir) / "wespeaker.pt"
+
+depth = Path(wespeaker_checkpoint_dir).parts[-1].split("-")[-2][6:] # '34'
+Klass = getattr(wespeaker, f"WeSpeakerResNet{depth}") # WeSpeakerResNet34
+
+duration = 5.0 # whatever
+specifications = Specifications(
+ problem=Problem.REPRESENTATION, resolution=Resolution.CHUNK, duration=duration
+)
+
+state_dict = torch.load(wespeaker_checkpoint, map_location=torch.device("cpu"))
+state_dict.pop("projection.weight")
+
+model = Klass()
+model.resnet.load_state_dict(state_dict, strict=True)
+model.specifications = specifications
+
+checkpoint = {"state_dict": model.state_dict()}
+model.on_save_checkpoint(checkpoint)
+checkpoint["pytorch-lightning_version"] = pl.__version__
+
+pyannote_checkpoint = Path(wespeaker_checkpoint_dir) / "pytorch_model.bin"
+torch.save(checkpoint, pyannote_checkpoint)
+
+model = Model.from_pretrained(pyannote_checkpoint)
+print(model)
diff --git a/pyannote/audio/models/embedding/wespeaker/resnet.py b/pyannote/audio/models/embedding/wespeaker/resnet.py
new file mode 100644
index 000000000..54f95fa8b
--- /dev/null
+++ b/pyannote/audio/models/embedding/wespeaker/resnet.py
@@ -0,0 +1,302 @@
+# Copyright (c) 2021 Shuai Wang (wsstriving@gmail.com)
+# 2022 Zhengyang Chen (chenzhengyang117@gmail.com)
+# 2023 Bing Han (hanbing97@sjtu.edu.cn)
+# 2023 CNRS
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange
+
+from pyannote.audio.models.blocks.pooling import StatsPool
+
+
+class TSTP(nn.Module):
+ """
+ Temporal statistics pooling, concatenate mean and std, which is used in
+ x-vector
+ Comment: simple concatenation can not make full use of both statistics
+ """
+
+ def __init__(self, in_dim=0, **kwargs):
+ super(TSTP, self).__init__()
+ self.in_dim = in_dim
+ self.stats_pool = StatsPool()
+
+ def forward(self, features, weights: torch.Tensor = None):
+ """
+
+ Parameters
+ ----------
+ features : (batch, dimension, channel, frames) torch.Tensor
+ Batch of features
+ weights: (batch, frames) torch.Tensor, optional
+ Batch of weights
+
+ """
+
+ features = rearrange(
+ features,
+ "batch dimension channel frames -> batch (dimension channel) frames",
+ )
+
+ return self.stats_pool(features, weights=weights)
+
+ # # The last dimension is the temporal axis
+ # pooling_mean = features.mean(dim=-1)
+ # pooling_std = torch.sqrt(torch.var(features, dim=-1) + 1e-7)
+ # pooling_mean = pooling_mean.flatten(start_dim=1)
+ # pooling_std = pooling_std.flatten(start_dim=1)
+ # stats = torch.cat((pooling_mean, pooling_std), 1)
+ # return stats
+
+ def get_out_dim(self):
+ self.out_dim = self.in_dim * 2
+ return self.out_dim
+
+
+POOLING_LAYERS = {"TSTP": TSTP}
+
+
+class BasicBlock(nn.Module):
+ expansion = 1
+
+ def __init__(self, in_planes, planes, stride=1):
+ super(BasicBlock, self).__init__()
+ self.conv1 = nn.Conv2d(
+ in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False
+ )
+ self.bn1 = nn.BatchNorm2d(planes)
+ self.conv2 = nn.Conv2d(
+ planes, planes, kernel_size=3, stride=1, padding=1, bias=False
+ )
+ self.bn2 = nn.BatchNorm2d(planes)
+
+ self.shortcut = nn.Sequential()
+ if stride != 1 or in_planes != self.expansion * planes:
+ self.shortcut = nn.Sequential(
+ nn.Conv2d(
+ in_planes,
+ self.expansion * planes,
+ kernel_size=1,
+ stride=stride,
+ bias=False,
+ ),
+ nn.BatchNorm2d(self.expansion * planes),
+ )
+
+ def forward(self, x):
+ out = F.relu(self.bn1(self.conv1(x)))
+ out = self.bn2(self.conv2(out))
+ out += self.shortcut(x)
+ out = F.relu(out)
+ return out
+
+
+class Bottleneck(nn.Module):
+ expansion = 4
+
+ def __init__(self, in_planes, planes, stride=1):
+ super(Bottleneck, self).__init__()
+ self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
+ self.bn1 = nn.BatchNorm2d(planes)
+ self.conv2 = nn.Conv2d(
+ planes, planes, kernel_size=3, stride=stride, padding=1, bias=False
+ )
+ self.bn2 = nn.BatchNorm2d(planes)
+ self.conv3 = nn.Conv2d(
+ planes, self.expansion * planes, kernel_size=1, bias=False
+ )
+ self.bn3 = nn.BatchNorm2d(self.expansion * planes)
+
+ self.shortcut = nn.Sequential()
+ if stride != 1 or in_planes != self.expansion * planes:
+ self.shortcut = nn.Sequential(
+ nn.Conv2d(
+ in_planes,
+ self.expansion * planes,
+ kernel_size=1,
+ stride=stride,
+ bias=False,
+ ),
+ nn.BatchNorm2d(self.expansion * planes),
+ )
+
+ def forward(self, x):
+ out = F.relu(self.bn1(self.conv1(x)))
+ out = F.relu(self.bn2(self.conv2(out)))
+ out = self.bn3(self.conv3(out))
+ out += self.shortcut(x)
+ out = F.relu(out)
+ return out
+
+
+class ResNet(nn.Module):
+ def __init__(
+ self,
+ block,
+ num_blocks,
+ m_channels=32,
+ feat_dim=40,
+ embed_dim=128,
+ pooling_func="TSTP",
+ two_emb_layer=True,
+ ):
+ super(ResNet, self).__init__()
+ self.in_planes = m_channels
+ self.feat_dim = feat_dim
+ self.embed_dim = embed_dim
+ self.stats_dim = int(feat_dim / 8) * m_channels * 8
+ self.two_emb_layer = two_emb_layer
+
+ self.conv1 = nn.Conv2d(
+ 1, m_channels, kernel_size=3, stride=1, padding=1, bias=False
+ )
+ self.bn1 = nn.BatchNorm2d(m_channels)
+ self.layer1 = self._make_layer(block, m_channels, num_blocks[0], stride=1)
+ self.layer2 = self._make_layer(block, m_channels * 2, num_blocks[1], stride=2)
+ self.layer3 = self._make_layer(block, m_channels * 4, num_blocks[2], stride=2)
+ self.layer4 = self._make_layer(block, m_channels * 8, num_blocks[3], stride=2)
+
+ self.pool = POOLING_LAYERS[pooling_func](
+ in_dim=self.stats_dim * block.expansion
+ )
+ self.pool_out_dim = self.pool.get_out_dim()
+ self.seg_1 = nn.Linear(self.pool_out_dim, embed_dim)
+ if self.two_emb_layer:
+ self.seg_bn_1 = nn.BatchNorm1d(embed_dim, affine=False)
+ self.seg_2 = nn.Linear(embed_dim, embed_dim)
+ else:
+ self.seg_bn_1 = nn.Identity()
+ self.seg_2 = nn.Identity()
+
+ def _make_layer(self, block, planes, num_blocks, stride):
+ strides = [stride] + [1] * (num_blocks - 1)
+ layers = []
+ for stride in strides:
+ layers.append(block(self.in_planes, planes, stride))
+ self.in_planes = planes * block.expansion
+ return nn.Sequential(*layers)
+
+ def forward(self, x: torch.Tensor, weights: torch.Tensor = None):
+ """
+
+ Parameters
+ ----------
+ x : (batch, frames, features) torch.Tensor
+ Batch of features
+ weights : (batch, frames) torch.Tensor, optional
+ Batch of weights
+
+ Returns
+ -------
+ embedding : (batch, embedding_dim) torch.Tensor
+ """
+ x = x.permute(0, 2, 1) # (B,T,F) => (B,F,T)
+
+ x = x.unsqueeze_(1)
+ out = F.relu(self.bn1(self.conv1(x)))
+ out = self.layer1(out)
+ out = self.layer2(out)
+ out = self.layer3(out)
+ out = self.layer4(out)
+
+ stats = self.pool(out, weights=weights)
+
+ embed_a = self.seg_1(stats)
+ if self.two_emb_layer:
+ out = F.relu(embed_a)
+ out = self.seg_bn_1(out)
+ embed_b = self.seg_2(out)
+ return embed_a, embed_b
+ else:
+ return torch.tensor(0.0), embed_a
+
+
+def ResNet18(feat_dim, embed_dim, pooling_func="TSTP", two_emb_layer=True):
+ return ResNet(
+ BasicBlock,
+ [2, 2, 2, 2],
+ feat_dim=feat_dim,
+ embed_dim=embed_dim,
+ pooling_func=pooling_func,
+ two_emb_layer=two_emb_layer,
+ )
+
+
+def ResNet34(feat_dim, embed_dim, pooling_func="TSTP", two_emb_layer=True):
+ return ResNet(
+ BasicBlock,
+ [3, 4, 6, 3],
+ feat_dim=feat_dim,
+ embed_dim=embed_dim,
+ pooling_func=pooling_func,
+ two_emb_layer=two_emb_layer,
+ )
+
+
+def ResNet50(feat_dim, embed_dim, pooling_func="TSTP", two_emb_layer=True):
+ return ResNet(
+ Bottleneck,
+ [3, 4, 6, 3],
+ feat_dim=feat_dim,
+ embed_dim=embed_dim,
+ pooling_func=pooling_func,
+ two_emb_layer=two_emb_layer,
+ )
+
+
+def ResNet101(feat_dim, embed_dim, pooling_func="TSTP", two_emb_layer=True):
+ return ResNet(
+ Bottleneck,
+ [3, 4, 23, 3],
+ feat_dim=feat_dim,
+ embed_dim=embed_dim,
+ pooling_func=pooling_func,
+ two_emb_layer=two_emb_layer,
+ )
+
+
+def ResNet152(feat_dim, embed_dim, pooling_func="TSTP", two_emb_layer=True):
+ return ResNet(
+ Bottleneck,
+ [3, 8, 36, 3],
+ feat_dim=feat_dim,
+ embed_dim=embed_dim,
+ pooling_func=pooling_func,
+ two_emb_layer=two_emb_layer,
+ )
+
+
+def ResNet221(feat_dim, embed_dim, pooling_func="TSTP", two_emb_layer=True):
+ return ResNet(
+ Bottleneck,
+ [6, 16, 48, 3],
+ feat_dim=feat_dim,
+ embed_dim=embed_dim,
+ pooling_func=pooling_func,
+ two_emb_layer=two_emb_layer,
+ )
+
+
+def ResNet293(feat_dim, embed_dim, pooling_func="TSTP", two_emb_layer=True):
+ return ResNet(
+ Bottleneck,
+ [10, 20, 64, 3],
+ feat_dim=feat_dim,
+ embed_dim=embed_dim,
+ pooling_func=pooling_func,
+ two_emb_layer=two_emb_layer,
+ )
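As a sanity check for the module above, a minimal sketch (not part of the patch) that runs the ResNet34 architecture on random 80-dimensional fbank-like features; the feature and embedding dimensions are assumptions, and the import assumes the patched pyannote.audio tree is installed.

    import torch

    from pyannote.audio.models.embedding.wespeaker.resnet import ResNet34

    model = ResNet34(feat_dim=80, embed_dim=256)
    model.eval()

    x = torch.randn(2, 200, 80)  # (batch, frames, features)
    with torch.no_grad():
        embed_a, embed_b = model(x)  # two embeddings since two_emb_layer=True

    print(embed_a.shape, embed_b.shape)  # torch.Size([2, 256]) torch.Size([2, 256])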
diff --git a/pyannote/audio/pipelines/speaker_verification.py b/pyannote/audio/pipelines/speaker_verification.py
index 9f10f6e51..c870ea622 100644
--- a/pyannote/audio/pipelines/speaker_verification.py
+++ b/pyannote/audio/pipelines/speaker_verification.py
@@ -687,7 +687,7 @@ def min_num_samples(self) -> int:
try:
_ = self.model_(torch.randn(1, 1, middle).to(self.device))
upper = middle
- except RuntimeError:
+ except Exception:
lower = middle
middle = (lower + upper) // 2
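The hunk above broadens the exception caught while probing the wrapped model for the shortest input it accepts, presumably because a PyTorch-based embedding model may raise something other than RuntimeError on too-short inputs. For context, a hedged sketch of the underlying bisection (names are illustrative, not the pipeline's actual code):

    import torch

    def find_min_num_samples(model, upper: int = 100_000) -> int:
        """Bisect the smallest number of samples the model accepts."""
        lower, middle = 1, upper // 2
        while lower + 1 < upper:
            try:
                _ = model(torch.randn(1, 1, middle))
                upper = middle           # forward pass succeeded: shrink upper bound
            except Exception:            # any failure: input was too short
                lower = middle
            middle = (lower + upper) // 2
        return upper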
From 6ad2f87015d6cfe660a098233ff2f72bbdc5cd84 Mon Sep 17 00:00:00 2001
From: Hervé BREDIN
Date: Thu, 16 Nov 2023 09:37:10 +0100
Subject: [PATCH 14/19] doc: update changelog (#1549)
---
CHANGELOG.md | 21 ++++++++++++++++-----
1 file changed, 16 insertions(+), 5 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2dff9c969..e506c0413 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,19 +2,30 @@
## `develop` branch
-- feat(pipeline): add WeSpeaker embedding wrapper
+### New features
+
+- feat(model): add WeSpeaker embedding wrapper based on PyTorch
+- feat(model): add support for multi-speaker statistics pooling
- feat(pipeline): add `TimingHook` for profiling processing time
- feat(pipeline): add `ArtifactHook` for saving internal steps
- feat(pipeline): add support for list of hooks with `Hooks`
-- BREAKING(pipeline): remove `logging_hook` (use `ArtifactHook` instead)
-- fix(pipeline): add missing "embedding" hook call in `SpeakerDiarization`
- feat(utils): add `"soft"` option to `Powerset.to_multilabel`
-- improve(pipeline): compute `fbank` on GPU when requested
+
+### Fixes
+
+- fix(pipeline): add missing "embedding" hook call in `SpeakerDiarization`
- fix(pipeline): fix `AgglomerativeClustering` to honor `num_clusters` when provided
+
+## Improvements
+
+- improve(pipeline): compute `fbank` on GPU when requested
+
+### Breaking changes
+
- BREAKING(pipeline): rename `WeSpeakerPretrainedSpeakerEmbedding` to `ONNXWeSpeakerPretrainedSpeakerEmbedding`
- BREAKING(setup): remove `onnxruntime` dependency.
You can still use ONNX `hbredin/wespeaker-voxceleb-resnet34-LM` but you will have to install `onnxruntime` yourself.
-- feat(model): add support for multi-speaker statistics pooling
+- BREAKING(pipeline): remove `logging_hook` (use `ArtifactHook` instead)
## Version 3.0.1 (2023-09-28)
From 23001a75cf6b465d1d824b5a0bd96ffe32ba5bbc Mon Sep 17 00:00:00 2001
From: Hervé BREDIN
Date: Thu, 16 Nov 2023 10:22:43 +0100
Subject: [PATCH 15/19] doc: fix typo in changelog
---
CHANGELOG.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index e506c0413..6feac98ae 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -16,7 +16,7 @@
- fix(pipeline): add missing "embedding" hook call in `SpeakerDiarization`
- fix(pipeline): fix `AgglomerativeClustering` to honor `num_clusters` when provided
-## Improvements
+### Improvements
- improve(pipeline): compute `fbank` on GPU when requested
From bbc804401ead3fe4819ccb1ee820fabbbd35404c Mon Sep 17 00:00:00 2001
From: Dmitrii Mukhutdinov
Date: Thu, 16 Nov 2023 18:29:41 +0800
Subject: [PATCH 16/19] fix(pipeline): fix frame-wise speaker count exceeding max_speakers
* fix(pipeline): fix frame-wise speaker count exceeding max_speakers or detected num_speakers in SpeakerDiarization pipeline
* BREAKING(pipeline): remove onset and offset parameter in SpeakerDiarizationMixin.speaker_count
---
CHANGELOG.md | 3 +
pyannote/audio/pipelines/clustering.py | 1 -
pyannote/audio/pipelines/resegmentation.py | 11 +++-
.../audio/pipelines/speaker_diarization.py | 65 +++++++++++++------
pyannote/audio/pipelines/utils/diarization.py | 18 ++---
5 files changed, 62 insertions(+), 36 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6feac98ae..19e25f36e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,6 +15,7 @@
- fix(pipeline): add missing "embedding" hook call in `SpeakerDiarization`
- fix(pipeline): fix `AgglomerativeClustering` to honor `num_clusters` when provided
+- fix(pipeline): fix frame-wise speaker count exceeding `max_speakers` or detected `num_speakers` in `SpeakerDiarization` pipeline
### Improvements
@@ -26,6 +27,8 @@
- BREAKING(setup): remove `onnxruntime` dependency.
You can still use ONNX `hbredin/wespeaker-voxceleb-resnet34-LM` but you will have to install `onnxruntime` yourself.
- BREAKING(pipeline): remove `logging_hook` (use `ArtifactHook` instead)
+- BREAKING(pipeline): remove `onset` and `offset` parameter in `SpeakerDiarizationMixin.speaker_count`
+ You should now binarize segmentations before passing them to `speaker_count`
## Version 3.0.1 (2023-09-28)
diff --git a/pyannote/audio/pipelines/clustering.py b/pyannote/audio/pipelines/clustering.py
index c51cdcc50..b63ab214f 100644
--- a/pyannote/audio/pipelines/clustering.py
+++ b/pyannote/audio/pipelines/clustering.py
@@ -253,7 +253,6 @@ def __call__(
hard_clusters = np.zeros((num_chunks, num_speakers), dtype=np.int8)
soft_clusters = np.ones((num_chunks, num_speakers, 1))
centroids = np.mean(train_embeddings, axis=0, keepdims=True)
-
return hard_clusters, soft_clusters, centroids
train_clusters = self.cluster(
diff --git a/pyannote/audio/pipelines/resegmentation.py b/pyannote/audio/pipelines/resegmentation.py
index bb71abf22..d01e5d65f 100644
--- a/pyannote/audio/pipelines/resegmentation.py
+++ b/pyannote/audio/pipelines/resegmentation.py
@@ -39,6 +39,7 @@
get_model,
)
from pyannote.audio.utils.permutation import mae_cost_func, permutate
+from pyannote.audio.utils.signal import binarize
class Resegmentation(SpeakerDiarizationMixin, Pipeline):
@@ -181,11 +182,17 @@ def apply(
hook("segmentation", segmentations)
- # estimate frame-level number of instantaneous speakers
- count = self.speaker_count(
+ # binarize segmentations before speaker counting
+ binarized_segmentations: SlidingWindowFeature = binarize(
segmentations,
onset=self.onset,
offset=self.offset,
+ initial_state=False,
+ )
+
+ # estimate frame-level number of instantaneous speakers
+ count = self.speaker_count(
+ binarized_segmentations,
warm_up=(self.warm_up, self.warm_up),
frames=self._frames,
)
diff --git a/pyannote/audio/pipelines/speaker_diarization.py b/pyannote/audio/pipelines/speaker_diarization.py
index d5cf04e05..354f6be7e 100644
--- a/pyannote/audio/pipelines/speaker_diarization.py
+++ b/pyannote/audio/pipelines/speaker_diarization.py
@@ -25,6 +25,8 @@
import functools
import itertools
import math
+import textwrap
+import warnings
from typing import Callable, Optional, Text, Union
import numpy as np
@@ -478,12 +480,19 @@ def apply(
hook("segmentation", segmentations)
# shape: (num_chunks, num_frames, local_num_speakers)
+ # binarize segmentation
+ if self._segmentation.model.specifications.powerset:
+ binarized_segmentations = segmentations
+ else:
+ binarized_segmentations: SlidingWindowFeature = binarize(
+ segmentations,
+ onset=self.segmentation.threshold,
+ initial_state=False,
+ )
+
# estimate frame-level number of instantaneous speakers
count = self.speaker_count(
- segmentations,
- onset=0.5
- if self._segmentation.model.specifications.powerset
- else self.segmentation.threshold,
+ binarized_segmentations,
frames=self._frames,
warm_up=(0.0, 0.0),
)
@@ -499,16 +508,6 @@ def apply(
return diarization
- # binarize segmentation
- if self._segmentation.model.specifications.powerset:
- binarized_segmentations = segmentations
- else:
- binarized_segmentations: SlidingWindowFeature = binarize(
- segmentations,
- onset=self.segmentation.threshold,
- initial_state=False,
- )
-
if self.klustering == "OracleClustering" and not return_embeddings:
embeddings = None
else:
@@ -533,6 +532,27 @@ def apply(
# hard_clusters: (num_chunks, num_speakers)
# centroids: (num_speakers, dimension)
+ # number of detected clusters is the number of different speakers
+ num_different_speakers = np.max(hard_clusters) + 1
+
+ # detected number of speakers can still be out of bounds
+ # (specifically, lower than `min_speakers`), since there could be too few embeddings
+ # to make enough clusters with a given minimum cluster size.
+ if num_different_speakers < min_speakers or num_different_speakers > max_speakers:
+ warnings.warn(textwrap.dedent(
+ f"""
+ The detected number of speakers ({num_different_speakers}) is outside
+ the given bounds [{min_speakers}, {max_speakers}]. This can happen if the
+ given audio file is too short to contain {min_speakers} or more speakers.
+ Try to lower the desired minimal number of speakers.
+ """
+ ))
+
+ # during counting, we could possibly overcount the number of instantaneous
+ # speakers due to segmentation errors, so we cap the maximum instantaneous number
+ # of speakers by the `max_speakers` value
+ count.data = np.minimum(count.data, max_speakers).astype(np.int8)
+
# reconstruct discrete diarization from raw hard clusters
# keep track of inactive speakers
@@ -588,6 +608,18 @@ def apply(
if not return_embeddings:
return diarization
+ # this can happen when we use OracleClustering
+ if centroids is None:
+ return diarization, None
+
+ # The number of centroids may be smaller than the number of speakers
+ # in the annotation. This can happen if the number of active speakers
+ # obtained from `speaker_count` for some frames is larger than the number
+ # of clusters obtained from `clustering`. In this case, we append zero embeddings
+ # for extra speakers
+ if len(diarization.labels()) > centroids.shape[0]:
+ centroids = np.pad(centroids, ((0, len(diarization.labels()) - centroids.shape[0]), (0, 0)))
+
# re-order centroids so that they match
# the order given by diarization.labels()
inverse_mapping = {label: index for index, label in mapping.items()}
@@ -595,11 +627,6 @@ def apply(
[inverse_mapping[label] for label in diarization.labels()]
]
- # FIXME: the number of centroids may be smaller than the number of speakers
- # in the annotation. This can happen if the number of active speakers
- # obtained from `speaker_count` for some frames is larger than the number
- # of clusters obtained from `clustering`. Will be fixed in the future
-
return diarization, centroids
def get_metric(self) -> GreedyDiarizationErrorRate:
diff --git a/pyannote/audio/pipelines/utils/diarization.py b/pyannote/audio/pipelines/utils/diarization.py
index 91413350b..4a35f7049 100644
--- a/pyannote/audio/pipelines/utils/diarization.py
+++ b/pyannote/audio/pipelines/utils/diarization.py
@@ -117,13 +117,10 @@ def optimal_mapping(
else:
return mapped_hypothesis
- # TODO: get rid of onset/offset (binarization should be applied before calling speaker_count)
# TODO: get rid of warm-up parameter (trimming should be applied before calling speaker_count)
@staticmethod
def speaker_count(
- segmentations: SlidingWindowFeature,
- onset: float = 0.5,
- offset: float = None,
+ binarized_segmentations: SlidingWindowFeature,
warm_up: Tuple[float, float] = (0.1, 0.1),
frames: SlidingWindow = None,
) -> SlidingWindowFeature:
@@ -131,12 +128,8 @@ def speaker_count(
Parameters
----------
- segmentations : SlidingWindowFeature
- (num_chunks, num_frames, num_classes)-shaped scores.
- onset : float, optional
- Onset threshold. Defaults to 0.5
- offset : float, optional
- Offset threshold. Defaults to `onset`.
+ binarized_segmentations : SlidingWindowFeature
+ (num_chunks, num_frames, num_classes)-shaped binarized scores.
warm_up : (float, float) tuple, optional
Left/right warm up ratio of chunk duration.
Defaults to (0.1, 0.1), i.e. 10% on both sides.
@@ -151,10 +144,7 @@ def speaker_count(
(num_frames, 1)-shaped instantaneous speaker count
"""
- binarized: SlidingWindowFeature = binarize(
- segmentations, onset=onset, offset=offset, initial_state=False
- )
- trimmed = Inference.trim(binarized, warm_up=warm_up)
+ trimmed = Inference.trim(binarized_segmentations, warm_up=warm_up)
count = Inference.aggregate(
np.sum(trimmed, axis=-1, keepdims=True),
frames=frames,
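To make the intent of the changes above concrete, a small illustrative sketch (not part of the patch; shapes and values are made up) of the two safeguards introduced in `SpeakerDiarization.apply`: clipping the frame-wise speaker count to `max_speakers`, and zero-padding centroids when the final annotation has more labels than clusters.

    import numpy as np

    max_speakers = 3
    count = np.array([0, 1, 2, 4, 5, 2])     # frame-wise count, possibly overcounted
    count = np.minimum(count, max_speakers)  # -> [0, 1, 2, 3, 3, 2]

    num_labels = 4                           # speakers in the final annotation
    centroids = np.random.randn(3, 256)      # but only 3 clusters were found
    if num_labels > centroids.shape[0]:
        # append zero embeddings for the extra speakers
        centroids = np.pad(
            centroids, ((0, num_labels - centroids.shape[0]), (0, 0))
        )
    print(count, centroids.shape)            # ... (4, 256)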
From 3403b7a9a36fef836e3fa0fc934590a9ea4a1570 Mon Sep 17 00:00:00 2001
From: Hervé BREDIN
Date: Thu, 16 Nov 2023 12:57:12 +0100
Subject: [PATCH 17/19] improve(cli): store top 10 checkpoints rather than top 1
---
pyannote/audio/cli/train.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pyannote/audio/cli/train.py b/pyannote/audio/cli/train.py
index 9ab8b1658..74041554b 100644
--- a/pyannote/audio/cli/train.py
+++ b/pyannote/audio/cli/train.py
@@ -115,7 +115,7 @@ def configure_optimizers(self):
checkpoint = ModelCheckpoint(
monitor=monitor,
mode=direction,
- save_top_k=None if monitor is None else 1,
+ save_top_k=None if monitor is None else 10,
every_n_epochs=1,
save_last=True,
save_weights_only=False,
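For reference, a hedged example of the resulting Lightning callback configuration; the monitored metric name is an assumption, not taken from the patch.

    from pytorch_lightning.callbacks import ModelCheckpoint

    checkpoint = ModelCheckpoint(
        monitor="loss/val",   # assumed metric name
        mode="min",
        save_top_k=10,        # keep the 10 best checkpoints instead of just 1
        every_n_epochs=1,
        save_last=True,
        save_weights_only=False,
    )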
From ffd5b816caf272b7fce599f1b59ca09762b70b20 Mon Sep 17 00:00:00 2001
From: Hervé BREDIN
Date: Thu, 16 Nov 2023 13:24:08 +0100
Subject: [PATCH 18/19] doc: update changelog
---
CHANGELOG.md | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 19e25f36e..346d8ad26 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,12 @@
## `develop` branch
+## Version 3.1.0 (2023-11-16)
+
+### TL;DR
+
+[`pyannote/speaker-diarization-3.1`](https://hf.co/pyannote/speaker-diarization-3.1) no longer requires [unpopular](https://github.com/pyannote/pyannote-audio/issues/1537) ONNX runtime
+
### New features
- feat(model): add WeSpeaker embedding wrapper based on PyTorch
@@ -27,7 +33,7 @@
- BREAKING(setup): remove `onnxruntime` dependency.
You can still use ONNX `hbredin/wespeaker-voxceleb-resnet34-LM` but you will have to install `onnxruntime` yourself.
- BREAKING(pipeline): remove `logging_hook` (use `ArtifactHook` instead)
-- BREAKING(pipeline): remove `onset` and `offset` parameter in `SpeakerDiarizationMixin.speaker_count`
+- BREAKING(pipeline): remove `onset` and `offset` parameter in `SpeakerDiarizationMixin.speaker_count`
You should now binarize segmentations before passing them to `speaker_count`
## Version 3.0.1 (2023-09-28)
From eecc634df83c5c273b4aa0723e8e467efa1e7765 Mon Sep 17 00:00:00 2001
From: Hervé BREDIN
Date: Thu, 16 Nov 2023 13:24:41 +0100
Subject: [PATCH 19/19] setup: bump version
---
version.txt | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/version.txt b/version.txt
index cb2b00e4f..fd2a01863 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-3.0.1
+3.1.0