Add files via upload

dataprofessor · web-flow · commit 2d8dfdb2678b · 2020-09-06T23:00:35.000+07:00
diff --git a/python/Hummingbird_ML.ipynb b/python/Hummingbird_ML.ipynb
@@ -0,0 +1,360 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "name": "Hummingbird-ML.ipynb",
+      "provenance": [],
+      "collapsed_sections": []
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "accelerator": "GPU"
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "IpOdlr3WAPHJ",
+        "colab_type": "text"
+      },
+      "source": [
+        "# **Hummingbird-ML**\n",
+        "\n",
+        "[How to Harness GPU to Speed Up Machine Learning with Hummingbird-ML](https://www.youtube.com/watch?v=qN8jcUmo8TI)\n",
+        "\n",
+        "Adapted from: https://github.com/microsoft/hummingbird"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "ir3DZd5-_jiu",
+        "colab_type": "text"
+      },
+      "source": [
+        "# Install Hummingbird-ML"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "ra3JEgWN_bfp",
+        "colab_type": "code",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 408
+        },
+        "outputId": "4fae39de-26f0-4939-846d-039fb876725a"
+      },
+      "source": [
+        "! pip install hummingbird-ml[extra]"
+      ],
+      "execution_count": 1,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "Collecting hummingbird-ml[extra]\n",
+            "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/ed/3b/cf1b8c1e7531377adead8de29e29b00b5aed380544ad0def4c0188b50d80/hummingbird_ml-0.0.5-py2.py3-none-any.whl (60kB)\n",
+            "\r\u001b[K     |█████▌                          | 10kB 16.6MB/s eta 0:00:01\r\u001b[K     |███████████                     | 20kB 1.8MB/s eta 0:00:01\r\u001b[K     |████████████████▍               | 30kB 2.2MB/s eta 0:00:01\r\u001b[K     |█████████████████████▉          | 40kB 2.5MB/s eta 0:00:01\r\u001b[K     |███████████████████████████▎    | 51kB 2.0MB/s eta 0:00:01\r\u001b[K     |████████████████████████████████| 61kB 1.8MB/s \n",
+            "\u001b[?25hRequirement already satisfied: numpy>=1.15 in /usr/local/lib/python3.6/dist-packages (from hummingbird-ml[extra]) (1.18.5)\n",
+            "Requirement already satisfied: torch>=1.4.* in /usr/local/lib/python3.6/dist-packages (from hummingbird-ml[extra]) (1.6.0+cu101)\n",
+            "Collecting onnxconverter-common>=1.6.0\n",
+            "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/fe/7a/7e30c643cd7d2ad87689188ef34ce93e657bd14da3605f87bcdbc19cd5b1/onnxconverter_common-1.7.0-py2.py3-none-any.whl (64kB)\n",
+            "\u001b[K     |████████████████████████████████| 71kB 3.7MB/s \n",
+            "\u001b[?25hRequirement already satisfied: scikit-learn>=0.22.1 in /usr/local/lib/python3.6/dist-packages (from hummingbird-ml[extra]) (0.22.2.post1)\n",
+            "Requirement already satisfied: xgboost==0.90; extra == \"extra\" in /usr/local/lib/python3.6/dist-packages (from hummingbird-ml[extra]) (0.90)\n",
+            "Requirement already satisfied: lightgbm>=2.2; extra == \"extra\" in /usr/local/lib/python3.6/dist-packages (from hummingbird-ml[extra]) (2.2.3)\n",
+            "Requirement already satisfied: future in /usr/local/lib/python3.6/dist-packages (from torch>=1.4.*->hummingbird-ml[extra]) (0.16.0)\n",
+            "Collecting onnx\n",
+            "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/36/ee/bc7bc88fc8449266add978627e90c363069211584b937fd867b0ccc59f09/onnx-1.7.0-cp36-cp36m-manylinux1_x86_64.whl (7.4MB)\n",
+            "\u001b[K     |████████████████████████████████| 7.4MB 16.0MB/s \n",
+            "\u001b[?25hRequirement already satisfied: protobuf in /usr/local/lib/python3.6/dist-packages (from onnxconverter-common>=1.6.0->hummingbird-ml[extra]) (3.12.4)\n",
+            "Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.6/dist-packages (from scikit-learn>=0.22.1->hummingbird-ml[extra]) (0.16.0)\n",
+            "Requirement already satisfied: scipy>=0.17.0 in /usr/local/lib/python3.6/dist-packages (from scikit-learn>=0.22.1->hummingbird-ml[extra]) (1.4.1)\n",
+            "Requirement already satisfied: typing-extensions>=3.6.2.1 in /usr/local/lib/python3.6/dist-packages (from onnx->onnxconverter-common>=1.6.0->hummingbird-ml[extra]) (3.7.4.3)\n",
+            "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from onnx->onnxconverter-common>=1.6.0->hummingbird-ml[extra]) (1.15.0)\n",
+            "Requirement already satisfied: setuptools in /usr/local/lib/python3.6/dist-packages (from protobuf->onnxconverter-common>=1.6.0->hummingbird-ml[extra]) (49.6.0)\n",
+            "Installing collected packages: onnx, onnxconverter-common, hummingbird-ml\n",
+            "Successfully installed hummingbird-ml-0.0.5 onnx-1.7.0 onnxconverter-common-1.7.0\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "YnA-PmeA_q70",
+        "colab_type": "text"
+      },
+      "source": [
+        "# Import libraries"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "lkIThThi_puf",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "import numpy as np\n",
+        "from sklearn.ensemble import RandomForestClassifier\n",
+        "from hummingbird.ml import convert"
+      ],
+      "execution_count": 2,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "rFw_4cGa_-tF",
+        "colab_type": "text"
+      },
+      "source": [
+        "# Create some random data for binary classification"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "hGGngPPp__mx",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "num_classes = 2\n",
+        "X = np.random.rand(100000, 28)\n",
+        "y = np.random.randint(num_classes, size=100000)"
+      ],
+      "execution_count": 3,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "WusxNKH4AHII",
+        "colab_type": "text"
+      },
+      "source": [
+        "# Create and train a model (scikit-learn RandomForestClassifier)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "GMRJRuBwAGeV",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "skl_model = RandomForestClassifier(n_estimators=10, max_depth=10)"
+      ],
+      "execution_count": 4,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "M_kGo80yAYTn",
+        "colab_type": "code",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 34
+        },
+        "outputId": "aa863652-02f8-4578-8fb7-e3b028685cd7"
+      },
+      "source": [
+        "%%timeit\n",
+        "skl_model.fit(X, y)"
+      ],
+      "execution_count": 5,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "1 loop, best of 3: 4.78 s per loop\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "Hp4a8I0tAbBl",
+        "colab_type": "code",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 34
+        },
+        "outputId": "4e083fd5-981f-4238-9158-3f4500585560"
+      },
+      "source": [
+        "%%timeit\n",
+        "skl_model.predict(X)"
+      ],
+      "execution_count": 6,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "10 loops, best of 3: 85.6 ms per loop\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "mNiBvy9BA7wR",
+        "colab_type": "text"
+      },
+      "source": [
+        "# Use Hummingbird to convert the model to PyTorch"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "vcAOpuxxAzPc",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "model = convert(skl_model, 'pytorch')"
+      ],
+      "execution_count": 7,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "dpt6_4l8BF7e",
+        "colab_type": "text"
+      },
+      "source": [
+        "# Run predictions on CPU"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "_BiU63hNBDu-",
+        "colab_type": "code",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 34
+        },
+        "outputId": "1bd8b158-a62b-4fe0-be09-ca382c817247"
+      },
+      "source": [
+        "%%timeit\n",
+        "model.predict(X)"
+      ],
+      "execution_count": 8,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "1 loop, best of 3: 174 ms per loop\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "F10tJEMKBPZG",
+        "colab_type": "text"
+      },
+      "source": [
+        "# Run predictions on GPU"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "l2PUbqoHBJBX",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "model.to('cuda')"
+      ],
+      "execution_count": 9,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "-AB23_VTBRMP",
+        "colab_type": "code",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 51
+        },
+        "outputId": "b9efea7d-913c-4326-c14a-6b6ca0e9c063"
+      },
+      "source": [
+        "%%timeit\n",
+        "model.predict(X)"
+      ],
+      "execution_count": 10,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "The slowest run took 5.22 times longer than the fastest. This could mean that an intermediate result is being cached.\n",
+            "100 loops, best of 3: 14.8 ms per loop\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "dbkQU69JDt7T",
+        "colab_type": "text"
+      },
+      "source": [
+        "# Calculation Time"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "Hr1R_9nwDwpc",
+        "colab_type": "text"
+      },
+      "source": [
+        "Methods | Timing | Performance\n",
+        "--|--|--\n",
+        "scikit-learn | 85.6 ms | -\n",
+        "PyTorch (CPU) | 174 ms | 2 X slower than scikit-learn\n",
+        "PyTorch (GPU) | 14.8 ms | Almost 6 X faster than scikit-learn; Almost 12 X faster than PyTorch (CPU)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "9lmR3LHoEzhl",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        ""
+      ],
+      "execution_count": null,
+      "outputs": []
+    }
+  ]
+}