ML/LlamaIndex: Add software tests and CI configuration (#707)
* ML/LlamaIndex: Adjustments to make it work with non-Azure OpenAI

* ML/LlamaIndex: Add software tests and CI configuration
amotl authored Nov 6, 2024
1 parent ca8a44f commit 7c163f2
Showing 11 changed files with 263 additions and 130 deletions.
5 changes: 5 additions & 0 deletions .github/dependabot.yml
@@ -114,6 +114,11 @@ updates:
schedule:
interval: "daily"

- directory: "/topic/machine-learning/llama-index"
package-ecosystem: "pip"
schedule:
interval: "daily"

- directory: "/topic/machine-learning/mlops-mlflow"
package-ecosystem: "pip"
schedule:
82 changes: 82 additions & 0 deletions .github/workflows/ml-llamaindex.yml
@@ -0,0 +1,82 @@
name: LlamaIndex

on:
pull_request:
branches: ~
paths:
- '.github/workflows/ml-llamaindex.yml'
- 'topic/machine-learning/llama-index/**'
- '/requirements.txt'
push:
branches: [ main ]
paths:
- '.github/workflows/ml-llamaindex.yml'
- 'topic/machine-learning/llama-index/**'
- '/requirements.txt'

# Allow job to be triggered manually.
workflow_dispatch:

# Run job each night after CrateDB nightly has been published.
schedule:
- cron: '0 3 * * *'

# Cancel in-progress jobs when pushing to the same branch.
concurrency:
cancel-in-progress: true
group: ${{ github.workflow }}-${{ github.ref }}

jobs:
test:
name: "
Python: ${{ matrix.python-version }}
CrateDB: ${{ matrix.cratedb-version }}
on ${{ matrix.os }}"
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [
'ubuntu-latest',
]
python-version: [
'3.8',
'3.13',
]
cratedb-version: [ 'nightly' ]

services:
cratedb:
image: crate/crate:${{ matrix.cratedb-version }}
ports:
- 4200:4200
- 5432:5432
env:
CRATE_HEAP_SIZE: 4g

env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}

steps:

- name: Acquire sources
uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
architecture: x64
cache: 'pip'
cache-dependency-path: |
requirements.txt
topic/machine-learning/llama-index/requirements.txt
topic/machine-learning/llama-index/requirements-dev.txt
- name: Install utilities
run: |
pip install -r requirements.txt
- name: Validate topic/machine-learning/llama-index
run: |
ngr test --accept-no-venv topic/machine-learning/llama-index
19 changes: 14 additions & 5 deletions topic/machine-learning/llama-index/README.md
@@ -4,16 +4,25 @@ This folder contains the codebase for [this tutorial](https://community.cratedb.

This has been tested using:

- * Python 3.12.2
- * macOS Sequoia 15.0.1
- * CrateDB 5.8.3 running in CrateDB Cloud on AWS Europe (Ireland)
+ * Python 3.12
+ * macOS
+ * CrateDB 5.8 and higher

## Database Setup

You will need a CrateDB Cloud database: sign up [here](https://console.cratedb.cloud/) and use the free "CRFREE" tier.

Make a note of the hostname, username and password for your database. You'll need those when configuring the environment file later.

If you don't use CrateDB Cloud, you can also provide an instance for testing
purposes like this:

```shell
docker run --rm -it --name=cratedb \
--publish=4200:4200 --publish=5432:5432 \
--env=CRATE_HEAP_SIZE=2g crate:latest -Cdiscovery.type=single-node
```
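
With the container running, you can verify connectivity before wiring up the application. The helper below is an illustrative sketch (not part of this repository) that derives CrateDB's HTTP endpoint from a `crate://` SQLAlchemy URL, assuming the default HTTP port 4200:

```python
from urllib.parse import urlsplit


def cratedb_http_url(sqlalchemy_url: str) -> str:
    """Derive CrateDB's HTTP endpoint from a `crate://` SQLAlchemy URL."""
    parts = urlsplit(sqlalchemy_url)
    # CrateDB Cloud URLs carry `?ssl=true`; the local Docker instance does not.
    scheme = "https" if "ssl=true" in parts.query else "http"
    return f"{scheme}://{parts.hostname}:{parts.port or 4200}/"


print(cratedb_http_url("crate://crate@localhost:4200/"))  # → http://localhost:4200/
```

Opening that URL in a browser (or with `curl`) should respond once the container has finished starting.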

Create a table in CrateDB:

```sql
@@ -61,7 +70,7 @@ pip install -r requirements.txt

## Configure your Environment

-To configure your environment, copy the provided [`env.example`](./env.example) file to a new file named `.env`, then open it with a text editor.
+To configure your environment, copy the provided [`env.azure`](./env.azure) or [`env.standalone`](./env.standalone) file to a new file named `.env`, then open it with a text editor.

Set the values in the file as follows:

@@ -72,7 +81,7 @@ OPENAI_AZURE_ENDPOINT=https://<Your endpoint from Azure e.g. myendpoint.openai.a
OPENAI_AZURE_API_VERSION=2024-08-01-preview
LLM_INSTANCE=<The name of your Chat GPT 3.5 turbo instance from Azure>
EMBEDDING_MODEL_INSTANCE=<The name of your Text Embedding Ada 2.0 instance from Azure>
-CRATEDB_URL="crate://<Database user name>:<Database password>@<Database host>:4200/?ssl=true"
+CRATEDB_SQLALCHEMY_URL="crate://<Database user name>:<Database password>@<Database host>:4200/?ssl=true"
CRATEDB_TABLE_NAME=time_series_data
```
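
`main.py` reads these settings via `python-dotenv`'s `load_dotenv()`. As a rough, simplified illustration of what that loading step does (the real library additionally handles quoting rules, variable interpolation, and multiline values), a minimal `.env` parser looks like this:

```python
import os


def load_env_file(text: str) -> dict:
    """Parse simple KEY=VALUE lines, skipping blanks and `#` comments."""
    env = {}
    for line in text.splitlines():
        line = line.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue
        key, _, value = line.partition("=")
        env[key.strip()] = value.strip().strip('"')
    return env


sample = """
# OPENAI_API_KEY=sk-invalid
OPENAI_API_TYPE=openai
CRATEDB_SQLALCHEMY_URL="crate://crate@localhost:4200/"
CRATEDB_TABLE_NAME=time_series_data
"""
env = load_env_file(sample)
os.environ.update(env)
print(env["CRATEDB_TABLE_NAME"])  # → time_series_data
```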

@@ -4,5 +4,5 @@ OPENAI_AZURE_ENDPOINT=https://TODO.openai.azure.com
OPENAI_AZURE_API_VERSION=2024-08-01-preview
LLM_INSTANCE=TODO
EMBEDDING_MODEL_INSTANCE=TODO
-CRATEDB_URL="crate://USER:PASSWORD@HOST:4200/?ssl=true"
-CRATEDB_TABLE_NAME=time_series_data
+CRATEDB_SQLALCHEMY_URL="crate://USER:PASSWORD@HOST:4200/?ssl=true"
+CRATEDB_TABLE_NAME=time_series_data
4 changes: 4 additions & 0 deletions topic/machine-learning/llama-index/env.standalone
@@ -0,0 +1,4 @@
# OPENAI_API_KEY=sk-XJZ7pfog5Gp8Kus8D--invalid--0CJ5lyAKSefZLaV1Y9S1
OPENAI_API_TYPE=openai
CRATEDB_SQLALCHEMY_URL="crate://crate@localhost:4200/"
CRATEDB_TABLE_NAME=time_series_data
23 changes: 23 additions & 0 deletions topic/machine-learning/llama-index/init.sql
@@ -0,0 +1,23 @@
CREATE TABLE IF NOT EXISTS time_series_data (
timestamp TIMESTAMP,
value DOUBLE,
location STRING,
sensor_id INT
);

INSERT INTO time_series_data (timestamp, value, location, sensor_id)
VALUES
('2023-09-14T00:00:00', 10.5, 'Sensor A', 1),
('2023-09-14T01:00:00', 15.2, 'Sensor A', 1),
('2023-09-14T02:00:00', 18.9, 'Sensor A', 1),
('2023-09-14T03:00:00', 12.7, 'Sensor B', 2),
('2023-09-14T04:00:00', 17.3, 'Sensor B', 2),
('2023-09-14T05:00:00', 20.1, 'Sensor B', 2),
('2023-09-14T06:00:00', 22.5, 'Sensor A', 1),
('2023-09-14T07:00:00', 18.3, 'Sensor A', 1),
('2023-09-14T08:00:00', 16.8, 'Sensor A', 1),
('2023-09-14T09:00:00', 14.6, 'Sensor B', 2),
('2023-09-14T10:00:00', 13.2, 'Sensor B', 2),
('2023-09-14T11:00:00', 11.7, 'Sensor B', 2);

REFRESH TABLE time_series_data;
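
`REFRESH TABLE` makes the freshly inserted rows visible to the queries that follow, since CrateDB refreshes tables periodically rather than on every write. The sample data also makes the tutorial's question ("What is the average value for sensor 1?") checkable by hand; a quick stdlib sketch, with values copied from the `INSERT` statement:

```python
# (value, sensor_id) pairs from init.sql; timestamp and location omitted,
# since they do not affect the average.
rows = [
    (10.5, 1), (15.2, 1), (18.9, 1),
    (12.7, 2), (17.3, 2), (20.1, 2),
    (22.5, 1), (18.3, 1), (16.8, 1),
    (14.6, 2), (13.2, 2), (11.7, 2),
]
sensor_1 = [value for value, sensor_id in rows if sensor_id == 1]
average = sum(sensor_1) / len(sensor_1)
print(round(average, 2))  # → 17.03
```

This is the figure the natural-language query engine should arrive at for sensor 1.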
79 changes: 56 additions & 23 deletions topic/machine-learning/llama-index/main.py
@@ -1,59 +1,92 @@
""" Example code using Azure Open AI and llama-index. """
"""
Use an LLM to query a database in human language.
Example code using LlamaIndex with vanilla Open AI and Azure Open AI.
"""

import os
import openai
import sqlalchemy as sa

from dotenv import load_dotenv
from langchain_openai import AzureOpenAIEmbeddings
from langchain_openai import OpenAIEmbeddings
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.langchain import LangchainEmbedding
from llama_index.core.utilities.sql_wrapper import SQLDatabase
from llama_index.core.query_engine import NLSQLTableQueryEngine
from llama_index.core import Settings

if __name__ == "__main__":
load_dotenv()

def configure_llm():
"""
Configure LLM. Use either vanilla Open AI, or Azure Open AI.
"""

openai.api_type = os.getenv("OPENAI_API_TYPE")
openai.azure_endpoint = os.getenv("OPENAI_AZURE_ENDPOINT")
openai.api_version = os.getenv("OPENAI_AZURE_API_VERSION")
openai.api_key = os.getenv("OPENAI_API_KEY")

llm = AzureOpenAI(
engine=os.getenv("LLM_INSTANCE"),
azure_endpoint=os.getenv("OPENAI_AZURE_ENDPOINT"),
api_key = os.getenv("OPENAI_API_KEY"),
api_version = os.getenv("OPENAI_AZURE_API_VERSION"),
temperature=0.0
)
if openai.api_type == "openai":
llm = OpenAI(
api_key=os.getenv("OPENAI_API_KEY"),
temperature=0.0
)
elif openai.api_type == "azure":
llm = AzureOpenAI(
engine=os.getenv("LLM_INSTANCE"),
azure_endpoint=os.getenv("OPENAI_AZURE_ENDPOINT"),
api_key = os.getenv("OPENAI_API_KEY"),
api_version = os.getenv("OPENAI_AZURE_API_VERSION"),
temperature=0.0
)
else:
raise ValueError(f"Open AI API type not defined or invalid: {openai.api_type}")

Settings.llm = llm
Settings.embed_model = LangchainEmbedding(
AzureOpenAIEmbeddings(
azure_endpoint=os.getenv("OPENAI_AZURE_ENDPOINT"),
model=os.getenv("EMBEDDING_MODEL_INSTANCE")
if openai.api_type == "openai":
Settings.embed_model = LangchainEmbedding(OpenAIEmbeddings())
elif openai.api_type == "azure":
Settings.embed_model = LangchainEmbedding(
AzureOpenAIEmbeddings(
azure_endpoint=os.getenv("OPENAI_AZURE_ENDPOINT"),
model=os.getenv("EMBEDDING_MODEL_INSTANCE")
)
)
)

print("Creating SQLAlchemy engine...")
engine_crate = sa.create_engine(os.getenv("CRATEDB_URL"))
print("Connecting to CrateDB...")

def main():
"""
Use an LLM to query a database in human language.
"""

# Configure application.
load_dotenv()
configure_llm()

# Configure database connection and query engine.
print("Connecting to CrateDB")
engine_crate = sa.create_engine(os.getenv("CRATEDB_SQLALCHEMY_URL"))
engine_crate.connect()
print("Creating SQLDatabase instance...")

print("Creating LlamaIndex QueryEngine")
sql_database = SQLDatabase(engine_crate, include_tables=[os.getenv("CRATEDB_TABLE_NAME")])
print("Creating QueryEngine...")
query_engine = NLSQLTableQueryEngine(
sql_database=sql_database,
tables=[os.getenv("CRATEDB_TABLE_NAME")],
llm = llm
llm=Settings.llm
)

print("Running query...")

# Invoke an inquiry.
print("Running query")
QUERY_STR = "What is the average value for sensor 1?"
answer = query_engine.query(QUERY_STR)
print(answer.get_formatted_sources())
print("Query was:", QUERY_STR)
print("Answer was:", answer)
print(answer.metadata)


if __name__ == "__main__":
main()
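
The essential control flow in `configure_llm` is a dispatch on `OPENAI_API_TYPE` with a hard failure for unknown values. A standalone sketch of that pattern (a hypothetical helper, with plain strings standing in for the real `OpenAI`/`AzureOpenAI` constructors):

```python
def select_backend(api_type):
    """Mirror configure_llm's branching: vanilla Open AI, Azure, or an error."""
    if api_type == "openai":
        return "vanilla-openai"
    elif api_type == "azure":
        return "azure-openai"
    raise ValueError(f"Open AI API type not defined or invalid: {api_type}")


print(select_backend("openai"))  # → vanilla-openai
```

Failing fast on a missing or misspelled `OPENAI_API_TYPE` surfaces configuration mistakes before any network call is made.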
27 changes: 27 additions & 0 deletions topic/machine-learning/llama-index/pyproject.toml
@@ -0,0 +1,27 @@
[tool.pytest.ini_options]
minversion = "2.0"
addopts = """
-rfEX -p pytester --strict-markers --verbosity=3 --capture=no
--cov=. --cov-report=term-missing --cov-report=xml
"""

#log_level = "DEBUG"
#log_cli_level = "DEBUG"

testpaths = [
"*.py",
]
xfail_strict = true
markers = [
]

[tool.coverage.run]
branch = false

[tool.coverage.report]
fail_under = 0
show_missing = true
omit = [
"conftest.py",
"test*.py",
]
3 changes: 3 additions & 0 deletions topic/machine-learning/llama-index/requirements-dev.txt
@@ -0,0 +1,3 @@
cratedb-toolkit
pueblo[testing]
sqlparse