move pandas to experimental (run-llama#12419)
logan-markewich authored and chrisalexiuk-nvidia committed Apr 25, 2024
1 parent d96c966 commit 047576b
Showing 26 changed files with 614 additions and 364 deletions.
2 changes: 1 addition & 1 deletion docs/docs/api_reference/query_engine/pandas.md
@@ -1,4 +1,4 @@
::: llama_index.core.query_engine
::: llama_index.experimental.query_engine
options:
members:
- PandasQueryEngine
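
For orientation, here is a minimal sketch of what the relocated import looks like in use. The dataframe, query, and `verbose=True` flag are illustrative (not part of this commit), and an OpenAI API key is assumed for the default LLM:

```python
# Minimal sketch, assuming `llama-index-experimental` is installed and an OpenAI key is set.
import pandas as pd
from llama_index.experimental.query_engine import PandasQueryEngine

df = pd.DataFrame({"city": ["Toronto", "Tokyo"], "population": [2_930_000, 13_960_000]})

# Per the docs changes below: the engine eval()s LLM-generated pandas code,
# so only run it in a sandboxed or otherwise trusted environment.
query_engine = PandasQueryEngine(df=df, verbose=True)
print(query_engine.query("Which city has the highest population?"))
```
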
13 changes: 10 additions & 3 deletions docs/docs/examples/pipeline/query_pipeline_pandas.ipynb
@@ -9,7 +9,12 @@
"\n",
"This is a simple example that builds a query pipeline that can perform structured operations over a Pandas DataFrame to satisfy a user query, using LLMs to infer the set of operations.\n",
"\n",
"This can be treated as the \"from-scratch\" version of our `PandasQueryEngine`."
"This can be treated as the \"from-scratch\" version of our `PandasQueryEngine`.\n",
"\n",
"WARNING: This tool provides the LLM access to the `eval` function.\n",
"Arbitrary code execution is possible on the machine running this tool.\n",
"This tool is not recommended to be used in a production setting, and would\n",
"require heavy sandboxing or virtual machines."
]
},
{
@@ -19,7 +24,7 @@
"metadata": {},
"outputs": [],
"source": [
"%pip install llama-index-llms-openai"
"%pip install llama-index-llms-openai llama-index-experimental"
]
},
{
@@ -34,7 +39,9 @@
" Link,\n",
" InputComponent,\n",
")\n",
"from llama_index.core.query_engine.pandas import PandasInstructionParser\n",
"from llama_index.experimental.query_engine.pandas import (\n",
" PandasInstructionParser,\n",
")\n",
"from llama_index.llms.openai import OpenAI\n",
"from llama_index.core import PromptTemplate"
]
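
A condensed sketch of the pipeline this notebook assembles with the updated imports. Variable names and the prompt text are illustrative rather than the notebook's exact code, and an OpenAI API key is assumed:

```python
# Sketch only: a query pipeline that asks the LLM for a pandas expression and then
# executes it with PandasInstructionParser (eval-based -- sandbox in production).
import pandas as pd
from llama_index.core import PromptTemplate
from llama_index.core.query_pipeline import QueryPipeline, InputComponent
from llama_index.experimental.query_engine.pandas import PandasInstructionParser
from llama_index.llms.openai import OpenAI

df = pd.DataFrame({"city": ["Toronto", "Tokyo"], "population": [2_930_000, 13_960_000]})

pandas_prompt = PromptTemplate(
    "You are working with a pandas dataframe in Python named `df`.\n"
    "This is the result of `print(df.head())`:\n{df_str}\n\n"
    "Query: {query_str}\n\nExpression:"
).partial_format(df_str=str(df.head()))

qp = QueryPipeline(
    modules={
        "input": InputComponent(),
        "pandas_prompt": pandas_prompt,
        "llm": OpenAI(model="gpt-3.5-turbo"),
        "output_parser": PandasInstructionParser(df),
    },
    verbose=True,
)
qp.add_chain(["input", "pandas_prompt", "llm", "output_parser"])
print(qp.run(query_str="Which city has the highest population?"))
```
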
9 changes: 6 additions & 3 deletions docs/docs/examples/query_engine/pandas_query_engine.ipynb
@@ -20,7 +20,10 @@
"\n",
"The input to the `PandasQueryEngine` is a Pandas dataframe, and the output is a response. The LLM infers dataframe operations to perform in order to retrieve the result.\n",
"\n",
"**NOTE**: We have measures in PandasQueryEngine to enforce safety and prevent arbitrary code execution. For instance, no execution of private/dunder methods, and access to a restricted set of globals.\n"
"**WARNING:** This tool provides the LLM access to the `eval` function.\n",
"Arbitrary code execution is possible on the machine running this tool.\n",
"While some level of filtering is done on code, this tool is not recommended \n",
"to be used in a production setting without heavy sandboxing or virtual machines.\n"
]
},
{
@@ -38,7 +41,7 @@
"metadata": {},
"outputs": [],
"source": [
"!pip install llama-index"
"!pip install llama-index llama-index-experimental"
]
},
{
@@ -53,7 +56,7 @@
"from IPython.display import Markdown, display\n",
"\n",
"import pandas as pd\n",
"from llama_index.core.query_engine import PandasQueryEngine\n",
"from llama_index.experimental.query_engine import PandasQueryEngine\n",
"\n",
"\n",
"logging.basicConfig(stream=sys.stdout, level=logging.INFO)\n",
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@
"source": [
"%pip install llama-index-embeddings-openai\n",
"%pip install llama-index-readers-file pymupdf\n",
"%pip install llama-index-llms-openai"
"%pip install llama-index-llms-openai\n",
"%pip install llama-index-experimental"
]
},
{
@@ -46,7 +47,7 @@
"\n",
"# https://en.wikipedia.org/wiki/The_World%27s_Billionaires\n",
"from llama_index.core import VectorStoreIndex\n",
"from llama_index.core.query_engine import PandasQueryEngine\n",
"from llama_index.experimental.query_engine import PandasQueryEngine\n",
"from llama_index.core.schema import IndexNode\n",
"from llama_index.llms.openai import OpenAI\n",
"\n",
@@ -594,7 +595,12 @@
"\n",
"We create a pandas query engine over each structured table.\n",
"\n",
"These can be executed on their own to answer queries about each table."
"These can be executed on their own to answer queries about each table.\n",
"\n",
"**WARNING:** This tool provides the LLM access to the `eval` function.\n",
"Arbitrary code execution is possible on the machine running this tool.\n",
"While some level of filtering is done on code, this tool is not recommended \n",
"to be used in a production setting without heavy sandboxing or virtual machines."
]
},
{
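
Roughly what the updated cell does with the new import: one query engine per extracted table. The `table_dfs` placeholder stands in for the tables the notebook pulls from the PDF (assumed name and data), and an OpenAI API key is assumed:

```python
# Sketch: build a PandasQueryEngine per table; each one eval()s LLM-generated code,
# hence the sandboxing warning above.
import pandas as pd
from llama_index.experimental.query_engine import PandasQueryEngine
from llama_index.llms.openai import OpenAI

# Placeholder for the per-page tables the notebook extracts from the billionaires PDF.
table_dfs = [pd.DataFrame({"Name": ["A", "B"], "Net worth": ["$10 B", "$9 B"]})]

llm = OpenAI(model="gpt-4")
df_query_engines = [PandasQueryEngine(df, llm=llm) for df in table_dfs]
print(df_query_engines[0].query("Whose net worth is highest?"))
```
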
1 change: 1 addition & 0 deletions docs/mkdocs.yml
@@ -1345,6 +1345,7 @@ plugins:
signature_crossrefs: true
paths:
- ../llama-index-core
- ../llama-index-experimental
- ../llama-index-packs/llama-index-packs-retry-engine-weaviate
- ../llama-index-packs/llama-index-packs-llava-completion
- ../llama-index-packs/llama-index-packs-zephyr-query-engine
2 changes: 1 addition & 1 deletion llama-index-cli/llama_index/cli/upgrade/mappings.json
@@ -254,7 +254,7 @@
"RetrySourceQueryEngine": "llama_index.core.query_engine",
"RetryGuidelineQueryEngine": "llama_index.core.query_engine",
"FLAREInstructQueryEngine": "llama_index.core.query_engine",
"PandasQueryEngine": "llama_index.core.query_engine",
"PandasQueryEngine": "llama_index.experimental.query_engine",
"JSONalyzeQueryEngine": "llama_index.core.query_engine",
"KnowledgeGraphQueryEngine": "llama_index.core.query_engine",
"BaseQueryEngine": "llama_index.core.query_engine",
Original file line number Diff line number Diff line change
@@ -254,7 +254,7 @@
"RetrySourceQueryEngine": "llama_index.core.query_engine",
"RetryGuidelineQueryEngine": "llama_index.core.query_engine",
"FLAREInstructQueryEngine": "llama_index.core.query_engine",
"PandasQueryEngine": "llama_index.core.query_engine",
"PandasQueryEngine": "llama_index.experimental.query_engine",
"JSONalyzeQueryEngine": "llama_index.core.query_engine",
"KnowledgeGraphQueryEngine": "llama_index.core.query_engine",
"BaseQueryEngine": "llama_index.core.query_engine",
10 changes: 10 additions & 0 deletions llama-index-core/llama_index/core/exec_utils.py
@@ -86,6 +86,11 @@ def _get_restricted_globals(__globals: Union[dict, None]) -> Any:
return restricted_globals


vulnerable_code_snippets = [
"os.",
]


class DunderVisitor(ast.NodeVisitor):
def __init__(self) -> None:
self.has_access_to_private_entity = False
@@ -123,6 +128,11 @@ def _contains_protected_access(code: str) -> bool:

dunder_visitor = DunderVisitor()
dunder_visitor.visit(tree)

for vulnerable_code_snippet in vulnerable_code_snippets:
if vulnerable_code_snippet in code:
dunder_visitor.has_access_to_disallowed_builtin = True

return (
dunder_visitor.has_access_to_private_entity
or dunder_visitor.has_access_to_disallowed_builtin
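
To see the effect of the added check, a small illustration. `_contains_protected_access` is a private helper, so this is for demonstration only:

```python
# Sketch: the substring check added above flags generated code that touches `os.`,
# on top of the existing dunder / disallowed-builtin restrictions.
from llama_index.core.exec_utils import _contains_protected_access

print(_contains_protected_access("os.system('echo hi')"))    # True  -> rejected
print(_contains_protected_access("df['population'].mean()")) # False -> allowed
```
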
84 changes: 13 additions & 71 deletions llama-index-core/llama_index/core/indices/struct_store/pandas.py
@@ -1,83 +1,25 @@
"""Pandas csv structured store."""
"""Pandas csv structured store.
import logging
from typing import Any, Optional, Sequence
DEPRECATED: Please use :class:`PandasQueryEngine` in `llama-index-experimental` instead.
"""

import pandas as pd
from llama_index.core.base.base_query_engine import BaseQueryEngine
from llama_index.core.base.base_retriever import BaseRetriever
from llama_index.core.data_structs.table import PandasStructTable
from llama_index.core.indices.struct_store.base import BaseStructStoreIndex
from llama_index.core.llms.utils import LLMType
from llama_index.core.schema import BaseNode
from typing import Any

logger = logging.getLogger(__name__)


class PandasIndex(BaseStructStoreIndex[PandasStructTable]):
"""Pandas Index.
Deprecated. Please use :class:`PandasQueryEngine` instead.
The PandasIndex is an index that stores
a Pandas dataframe under the hood.
Currently index "construction" is not supported.
During query time, the user can either specify a raw SQL query
or a natural language query to retrieve their data.
Args:
pandas_df (Optional[pd.DataFrame]): Pandas dataframe to use.
See :ref:`Ref-Struct-Store` for more details.
"""

index_struct_cls = PandasStructTable

class PandasIndex:
def __init__(
self,
df: pd.DataFrame,
nodes: Optional[Sequence[BaseNode]] = None,
index_struct: Optional[PandasStructTable] = None,
*args: Any,
**kwargs: Any,
) -> None:
"""Initialize params."""
logger.warning(
"PandasIndex is deprecated. \
Please directly use `PandasQueryEngine(df)` instead."
raise DeprecationWarning(
"PandasQueryEngine has been moved to `llama-index-experimental`.\n"
"`pip install llama-index-experimental`\n"
"`from llama_index.experimental.query_engine import PandasQueryEngine`\n"
"Note that the PandasQueryEngine allows for arbitrary code execution, \n"
"and should be used in a secure environment."
)

if nodes is not None:
raise ValueError("We currently do not support indexing documents or nodes.")
self.df = df

super().__init__(
nodes=[],
index_struct=index_struct,
**kwargs,
)

def as_retriever(self, **kwargs: Any) -> BaseRetriever:
raise NotImplementedError("Not supported")

def as_query_engine(
self, llm: Optional[LLMType] = None, **kwargs: Any
) -> BaseQueryEngine:
# NOTE: lazy import
from llama_index.core.query_engine.pandas.pandas_query_engine import (
PandasQueryEngine,
)

return PandasQueryEngine.from_index(self, llm=llm, **kwargs)

def _build_index_from_nodes(self, nodes: Sequence[BaseNode]) -> PandasStructTable:
"""Build index from documents."""
return self.index_struct_cls()

def _insert(self, nodes: Sequence[BaseNode], **insert_kwargs: Any) -> None:
"""Insert a document."""
raise NotImplementedError("We currently do not support inserting documents.")


# legacy
# Legacy
GPTPandasIndex = PandasIndex
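
With this change, constructing the index in core fails fast instead of logging a warning and delegating; a sketch of the new behavior:

```python
# Sketch: PandasIndex in core now raises immediately, pointing at llama-index-experimental.
import pandas as pd
from llama_index.core.indices.struct_store.pandas import PandasIndex

try:
    PandasIndex(pd.DataFrame({"a": [1, 2, 3]}))
except DeprecationWarning as err:
    print(err)
```
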
20 changes: 0 additions & 20 deletions llama-index-core/llama_index/core/prompts/default_prompts.py
@@ -361,26 +361,6 @@
)


############################################
# Pandas
############################################

DEFAULT_PANDAS_TMPL = (
"You are working with a pandas dataframe in Python.\n"
"The name of the dataframe is `df`.\n"
"This is the result of `print(df.head())`:\n"
"{df_str}\n\n"
"Follow these instructions:\n"
"{instruction_str}\n"
"Query: {query_str}\n\n"
"Expression:"
)

DEFAULT_PANDAS_PROMPT = PromptTemplate(
DEFAULT_PANDAS_TMPL, prompt_type=PromptType.PANDAS
)


############################################
# JSON Path
############################################
Original file line number Diff line number Diff line change
@@ -1,86 +1,23 @@
"""Pandas output parser."""
"""Pandas output parser.
import logging
from typing import Any, Dict, Optional
DEPRECATED: This class has been moved to `llama-index-experimental`.
"""

import numpy as np
import pandas as pd
from llama_index.core.exec_utils import safe_eval, safe_exec
from llama_index.core.output_parsers.base import ChainableOutputParser
from llama_index.core.output_parsers.utils import parse_code_markdown
from typing import Any

logger = logging.getLogger(__name__)


def default_output_processor(
output: str, df: pd.DataFrame, **output_kwargs: Any
) -> str:
"""Process outputs in a default manner."""
import ast
import sys
import traceback

if sys.version_info < (3, 9):
logger.warning(
"Python version must be >= 3.9 in order to use "
"the default output processor, which executes "
"the Python query. Instead, we will return the "
"raw Python instructions as a string."
)
return output

local_vars = {"df": df}
global_vars = {"np": np, "pd": pd}

output = parse_code_markdown(output, only_last=True)[0]

# NOTE: inspired from langchain's tool
# see langchain.tools.python.tool (PythonAstREPLTool)
try:
tree = ast.parse(output)
module = ast.Module(tree.body[:-1], type_ignores=[])
safe_exec(ast.unparse(module), {}, local_vars) # type: ignore
module_end = ast.Module(tree.body[-1:], type_ignores=[])
module_end_str = ast.unparse(module_end) # type: ignore
if module_end_str.strip("'\"") != module_end_str:
# if there's leading/trailing quotes, then we need to eval
# string to get the actual expression
module_end_str = safe_eval(module_end_str, global_vars, local_vars)
try:
# str(pd.dataframe) will truncate output by display.max_colwidth
# set width temporarily to extract more text
if "max_colwidth" in output_kwargs:
pd.set_option("display.max_colwidth", output_kwargs["max_colwidth"])
output_str = str(safe_eval(module_end_str, global_vars, local_vars))
pd.reset_option("display.max_colwidth")
return output_str

except Exception:
raise
except Exception as e:
err_string = (
"There was an error running the output as Python code. "
f"Error message: {e}"
)
traceback.print_exc()
return err_string


class PandasInstructionParser(ChainableOutputParser):
class PandasInstructionParser:
"""Pandas instruction parser.
This 'output parser' takes in pandas instructions (in Python code) and
executes them to return an output.
DEPRECATED: This class has been moved to `llama-index-experimental`.
"""

def __init__(
self, df: pd.DataFrame, output_kwargs: Optional[Dict[str, Any]] = None
) -> None:
"""Initialize params."""
self.df = df
self.output_kwargs = output_kwargs or {}

def parse(self, output: str) -> Any:
"""Parse, validate, and correct errors programmatically."""
return default_output_processor(output, self.df, **self.output_kwargs)
def __init__(self, *args: Any, **kwargs: Any) -> None:
raise DeprecationWarning(
"PandasInstructionParser has been moved to `llama-index-experimental`.\n"
"`pip install llama-index-experimental`\n"
"`from llama_index.experimental.query_engine.pandas import PandasInstructionParser`\n"
"Note that the PandasInstructionParser allows for arbitrary code execution, \n"
"and should be used in a secure environment."
)
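
For completeness, a sketch of the replacement import that the deprecation message points to, assuming the experimental parser behaves like the removed core implementation:

```python
# Sketch: the experimental parser executes an LLM-produced pandas expression against `df`
# via eval/exec, which is exactly why the warnings above call for sandboxing.
import pandas as pd
from llama_index.experimental.query_engine.pandas import PandasInstructionParser

df = pd.DataFrame({"population": [2_930_000, 13_960_000]}, index=["Toronto", "Tokyo"])
parser = PandasInstructionParser(df)
print(parser.parse("df['population'].idxmax()"))  # -> Tokyo
```
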