diff --git a/docs/docs/api_reference/query_engine/pandas.md b/docs/docs/api_reference/query_engine/pandas.md index b7293b27ff976d..34c9740e3973ee 100644 --- a/docs/docs/api_reference/query_engine/pandas.md +++ b/docs/docs/api_reference/query_engine/pandas.md @@ -1,4 +1,4 @@ -::: llama_index.core.query_engine +::: llama_index.experimental.query_engine options: members: - PandasQueryEngine diff --git a/docs/docs/examples/pipeline/query_pipeline_pandas.ipynb b/docs/docs/examples/pipeline/query_pipeline_pandas.ipynb index e8041a0cfcbde4..100bade97b46c9 100644 --- a/docs/docs/examples/pipeline/query_pipeline_pandas.ipynb +++ b/docs/docs/examples/pipeline/query_pipeline_pandas.ipynb @@ -9,7 +9,12 @@ "\n", "This is a simple example that builds a query pipeline that can perform structured operations over a Pandas DataFrame to satisfy a user query, using LLMs to infer the set of operations.\n", "\n", - "This can be treated as the \"from-scratch\" version of our `PandasQueryEngine`." + "This can be treated as the \"from-scratch\" version of our `PandasQueryEngine`.\n", + "\n", + "WARNING: This tool provides the LLM access to the `eval` function.\n", + "Arbitrary code execution is possible on the machine running this tool.\n", + "This tool is not recommended to be used in a production setting, and would\n", + "require heavy sandboxing or virtual machines." 
] }, { @@ -19,7 +24,7 @@ "metadata": {}, "outputs": [], "source": [ - "%pip install llama-index-llms-openai" + "%pip install llama-index-llms-openai llama-index-experimental" ] }, { @@ -34,7 +39,9 @@ " Link,\n", " InputComponent,\n", ")\n", - "from llama_index.core.query_engine.pandas import PandasInstructionParser\n", + "from llama_index.experimental.query_engine.pandas import (\n", + " PandasInstructionParser,\n", + ")\n", "from llama_index.llms.openai import OpenAI\n", "from llama_index.core import PromptTemplate" ] diff --git a/docs/docs/examples/query_engine/pandas_query_engine.ipynb b/docs/docs/examples/query_engine/pandas_query_engine.ipynb index 281ef2a239f5b8..3343353f2b7f7e 100644 --- a/docs/docs/examples/query_engine/pandas_query_engine.ipynb +++ b/docs/docs/examples/query_engine/pandas_query_engine.ipynb @@ -20,7 +20,10 @@ "\n", "The input to the `PandasQueryEngine` is a Pandas dataframe, and the output is a response. The LLM infers dataframe operations to perform in order to retrieve the result.\n", "\n", - "**NOTE**: We have measures in PandasQueryEngine to enforce safety and prevent arbitrary code execution. 
For instance, no execution of private/dunder methods, and access to a restricted set of globals.\n" + "**WARNING:** This tool provides the LLM access to the `eval` function.\n", + "Arbitrary code execution is possible on the machine running this tool.\n", + "While some level of filtering is done on code, this tool is not recommended \n", + "to be used in a production setting without heavy sandboxing or virtual machines.\n" ] }, { @@ -38,7 +41,7 @@ "metadata": {}, "outputs": [], "source": [ - "!pip install llama-index" + "!pip install llama-index llama-index-experimental" ] }, { @@ -53,7 +56,7 @@ "from IPython.display import Markdown, display\n", "\n", "import pandas as pd\n", - "from llama_index.core.query_engine import PandasQueryEngine\n", + "from llama_index.experimental.query_engine import PandasQueryEngine\n", "\n", "\n", "logging.basicConfig(stream=sys.stdout, level=logging.INFO)\n", diff --git a/docs/docs/examples/query_engine/pdf_tables/recursive_retriever.ipynb b/docs/docs/examples/query_engine/pdf_tables/recursive_retriever.ipynb index 05c490d2f7f5ae..828d7320470402 100644 --- a/docs/docs/examples/query_engine/pdf_tables/recursive_retriever.ipynb +++ b/docs/docs/examples/query_engine/pdf_tables/recursive_retriever.ipynb @@ -32,7 +32,8 @@ "source": [ "%pip install llama-index-embeddings-openai\n", "%pip install llama-index-readers-file pymupdf\n", - "%pip install llama-index-llms-openai" + "%pip install llama-index-llms-openai\n", + "%pip install llama-index-experimental" ] }, { @@ -46,7 +47,7 @@ "\n", "# https://en.wikipedia.org/wiki/The_World%27s_Billionaires\n", "from llama_index.core import VectorStoreIndex\n", - "from llama_index.core.query_engine import PandasQueryEngine\n", + "from llama_index.experimental.query_engine import PandasQueryEngine\n", "from llama_index.core.schema import IndexNode\n", "from llama_index.llms.openai import OpenAI\n", "\n", @@ -594,7 +595,12 @@ "\n", "We create a pandas query engine over each structured table.\n", "\n", - 
"These can be executed on their own to answer queries about each table." + "These can be executed on their own to answer queries about each table.\n", + "\n", + "**WARNING:** This tool provides the LLM access to the `eval` function.\n", + "Arbitrary code execution is possible on the machine running this tool.\n", + "While some level of filtering is done on code, this tool is not recommended \n", + "to be used in a production setting without heavy sandboxing or virtual machines." ] }, { diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index 64d3eb7e47e82e..fe14302fc8dabb 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -1345,6 +1345,7 @@ plugins: signature_crossrefs: true paths: - ../llama-index-core + - ../llama-index-experimental - ../llama-index-packs/llama-index-packs-retry-engine-weaviate - ../llama-index-packs/llama-index-packs-llava-completion - ../llama-index-packs/llama-index-packs-zephyr-query-engine diff --git a/llama-index-cli/llama_index/cli/upgrade/mappings.json b/llama-index-cli/llama_index/cli/upgrade/mappings.json index 99ea7305ae1779..04a5933cd7c448 100644 --- a/llama-index-cli/llama_index/cli/upgrade/mappings.json +++ b/llama-index-cli/llama_index/cli/upgrade/mappings.json @@ -254,7 +254,7 @@ "RetrySourceQueryEngine": "llama_index.core.query_engine", "RetryGuidelineQueryEngine": "llama_index.core.query_engine", "FLAREInstructQueryEngine": "llama_index.core.query_engine", - "PandasQueryEngine": "llama_index.core.query_engine", + "PandasQueryEngine": "llama_index.experimental.query_engine", "JSONalyzeQueryEngine": "llama_index.core.query_engine", "KnowledgeGraphQueryEngine": "llama_index.core.query_engine", "BaseQueryEngine": "llama_index.core.query_engine", diff --git a/llama-index-core/llama_index/core/command_line/mappings.json b/llama-index-core/llama_index/core/command_line/mappings.json index 99ea7305ae1779..04a5933cd7c448 100644 --- a/llama-index-core/llama_index/core/command_line/mappings.json +++ 
b/llama-index-core/llama_index/core/command_line/mappings.json @@ -254,7 +254,7 @@ "RetrySourceQueryEngine": "llama_index.core.query_engine", "RetryGuidelineQueryEngine": "llama_index.core.query_engine", "FLAREInstructQueryEngine": "llama_index.core.query_engine", - "PandasQueryEngine": "llama_index.core.query_engine", + "PandasQueryEngine": "llama_index.experimental.query_engine", "JSONalyzeQueryEngine": "llama_index.core.query_engine", "KnowledgeGraphQueryEngine": "llama_index.core.query_engine", "BaseQueryEngine": "llama_index.core.query_engine", diff --git a/llama-index-core/llama_index/core/exec_utils.py b/llama-index-core/llama_index/core/exec_utils.py index d4f032249a69ee..2abf6e3defea9f 100644 --- a/llama-index-core/llama_index/core/exec_utils.py +++ b/llama-index-core/llama_index/core/exec_utils.py @@ -86,6 +86,11 @@ def _get_restricted_globals(__globals: Union[dict, None]) -> Any: return restricted_globals +vulnerable_code_snippets = [ + "os.", +] + + class DunderVisitor(ast.NodeVisitor): def __init__(self) -> None: self.has_access_to_private_entity = False @@ -123,6 +128,11 @@ def _contains_protected_access(code: str) -> bool: dunder_visitor = DunderVisitor() dunder_visitor.visit(tree) + + for vulnerable_code_snippet in vulnerable_code_snippets: + if vulnerable_code_snippet in code: + dunder_visitor.has_access_to_disallowed_builtin = True + return ( dunder_visitor.has_access_to_private_entity or dunder_visitor.has_access_to_disallowed_builtin diff --git a/llama-index-core/llama_index/core/indices/struct_store/pandas.py b/llama-index-core/llama_index/core/indices/struct_store/pandas.py index a288a9ec1ae7c2..6bd260837c2733 100644 --- a/llama-index-core/llama_index/core/indices/struct_store/pandas.py +++ b/llama-index-core/llama_index/core/indices/struct_store/pandas.py @@ -1,83 +1,25 @@ -"""Pandas csv structured store.""" +"""Pandas csv structured store. 
-import logging -from typing import Any, Optional, Sequence +DEPRECATED: Please use :class:`PandasQueryEngine` in `llama-index-experimental` instead. +""" -import pandas as pd -from llama_index.core.base.base_query_engine import BaseQueryEngine -from llama_index.core.base.base_retriever import BaseRetriever -from llama_index.core.data_structs.table import PandasStructTable -from llama_index.core.indices.struct_store.base import BaseStructStoreIndex -from llama_index.core.llms.utils import LLMType -from llama_index.core.schema import BaseNode +from typing import Any -logger = logging.getLogger(__name__) - - -class PandasIndex(BaseStructStoreIndex[PandasStructTable]): - """Pandas Index. - - Deprecated. Please use :class:`PandasQueryEngine` instead. - - The PandasIndex is an index that stores - a Pandas dataframe under the hood. - Currently index "construction" is not supported. - - During query time, the user can either specify a raw SQL query - or a natural language query to retrieve their data. - - Args: - pandas_df (Optional[pd.DataFrame]): Pandas dataframe to use. - See :ref:`Ref-Struct-Store` for more details. - - """ - - index_struct_cls = PandasStructTable +class PandasIndex: def __init__( self, - df: pd.DataFrame, - nodes: Optional[Sequence[BaseNode]] = None, - index_struct: Optional[PandasStructTable] = None, + *args: Any, **kwargs: Any, ) -> None: - """Initialize params.""" - logger.warning( - "PandasIndex is deprecated. \ - Please directly use `PandasQueryEngine(df)` instead." + raise DeprecationWarning( + "PandasQueryEngine has been moved to `llama-index-experimental`.\n" + "`pip install llama-index-experimental`\n" + "`from llama_index.experimental.query_engine import PandasQueryEngine`\n" + "Note that the PandasQueryEngine allows for arbitrary code execution, \n" + "and should be used in a secure environment." 
) - if nodes is not None: - raise ValueError("We currently do not support indexing documents or nodes.") - self.df = df - - super().__init__( - nodes=[], - index_struct=index_struct, - **kwargs, - ) - - def as_retriever(self, **kwargs: Any) -> BaseRetriever: - raise NotImplementedError("Not supported") - - def as_query_engine( - self, llm: Optional[LLMType] = None, **kwargs: Any - ) -> BaseQueryEngine: - # NOTE: lazy import - from llama_index.core.query_engine.pandas.pandas_query_engine import ( - PandasQueryEngine, - ) - - return PandasQueryEngine.from_index(self, llm=llm, **kwargs) - - def _build_index_from_nodes(self, nodes: Sequence[BaseNode]) -> PandasStructTable: - """Build index from documents.""" - return self.index_struct_cls() - - def _insert(self, nodes: Sequence[BaseNode], **insert_kwargs: Any) -> None: - """Insert a document.""" - raise NotImplementedError("We currently do not support inserting documents.") - -# legacy +# Legacy GPTPandasIndex = PandasIndex diff --git a/llama-index-core/llama_index/core/prompts/default_prompts.py b/llama-index-core/llama_index/core/prompts/default_prompts.py index 30499708939cb6..65b38f3466c691 100644 --- a/llama-index-core/llama_index/core/prompts/default_prompts.py +++ b/llama-index-core/llama_index/core/prompts/default_prompts.py @@ -361,26 +361,6 @@ ) -############################################ -# Pandas -############################################ - -DEFAULT_PANDAS_TMPL = ( - "You are working with a pandas dataframe in Python.\n" - "The name of the dataframe is `df`.\n" - "This is the result of `print(df.head())`:\n" - "{df_str}\n\n" - "Follow these instructions:\n" - "{instruction_str}\n" - "Query: {query_str}\n\n" - "Expression:" -) - -DEFAULT_PANDAS_PROMPT = PromptTemplate( - DEFAULT_PANDAS_TMPL, prompt_type=PromptType.PANDAS -) - - ############################################ # JSON Path ############################################ diff --git 
a/llama-index-core/llama_index/core/query_engine/pandas/output_parser.py b/llama-index-core/llama_index/core/query_engine/pandas/output_parser.py index 547fa8e0b10658..cb4ffbd4f6fa37 100644 --- a/llama-index-core/llama_index/core/query_engine/pandas/output_parser.py +++ b/llama-index-core/llama_index/core/query_engine/pandas/output_parser.py @@ -1,86 +1,23 @@ -"""Pandas output parser.""" +"""Pandas output parser. -import logging -from typing import Any, Dict, Optional +DEPRECATED: This class has been moved to `llama-index-experimental`. +""" -import numpy as np -import pandas as pd -from llama_index.core.exec_utils import safe_eval, safe_exec -from llama_index.core.output_parsers.base import ChainableOutputParser -from llama_index.core.output_parsers.utils import parse_code_markdown +from typing import Any -logger = logging.getLogger(__name__) - -def default_output_processor( - output: str, df: pd.DataFrame, **output_kwargs: Any -) -> str: - """Process outputs in a default manner.""" - import ast - import sys - import traceback - - if sys.version_info < (3, 9): - logger.warning( - "Python version must be >= 3.9 in order to use " - "the default output processor, which executes " - "the Python query. Instead, we will return the " - "raw Python instructions as a string." 
- ) - return output - - local_vars = {"df": df} - global_vars = {"np": np, "pd": pd} - - output = parse_code_markdown(output, only_last=True)[0] - - # NOTE: inspired from langchain's tool - # see langchain.tools.python.tool (PythonAstREPLTool) - try: - tree = ast.parse(output) - module = ast.Module(tree.body[:-1], type_ignores=[]) - safe_exec(ast.unparse(module), {}, local_vars) # type: ignore - module_end = ast.Module(tree.body[-1:], type_ignores=[]) - module_end_str = ast.unparse(module_end) # type: ignore - if module_end_str.strip("'\"") != module_end_str: - # if there's leading/trailing quotes, then we need to eval - # string to get the actual expression - module_end_str = safe_eval(module_end_str, global_vars, local_vars) - try: - # str(pd.dataframe) will truncate output by display.max_colwidth - # set width temporarily to extract more text - if "max_colwidth" in output_kwargs: - pd.set_option("display.max_colwidth", output_kwargs["max_colwidth"]) - output_str = str(safe_eval(module_end_str, global_vars, local_vars)) - pd.reset_option("display.max_colwidth") - return output_str - - except Exception: - raise - except Exception as e: - err_string = ( - "There was an error running the output as Python code. " - f"Error message: {e}" - ) - traceback.print_exc() - return err_string - - -class PandasInstructionParser(ChainableOutputParser): +class PandasInstructionParser: """Pandas instruction parser. - This 'output parser' takes in pandas instructions (in Python code) and - executes them to return an output. + DEPRECATED: This class has been moved to `llama-index-experimental`. 
""" - def __init__( - self, df: pd.DataFrame, output_kwargs: Optional[Dict[str, Any]] = None - ) -> None: - """Initialize params.""" - self.df = df - self.output_kwargs = output_kwargs or {} - - def parse(self, output: str) -> Any: - """Parse, validate, and correct errors programmatically.""" - return default_output_processor(output, self.df, **self.output_kwargs) + def __init__(self, *args: Any, **kwargs: Any) -> None: + raise DeprecationWarning( + "PandasInstructionParser has been moved to `llama-index-experimental`.\n" + "`pip install llama-index-experimental`\n" + "`from llama_index.experimental.query_engine.pandas import PandasInstructionParser`\n" + "Note that the PandasInstructionParser allows for arbitrary code execution, \n" + "and should be used in a secure environment." + ) diff --git a/llama-index-core/llama_index/core/query_engine/pandas/pandas_query_engine.py b/llama-index-core/llama_index/core/query_engine/pandas/pandas_query_engine.py index d30c014b911bce..3503fb27669e9c 100644 --- a/llama-index-core/llama_index/core/query_engine/pandas/pandas_query_engine.py +++ b/llama-index-core/llama_index/core/query_engine/pandas/pandas_query_engine.py @@ -1,191 +1,31 @@ """Default query for PandasIndex. -WARNING: This tool provides the Agent access to the `eval` function. +WARNING: This tool provides the LLM access to the `eval` function. Arbitrary code execution is possible on the machine running this tool. This tool is not recommended to be used in a production setting, and would -require heavy sandboxing or virtual machines +require heavy sandboxing or virtual machines. 
-""" - -import logging -from typing import Any, Dict, Optional - -import pandas as pd -from llama_index.core.base.base_query_engine import BaseQueryEngine -from llama_index.core.base.response.schema import Response -from llama_index.core.indices.struct_store.pandas import PandasIndex -from llama_index.core.llms.llm import LLM -from llama_index.core.prompts import BasePromptTemplate, PromptTemplate -from llama_index.core.prompts.default_prompts import DEFAULT_PANDAS_PROMPT -from llama_index.core.prompts.mixin import PromptDictType, PromptMixinType -from llama_index.core.query_engine.pandas.output_parser import ( - PandasInstructionParser, -) -from llama_index.core.schema import QueryBundle -from llama_index.core.service_context import ServiceContext -from llama_index.core.settings import ( - Settings, - callback_manager_from_settings_or_context, - llm_from_settings_or_context, -) -from llama_index.core.utils import print_text - -logger = logging.getLogger(__name__) - - -DEFAULT_INSTRUCTION_STR = ( - "1. Convert the query to executable Python code using Pandas.\n" - "2. The final line of code should be a Python expression that can be called with the `eval()` function.\n" - "3. The code should represent a solution to the query.\n" - "4. PRINT ONLY THE EXPRESSION.\n" - "5. Do not quote the expression.\n" -) +DEPRECATED: Use `PandasQueryEngine` from `llama-index-experimental` instead. +""" -# **NOTE**: newer version of sql query engine -DEFAULT_RESPONSE_SYNTHESIS_PROMPT_TMPL = ( - "Given an input question, synthesize a response from the query results.\n" - "Query: {query_str}\n\n" - "Pandas Instructions (optional):\n{pandas_instructions}\n\n" - "Pandas Output: {pandas_output}\n\n" - "Response: " -) -DEFAULT_RESPONSE_SYNTHESIS_PROMPT = PromptTemplate( - DEFAULT_RESPONSE_SYNTHESIS_PROMPT_TMPL, -) +from typing import Any -class PandasQueryEngine(BaseQueryEngine): +class PandasQueryEngine: """Pandas query engine. - Convert natural language to Pandas python code. 
- - WARNING: This tool provides the Agent access to the `eval` function. - Arbitrary code execution is possible on the machine running this tool. - This tool is not recommended to be used in a production setting, and would - require heavy sandboxing or virtual machines - - - Args: - df (pd.DataFrame): Pandas dataframe to use. - instruction_str (Optional[str]): Instruction string to use. - output_processor (Optional[Callable[[str], str]]): Output processor. - A callable that takes in the output string, pandas DataFrame, - and any output kwargs and returns a string. - eg.kwargs["max_colwidth"] = [int] is used to set the length of text - that each column can display during str(df). Set it to a higher number - if there is possibly long text in the dataframe. - pandas_prompt (Optional[BasePromptTemplate]): Pandas prompt to use. - head (int): Number of rows to show in the table context. - llm (Optional[LLM]): Language model to use. - + DEPRECATED: Use `PandasQueryEngine` from `llama-index-experimental` instead. 
""" - def __init__( - self, - df: pd.DataFrame, - instruction_str: Optional[str] = None, - instruction_parser: Optional[PandasInstructionParser] = None, - pandas_prompt: Optional[BasePromptTemplate] = None, - output_kwargs: Optional[dict] = None, - head: int = 5, - verbose: bool = False, - service_context: Optional[ServiceContext] = None, - llm: Optional[LLM] = None, - synthesize_response: bool = False, - response_synthesis_prompt: Optional[BasePromptTemplate] = None, - **kwargs: Any, - ) -> None: - """Initialize params.""" - self._df = df - - self._head = head - self._pandas_prompt = pandas_prompt or DEFAULT_PANDAS_PROMPT - self._instruction_str = instruction_str or DEFAULT_INSTRUCTION_STR - self._instruction_parser = instruction_parser or PandasInstructionParser( - df, output_kwargs or {} + def __init__(self, *args: Any, **kwargs: Any) -> None: + raise DeprecationWarning( + "PandasQueryEngine has been moved to `llama-index-experimental`.\n" + "`pip install llama-index-experimental`\n" + "`from llama_index.experimental.query_engine import PandasQueryEngine`\n" + "Note that the PandasQueryEngine allows for arbitrary code execution, \n" + "and should be used in a secure environment." 
) - self._verbose = verbose - - self._llm = llm or llm_from_settings_or_context(Settings, service_context) - self._synthesize_response = synthesize_response - self._response_synthesis_prompt = ( - response_synthesis_prompt or DEFAULT_RESPONSE_SYNTHESIS_PROMPT - ) - - super().__init__( - callback_manager=callback_manager_from_settings_or_context( - Settings, service_context - ) - ) - - def _get_prompt_modules(self) -> PromptMixinType: - """Get prompt sub-modules.""" - return {} - - def _get_prompts(self) -> Dict[str, Any]: - """Get prompts.""" - return { - "pandas_prompt": self._pandas_prompt, - "response_synthesis_prompt": self._response_synthesis_prompt, - } - - def _update_prompts(self, prompts: PromptDictType) -> None: - """Update prompts.""" - if "pandas_prompt" in prompts: - self._pandas_prompt = prompts["pandas_prompt"] - if "response_synthesis_prompt" in prompts: - self._response_synthesis_prompt = prompts["response_synthesis_prompt"] - - @classmethod - def from_index(cls, index: PandasIndex, **kwargs: Any) -> "PandasQueryEngine": - logger.warning( - "PandasIndex is deprecated. " - "Directly construct PandasQueryEngine with df instead." 
- ) - return cls(df=index.df, service_context=index.service_context, **kwargs) - - def _get_table_context(self) -> str: - """Get table context.""" - return str(self._df.head(self._head)) - - def _query(self, query_bundle: QueryBundle) -> Response: - """Answer a query.""" - context = self._get_table_context() - - pandas_response_str = self._llm.predict( - self._pandas_prompt, - df_str=context, - query_str=query_bundle.query_str, - instruction_str=self._instruction_str, - ) - - if self._verbose: - print_text(f"> Pandas Instructions:\n" f"```\n{pandas_response_str}\n```\n") - pandas_output = self._instruction_parser.parse(pandas_response_str) - if self._verbose: - print_text(f"> Pandas Output: {pandas_output}\n") - - response_metadata = { - "pandas_instruction_str": pandas_response_str, - "raw_pandas_output": pandas_output, - } - if self._synthesize_response: - response_str = str( - self._llm.predict( - self._response_synthesis_prompt, - query_str=query_bundle.query_str, - pandas_instructions=pandas_response_str, - pandas_output=pandas_output, - ) - ) - else: - response_str = str(pandas_output) - - return Response(response=response_str, metadata=response_metadata) - - async def _aquery(self, query_bundle: QueryBundle) -> Response: - return self._query(query_bundle) # legacy diff --git a/llama-index-experimental/llama_index/experimental/BUILD b/llama-index-experimental/llama_index/experimental/BUILD new file mode 100644 index 00000000000000..db46e8d6c978c6 --- /dev/null +++ b/llama-index-experimental/llama_index/experimental/BUILD @@ -0,0 +1 @@ +python_sources() diff --git a/llama-index-experimental/llama_index/experimental/__init__.py b/llama-index-experimental/llama_index/experimental/__init__.py index e69de29bb2d1d6..4186b4873b35de 100644 --- a/llama-index-experimental/llama_index/experimental/__init__.py +++ b/llama-index-experimental/llama_index/experimental/__init__.py @@ -0,0 +1,6 @@ +from llama_index.experimental.query_engine.pandas.pandas_query_engine import ( 
+ PandasQueryEngine, +) +from llama_index.experimental.param_tuner.base import ParamTuner + +__all__ = ["PandasQueryEngine", "ParamTuner"] diff --git a/llama-index-experimental/llama_index/experimental/exec_utils.py b/llama-index-experimental/llama_index/experimental/exec_utils.py new file mode 100644 index 00000000000000..a9a3224f638d50 --- /dev/null +++ b/llama-index-experimental/llama_index/experimental/exec_utils.py @@ -0,0 +1,171 @@ +import ast +import copy +from types import CodeType, ModuleType +from typing import Any, Dict, Mapping, Sequence, Union + +ALLOWED_IMPORTS = { + "math", + "time", + "datetime", + "pandas", + "scipy", + "numpy", + "matplotlib", + "plotly", + "seaborn", +} + + +def _restricted_import( + name: str, + globals: Union[Mapping[str, object], None] = None, + locals: Union[Mapping[str, object], None] = None, + fromlist: Sequence[str] = (), + level: int = 0, +) -> ModuleType: + if name in ALLOWED_IMPORTS: + return __import__(name, globals, locals, fromlist, level) + raise ImportError(f"Import of module '{name}' is not allowed") + + +ALLOWED_BUILTINS = { + "abs": abs, + "all": all, + "any": any, + "ascii": ascii, + "bin": bin, + "bool": bool, + "bytearray": bytearray, + "bytes": bytes, + "chr": chr, + "complex": complex, + "divmod": divmod, + "enumerate": enumerate, + "filter": filter, + "float": float, + "format": format, + "frozenset": frozenset, + "hash": hash, + "hex": hex, + "int": int, + "isinstance": isinstance, + "issubclass": issubclass, + "len": len, + "list": list, + "map": map, + "max": max, + "min": min, + "oct": oct, + "ord": ord, + "pow": pow, + "print": print, + "range": range, + "repr": repr, + "reversed": reversed, + "round": round, + "set": set, + "slice": slice, + "sorted": sorted, + "str": str, + "sum": sum, + "tuple": tuple, + "type": type, + "zip": zip, + # Constants + "True": True, + "False": False, + "None": None, + "__import__": _restricted_import, +} + + +def _get_restricted_globals(__globals: Union[dict, None]) -> 
Any: + restricted_globals = copy.deepcopy(ALLOWED_BUILTINS) + if __globals: + restricted_globals.update(__globals) + return restricted_globals + + +class DunderVisitor(ast.NodeVisitor): + def __init__(self) -> None: + self.has_access_to_private_entity = False + self.has_access_to_disallowed_builtin = False + + builtins = globals()["__builtins__"].keys() + self._builtins = builtins + + def visit_Name(self, node: ast.Name) -> None: + if node.id.startswith("_"): + self.has_access_to_private_entity = True + if node.id not in ALLOWED_BUILTINS and node.id in self._builtins: + self.has_access_to_disallowed_builtin = True + self.generic_visit(node) + + def visit_Attribute(self, node: ast.Attribute) -> None: + if node.attr.startswith("_"): + self.has_access_to_private_entity = True + if node.attr not in ALLOWED_BUILTINS and node.attr in self._builtins: + self.has_access_to_disallowed_builtin = True + self.generic_visit(node) + + +def _contains_protected_access(code: str) -> bool: + # do not allow imports + imports_modules = False + tree = ast.parse(code) + for node in ast.iter_child_nodes(tree): + if isinstance(node, ast.Import): + imports_modules = True + elif isinstance(node, ast.ImportFrom): + imports_modules = True + else: + continue + + dunder_visitor = DunderVisitor() + dunder_visitor.visit(tree) + + return ( + dunder_visitor.has_access_to_private_entity + or dunder_visitor.has_access_to_disallowed_builtin + or imports_modules + ) + + +def _verify_source_safety(__source: Union[str, bytes, CodeType]) -> None: + """ + Verify that the source is safe to execute. For now, this means that it + does not contain any references to private or dunder methods. 
+ """ + if isinstance(__source, CodeType): + raise RuntimeError("Direct execution of CodeType is forbidden!") + if isinstance(__source, bytes): + __source = __source.decode() + if _contains_protected_access(__source): + raise RuntimeError( + "Execution of code containing references to private or dunder methods, " + "disallowed builtins, or any imports, is forbidden!" + ) + + +def safe_eval( + __source: Union[str, bytes, CodeType], + __globals: Union[Dict[str, Any], None] = None, + __locals: Union[Mapping[str, object], None] = None, +) -> Any: + """ + eval within safe global context. + """ + _verify_source_safety(__source) + return eval(__source, _get_restricted_globals(__globals), __locals) + + +def safe_exec( + __source: Union[str, bytes, CodeType], + __globals: Union[Dict[str, Any], None] = None, + __locals: Union[Mapping[str, object], None] = None, +) -> None: + """ + exec within safe global context. + """ + _verify_source_safety(__source) + return exec(__source, _get_restricted_globals(__globals), __locals) diff --git a/llama-index-experimental/llama_index/experimental/query_engine/BUILD b/llama-index-experimental/llama_index/experimental/query_engine/BUILD new file mode 100644 index 00000000000000..db46e8d6c978c6 --- /dev/null +++ b/llama-index-experimental/llama_index/experimental/query_engine/BUILD @@ -0,0 +1 @@ +python_sources() diff --git a/llama-index-experimental/llama_index/experimental/query_engine/__init__.py b/llama-index-experimental/llama_index/experimental/query_engine/__init__.py new file mode 100644 index 00000000000000..54a9fbbf6e378a --- /dev/null +++ b/llama-index-experimental/llama_index/experimental/query_engine/__init__.py @@ -0,0 +1,8 @@ +from llama_index.experimental.query_engine.pandas.pandas_query_engine import ( + PandasQueryEngine, +) +from llama_index.experimental.query_engine.pandas.output_parser import ( + PandasInstructionParser, +) + +__all__ = ["PandasQueryEngine", "PandasInstructionParser"] diff --git
logger = logging.getLogger(__name__)


def default_output_processor(
    output: str, df: pd.DataFrame, **output_kwargs: Any
) -> str:
    """Execute LLM-generated pandas code against ``df`` and return its result.

    WARNING: the code is produced by an LLM and is executed on this machine
    (through the project's ``safe_exec`` / ``safe_eval`` helpers). Treat the
    input as untrusted; do not use this outside a sandbox.

    The code is split into "all statements but the last" (executed) and the
    final statement (evaluated as an expression); the string form of that
    final value is returned.

    Args:
        output: Raw LLM output containing the Python/pandas code.
        df: DataFrame exposed to the executed code under the name ``df``.
        **output_kwargs: Optional settings. Only ``max_colwidth`` is read: when
            present it temporarily overrides pandas' ``display.max_colwidth``
            while the result is rendered, so long cell text is not truncated.

    Returns:
        ``str()`` of the evaluated final expression. On Python < 3.9 the
        ``output`` argument is returned unchanged without being executed. If
        parsing or execution fails, an error-message string is returned
        instead of raising.
    """
    import ast
    import sys
    import traceback

    # ast.unparse only exists on Python >= 3.9.
    if sys.version_info < (3, 9):
        logger.warning(
            "Python version must be >= 3.9 in order to use "
            "the default output processor, which executes "
            "the Python query. Instead, we will return the "
            "raw Python instructions as a string."
        )
        return output

    local_vars = {"df": df}
    global_vars = {"np": np, "pd": pd}

    # Keep only the code that parse_code_markdown extracts from the LLM text.
    output = parse_code_markdown(output, only_last=True)[0]

    # NOTE: inspired from langchain's tool
    # see langchain.tools.python.tool (PythonAstREPLTool)
    try:
        tree = ast.parse(output)

        # Run every statement except the last. They receive the same globals
        # as the final expression so that setup lines can use `np` / `pd` too
        # (previously they ran with empty globals and raised NameError).
        module = ast.Module(tree.body[:-1], type_ignores=[])
        safe_exec(ast.unparse(module), global_vars, local_vars)  # type: ignore

        module_end = ast.Module(tree.body[-1:], type_ignores=[])
        module_end_str = ast.unparse(module_end)  # type: ignore
        if module_end_str.strip("'\"") != module_end_str:
            # if there's leading/trailing quotes, then we need to eval
            # string to get the actual expression
            module_end_str = safe_eval(module_end_str, global_vars, local_vars)

        if "max_colwidth" in output_kwargs:
            # str(pd.DataFrame) truncates cells at display.max_colwidth.
            # option_context restores the caller's previous value on exit,
            # including when evaluation raises, so the global pandas option
            # never leaks and a user-configured value is never clobbered.
            with pd.option_context(
                "display.max_colwidth", output_kwargs["max_colwidth"]
            ):
                return str(safe_eval(module_end_str, global_vars, local_vars))

        return str(safe_eval(module_end_str, global_vars, local_vars))
    except Exception as e:
        # Deliberate best-effort boundary: report the failure as text so the
        # query engine can surface it instead of crashing.
        err_string = (
            "There was an error running the output as Python code. "
            f"Error message: {e}"
        )
        traceback.print_exc()
        return err_string


class PandasInstructionParser(ChainableOutputParser):
    """Pandas instruction parser.

    This 'output parser' takes in pandas instructions (in Python code) and
    executes them to return an output.

    WARNING: parsing executes LLM-generated code; see
    ``default_output_processor``.

    Args:
        df: DataFrame the instructions are executed against.
        output_kwargs: Optional keyword arguments forwarded to
            ``default_output_processor`` on every ``parse`` call
            (e.g. ``{"max_colwidth": 90}``). Defaults to an empty dict.
    """

    def __init__(
        self, df: pd.DataFrame, output_kwargs: Optional[Dict[str, Any]] = None
    ) -> None:
        """Store the dataframe and the output-processing options."""
        self.df = df
        self.output_kwargs = output_kwargs or {}

    def parse(self, output: str) -> Any:
        """Execute the pandas instructions in ``output`` and return the result.

        Delegates to ``default_output_processor`` with the stored dataframe
        and output kwargs.
        """
        return default_output_processor(output, self.df, **self.output_kwargs)
logger = logging.getLogger(__name__)


DEFAULT_INSTRUCTION_STR = (
    "1. Convert the query to executable Python code using Pandas.\n"
    "2. The final line of code should be a Python expression that can be called with the `eval()` function.\n"
    "3. The code should represent a solution to the query.\n"
    "4. PRINT ONLY THE EXPRESSION.\n"
    "5. Do not quote the expression.\n"
)


# **NOTE**: newer version of sql query engine
DEFAULT_RESPONSE_SYNTHESIS_PROMPT_TMPL = (
    "Given an input question, synthesize a response from the query results.\n"
    "Query: {query_str}\n\n"
    "Pandas Instructions (optional):\n{pandas_instructions}\n\n"
    "Pandas Output: {pandas_output}\n\n"
    "Response: "
)
DEFAULT_RESPONSE_SYNTHESIS_PROMPT = PromptTemplate(
    DEFAULT_RESPONSE_SYNTHESIS_PROMPT_TMPL,
)


class PandasQueryEngine(BaseQueryEngine):
    """Pandas query engine.

    Convert natural language to Pandas python code.

    WARNING: This tool provides the Agent access to the `eval` function.
    Arbitrary code execution is possible on the machine running this tool.
    This tool is not recommended to be used in a production setting, and would
    require heavy sandboxing or virtual machines.


    Args:
        df (pd.DataFrame): Pandas dataframe to use.
        instruction_str (Optional[str]): Instruction string to use.
            Defaults to ``DEFAULT_INSTRUCTION_STR``.
        instruction_parser (Optional[PandasInstructionParser]): Parser that
            executes the LLM-generated code. Defaults to
            ``PandasInstructionParser(df, output_kwargs)``.
        pandas_prompt (Optional[BasePromptTemplate]): Pandas prompt to use.
            Defaults to ``DEFAULT_PANDAS_PROMPT``.
        output_kwargs (Optional[dict]): Keyword arguments handed to the
            default instruction parser; only used when ``instruction_parser``
            is not supplied. e.g. ``output_kwargs["max_colwidth"] = [int]`` is
            used to set the length of text that each column can display
            during str(df). Set it to a higher number if there is possibly
            long text in the dataframe.
        head (int): Number of rows to show in the table context.
        verbose (bool): If True, print the generated instructions and their
            output.
        service_context (Optional[ServiceContext]): Used to resolve the LLM
            (when ``llm`` is not given) and the callback manager.
        llm (Optional[LLM]): Language model to use.
        synthesize_response (bool): If True, make a second LLM call that
            turns the pandas output into a natural-language response;
            otherwise the response is ``str()`` of the pandas output.
        response_synthesis_prompt (Optional[BasePromptTemplate]): Prompt for
            the synthesis call. Defaults to
            ``DEFAULT_RESPONSE_SYNTHESIS_PROMPT``.
        **kwargs: Accepted for call-site compatibility; not used.

    Examples:
        `pip install llama-index-experimental`

        ```python
        import pandas as pd
        from llama_index.experimental.query_engine.pandas import PandasQueryEngine

        df = pd.DataFrame(
            {
                "city": ["Toronto", "Tokyo", "Berlin"],
                "population": [2930000, 13960000, 3645000]
            }
        )

        query_engine = PandasQueryEngine(df=df, verbose=True)

        response = query_engine.query("What is the population of Tokyo?")
        ```

    """

    def __init__(
        self,
        df: pd.DataFrame,
        instruction_str: Optional[str] = None,
        instruction_parser: Optional[PandasInstructionParser] = None,
        pandas_prompt: Optional[BasePromptTemplate] = None,
        output_kwargs: Optional[dict] = None,
        head: int = 5,
        verbose: bool = False,
        service_context: Optional[ServiceContext] = None,
        llm: Optional[LLM] = None,
        synthesize_response: bool = False,
        response_synthesis_prompt: Optional[BasePromptTemplate] = None,
        **kwargs: Any,
    ) -> None:
        """Initialize params."""
        self._df = df

        self._head = head
        self._pandas_prompt = pandas_prompt or DEFAULT_PANDAS_PROMPT
        self._instruction_str = instruction_str or DEFAULT_INSTRUCTION_STR
        self._instruction_parser = instruction_parser or PandasInstructionParser(
            df, output_kwargs or {}
        )
        self._verbose = verbose

        self._llm = llm or llm_from_settings_or_context(Settings, service_context)
        self._synthesize_response = synthesize_response
        self._response_synthesis_prompt = (
            response_synthesis_prompt or DEFAULT_RESPONSE_SYNTHESIS_PROMPT
        )

        super().__init__(
            callback_manager=callback_manager_from_settings_or_context(
                Settings, service_context
            )
        )

    def _get_prompt_modules(self) -> PromptMixinType:
        """Get prompt sub-modules."""
        return {}

    def _get_prompts(self) -> Dict[str, Any]:
        """Get prompts."""
        return {
            "pandas_prompt": self._pandas_prompt,
            "response_synthesis_prompt": self._response_synthesis_prompt,
        }

    def _update_prompts(self, prompts: PromptDictType) -> None:
        """Update prompts."""
        if "pandas_prompt" in prompts:
            self._pandas_prompt = prompts["pandas_prompt"]
        if "response_synthesis_prompt" in prompts:
            self._response_synthesis_prompt = prompts["response_synthesis_prompt"]

    @classmethod
    def from_index(cls, index: PandasIndex, **kwargs: Any) -> "PandasQueryEngine":
        """Build an engine from a (deprecated) PandasIndex.

        Logs a deprecation warning and forwards the index's dataframe and
        service context, plus any extra keyword arguments, to the constructor.
        """
        logger.warning(
            "PandasIndex is deprecated. "
            "Directly construct PandasQueryEngine with df instead."
        )
        return cls(df=index.df, service_context=index.service_context, **kwargs)

    def _get_table_context(self) -> str:
        """Get table context: the string form of the first ``head`` rows."""
        return str(self._df.head(self._head))

    def _run_pandas_instructions(self, pandas_response_str: str) -> Any:
        """Execute the generated instructions, echoing them when verbose."""
        if self._verbose:
            print_text(f"> Pandas Instructions:\n" f"```\n{pandas_response_str}\n```\n")
        pandas_output = self._instruction_parser.parse(pandas_response_str)
        if self._verbose:
            print_text(f"> Pandas Output: {pandas_output}\n")
        return pandas_output

    def _query(self, query_bundle: QueryBundle) -> Response:
        """Answer a query.

        Asks the LLM for pandas code, executes it through the instruction
        parser, and optionally asks the LLM to phrase the result. The response
        metadata carries the generated code (``pandas_instruction_str``) and
        the raw execution result (``raw_pandas_output``).
        """
        context = self._get_table_context()

        pandas_response_str = self._llm.predict(
            self._pandas_prompt,
            df_str=context,
            query_str=query_bundle.query_str,
            instruction_str=self._instruction_str,
        )
        pandas_output = self._run_pandas_instructions(pandas_response_str)

        response_metadata = {
            "pandas_instruction_str": pandas_response_str,
            "raw_pandas_output": pandas_output,
        }
        if self._synthesize_response:
            response_str = str(
                self._llm.predict(
                    self._response_synthesis_prompt,
                    query_str=query_bundle.query_str,
                    pandas_instructions=pandas_response_str,
                    pandas_output=pandas_output,
                )
            )
        else:
            response_str = str(pandas_output)

        return Response(response=response_str, metadata=response_metadata)

    async def _aquery(self, query_bundle: QueryBundle) -> Response:
        """Answer a query asynchronously.

        Same steps and same response shape as ``_query``, but the LLM calls
        are awaited via ``apredict`` so they do not block the event loop.
        Executing the generated pandas code itself is still synchronous.
        """
        context = self._get_table_context()

        pandas_response_str = await self._llm.apredict(
            self._pandas_prompt,
            df_str=context,
            query_str=query_bundle.query_str,
            instruction_str=self._instruction_str,
        )
        pandas_output = self._run_pandas_instructions(pandas_response_str)

        response_metadata = {
            "pandas_instruction_str": pandas_response_str,
            "raw_pandas_output": pandas_output,
        }
        if self._synthesize_response:
            response_str = str(
                await self._llm.apredict(
                    self._response_synthesis_prompt,
                    query_str=query_bundle.query_str,
                    pandas_instructions=pandas_response_str,
                    pandas_output=pandas_output,
                )
            )
        else:
            response_str = str(pandas_output)

        return Response(response=response_str, metadata=response_metadata)


# legacy
NLPandasQueryEngine = PandasQueryEngine
GPTNLPandasQueryEngine = PandasQueryEngine
############################################
# Pandas
############################################

# Template asking the LLM for a pandas expression over a dataframe named `df`.
# Placeholders: {df_str} (table preview), {instruction_str} (rules for the
# generated code) and {query_str} (the user's question). Empty entries below
# produce the blank lines that separate the sections.
DEFAULT_PANDAS_TMPL = "\n".join(
    (
        "You are working with a pandas dataframe in Python.",
        "The name of the dataframe is `df`.",
        "This is the result of `print(df.head())`:",
        "{df_str}",
        "",
        "Follow these instructions:",
        "{instruction_str}",
        "Query: {query_str}",
        "",
        "Expression:",
    )
)

# Prompt object wrapping the template above, tagged with the PANDAS prompt type.
DEFAULT_PANDAS_PROMPT = PromptTemplate(
    DEFAULT_PANDAS_TMPL,
    prompt_type=PromptType.PANDAS,
)
b/llama-index-experimental/tests/test_pandas.py @@ -9,18 +9,27 @@ import pytest from llama_index.core.base.response.schema import Response from llama_index.core.indices.query.schema import QueryBundle -from llama_index.core.indices.service_context import ServiceContext -from llama_index.core.prompts.default_prompts import DEFAULT_PANDAS_PROMPT -from llama_index.core.query_engine.pandas.output_parser import ( +from llama_index.core.llms.mock import MockLLM +from llama_index.experimental.query_engine.pandas.prompts import DEFAULT_PANDAS_PROMPT +from llama_index.experimental.query_engine.pandas.output_parser import ( PandasInstructionParser, ) -from llama_index.core.query_engine.pandas.pandas_query_engine import ( +from llama_index.experimental.query_engine.pandas.pandas_query_engine import ( PandasQueryEngine, ) -def test_pandas_query_engine(mock_service_context: ServiceContext) -> None: +def _mock_predict(*args: Any, **kwargs: Any) -> str: + """Mock predict.""" + query_str = kwargs["query_str"] + return f'df["{query_str}"]' + + +def test_pandas_query_engine(monkeypatch: pytest.MonkeyPatch) -> None: """Test pandas query engine.""" + monkeypatch.setattr(MockLLM, "predict", _mock_predict) + llm = MockLLM() + # Test on some sample data df = pd.DataFrame( { @@ -38,9 +47,7 @@ def test_pandas_query_engine(mock_service_context: ServiceContext) -> None: } ) # the mock prompt just takes the all items in the given column - query_engine = PandasQueryEngine( - df, service_context=mock_service_context, verbose=True - ) + query_engine = PandasQueryEngine(df, llm=llm, verbose=True) response = query_engine.query(QueryBundle("population")) import sys @@ -53,7 +60,7 @@ def test_pandas_query_engine(mock_service_context: ServiceContext) -> None: query_engine = PandasQueryEngine( df, - service_context=mock_service_context, + llm=llm, verbose=True, output_kwargs={"max_colwidth": 90}, )