Skip to content

Commit

Permalink
Ray based document parsing of more file types (jupyterlab#94)
Browse files Browse the repository at this point in the history
* Ray based document parsing of more file types.

* Renaming to learn/ask to make the commands more human-centered.

* Improvements to the learn/ask commands.

* fix typo

Co-authored-by: Jason Weill <[email protected]>

* improve grammar

Co-authored-by: Jason Weill <[email protected]>

* improve wording

Co-authored-by: Jason Weill <[email protected]>

* Adding new extensions and excludes.

* Update langchain to version 0.0.144.

---------

Co-authored-by: david qiu <[email protected]>
Co-authored-by: Jason Weill <[email protected]>
  • Loading branch information
3 people authored and Marchlak committed Oct 28, 2024
1 parent 6915d1d commit 2f1735c
Show file tree
Hide file tree
Showing 13 changed files with 322 additions and 173 deletions.
2 changes: 1 addition & 1 deletion packages/jupyter-ai-magics/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ dependencies = [
"ipython",
"pydantic",
"importlib_metadata~=5.2.0",
"langchain~=0.0.115"
"langchain~=0.0.144"
]

[project.optional-dependencies]
Expand Down
Original file line number Diff line number Diff line change
@@ -1,25 +1,27 @@
from langchain import OpenAI
import argparse

import ray
import time
from uuid import uuid4
from ray.util.queue import Queue

from langchain import OpenAI
from langchain.chains import ConversationalRetrievalChain
from jupyter_ai.models import AgentChatMessage, HumanChatMessage

from jupyter_ai.models import HumanChatMessage
from jupyter_ai.actors.base import ACTOR_TYPE, BaseActor, Logger


@ray.remote
class FileSystemActor(BaseActor):
"""Processes messages prefixed with /fs. This actor will
class AskActor(BaseActor):
"""Processes messages prefixed with /ask. This actor will
send the message as input to a RetrieverQA chain, that
follows the Retrieval-Augmented Generation (RAG) technique to
query the documents from the index, and sends this context
to the LLM to generate the final reply.
"""

def __init__(self, reply_queue: Queue, log: Logger):
super().__init__(log=log, reply_queue=reply_queue)
index_actor = ray.get_actor(ACTOR_TYPE.INDEX.value)
super().__init__(reply_queue=reply_queue, log=log)
index_actor = ray.get_actor(ACTOR_TYPE.LEARN.value)
handle = index_actor.get_index.remote()
vectorstore = ray.get(handle)
if not vectorstore:
Expand All @@ -31,22 +33,26 @@ def __init__(self, reply_queue: Queue, log: Logger):
vectorstore.as_retriever()
)

def process_message(self, message: HumanChatMessage):
query = message.body.split(' ', 1)[-1]
self.parser.prog = '/ask'
self.parser.add_argument('query', nargs=argparse.REMAINDER)


def _process_message(self, message: HumanChatMessage):
args = self.parse_args(message)
if args is None:
return
query = ' '.join(args.query)
if not query:
self.reply(f"{self.parser.format_usage()}", message)
return

index_actor = ray.get_actor(ACTOR_TYPE.INDEX.value)
index_actor = ray.get_actor(ACTOR_TYPE.LEARN.value)
handle = index_actor.get_index.remote()
vectorstore = ray.get(handle)
# Have to reference the latest index
self.chat_provider.retriever = vectorstore.as_retriever()

result = self.chat_provider({"question": query, "chat_history": self.chat_history})
reply = result['answer']
self.chat_history.append((query, reply))
agent_message = AgentChatMessage(
id=uuid4().hex,
time=time.time(),
body=reply,
reply_to=message.id
)
self.reply_queue.put(agent_message)
response = result['answer']
self.chat_history.append((query, response))
self.reply(response, message)
48 changes: 41 additions & 7 deletions packages/jupyter-ai/jupyter_ai/actors/base.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,27 @@
import argparse
from enum import Enum
from uuid import uuid4
import time
import logging
from typing import Union
from jupyter_ai.models import HumanChatMessage
import traceback

from ray.util.queue import Queue

from jupyter_ai.models import HumanChatMessage, AgentChatMessage


Logger = Union[logging.Logger, logging.LoggerAdapter]

class ACTOR_TYPE(str, Enum):
DEFAULT = "default"
FILESYSTEM = "filesystem"
INDEX = 'index'
ASK = "ask"
LEARN = 'learn'
MEMORY = 'memory'

COMMANDS = {
'/fs': ACTOR_TYPE.FILESYSTEM,
'/filesystem': ACTOR_TYPE.FILESYSTEM,
'/index': ACTOR_TYPE.INDEX
'/ask': ACTOR_TYPE.ASK,
'/learn': ACTOR_TYPE.LEARN
}

class BaseActor():
Expand All @@ -29,7 +34,36 @@ def __init__(
):
self.log = log
self.reply_queue = reply_queue
self.parser = argparse.ArgumentParser()

def process_message(self, message: HumanChatMessage):
    """Processes the message passed by the `Router`.

    Delegates the actual work to `_process_message`, which subclasses
    implement. Any exception raised there is caught here and reported
    back to the user through `reply`, together with the formatted
    traceback, instead of propagating to the caller.

    Args:
        message: The human chat message to handle.
    """
    try:
        self._process_message(message)
    except Exception:
        formatted_e = traceback.format_exc()
        # This handler is shared by every actor, so the wording must not
        # assume a particular command (e.g. indexing a path).
        response = f"Sorry, something went wrong and I wasn't able to process your message.\n\n```\n{formatted_e}\n```"
        self.reply(response, message)

def _process_message(self, message: HumanChatMessage):
"""Processes the message passed by the `Router`"""
raise NotImplementedError("Should be implemented by subclasses.")

def reply(self, response, message: HumanChatMessage):
    """Put an agent reply to `message` on the reply queue.

    Wraps `response` in an `AgentChatMessage` with a fresh hex UUID as
    its id, the current time as its timestamp, and `reply_to` set to
    the id of the message being answered.

    Args:
        response: Body of the reply.
        message: The human message this reply answers.
    """
    agent_message = AgentChatMessage(
        id=uuid4().hex,
        time=time.time(),
        body=response,
        reply_to=message.id,
    )
    self.reply_queue.put(agent_message)

def parse_args(self, message):
    """Parse the arguments that follow the command in `message.body`.

    The first whitespace-separated token (the command itself, such as
    `/learn`) is dropped and the remaining tokens are handed to
    `self.parser`.

    Args:
        message: Chat message whose `body` holds the command line.

    Returns:
        The parsed namespace, or `None` when parsing failed. On failure
        the parser's usage string is sent back through `reply`.
    """
    # split() with no separator treats any run of whitespace (repeated
    # spaces, tabs, newlines) as one delimiter, so no empty tokens are
    # produced and a newline right after the command does not swallow
    # the first argument.
    tokens = message.body.split()
    try:
        return self.parser.parse_args(tokens[1:])
    except (argparse.ArgumentError, SystemExit):
        # argparse signals invalid input by exiting; turn that into a
        # usage reply instead of terminating the actor.
        self.reply(self.parser.format_usage(), message)
        return None
29 changes: 10 additions & 19 deletions packages/jupyter-ai/jupyter_ai/actors/default.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,25 @@
import time
from uuid import uuid4
from jupyter_ai.actors.base import BaseActor, Logger, ACTOR_TYPE
from jupyter_ai.actors.memory import RemoteMemory
from jupyter_ai.models import AgentChatMessage, HumanChatMessage
from jupyter_ai_magics.providers import ChatOpenAINewProvider
from langchain import ConversationChain
import ray
from ray.util.queue import Queue
from langchain.memory import ConversationBufferMemory

from langchain import ConversationChain
from langchain.prompts import (
ChatPromptTemplate,
MessagesPlaceholder,
SystemMessagePromptTemplate,
HumanMessagePromptTemplate
)

SYSTEM_PROMPT = "The following is a friendly conversation between a human and an AI, whose name is Jupyter AI. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know."
from jupyter_ai.actors.base import BaseActor, Logger, ACTOR_TYPE
from jupyter_ai.actors.memory import RemoteMemory
from jupyter_ai.models import HumanChatMessage
from jupyter_ai_magics.providers import ChatOpenAINewProvider

SYSTEM_PROMPT = "The following is a friendly conversation between a human and an AI, whose name is Jupyter AI. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know."

@ray.remote
class DefaultActor(BaseActor):
def __init__(self, reply_queue: Queue, log: Logger):
super().__init__(log=log, reply_queue=reply_queue)
# TODO: Should take the provider/model id as strings
super().__init__(reply_queue=reply_queue, log=log)
provider = ChatOpenAINewProvider(model_id="gpt-3.5-turbo")

# Create a conversation memory
Expand All @@ -40,12 +37,6 @@ def __init__(self, reply_queue: Queue, log: Logger):
)
self.chat_provider = chain

def process_message(self, message: HumanChatMessage):
def _process_message(self, message: HumanChatMessage):
response = self.chat_provider.predict(input=message.body)
agent_message = AgentChatMessage(
id=uuid4().hex,
time=time.time(),
body=response,
reply_to=message.id
)
self.reply_queue.put(agent_message)
self.reply(response, message)
63 changes: 0 additions & 63 deletions packages/jupyter-ai/jupyter_ai/actors/index.py

This file was deleted.

119 changes: 119 additions & 0 deletions packages/jupyter-ai/jupyter_ai/actors/learn.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
import os
import traceback
from collections import Counter
import argparse

import ray
from ray.util.queue import Queue

from jupyter_core.paths import jupyter_data_dir

from langchain import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import (
RecursiveCharacterTextSplitter, PythonCodeTextSplitter,
MarkdownTextSplitter, LatexTextSplitter
)

from jupyter_ai.models import HumanChatMessage
from jupyter_ai.actors.base import BaseActor, Logger
from jupyter_ai_magics.providers import ChatOpenAINewProvider
from jupyter_ai.document_loaders.directory import RayRecursiveDirectoryLoader
from jupyter_ai.document_loaders.splitter import ExtensionSplitter, NotebookSplitter


@ray.remote
class LearnActor(BaseActor):
    """Processes messages prefixed with /learn.

    Loads and splits the files under a path relative to `root_dir`,
    adds the resulting chunks to a FAISS index, and saves that index
    under the Jupyter data directory. `/learn -d` deletes the saved
    index and starts over with a fresh one.
    """

    def __init__(self, reply_queue: Queue, log: Logger, root_dir: str):
        """Configure the /learn argument parser and load the index if possible.

        Args:
            reply_queue: Queue that replies are put on.
            log: Logger handed to the base actor.
            root_dir: Directory that paths given to /learn are joined onto.
        """
        super().__init__(reply_queue=reply_queue, log=log)
        self.root_dir = root_dir
        self.index_save_dir = os.path.join(jupyter_data_dir(), 'jupyter_ai', 'indices')
        self.chunk_size = 2000
        self.chunk_overlap = 100
        self.parser.prog = '/learn'
        self.parser.add_argument('-v', '--verbose', action='store_true')
        self.parser.add_argument('-d', '--delete', action='store_true')
        self.parser.add_argument('path', nargs=argparse.REMAINDER)
        self.index_name = 'default'
        self.index = None

        # Without the provider's credential in the environment the index
        # is left unset here; it is built on first use instead.
        if ChatOpenAINewProvider.auth_strategy.name not in os.environ:
            return

        self._ensure_index()

    def _ensure_index(self):
        """Make sure the save directory exists and `self.index` is populated."""
        os.makedirs(self.index_save_dir, exist_ok=True)
        self.load_or_create()

    def _process_message(self, message: HumanChatMessage):
        """Handle one /learn message: delete the index or learn a path.

        Args:
            message: The human chat message carrying the /learn command.
        """
        args = self.parse_args(message)
        if args is None:
            return

        if args.delete:
            self.delete()
            self.reply("👍 I have deleted everything I previously learned.", message)
            return

        # Exactly one path is expected.
        if len(args.path) != 1:
            self.reply(self.parser.format_usage(), message)
            return

        # Make sure the path exists.
        load_path = os.path.join(self.root_dir, args.path[0])
        if not os.path.exists(load_path):
            self.reply(f"Sorry, that path doesn't exist: {load_path}", message)
            return

        # The index is None when __init__ returned early. Build it before
        # the document load so a failure surfaces immediately rather than
        # as an AttributeError on None after the work is done.
        if self.index is None:
            self._ensure_index()

        if args.verbose:
            self.reply(f"Loading and splitting files for {load_path}", message)

        chunking = dict(chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)
        splitter = ExtensionSplitter(
            splitters={
                '.py': PythonCodeTextSplitter(**chunking),
                '.md': MarkdownTextSplitter(**chunking),
                '.tex': LatexTextSplitter(**chunking),
                '.ipynb': NotebookSplitter(**chunking),
            },
            default_splitter=RecursiveCharacterTextSplitter(**chunking),
        )

        loader = RayRecursiveDirectoryLoader(load_path)
        texts = loader.load_and_split(text_splitter=splitter)
        self.index.add_documents(texts)
        self.save()

        response = (
            f"🎉 I have indexed documents at **{load_path}** and I am ready to answer questions about them.\n"
            "You can ask questions about these docs by prefixing your message with **/ask**."
        )
        self.reply(response, message)

    def get_index(self):
        """Return the current index, which is None if it was never built."""
        return self.index

    def delete(self):
        """Drop the in-memory index, remove its saved files, and recreate it."""
        self.index = None
        for ext in ('.pkl', '.faiss'):
            path = os.path.join(self.index_save_dir, self.index_name + ext)
            if os.path.isfile(path):
                os.remove(path)
        self.create()

    def create(self):
        """Build a new index seeded with one placeholder text and save it."""
        embeddings = OpenAIEmbeddings()
        self.index = FAISS.from_texts(["Jupyter AI knows about your filesystem, to ask questions first use the /learn command."], embeddings)
        self.save()

    def save(self):
        """Save the index to `index_save_dir`; does nothing when it is unset."""
        if self.index is not None:
            self.index.save_local(self.index_save_dir, index_name=self.index_name)

    def load_or_create(self):
        """Load the saved index, or create a new one if loading fails.

        Does nothing when an index is already held in memory.
        """
        if self.index is not None:
            return
        embeddings = OpenAIEmbeddings()
        try:
            self.index = FAISS.load_local(self.index_save_dir, embeddings, index_name=self.index_name)
        except Exception:
            # Deliberately best-effort: any failure to load (for example
            # no index saved yet) falls back to a fresh index.
            self.create()
Loading

0 comments on commit 2f1735c

Please sign in to comment.