Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ray based document parsing of more file types #94

Merged
merged 8 commits into from
Apr 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion packages/jupyter-ai-magics/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ dependencies = [
"ipython",
"pydantic",
"importlib_metadata~=5.2.0",
"langchain~=0.0.115"
"langchain~=0.0.144"
]

[project.optional-dependencies]
Expand Down
Original file line number Diff line number Diff line change
@@ -1,25 +1,27 @@
from langchain import OpenAI
import argparse

import ray
import time
from uuid import uuid4
from ray.util.queue import Queue

from langchain import OpenAI
from langchain.chains import ConversationalRetrievalChain
from jupyter_ai.models import AgentChatMessage, HumanChatMessage

from jupyter_ai.models import HumanChatMessage
from jupyter_ai.actors.base import ACTOR_TYPE, BaseActor, Logger


@ray.remote
class FileSystemActor(BaseActor):
"""Processes messages prefixed with /fs. This actor will
class AskActor(BaseActor):
"""Processes messages prefixed with /ask. This actor will
send the message as input to a RetrieverQA chain, that
follows the Retrieval-Augmented Generation (RAG) technique to
query the documents from the index, and sends this context
to the LLM to generate the final reply.
"""

def __init__(self, reply_queue: Queue, log: Logger):
super().__init__(log=log, reply_queue=reply_queue)
index_actor = ray.get_actor(ACTOR_TYPE.INDEX.value)
super().__init__(reply_queue=reply_queue, log=log)
index_actor = ray.get_actor(ACTOR_TYPE.LEARN.value)
handle = index_actor.get_index.remote()
vectorstore = ray.get(handle)
if not vectorstore:
Expand All @@ -31,22 +33,26 @@ def __init__(self, reply_queue: Queue, log: Logger):
vectorstore.as_retriever()
)

def process_message(self, message: HumanChatMessage):
query = message.body.split(' ', 1)[-1]
self.parser.prog = '/ask'
self.parser.add_argument('query', nargs=argparse.REMAINDER)


def _process_message(self, message: HumanChatMessage):
args = self.parse_args(message)
if args is None:
return
query = ' '.join(args.query)
if not query:
self.reply(f"{self.parser.format_usage()}", message)
return

index_actor = ray.get_actor(ACTOR_TYPE.INDEX.value)
index_actor = ray.get_actor(ACTOR_TYPE.LEARN.value)
handle = index_actor.get_index.remote()
vectorstore = ray.get(handle)
# Have to reference the latest index
self.chat_provider.retriever = vectorstore.as_retriever()

result = self.chat_provider({"question": query, "chat_history": self.chat_history})
reply = result['answer']
self.chat_history.append((query, reply))
agent_message = AgentChatMessage(
id=uuid4().hex,
time=time.time(),
body=reply,
reply_to=message.id
)
self.reply_queue.put(agent_message)
response = result['answer']
self.chat_history.append((query, response))
self.reply(response, message)
48 changes: 41 additions & 7 deletions packages/jupyter-ai/jupyter_ai/actors/base.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,27 @@
import argparse
from enum import Enum
from uuid import uuid4
import time
import logging
from typing import Union
from jupyter_ai.models import HumanChatMessage
import traceback

from ray.util.queue import Queue

from jupyter_ai.models import HumanChatMessage, AgentChatMessage


Logger = Union[logging.Logger, logging.LoggerAdapter]

class ACTOR_TYPE(str, Enum):
    """Identifiers of the actors that chat messages can be dispatched to.

    Each member is also a ``str``, so a member compares equal to its
    plain string value.
    """

    DEFAULT = "default"
    ASK = "ask"
    LEARN = "learn"
    MEMORY = "memory"

# Maps a slash command to the type of actor that handles it.
COMMANDS = {
    "/ask": ACTOR_TYPE.ASK,
    "/learn": ACTOR_TYPE.LEARN,
}

class BaseActor():
Expand All @@ -29,7 +34,36 @@ def __init__(
):
self.log = log
self.reply_queue = reply_queue
self.parser = argparse.ArgumentParser()

def process_message(self, message: HumanChatMessage):
    """Processes the message passed by the `Router`.

    Delegates the real work to `_process_message`. If that raises, the
    exception is not propagated; instead the formatted traceback is sent
    back as a reply to `message`.

    Args:
        message: The human chat message to handle.
    """
    try:
        self._process_message(message)
    except Exception:
        # This wrapper is shared by every actor, so the apology must not
        # assume any particular command (e.g. indexing a path).
        formatted_e = traceback.format_exc()
        response = f"Sorry, something went wrong and I wasn't able to process your message.\n\n```\n{formatted_e}\n```"
        self.reply(response, message)

def _process_message(self, message: HumanChatMessage):
"""Processes the message passed by the `Router`"""
raise NotImplementedError("Should be implemented by subclasses.")

def reply(self, response, message: HumanChatMessage):
    """Put an agent message carrying `response` on the reply queue.

    The new message gets a fresh hex UUID as its id, the current time as
    its timestamp, and `message.id` as the message it replies to.

    Args:
        response: Body of the agent message.
        message: The human chat message being answered.
    """
    agent_message = AgentChatMessage(
        id=uuid4().hex,
        time=time.time(),
        body=response,
        reply_to=message.id,
    )
    self.reply_queue.put(agent_message)

def parse_args(self, message):
    """Parse the arguments that follow the slash command in `message.body`.

    The body is split on spaces. Empty tokens produced by leading,
    trailing, or repeated spaces are discarded so they are not mistaken
    for arguments, and the first remaining token (the command itself) is
    dropped before handing the rest to `self.parser`.

    Args:
        message: The chat message whose `body` holds the command line.

    Returns:
        The parsed `argparse.Namespace`, or `None` when parsing failed, in
        which case the parser's usage string has been sent as a reply.
    """
    tokens = [token for token in message.body.split(' ') if token][1:]
    try:
        return self.parser.parse_args(tokens)
    except (argparse.ArgumentError, SystemExit):
        # argparse reports invalid input by raising SystemExit; turn that
        # into a usage reply rather than letting it propagate.
        self.reply(self.parser.format_usage(), message)
        return None
29 changes: 10 additions & 19 deletions packages/jupyter-ai/jupyter_ai/actors/default.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,25 @@
import time
from uuid import uuid4
from jupyter_ai.actors.base import BaseActor, Logger, ACTOR_TYPE
from jupyter_ai.actors.memory import RemoteMemory
from jupyter_ai.models import AgentChatMessage, HumanChatMessage
from jupyter_ai_magics.providers import ChatOpenAINewProvider
from langchain import ConversationChain
import ray
from ray.util.queue import Queue
from langchain.memory import ConversationBufferMemory

from langchain import ConversationChain
from langchain.prompts import (
ChatPromptTemplate,
MessagesPlaceholder,
SystemMessagePromptTemplate,
HumanMessagePromptTemplate
)

SYSTEM_PROMPT = "The following is a friendly conversation between a human and an AI, whose name is Jupyter AI. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know."
from jupyter_ai.actors.base import BaseActor, Logger, ACTOR_TYPE
from jupyter_ai.actors.memory import RemoteMemory
from jupyter_ai.models import HumanChatMessage
from jupyter_ai_magics.providers import ChatOpenAINewProvider

SYSTEM_PROMPT = "The following is a friendly conversation between a human and an AI, whose name is Jupyter AI. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know."

@ray.remote
class DefaultActor(BaseActor):
def __init__(self, reply_queue: Queue, log: Logger):
super().__init__(log=log, reply_queue=reply_queue)
# TODO: Should take the provider/model id as strings
super().__init__(reply_queue=reply_queue, log=log)
provider = ChatOpenAINewProvider(model_id="gpt-3.5-turbo")

# Create a conversation memory
Expand All @@ -40,12 +37,6 @@ def __init__(self, reply_queue: Queue, log: Logger):
)
self.chat_provider = chain

def process_message(self, message: HumanChatMessage):
def _process_message(self, message: HumanChatMessage):
response = self.chat_provider.predict(input=message.body)
agent_message = AgentChatMessage(
id=uuid4().hex,
time=time.time(),
body=response,
reply_to=message.id
)
self.reply_queue.put(agent_message)
self.reply(response, message)
63 changes: 0 additions & 63 deletions packages/jupyter-ai/jupyter_ai/actors/index.py

This file was deleted.

119 changes: 119 additions & 0 deletions packages/jupyter-ai/jupyter_ai/actors/learn.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
import os
import traceback
from collections import Counter
import argparse

import ray
from ray.util.queue import Queue

from jupyter_core.paths import jupyter_data_dir

from langchain import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import (
RecursiveCharacterTextSplitter, PythonCodeTextSplitter,
MarkdownTextSplitter, LatexTextSplitter
)

from jupyter_ai.models import HumanChatMessage
from jupyter_ai.actors.base import BaseActor, Logger
from jupyter_ai_magics.providers import ChatOpenAINewProvider
from jupyter_ai.document_loaders.directory import RayRecursiveDirectoryLoader
from jupyter_ai.document_loaders.splitter import ExtensionSplitter, NotebookSplitter


@ray.remote
class LearnActor(BaseActor):
    """Processes messages prefixed with /learn.

    Loads the files under a given path, splits them into chunks with a
    splitter chosen by file extension, and adds the chunks to a FAISS
    index that is saved under the Jupyter data directory. The index is
    handed to other actors through `get_index`.
    """

    def __init__(self, reply_queue: Queue, log: Logger, root_dir: str):
        """Configure the /learn argument parser and load any saved index.

        Args:
            reply_queue: Queue on which replies are placed.
            log: Logger passed through to the base actor.
            root_dir: Directory that /learn paths are joined onto.
        """
        super().__init__(reply_queue=reply_queue, log=log)
        self.root_dir = root_dir
        self.index_save_dir = os.path.join(jupyter_data_dir(), 'jupyter_ai', 'indices')
        self.chunk_size = 2000
        self.chunk_overlap = 100
        self.parser.prog = '/learn'
        self.parser.add_argument('-v', '--verbose', action='store_true')
        self.parser.add_argument('-d', '--delete', action='store_true')
        self.parser.add_argument('path', nargs=argparse.REMAINDER)
        self.index_name = 'default'
        self.index = None

        # When the provider's credential variable is absent from the
        # environment, skip index setup; it is attempted again lazily the
        # first time documents are learned.
        if ChatOpenAINewProvider.auth_strategy.name not in os.environ:
            return

        os.makedirs(self.index_save_dir, exist_ok=True)
        self.load_or_create()

    def _process_message(self, message: HumanChatMessage):
        """Handle a /learn message: delete the index or index a path.

        Replies with the usage string unless exactly one path is given,
        and with an error reply when the path does not exist.
        """
        args = self.parse_args(message)
        if args is None:
            return

        if args.delete:
            self.delete()
            self.reply("👍 I have deleted everything I previously learned.", message)
            return

        # Exactly one path must be supplied.
        if len(args.path) != 1:
            self.reply(self.parser.format_usage(), message)
            return

        short_path = args.path[0]
        load_path = os.path.join(self.root_dir, short_path)
        # Make sure the path exists.
        if not os.path.exists(load_path):
            response = f"Sorry, that path doesn't exist: {load_path}"
            self.reply(response, message)
            return

        # __init__ leaves the index unset when the credential variable is
        # missing; build it now so add_documents is never called on None.
        if self.index is None:
            self.load_or_create()

        if args.verbose:
            self.reply(f"Loading and splitting files for {load_path}", message)

        size_kwargs = {'chunk_size': self.chunk_size, 'chunk_overlap': self.chunk_overlap}
        splitters = {
            '.py': PythonCodeTextSplitter(**size_kwargs),
            '.md': MarkdownTextSplitter(**size_kwargs),
            '.tex': LatexTextSplitter(**size_kwargs),
            '.ipynb': NotebookSplitter(**size_kwargs),
        }
        splitter = ExtensionSplitter(
            splitters=splitters,
            default_splitter=RecursiveCharacterTextSplitter(**size_kwargs),
        )

        loader = RayRecursiveDirectoryLoader(load_path)
        texts = loader.load_and_split(text_splitter=splitter)
        self.index.add_documents(texts)
        self.save()

        response = (
            f"🎉 I have indexed documents at **{load_path}** and I am ready to answer questions about them.\n"
            "You can ask questions about these docs by prefixing your message with **/ask**."
        )
        self.reply(response, message)

    def get_index(self):
        """Return the current index, or None if none has been built."""
        return self.index

    def delete(self):
        """Remove the saved index files and replace the index with a fresh one."""
        self.index = None
        paths = [os.path.join(self.index_save_dir, self.index_name + ext) for ext in ['.pkl', '.faiss']]
        for path in paths:
            if os.path.isfile(path):
                os.remove(path)
        self.create()

    def create(self):
        """Create a new index seeded with a single hint text and save it."""
        embeddings = OpenAIEmbeddings()
        self.index = FAISS.from_texts(["Jupyter AI knows about your filesystem, to ask questions first use the /learn command."], embeddings)
        self.save()

    def save(self):
        """Write the index to the save directory, if an index exists."""
        if self.index is not None:
            # The directory may not exist yet when index setup was skipped
            # in __init__.
            os.makedirs(self.index_save_dir, exist_ok=True)
            self.index.save_local(self.index_save_dir, index_name=self.index_name)

    def load_or_create(self):
        """Load the saved index, or create a new one if loading fails.

        Does nothing when an index is already set.
        """
        if self.index is not None:
            return
        embeddings = OpenAIEmbeddings()
        try:
            self.index = FAISS.load_local(self.index_save_dir, embeddings, index_name=self.index_name)
        except Exception:
            # Best effort: any load failure falls back to a new index.
            self.create()
Loading