4 memory #3

Open · wants to merge 12 commits into base: main (showing changes from all commits)
Binary file added .DS_Store
Binary file not shown.
7 changes: 6 additions & 1 deletion Pipfile
@@ -12,13 +12,18 @@ openai = "*"
pinecone-client = "*"
unstructured = "*"
nltk = "*"
python-magic-bin = "*"
fastapi = "*"
jinja2 = "*"
uvicorn = "*"
streamlit = "*"
streamlit-chat = "*"
tqdm = "*"
watchdog = "*"
yt-dlp = "*"
youtube-dl = "*"
pydub = "*"
faiss-cpu = "*"
python-dotenv = "*"

[dev-packages]

1,452 changes: 929 additions & 523 deletions Pipfile.lock

Large diffs are not rendered by default.

67 changes: 1 addition & 66 deletions README.md
@@ -1,66 +1 @@

# LangChain Documentation Helper

A repository for learning LangChain by building a generative AI application.

This web application uses Pinecone as a vector store and answers questions about LangChain,
with sources drawn from the official LangChain documentation.


![Logo](https://github.com/emarco177/documentation-helper/blob/main/static/banner.gif)


## Environment Variables

To run this project, you will need to add the following environment variables to your .env file

`PINECONE_API_KEY`
`PINECONE_ENVIRONMENT_REGION`
`OPENAI_API_KEY`
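
For reference, a minimal `.env` could look like this (the values below are placeholders, not real credentials; the Pinecone region depends on your own project):

```bash
PINECONE_API_KEY=your-pinecone-api-key
PINECONE_ENVIRONMENT_REGION=us-east1-gcp
OPENAI_API_KEY=sk-your-openai-key
```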

## Run Locally

Clone the project

```bash
git clone https://github.com/emarco177/documentation-helper.git
```

Go to the project directory

```bash
cd documentation-helper
```

Download LangChain Documentation
```bash
mkdir langchain-docs
wget -r -A.html -P langchain-docs https://langchain.readthedocs.io/en/latest/
```

Install dependencies

```bash
pipenv install
```

Start the Streamlit server

```bash
streamlit run main.py
```


## Running Tests

To run tests, run the following command

```bash
pipenv run pytest .
```


## 🔗 Links
[![portfolio](https://img.shields.io/badge/my_portfolio-000?style=for-the-badge&logo=ko-fi&logoColor=white)](https://www.udemy.com/user/eden-marco/)
[![linkedin](https://img.shields.io/badge/linkedin-0A66C2?style=for-the-badge&logo=linkedin&logoColor=white)](https://www.linkedin.com/in/eden-marco/)
[![twitter](https://img.shields.io/badge/twitter-1DA1F2?style=for-the-badge&logo=twitter&logoColor=white)](https://www.udemy.com/user/eden-marco/)
# LangChain Documentation Helper
38 changes: 18 additions & 20 deletions backend/core.py
@@ -1,34 +1,32 @@
 from dotenv import load_dotenv
 import os
-from langchain.embeddings.openai import OpenAIEmbeddings
+from langchain.embeddings import OpenAIEmbeddings
 from langchain.chat_models import ChatOpenAI
-from langchain.chains import RetrievalQA
+from langchain.chains import ConversationalRetrievalChain
 from langchain.vectorstores import Pinecone
 import pinecone
+from typing import Any, Dict, List
+
+from consts import INDEX_NAME
 
 load_dotenv()
-OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
-
-INDEX_NAME = "langchain-doc-index"
+pinecone.init(
+    api_key=os.environ["PINECONE_API_KEY"],
+    environment=os.environ["PINECONE_ENVIRONMENT_REGION"],
+)
 
 
-def run_llm(query: str):
-    embeddings = OpenAIEmbeddings(openai_api_key=os.environ["OPENAI_API_KEY"])
-    docsearch = Pinecone.from_existing_index(
-        embedding=embeddings,
-        index_name=INDEX_NAME,
-    )
-    chat = ChatOpenAI(
-        verbose=True,
-        temperature=0,
-    )
-
-    qa = RetrievalQA.from_chain_type(
-        llm=chat,
-        chain_type="stuff",
-        retriever=docsearch.as_retriever(),
-        return_source_documents=True,
-    )
-    return qa({"query": query})
+def run_llm(query: str, chat_history: List[Dict[str, Any]] = []):
+    embeddings = OpenAIEmbeddings()
+    docsearch = Pinecone.from_existing_index(INDEX_NAME, embeddings)
+    chat = ChatOpenAI(verbose=True, temperature=0)
+
+    qa = ConversationalRetrievalChain.from_llm(
+        llm=chat, retriever=docsearch.as_retriever(), return_source_documents=True
+    )
+    return qa({"question": query, "chat_history": chat_history})
 
 
 if __name__ == "__main__":
     print(run_llm(query="What is Langchain?"))
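
With this change, conversational memory is the caller's responsibility: `ConversationalRetrievalChain` condenses each follow-up question against `chat_history`, which `main.py` maintains as a list of `(question, answer)` tuples (note the annotation says `List[Dict[str, Any]]`, but tuples are what actually get appended). A minimal sketch of how a caller threads the history through (the queries are illustrative):

```python
from backend.core import run_llm

chat_history = []  # accumulated (question, answer) tuples, oldest first

first = run_llm(query="What is LangChain?", chat_history=chat_history)
chat_history.append(("What is LangChain?", first["answer"]))

# The follow-up is condensed against the prior turn, so "it" resolves to LangChain
second = run_llm(query="What vector stores does it support?", chat_history=chat_history)
print(second["answer"])
```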
1 change: 1 addition & 0 deletions consts.py
@@ -0,0 +1 @@
+INDEX_NAME = "langchain-doc-index"
33 changes: 20 additions & 13 deletions ingestion.py
@@ -1,36 +1,43 @@
 import os
 
 from langchain.document_loaders import ReadTheDocsLoader
-from langchain.embeddings import OpenAIEmbeddings
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.vectorstores import Pinecone
+from langchain.embeddings import OpenAIEmbeddings
 import pinecone
 
+from consts import INDEX_NAME
+
+
 pinecone.init(
     api_key=os.environ["PINECONE_API_KEY"],
     environment=os.environ["PINECONE_ENVIRONMENT_REGION"],
 )
-INDEX_NAME = "langchain-doc-index"
 
 
-def ingest_docs():
-    loader = ReadTheDocsLoader("langchain-docs/langchain.readthedocs.io/en/latest")
+def ingest_docs() -> None:
+    loader = ReadTheDocsLoader(path="langchain-docs/langchain.readthedocs.io/en/latest")
     raw_documents = loader.load()
-    print(f"loaded {len(raw_documents)} documents")
+    print(f"Loaded {len(raw_documents)} documents")
     text_splitter = RecursiveCharacterTextSplitter(
-        chunk_size=400, chunk_overlap=50, separators=["\n\n", "\n", " ", ""]
+        chunk_size=1000, chunk_overlap=100, separators=["\n\n", "\n", " ", ""]
     )
-    documents = text_splitter.split_documents(raw_documents)
+    documents = text_splitter.split_documents(documents=raw_documents)
+    print(f"Split into {len(documents)} chunks")
 
     for doc in documents:
-        new_url = doc.metadata["source"]
-        new_url = new_url.replace("langchain-docs", "https:/")
+        old_path = doc.metadata["source"]
+        new_url = old_path.replace("langchain-docs", "https:/")
         doc.metadata.update({"source": new_url})
 
-    print(f"Going to insert {len(documents)} to Pinecone")
     embeddings = OpenAIEmbeddings()
+    print(f"Going to add {len(documents)} documents to Pinecone")
     Pinecone.from_documents(documents, embeddings, index_name=INDEX_NAME)
-    print("****Loading to vectorestore done ***")
+    print(f"Added {len(documents)} vectors to the Pinecone vectorstore")
 
 
 if __name__ == "__main__":
     ingest_docs()
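
For reviewers: the `source` rewrite in the loop above maps the local scrape directory back onto the original ReadTheDocs URL. A quick illustration (the path is made up for the example):

```python
old_path = "langchain-docs/langchain.readthedocs.io/en/latest/modules/chains.html"
new_url = old_path.replace("langchain-docs", "https:/")
# new_url == "https://langchain.readthedocs.io/en/latest/modules/chains.html"
```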




97 changes: 62 additions & 35 deletions main.py
@@ -1,36 +1,63 @@
-import streamlit as st
-from backend.core import run_llm
-
-
-def show_messages(text):
-    messages_str = [
-        f"{_['role']}: {_['content']}" for _ in st.session_state["messages"][1:]
-    ]
-    text.text_area("Messages", value=str("\n".join(messages_str)), height=400)
-
-
-BASE_PROMPT = [{"role": "system", "content": "You are a helpful assistant."}]
-
-if __name__ == "__main__":
-    if "messages" not in st.session_state:
-        st.session_state["messages"] = BASE_PROMPT
-
-    st.header("LangChain Udemy Course- Helper Bot")
-    text = st.empty()
-    show_messages(text)
-
-    prompt = st.text_input("Prompt", value="Enter your message here...")
-
-    if st.button("Send"):
-        with st.spinner("Generating response..."):
-            st.session_state["messages"] += [{"role": "user", "content": prompt}]
-            message_response = run_llm(query=prompt)
-            formatted_response = message_response["result"]
-            st.session_state["messages"] += [
-                {"role": "system", "content": formatted_response}
-            ]
-            show_messages(text)
-
-    if st.button("Clear"):
-        st.session_state["messages"] = BASE_PROMPT
-        show_messages(text)
+from typing import Set
+
+from backend.core import run_llm
+import streamlit as st
+from streamlit_chat import message
+
+
+def create_sources_string(source_urls: Set[str]) -> str:
+    if not source_urls:
+        return ""
+    sources_list = list(source_urls)
+    sources_list.sort()
+    sources_string = "sources:\n"
+    for i, source in enumerate(sources_list):
+        sources_string += f"{i+1}. {source}\n"
+    return sources_string
+
+
+st.header("LangChain🦜🔗 Udemy Course- Helper Bot")
+
+if (
+    "chat_answers_history" not in st.session_state
+    or "user_prompt_history" not in st.session_state
+    or "chat_history" not in st.session_state
+):
+    st.session_state["chat_answers_history"] = []
+    st.session_state["user_prompt_history"] = []
+    st.session_state["chat_history"] = []
+
+prompt = st.text_input("Prompt", placeholder="Enter your message here...")
+
+if prompt:
+    with st.spinner("Generating response..."):
+        generated_response = run_llm(
+            query=prompt, chat_history=st.session_state["chat_history"]
+        )
+
+        sources = {
+            doc.metadata["source"] for doc in generated_response["source_documents"]
+        }
+        formatted_response = (
+            f"{generated_response['answer']} \n\n {create_sources_string(sources)}"
+        )
+
+        st.session_state["chat_history"].append((prompt, generated_response["answer"]))
+        st.session_state["user_prompt_history"].append(prompt)
+        st.session_state["chat_answers_history"].append(formatted_response)
+
+if st.session_state["chat_answers_history"]:
+    for generated_response, user_query in zip(
+        st.session_state["chat_answers_history"],
+        st.session_state["user_prompt_history"],
+    ):
+        message(user_query, is_user=True)
+        message(generated_response)
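
For reference, `create_sources_string` renders the cited pages as a sorted, numbered list appended to each answer. Example output (URLs illustrative):

```python
print(create_sources_string({
    "https://langchain.readthedocs.io/en/latest/modules/chains.html",
    "https://langchain.readthedocs.io/en/latest/index.html",
}))
# sources:
# 1. https://langchain.readthedocs.io/en/latest/index.html
# 2. https://langchain.readthedocs.io/en/latest/modules/chains.html
```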
Empty file added modules/__init__.py
Empty file.
61 changes: 61 additions & 0 deletions modules/youtubeloader.py
@@ -0,0 +1,61 @@
+from dotenv import load_dotenv
+import os
+
+from langchain.document_loaders.generic import GenericLoader
+from langchain.document_loaders.parsers import OpenAIWhisperParser
+from langchain.document_loaders.blob_loaders.youtube_audio import YoutubeAudioLoader
+
+from langchain.chains import RetrievalQA
+from langchain.vectorstores import FAISS
+from langchain.chat_models import ChatOpenAI
+from langchain.embeddings import OpenAIEmbeddings
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+load_dotenv()
+OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+
+# Two Karpathy lecture videos
+# urls = ["https://youtu.be/kCc8FmEb1nY", "https://youtu.be/VMj-3S1tku0"]
+urls = ["https://youtu.be/kCc8FmEb1nY"]
+
+# Directory to save audio files
+save_dir = "/Users/maximeberthelot/Downloads/YouTube"
+
+# Reuse a cached transcript if one exists; otherwise download and transcribe
+transcript_path = os.path.join(save_dir, "transcript.txt")
+if os.path.exists(transcript_path):
+    with open(transcript_path, "r") as f:
+        text = f.read()
+else:
+    # Transcribe the videos to text with Whisper
+    loader = GenericLoader(
+        YoutubeAudioLoader(urls, save_dir),
+        OpenAIWhisperParser(api_key=OPENAI_API_KEY),
+    )
+    docs = loader.load()
+    # print(docs)  # debug: check whether any documents were transcribed
+
+    # Combine the per-chunk documents into a single transcript string
+    combined_docs = [doc.page_content for doc in docs]
+    text = " ".join(combined_docs)
+
+    # Save the transcript so later runs can skip transcription
+    with open(transcript_path, "w") as f:
+        f.write(text)
+
+# Split the transcript into overlapping chunks
+text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)
+splits = text_splitter.split_text(text)
+
+# Build an in-memory FAISS index over the chunks
+embeddings = OpenAIEmbeddings()
+vectordb = FAISS.from_texts(splits, embeddings)
+
+# Build a QA chain over the index
+qa_chain = RetrievalQA.from_chain_type(
+    llm=ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0),
+    chain_type="stuff",
+    retriever=vectordb.as_retriever(),
+)
+
+# Ask a question!
+query = "Why do we need to zero out the gradient before backprop at each step?"
+answer = qa_chain.run(query)
+print(answer)
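
Since the FAISS index is held in memory, the same chain can serve follow-up questions within the run; for example (question illustrative):

```python
print(qa_chain.run("What does the self-attention mechanism compute?"))
```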
Binary file removed static/banner.gif
Binary file not shown.