Skip to content

Commit

Permalink
add gmail loader (#9810)
Browse files Browse the repository at this point in the history
  • Loading branch information
hwchase17 authored Aug 28, 2023
1 parent 0d01ced commit c1badc1
Show file tree
Hide file tree
Showing 3 changed files with 290 additions and 1 deletion.
2 changes: 1 addition & 1 deletion docs/extras/integrations/chat_loaders/facebook.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -571,7 +571,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.2"
"version": "3.10.1"
}
},
"nbformat": 4,
Expand Down
179 changes: 179 additions & 0 deletions docs/extras/integrations/chat_loaders/gmail.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "b3d1705d",
"metadata": {},
"source": [
"# GMail\n",
"\n",
"This loader goes over how to load data from GMail. There are many ways you could want to load data from GMail. This loader is currently fairly opinionated in how to do so. It first looks for all messages that you have sent. It then looks for messages where you are responding to a previous email. It then fetches that previous email, and creates a training example of that email, followed by your email.\n",
"\n",
"Note that there are clear limitations here. For example, all examples created are only looking at the previous email for context.\n",
"\n",
"To use:\n",
"\n",
"- Set up a Google Developer Account: Go to the Google Developer Console, create a project, and enable the Gmail API for that project. This will give you a credentials.json file that you'll need later.\n",
"\n",
"- Install the Google Client Library: Run the following command to install the Google Client Library:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "84578039",
"metadata": {},
"outputs": [],
"source": [
"!pip install --upgrade google-auth google-auth-oauthlib google-auth-httplib2 google-api-python-client"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "be18f796",
"metadata": {},
"outputs": [],
"source": [
"import os.path\n",
"import base64\n",
"import json\n",
"import re\n",
"import time\n",
"from google.auth.transport.requests import Request\n",
"from google.oauth2.credentials import Credentials\n",
"from google_auth_oauthlib.flow import InstalledAppFlow\n",
"from googleapiclient.discovery import build\n",
"import logging\n",
"import requests\n",
"\n",
"SCOPES = ['https://www.googleapis.com/auth/gmail.readonly']\n",
"\n",
"\n",
"creds = None\n",
"# The file email_token.json stores the user's access and refresh tokens, and is\n",
"# created automatically when the authorization flow completes for the first\n",
"# time.\n",
"if os.path.exists('email_token.json'):\n",
" creds = Credentials.from_authorized_user_file('email_token.json', SCOPES)\n",
"# If there are no (valid) credentials available, let the user log in.\n",
"if not creds or not creds.valid:\n",
" if creds and creds.expired and creds.refresh_token:\n",
" creds.refresh(Request())\n",
" else:\n",
" flow = InstalledAppFlow.from_client_secrets_file( \n",
" # your creds file here. Please create json file as here https://cloud.google.com/docs/authentication/getting-started\n",
" 'creds.json', SCOPES)\n",
" creds = flow.run_local_server(port=0)\n",
" # Save the credentials for the next run\n",
" with open('email_token.json', 'w') as token:\n",
" token.write(creds.to_json())"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "a2793ba0",
"metadata": {},
"outputs": [],
"source": [
"from langchain.chat_loaders.gmail import GMailLoader"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "2154597f",
"metadata": {},
"outputs": [],
"source": [
"loader = GMailLoader(creds=creds, n=3)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "0b7d11bd",
"metadata": {},
"outputs": [],
"source": [
"data = loader.load()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "74764bc7",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Sometimes there can be errors which we silently ignore\n",
"len(data)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "d9360a85",
"metadata": {},
"outputs": [],
"source": [
"from langchain.chat_loaders.utils import (\n",
" map_ai_messages,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "a9646f7a",
"metadata": {},
"outputs": [],
"source": [
"# This makes messages sent by [email protected] the AI Messages\n",
"# This means you will train an LLM to predict as if it's responding as hchase\n",
"training_data = list(map_ai_messages(data, sender=\"Harrison Chase <[email protected]>\"))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d1a182f0",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.1"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
110 changes: 110 additions & 0 deletions libs/langchain/langchain/chat_loaders/gmail.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
import base64
import re
from typing import Any, Iterator

from langchain.chat_loaders.base import BaseChatLoader, ChatSession
from langchain.schema.messages import HumanMessage


def _extract_email_content(msg: Any) -> HumanMessage:
from_email = None
for values in msg["payload"]["headers"]:
name = values["name"]
if name == "From":
from_email = values["value"]
if from_email is None:
raise ValueError
for part in msg["payload"]["parts"]:
if part["mimeType"] == "text/plain":
data = part["body"]["data"]
data = base64.urlsafe_b64decode(data).decode("utf-8")
# Regular expression to split the email body at the first
# occurrence of a line that starts with "On ... wrote:"
pattern = re.compile(r"\r\nOn .+(\r\n)*wrote:\r\n")
# Split the email body and extract the first part
newest_response = re.split(pattern, data)[0]
message = HumanMessage(
content=newest_response, additional_kwargs={"sender": from_email}
)
return message
raise ValueError


def _get_message_data(service: Any, message: Any) -> ChatSession:
    """Build a two-message chat session from a reply and the email it answers.

    Fetches the full message for ``message["id"]``, reads its ``In-Reply-To``
    header, fetches the message's thread, and looks in that thread for the
    email whose ``Message-ID`` header equals the ``In-Reply-To`` value.

    Args:
        service: Gmail API client; must expose ``users().messages().get``
            and ``users().threads().get``.
        message: Message stub containing at least an ``"id"`` key.

    Returns:
        A ``ChatSession`` whose messages are the replied-to email followed by
        the reply itself.

    Raises:
        ValueError: If the message is not a reply (no ``In-Reply-To``
            header), if the replied-to email is not in the thread, or if
            either email's content cannot be extracted.
    """
    msg = service.users().messages().get(userId="me", id=message["id"]).execute()
    message_content = _extract_email_content(msg)

    in_reply_to = None
    for header in msg["payload"]["headers"]:
        # Header field names are case-insensitive.
        if header["name"].lower() == "in-reply-to":
            in_reply_to = header["value"]
    if in_reply_to is None:
        raise ValueError("Email is not a reply: no 'In-Reply-To' header.")

    thread = service.users().threads().get(userId="me", id=msg["threadId"]).execute()

    response_email = None
    for candidate in thread["messages"]:
        # Compare per message, so a message lacking a Message-ID header can
        # never be matched through a value left over from an earlier one.
        is_replied_to = any(
            header["name"].lower() == "message-id"
            and header["value"] == in_reply_to
            for header in candidate["payload"]["headers"]
        )
        if is_replied_to:
            response_email = candidate
            break
    if response_email is None:
        raise ValueError(
            f"Could not find the replied-to email {in_reply_to!r} in the thread."
        )

    starter_content = _extract_email_content(response_email)
    return ChatSession(messages=[starter_content, message_content])


class GMailLoader(BaseChatLoader):
    """Load chat sessions from GMail.

    There are many ways you could want to load data from GMail.
    This loader is currently fairly opinionated in how to do so.
    It first lists messages that you have sent. It then keeps the ones
    where you are responding to a previous email. It then fetches that
    previous email, and creates a training example of that email,
    followed by your email.

    Note that there are clear limitations here. For example,
    all examples created are only looking at the previous email for context.

    To use:
    - Set up a Google Developer Account:
        Go to the Google Developer Console, create a project,
        and enable the Gmail API for that project.
        This will give you a credentials.json file that you'll need later.
    """

    def __init__(self, creds: Any, n: int = 100, raise_error: bool = False) -> None:
        """Store the loader configuration.

        Args:
            creds: Credentials object handed to the Gmail API client.
            n: Passed as ``maxResults`` when listing sent messages.
                NOTE(review): the API may cap the page size, in which case
                large values would need pagination - confirm.
            raise_error: If True, re-raise any error hit while building a
                session; if False, skip the offending message.
        """
        super().__init__()
        self.creds = creds
        self.n = n
        self.raise_error = raise_error

    def lazy_load(self) -> Iterator[ChatSession]:
        """Lazily yield one ``ChatSession`` per sent message that is a reply.

        Yields:
            Chat sessions made of the replied-to email followed by the reply.

        Raises:
            ImportError: If ``googleapiclient`` is not installed.
            Exception: Any per-message error, only when ``raise_error`` is
                True; otherwise such messages are skipped.
        """
        import logging

        try:
            from googleapiclient.discovery import build
        except ImportError as e:
            raise ImportError(
                "Could not import googleapiclient. Please install it with "
                "`pip install google-api-python-client`."
            ) from e

        logger = logging.getLogger(__name__)

        service = build("gmail", "v1", credentials=self.creds)
        results = (
            service.users()
            .messages()
            .list(userId="me", labelIds=["SENT"], maxResults=self.n)
            .execute()
        )
        for message in results.get("messages", []):
            # Keep the try body minimal: only building the session may be
            # skipped on failure; errors raised by the consumer at the yield
            # point must not be swallowed here.
            try:
                session = _get_message_data(service, message)
            except Exception:
                # TODO: handle errors better
                if self.raise_error:
                    raise
                logger.debug("Skipping GMail message %r", message, exc_info=True)
                continue
            yield session

0 comments on commit c1badc1

Please sign in to comment.