google drive crawler #101
Merged

Commits (17):
90a1167  google drive crawler
fba3ede  updated gdrive crawler config
a5a9568  Refactored GdriveCrawler to use date filtering and removed local stor…
08d1bcb  Refactor: Switch to using index_file() for direct uploads, sanitize f…
9361662  added numpy
78f9d38  Refactor gdrive_crawler.py: use slugify, logging.info, adjust date co…
1690c8e  standardize date handling
bf2deef  changed the file to earlier versions
c59c0c3  changed crawing to crawling
24bf28d  Merge branch 'main' into gdrive_crawler
4fed2b6  removed redundant checks on dates and clubbed download() and export()… (ofermend)
8f7d906  resolving commit issues
7b07a8a  Merge branch 'gdrive_crawler' of https://github.com/vectara/vectara-i…
b899019  minor fixes
6b2aa76  small mypy fix (ofermend)
48f2f02  updated Docker load of credentials.json (ofermend)
8b5e50d  added typing annotations (ofermend)
New file (@@ -0,0 +1,12 @@): crawler configuration

```yaml
vectara:
  corpus_id: 277
  customer_id: 1526022105
  reindex: true

crawling:
  crawler_type: gdrive

gdrive_crawler:
  delegated_users:
    - <add email id>
    - <add email id>
```
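To illustrate how a config like this is consumed, here is a minimal sketch using OmegaConf (which the crawler module below already imports). The file path and the exact key access are assumptions based on the snippet above, not something shown in this PR:

```python
from omegaconf import OmegaConf

# Hypothetical path; the actual config filename is not visible in this PR view.
cfg = OmegaConf.load("config/gdrive.yaml")

assert cfg.crawling.crawler_type == "gdrive"
delegated_users = list(cfg.gdrive_crawler.delegated_users)
print(delegated_users)  # e.g. ['alice@example.com', 'bob@example.com']
```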
New file (@@ -0,0 +1,178 @@): gdrive_crawler.py

```python
import os
from core.crawler import Crawler
from omegaconf import OmegaConf
import logging
import io
from datetime import datetime, timedelta
from google.oauth2 import service_account
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from googleapiclient.http import MediaIoBaseDownload
import pandas as pd
from typing import List
from slugify import slugify

SCOPES = ["https://www.googleapis.com/auth/drive.readonly"]
SERVICE_ACCOUNT_FILE = 'credentials.json'

def get_credentials(delegated_user):
    credentials = service_account.Credentials.from_service_account_file(
        SERVICE_ACCOUNT_FILE, scopes=SCOPES)
    delegated_credentials = credentials.with_subject(delegated_user)
    return delegated_credentials
```
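Note that `with_subject()` only works if domain-wide delegation is enabled for the service account in the Google Workspace admin console. A quick smoke test of the delegated credentials might look like the following sketch (hypothetical snippet, not part of the PR; it reuses `get_credentials` from the file above):

```python
from googleapiclient.discovery import build

creds = get_credentials("alice@example.com")  # hypothetical delegated user
service = build("drive", "v3", credentials=creds)

# about().get() returns info about the impersonated user; a successful call
# confirms delegation and the readonly Drive scope are set up correctly.
user_info = service.about().get(fields="user").execute()
print(user_info["user"]["emailAddress"])
```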
```python
def download_or_export_file(service, file_id, mime_type=None):
    try:
        if mime_type:
            request = service.files().export_media(fileId=file_id, mimeType=mime_type)
        else:
            request = service.files().get_media(fileId=file_id)

        byte_stream = io.BytesIO()  # an in-memory bytestream
        downloader = MediaIoBaseDownload(byte_stream, request)
        done = False
        while not done:
            status, done = downloader.next_chunk()
            logging.info(f"Download {int(status.progress() * 100)}%.")
        byte_stream.seek(0)  # Reset the file pointer to the beginning
        return byte_stream
    except HttpError as error:
        logging.info(f"An error occurred: {error}")
        return None
    # Note: Handling of large files that may exceed memory limits should be implemented if necessary.
```
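The in-code note about large files could be addressed by streaming the download to disk instead of buffering the whole file in a BytesIO. A minimal sketch of that idea (hypothetical helper, not part of this PR):

```python
import logging
from googleapiclient.http import MediaIoBaseDownload

def download_file_to_disk(service, file_id, dest_path, mime_type=None, chunk_mb=10):
    # Stream the download straight to a file on disk so memory use is
    # bounded by the chunk size rather than the full file size.
    if mime_type:
        request = service.files().export_media(fileId=file_id, mimeType=mime_type)
    else:
        request = service.files().get_media(fileId=file_id)
    with open(dest_path, 'wb') as fh:
        downloader = MediaIoBaseDownload(fh, request, chunksize=chunk_mb * 1024 * 1024)
        done = False
        while not done:
            status, done = downloader.next_chunk()
            logging.info(f"Download {int(status.progress() * 100)}%.")
    return dest_path
```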
```python
def save_local_file(service, file_id, name, mime_type=None):
    sanitized_name = slugify(name)
    file_path = os.path.join("/tmp", sanitized_name)
    try:
        byte_stream = download_or_export_file(service, file_id, mime_type)
        if byte_stream:
            with open(file_path, 'wb') as f:
                f.write(byte_stream.read())
            return file_path
    except Exception as e:
        logging.info(f"Error saving local file: {e}")
    return None
```
```python
class GdriveCrawler(Crawler):

    def __init__(self, cfg: OmegaConf, endpoint: str, customer_id: str, corpus_id: int, api_key: str, delegated_users: List[str]) -> None:
        super().__init__(cfg, endpoint, customer_id, corpus_id, api_key)
        logging.info("Google Drive Crawler initialized")

        self.delegated_users = delegated_users
        self.creds = None
        self.service = None
        self.api_key = api_key
        self.customer_id = customer_id
        self.corpus_id = corpus_id

    def list_files(self, service, parent_id=None, date_threshold=None):
        results = []
        page_token = None
        query = (
            f"('{parent_id}' in parents or sharedWithMe) and trashed=false and modifiedTime > '{date_threshold}'"
            if parent_id else
            f"('root' in parents or sharedWithMe) and trashed=false and modifiedTime > '{date_threshold}'"
        )

        while True:
            try:
                params = {
                    'fields': 'nextPageToken, files(id, name, mimeType, permissions, modifiedTime, createdTime, owners, size)',
                    'q': query,
                    'corpora': 'allDrives',
                    'includeItemsFromAllDrives': True,
                    'supportsAllDrives': True
                }
                if page_token:
                    params['pageToken'] = page_token
                response = service.files().list(**params).execute()
                files = response.get('files', [])
                for file in files:
                    permissions = file.get('permissions', [])
                    if any(p.get('displayName') == 'Vectara' or p.get('displayName') == 'all' for p in permissions):
                        results.append(file)
                page_token = response.get('nextPageToken', None)
                if not page_token:
                    break
            except HttpError as error:
                logging.info(f"An error occurred: {error}")
                break
        return results
```
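For reference, the string assembled above follows Drive's search query syntax; an illustrative rendering with example values (folder ID and threshold are hypothetical):

```python
# Illustrative only: what list_files() builds when parent_id is set.
parent_id = "1AbCdEfGhIjK"               # hypothetical folder ID
date_threshold = "2024-01-01T00:00:00Z"  # hypothetical threshold
query = (
    f"('{parent_id}' in parents or sharedWithMe) "
    f"and trashed=false and modifiedTime > '{date_threshold}'"
)
```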
```python
    def handle_file(self, file):
        file_id = file['id']
        mime_type = file['mimeType']
        name = file['name']
        permissions = file.get('permissions', [])

        logging.info(f"\nHandling file: {name} with MIME type: {mime_type}")

        if not any(p.get('displayName') == 'Vectara' or p.get('displayName') == 'all' for p in permissions):
            logging.info(f"Skipping restricted file: {name}")
            return None, None  # keep the two-tuple shape expected by crawl_file()

        if mime_type == 'application/vnd.google-apps.document':
            local_file_path = save_local_file(self.service, file_id, name + '.docx', 'application/vnd.openxmlformats-officedocument.wordprocessingml.document')
            url = f'https://docs.google.com/document/d/{file_id}/edit'
        elif mime_type == 'application/vnd.google-apps.spreadsheet':
            local_file_path = save_local_file(self.service, file_id, name + '.csv', 'text/csv')
            url = f'https://docs.google.com/spreadsheets/d/{file_id}/edit'
        elif mime_type == 'application/vnd.google-apps.presentation':
            local_file_path = save_local_file(self.service, file_id, name + '.pptx', 'application/vnd.openxmlformats-officedocument.presentationml.presentation')
            url = f'https://docs.google.com/presentation/d/{file_id}/edit'
        elif mime_type.startswith('application/'):
            local_file_path = save_local_file(self.service, file_id, name)
            if local_file_path and name.endswith('.xlsx'):
                df = pd.read_excel(local_file_path)
                csv_file_path = local_file_path.replace('.xlsx', '.csv')
                df.to_csv(csv_file_path, index=False)
                local_file_path = csv_file_path
            url = f'https://drive.google.com/file/d/{file_id}/view'
        else:
            logging.info(f"Unsupported file type: {mime_type}")
            return None, None

        if local_file_path:
            logging.info(f"local_file_path :: {local_file_path}")
            return local_file_path, url
        else:
            logging.info("local_file_path :: None")
            return None, None
```
```python
    def crawl_file(self, file):
        local_file_path, url = self.handle_file(file)
        if local_file_path:
            file_id = file['id']
            name = file['name']
            created_time = file.get('createdTime', 'N/A')
            modified_time = file.get('modifiedTime', 'N/A')
            owners = ', '.join([owner['displayName'] for owner in file.get('owners', [])])
            size = file.get('size', 'N/A')

            logging.info(f'\nCrawling file {name}')

            file_metadata = {
                'id': file_id,
                'name': name,
                'created_at': created_time,
                'modified_at': modified_time,
                'owners': owners,
                'size': size,
                'source': 'gdrive'
            }

            try:
                self.indexer.index_file(filename=local_file_path, uri=url, metadata=file_metadata)
            except Exception as e:
                logging.info(f"Error {e} indexing document for file {name}, file_id {file_id}")

    def crawl(self) -> None:
        N = 7  # Number of days to look back
        date_threshold = datetime.utcnow() - timedelta(days=N)

        for user in self.delegated_users:
            logging.info(f"Processing files for user: {user}")
            self.creds = get_credentials(user)
            self.service = build("drive", "v3", credentials=self.creds)

            list_files = self.list_files(self.service, date_threshold=date_threshold.isoformat() + 'Z')
```
> **Review comment:** I see you using the ISO format + "Z": is this because the Google API demands it to be in UTC format and in this way?
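For context on the question above: Drive's query language compares `modifiedTime` against RFC 3339 timestamps, which are interpreted as UTC by default, and the trailing 'Z' makes the UTC offset explicit. A minimal illustration of the value being built:

```python
from datetime import datetime, timedelta

threshold = datetime.utcnow() - timedelta(days=7)
# e.g. '2024-01-01T12:00:00.000000Z', an RFC 3339-style timestamp, explicitly UTC
print(threshold.isoformat() + 'Z')
```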
```python
            for file in list_files:
                self.crawl_file(file)
```
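One possible cleanup of the `handle_file()` dispatch, offered only as a sketch (not part of this PR), is a data-driven table mapping each Google-native MIME type to its export MIME type, file extension, and URL pattern:

```python
# Hypothetical refactor sketch: data-driven export mapping for handle_file().
EXPORT_MAP = {
    'application/vnd.google-apps.document': (
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        '.docx', 'https://docs.google.com/document/d/{id}/edit'),
    'application/vnd.google-apps.spreadsheet': (
        'text/csv', '.csv', 'https://docs.google.com/spreadsheets/d/{id}/edit'),
    'application/vnd.google-apps.presentation': (
        'application/vnd.openxmlformats-officedocument.presentationml.presentation',
        '.pptx', 'https://docs.google.com/presentation/d/{id}/edit'),
}

entry = EXPORT_MAP.get('application/vnd.google-apps.document')
if entry:
    export_mime, ext, url_tpl = entry
    print(ext, url_tpl.format(id='FILE_ID'))
```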
> **Review comment:** This is not a good idea. I'll explain offline why.