From 85bc2933df30b0ff6588a8a2163db599f4841f81 Mon Sep 17 00:00:00 2001 From: Alex Parsons Date: Wed, 30 Mar 2022 16:05:20 +0000 Subject: [PATCH] add google docs to preprocessing options --- requirements.txt | 7 ++- stringprint/tools/google_download.py | 76 ++++++++++++++++++++++++++++ stringprint/tools/preprocessing.py | 30 ++++++++++- 3 files changed, 110 insertions(+), 3 deletions(-) create mode 100644 stringprint/tools/google_download.py diff --git a/requirements.txt b/requirements.txt index d6c4a5f..0f35c7b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -44,7 +44,10 @@ xlwt==1.3.0 openpyxl==2.6.4 xlrd==1.2.0 PyMuPDF==1.18.16 -black==22.1.0 +black==22.3.0 pylint==1.4.0 rich==12.0.1 -cookiecutter==1.7.3 \ No newline at end of file +cookiecutter==1.7.3 +google-api-python-client==2.42.0 +oauth2client==4.1.3 +python-unsplash==2.27.1 \ No newline at end of file diff --git a/stringprint/tools/google_download.py b/stringprint/tools/google_download.py new file mode 100644 index 0000000..6946954 --- /dev/null +++ b/stringprint/tools/google_download.py @@ -0,0 +1,76 @@ +import io +import json +import os +from pathlib import Path +from typing import Optional, Union + +from googleapiclient.discovery import build +from googleapiclient.http import MediaIoBaseDownload +from oauth2client.service_account import ServiceAccountCredentials + + +class GoogleDownloader: + + mime_types = { + ".pdf": "application/pdf", + ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + } + + def __init__(self): + + data = os.environ.get("GDRIVE_SERVICE_USER", None) + if data is None: + raise ValueError("GDRIVE_SERVICE_USER env var not set.") + + credentials = json.loads(data) + + credentials = ServiceAccountCredentials.from_json_keyfile_dict( + credentials, + [ + "https://spreadsheets.google.com/feeds", + "https://www.googleapis.com/auth/drive", + ], + ) + self.drive_service = build("drive", "v3", credentials=credentials) + + def download_file( + self, + *, 
+ dest: Union[str, Path], + google_id: Optional[str] = None, + url: Optional[str] = None, + mime_type: Optional[str] = None, + ): + """ + url takes precedence over google_id + if mime_type not specified, will try and work out from destination extension + + """ + + if not (google_id or url): + raise ValueError("Must specify google id or url of file to download") + + if url: + # assuming the edit view + google_id = url.split("/")[-2] + + if isinstance(dest, str): + dest = Path(dest) + + if mime_type is None: + mime_type = self.__class__.mime_types.get(dest.suffix, None) + + if mime_type is None: + raise ValueError(f"Can't extract a mimetype from {dest.suffix}") + + request = self.drive_service.files().export_media( + fileId=google_id, mimeType=mime_type + ) + fh = io.FileIO(dest, "wb") + downloader = MediaIoBaseDownload(fh, request) + done = False + print(f"waiting for download of {dest.name} start") + while done is False: + status, done = downloader.next_chunk() + print(f"Download {int(status.progress() * 100)}%.") + print("Download Finished.") diff --git a/stringprint/tools/preprocessing.py b/stringprint/tools/preprocessing.py index 04096b2..c33d90f 100644 --- a/stringprint/tools/preprocessing.py +++ b/stringprint/tools/preprocessing.py @@ -1,14 +1,17 @@ from __future__ import annotations +import shutil from dataclasses import dataclass from pathlib import Path -from typing import Callable, List, Optional, Tuple, TypeVar, Union, Dict +from typing import Callable, Dict, List, Optional, Tuple, TypeVar, Union from PyPDF2 import PdfFileReader, PdfFileWriter from stringprint.models import Article from stringprint.tools.word_convert import convert_word from useful_inkleby.files import QuickText + +from .google_download import GoogleDownloader + def merge_pdfs( front: Path, @@ -172,6 +175,8 @@ def process_text(self, qt: QuickText) -> QuickText: class BasePreprocess: + google_download: bool = True + google_download_formats: list = ["contents.pdf", "doc_source.docx"] 
word_source_file: Optional[str] = None word_demote: bool = False # push everyone down one header level word_convert: bool = True @@ -201,6 +206,7 @@ def process(self) -> None: """ Run standard features """ + self.do_google_download() self.do_convert_word() self.import_markdown() self.combine_pdfs() @@ -210,6 +216,18 @@ def process(self) -> None: self.postprocess() self.output() + def do_google_download(self): + """ """ + if ( + self.__class__.google_download + and "google_doc_url" in self.article.extra_values + ): + google_doc_url = self.article.extra_values["google_doc_url"] + gd = GoogleDownloader() + for f in self.__class__.google_download_formats: + dest = self.doc_folder / f + gd.download_file(dest=dest, url=google_doc_url) + def import_markdown(self): """ Import the source file as a QuickText object @@ -237,6 +255,16 @@ def combine_pdfs(self) -> None: filename = self.pdf_output_filename output = self.doc_folder / filename + + if not (any([front_page.exists(), contents.exists()])): + print("neither source files for merging pdf, skipping") + return + + if contents.exists() and not front_page.exists(): + print("no front page, just reusing contents page") + shutil.copy(contents, output) + return + merge_pdfs(front_page, contents, output, start_page) print("combined pdfs")