add google docs to preprocessing options
ajparsons authored Mar 30, 2022
1 parent cd51874 commit 85bc293
Showing 3 changed files with 110 additions and 3 deletions.
7 changes: 5 additions & 2 deletions requirements.txt
@@ -44,7 +44,10 @@ xlwt==1.3.0
openpyxl==2.6.4
xlrd==1.2.0
PyMuPDF==1.18.16
black==22.1.0
black==22.3.0
pylint==1.4.0
rich==12.0.1
cookiecutter==1.7.3
cookiecutter==1.7.3
google-api-python-client==2.42.0
oauth2client==4.1.3
python-unsplash==2.27.1
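
The Google API packages added here are used by the new stringprint/tools/google_download.py module (below), which reads a service-account credential as a JSON string from the GDRIVE_SERVICE_USER environment variable. A minimal sketch of providing that variable from a locally downloaded key file; the service_account.json path is a hypothetical example:

import os
from pathlib import Path

# Expose the service-account key the way google_download.py expects it:
# the full JSON document as a string in the GDRIVE_SERVICE_USER env var.
key_path = Path("service_account.json")  # hypothetical local key file
os.environ["GDRIVE_SERVICE_USER"] = key_path.read_text()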
76 changes: 76 additions & 0 deletions stringprint/tools/google_download.py
@@ -0,0 +1,76 @@
import io
import json
import os
from pathlib import Path
from typing import Optional, Union

from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload
from oauth2client.service_account import ServiceAccountCredentials


class GoogleDownloader:

    mime_types = {
        ".pdf": "application/pdf",
        ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    }

    def __init__(self):

        data = os.environ.get("GDRIVE_SERVICE_USER", None)
        if data is None:
            raise ValueError("GDRIVE_SERVICE_USER env var not set.")

        credentials = json.loads(data)

        credentials = ServiceAccountCredentials.from_json_keyfile_dict(
            credentials,
            [
                "https://spreadsheets.google.com/feeds",
                "https://www.googleapis.com/auth/drive",
            ],
        )
        self.drive_service = build("drive", "v3", credentials=credentials)

    def download_file(
        self,
        *,
        dest: Union[str, Path],
        google_id: Optional[str] = None,
        url: Optional[str] = None,
        mime_type: Optional[str] = None,
    ):
"""
url takes precidence over google_id
if mime_type not specified, will try and work out from destination extention
"""

        if not (google_id or url):
            raise ValueError("Must specify google id or url of file to download")

        if url:
            # assuming the edit view
            google_id = url.split("/")[-2]

        if isinstance(dest, str):
            dest = Path(dest)

        if mime_type is None:
            mime_type = self.__class__.mime_types.get(dest.suffix, None)

        if mime_type is None:
            raise ValueError(f"Can't extract a mimetype from {dest.suffix}")

        request = self.drive_service.files().export_media(
            fileId=google_id, mimeType=mime_type
        )
        fh = io.FileIO(dest, "wb")
        downloader = MediaIoBaseDownload(fh, request)
        done = False
print(f"waiting for download of {dest.name} start")
        while done is False:
            status, done = downloader.next_chunk()
            print(f"Download {int(status.progress() * 100)}%.")
        print("Download Finished.")
30 changes: 29 additions & 1 deletion stringprint/tools/preprocessing.py
@@ -1,14 +1,17 @@
from __future__ import annotations

import shutil
from dataclasses import dataclass
from pathlib import Path
from typing import Callable, List, Optional, Tuple, TypeVar, Union, Dict
from typing import Callable, Dict, List, Optional, Tuple, TypeVar, Union

from PyPDF2 import PdfFileReader, PdfFileWriter
from stringprint.models import Article
from stringprint.tools.word_convert import convert_word
from useful_inkleby.files import QuickText

from .google_download import GoogleDownloader


def merge_pdfs(
    front: Path,
@@ -172,6 +175,8 @@ def process_text(self, qt: QuickText) -> QuickText:


class BasePreprocess:
    google_download: bool = True
    google_download_formats: list = ["contents.pdf", "doc_source.docx"]
    word_source_file: Optional[str] = None
    word_demote: bool = False  # push everyone down one header level
    word_convert: bool = True
Expand Down Expand Up @@ -201,6 +206,7 @@ def process(self) -> None:
"""
Run standard features
"""
self.do_google_download()
self.do_convert_word()
self.import_markdown()
self.combine_pdfs()
@@ -210,6 +216,18 @@
        self.postprocess()
        self.output()

    def do_google_download(self):
        """
        Download source files from Google Drive if a google_doc_url is set
        """
        if (
            self.__class__.google_download
            and "google_doc_url" in self.article.extra_values
        ):
            google_doc_url = self.article.extra_values["google_doc_url"]
            gd = GoogleDownloader()
            for f in self.__class__.google_download_formats:
                dest = self.doc_folder / f
                gd.download_file(dest=dest, url=google_doc_url)

    def import_markdown(self):
        """
        Import the source file as a QuickText object
Expand Down Expand Up @@ -237,6 +255,16 @@ def combine_pdfs(self) -> None:
        filename = self.pdf_output_filename

        output = self.doc_folder / filename

        if not (front_page.exists() or contents.exists()):
            print("neither source file for pdf merging exists, skipping")
            return

        if contents.exists() and not front_page.exists():
            print("no front page, just reusing contents page")
            shutil.copy(contents, output)
            return

        merge_pdfs(front_page, contents, output, start_page)
        print("combined pdfs")
